
reorganize moving non-submodule packages into src/main/scala

Howard Mao
2016-08-19 10:58:56 -07:00
parent f78da0b0ea
commit 7b20609d4d
110 changed files with 3 additions and 381 deletions


@@ -0,0 +1,381 @@
// See LICENSE for license details.
package coreplex
import Chisel._
import junctions._
import uncore.tilelink._
import uncore.coherence._
import uncore.agents._
import uncore.devices._
import uncore.converters._
import rocket._
import rocket.Util._
import scala.math.max
import scala.collection.mutable.{LinkedHashSet, ListBuffer}
import DefaultTestSuites._
import cde.{Parameters, Config, Dump, Knob, CDEMatchError}
object ConfigUtils {
def max_int(values: Int*): Int = {
values.reduce((a, b) => max(a, b))
}
}
import ConfigUtils._
class BaseCoreplexConfig extends Config (
topDefinitions = { (pname,site,here) =>
type PF = PartialFunction[Any,Any]
def findBy(sname:Any):Any = here[PF](site[Any](sname))(pname)
lazy val innerDataBits = 64
lazy val innerDataBeats = (8 * site(CacheBlockBytes)) / innerDataBits
pname match {
//Memory Parameters
case PAddrBits => 32
case PgLevels => if (site(XLen) == 64) 3 /* Sv39 */ else 2 /* Sv32 */
case ASIdBits => 7
//Params used by all caches
case NSets => findBy(CacheName)
case NWays => findBy(CacheName)
case RowBits => findBy(CacheName)
case NTLBEntries => findBy(CacheName)
case CacheIdBits => findBy(CacheName)
case SplitMetadata => findBy(CacheName)
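//e.g. under CacheName = "L1I", a site(NSets) query falls through findBy to
//the "L1I" partial function below and yields Knob("L1I_SETS")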
case "L1I" => {
case NSets => Knob("L1I_SETS") //64
case NWays => Knob("L1I_WAYS") //4
case RowBits => site(TLKey("L1toL2")).dataBitsPerBeat
case NTLBEntries => 8
case CacheIdBits => 0
case SplitMetadata => false
}:PF
case "L1D" => {
case NSets => Knob("L1D_SETS") //64
case NWays => Knob("L1D_WAYS") //4
case RowBits => site(TLKey("L1toL2")).dataBitsPerBeat
case NTLBEntries => 8
case CacheIdBits => 0
case SplitMetadata => false
}:PF
case ECCCode => None
case Replacer => () => new RandomReplacement(site(NWays))
//L1InstCache
case BtbKey => BtbParameters()
//L1DataCache
case DCacheKey => DCacheConfig(nMSHRs = site(Knob("L1D_MSHRS")))
//L2 Memory System Params
case AmoAluOperandBits => site(XLen)
case NAcquireTransactors => 7
case L2StoreDataQueueDepth => 1
case L2DirectoryRepresentation => new NullRepresentation(site(NTiles))
case BuildL2CoherenceManager => (id: Int, p: Parameters) =>
Module(new L2BroadcastHub()(p.alterPartial({
case InnerTLId => "L1toL2"
case OuterTLId => "L2toMC" })))
case NCachedTileLinkPorts => 1
case NUncachedTileLinkPorts => 1
//Tile Constants
case BuildTiles => {
val env = if(site(UseVM)) List("p","v") else List("p")
site(FPUKey) foreach { case cfg =>
TestGeneration.addSuite(rv32udBenchmarks)
TestGeneration.addSuites(env.map(rv64ufNoDiv))
TestGeneration.addSuites(env.map(rv64udNoDiv))
if (cfg.divSqrt) {
TestGeneration.addSuites(env.map(rv64uf))
TestGeneration.addSuites(env.map(rv64ud))
}
}
if (site(UseAtomics)) TestGeneration.addSuites(env.map(if (site(XLen) == 64) rv64ua else rv32ua))
if (site(UseCompressed)) TestGeneration.addSuites(env.map(if (site(XLen) == 64) rv64uc else rv32uc))
val (rvi, rvu) =
if (site(XLen) == 64) ((if (site(UseVM)) rv64i else rv64pi), rv64u)
else ((if (site(UseVM)) rv32i else rv32pi), rv32u)
TestGeneration.addSuites(rvi.map(_("p")))
TestGeneration.addSuites((if(site(UseVM)) List("v") else List()).flatMap(env => rvu.map(_(env))))
TestGeneration.addSuite(if (site(UseVM)) benchmarks else emptyBmarks)
List.fill(site(NTiles)){ (r: Bool, p: Parameters) =>
Module(new RocketTile(resetSignal = r)(p.alterPartial({
case TLId => "L1toL2"
case NUncachedTileLinkPorts => 1 + site(RoccNMemChannels)
})))
}
}
case BuildRoCC => Nil
case RoccNMemChannels => site(BuildRoCC).map(_.nMemChannels).foldLeft(0)(_ + _)
case RoccNPTWPorts => site(BuildRoCC).map(_.nPTWPorts).foldLeft(0)(_ + _)
//Rocket Core Constants
case CoreInstBits => if (site(UseCompressed)) 16 else 32
case FetchWidth => if (site(UseCompressed)) 2 else 1
case RetireWidth => 1
case UseVM => true
case UseUser => true
case UseDebug => true
case NBreakpoints => 1
case FastLoadWord => true
case FastLoadByte => false
case XLen => 64
case FPUKey => Some(FPUConfig())
case MulDivKey => Some(MulDivConfig())
case UseAtomics => true
case UseCompressed => true
case PLICKey => PLICConfig(site(NTiles), site(UseVM), site(NExtInterrupts), 0)
case DMKey => new DefaultDebugModuleConfig(site(NTiles), site(XLen))
case NCustomMRWCSRs => 0
case ResetVector => BigInt(0x1000)
case MtvecInit => BigInt(0x1010)
case MtvecWritable => true
//Uncore Parameters
case RTCPeriod => 100 // gives 10 MHz RTC assuming 1 GHz uncore clock
case LNEndpoints => site(TLKey(site(TLId))).nManagers + site(TLKey(site(TLId))).nClients
case LNHeaderBits => log2Ceil(site(TLKey(site(TLId))).nManagers) +
log2Up(site(TLKey(site(TLId))).nClients)
case TLKey("L1toL2") => {
val useMEI = site(NTiles) <= 1 && site(NCachedTileLinkPorts) <= 1
TileLinkParameters(
coherencePolicy = (
if (useMEI) new MEICoherence(site(L2DirectoryRepresentation))
else new MESICoherence(site(L2DirectoryRepresentation))),
nManagers = site(NBanksPerMemoryChannel)*site(NMemoryChannels) + 1 /* MMIO */,
nCachingClients = site(NCachedTileLinkPorts),
nCachelessClients = site(NExternalClients) + site(NUncachedTileLinkPorts),
maxClientXacts = max_int(
// L1 cache
site(DCacheKey).nMSHRs + 1 /* IOMSHR */,
// RoCC
if (site(BuildRoCC).isEmpty) 1 else site(RoccMaxTaggedMemXacts)),
maxClientsPerPort = if (site(BuildRoCC).isEmpty) 1 else 2,
maxManagerXacts = site(NAcquireTransactors) + 2,
dataBeats = innerDataBeats,
dataBits = site(CacheBlockBytes)*8)
}
case TLKey("L2toMC") =>
TileLinkParameters(
coherencePolicy = new MEICoherence(
new NullRepresentation(site(NBanksPerMemoryChannel))),
nManagers = 1,
nCachingClients = site(NBanksPerMemoryChannel),
nCachelessClients = 0,
maxClientXacts = 1,
maxClientsPerPort = site(NAcquireTransactors) + 2,
maxManagerXacts = 1,
dataBeats = innerDataBeats,
dataBits = site(CacheBlockBytes)*8)
case TLKey("Outermost") => site(TLKey("L2toMC")).copy(
maxClientXacts = site(NAcquireTransactors) + 2,
maxClientsPerPort = site(NBanksPerMemoryChannel),
dataBeats = site(MIFDataBeats))
case TLKey("L2toMMIO") => {
TileLinkParameters(
coherencePolicy = new MICoherence(
new NullRepresentation(site(NBanksPerMemoryChannel))),
nManagers = site(GlobalAddrMap).subMap("io").numSlaves,
nCachingClients = 0,
nCachelessClients = 1,
maxClientXacts = 4,
maxClientsPerPort = 1,
maxManagerXacts = 1,
dataBeats = innerDataBeats,
dataBits = site(CacheBlockBytes) * 8)
}
case TLKey("MMIO_Outermost") => site(TLKey("L2toMMIO")).copy(dataBeats = site(MIFDataBeats))
case BootROMFile => "./bootrom/bootrom.img"
case NTiles => Knob("NTILES")
case NBanksPerMemoryChannel => Knob("NBANKS_PER_MEM_CHANNEL")
case BankIdLSB => 0
case CacheBlockBytes => Dump("CACHE_BLOCK_BYTES", 64)
case CacheBlockOffsetBits => log2Up(here(CacheBlockBytes))
case EnableL2Logging => false
case ExtraCoreplexPorts => (p: Parameters) => new Bundle
case RegressionTestNames => LinkedHashSet(
"rv64ud-v-fcvt",
"rv64ud-p-fdiv",
"rv64ud-v-fadd",
"rv64uf-v-fadd",
"rv64um-v-mul",
"rv64mi-p-breakpoint",
"rv64uc-v-rvc",
"rv64ud-v-structural",
"rv64si-p-wfi",
"rv64um-v-divw",
"rv64ua-v-lrsc",
"rv64ui-v-fence_i",
"rv64ud-v-fcvt_w",
"rv64uf-v-fmin",
"rv64ui-v-sb",
"rv64ua-v-amomax_d",
"rv64ud-v-move",
"rv64ud-v-fclass",
"rv64ua-v-amoand_d",
"rv64ua-v-amoxor_d",
"rv64si-p-sbreak",
"rv64ud-v-fmadd",
"rv64uf-v-ldst",
"rv64um-v-mulh",
"rv64si-p-dirty")
case _ => throw new CDEMatchError
}},
knobValues = {
case "NTILES" => 1
case "NBANKS_PER_MEM_CHANNEL" => 1
case "L1D_MSHRS" => 2
case "L1D_SETS" => 64
case "L1D_WAYS" => 4
case "L1I_SETS" => 64
case "L1I_WAYS" => 4
case _ => throw new CDEMatchError
}
)
class WithNCores(n: Int) extends Config(
knobValues = { case"NTILES" => n; case _ => throw new CDEMatchError })
class WithNBanksPerMemChannel(n: Int) extends Config(
knobValues = {
case "NBANKS_PER_MEM_CHANNEL" => n
case _ => throw new CDEMatchError
})
class WithL2Cache extends Config(
(pname,site,here) => pname match {
case "L2_CAPACITY_IN_KB" => Knob("L2_CAPACITY_IN_KB")
case "L2Bank" => {
case NSets => (((here[Int]("L2_CAPACITY_IN_KB")*1024) /
site(CacheBlockBytes)) /
(site(NBanksPerMemoryChannel)*site(NMemoryChannels))) /
site(NWays)
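// e.g. with the default knobs below, 64-byte blocks, and (assuming) a single
// memory channel: (2048 * 1024) / 64 / (1 * 1) / 8 ways = 4096 sets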
case NWays => Knob("L2_WAYS")
case RowBits => site(TLKey(site(TLId))).dataBitsPerBeat
case CacheIdBits => log2Ceil(site(NMemoryChannels) * site(NBanksPerMemoryChannel))
case SplitMetadata => Knob("L2_SPLIT_METADATA")
}: PartialFunction[Any,Any]
case NAcquireTransactors => 2
case NSecondaryMisses => 4
case L2DirectoryRepresentation => new FullRepresentation(site(NTiles))
case BuildL2CoherenceManager => (id: Int, p: Parameters) =>
Module(new L2HellaCacheBank()(p.alterPartial({
case CacheId => id
case CacheName => "L2Bank"
case InnerTLId => "L1toL2"
case OuterTLId => "L2toMC"})))
case L2Replacer => () => new SeqRandom(site(NWays))
case _ => throw new CDEMatchError
},
knobValues = { case "L2_WAYS" => 8; case "L2_CAPACITY_IN_KB" => 2048; case "L2_SPLIT_METADATA" => false; case _ => throw new CDEMatchError }
)
class WithBufferlessBroadcastHub extends Config(
(pname, site, here) => pname match {
case BuildL2CoherenceManager => (id: Int, p: Parameters) =>
Module(new BufferlessBroadcastHub()(p.alterPartial({
case InnerTLId => "L1toL2"
case OuterTLId => "L2toMC" })))
case _ => throw new CDEMatchError
})
/**
* WARNING!!! IGNORE AT YOUR OWN PERIL!!!
*
* There is a very restrictive set of conditions under which the stateless
* bridge will function properly. There can only be a single tile. This tile
* MUST use the blocking data cache (L1D_MSHRS == 0) and MUST NOT have an
* uncached channel capable of writes (i.e. a RoCC accelerator).
*
* This is because the stateless bridge CANNOT generate probes, so if your
* system depends on coherence between channels in any way,
* DO NOT use this configuration.
*/
class WithStatelessBridge extends Config (
topDefinitions = (pname, site, here) => pname match {
case BuildL2CoherenceManager => (id: Int, p: Parameters) =>
Module(new ManagerToClientStatelessBridge()(p.alterPartial({
case InnerTLId => "L1toL2"
case OuterTLId => "L2toMC" })))
},
knobValues = {
case "L1D_MSHRS" => 0
case _ => throw new CDEMatchError
}
)
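/**
 * A hypothetical composition (illustrative only, not part of this commit)
 * that satisfies the conditions above: one tile, a blocking L1 D$ (the
 * L1D_MSHRS knob is already forced to 0 here), and no RoCC accelerator
 * (BaseCoreplexConfig sets BuildRoCC to Nil). Entries on the left of ++
 * take priority; a real top level would stack further platform fragments.
 */
class WithStatelessBridgeExampleConfig extends Config(
  new WithStatelessBridge ++ new WithNCores(1) ++ new BaseCoreplexConfig)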
class WithPLRU extends Config(
(pname, site, here) => pname match {
case L2Replacer => () => new SeqPLRU(site(NSets), site(NWays))
case _ => throw new CDEMatchError
})
class WithL2Capacity(size_kb: Int) extends Config(
knobValues = {
case "L2_CAPACITY_IN_KB" => size_kb
case _ => throw new CDEMatchError
})
class WithNL2Ways(n: Int) extends Config(
knobValues = {
case "L2_WAYS" => n
case _ => throw new CDEMatchError
})
class WithRV32 extends Config(
(pname,site,here) => pname match {
case XLen => 32
case UseVM => false
case UseUser => false
case UseAtomics => false
case FPUKey => None
case RegressionTestNames => LinkedHashSet(
"rv32mi-p-ma_addr",
"rv32mi-p-csr",
"rv32ui-p-sh",
"rv32ui-p-lh",
"rv32mi-p-sbreak",
"rv32ui-p-sll")
case _ => throw new CDEMatchError
}
)
class WithBlockingL1 extends Config (
knobValues = {
case "L1D_MSHRS" => 0
case _ => throw new CDEMatchError
}
)
class WithSmallCores extends Config (
topDefinitions = { (pname,site,here) => pname match {
case FPUKey => None
case NTLBEntries => 4
case BtbKey => BtbParameters(nEntries = 0)
case NAcquireTransactors => 2
case _ => throw new CDEMatchError
}},
knobValues = {
case "L1D_SETS" => 64
case "L1D_WAYS" => 1
case "L1I_SETS" => 64
case "L1I_WAYS" => 1
case "L1D_MSHRS" => 0
case _ => throw new CDEMatchError
}
)
class WithRoccExample extends Config(
(pname, site, here) => pname match {
case BuildRoCC => Seq(
RoccParameters(
opcodes = OpcodeSet.custom0,
generator = (p: Parameters) => Module(new AccumulatorExample()(p))),
RoccParameters(
opcodes = OpcodeSet.custom1,
generator = (p: Parameters) => Module(new TranslatorExample()(p)),
nPTWPorts = 1),
RoccParameters(
opcodes = OpcodeSet.custom2,
generator = (p: Parameters) => Module(new CharacterCountExample()(p))))
case RoccMaxTaggedMemXacts => 1
case _ => throw new CDEMatchError
})
class WithSplitL2Metadata extends Config(
knobValues = { case "L2_SPLIT_METADATA" => true; case _ => throw new CDEMatchError })


@@ -0,0 +1,283 @@
package coreplex
import Chisel._
import cde.{Parameters, Field}
import junctions._
import uncore.tilelink._
import uncore.coherence._
import uncore.agents._
import uncore.devices._
import uncore.util._
import uncore.converters._
import rocket._
import rocket.Util._
import java.nio.{ByteBuffer,ByteOrder}
import java.nio.file.{Files, Paths}
/** Number of memory channels */
case object NMemoryChannels extends Field[Int]
/** Number of banks per memory channel */
case object NBanksPerMemoryChannel extends Field[Int]
/** Least significant bit of address used for bank partitioning */
case object BankIdLSB extends Field[Int]
/** Function for building some kind of coherence manager agent */
case object BuildL2CoherenceManager extends Field[(Int, Parameters) => CoherenceAgent]
/** Function for building some kind of tile connected to a reset signal */
case object BuildTiles extends Field[Seq[(Bool, Parameters) => Tile]]
/** A string describing on-chip devices, readable by target software */
case object ConfigString extends Field[Array[Byte]]
/** Number of external interrupt sources */
case object NExtInterrupts extends Field[Int]
/** Interrupt controller configuration */
case object PLICKey extends Field[PLICConfig]
/** Number of clock cycles per RTC tick */
case object RTCPeriod extends Field[Int]
/** The file to read the BootROM contents from */
case object BootROMFile extends Field[String]
/** Export an external MMIO slave port */
case object ExportMMIOPort extends Field[Boolean]
/** Expose additional TileLink client ports */
case object NExternalClients extends Field[Int]
/** Extra top-level ports exported from the coreplex */
case object ExtraCoreplexPorts extends Field[Parameters => Bundle]
trait HasCoreplexParameters {
implicit val p: Parameters
lazy val nTiles = p(NTiles)
lazy val nCachedTilePorts = p(NCachedTileLinkPorts)
lazy val nUncachedTilePorts = p(NUncachedTileLinkPorts)
lazy val nMemChannels = p(NMemoryChannels)
lazy val nBanksPerMemChannel = p(NBanksPerMemoryChannel)
lazy val nBanks = nMemChannels*nBanksPerMemChannel
lazy val lsb = p(BankIdLSB)
lazy val innerParams = p.alterPartial({ case TLId => "L1toL2" })
lazy val outermostParams = p.alterPartial({ case TLId => "Outermost" })
lazy val outermostMMIOParams = p.alterPartial({ case TLId => "MMIO_Outermost" })
lazy val nExtClients = p(NExternalClients)
lazy val exportMMIO = p(ExportMMIOPort)
}
/** Wrapper around everything that isn't a Tile.
*
* Usually this is clocked and/or place-and-routed separately from the Tiles.
*/
class Uncore(implicit val p: Parameters) extends Module
with HasCoreplexParameters {
val io = new Bundle {
val mem = Vec(nMemChannels, new ClientUncachedTileLinkIO()(outermostParams))
val tiles_cached = Vec(nCachedTilePorts, new ClientTileLinkIO).flip
val tiles_uncached = Vec(nUncachedTilePorts, new ClientUncachedTileLinkIO).flip
val ext_uncached = Vec(nExtClients, new ClientUncachedTileLinkIO()(innerParams)).flip
val prci = Vec(nTiles, new PRCITileIO).asOutput
val mmio = if (exportMMIO) Some(new ClientUncachedTileLinkIO()(outermostMMIOParams)) else None
val interrupts = Vec(p(NExtInterrupts), Bool()).asInput
val debug = new DebugBusIO()(p).flip
}
val outmemsys = if (nCachedTilePorts + nUncachedTilePorts > 0)
Module(new DefaultOuterMemorySystem) // NoC, LLC and SerDes
else Module(new DummyOuterMemorySystem)
outmemsys.io.incoherent foreach (_ := Bool(false))
outmemsys.io.tiles_uncached <> io.tiles_uncached
outmemsys.io.tiles_cached <> io.tiles_cached
outmemsys.io.ext_uncached <> io.ext_uncached
io.mem <> outmemsys.io.mem
buildMMIONetwork(p.alterPartial({case TLId => "L2toMMIO"}))
def makeBootROM()(implicit p: Parameters) = {
val romdata = Files.readAllBytes(Paths.get(p(BootROMFile)))
val rom = ByteBuffer.wrap(romdata)
rom.order(ByteOrder.LITTLE_ENDIAN)
// for now, have the reset vector jump straight to memory
val resetToMemDist = p(GlobalAddrMap)("mem").start - p(ResetVector)
require(resetToMemDist == (resetToMemDist.toInt >> 12 << 12))
val configStringAddr = p(ResetVector).toInt + rom.capacity
require(rom.getInt(12) == 0,
"Config string address position should not be occupied by code")
rom.putInt(12, configStringAddr)
rom.array() ++ p(ConfigString).toSeq
}
def buildMMIONetwork(implicit p: Parameters) = {
val ioAddrMap = p(GlobalAddrMap).subMap("io")
val mmioNetwork = Module(new TileLinkRecursiveInterconnect(1, ioAddrMap))
mmioNetwork.io.in.head <> outmemsys.io.mmio
val plic = Module(new PLIC(p(PLICKey)))
plic.io.tl <> mmioNetwork.port("int:plic")
for (i <- 0 until io.interrupts.size) {
val gateway = Module(new LevelGateway)
gateway.io.interrupt := io.interrupts(i)
plic.io.devices(i) <> gateway.io.plic
}
val debugModule = Module(new DebugModule)
debugModule.io.tl <> mmioNetwork.port("int:debug")
debugModule.io.db <> io.debug
val prci = Module(new PRCI)
prci.io.tl <> mmioNetwork.port("int:prci")
io.prci := prci.io.tiles
prci.io.rtcTick := Counter(p(RTCPeriod)).inc() // placeholder for real RTC
for (i <- 0 until nTiles) {
prci.io.interrupts(i).meip := plic.io.harts(plic.cfg.context(i, 'M'))
if (p(UseVM))
prci.io.interrupts(i).seip := plic.io.harts(plic.cfg.context(i, 'S'))
prci.io.interrupts(i).debug := debugModule.io.debugInterrupts(i)
io.prci(i).reset := reset
}
val bootROM = Module(new ROMSlave(makeBootROM()))
bootROM.io <> mmioNetwork.port("int:bootrom")
io.mmio.map { ext => ext <> mmioNetwork.port("ext") }
}
}
abstract class OuterMemorySystem(implicit val p: Parameters)
extends Module with HasCoreplexParameters {
val io = new Bundle {
val tiles_cached = Vec(nCachedTilePorts, new ClientTileLinkIO).flip
val tiles_uncached = Vec(nUncachedTilePorts, new ClientUncachedTileLinkIO).flip
val ext_uncached = Vec(nExtClients, new ClientUncachedTileLinkIO()(innerParams)).flip
val incoherent = Vec(nCachedTilePorts, Bool()).asInput
val mem = Vec(nMemChannels, new ClientUncachedTileLinkIO()(outermostParams))
val mmio = new ClientUncachedTileLinkIO()(p.alterPartial({case TLId => "L2toMMIO"}))
}
}
/** Use in place of OuterMemorySystem if there are no clients to connect. */
class DummyOuterMemorySystem(implicit p: Parameters) extends OuterMemorySystem()(p) {
require(nCachedTilePorts + nUncachedTilePorts + nExtClients == 0)
io.mem.foreach { tl =>
tl.acquire.valid := Bool(false)
tl.grant.ready := Bool(false)
}
io.mmio.acquire.valid := Bool(false)
io.mmio.grant.ready := Bool(false)
}
/** The whole outer memory hierarchy, including a NoC, some kind of coherence
* manager agent, and a converter from TileLink to MemIO.
*/
class DefaultOuterMemorySystem(implicit p: Parameters) extends OuterMemorySystem()(p) {
// Create a simple L1toL2 NoC between the tiles and the banks of outer memory
// Cached ports come first in the client list, making sharerToClientId just an identity function
// addrToBank is used to hash physical addresses (of cache blocks) to banks (and thereby memory channels)
def sharerToClientId(sharerId: UInt) = sharerId
def addrToBank(addr: UInt): UInt = {
val isMemory = p(GlobalAddrMap).isInRegion("mem", addr << log2Up(p(CacheBlockBytes)))
Mux(isMemory,
if (nBanks > 1) addr(lsb + log2Up(nBanks) - 1, lsb) else UInt(0),
UInt(nBanks))
}
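// e.g. with nBanks = 2 and BankIdLSB = 0, bit 0 of the block address selects
// the bank, while any non-memory (MMIO) address maps to the extra index
// nBanks, which the crossbar routes to mmioManager's port below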
val preBuffering = TileLinkDepths(1,1,2,2,0)
val l1tol2net = Module(new PortedTileLinkCrossbar(addrToBank, sharerToClientId, preBuffering))
// Create point(s) of coherence serialization
val managerEndpoints = List.tabulate(nBanks){id => p(BuildL2CoherenceManager)(id, p)}
managerEndpoints.foreach { _.incoherent := io.incoherent }
val mmioManager = Module(new MMIOTileLinkManager()(p.alterPartial({
case TLId => "L1toL2"
case InnerTLId => "L1toL2"
case OuterTLId => "L2toMMIO"
})))
io.mmio <> mmioManager.io.outer
// Wire the tiles to the TileLink client ports of the L1toL2 network,
// and coherence manager(s) to the other side
l1tol2net.io.clients_cached <> io.tiles_cached
l1tol2net.io.clients_uncached <> io.tiles_uncached ++ io.ext_uncached
l1tol2net.io.managers <> managerEndpoints.map(_.innerTL) :+ mmioManager.io.inner
// Create a converter between TileLinkIO and MemIO for each channel
val outerTLParams = p.alterPartial({ case TLId => "L2toMC" })
val backendBuffering = TileLinkDepths(0,0,0,0,0)
// TODO: the code to print this stuff should live somewhere else
println("Generated Address Map")
for (entry <- p(GlobalAddrMap).flatten) {
val name = entry.name
val start = entry.region.start
val end = entry.region.start + entry.region.size - 1
println(f"\t$name%s $start%x - $end%x")
}
println("Generated Configuration String")
println(new String(p(ConfigString)))
val mem_ic = Module(new TileLinkMemoryInterconnect(nBanksPerMemChannel, nMemChannels)(outermostParams))
for ((bank, icPort) <- managerEndpoints zip mem_ic.io.in) {
val unwrap = Module(new ClientTileLinkIOUnwrapper()(outerTLParams))
unwrap.io.in <> ClientTileLinkEnqueuer(bank.outerTL, backendBuffering)(outerTLParams)
TileLinkWidthAdapter(icPort, unwrap.io.out)
}
io.mem <> mem_ic.io.out
}
abstract class Coreplex(implicit val p: Parameters) extends Module
with HasCoreplexParameters {
class CoreplexIO(implicit val p: Parameters) extends Bundle {
val mem = Vec(nMemChannels, new ClientUncachedTileLinkIO()(outermostParams))
val ext_clients = Vec(nExtClients, new ClientUncachedTileLinkIO()(innerParams)).flip
val mmio = if(p(ExportMMIOPort)) Some(new ClientUncachedTileLinkIO()(outermostMMIOParams)) else None
val interrupts = Vec(p(NExtInterrupts), Bool()).asInput
val debug = new DebugBusIO()(p).flip
val extra = p(ExtraCoreplexPorts)(p)
val success: Option[Bool] = if (hasSuccessFlag) Some(Bool(OUTPUT)) else None
}
def hasSuccessFlag: Boolean = false
val io = new CoreplexIO
}
class DefaultCoreplex(topParams: Parameters) extends Coreplex()(topParams) {
// Build an Uncore and a set of Tiles
val tileResets = Wire(Vec(nTiles, Bool()))
val tileList = p(BuildTiles).zip(tileResets).map {
case (tile, rst) => tile(rst, p)
}
val nCachedPorts = tileList.map(tile => tile.io.cached.size).reduce(_ + _)
val nUncachedPorts = tileList.map(tile => tile.io.uncached.size).reduce(_ + _)
val innerTLParams = p.alterPartial({
case HastiId => "TL"
case TLId => "L1toL2"
case NCachedTileLinkPorts => nCachedPorts
case NUncachedTileLinkPorts => nUncachedPorts
})
val uncore = Module(new Uncore()(innerTLParams))
(uncore.io.prci, tileResets, tileList).zipped.foreach {
case (prci, rst, tile) =>
rst := prci.reset
tile.io.prci <> prci
}
// Connect the uncore to the tile memory ports, HostIO and MemIO
uncore.io.tiles_cached <> tileList.map(_.io.cached).flatten
uncore.io.tiles_uncached <> tileList.map(_.io.uncached).flatten
uncore.io.interrupts <> io.interrupts
uncore.io.debug <> io.debug
uncore.io.ext_uncached <> io.ext_clients
if (exportMMIO) { io.mmio.get <> uncore.io.mmio.get }
io.mem <> uncore.io.mem
}
class GroundTestCoreplex(topParams: Parameters) extends DefaultCoreplex(topParams) {
override def hasSuccessFlag = true
io.success.get := tileList.flatMap(_.io.elements get "success").map(_.asInstanceOf[Bool]).reduce(_&&_)
}


@@ -0,0 +1,62 @@
package coreplex
import Chisel._
import cde.{Parameters, Field}
import groundtest._
import uncore.tilelink._
import uncore.agents._
case object ExportGroundTestStatus extends Field[Boolean]
class DirectGroundTestCoreplex(topParams: Parameters) extends Coreplex()(topParams) {
// Not using the debug port, so tie off its request/response channels
io.debug.req.ready := Bool(false)
io.debug.resp.valid := Bool(false)
require(!exportMMIO)
require(nExtClients == 0)
require(nMemChannels == 1)
require(nTiles == 1)
val test = p(BuildGroundTest)(outermostParams.alterPartial({
case GroundTestId => 0
case CacheName => "L1D"
}))
require(test.io.cache.size == 0)
require(test.io.mem.size == nBanksPerMemChannel)
require(test.io.ptw.size == 0)
val mem_ic = Module(new TileLinkMemoryInterconnect(
nBanksPerMemChannel, nMemChannels)(outermostParams))
mem_ic.io.in <> test.io.mem
io.mem <> mem_ic.io.out
if (p(ExportGroundTestStatus)) {
val status = io.extra.asInstanceOf[GroundTestStatus]
val s_running :: s_finished :: s_errored :: s_timeout :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_running)
val error_code = Reg(status.error.bits)
val timeout_code = Reg(status.timeout.bits)
when (state === s_running) {
when (test.io.status.finished) { state := s_finished }
when (test.io.status.error.valid) {
state := s_errored
error_code := test.io.status.error.bits
}
when (test.io.status.timeout.valid) {
state := s_timeout
timeout_code := test.io.status.timeout.bits
}
}
status.finished := (state === s_finished)
status.error.valid := (state === s_errored)
status.error.bits := error_code
status.timeout.valid := (state === s_timeout)
status.timeout.bits := timeout_code
}
override def hasSuccessFlag = true
io.success.get := test.io.status.finished
}


@@ -0,0 +1,200 @@
package coreplex
import Chisel._
import groundtest._
import rocket._
import uncore.tilelink._
import uncore.coherence._
import uncore.agents._
import uncore.devices.NTiles
import uncore.unittests._
import junctions._
import junctions.unittests._
import scala.collection.mutable.LinkedHashSet
import cde.{Parameters, Config, Dump, Knob, CDEMatchError}
import scala.math.max
import ConfigUtils._
class WithComparator extends Config(
(pname, site, here) => pname match {
case GroundTestKey => Seq.fill(site(NTiles)) {
GroundTestTileSettings(uncached = site(ComparatorKey).targets.size)
}
case BuildGroundTest =>
(p: Parameters) => Module(new ComparatorCore()(p))
case ComparatorKey => ComparatorParameters(
targets = Seq("mem", "io:ext:testram").map(name =>
site(GlobalAddrMap)(name).start.longValue),
width = 8,
operations = 1000,
atomics = site(UseAtomics),
prefetches = site("COMPARATOR_PREFETCHES"))
case FPUKey => None
case UseAtomics => false
case "COMPARATOR_PREFETCHES" => false
case _ => throw new CDEMatchError
})
class WithAtomics extends Config(
(pname, site, here) => pname match {
case UseAtomics => true
case _ => throw new CDEMatchError
})
class WithPrefetches extends Config(
(pname, site, here) => pname match {
case "COMPARATOR_PREFETCHES" => true
case _ => throw new CDEMatchError
})
class WithMemtest extends Config(
(pname, site, here) => pname match {
case GroundTestKey => Seq.fill(site(NTiles)) {
GroundTestTileSettings(1, 1)
}
case GeneratorKey => GeneratorParameters(
maxRequests = 128,
startAddress = site(GlobalAddrMap)("mem").start)
case BuildGroundTest =>
(p: Parameters) => Module(new GeneratorTest()(p))
case _ => throw new CDEMatchError
})
class WithNGenerators(nUncached: Int, nCached: Int) extends Config(
(pname, site, here) => pname match {
case GroundTestKey => Seq.fill(site(NTiles)) {
GroundTestTileSettings(nUncached, nCached)
}
case _ => throw new CDEMatchError
})
class WithCacheFillTest extends Config(
(pname, site, here) => pname match {
case GroundTestKey => Seq.fill(site(NTiles)) {
GroundTestTileSettings(uncached = 1)
}
case BuildGroundTest =>
(p: Parameters) => Module(new CacheFillTest()(p))
case _ => throw new CDEMatchError
},
knobValues = {
case "L2_WAYS" => 4
case "L2_CAPACITY_IN_KB" => 4
case _ => throw new CDEMatchError
})
class WithBroadcastRegressionTest extends Config(
(pname, site, here) => pname match {
case GroundTestKey => Seq.fill(site(NTiles)) {
GroundTestTileSettings(1, 1, maxXacts = 3)
}
case BuildGroundTest =>
(p: Parameters) => Module(new RegressionTest()(p))
case GroundTestRegressions =>
(p: Parameters) => RegressionTests.broadcastRegressions(p)
case _ => throw new CDEMatchError
})
class WithCacheRegressionTest extends Config(
(pname, site, here) => pname match {
case GroundTestKey => Seq.fill(site(NTiles)) {
GroundTestTileSettings(1, 1, maxXacts = 5)
}
case BuildGroundTest =>
(p: Parameters) => Module(new RegressionTest()(p))
case GroundTestRegressions =>
(p: Parameters) => RegressionTests.cacheRegressions(p)
case _ => throw new CDEMatchError
})
class WithNastiConverterTest extends Config(
(pname, site, here) => pname match {
case GroundTestKey => Seq.fill(site(NTiles)) {
GroundTestTileSettings(uncached = 1)
}
case GeneratorKey => GeneratorParameters(
maxRequests = 128,
startAddress = site(GlobalAddrMap)("mem").start)
case BuildGroundTest =>
(p: Parameters) => Module(new NastiConverterTest()(p))
case _ => throw new CDEMatchError
})
class WithTraceGen extends Config(
topDefinitions = (pname, site, here) => pname match {
case GroundTestKey => Seq.fill(site(NTiles)) {
GroundTestTileSettings(uncached = 1, cached = 1)
}
case BuildGroundTest =>
(p: Parameters) => Module(new GroundTestTraceGenerator()(p))
case GeneratorKey => GeneratorParameters(
maxRequests = 256,
startAddress = 0)
case AddressBag => {
val nSets = 32 // L2 NSets
val nWays = 1
val blockOffset = site(CacheBlockOffsetBits)
val baseAddr = site(GlobalAddrMap)("mem").start
val nBeats = site(MIFDataBeats)
List.tabulate(4 * nWays) { i =>
Seq.tabulate(nBeats) { j => (j * 8) + ((i * nSets) << blockOffset) }
}.flatten.map(addr => baseAddr + BigInt(addr))
}
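// Note: these 4 * nWays block addresses sit nSets blocks apart, so they all
// alias to the same set of the 32-set L2 assumed above and repeatedly evict
// one another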
case UseAtomics => true
case _ => throw new CDEMatchError
},
knobValues = {
case "L1D_SETS" => 16
case "L1D_WAYS" => 1
})
class WithPCIeMockupTest extends Config(
(pname, site, here) => pname match {
case NTiles => 2
case GroundTestKey => Seq(
GroundTestTileSettings(1, 1),
GroundTestTileSettings(1))
case GeneratorKey => GeneratorParameters(
maxRequests = 128,
startAddress = site(GlobalAddrMap)("mem").start)
case BuildGroundTest =>
(p: Parameters) => {
val id = p(GroundTestId)
if (id == 0) Module(new GeneratorTest()(p))
else Module(new NastiConverterTest()(p))
}
case _ => throw new CDEMatchError
})
class WithDirectMemtest extends Config(
(pname, site, here) => {
val nGens = 8
pname match {
case GroundTestKey => Seq(GroundTestTileSettings(uncached = nGens))
case GeneratorKey => GeneratorParameters(
maxRequests = 1024,
startAddress = 0)
case BuildGroundTest =>
(p: Parameters) => Module(new GeneratorTest()(p))
case _ => throw new CDEMatchError
}
})
class WithDirectComparator extends Config(
(pname, site, here) => pname match {
case GroundTestKey => Seq.fill(site(NTiles)) {
GroundTestTileSettings(uncached = site(ComparatorKey).targets.size)
}
case BuildGroundTest =>
(p: Parameters) => Module(new ComparatorCore()(p))
case ComparatorKey => ComparatorParameters(
targets = Seq(0L, 0x100L),
width = 8,
operations = 1000,
atomics = site(UseAtomics),
prefetches = site("COMPARATOR_PREFETCHES"))
case FPUKey => None
case UseAtomics => false
case "COMPARATOR_PREFETCHES" => false
case _ => throw new CDEMatchError
})


@@ -0,0 +1,186 @@
// See LICENSE for license details.
package coreplex
import Chisel._
import scala.collection.mutable.{LinkedHashSet,LinkedHashMap}
import cde.{Parameters, ParameterDump, Config, Field, CDEMatchError}
case object RegressionTestNames extends Field[LinkedHashSet[String]]
abstract class RocketTestSuite {
val dir: String
val makeTargetName: String
val names: LinkedHashSet[String]
val envName: String
def postScript = s"""
$$(addprefix $$(output_dir)/, $$(addsuffix .hex, $$($makeTargetName))): $$(output_dir)/%.hex: $dir/%.hex
\tmkdir -p $$(output_dir)
\tln -fs $$< $$@
$$(addprefix $$(output_dir)/, $$($makeTargetName)): $$(output_dir)/%: $dir/%
\tmkdir -p $$(output_dir)
\tln -fs $$< $$@
run-$makeTargetName: $$(addprefix $$(output_dir)/, $$(addsuffix .out, $$($makeTargetName)))
\t@echo; perl -ne 'print " [$$$$1] $$$$ARGV \\t$$$$2\\n" if( /\\*{3}(.{8})\\*{3}(.*)/ || /ASSERTION (FAILED)/i )' $$^ /dev/null | perl -ne 'if(/(.*)/){print "$$$$1\\n\\n"; exit(1) if eof()}'
run-$makeTargetName-debug: $$(addprefix $$(output_dir)/, $$(addsuffix .vpd, $$($makeTargetName)))
\t@echo; perl -ne 'print " [$$$$1] $$$$ARGV \\t$$$$2\\n" if( /\\*{3}(.{8})\\*{3}(.*)/ || /ASSERTION (FAILED)/i )' $$(patsubst %.vpd,%.out,$$^) /dev/null | perl -ne 'if(/(.*)/){print "$$$$1\\n\\n"; exit(1) if eof()}'
"""
}
class AssemblyTestSuite(prefix: String, val names: LinkedHashSet[String])(val envName: String) extends RocketTestSuite {
val dir = "$(RISCV)/riscv64-unknown-elf/share/riscv-tests/isa"
val makeTargetName = prefix + "-" + envName + "-asm-tests"
override def toString = s"$makeTargetName = \\\n" + names.map(n => s"\t$prefix-$envName-$n").mkString(" \\\n") + postScript
}
class BenchmarkTestSuite(makePrefix: String, val dir: String, val names: LinkedHashSet[String]) extends RocketTestSuite {
val envName = ""
val makeTargetName = makePrefix + "-bmark-tests"
override def toString = s"$makeTargetName = \\\n" + names.map(n => s"\t$n.riscv").mkString(" \\\n") + postScript
}
class RegressionTestSuite(val names: LinkedHashSet[String]) extends RocketTestSuite {
val envName = ""
val dir = "$(RISCV)/riscv64-unknown-elf/share/riscv-tests/isa"
val makeTargetName = "regression-tests"
override def toString = s"$makeTargetName = \\\n" + names.mkString(" \\\n")
}
object TestGeneration {
val asmSuites = new LinkedHashMap[String,AssemblyTestSuite]()
val bmarkSuites = new LinkedHashMap[String,BenchmarkTestSuite]()
val regressionSuites = new LinkedHashMap[String,RegressionTestSuite]()
def addSuite(s: RocketTestSuite) {
s match {
case a: AssemblyTestSuite => asmSuites += (a.makeTargetName -> a)
case b: BenchmarkTestSuite => bmarkSuites += (b.makeTargetName -> b)
case r: RegressionTestSuite => regressionSuites += (r.makeTargetName -> r)
}
}
def addSuites(s: Seq[RocketTestSuite]) { s.foreach(addSuite) }
def generateMakefrag(topModuleName: String, configClassName: String) {
def gen(kind: String, s: Seq[RocketTestSuite]) = {
if(s.length > 0) {
val envs = s.groupBy(_.envName)
val targets = s.map(t => s"$$(${t.makeTargetName})").mkString(" ")
s.map(_.toString).mkString("\n") +
envs.filterKeys(_ != "").map( {
case (env,envsuites) => {
val suites = envsuites.map(t => s"$$(${t.makeTargetName})").mkString(" ")
s"""
run-$kind-$env-tests: $$(addprefix $$(output_dir)/, $$(addsuffix .out, $suites))
\t@echo; perl -ne 'print " [$$$$1] $$$$ARGV \\t$$$$2\\n" if( /\\*{3}(.{8})\\*{3}(.*)/ || /ASSERTION (FAILED)/i )' $$^ /dev/null | perl -ne 'if(/(.*)/){print "$$$$1\\n\\n"; exit(1) if eof()}'
run-$kind-$env-tests-debug: $$(addprefix $$(output_dir)/, $$(addsuffix .vpd, $suites))
\t@echo; perl -ne 'print " [$$$$1] $$$$ARGV \\t$$$$2\\n" if( /\\*{3}(.{8})\\*{3}(.*)/ || /ASSERTION (FAILED)/i )' $$(patsubst %.vpd,%.out,$$^) /dev/null | perl -ne 'if(/(.*)/){print "$$$$1\\n\\n"; exit(1) if eof()}'
run-$kind-$env-tests-fast: $$(addprefix $$(output_dir)/, $$(addsuffix .run, $suites))
\t@echo; perl -ne 'print " [$$$$1] $$$$ARGV \\t$$$$2\\n" if( /\\*{3}(.{8})\\*{3}(.*)/ || /ASSERTION (FAILED)/i )' $$^ /dev/null | perl -ne 'if(/(.*)/){print "$$$$1\\n\\n"; exit(1) if eof()}'
"""} } ).mkString("\n") + s"""
run-$kind-tests: $$(addprefix $$(output_dir)/, $$(addsuffix .out, $targets))
\t@echo; perl -ne 'print " [$$$$1] $$$$ARGV \\t$$$$2\\n" if( /\\*{3}(.{8})\\*{3}(.*)/ || /ASSERTION (FAILED)/i )' $$^ /dev/null | perl -ne 'if(/(.*)/){print "$$$$1\\n\\n"; exit(1) if eof()}'
run-$kind-tests-debug: $$(addprefix $$(output_dir)/, $$(addsuffix .vpd, $targets))
\t@echo; perl -ne 'print " [$$$$1] $$$$ARGV \\t$$$$2\\n" if( /\\*{3}(.{8})\\*{3}(.*)/ || /ASSERTION (FAILED)/i )' $$(patsubst %.vpd,%.out,$$^) /dev/null | perl -ne 'if(/(.*)/){print "$$$$1\\n\\n"; exit(1) if eof()}'
run-$kind-tests-fast: $$(addprefix $$(output_dir)/, $$(addsuffix .run, $targets))
\t@echo; perl -ne 'print " [$$$$1] $$$$ARGV \\t$$$$2\\n" if( /\\*{3}(.{8})\\*{3}(.*)/ || /ASSERTION (FAILED)/i )' $$^ /dev/null | perl -ne 'if(/(.*)/){print "$$$$1\\n\\n"; exit(1) if eof()}'
"""
} else { "\n" }
}
val f = createOutputFile(s"$topModuleName.$configClassName.d")
f.write(
List(
gen("asm", asmSuites.values.toSeq),
gen("bmark", bmarkSuites.values.toSeq),
gen("regression", regressionSuites.values.toSeq)
).mkString("\n"))
f.close
}
def createOutputFile(name: String) =
new java.io.FileWriter(s"${Driver.targetDir}/$name")
}
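// A hypothetical usage sketch (not part of this commit): suites are normally
// registered as a side effect of config elaboration (see BuildTiles in the
// configs above), after which the build flow emits the makefrag. The module
// and config names here are illustrative.
object ExampleMakefragGen {
  def main(args: Array[String]) {
    TestGeneration.addSuites(List("p", "v").map(DefaultTestSuites.rv64ui))
    TestGeneration.addSuite(DefaultTestSuites.benchmarks)
    TestGeneration.generateMakefrag("TestHarness", "DefaultConfig")
  }
}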
object DefaultTestSuites {
val rv32uiNames = LinkedHashSet(
"simple", "add", "addi", "and", "andi", "auipc", "beq", "bge", "bgeu", "blt", "bltu", "bne", "fence_i",
"jal", "jalr", "lb", "lbu", "lh", "lhu", "lui", "lw", "or", "ori", "sb", "sh", "sw", "sll", "slli",
"slt", "slti", "sra", "srai", "srl", "srli", "sub", "xor", "xori")
val rv32ui = new AssemblyTestSuite("rv32ui", rv32uiNames)(_)
val rv32ucNames = LinkedHashSet("rvc")
val rv32uc = new AssemblyTestSuite("rv32uc", rv32ucNames)(_)
val rv32umNames = LinkedHashSet("mul", "mulh", "mulhsu", "mulhu", "div", "divu", "rem", "remu")
val rv32um = new AssemblyTestSuite("rv32um", rv32umNames)(_)
val rv32uaNames = LinkedHashSet("lrsc", "amoadd_w", "amoand_w", "amoor_w", "amoxor_w", "amoswap_w", "amomax_w", "amomaxu_w", "amomin_w", "amominu_w")
val rv32ua = new AssemblyTestSuite("rv32ua", rv32uaNames)(_)
val rv32siNames = LinkedHashSet("csr", "ma_fetch", "scall", "sbreak", "wfi", "dirty")
val rv32si = new AssemblyTestSuite("rv32si", rv32siNames)(_)
val rv32miNames = LinkedHashSet("csr", "mcsr", "illegal", "ma_addr", "ma_fetch", "sbreak", "scall")
val rv32mi = new AssemblyTestSuite("rv32mi", rv32miNames)(_)
val rv32u = List(rv32ui, rv32um)
val rv32i = List(rv32ui, rv32si, rv32mi)
val rv32pi = List(rv32ui, rv32mi)
val rv64uiNames = LinkedHashSet("addw", "addiw", "ld", "lwu", "sd", "slliw", "sllw", "sltiu", "sltu", "sraiw", "sraw", "srliw", "srlw", "subw")
val rv64ui = new AssemblyTestSuite("rv64ui", rv32uiNames ++ rv64uiNames)(_)
val rv64umNames = LinkedHashSet("divuw", "divw", "mulw", "remuw", "remw")
val rv64um = new AssemblyTestSuite("rv64um", rv32umNames ++ rv64umNames)(_)
val rv64uaNames = rv32uaNames.map(_.replaceAll("_w","_d"))
val rv64ua = new AssemblyTestSuite("rv64ua", rv32uaNames ++ rv64uaNames)(_)
val rv64ucNames = rv32ucNames
val rv64uc = new AssemblyTestSuite("rv64uc", rv64ucNames)(_)
val rv64ufNames = LinkedHashSet("ldst", "move", "fsgnj", "fcmp", "fcvt", "fcvt_w", "fclass", "fadd", "fdiv", "fmin", "fmadd")
val rv64uf = new AssemblyTestSuite("rv64uf", rv64ufNames)(_)
val rv64ufNoDiv = new AssemblyTestSuite("rv64uf", rv64ufNames - "fdiv")(_)
val rv64udNames = rv64ufNames + "structural"
val rv64ud = new AssemblyTestSuite("rv64ud", rv64udNames)(_)
val rv64udNoDiv = new AssemblyTestSuite("rv64ud", rv64udNames - "fdiv")(_)
val rv64siNames = rv32siNames
val rv64si = new AssemblyTestSuite("rv64si", rv64siNames)(_)
val rv64miNames = rv32miNames + "breakpoint"
val rv64mi = new AssemblyTestSuite("rv64mi", rv64miNames)(_)
val groundtestNames = LinkedHashSet("simple")
val groundtest64 = new AssemblyTestSuite("rv64ui", groundtestNames)(_)
val groundtest32 = new AssemblyTestSuite("rv32ui", groundtestNames)(_)
// TODO: "rv64ui-pm-lrsc", "rv64mi-pm-ipi",
val rv64u = List(rv64ui, rv64um)
val rv64i = List(rv64ui, rv64si, rv64mi)
val rv64pi = List(rv64ui, rv64mi)
val benchmarks = new BenchmarkTestSuite("rvi", "$(RISCV)/riscv64-unknown-elf/share/riscv-tests/benchmarks", LinkedHashSet(
"median", "multiply", "qsort", "towers", "vvadd", "dhrystone", "mt-matmul"))
val rv32udBenchmarks = new BenchmarkTestSuite("rvd", "$(RISCV)/riscv64-unknown-elf/share/riscv-tests/benchmarks", LinkedHashSet(
"mm", "spmv", "mt-vvadd"))
val emptyBmarks = new BenchmarkTestSuite("empty",
"$(RISCV)/riscv64-unknown-elf/share/riscv-tests/benchmarks", LinkedHashSet.empty)
val mtBmarks = new BenchmarkTestSuite("mt", "$(RISCV)/riscv64-unknown-elf/share/riscv-tests/mt",
LinkedHashSet(((0 to 4).map("vvadd"+_) ++
List("ad","ae","af","ag","ai","ak","al","am","an","ap","aq","ar","at","av","ay","az",
"bb","bc","bf","bh","bj","bk","bm","bo","br","bs","ce","cf","cg","ci","ck","cl",
"cm","cs","cv","cy","dc","df","dm","do","dr","ds","du","dv").map(_+"_matmul")): _*))
}


@@ -0,0 +1,22 @@
package coreplex
import Chisel._
import junctions.unittests.UnitTestSuite
import rocket.Tile
import uncore.tilelink.TLId
import cde.Parameters
class UnitTestCoreplex(topParams: Parameters) extends Coreplex()(topParams) {
require(!exportMMIO)
require(nExtClients == 0)
require(nMemChannels == 0)
io.debug.req.ready := Bool(false)
io.debug.resp.valid := Bool(false)
val l1params = p.alterPartial({ case TLId => "L1toL2" })
val tests = Module(new UnitTestSuite()(l1params))
override def hasSuccessFlag = true
io.success.get := tests.io.finished
}


@@ -0,0 +1,115 @@
package groundtest
import Chisel._
import uncore.tilelink._
import uncore.agents._
import uncore.coherence.{InnerTLId, OuterTLId}
import uncore.util._
import junctions.HasAddrMapParameters
import cde.Parameters
/**
* An example bus-mastering device that writes some preset data to memory.
* When it receives an MMIO put request, it starts writing out the data.
* When it receives an MMIO get request, it responds with the progress of
* the write. A grant data of 1 means it is still writing, grant data 0
* means it has finished.
*/
class ExampleBusMaster(implicit val p: Parameters) extends Module
with HasAddrMapParameters
with HasTileLinkParameters {
val mmioParams = p.alterPartial({ case TLId => p(InnerTLId) })
val memParams = p.alterPartial({ case TLId => p(OuterTLId) })
val memStart = addrMap("mem").start
val memStartBlock = memStart >> p(CacheBlockOffsetBits)
val io = new Bundle {
val mmio = new ClientUncachedTileLinkIO()(mmioParams).flip
val mem = new ClientUncachedTileLinkIO()(memParams)
}
val s_idle :: s_put :: s_resp :: Nil = Enum(Bits(), 3)
val state = Reg(init = s_idle)
val send_resp = Reg(init = Bool(false))
val r_acq = Reg(new AcquireMetadata)
io.mmio.acquire.ready := !send_resp
io.mmio.grant.valid := send_resp
io.mmio.grant.bits := Grant(
is_builtin_type = Bool(true),
g_type = r_acq.getBuiltInGrantType(),
client_xact_id = r_acq.client_xact_id,
manager_xact_id = UInt(0),
addr_beat = r_acq.addr_beat,
data = Mux(state === s_idle, UInt(0), UInt(1)))
when (io.mmio.acquire.fire()) {
send_resp := Bool(true)
r_acq := io.mmio.acquire.bits
when (state === s_idle && io.mmio.acquire.bits.hasData()) { state := s_put }
}
when (io.mmio.grant.fire()) { send_resp := Bool(false) }
val (put_beat, put_done) = Counter(io.mem.acquire.fire(), tlDataBeats)
when (put_done) { state := s_resp }
when (io.mem.grant.fire()) { state := s_idle }
io.mem.acquire.valid := state === s_put
io.mem.acquire.bits := PutBlock(
client_xact_id = UInt(0),
addr_block = UInt(memStartBlock),
addr_beat = put_beat,
data = put_beat)
io.mem.grant.ready := state === s_resp
}
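// BusMasterTest below drives the protocol described above: an MMIO Put of 1
// starts the master, Gets poll until the returned data is 0, and a final
// GetBlock verifies the data the master wrote to memory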
class BusMasterTest(implicit p: Parameters) extends GroundTest()(p)
with HasTileLinkParameters {
val (s_idle :: s_req_start :: s_resp_start :: s_req_poll :: s_resp_poll ::
s_req_check :: s_resp_check :: s_done :: Nil) = Enum(Bits(), 8)
val state = Reg(init = s_idle)
val busMasterBlock = addrMap("io:ext:busmaster").start >> p(CacheBlockOffsetBits)
val start_acq = Put(
client_xact_id = UInt(0),
addr_block = UInt(busMasterBlock),
addr_beat = UInt(0),
data = UInt(1))
val poll_acq = Get(
client_xact_id = UInt(0),
addr_block = UInt(busMasterBlock),
addr_beat = UInt(0))
val check_acq = GetBlock(
client_xact_id = UInt(0),
addr_block = UInt(memStartBlock))
val acq = io.mem.head.acquire
val gnt = io.mem.head.grant
acq.valid := state.isOneOf(s_req_start, s_req_poll, s_req_check)
acq.bits := MuxLookup(state, check_acq, Seq(
s_req_start -> start_acq,
s_req_poll -> poll_acq))
gnt.ready := state.isOneOf(s_resp_start, s_resp_poll, s_resp_check)
val (get_beat, get_done) = Counter(
state === s_resp_check && gnt.valid, tlDataBeats)
when (state === s_idle) { state := s_req_start }
when (state === s_req_start && acq.ready) { state := s_resp_start }
when (state === s_resp_start && gnt.valid) { state := s_req_poll }
when (state === s_req_poll && acq.ready) { state := s_resp_poll }
when (state === s_resp_poll && gnt.valid) {
when (gnt.bits.data === UInt(0)) {
state := s_req_check
} .otherwise { state := s_req_poll }
}
when (state === s_req_check && acq.ready) { state := s_resp_check }
when (get_done) { state := s_done }
io.status.finished := state === s_done
assert(state =/= s_resp_check || !gnt.valid ||
gnt.bits.data === get_beat,
"BusMasterTest: data does not match")
}


@@ -0,0 +1,50 @@
package groundtest
import Chisel._
import uncore.tilelink._
import uncore.constants._
import uncore.agents._
import uncore.util._
import cde.{Parameters, Field}
class CacheFillTest(implicit p: Parameters) extends GroundTest()(p)
with HasTileLinkParameters {
val capacityKb: Int = p("L2_CAPACITY_IN_KB")
val nblocks = capacityKb * 1024 / p(CacheBlockBytes)
val s_start :: s_prefetch :: s_retrieve :: s_finished :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_start)
val active = state.isOneOf(s_prefetch, s_retrieve)
val xact_pending = Reg(init = UInt(0, tlMaxClientXacts))
val xact_id = PriorityEncoder(~xact_pending)
val (req_block, round_done) = Counter(io.mem.head.acquire.fire(), nblocks)
io.mem.head.acquire.valid := active && !xact_pending.andR
io.mem.head.acquire.bits := Mux(state === s_prefetch,
GetPrefetch(xact_id, UInt(memStartBlock) + req_block),
GetBlock(xact_id, UInt(memStartBlock) + req_block))
io.mem.head.grant.ready := xact_pending.orR
def add_pending(acq: DecoupledIO[Acquire]): UInt =
Mux(acq.fire(), UIntToOH(acq.bits.client_xact_id), UInt(0))
def remove_pending(gnt: DecoupledIO[Grant]): UInt = {
val last_grant = !gnt.bits.hasMultibeatData() ||
gnt.bits.addr_beat === UInt(tlDataBeats - 1)
~Mux(gnt.fire() && last_grant, UIntToOH(gnt.bits.client_xact_id), UInt(0))
}
xact_pending := (xact_pending |
add_pending(io.mem.head.acquire)) &
remove_pending(io.mem.head.grant)
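// xact_pending acts as a transaction scoreboard: a bit is set when its
// acquire fires and cleared once the last grant beat for that xact returns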
when (state === s_start) { state := s_prefetch }
when (state === s_prefetch && round_done) { state := s_retrieve }
when (state === s_retrieve && round_done) { state := s_finished }
io.status.finished := (state === s_finished)
io.status.timeout.valid := Bool(false)
io.status.error.valid := Bool(false)
}


@@ -0,0 +1,387 @@
package groundtest
import Chisel._
import uncore.tilelink._
import uncore.constants._
import junctions._
import rocket._
import scala.util.Random
import cde.{Parameters, Field}
case class ComparatorParameters(
targets: Seq[Long],
width: Int,
operations: Int,
atomics: Boolean,
prefetches: Boolean)
case object ComparatorKey extends Field[ComparatorParameters]
trait HasComparatorParameters {
implicit val p: Parameters
val comparatorParams = p(ComparatorKey)
val targets = comparatorParams.targets
val nTargets = targets.size
val targetWidth = comparatorParams.width
val nOperations = comparatorParams.operations
val atomics = comparatorParams.atomics
val prefetches = comparatorParams.prefetches
}
object LFSR64
{
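// A 64-bit maximal-length Fibonacci LFSR (taps 64, 63, 61, 60); the
// elaboration-time counter below seeds each instance differently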
private var counter = 0
private def next: Int = {
counter += 1
counter
}
def apply(increment: Bool = Bool(true), seed: Int = next): UInt =
{
val wide = 64
val lfsr = RegInit(UInt((seed * 0xDEADBEEFCAFEBAB1L) >>> 1, width = wide))
val xor = lfsr(0) ^ lfsr(1) ^ lfsr(3) ^ lfsr(4)
when (increment) { lfsr := Cat(xor, lfsr(wide-1,1)) }
lfsr
}
}
object NoiseMaker
{
def apply(wide: Int, increment: Bool = Bool(true)): UInt = {
val lfsrs = Seq.fill((wide+63)/64) { LFSR64(increment) }
Cat(lfsrs)(wide-1,0)
}
}
object MaskMaker
{
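// e.g. MaskMaker(8, UInt(3)) yields b00000111: ones in all positions below `bits`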
def apply(wide: Int, bits: UInt): UInt =
Vec.tabulate(wide) {UInt(_) < bits} .asUInt
}
class ComparatorSource(implicit val p: Parameters) extends Module
with HasComparatorParameters
with HasTileLinkParameters
{
val io = new Bundle {
val out = Decoupled(new Acquire)
val finished = Bool(OUTPUT)
}
// Output exactly nOperations of Acquires
val finished = RegInit(Bool(false))
val valid = RegInit(Bool(false))
valid := Bool(true)
io.finished := finished
io.out.valid := !finished && valid
// Generate random operand sizes
val inc = io.out.fire()
val raw_operand_size = NoiseMaker(2, inc) | UInt(0, M_SZ)
val max_operand_size = UInt(log2Up(tlDataBytes))
val get_operand_size = Mux(raw_operand_size > max_operand_size, max_operand_size, raw_operand_size)
val atomic_operand_size = UInt(2) + NoiseMaker(1, inc) // word or dword
// Generate random, but valid addr_bytes
val raw_addr_byte = NoiseMaker(tlByteAddrBits, inc)
val get_addr_byte = raw_addr_byte & ~MaskMaker(tlByteAddrBits, get_operand_size)
val atomic_addr_byte = raw_addr_byte & ~MaskMaker(tlByteAddrBits, atomic_operand_size)
// Only allow some of the possible choices (M_XA_MAXU untested)
val atomic_opcode = MuxLookup(NoiseMaker(3, inc), M_XA_SWAP, Array(
UInt("b000") -> M_XA_ADD,
UInt("b001") -> M_XA_XOR,
UInt("b010") -> M_XA_OR,
UInt("b011") -> M_XA_AND,
UInt("b100") -> M_XA_MIN,
UInt("b101") -> M_XA_MAX,
UInt("b110") -> M_XA_MINU,
UInt("b111") -> M_XA_SWAP))
// Addr_block range
val addr_block_mask = MaskMaker(tlBlockAddrBits, UInt(targetWidth-tlBeatAddrBits-tlByteAddrBits))
// Generate some random values
val addr_block = NoiseMaker(tlBlockAddrBits, inc) & addr_block_mask
val addr_beat = NoiseMaker(tlBeatAddrBits, inc)
val wmask = NoiseMaker(tlDataBytes, inc)
val data = NoiseMaker(tlDataBits, inc)
val client_xact_id = UInt(0) // filled by Client
// Random transactions
val get = Get(client_xact_id, addr_block, addr_beat, get_addr_byte, get_operand_size, Bool(false))
val getBlock = GetBlock(client_xact_id, addr_block)
val put = Put(client_xact_id, addr_block, addr_beat, data, Some(wmask))
val putBlock = PutBlock(client_xact_id, addr_block, UInt(0), data)
val putAtomic = if (atomics)
PutAtomic(client_xact_id, addr_block, addr_beat,
atomic_addr_byte, atomic_opcode, atomic_operand_size, data)
else put
val putPrefetch = if (prefetches)
PutPrefetch(client_xact_id, addr_block)
else put
val getPrefetch = if (prefetches)
GetPrefetch(client_xact_id, addr_block)
else get
val a_type_sel = NoiseMaker(3, inc)
// We must initially putBlock all of memory to have a consistent starting state
val final_addr_block = addr_block_mask + UInt(1)
val wipe_addr_block = RegInit(UInt(0, width = tlBlockAddrBits))
val done_wipe = wipe_addr_block === final_addr_block
io.out.bits := Mux(!done_wipe,
// Override whatever else we were going to do if we are wiping
PutBlock(client_xact_id, wipe_addr_block, UInt(0), data),
// Generate a random a_type
MuxLookup(a_type_sel, get, Array(
UInt("b000") -> get,
UInt("b001") -> getBlock,
UInt("b010") -> put,
UInt("b011") -> putBlock,
UInt("b100") -> putAtomic,
UInt("b101") -> getPrefetch,
UInt("b110") -> putPrefetch)))
val idx = Reg(init = UInt(0, log2Up(nOperations)))
when (io.out.fire()) {
when (idx === UInt(nOperations - 1)) { finished := Bool(true) }
when (!done_wipe) {
printf("[acq %d]: PutBlock(addr_block = %x, data = %x)\n",
idx, wipe_addr_block, data)
wipe_addr_block := wipe_addr_block + UInt(1)
} .otherwise {
switch (a_type_sel) {
is (UInt("b000")) {
printf("[acq %d]: Get(addr_block = %x, addr_beat = %x, addr_byte = %x, op_size = %x)\n",
idx, addr_block, addr_beat, get_addr_byte, get_operand_size)
}
is (UInt("b001")) {
printf("[acq %d]: GetBlock(addr_block = %x)\n", idx, addr_block)
}
is (UInt("b010")) {
printf("[acq %d]: Put(addr_block = %x, addr_beat = %x, data = %x, wmask = %x)\n",
idx, addr_block, addr_beat, data, wmask)
}
is (UInt("b011")) {
printf("[acq %d]: PutBlock(addr_block = %x, data = %x)\n", idx, addr_block, data)
}
is (UInt("b100")) {
if (atomics) {
printf("[acq %d]: PutAtomic(addr_block = %x, addr_beat = %x, addr_byte = %x, " +
"opcode = %x, op_size = %x, data = %x)\n",
idx, addr_block, addr_beat, atomic_addr_byte,
atomic_opcode, atomic_operand_size, data)
} else {
printf("[acq %d]: Put(addr_block = %x, addr_beat = %x, data = %x, wmask = %x)\n",
idx, addr_block, addr_beat, data, wmask)
}
}
is (UInt("b101")) {
if (prefetches) {
printf("[acq %d]: GetPrefetch(addr_block = %x)\n", idx, addr_block)
} else {
printf("[acq %d]: Get(addr_block = %x, addr_beat = %x, addr_byte = %x, op_size = %x)\n",
idx, addr_block, addr_beat, get_addr_byte, get_operand_size)
}
}
is (UInt("b110")) {
if (prefetches) {
printf("[acq %d]: PutPrefetch(addr_block = %x)\n", idx, addr_block)
} else {
printf("[acq %d]: Put(addr_block = %x, addr_beat = %x, data = %x, wmask = %x)\n",
idx, addr_block, addr_beat, data, wmask)
}
}
is (UInt("b111")) {
printf("[acq %d]: Get(addr_block = %x, addr_beat = %x, addr_byte = %x, op_size = %x)\n",
idx, addr_block, addr_beat, get_addr_byte, get_operand_size)
}
}
}
idx := idx + UInt(1)
}
}
class ComparatorClient(val target: Long)(implicit val p: Parameters) extends Module
with HasComparatorParameters
with HasTileLinkParameters
{
val io = new Bundle {
val in = Decoupled(new Acquire).flip
val tl = new ClientUncachedTileLinkIO()
val out = Decoupled(new Grant)
val finished = Bool(OUTPUT)
val timeout = Bool(OUTPUT)
}
val xacts = tlMaxClientXacts
val offset = (UInt(target) >> UInt(tlBeatAddrBits+tlByteAddrBits))
// Track the status of inflight requests
val issued = RegInit(Vec.fill(xacts) {Bool(false)})
val ready = RegInit(Vec.fill(xacts) {Bool(false)})
val result = Reg(Vec(xacts, new Grant))
val buffer = Queue(io.in, xacts)
val queue = Module(new Queue(io.tl.acquire.bits.client_xact_id, xacts))
val isMultiOut = buffer.bits.hasMultibeatData()
val isMultiIn = io.tl.grant.bits.hasMultibeatData()
val beatOut = RegInit(UInt(0, width = tlBeatAddrBits))
val lastBeat = UInt(tlDataBeats-1)
val isFirstBeatOut = Mux(isMultiOut, beatOut === UInt(0), Bool(true))
val isLastBeatOut = Mux(isMultiOut, beatOut === lastBeat, Bool(true))
val isLastBeatIn = Mux(isMultiIn, io.tl.grant.bits.addr_beat === lastBeat, Bool(true))
// Remove this once HoldUnless is in chisel3
def holdUnless[T <: Data](in : T, enable: Bool): T = Mux(!enable, RegEnable(in, enable), in)
// Potentially issue a request, using a free xact id
// NOTE: we may retract valid and change xact_id on a !ready (allowed by spec)
val allow_acq = NoiseMaker(1)(0) && issued.map(!_).reduce(_ || _)
val xact_id = holdUnless(PriorityEncoder(issued.map(!_)), isFirstBeatOut)
buffer.ready := allow_acq && io.tl.acquire.ready && isLastBeatOut
io.tl.acquire.valid := allow_acq && buffer.valid
io.tl.acquire.bits := buffer.bits
io.tl.acquire.bits.addr_block := buffer.bits.addr_block + offset
io.tl.acquire.bits.client_xact_id := xact_id
when (isMultiOut) {
val dataOut = (buffer.bits.data << beatOut) + buffer.bits.data // mix the data up a bit
io.tl.acquire.bits.addr_beat := beatOut
io.tl.acquire.bits.data := dataOut
}
when (io.tl.acquire.fire()) {
issued(xact_id) := isLastBeatOut
when (isMultiOut) { beatOut := beatOut + UInt(1) }
}
// Remember the xact ID so we can return results in-order
queue.io.enq.valid := io.tl.acquire.fire() && isLastBeatOut
queue.io.enq.bits := xact_id
assert (queue.io.enq.ready || !queue.io.enq.valid) // should be big enough
// Capture the results from the manager
io.tl.grant.ready := NoiseMaker(1)(0)
when (io.tl.grant.fire()) {
val id = io.tl.grant.bits.client_xact_id
assert (!ready(id)) // got same xact_id twice?
ready(id) := isLastBeatIn
result(id) := io.tl.grant.bits
}
// Bad xact_id returned if ready but not issued!
assert ((ready zip issued) map {case (r,i) => i || !r} reduce (_ && _))
// When we have the next grant result, send it to the sink
val next_id = queue.io.deq.bits
queue.io.deq.ready := io.out.ready && ready(next_id) // TODO: only compares last getBlock
io.out.valid := queue.io.deq.valid && ready(next_id)
io.out.bits := result(queue.io.deq.bits)
when (io.out.fire()) {
ready(next_id) := Bool(false)
issued(next_id) := Bool(false)
}
io.finished := !buffer.valid && !issued.reduce(_ || _)
val (idx, acq_done) = Counter(
io.tl.acquire.fire() && io.tl.acquire.bits.last(), nOperations)
debug(idx)
val timer = Module(new Timer(8192, xacts))
timer.io.start.valid := io.tl.acquire.fire() && io.tl.acquire.bits.first()
timer.io.start.bits := xact_id
timer.io.stop.valid := io.tl.grant.fire() && io.tl.grant.bits.first()
timer.io.stop.bits := io.tl.grant.bits.client_xact_id
assert(!timer.io.timeout.valid, "Comparator TL client timed out")
io.timeout := timer.io.timeout.valid
}
class ComparatorSink(implicit val p: Parameters) extends Module
with HasComparatorParameters
with HasTileLinkParameters
with HasGroundTestConstants
{
val io = new Bundle {
val in = Vec(nTargets, Decoupled(new Grant)).flip
val finished = Bool(OUTPUT)
val error = Valid(UInt(width = errorCodeBits))
}
// could use a smaller Queue here, but that would couple the targets' flow control together
val queues = io.in.map(Queue(_, nOperations))
io.finished := queues.map(!_.valid).reduce(_ && _)
val all_valid = queues.map(_.valid).reduce(_ && _)
queues.foreach(_.ready := all_valid)
val base = queues(0).bits
val idx = Reg(init = UInt(0, log2Up(nOperations)))
def check(g: Grant) = {
when (g.hasData() && base.data =/= g.data) {
printf("%d: %x =/= %x, g_type = %x\n", idx, base.data, g.data, g.g_type)
}
val assert_conds = Seq(
g.is_builtin_type,
base.g_type === g.g_type,
base.addr_beat === g.addr_beat || !g.hasData(),
base.data === g.data || !g.hasData())
assert (g.is_builtin_type, "grant not builtin")
assert (base.g_type === g.g_type, "g_type mismatch")
assert (base.addr_beat === g.addr_beat || !g.hasData(), "addr_beat mismatch")
assert (base.data === g.data || !g.hasData(), "data mismatch")
assert_conds.zipWithIndex.foreach { case (cond, i) =>
when (!cond) {
io.error.valid := Bool(true)
io.error.bits := UInt(i)
}
}
}
when (all_valid) {
when (base.hasData()) {
printf("[gnt %d]: g_type = %x, addr_beat = %x, data = %x\n",
idx, base.g_type, base.addr_beat, base.data)
} .otherwise {
printf("[gnt %d]: g_type = %x\n", idx, base.g_type)
}
queues.drop(1).map(_.bits).foreach(check)
idx := idx + UInt(1)
}
}
class ComparatorCore(implicit p: Parameters) extends GroundTest()(p)
with HasComparatorParameters
with HasTileLinkParameters {
require (io.mem.size == nTargets)
val source = Module(new ComparatorSource)
val sink = Module(new ComparatorSink)
val broadcast = Broadcaster(source.io.out, nTargets)
val clients = targets.zipWithIndex.map { case (target, index) =>
val client = Module(new ComparatorClient(target))
client.io.in <> broadcast(index)
io.mem(index) <> client.io.tl
sink.io.in(index) <> client.io.out
client
}
val client_timeouts = clients.map(_.io.timeout)
io.status.finished := source.io.finished && sink.io.finished && clients.map(_.io.finished).reduce(_ && _)
io.status.timeout.valid := client_timeouts.reduce(_ || _)
io.status.timeout.bits := MuxCase(UInt(0),
client_timeouts.zipWithIndex.map {
case (timeout, i) => (timeout -> UInt(i))
})
io.status.error := sink.io.error
}

@ -0,0 +1,212 @@
package groundtest
import Chisel._
import uncore.tilelink._
import uncore.devices.NTiles
import uncore.constants._
import junctions._
import rocket._
import scala.util.Random
import cde.{Parameters, Field}
case class GeneratorParameters(
maxRequests: Int,
startAddress: BigInt)
case object GeneratorKey extends Field[GeneratorParameters]
trait HasGeneratorParameters extends HasGroundTestParameters {
implicit val p: Parameters
val genParams = p(GeneratorKey)
val nGens = p(GroundTestKey).map(
cs => cs.uncached + cs.cached).reduce(_ + _)
val genTimeout = 8192
val maxRequests = genParams.maxRequests
val startAddress = genParams.startAddress
val genWordBits = 32
val genWordBytes = genWordBits / 8
val wordOffset = log2Ceil(genWordBytes)
val wordSize = UInt(log2Ceil(genWordBytes))
require(startAddress % BigInt(genWordBytes) == 0)
}
class UncachedTileLinkGenerator(id: Int)
(implicit p: Parameters) extends TLModule()(p) with HasGeneratorParameters {
private val tlBlockOffset = tlBeatAddrBits + tlByteAddrBits
val io = new Bundle {
val mem = new ClientUncachedTileLinkIO
val status = new GroundTestStatus
}
val (s_start :: s_put :: s_get :: s_finished :: Nil) = Enum(Bits(), 4)
val state = Reg(init = s_start)
val (req_cnt, req_wrap) = Counter(io.mem.grant.fire(), maxRequests)
val sending = Reg(init = Bool(false))
when (state === s_start) {
sending := Bool(true)
state := s_put
}
when (io.mem.acquire.fire()) { sending := Bool(false) }
when (io.mem.grant.fire()) { sending := Bool(true) }
when (req_wrap) { state := Mux(state === s_put, s_get, s_finished) }
val timeout = Timer(genTimeout, io.mem.acquire.fire(), io.mem.grant.fire())
assert(!timeout, s"Uncached generator ${id} timed out waiting for grant")
io.status.finished := (state === s_finished)
io.status.timeout.valid := timeout
io.status.timeout.bits := UInt(id)
val part_of_full_addr =
if (log2Ceil(nGens) > 0) {
Cat(UInt(id, log2Ceil(nGens)),
UInt(0, wordOffset))
} else {
UInt(0, wordOffset)
}
val full_addr = UInt(startAddress) + Cat(req_cnt, part_of_full_addr)
val addr_block = full_addr >> UInt(tlBlockOffset)
val addr_beat = full_addr(tlBlockOffset - 1, tlByteAddrBits)
val addr_byte = full_addr(tlByteAddrBits - 1, 0)
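  // Together these give each generator a disjoint, word-aligned address
  // stream: full_addr = startAddress + (req_cnt * nGens + id) * genWordBytes
  // (assuming nGens is a power of two). For example, with 2 generators and
  // 4-byte words, generator 1 touches startAddress + 0x4, + 0xc, + 0x14, ...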
val data_prefix = Cat(UInt(id, log2Up(nGens)), req_cnt)
val word_data = Wire(UInt(width = genWordBits))
word_data := Cat(data_prefix, part_of_full_addr)
val beat_data = Fill(tlDataBits / genWordBits, word_data)
val wshift = Cat(beatOffset(full_addr), UInt(0, wordOffset))
val wmask = Fill(genWordBits / 8, Bits(1, 1)) << wshift
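  // word_data is replicated across the full beat; wmask then enables only
  // the byte lanes of the addressed word within that beat.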
val put_acquire = Put(
client_xact_id = UInt(0),
addr_block = addr_block,
addr_beat = addr_beat,
data = beat_data,
wmask = Some(wmask),
alloc = Bool(false))
val get_acquire = Get(
client_xact_id = UInt(0),
addr_block = addr_block,
addr_beat = addr_beat,
addr_byte = addr_byte,
operand_size = wordSize,
alloc = Bool(false))
io.mem.acquire.valid := sending && !io.status.finished
io.mem.acquire.bits := Mux(state === s_put, put_acquire, get_acquire)
io.mem.grant.ready := !sending && !io.status.finished
def wordFromBeat(addr: UInt, dat: UInt) = {
val shift = Cat(beatOffset(addr), UInt(0, wordOffset + 3))
(dat >> shift)(genWordBits - 1, 0)
}
val data_mismatch = io.mem.grant.fire() && state === s_get &&
wordFromBeat(full_addr, io.mem.grant.bits.data) =/= word_data
io.status.error.valid := data_mismatch
io.status.error.bits := UInt(id)
assert(!data_mismatch,
s"Get received incorrect data in uncached generator ${id}")
def beatOffset(addr: UInt) = // TODO zero-width
if (tlByteAddrBits > wordOffset) addr(tlByteAddrBits - 1, wordOffset)
else UInt(0)
}
class HellaCacheGenerator(id: Int)
(implicit p: Parameters) extends L1HellaCacheModule()(p) with HasGeneratorParameters {
val io = new Bundle {
val mem = new HellaCacheIO
val status = new GroundTestStatus
}
val timeout = Timer(genTimeout, io.mem.req.fire(), io.mem.resp.valid)
assert(!timeout, s"Cached generator ${id} timed out waiting for response")
io.status.timeout.valid := timeout
io.status.timeout.bits := UInt(id)
val (s_start :: s_write :: s_read :: s_finished :: Nil) = Enum(Bits(), 4)
val state = Reg(init = s_start)
val sending = Reg(init = Bool(false))
val (req_cnt, req_wrap) = Counter(io.mem.resp.valid, maxRequests)
val part_of_req_addr =
if (log2Ceil(nGens) > 0) {
Cat(UInt(id, log2Ceil(nGens)),
UInt(0, wordOffset))
} else {
UInt(0, wordOffset)
}
val req_addr = UInt(startAddress) + Cat(req_cnt, part_of_req_addr)
val req_data = Cat(UInt(id, log2Up(nGens)), req_cnt, part_of_req_addr)
io.mem.req.valid := sending && !io.status.finished
io.mem.req.bits.addr := req_addr
io.mem.req.bits.data := req_data
io.mem.req.bits.typ := wordSize
io.mem.req.bits.cmd := Mux(state === s_write, M_XWR, M_XRD)
io.mem.req.bits.tag := UInt(0)
when (state === s_start) { sending := Bool(true); state := s_write }
when (io.mem.req.fire()) { sending := Bool(false) }
when (io.mem.resp.valid) { sending := Bool(true) }
when (req_wrap) { state := Mux(state === s_write, s_read, s_finished) }
io.status.finished := (state === s_finished)
def data_match(recv: Bits, expected: Bits): Bool = {
val recv_resized = Wire(Bits(width = genWordBits))
val exp_resized = Wire(Bits(width = genWordBits))
recv_resized := recv
exp_resized := expected
recv_resized === exp_resized
}
val data_mismatch = io.mem.resp.valid && io.mem.resp.bits.has_data &&
!data_match(io.mem.resp.bits.data, req_data)
io.status.error.valid := data_mismatch
io.status.error.bits := UInt(id)
assert(!data_mismatch,
s"Received incorrect data in cached generator ${id}")
}
class GeneratorTest(implicit p: Parameters)
extends GroundTest()(p) with HasGeneratorParameters {
val idStart = p(GroundTestKey).take(tileId)
.map(settings => settings.cached + settings.uncached)
.foldLeft(0)(_ + _)
val cached = List.tabulate(nCached) { i =>
val realId = idStart + i
Module(new HellaCacheGenerator(realId))
}
val uncached = List.tabulate(nUncached) { i =>
val realId = idStart + nCached + i
Module(new UncachedTileLinkGenerator(realId))
}
io.cache <> cached.map(_.io.mem)
io.mem <> uncached.map(_.io.mem)
val gen_debug = cached.map(_.io.status) ++ uncached.map(_.io.status)
io.status := DebugCombiner(gen_debug)
}

@ -0,0 +1,121 @@
package groundtest
import Chisel._
import uncore.tilelink._
import uncore.converters._
import junctions._
import cde.Parameters
class NastiGenerator(id: Int)(implicit val p: Parameters) extends Module
with HasNastiParameters
with HasMIFParameters
with HasAddrMapParameters
with HasGeneratorParameters {
val io = new Bundle {
val status = new GroundTestStatus
val mem = new NastiIO
}
val mifDataBytes = mifDataBits / 8
val (s_start :: s_write_addr :: s_write_data ::
s_read :: s_wait :: s_finish :: Nil) = Enum(Bits(), 6)
val state = Reg(init = s_start)
def ref_data(idx: UInt) = UInt(0x35abffcd, genWordBits) + (idx << UInt(3))
val part_of_addr =
if (log2Ceil(nGens) > 0) {
Cat(UInt(id, log2Ceil(nGens)),
UInt(0, wordOffset))
} else {
UInt(0, wordOffset)
}
val (write_idx, write_done) = Counter(io.mem.w.fire(), maxRequests)
val write_addr = UInt(startAddress) + Cat(write_idx, part_of_addr)
val write_data = Fill(mifDataBits / genWordBits, ref_data(write_idx))
val write_align = write_addr(log2Up(mifDataBytes) - 1, 0)
val write_mask = UInt((1 << genWordBytes) - 1, nastiWStrobeBits) << write_align
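  // Byte strobes: genWordBytes ones shifted to the addressed word's lanes
  // within the write beat.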
val (read_idx, read_done) = Counter(io.mem.ar.fire(), maxRequests)
val read_addr = UInt(startAddress) + Cat(read_idx, part_of_addr)
io.mem.aw.valid := (state === s_write_addr)
io.mem.aw.bits := NastiWriteAddressChannel(
id = write_idx(nastiXIdBits - 1, 0),
addr = write_addr,
len = UInt(0),
size = UInt(log2Ceil(genWordBytes)))
io.mem.w.valid := (state === s_write_data)
io.mem.w.bits := NastiWriteDataChannel(
data = write_data,
strb = Some(write_mask),
last = Bool(true))
io.mem.ar.valid := (state === s_read)
io.mem.ar.bits := NastiReadAddressChannel(
id = UInt(0),
addr = read_addr,
len = UInt(0),
size = UInt(log2Ceil(genWordBytes)))
io.mem.r.ready := Bool(true)
io.mem.b.ready := Bool(true)
io.status.finished := (state === s_finish)
val (read_resp_idx, read_resp_done) = Counter(io.mem.r.fire(), maxRequests)
val read_resp_addr = UInt(startAddress) + Cat(read_resp_idx, part_of_addr)
val read_offset = read_resp_addr(log2Up(nastiXDataBits / 8) - 1, 0)
val read_shift = Cat(read_offset, UInt(0, 3))
val read_data = (io.mem.r.bits.data >> read_shift)(genWordBits - 1, 0)
val data_mismatch = io.mem.r.valid && read_data =/= ref_data(read_resp_idx)
assert(!data_mismatch, "NASTI Test: results do not match")
io.status.error.valid := data_mismatch
io.status.error.bits := UInt(1)
when (state === s_start) { state := s_write_addr }
when (io.mem.aw.fire()) { state := s_write_data }
when (io.mem.w.fire()) { state := s_write_addr }
when (write_done) { state := s_read }
when (read_done) { state := s_wait }
when (read_resp_done) { state := s_finish }
val r_timer = Module(new Timer(1000, 2))
r_timer.io.start.valid := io.mem.ar.fire()
r_timer.io.start.bits := io.mem.ar.bits.id
r_timer.io.stop.valid := io.mem.r.fire() && io.mem.r.bits.last
r_timer.io.stop.bits := io.mem.r.bits.id
assert(!r_timer.io.timeout.valid, "NASTI Read timed out")
val w_timer = Module(new Timer(1000, 2))
w_timer.io.start.valid := io.mem.aw.fire()
w_timer.io.start.bits := io.mem.aw.bits.id
w_timer.io.stop.valid := io.mem.b.fire()
w_timer.io.stop.bits := io.mem.b.bits.id
assert(!w_timer.io.timeout.valid, "NASTI Write timed out")
io.status.timeout.valid := r_timer.io.timeout.valid || w_timer.io.timeout.valid
io.status.timeout.bits := Mux(r_timer.io.timeout.valid, UInt(1), UInt(2))
}
class NastiConverterTest(implicit p: Parameters) extends GroundTest()(p)
with HasNastiParameters {
require(tileSettings.uncached == 1 && tileSettings.cached == 0)
val genId = p(GroundTestKey).take(tileId)
.map(settings => settings.cached + settings.uncached)
.foldLeft(0)(_ + _)
val test = Module(new NastiGenerator(genId))
val converter = Module(new TileLinkIONastiIOConverter()(
p.alterPartial { case TLId => "Outermost" }))
converter.io.nasti <> test.io.mem
TileLinkWidthAdapter(io.mem.head, converter.io.tl)
io.status := test.io.status
}

@ -0,0 +1,776 @@
package groundtest
import Chisel._
import uncore.tilelink._
import uncore.constants._
import uncore.agents._
import uncore.util._
import junctions.{ParameterizedBundle, HasAddrMapParameters, Timer}
import rocket.HellaCacheIO
import cde.{Parameters, Field}
class RegressionIO(implicit val p: Parameters) extends ParameterizedBundle()(p) {
val start = Bool(INPUT)
val cache = new HellaCacheIO
val mem = new ClientUncachedTileLinkIO
val finished = Bool(OUTPUT)
val errored = Bool(OUTPUT)
}
abstract class Regression(implicit val p: Parameters)
extends Module with HasTileLinkParameters with HasAddrMapParameters {
val memStart = addrMap("mem").start
val memStartBlock = memStart >> p(CacheBlockOffsetBits)
val io = new RegressionIO
def disableCache() {
io.cache.req.valid := Bool(false)
io.cache.req.bits.addr := UInt(memStart)
io.cache.req.bits.typ := UInt(0)
io.cache.req.bits.cmd := M_XRD
io.cache.req.bits.tag := UInt(0)
io.cache.req.bits.data := Bits(0)
io.cache.req.bits.phys := Bool(true)
io.cache.invalidate_lr := Bool(false)
}
def disableMem() {
io.mem.acquire.valid := Bool(false)
io.mem.grant.ready := Bool(false)
}
}
/**
* This was a bug in which the TileLinkIONarrower logic screwed up
 * when a PutBlock request and a narrow Get request were sent to it at the
 * same time. Repeating that sequence enough times would fill up a queue
 * and deadlock the system.
*/
class IOGetAfterPutBlockRegression(implicit p: Parameters) extends Regression()(p) {
val nRuns = 7
val run = Reg(init = UInt(0, log2Up(nRuns + 1)))
val (put_beat, put_done) = Counter(
io.mem.acquire.fire() && io.mem.acquire.bits.hasData(), tlDataBeats)
val started = Reg(init = Bool(false))
val put_sent = Reg(init = Bool(false))
val get_sent = Reg(init = Bool(false))
val put_acked = Reg(init = Bool(false))
val get_acked = Reg(init = Bool(false))
val both_acked = put_acked && get_acked
when (!started && io.start) { started := Bool(true) }
io.mem.acquire.valid := !put_sent && started
io.mem.acquire.bits := PutBlock(
client_xact_id = UInt(0),
addr_block = UInt(memStartBlock),
addr_beat = put_beat,
data = UInt(0))
io.mem.grant.ready := Bool(true)
io.cache.req.valid := !get_sent && started
io.cache.req.bits.addr := UInt(addrMap("io:int:bootrom").start)
io.cache.req.bits.typ := UInt(log2Ceil(32 / 8))
io.cache.req.bits.cmd := M_XRD
io.cache.req.bits.tag := UInt(0)
io.cache.invalidate_lr := Bool(false)
when (put_done) { put_sent := Bool(true) }
when (io.cache.req.fire()) { get_sent := Bool(true) }
when (io.mem.grant.fire()) { put_acked := Bool(true) }
when (io.cache.resp.valid) { get_acked := Bool(true) }
when (both_acked) {
when (run < UInt(nRuns - 1)) {
put_sent := Bool(false)
get_sent := Bool(false)
}
put_acked := Bool(false)
get_acked := Bool(false)
run := run + UInt(1)
}
io.finished := (run === UInt(nRuns))
}
/* This was a bug with merging two PutBlocks to the same address in the L2.
* The transactor would start accepting beats of the second transaction but
* acknowledge both of them when the first one finished.
 * This left the transactor's state out of sync, since the next time around
 * it would start the put in the middle of the block. */
class PutBlockMergeRegression(implicit p: Parameters)
extends Regression()(p) with HasTileLinkParameters {
val s_idle :: s_put :: s_wait :: s_done :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_idle)
disableCache()
val l2params = p.alterPartial({ case CacheName => "L2Bank" })
val nSets = l2params(NSets)
val addr_blocks = Vec(Seq(0, 0, nSets).map(num => UInt(num + memStartBlock)))
val nSteps = addr_blocks.size
val (acq_beat, acq_done) = Counter(io.mem.acquire.fire(), tlDataBeats)
val (send_cnt, send_done) = Counter(acq_done, nSteps)
val (ack_cnt, ack_done) = Counter(io.mem.grant.fire(), nSteps)
io.mem.acquire.valid := (state === s_put)
io.mem.acquire.bits := PutBlock(
client_xact_id = send_cnt,
addr_block = addr_blocks(send_cnt),
addr_beat = acq_beat,
data = Cat(send_cnt, acq_beat))
io.mem.grant.ready := Bool(true)
when (state === s_idle && io.start) { state := s_put }
when (send_done) { state := s_wait }
when (ack_done) { state := s_done }
io.finished := (state === s_done)
}
/* Make sure the L2 does "the right thing" when a put is sent no-alloc but
* the block is already in cache. It should just treat the request as a
* regular allocating put */
class NoAllocPutHitRegression(implicit p: Parameters) extends Regression()(p) {
val (s_idle :: s_prefetch :: s_put :: s_get ::
s_wait :: s_done :: Nil) = Enum(Bits(), 6)
val state = Reg(init = s_idle)
val acq = io.mem.acquire.bits
val gnt = io.mem.grant.bits
val (put_beat, put_done) = Counter(io.mem.acquire.fire() && acq.hasData(), tlDataBeats)
val acked = Reg(init = UInt(0, tlDataBeats + 2))
val addr_block = UInt(memStartBlock + 2)
val test_data = UInt(0x3446)
val prefetch_acq = GetPrefetch(
client_xact_id = UInt(0),
addr_block = addr_block)
val put_acq = PutBlock(
client_xact_id = UInt(1),
addr_block = addr_block,
addr_beat = put_beat,
data = test_data,
alloc = Bool(false))
val get_acq = GetBlock(
client_xact_id = UInt(2),
addr_block = addr_block)
io.mem.acquire.valid := state.isOneOf(s_prefetch, s_get, s_put)
io.mem.acquire.bits := MuxCase(get_acq, Seq(
(state === s_prefetch) -> prefetch_acq,
(state === s_put) -> put_acq))
io.mem.grant.ready := Bool(true)
when (state === s_idle && io.start) { state := s_prefetch }
when (state === s_prefetch && io.mem.acquire.ready) { state := s_put }
when (put_done) { state := s_get }
when (state === s_get && io.mem.acquire.ready) { state := s_wait }
when (state === s_wait && acked.andR) { state := s_done }
when (io.mem.grant.fire()) {
switch (gnt.client_xact_id) {
is (UInt(0)) { acked := acked | UInt(1 << tlDataBeats) }
is (UInt(1)) { acked := acked | UInt(1 << (tlDataBeats + 1)) }
is (UInt(2)) { acked := acked | UIntToOH(gnt.addr_beat) }
}
}
val data_mismatch = io.mem.grant.fire() && gnt.hasData() && gnt.data =/= test_data
assert(!data_mismatch, "NoAllocPutHitRegression: data does not match")
io.finished := (state === s_done)
io.errored := data_mismatch
disableCache()
}
/** Make sure L2 does the right thing when multiple puts are sent for the
* same block, but only the first one has the alloc bit set. */
class MixedAllocPutRegression(implicit p: Parameters) extends Regression()(p) {
val (s_idle :: s_pf_send :: s_pf_wait :: s_put_send :: s_put_wait ::
s_get_send :: s_get_wait :: s_done :: Nil) = Enum(Bits(), 8)
val state = Reg(init = s_idle)
/** We have to test two cases: one when the block is already cached
* and one when the block is not yet cached.
* We use prefetching to assure the first case. */
val test_data = Vec(
UInt("h2222222211111111"),
UInt("h3333333333333333"),
UInt("h4444444444444444"),
UInt("h5555555555555555"))
val test_alloc = Vec(Bool(false), Bool(false), Bool(true), Bool(false))
val test_block = Vec(
Seq.fill(2) { UInt(memStartBlock + 15) } ++
Seq.fill(2) { UInt(memStartBlock + 16) })
val test_beat = Vec(UInt(0), UInt(2), UInt(1), UInt(2))
val (put_acq_id, put_acq_done) = Counter(
state === s_put_send && io.mem.acquire.ready, test_data.size)
val (put_gnt_cnt, put_gnt_done) = Counter(
state === s_put_wait && io.mem.grant.valid, test_data.size)
val (get_acq_id, get_acq_done) = Counter(
state === s_get_send && io.mem.acquire.ready, test_data.size)
val (get_gnt_cnt, get_gnt_done) = Counter(
state === s_get_wait && io.mem.grant.valid, test_data.size)
val pf_acquire = PutPrefetch(
client_xact_id = UInt(0),
addr_block = UInt(memStartBlock + 15))
val put_acquire = Put(
client_xact_id = put_acq_id,
addr_block = test_block(put_acq_id),
addr_beat = test_beat(put_acq_id),
data = test_data(put_acq_id),
alloc = test_alloc(put_acq_id))
val get_acquire = Get(
client_xact_id = get_acq_id,
addr_block = test_block(get_acq_id),
addr_beat = test_beat(get_acq_id))
io.mem.acquire.valid := state.isOneOf(s_pf_send, s_put_send, s_get_send)
io.mem.acquire.bits := MuxLookup(state, pf_acquire, Seq(
s_put_send -> put_acquire,
s_get_send -> get_acquire))
io.mem.grant.ready := state.isOneOf(s_pf_wait, s_put_wait, s_get_wait)
when (state === s_idle && io.start) { state := s_pf_send }
when (state === s_pf_send && io.mem.acquire.ready) { state := s_pf_wait }
when (state === s_pf_wait && io.mem.grant.valid) { state := s_put_send }
when (put_acq_done) { state := s_put_wait }
when (put_gnt_done) { state := s_get_send }
when (get_acq_done) { state := s_get_wait }
when (get_gnt_done) { state := s_done }
io.finished := (state === s_done)
val data_mismatch = state === s_get_wait && io.mem.grant.fire() &&
io.mem.grant.bits.data =/= test_data(io.mem.grant.bits.client_xact_id)
assert(!data_mismatch, "MixedAllocPutRegression: data mismatch")
io.errored := data_mismatch
disableCache()
}
/* Make sure each no-alloc put triggers a request to outer memory.
* Unfortunately, there's no way to verify that this works except by looking
* at the waveform */
class RepeatedNoAllocPutRegression(implicit p: Parameters) extends Regression()(p) {
disableCache()
val nPuts = 2
val (put_beat, put_done) = Counter(io.mem.acquire.fire(), tlDataBeats)
val (req_cnt, req_done) = Counter(put_done, nPuts)
val sending = Reg(init = Bool(false))
val acked = Reg(init = UInt(0, nPuts))
when (!sending && io.start) { sending := Bool(true) }
when (sending && req_done) { sending := Bool(false) }
io.mem.acquire.valid := sending
io.mem.acquire.bits := PutBlock(
client_xact_id = req_cnt,
addr_block = UInt(memStartBlock + 5),
addr_beat = put_beat,
data = Cat(req_cnt, UInt(0, 8)),
alloc = Bool(false))
io.mem.grant.ready := Bool(true)
when (io.mem.grant.fire()) {
acked := acked | UIntToOH(io.mem.grant.bits.client_xact_id)
}
io.finished := acked.andR
}
/* Make sure write masking works properly by writing a block of data
* piece by piece */
class WriteMaskedPutBlockRegression(implicit p: Parameters) extends Regression()(p) {
disableCache()
val (s_idle :: s_put_send :: s_put_ack :: s_stall ::
s_get_send :: s_get_ack :: s_done :: Nil) = Enum(Bits(), 7)
val state = Reg(init = s_idle)
val post_stall_state = Reg(init = s_idle)
val gnt = io.mem.grant.bits
val acq = io.mem.acquire.bits
val stage = Reg(init = UInt(0, 1))
val (put_beat, put_block_done) = Counter(
io.mem.acquire.fire() && acq.hasData(), tlDataBeats)
val put_data = UInt(0x30010040, tlDataBits) + (put_beat << UInt(2))
val put_acq = PutBlock(
client_xact_id = UInt(0),
addr_block = UInt(memStartBlock + 7),
addr_beat = put_beat,
data = Mux(put_beat(0) === stage, put_data, UInt(0)),
wmask = Some(Mux(put_beat(0) === stage, Acquire.fullWriteMask, Bits(0))))
val get_acq = GetBlock(
client_xact_id = UInt(0),
addr_block = UInt(memStartBlock + 6) + stage)
io.mem.acquire.valid := state.isOneOf(s_put_send, s_get_send)
io.mem.acquire.bits := Mux(state === s_get_send, get_acq, put_acq)
io.mem.grant.ready := state.isOneOf(s_put_ack, s_get_ack)
val (get_cnt, get_done) = Counter(
io.mem.grant.fire() && gnt.hasData(), tlDataBeats)
val get_data = UInt(0x30010040, tlDataBits) + (get_cnt << UInt(2))
val (stall_cnt, stall_done) = Counter(state === s_stall, 16)
when (state === s_idle && io.start) { state := s_put_send }
when (put_block_done) { state := s_put_ack }
when (state === s_put_ack && io.mem.grant.valid) {
post_stall_state := s_get_send
state := s_stall
}
when (stall_done) { state := post_stall_state }
when (state === s_get_send && io.mem.acquire.ready) { state := s_get_ack }
when (get_done) {
// do a read in-between the two put-blocks to overwrite the data buffer
when (stage === UInt(0)) {
stage := stage + UInt(1)
post_stall_state := s_put_send
state := s_stall
} .otherwise { state := s_done }
}
io.finished := (state === s_done)
val data_mismatch = io.mem.grant.fire() && io.mem.grant.bits.hasData() &&
stage =/= UInt(0) && io.mem.grant.bits.data =/= get_data
assert(!data_mismatch, "WriteMaskedPutBlockRegression: data does not match")
io.errored := data_mismatch
}
/* Make sure a prefetch that hits returns immediately. */
class PrefetchHitRegression(implicit p: Parameters) extends Regression()(p) {
disableCache()
val sending = Reg(init = Bool(false))
val nPrefetches = 2
val (pf_cnt, pf_done) = Counter(io.mem.acquire.fire(), nPrefetches)
val acked = Reg(init = UInt(0, nPrefetches))
val acq_bits = Vec(
PutPrefetch(client_xact_id = UInt(0), addr_block = UInt(memStartBlock + 12)),
GetPrefetch(client_xact_id = UInt(1), addr_block = UInt(memStartBlock + 12)))
io.mem.acquire.valid := sending
io.mem.acquire.bits := acq_bits(pf_cnt)
io.mem.grant.ready := Bool(true)
when (io.mem.grant.fire()) {
acked := acked | UIntToOH(io.mem.grant.bits.client_xact_id)
}
when (!sending && io.start) { sending := Bool(true) }
when (sending && pf_done) { sending := Bool(false) }
io.finished := acked.andR
io.errored := Bool(false)
}
/* This tests the sort of access pattern that Hwacha uses.
* Instead of using PutBlock/GetBlock, it uses word-sized puts and gets
* to the same block.
* Each request has the same client_xact_id, but there are multiple in flight.
* The responses therefore must come back in the order they are sent. */
class SequentialSameIdGetRegression(implicit p: Parameters) extends Regression()(p) {
disableCache()
val sending = Reg(init = Bool(false))
val finished = Reg(init = Bool(false))
val (send_cnt, send_done) = Counter(io.mem.acquire.fire(), tlDataBeats)
val (recv_cnt, recv_done) = Counter(io.mem.grant.fire(), tlDataBeats)
when (!sending && io.start) { sending := Bool(true) }
when (send_done) { sending := Bool(false) }
when (recv_done) { finished := Bool(true) }
io.mem.acquire.valid := sending
io.mem.acquire.bits := Get(
client_xact_id = UInt(0),
addr_block = UInt(memStartBlock + 9),
addr_beat = send_cnt)
io.mem.grant.ready := !finished
io.finished := finished
val beat_mismatch = io.mem.grant.fire() && io.mem.grant.bits.addr_beat =/= recv_cnt
assert(!beat_mismatch, "SequentialSameIdGetRegression: grant received out of order")
io.errored := beat_mismatch
}
/* Test that a writeback will occur by writing nWays + 1 blocks to the same
* set. This assumes that there is only a single cache bank. If we want to
* test multibank configurations, we'll have to think of some other way to
* determine which banks are conflicting */
class WritebackRegression(implicit p: Parameters) extends Regression()(p) {
disableCache()
val l2params = p.alterPartial({ case CacheName => "L2Bank" })
val nSets = l2params(NSets)
val nWays = l2params(NWays)
val addr_blocks = Vec.tabulate(nWays + 1) { i => UInt(memStartBlock + i * nSets) }
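  // Blocks spaced nSets apart map to the same cache set, so nWays + 1 of
  // them cannot all be resident at once and a writeback must occur.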
val data = Vec.tabulate(nWays + 1) { i => UInt((i + 1) * 1423) }
val (put_beat, put_done) = Counter(
io.mem.acquire.fire() && io.mem.acquire.bits.hasData(), tlDataBeats)
val (get_beat, get_done) = Counter(
io.mem.grant.fire() && io.mem.grant.bits.hasData(), tlDataBeats)
val (put_cnt, _) = Counter(put_done, nWays + 1)
val (get_cnt, _) = Counter(
io.mem.acquire.fire() && !io.mem.acquire.bits.hasData(), nWays + 1)
val (ack_cnt, ack_done) = Counter(
io.mem.grant.fire() && !io.mem.grant.bits.hasData() || get_done, nWays + 1)
val s_idle :: s_put :: s_get :: s_done :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_idle)
val sending = Reg(init = Bool(false))
io.mem.acquire.valid := sending
io.mem.acquire.bits := Mux(state === s_put,
PutBlock(
client_xact_id = UInt(0),
addr_block = addr_blocks(put_cnt),
addr_beat = put_beat,
data = data(put_cnt)),
GetBlock(
client_xact_id = UInt(0),
addr_block = addr_blocks(get_cnt)))
io.mem.grant.ready := !sending
when (state === s_idle && io.start) { state := s_put; sending := Bool(true) }
when (put_done || state === s_get && io.mem.acquire.fire()) {
sending := Bool(false)
}
when (get_done && !ack_done || state === s_put && io.mem.grant.fire()) {
sending := Bool(true)
}
when (ack_done) { state := Mux(state === s_put, s_get, s_done) }
io.finished := (state === s_done)
val data_mismatch = io.mem.grant.fire() && io.mem.grant.bits.hasData() &&
io.mem.grant.bits.data =/= data(ack_cnt)
assert(!data_mismatch, "WritebackRegression: incorrect data")
io.errored := data_mismatch
}
class ReleaseRegression(implicit p: Parameters) extends Regression()(p) {
disableMem()
val l1params = p.alterPartial({ case CacheName => "L1D" })
val nSets = l1params(NSets)
val nWays = l1params(NWays)
val blockOffset = l1params(CacheBlockOffsetBits)
val startBlock = memStartBlock + 10
val addr_blocks = Vec.tabulate(nWays + 1) { i => UInt(startBlock + i * nSets) }
val data = Vec.tabulate(nWays + 1) { i => UInt((i + 1) * 1522) }
val (req_idx, req_done) = Counter(io.cache.req.fire(), nWays + 1)
val (resp_idx, resp_done) = Counter(io.cache.resp.valid, nWays + 1)
val sending = Reg(init = Bool(false))
val s_idle :: s_write :: s_read :: s_done :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_idle)
io.cache.req.valid := sending && state.isOneOf(s_write, s_read)
io.cache.req.bits.addr := Cat(addr_blocks(req_idx), UInt(0, blockOffset))
io.cache.req.bits.typ := UInt(log2Ceil(64 / 8))
io.cache.req.bits.cmd := Mux(state === s_write, M_XWR, M_XRD)
io.cache.req.bits.tag := UInt(0)
io.cache.req.bits.data := data(req_idx)
io.cache.req.bits.phys := Bool(true)
io.cache.invalidate_lr := Bool(false)
when (state === s_idle && io.start) {
sending := Bool(true)
state := s_write
}
when (resp_done) { state := Mux(state === s_write, s_read, s_done) }
when (io.cache.req.fire()) { sending := Bool(false) }
when (io.cache.resp.valid) { sending := Bool(true) }
io.finished := (state === s_done)
val data_mismatch = io.cache.resp.valid && io.cache.resp.bits.has_data &&
io.cache.resp.bits.data =/= data(resp_idx)
assert(!data_mismatch, "ReleaseRegression: data mismatch")
io.errored := data_mismatch
}
class PutBeforePutBlockRegression(implicit p: Parameters) extends Regression()(p) {
val (s_idle :: s_put :: s_putblock :: s_wait ::
s_finished :: Nil) = Enum(Bits(), 5)
val state = Reg(init = s_idle)
disableCache()
val (put_block_beat, put_block_done) = Counter(
state === s_putblock && io.mem.acquire.ready, tlDataBeats)
val put_acquire = Put(
client_xact_id = UInt(0),
addr_block = UInt(memStartBlock),
addr_beat = UInt(0),
data = UInt(0),
wmask = Some(UInt((1 << 8) - 1)))
val put_block_acquire = PutBlock(
client_xact_id = UInt(1),
addr_block = UInt(memStartBlock + 1),
addr_beat = put_block_beat,
data = UInt(0))
val put_acked = Reg(init = UInt(0, 2))
val (ack_cnt, all_acked) = Counter(io.mem.grant.fire(), 2)
io.mem.acquire.valid := state.isOneOf(s_put, s_putblock)
io.mem.acquire.bits := Mux(state === s_put, put_acquire, put_block_acquire)
io.mem.grant.ready := (state === s_wait)
when (state === s_idle && io.start) { state := s_put }
when (state === s_put && io.mem.acquire.ready) { state := s_putblock }
when (put_block_done) { state := s_wait }
when (all_acked) { state := s_finished }
io.finished := (state === s_finished)
io.errored := Bool(false)
}
/**
* Make sure that multiple gets to the same line and beat are merged
* correctly, even if it is a cache miss.
*/
class MergedGetRegression(implicit p: Parameters) extends Regression()(p) {
disableCache()
val l2params = p.alterPartial({ case CacheName => "L2Bank" })
val nSets = l2params(NSets)
val nWays = l2params(NWays)
val (s_idle :: s_put :: s_get :: s_done :: Nil) = Enum(Bits(), 4)
val state = Reg(init = s_idle)
// Write NWays + 1 different conflicting lines to force an eviction of the first line
val (put_acq_cnt, put_acq_done) = Counter(state === s_put && io.mem.acquire.fire(), nWays + 1)
val (put_gnt_cnt, put_gnt_done) = Counter(state === s_put && io.mem.grant.fire(), nWays + 1)
val put_addr = UInt(memStartBlock) + Cat(put_acq_cnt, UInt(0, log2Up(nSets)))
val (get_acq_cnt, get_acq_done) = Counter(state === s_get && io.mem.acquire.fire(), 2)
val (get_gnt_cnt, get_gnt_done) = Counter(state === s_get && io.mem.grant.fire(), 2)
val sending = Reg(init = Bool(false))
when (state === s_idle && io.start) { state := s_put; sending := Bool(true) }
when (state === s_put) {
when (io.mem.acquire.fire()) { sending := Bool(false) }
when (io.mem.grant.fire()) { sending := Bool(true) }
when (put_gnt_done) { state := s_get }
}
when (state === s_get) {
when (get_acq_done) { sending := Bool(false) }
when (get_gnt_done) { state := s_done }
}
io.mem.acquire.valid := sending
io.mem.acquire.bits := Mux(state === s_put,
Put(
client_xact_id = UInt(0),
addr_block = put_addr,
addr_beat = UInt(3),
data = UInt("hdabb9321")),
Get(
client_xact_id = get_acq_cnt,
addr_block = UInt(memStartBlock),
addr_beat = UInt(3)))
io.mem.grant.ready := !sending
val data_mismatch = io.mem.grant.valid && io.mem.grant.bits.hasData() &&
io.mem.grant.bits.data =/= UInt("hdabb9321")
assert(!data_mismatch, "RepeatedGetRegression: wrong data back")
io.finished := state === s_done
io.errored := data_mismatch
}
/**
* Make sure that multiple puts to the same line and beat are merged
* correctly, even if there is a release from the L1
*/
class MergedPutRegression(implicit p: Parameters) extends Regression()(p)
with HasTileLinkParameters {
val (s_idle :: s_cache_req :: s_cache_wait ::
s_put :: s_get :: s_done :: Nil) = Enum(Bits(), 6)
val state = Reg(init = s_idle)
io.cache.req.valid := (state === s_cache_req)
io.cache.req.bits.cmd := M_XWR
io.cache.req.bits.typ := UInt(log2Ceil(64 / 8))
io.cache.req.bits.addr := UInt(memStart)
io.cache.req.bits.data := UInt(1)
io.cache.req.bits.tag := UInt(0)
val sending = Reg(init = Bool(false))
val delaying = Reg(init = Bool(false))
val (put_cnt, put_done) = Counter(io.mem.acquire.fire(), tlMaxClientXacts)
val (delay_cnt, delay_done) = Counter(delaying, 8)
val put_acked = Reg(UInt(width = 3), init = UInt(0))
io.mem.acquire.valid := sending && !delaying
io.mem.acquire.bits := Mux(state === s_put,
Put(
client_xact_id = put_cnt,
addr_block = UInt(memStartBlock),
addr_beat = UInt(0),
data = put_cnt + UInt(2)),
Get(
client_xact_id = UInt(0),
addr_block = UInt(memStartBlock),
addr_beat = UInt(0)))
io.mem.grant.ready := Bool(true)
when (state === s_idle && io.start) { state := s_cache_req }
when (io.cache.req.fire()) { state := s_cache_wait }
when (io.cache.resp.valid) { state := s_put; sending := Bool(true) }
when (io.mem.acquire.fire()) {
delaying := Bool(true)
when (put_done || state === s_get) { sending := Bool(false) }
}
when (delay_done) { delaying := Bool(false) }
when (io.mem.grant.fire()) {
when (state === s_put) {
put_acked := put_acked | UIntToOH(io.mem.grant.bits.client_xact_id)
}
when (state === s_get) { state := s_done }
}
when (state === s_put && put_acked.andR) {
state := s_get
sending := Bool(true)
}
val expected_data = UInt(2 + tlMaxClientXacts - 1)
val data_mismatch = io.mem.grant.valid && io.mem.grant.bits.hasData() &&
io.mem.grant.bits.data =/= expected_data
assert(!data_mismatch, "MergedPutRegression: data mismatch")
io.finished := (state === s_done)
io.errored := data_mismatch
}
object RegressionTests {
def cacheRegressions(implicit p: Parameters) = Seq(
Module(new PutBlockMergeRegression),
Module(new NoAllocPutHitRegression),
Module(new RepeatedNoAllocPutRegression),
Module(new WriteMaskedPutBlockRegression),
Module(new PrefetchHitRegression),
Module(new SequentialSameIdGetRegression),
Module(new WritebackRegression),
Module(new PutBeforePutBlockRegression),
Module(new MixedAllocPutRegression),
Module(new ReleaseRegression),
Module(new MergedGetRegression),
Module(new MergedPutRegression))
def broadcastRegressions(implicit p: Parameters) = Seq(
Module(new IOGetAfterPutBlockRegression),
Module(new WriteMaskedPutBlockRegression),
Module(new PutBeforePutBlockRegression),
Module(new ReleaseRegression))
}
case object GroundTestRegressions extends Field[Parameters => Seq[Regression]]
class RegressionTest(implicit p: Parameters) extends GroundTest()(p) {
val regressions = p(GroundTestRegressions)(p)
val regress_idx = Reg(init = UInt(0, log2Up(regressions.size + 1)))
val cur_finished = Wire(init = Bool(false))
val all_done = (regress_idx === UInt(regressions.size))
val start = Reg(init = Bool(true))
// default output values
io.mem.head.acquire.valid := Bool(false)
io.mem.head.acquire.bits := GetBlock(
client_xact_id = UInt(0),
addr_block = UInt(0))
io.mem.head.grant.ready := Bool(false)
io.cache.head.req.valid := Bool(false)
io.cache.head.req.bits.addr := UInt(0)
io.cache.head.req.bits.typ := UInt(log2Ceil(64 / 8))
io.cache.head.req.bits.cmd := M_XRD
io.cache.head.req.bits.tag := UInt(0)
io.cache.head.req.bits.phys := Bool(true)
io.cache.head.req.bits.data := UInt(0)
io.cache.head.invalidate_lr := Bool(false)
regressions.zipWithIndex.foreach { case (regress, i) =>
val me = regress_idx === UInt(i)
regress.io.start := me && start
regress.io.mem.acquire.ready := io.mem.head.acquire.ready && me
regress.io.mem.grant.valid := io.mem.head.grant.valid && me
regress.io.mem.grant.bits := io.mem.head.grant.bits
regress.io.cache.req.ready := io.cache.head.req.ready && me
regress.io.cache.resp.valid := io.cache.head.resp.valid && me
regress.io.cache.resp.bits := io.cache.head.resp.bits
when (me) {
io.mem.head.acquire.valid := regress.io.mem.acquire.valid
io.mem.head.acquire.bits := regress.io.mem.acquire.bits
io.mem.head.grant.ready := regress.io.mem.grant.ready
io.cache.head.req.valid := regress.io.cache.req.valid
io.cache.head.req.bits := regress.io.cache.req.bits
io.cache.head.invalidate_lr := regress.io.cache.invalidate_lr
io.status.error.valid := regress.io.errored
io.status.error.bits := UInt(i)
cur_finished := regress.io.finished
}
when (regress.io.start) {
printf(s"Starting regression ${regress.getClass.getSimpleName}\n")
}
}
when (cur_finished && !all_done) {
start := Bool(true)
regress_idx := regress_idx + UInt(1)
}
when (start) { start := Bool(false) }
val timeout = Timer(5000, start, cur_finished)
assert(!timeout, "Regression timed out")
io.status.finished := all_done
io.status.timeout.valid := timeout
io.status.timeout.bits := UInt(0)
assert(!(all_done && io.mem.head.grant.valid),
"Getting grant after test completion")
when (all_done) {
io.status.error.valid := io.mem.head.grant.valid
io.status.error.bits := UInt(regressions.size)
}
}

@ -0,0 +1,139 @@
package groundtest
import Chisel._
import rocket._
import uncore.tilelink._
import junctions._
import scala.util.Random
import scala.collection.mutable.ListBuffer
import cde.{Parameters, Field}
case object BuildGroundTest extends Field[Parameters => GroundTest]
case class GroundTestTileSettings(
uncached: Int = 0, cached: Int = 0, ptw: Int = 0, maxXacts: Int = 1)
case object GroundTestKey extends Field[Seq[GroundTestTileSettings]]
case object GroundTestId extends Field[Int]
trait HasGroundTestConstants {
val timeoutCodeBits = 4
val errorCodeBits = 4
}
trait HasGroundTestParameters extends HasAddrMapParameters {
implicit val p: Parameters
val tileId = p(GroundTestId)
val tileSettings = p(GroundTestKey)(tileId)
val nUncached = tileSettings.uncached
val nCached = tileSettings.cached
val nPTW = tileSettings.ptw
val memStart = addrMap("mem").start
val memStartBlock = memStart >> p(CacheBlockOffsetBits)
}
class DummyPTW(n: Int)(implicit p: Parameters) extends CoreModule()(p) {
val io = new Bundle {
val requestors = Vec(n, new TLBPTWIO).flip
}
val req_arb = Module(new RRArbiter(new PTWReq, n))
req_arb.io.in <> io.requestors.map(_.req)
req_arb.io.out.ready := Bool(true)
def vpn_to_ppn(vpn: UInt): UInt = vpn(ppnBits - 1, 0)
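  // Identity mapping: the PPN is simply the low-order bits of the VPN.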
class QueueChannel extends ParameterizedBundle()(p) {
val ppn = UInt(width = ppnBits)
val chosen = UInt(width = log2Up(n))
}
val s1_ppn = vpn_to_ppn(req_arb.io.out.bits.addr)
val s2_ppn = RegEnable(s1_ppn, req_arb.io.out.valid)
val s2_chosen = RegEnable(req_arb.io.chosen, req_arb.io.out.valid)
val s2_valid = Reg(next = req_arb.io.out.valid)
val s2_resp = Wire(new PTWResp)
s2_resp.pte.ppn := s2_ppn
s2_resp.pte.reserved_for_software := UInt(0)
s2_resp.pte.d := Bool(true)
s2_resp.pte.a := Bool(false)
s2_resp.pte.g := Bool(false)
s2_resp.pte.u := Bool(true)
s2_resp.pte.r := Bool(true)
s2_resp.pte.w := Bool(true)
s2_resp.pte.x := Bool(false)
s2_resp.pte.v := Bool(true)
io.requestors.zipWithIndex.foreach { case (requestor, i) =>
requestor.resp.valid := s2_valid && s2_chosen === UInt(i)
requestor.resp.bits := s2_resp
requestor.status.vm := UInt("b01000")
requestor.status.prv := UInt(PRV.S)
requestor.invalidate := Bool(false)
}
}
class GroundTestStatus extends Bundle with HasGroundTestConstants {
val finished = Bool(OUTPUT)
val timeout = Valid(UInt(width = timeoutCodeBits))
val error = Valid(UInt(width = errorCodeBits))
}
class GroundTestIO(implicit val p: Parameters) extends ParameterizedBundle()(p)
with HasGroundTestParameters {
val cache = Vec(nCached, new HellaCacheIO)
val mem = Vec(nUncached, new ClientUncachedTileLinkIO)
val ptw = Vec(nPTW, new TLBPTWIO)
val status = new GroundTestStatus
}
abstract class GroundTest(implicit val p: Parameters) extends Module
with HasGroundTestParameters {
val io = new GroundTestIO
}
class GroundTestTile(resetSignal: Bool)
(implicit val p: Parameters)
extends Tile(resetSignal = resetSignal)(p)
with HasGroundTestParameters {
override val io = new TileIO {
val success = Bool(OUTPUT)
}
val test = p(BuildGroundTest)(dcacheParams)
val ptwPorts = ListBuffer.empty ++= test.io.ptw
val memPorts = ListBuffer.empty ++= test.io.mem
if (nCached > 0) {
val dcache_io = HellaCache(p(DCacheKey))(dcacheParams)
val dcacheArb = Module(new HellaCacheArbiter(nCached)(dcacheParams))
dcacheArb.io.requestor.zip(test.io.cache).foreach {
case (requestor, cache) =>
val dcacheIF = Module(new SimpleHellaCacheIF()(dcacheParams))
dcacheIF.io.requestor <> cache
requestor <> dcacheIF.io.cache
}
dcache_io.cpu <> dcacheArb.io.mem
io.cached.head <> dcache_io.mem
// SimpleHellaCacheIF leaves invalidate_lr dangling, so we wire it to false
dcache_io.cpu.invalidate_lr := Bool(false)
ptwPorts += dcache_io.ptw
}
if (ptwPorts.size > 0) {
val ptw = Module(new DummyPTW(ptwPorts.size))
ptw.io.requestors <> ptwPorts
}
require(memPorts.size == io.uncached.size)
if (memPorts.size > 0) {
io.uncached <> memPorts
}
io.success := test.io.status.finished
}

@ -0,0 +1,629 @@
// This file was originally written by Matthew Naylor, University of
// Cambridge, based on code already present in the groundtest repo.
//
// This software was partly developed by the University of Cambridge
// Computer Laboratory under DARPA/AFRL contract FA8750-10-C-0237
// ("CTSRD"), as part of the DARPA CRASH research programme.
//
// This software was partly developed by the University of Cambridge
// Computer Laboratory under DARPA/AFRL contract FA8750-11-C-0249
// ("MRC2"), as part of the DARPA MRC research programme.
//
// This software was partly developed by the University of Cambridge
// Computer Laboratory as part of the Rigorous Engineering of
// Mainstream Systems (REMS) project, funded by EPSRC grant
// EP/K008528/1.
package groundtest
import Chisel._
import uncore.tilelink._
import uncore.constants._
import uncore.devices.NTiles
import junctions._
import rocket._
import scala.util.Random
import cde.{Parameters, Field}
// =======
// Outline
// =======
// Generate memory traces that result from random sequences of memory
// operations. These traces can then be validated by an external
// tool. A trace is simply a sequence of memory requests and
// responses.
// ==========================
// Trace-generator parameters
// ==========================
// Compile-time parameters:
//
// * The id of the generator (there may be more than one in a
// multi-core system).
//
// * The total number of generators present in the system.
//
// * The desired number of requests to be sent by each generator.
//
// * A bag of physical addresses, shared by all cores, from which an
// address can be drawn when generating a fresh request.
//
// * A number of random 'extra addresses', local to each core, from
// which an address can be drawn when generating a fresh request.
// (This is a way to generate a wider range of addresses without having
// to repeatedly recompile with a different address bag.)
case object AddressBag extends Field[List[BigInt]]
trait HasTraceGenParams {
implicit val p: Parameters
val numGens = p(NTiles)
val numBitsInId = log2Up(numGens)
val numReqsPerGen = p(GeneratorKey).maxRequests
val memRespTimeout = 8192
val numBitsInWord = p(XLen)
val numBytesInWord = numBitsInWord / 8
val numBitsInWordOffset = log2Up(numBytesInWord)
val addressBag = p(AddressBag)
val addressBagLen = addressBag.length
val logAddressBagLen = log2Up(addressBagLen)
val genExtraAddrs = false
val logNumExtraAddrs = 1
val numExtraAddrs = 1 << logNumExtraAddrs
val maxTags = 8
require(numBytesInWord * 8 == numBitsInWord)
require((1 << logAddressBagLen) == addressBagLen)
}
// ============
// Trace format
// ============
// Let <id> denote a generator id;
// <addr> denote an address (in hex);
// <data> denote a value that is stored at an address;
// <tag> denote a unique request/response id;
// and <time> denote an integer representing a cycle-count.
// Each line in the trace takes one of the following formats.
//
// <id>: load-req <addr> #<tag> @<time>
// <id>: load-reserve-req <addr> #<tag> @<time>
// <id>: store-req <data> <addr> #<tag> @<time>
// <id>: store-cond-req <data> <addr> #<tag> @<time>
// <id>: swap-req <data> <addr> #<tag> @<time>
// <id>: resp <data> #<tag> @<time>
// <id>: fence-req @<time>
// <id>: fence-resp @<time>
// NOTE: The (address, value) pair of every generated store is unique,
// i.e. the same value is never written to the same address twice.
// This aids trace validation.
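// For example (illustrative values only), a two-core run might contain:
//
//   0: store-req 42 0x100008 #3 @422
//   1: load-req 0x100008 #1 @431
//   0: resp 42 #3 @437
//   1: resp 42 #1 @449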
// ============
// Random seeds
// ============
// The generator employs "uninitialised registers" to seed its PRNGs;
// these are randomly initialised by the C++ backend. This means that
// the "-s" command-line argument to the Rocket emulator can be used
// to generate new traces, or to replay specific ones.
// ===========
// Tag manager
// ===========
// This is used to obtain unique tags for memory requests: each
// request must carry a unique tag since responses can come back
// out-of-order.
//
// The tag manager can be viewed as a set of tags. The user can take
// a tag out of the set (if there is one available) and later put it
// back.
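// A minimal usage sketch (the surrounding signal names are illustrative):
//
//   val tagMan = Module(new TagMan(log2Ceil(maxTags)))
//   tagMan.io.take := wantToSend && tagMan.io.available
//   req.tag := tagMan.io.tagOut
//   tagMan.io.put := resp.valid
//   tagMan.io.tagIn := resp.bits.tag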
class TagMan(val logNumTags : Int) extends Module {
val io = new Bundle {
// Is there a tag available?
val available = Bool(OUTPUT)
// If so, which one?
val tagOut = UInt(OUTPUT, logNumTags)
// User pulses this to take the currently available tag
val take = Bool(INPUT)
// User pulses this to put a tag back
val put = Bool(INPUT)
// And the tag put back is
val tagIn = UInt(INPUT, logNumTags)
}
// Total number of tags available
val numTags = 1 << logNumTags
// For each tag, record whether or not it is in use
val inUse = List.fill(numTags)(Reg(init = Bool(false)))
// Mapping from each tag to its 'inUse' bit
val inUseMap = (0 to numTags-1).map(i => UInt(i)).zip(inUse)
// Next tag to offer
val nextTag = Reg(init = UInt(0, logNumTags))
io.tagOut := nextTag
// Is the next tag available?
io.available := ~MuxLookup(nextTag, Bool(true), inUseMap)
// When user takes a tag
when (io.take) {
for ((i, b) <- inUseMap) {
when (i === nextTag) { b := Bool(true) }
}
nextTag := nextTag + UInt(1)
}
// When user puts a tag back
when (io.put) {
for ((i, b) <- inUseMap) {
when (i === io.tagIn) { b := Bool(false) }
}
}
}
// ===============
// Trace generator
// ===============
class TraceGenerator(id: Int)
(implicit p: Parameters) extends L1HellaCacheModule()(p)
with HasTraceGenParams {
val io = new Bundle {
val finished = Bool(OUTPUT)
val timeout = Bool(OUTPUT)
val mem = new HellaCacheIO
}
val reqTimer = Module(new Timer(8192, maxTags))
reqTimer.io.start.valid := io.mem.req.fire()
reqTimer.io.start.bits := io.mem.req.bits.tag
reqTimer.io.stop.valid := io.mem.resp.valid
reqTimer.io.stop.bits := io.mem.resp.bits.tag
assert(!reqTimer.io.timeout.valid, s"TraceGen core ${id}: request timed out")
// Random addresses
// ----------------
// Address bag, shared by all cores, taken from module parameters.
// In addition, there is a per-core random selection of extra addresses.
val addrHashMap = p(GlobalAddrMap)
val baseAddr = addrHashMap("mem").start + 0x01000000
val bagOfAddrs = addressBag.map(x => UInt(x, numBitsInWord))
val extraAddrs = (0 to numExtraAddrs-1).
map(i => Reg(UInt(width = 16)))
// A random index into the address bag.
val randAddrBagIndex = LCG(logAddressBagLen)
// A random address from the address bag.
val addrBagIndices = (0 to addressBagLen-1).
map(i => UInt(i, logAddressBagLen))
val randAddrFromBag = MuxLookup(randAddrBagIndex, UInt(0),
addrBagIndices.zip(bagOfAddrs))
// Random address from the address bag or the extra addresses.
val randAddr =
if (! genExtraAddrs) {
randAddrFromBag
}
else {
// A random index into the extra addresses.
val randExtraAddrIndex = LCG(logNumExtraAddrs)
// A random address from the extra addresses.
val extraAddrIndices = (0 to numExtraAddrs-1).
map(i => UInt(i, logNumExtraAddrs))
val randAddrFromExtra = Cat(UInt(0),
MuxLookup(randExtraAddrIndex, UInt(0),
extraAddrIndices.zip(extraAddrs)), UInt(0, 3))
Frequency(List(
(1, randAddrFromBag),
(1, randAddrFromExtra)))
}
// Random opcodes
// --------------
// Generate random opcodes for memory operations according to the
// given frequency distribution.
// Opcodes
val (opNop :: opLoad :: opStore ::
opFence :: opLRSC :: opSwap ::
opDelay :: Nil) = Enum(Bits(), 7)
// Distribution specified as a list of (frequency,value) pairs.
// NOTE: frequencies must sum to a power of two.
val randOp = Frequency(List(
(10, opLoad),
(10, opStore),
(4, opFence),
(3, opLRSC),
(3, opSwap),
(2, opDelay)))
// Request/response tags
// ---------------------
// Responses may come back out-of-order. Each request and response
// therefore contains a unique 7-bit identifier, referred to as a
// "tag", used to match each response with its corresponding request.
// Create a tag manager giving out unique 3-bit tags
val tagMan = Module(new TagMan(log2Ceil(maxTags)))
// Default inputs
  tagMan.io.take := Bool(false)
  tagMan.io.put := Bool(false)
  tagMan.io.tagIn := UInt(0)
// Cycle counter
// -------------
// 32-bit cycle count used to record send-times of requests and
// receive-times of responses.
val cycleCount = Reg(init = UInt(0, 32))
  cycleCount := cycleCount + UInt(1)
// Delay timer
// -----------
// Used to implement the delay operation and to insert random
// delays between load-reserve and store-conditional commands.
// A 16-bit timer is plenty
val delayTimer = Module(new DynamicTimer(16))
// Used to generate a random delay period
val randDelayBase = LCG16()
// Random delay period: usually small, occasionally big
val randDelay = Frequency(List(
(14, UInt(0, 13) ## randDelayBase(2, 0)),
(2, UInt(0, 11) ## randDelayBase(5, 0))))
// Default inputs
delayTimer.io.start := Bool(false)
delayTimer.io.period := randDelay
delayTimer.io.stop := Bool(false)
// Operation dispatch
// ------------------
// Hardware thread id
val tid = UInt(id, numBitsInId)
// Request & response count
val reqCount = Reg(init = UInt(0, 32))
val respCount = Reg(init = UInt(0, 32))
// Current operation being executed
val currentOp = Reg(init = opNop)
// If larger than 0, a multi-cycle operation is in progress.
// Value indicates stage of progress.
val opInProgress = Reg(init = UInt(0, 2))
// Indicate when a fresh request is to be sent
val sendFreshReq = Wire(Bool())
sendFreshReq := Bool(false)
// Used to generate unique data values
val nextData = Reg(init = UInt(1, numBitsInWord-numBitsInId))
// Registers for all the interesting parts of a request
val reqValid = Reg(init = Bool(false))
val reqAddr = Reg(init = UInt(0, numBitsInWord))
val reqData = Reg(init = UInt(0, numBitsInWord))
val reqCmd = Reg(init = UInt(0, 5))
val reqTag = Reg(init = UInt(0, 7))
// Condition on being allowed to send a fresh request
val canSendFreshReq = (!reqValid || io.mem.req.fire()) &&
tagMan.io.available
// Operation dispatch
when (reqCount < UInt(numReqsPerGen)) {
// No-op
when (currentOp === opNop) {
// Move on to a new operation
currentOp := randOp
}
// Fence
when (currentOp === opFence) {
when (opInProgress === UInt(0) && !reqValid) {
// Emit fence request
printf("%d: fence-req @%d\n", tid, cycleCount)
// Multi-cycle operation now in progress
opInProgress := UInt(1)
}
// Wait until all requests have had a response
.elsewhen (reqCount === respCount) {
// Emit fence response
printf("%d: fence-resp @%d\n", tid, cycleCount)
// Move on to a new operation
currentOp := randOp
// Operation finished
opInProgress := UInt(0)
}
}
// Delay
when (currentOp === opDelay) {
when (opInProgress === UInt(0)) {
// Start timer
delayTimer.io.start := Bool(true)
// Multi-cycle operation now in progress
opInProgress := UInt(1)
}
.elsewhen (delayTimer.io.timeout) {
// Move on to a new operation
currentOp := randOp
// Operation finished
opInProgress := UInt(0)
}
}
// Load, store, or atomic swap
when (currentOp === opLoad ||
currentOp === opStore ||
currentOp === opSwap) {
when (canSendFreshReq) {
// Set address
reqAddr := randAddr
// Set command
when (currentOp === opLoad) {
reqCmd := M_XRD
} .elsewhen (currentOp === opStore) {
reqCmd := M_XWR
} .elsewhen (currentOp === opSwap) {
reqCmd := M_XA_SWAP
}
// Send request
sendFreshReq := Bool(true)
// Move on to a new operation
currentOp := randOp
}
}
// Load-reserve and store-conditional
// First issue an LR, then delay, then issue an SC
when (currentOp === opLRSC) {
// LR request has not yet been sent
when (opInProgress === UInt(0)) {
when (canSendFreshReq) {
// Set address and command
reqAddr := randAddr
reqCmd := M_XLR
// Send request
sendFreshReq := Bool(true)
// Multi-cycle operation now in progress
opInProgress := UInt(1)
}
}
// LR request has been sent, start delay timer
when (opInProgress === UInt(1)) {
// Start timer
delayTimer.io.start := Bool(true)
// Indicate that delay has started
opInProgress := UInt(2)
}
// Delay in progress
when (opInProgress === UInt(2)) {
when (delayTimer.io.timeout) {
// Delay finished
opInProgress := UInt(3)
}
}
// Delay finished, send SC request
when (opInProgress === UInt(3)) {
when (canSendFreshReq) {
// Set command, but leave address
// i.e. use same address as LR did
reqCmd := M_XSC
// Send request
sendFreshReq := Bool(true)
// Multi-cycle operation finished
opInProgress := UInt(0)
// Move on to a new operation
currentOp := randOp
}
}
}
}
// Sending of requests
// -------------------
when (sendFreshReq) {
// Grab a unique tag for the request
reqTag := tagMan.io.tagOut
tagMan.io.take := Bool(true)
// Fill in unique data
reqData := Cat(nextData, tid)
nextData := nextData + UInt(1)
// Request is good to go!
reqValid := Bool(true)
// Increment request count
reqCount := reqCount + UInt(1)
}
.elsewhen (io.mem.req.fire()) {
// Request has been sent and there is no new request ready
reqValid := Bool(false)
}
// Wire up interface to memory
io.mem.req.valid := reqValid
io.mem.req.bits.addr := reqAddr
io.mem.req.bits.data := reqData
io.mem.req.bits.typ := UInt(log2Ceil(numBytesInWord))
io.mem.req.bits.cmd := reqCmd
io.mem.req.bits.tag := reqTag
// On cycle when request is actually sent, print it
when (io.mem.req.fire()) {
// Short-hand for address
val addr = io.mem.req.bits.addr
// Print thread id
printf("%d:", tid)
// Print command
when (reqCmd === M_XRD) {
printf(" load-req 0x%x", addr)
}
when (reqCmd === M_XLR) {
printf(" load-reserve-req 0x%x", addr)
}
when (reqCmd === M_XWR) {
printf(" store-req %d 0x%x", reqData, addr)
}
when (reqCmd === M_XSC) {
printf(" store-cond-req %d 0x%x", reqData, addr)
}
when (reqCmd === M_XA_SWAP) {
printf(" swap-req %d 0x%x", reqData, addr)
}
// Print tag
printf(" #%d", reqTag)
// Print time
printf(" @%d\n", cycleCount)
}
// Handling of responses
// ---------------------
// When a response is received
when (io.mem.resp.valid) {
// Put tag back in tag set
tagMan.io.tagIn := io.mem.resp.bits.tag
tagMan.io.put := Bool(true)
// Print response
printf("%d: resp %d #%d @%d\n", tid,
io.mem.resp.bits.data, io.mem.resp.bits.tag, cycleCount)
// Increment response count
respCount := respCount + UInt(1)
}
// Termination condition
// ---------------------
val done = reqCount === UInt(numReqsPerGen) &&
respCount === UInt(numReqsPerGen)
val donePulse = done && !Reg(init = Bool(false), next = done)
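  // donePulse is a one-shot: high for a single cycle on the rising edge of done.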
// Emit that this thread has completed
when (donePulse) {
printf(s"FINISHED ${numGens}\n")
}
io.finished := Bool(false)
io.timeout := reqTimer.io.timeout.valid
}
class NoiseGenerator(implicit val p: Parameters) extends Module
with HasTraceGenParams
with HasTileLinkParameters {
val io = new Bundle {
val mem = new ClientUncachedTileLinkIO
val finished = Bool(INPUT)
}
val idBits = tlClientXactIdBits
val xact_id_free = Reg(UInt(width = idBits), init = ~UInt(0, idBits))
val xact_id_onehot = PriorityEncoderOH(xact_id_free)
val timer = Module(new DynamicTimer(8))
timer.io.start := io.mem.acquire.fire()
timer.io.period := LCG(8, io.mem.acquire.fire())
timer.io.stop := Bool(false)
val s_start :: s_send :: s_wait :: s_done :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_start)
when (state === s_start) { state := s_send }
when (io.mem.acquire.fire()) { state := s_wait }
when (state === s_wait) {
when (timer.io.timeout) { state := s_send }
when (io.finished) { state := s_done }
}
val acq_id = OHToUInt(xact_id_onehot)
val gnt_id = io.mem.grant.bits.client_xact_id
xact_id_free := (xact_id_free &
~Mux(io.mem.acquire.fire(), xact_id_onehot, UInt(0))) |
Mux(io.mem.grant.fire(), UIntToOH(gnt_id), UInt(0))
val tlBlockOffset = tlBeatAddrBits + tlByteAddrBits
val addr_idx = LCG(logAddressBagLen, io.mem.acquire.fire())
val addr_bag = Vec(addressBag.map(
addr => UInt(addr >> tlBlockOffset, tlBlockAddrBits)))
val addr_block = addr_bag(addr_idx)
val addr_beat = LCG(tlBeatAddrBits, io.mem.acquire.fire())
val acq_select = LCG(1, io.mem.acquire.fire())
val get_acquire = Get(
client_xact_id = acq_id,
addr_block = addr_block,
addr_beat = addr_beat)
val put_acquire = Put(
client_xact_id = acq_id,
addr_block = addr_block,
addr_beat = addr_beat,
data = UInt(0),
wmask = Some(UInt(0)))
io.mem.acquire.valid := (state === s_send) && xact_id_free.orR
io.mem.acquire.bits := Mux(acq_select(0), get_acquire, put_acquire)
io.mem.grant.ready := !xact_id_free(gnt_id)
}
// =======================
// Trace-generator wrapper
// =======================
class GroundTestTraceGenerator(implicit p: Parameters)
extends GroundTest()(p) with HasTraceGenParams {
require(io.mem.size <= 1)
require(io.cache.size == 1)
val traceGen = Module(new TraceGenerator(p(GroundTestId)))
io.cache.head <> traceGen.io.mem
if (io.mem.size == 1) {
val noiseGen = Module(new NoiseGenerator)
io.mem.head <> noiseGen.io.mem
noiseGen.io.finished := traceGen.io.finished
}
io.status.finished := traceGen.io.finished
io.status.timeout.valid := traceGen.io.timeout
io.status.timeout.bits := UInt(0)
io.status.error.valid := Bool(false)
}

View File

@ -0,0 +1,194 @@
package groundtest
import Chisel._
// =============
// Dynamic timer
// =============
// Timer with a dynamically-settable period.
class DynamicTimer(w: Int) extends Module {
val io = new Bundle {
val start = Bool(INPUT)
val period = UInt(INPUT, w)
val stop = Bool(INPUT)
val timeout = Bool(OUTPUT)
}
val countdown = Reg(init = UInt(0, w))
val active = Reg(init = Bool(false))
when (io.start) {
countdown := io.period
active := Bool(true)
} .elsewhen (io.stop || countdown === UInt(0)) {
active := Bool(false)
} .elsewhen (active) {
countdown := countdown - UInt(1)
}
io.timeout := countdown === UInt(0) && active
}
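// Illustrative usage sketch (not from the original file): restart the
// timer with a fresh pseudo-random period after every timeout, giving a
// randomly spaced pulse train. `PeriodicPulse` is a hypothetical name.
class PeriodicPulse extends Module {
  val io = new Bundle {
    val pulse = Bool(OUTPUT)
  }
  val timer = Module(new DynamicTimer(8))
  // Kick the timer on the first cycle, then again on each timeout
  val started = Reg(init = Bool(false))
  started := Bool(true)
  timer.io.start := !started || timer.io.timeout
  timer.io.period := LCG(8, timer.io.start)
  timer.io.stop := Bool(false)
  io.pulse := timer.io.timeout
}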
// ============
// LCG16 module
// ============
// A 16-bit pseudo-random generator based on a linear congruential
// generator (LCG). The state is stored in an uninitialised register.
// When using the C++ backend, it is straightforward to arrange a
// random initial value for each uninitialised register, effectively
// seeding each LCG16 instance with a different seed.
class LCG16 extends Module {
val io = new Bundle {
val out = UInt(OUTPUT, 16)
val inc = Bool(INPUT)
}
val state = Reg(UInt(width = 32))
when (io.inc) {
state := state * UInt(1103515245, 32) + UInt(12345, 32)
}
io.out := state(30, 15)
}
// ==========
// LCG module
// ==========
// An n-bit pseudo-random generator made from multiple instances of a
// 16-bit LCG. Parameter 'w' must be larger than 0.
class LCG(val w: Int) extends Module {
val io = new Bundle {
val out = UInt(OUTPUT, w)
val inc = Bool(INPUT)
}
require(w > 0)
val numLCG16s : Int = (w+15)/16
val outs = Seq.fill(numLCG16s) { LCG16(io.inc) }
io.out := Cat(outs)
}
object LCG16 {
def apply(inc: Bool = Bool(true)): UInt = {
val lcg = Module(new LCG16)
lcg.io.inc := inc
lcg.io.out
}
}
object LCG {
def apply(w: Int, inc: Bool = Bool(true)): UInt = {
val lcg = Module(new LCG(w))
lcg.io.inc := inc
lcg.io.out
}
}
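// Illustrative usage sketch: an 8-bit pseudo-random source that advances
// only when `io.advance` is high. `RandomByte` is a hypothetical name.
class RandomByte extends Module {
  val io = new Bundle {
    val advance = Bool(INPUT)
    val value = UInt(OUTPUT, 8)
  }
  io.value := LCG(8, io.advance)
}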
// ======================
// Frequency distribution
// ======================
// Given a list of (frequency, value) pairs, return a random value
// according to the frequency distribution. The sum of the
// frequencies in the distribution must be a power of two.
object Frequency {
def apply(dist : List[(Int, Bits)]) : Bits = {
// Distribution must be non-empty
require(dist.length > 0)
// Require that the frequencies sum to a power of two
val (freqs, vals) = dist.unzip
val total = freqs.sum
require(isPow2(total))
// First item in the distribution
val (firstFreq, firstVal) = dist.head
// Result wire
val result = Wire(Bits(width = firstVal.getWidth))
result := UInt(0)
// Random value
val randVal = LCG(log2Up(total))
// Pick return value
var count = firstFreq
var select = when (randVal < UInt(firstFreq)) { result := firstVal }
for (p <- dist.drop(1)) {
count = count + p._1
select = select.elsewhen(randVal < UInt(count)) { result := p._2 }
}
return result
}
}
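// Illustrative usage sketch: draw a 2-bit opcode with probabilities 4/8,
// 3/8 and 1/8; the frequencies sum to 8, a power of two, as required.
// `RandomOpcode` is a hypothetical name.
class RandomOpcode extends Module {
  val io = new Bundle {
    val opcode = Bits(OUTPUT, 2)
  }
  io.opcode := Frequency(List(
    (4, UInt("b00", 2)),
    (3, UInt("b01", 2)),
    (1, UInt("b10", 2))))
}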
object ValidMux {
def apply[T <: Data](v1: ValidIO[T], v2: ValidIO[T]*): ValidIO[T] = {
apply(v1 +: v2.toSeq)
}
def apply[T <: Data](valids: Seq[ValidIO[T]]): ValidIO[T] = {
val out = Wire(Valid(valids.head.bits))
out.valid := valids.map(_.valid).reduce(_ || _)
out.bits := MuxCase(valids.head.bits,
valids.map(v => (v.valid -> v.bits)))
out
}
}
object DebugCombiner {
def apply(debugs: Seq[GroundTestStatus]): GroundTestStatus = {
val out = Wire(new GroundTestStatus)
out.finished := debugs.map(_.finished).reduce(_ && _)
out.timeout := ValidMux(debugs.map(_.timeout))
out.error := ValidMux(debugs.map(_.error))
out
}
}
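// Illustrative usage sketch: merge the status bundles of two sub-tests,
// finishing only when both have finished and forwarding whichever timeout
// or error fires first. `MergeTwoStatuses` is a hypothetical name;
// GroundTestStatus is defined elsewhere in this package.
class MergeTwoStatuses extends Module {
  val io = new Bundle {
    val in = Vec(2, new GroundTestStatus).flip
    val out = new GroundTestStatus
  }
  io.out := DebugCombiner(io.in)
}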
/**
* Takes in data on one decoupled interface and broadcasts it to
* N decoupled output interfaces
*/
class Broadcaster[T <: Data](typ: T, n: Int) extends Module {
val io = new Bundle {
val in = Decoupled(typ).flip
val out = Vec(n, Decoupled(typ))
}
require (n > 0)
if (n == 1) {
io.out.head <> io.in
} else {
val idx = Reg(init = UInt(0, log2Up(n)))
val save = Reg(typ)
io.out.head.valid := idx === UInt(0) && io.in.valid
io.out.head.bits := io.in.bits
for (i <- 1 until n) {
io.out(i).valid := idx === UInt(i)
io.out(i).bits := save
}
io.in.ready := io.out.head.ready && idx === UInt(0)
when (io.in.fire()) { save := io.in.bits }
when (io.out(idx).fire()) {
when (idx === UInt(n - 1)) { idx := UInt(0) }
.otherwise { idx := idx + UInt(1) }
}
}
}
object Broadcaster {
def apply[T <: Data](in: DecoupledIO[T], n: Int): Vec[DecoupledIO[T]] = {
val split = Module(new Broadcaster(in.bits, n))
split.io.in <> in
split.io.out
}
}
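// Illustrative usage sketch: deliver every beat of one input stream to
// each of three consumers, one consumer per cycle, in index order.
// `FanOut3` is a hypothetical name.
class FanOut3 extends Module {
  val io = new Bundle {
    val in = Decoupled(UInt(width = 8)).flip
    val out = Vec(3, Decoupled(UInt(width = 8)))
  }
  io.out <> Broadcaster(io.in, 3)
}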

View File

@ -0,0 +1,148 @@
// See LICENSE for license details.
package junctions
import Chisel._
import cde.{Parameters, Field}
import scala.collection.mutable.HashMap
case object PAddrBits extends Field[Int]
case object GlobalAddrMap extends Field[AddrMap]
trait HasAddrMapParameters {
implicit val p: Parameters
val paddrBits = p(PAddrBits)
val addrMap = p(GlobalAddrMap)
}
case class MemAttr(prot: Int, cacheable: Boolean = false)
sealed abstract class MemRegion {
def start: BigInt
def size: BigInt
def numSlaves: Int
def attr: MemAttr
def containsAddress(x: UInt) = UInt(start) <= x && x < UInt(start + size)
}
case class MemSize(size: BigInt, attr: MemAttr) extends MemRegion {
def start = 0
def numSlaves = 1
}
case class MemRange(start: BigInt, size: BigInt, attr: MemAttr) extends MemRegion {
def numSlaves = 1
}
object AddrMapProt {
val R = 0x1
val W = 0x2
val X = 0x4
val RW = R | W
val RX = R | X
val RWX = R | W | X
val SZ = 3
}
class AddrMapProt extends Bundle {
val x = Bool()
val w = Bool()
val r = Bool()
}
case class AddrMapEntry(name: String, region: MemRegion)
object AddrMap {
def apply(elems: AddrMapEntry*): AddrMap = new AddrMap(elems)
}
class AddrMap(
entriesIn: Seq[AddrMapEntry],
val start: BigInt = BigInt(0),
val collapse: Boolean = false) extends MemRegion {
private val slavePorts = HashMap[String, Int]()
private val mapping = HashMap[String, MemRegion]()
def isEmpty = entries.isEmpty
def length = entries.size
def numSlaves = slavePorts.size
val (size: BigInt, entries: Seq[AddrMapEntry], attr: MemAttr) = {
var ind = 0
var base = start
var rebasedEntries = collection.mutable.ArrayBuffer[AddrMapEntry]()
var prot = 0
var cacheable = true
for (AddrMapEntry(name, r) <- entriesIn) {
if (r.start != 0) {
val align = BigInt(1) << log2Ceil(r.size)
require(r.start >= base, s"region $name base address 0x${r.start.toString(16)} overlaps previous base 0x${base.toString(16)}")
require(r.start % align == 0, s"region $name base address 0x${r.start.toString(16)} not aligned to 0x${align.toString(16)}")
base = r.start
} else {
base = (base + r.size - 1) / r.size * r.size
}
r match {
case r: AddrMap =>
val subMap = new AddrMap(r.entries, base, r.collapse)
rebasedEntries += AddrMapEntry(name, subMap)
mapping += name -> subMap
mapping ++= subMap.mapping.map { case (k, v) => s"$name:$k" -> v }
if (r.collapse) {
slavePorts += (name -> ind)
ind += 1
} else {
slavePorts ++= subMap.slavePorts.map {
case (k, v) => s"$name:$k" -> (ind + v)
}
ind += r.numSlaves
}
case _ =>
val e = MemRange(base, r.size, r.attr)
rebasedEntries += AddrMapEntry(name, e)
mapping += name -> e
slavePorts += name -> ind
ind += r.numSlaves
}
base += r.size
prot |= r.attr.prot
cacheable &&= r.attr.cacheable
}
(base - start, rebasedEntries, MemAttr(prot, cacheable))
}
val flatten: Seq[AddrMapEntry] = {
mapping.toSeq.map {
case (name, range: MemRange) => Some(AddrMapEntry(name, range))
case _ => None
}.flatten.sortBy(_.region.start)
}
def toRange: MemRange = MemRange(start, size, attr)
def apply(name: String): MemRegion = mapping(name)
def contains(name: String): Boolean = mapping.contains(name)
def port(name: String): Int = slavePorts(name)
def subMap(name: String): AddrMap = mapping(name).asInstanceOf[AddrMap]
def isInRegion(name: String, addr: UInt): Bool = mapping(name).containsAddress(addr)
def isCacheable(addr: UInt): Bool = {
flatten.filter(_.region.attr.cacheable).map(
_.region.containsAddress(addr)
).foldLeft(Bool(false))(_ || _)
}
def isValid(addr: UInt): Bool = {
flatten.map(_.region.containsAddress(addr)).foldLeft(Bool(false))(_ || _)
}
def getProt(addr: UInt): AddrMapProt = {
val protForRegion = flatten.map { entry =>
Mux(entry.region.containsAddress(addr),
UInt(entry.region.attr.prot, AddrMapProt.SZ), UInt(0))
}
new AddrMapProt().fromBits(protForRegion.reduce(_|_))
}
}
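// Illustrative sketch with hypothetical names and sizes: a 4KB read-write
// device region followed by a cacheable 1GB RWX memory region, both placed
// automatically from address 0 by the constructor above.
object ExampleAddrMap {
  def apply(): AddrMap = AddrMap(
    AddrMapEntry("dev", MemSize(0x1000, MemAttr(AddrMapProt.RW))),
    AddrMapEntry("mem", MemSize(0x40000000L, MemAttr(AddrMapProt.RWX, cacheable = true))))
  // ExampleAddrMap().port("mem") then gives the slave port index for "mem",
  // and isCacheable(addr) is true exactly within the "mem" region.
}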

View File

@ -0,0 +1,333 @@
package junctions
import Chisel._
import scala.math.max
import cde.{Parameters, Field}
trait HasAtosParameters extends HasNastiParameters {
// round up to a multiple of 32
def roundup(n: Int) = 32 * ((n - 1) / 32 + 1)
val atosUnionBits = max(
nastiXIdBits + nastiXDataBits + nastiWStrobeBits + 1,
nastiXIdBits + nastiXBurstBits +
nastiXSizeBits + nastiXLenBits + nastiXAddrBits)
val atosIdBits = nastiXIdBits
val atosTypBits = 2
val atosRespBits = nastiXRespBits
val atosDataBits = nastiXDataBits
val atosAddrOffset = atosIdBits
val atosLenOffset = atosIdBits + nastiXAddrBits
val atosSizeOffset = atosLenOffset + nastiXLenBits
val atosBurstOffset = atosSizeOffset + nastiXSizeBits
val atosDataOffset = atosIdBits
val atosStrobeOffset = nastiXDataBits + atosIdBits
val atosLastOffset = atosStrobeOffset + nastiWStrobeBits
val atosRequestBits = roundup(atosTypBits + atosUnionBits)
val atosResponseBits = roundup(atosTypBits + atosIdBits + atosRespBits + atosDataBits + 1)
val atosRequestBytes = atosRequestBits / 8
val atosResponseBytes = atosResponseBits / 8
val atosRequestWords = atosRequestBytes / 4
val atosResponseWords = atosResponseBytes / 4
}
abstract class AtosModule(implicit val p: Parameters)
extends Module with HasAtosParameters
abstract class AtosBundle(implicit val p: Parameters)
extends ParameterizedBundle()(p) with HasAtosParameters
object AtosRequest {
def arType = UInt("b00")
def awType = UInt("b01")
def wType = UInt("b10")
def apply(typ: UInt, union: UInt)(implicit p: Parameters): AtosRequest = {
val areq = Wire(new AtosRequest)
areq.typ := typ
areq.union := union
areq
}
def apply(ar: NastiReadAddressChannel)(implicit p: Parameters): AtosRequest =
apply(arType, Cat(ar.burst, ar.size, ar.len, ar.addr, ar.id))
def apply(aw: NastiWriteAddressChannel)(implicit p: Parameters): AtosRequest =
apply(awType, Cat(aw.burst, aw.size, aw.len, aw.addr, aw.id))
def apply(w: NastiWriteDataChannel)(implicit p: Parameters): AtosRequest =
apply(wType, Cat(w.last, w.strb, w.data, w.id))
}
class AtosRequest(implicit p: Parameters)
extends AtosBundle()(p) with Serializable {
val typ = UInt(width = atosTypBits)
val union = UInt(width = atosUnionBits)
def burst(dummy: Int = 0) =
union(atosUnionBits - 1, atosBurstOffset)
def size(dummy: Int = 0) =
union(atosBurstOffset - 1, atosSizeOffset)
def len(dummy: Int = 0) =
union(atosSizeOffset - 1, atosLenOffset)
def addr(dummy: Int = 0) =
union(atosLenOffset - 1, atosAddrOffset)
def id(dummy: Int = 0) =
union(atosIdBits - 1, 0)
def data(dummy: Int = 0) =
union(atosStrobeOffset - 1, atosDataOffset)
def strb(dummy: Int = 0) =
union(atosLastOffset - 1, atosStrobeOffset)
def last(dummy: Int = 0) =
union(atosLastOffset)
def has_addr(dummy: Int = 0) =
typ === AtosRequest.arType || typ === AtosRequest.awType
def has_data(dummy: Int = 0) =
typ === AtosRequest.wType
def is_last(dummy: Int = 0) =
typ === AtosRequest.arType || (typ === AtosRequest.wType && last())
def nbits: Int = atosRequestBits
def resp_len(dummy: Int = 0) =
MuxLookup(typ, UInt(0), Seq(
AtosRequest.arType -> (len() + UInt(1)),
AtosRequest.awType -> UInt(1)))
}
object AtosResponse {
def rType = UInt("b00")
def bType = UInt("b01")
def apply(typ: UInt, id: UInt, resp: UInt, data: UInt, last: Bool)
(implicit p: Parameters): AtosResponse = {
val aresp = Wire(new AtosResponse)
aresp.typ := typ
aresp.id := id
aresp.resp := resp
aresp.data := data
aresp.last := last
aresp
}
def apply(r: NastiReadDataChannel)(implicit p: Parameters): AtosResponse =
apply(rType, r.id, r.resp, r.data, r.last)
def apply(b: NastiWriteResponseChannel)(implicit p: Parameters): AtosResponse =
apply(bType, b.id, b.resp, UInt(0), Bool(false))
}
class AtosResponse(implicit p: Parameters)
extends AtosBundle()(p) with Serializable {
val typ = UInt(width = atosTypBits)
val id = UInt(width = atosIdBits)
val resp = UInt(width = atosRespBits)
val last = Bool()
val data = UInt(width = atosDataBits)
def has_data(dummy: Int = 0) = typ === AtosResponse.rType
def is_last(dummy: Int = 0) = !has_data() || last
def nbits: Int = atosResponseBits
}
class AtosIO(implicit p: Parameters) extends AtosBundle()(p) {
val req = Decoupled(new AtosRequest)
val resp = Decoupled(new AtosResponse).flip
}
class AtosRequestEncoder(implicit p: Parameters) extends AtosModule()(p) {
val io = new Bundle {
val ar = Decoupled(new NastiReadAddressChannel).flip
val aw = Decoupled(new NastiWriteAddressChannel).flip
val w = Decoupled(new NastiWriteDataChannel).flip
val req = Decoupled(new AtosRequest)
}
val writing = Reg(init = Bool(false))
io.ar.ready := !writing && io.req.ready
io.aw.ready := !writing && !io.ar.valid && io.req.ready
io.w.ready := writing && io.req.ready
io.req.valid := Mux(writing, io.w.valid, io.ar.valid || io.aw.valid)
io.req.bits := Mux(writing, AtosRequest(io.w.bits),
Mux(io.ar.valid, AtosRequest(io.ar.bits), AtosRequest(io.aw.bits)))
when (io.aw.fire()) { writing := Bool(true) }
when (io.w.fire() && io.w.bits.last) { writing := Bool(false) }
}
class AtosResponseDecoder(implicit p: Parameters) extends AtosModule()(p) {
val io = new Bundle {
val resp = Decoupled(new AtosResponse).flip
val b = Decoupled(new NastiWriteResponseChannel)
val r = Decoupled(new NastiReadDataChannel)
}
val is_b = io.resp.bits.typ === AtosResponse.bType
val is_r = io.resp.bits.typ === AtosResponse.rType
io.b.valid := io.resp.valid && is_b
io.b.bits := NastiWriteResponseChannel(
id = io.resp.bits.id,
resp = io.resp.bits.resp)
io.r.valid := io.resp.valid && is_r
io.r.bits := NastiReadDataChannel(
id = io.resp.bits.id,
data = io.resp.bits.data,
last = io.resp.bits.last,
resp = io.resp.bits.resp)
io.resp.ready := (is_b && io.b.ready) || (is_r && io.r.ready)
}
class AtosClientConverter(implicit p: Parameters) extends AtosModule()(p) {
val io = new Bundle {
val nasti = (new NastiIO).flip
val atos = new AtosIO
}
val req_enc = Module(new AtosRequestEncoder)
req_enc.io.ar <> io.nasti.ar
req_enc.io.aw <> io.nasti.aw
req_enc.io.w <> io.nasti.w
io.atos.req <> req_enc.io.req
val resp_dec = Module(new AtosResponseDecoder)
resp_dec.io.resp <> io.atos.resp
io.nasti.b <> resp_dec.io.b
io.nasti.r <> resp_dec.io.r
}
class AtosRequestDecoder(implicit p: Parameters) extends AtosModule()(p) {
val io = new Bundle {
val req = Decoupled(new AtosRequest).flip
val ar = Decoupled(new NastiReadAddressChannel)
val aw = Decoupled(new NastiWriteAddressChannel)
val w = Decoupled(new NastiWriteDataChannel)
}
val is_ar = io.req.bits.typ === AtosRequest.arType
val is_aw = io.req.bits.typ === AtosRequest.awType
val is_w = io.req.bits.typ === AtosRequest.wType
io.ar.valid := io.req.valid && is_ar
io.ar.bits := NastiReadAddressChannel(
id = io.req.bits.id(),
addr = io.req.bits.addr(),
size = io.req.bits.size(),
len = io.req.bits.len(),
burst = io.req.bits.burst())
io.aw.valid := io.req.valid && is_aw
io.aw.bits := NastiWriteAddressChannel(
id = io.req.bits.id(),
addr = io.req.bits.addr(),
size = io.req.bits.size(),
len = io.req.bits.len(),
burst = io.req.bits.burst())
io.w.valid := io.req.valid && is_w
io.w.bits := NastiWriteDataChannel(
id = io.req.bits.id(),
data = io.req.bits.data(),
strb = Some(io.req.bits.strb()),
last = io.req.bits.last())
io.req.ready := (io.ar.ready && is_ar) ||
(io.aw.ready && is_aw) ||
(io.w.ready && is_w)
}
class AtosResponseEncoder(implicit p: Parameters) extends AtosModule()(p) {
val io = new Bundle {
val b = Decoupled(new NastiWriteResponseChannel).flip
val r = Decoupled(new NastiReadDataChannel).flip
val resp = Decoupled(new AtosResponse)
}
val locked = Reg(init = Bool(false))
io.resp.valid := (io.b.valid && !locked) || io.r.valid
io.resp.bits := Mux(io.r.valid,
AtosResponse(io.r.bits), AtosResponse(io.b.bits))
io.b.ready := !locked && !io.r.valid && io.resp.ready
io.r.ready := io.resp.ready
when (io.r.fire() && !io.r.bits.last) { locked := Bool(true) }
when (io.r.fire() && io.r.bits.last) { locked := Bool(false) }
}
class AtosManagerConverter(implicit p: Parameters) extends AtosModule()(p) {
val io = new Bundle {
val atos = (new AtosIO).flip
val nasti = new NastiIO
}
val req_dec = Module(new AtosRequestDecoder)
val resp_enc = Module(new AtosResponseEncoder)
req_dec.io.req <> io.atos.req
io.atos.resp <> resp_enc.io.resp
io.nasti.ar <> req_dec.io.ar
io.nasti.aw <> req_dec.io.aw
io.nasti.w <> req_dec.io.w
resp_enc.io.b <> io.nasti.b
resp_enc.io.r <> io.nasti.r
}
class AtosSerializedIO(w: Int)(implicit p: Parameters) extends ParameterizedBundle()(p) {
val req = Decoupled(Bits(width = w))
val resp = Decoupled(Bits(width = w)).flip
val clk = Bool(OUTPUT)
val clk_edge = Bool(OUTPUT)
override def cloneType = new AtosSerializedIO(w)(p).asInstanceOf[this.type]
}
class AtosSerdes(w: Int)(implicit p: Parameters) extends AtosModule()(p) {
val io = new Bundle {
val wide = (new AtosIO).flip
val narrow = new AtosSerializedIO(w)
}
val ser = Module(new Serializer(w, new AtosRequest))
ser.io.in <> io.wide.req
io.narrow.req <> ser.io.out
val des = Module(new Deserializer(w, new AtosResponse))
des.io.in <> io.narrow.resp
io.wide.resp <> des.io.out
}
class AtosDesser(w: Int)(implicit p: Parameters) extends AtosModule()(p) {
val io = new Bundle {
val narrow = new AtosSerializedIO(w).flip
val wide = new AtosIO
}
val des = Module(new Deserializer(w, new AtosRequest))
des.io.in <> io.narrow.req
io.wide.req <> des.io.out
val ser = Module(new Serializer(w, new AtosResponse))
ser.io.in <> io.wide.resp
io.narrow.resp <> ser.io.out
}
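// Illustrative sketch: carry NASTI over an 8-bit link by pairing a client
// converter and serdes on one side with a desser and manager converter on
// the other. Clock-domain crossing is omitted; `AtosOverSerial` is a
// hypothetical name.
class AtosOverSerial(implicit p: Parameters) extends AtosModule()(p) {
  val io = new Bundle {
    val nasti_in = (new NastiIO).flip
    val nasti_out = new NastiIO
  }
  val client = Module(new AtosClientConverter)
  val serdes = Module(new AtosSerdes(8))
  val desser = Module(new AtosDesser(8))
  val manager = Module(new AtosManagerConverter)
  client.io.nasti <> io.nasti_in
  serdes.io.wide <> client.io.atos
  desser.io.narrow <> serdes.io.narrow
  manager.io.atos <> desser.io.wide
  io.nasti_out <> manager.io.nasti
}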

View File

@ -0,0 +1,150 @@
package junctions
import Chisel._
class Crossing[T <: Data](gen: T, enq_sync: Boolean, deq_sync: Boolean) extends Bundle {
val enq = Decoupled(gen).flip()
val deq = Decoupled(gen)
val enq_clock = if (enq_sync) Some(Clock(INPUT)) else None
val deq_clock = if (deq_sync) Some(Clock(INPUT)) else None
val enq_reset = if (enq_sync) Some(Bool(INPUT)) else None
val deq_reset = if (deq_sync) Some(Bool(INPUT)) else None
}
// Output is 1 for one cycle after any edge of 'in'
object AsyncHandshakePulse {
def apply(in: Bool, sync: Int): Bool = {
val syncv = RegInit(Vec.fill(sync+1){Bool(false)})
syncv.last := in
(syncv.init zip syncv.tail).foreach { case (sink, source) => sink := source }
syncv(0) =/= syncv(1)
}
}
class AsyncHandshakeSource[T <: Data](gen: T, sync: Int, clock: Clock, reset: Bool)
extends Module(_clock = clock, _reset = reset) {
val io = new Bundle {
// These come from the source clock domain
val enq = Decoupled(gen).flip()
// These cross to the sink clock domain
val bits = gen.cloneType.asOutput
val push = Bool(OUTPUT)
val pop = Bool(INPUT)
}
val ready = RegInit(Bool(true))
val bits = Reg(gen)
val push = RegInit(Bool(false))
io.enq.ready := ready
io.bits := bits
io.push := push
val pop = AsyncHandshakePulse(io.pop, sync)
assert (!pop || !ready)
when (pop) {
ready := Bool(true)
}
when (io.enq.fire()) {
ready := Bool(false)
bits := io.enq.bits
push := !push
}
}
class AsyncHandshakeSink[T <: Data](gen: T, sync: Int, clock: Clock, reset: Bool)
extends Module(_clock = clock, _reset = reset) {
val io = new Bundle {
// These cross to the source clock domain
val bits = gen.cloneType.asInput
val push = Bool(INPUT)
val pop = Bool(OUTPUT)
// These go to the sink clock domain
val deq = Decoupled(gen)
}
val valid = RegInit(Bool(false))
val bits = Reg(gen)
val pop = RegInit(Bool(false))
io.deq.valid := valid
io.deq.bits := bits
io.pop := pop
val push = AsyncHandshakePulse(io.push, sync)
assert (!push || !valid)
when (push) {
valid := Bool(true)
bits := io.bits
}
when (io.deq.fire()) {
valid := Bool(false)
pop := !pop
}
}
class AsyncHandshake[T <: Data](gen: T, sync: Int = 2) extends Module {
val io = new Crossing(gen, true, true)
require (sync >= 2)
val source = Module(new AsyncHandshakeSource(gen, sync, io.enq_clock.get, io.enq_reset.get))
val sink = Module(new AsyncHandshakeSink (gen, sync, io.deq_clock.get, io.deq_reset.get))
source.io.enq <> io.enq
io.deq <> sink.io.deq
sink.io.bits := source.io.bits
sink.io.push := source.io.push
source.io.pop := sink.io.pop
}
class AsyncDecoupledTo[T <: Data](gen: T, depth: Int = 0, sync: Int = 2) extends Module {
val io = new Crossing(gen, false, true)
// !!! if depth == 0 { use Handshake } else { use AsyncFIFO }
val crossing = Module(new AsyncHandshake(gen, sync)).io
crossing.enq_clock.get := clock
crossing.enq_reset.get := reset
crossing.enq <> io.enq
crossing.deq_clock.get := io.deq_clock.get
crossing.deq_reset.get := io.deq_reset.get
io.deq <> crossing.deq
}
object AsyncDecoupledTo {
// source is in our clock domain, output is in the 'to' clock domain
def apply[T <: Data](to_clock: Clock, to_reset: Bool, source: DecoupledIO[T], depth: Int = 0, sync: Int = 2): DecoupledIO[T] = {
val to = Module(new AsyncDecoupledTo(source.bits, depth, sync))
to.io.deq_clock.get := to_clock
to.io.deq_reset.get := to_reset
to.io.enq <> source
to.io.deq
}
}
class AsyncDecoupledFrom[T <: Data](gen: T, depth: Int = 0, sync: Int = 2) extends Module {
val io = new Crossing(gen, true, false)
// !!! if depth == 0 { use Handshake } else { use AsyncFIFO }
val crossing = Module(new AsyncHandshake(gen, sync)).io
crossing.enq_clock.get := io.enq_clock.get
crossing.enq_reset.get := io.enq_reset.get
crossing.enq <> io.enq
crossing.deq_clock.get := clock
crossing.deq_reset.get := reset
io.deq <> crossing.deq
}
object AsyncDecoupledFrom {
// source is in the 'from' clock domain, output is in our clock domain
def apply[T <: Data](from_clock: Clock, from_reset: Bool, source: DecoupledIO[T], depth: Int = 0, sync: Int = 2): DecoupledIO[T] = {
val from = Module(new AsyncDecoupledFrom(source.bits, depth, sync))
from.io.enq_clock.get := from_clock
from.io.enq_reset.get := from_reset
from.io.enq <> source
from.io.deq
}
}
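// Illustrative usage sketch: move a stream produced in this module's clock
// domain into a separately clocked consumer domain. `ToOtherDomain` is a
// hypothetical name.
class ToOtherDomain[T <: Data](gen: T) extends Module {
  val io = new Bundle {
    val enq = Decoupled(gen).flip
    val deq_clock = Clock(INPUT)
    val deq_reset = Bool(INPUT)
    val deq = Decoupled(gen)
  }
  io.deq <> AsyncDecoupledTo(io.deq_clock, io.deq_reset, io.enq)
}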

View File

@ -0,0 +1,549 @@
package junctions
import Chisel._
import cde.{Parameters, Field}
object HastiConstants
{
// Values for htrans
val SZ_HTRANS = 2
val HTRANS_IDLE = UInt(0, SZ_HTRANS) // No transfer requested, not in a burst
val HTRANS_BUSY = UInt(1, SZ_HTRANS) // No transfer requested, in a burst
val HTRANS_NONSEQ = UInt(2, SZ_HTRANS) // First (potentially only) request in a burst
val HTRANS_SEQ = UInt(3, SZ_HTRANS) // Following requests in a burst
// Values for hburst
val SZ_HBURST = 3
val HBURST_SINGLE = UInt(0, SZ_HBURST) // Single access (no burst)
val HBURST_INCR = UInt(1, SZ_HBURST) // Incrementing burst of arbitrary length, not crossing 1KB
val HBURST_WRAP4 = UInt(2, SZ_HBURST) // 4-beat wrapping burst
val HBURST_INCR4 = UInt(3, SZ_HBURST) // 4-beat incrementing burst
val HBURST_WRAP8 = UInt(4, SZ_HBURST) // 8-beat wrapping burst
val HBURST_INCR8 = UInt(5, SZ_HBURST) // 8-beat incrementing burst
val HBURST_WRAP16 = UInt(6, SZ_HBURST) // 16-beat wrapping burst
val HBURST_INCR16 = UInt(7, SZ_HBURST) // 16-beat incrementing burst
// Values for hresp
val SZ_HRESP = 1
val HRESP_OKAY = UInt(0, SZ_HRESP)
val HRESP_ERROR = UInt(1, SZ_HRESP)
// Values for hsize are identical to TileLink MT_SZ
// i.e. transfers of 8*2^hsize bits
val SZ_HSIZE = 3
// Values for hprot (a bitmask)
val SZ_HPROT = 4
def HPROT_DATA = UInt("b0001") // Data access or Opcode fetch
def HPROT_PRIVILEGED = UInt("b0010") // Privileged or User access
def HPROT_BUFFERABLE = UInt("b0100") // Bufferable or non-bufferable
def HPROT_CACHEABLE = UInt("b1000") // Cacheable or non-cacheable
def dgate(valid: Bool, b: UInt) = Fill(b.getWidth, valid) & b
}
import HastiConstants._
case class HastiParameters(dataBits: Int, addrBits: Int)
case object HastiId extends Field[String]
case class HastiKey(id: String) extends Field[HastiParameters]
trait HasHastiParameters {
implicit val p: Parameters
val hastiParams = p(HastiKey(p(HastiId)))
val hastiAddrBits = hastiParams.addrBits
val hastiDataBits = hastiParams.dataBits
val hastiDataBytes = hastiDataBits/8
val hastiAlignment = log2Ceil(hastiDataBytes)
}
abstract class HastiModule(implicit val p: Parameters) extends Module
with HasHastiParameters
abstract class HastiBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
with HasHastiParameters
class HastiMasterIO(implicit p: Parameters) extends HastiBundle()(p) {
val htrans = UInt(OUTPUT, SZ_HTRANS)
val hmastlock = Bool(OUTPUT)
val haddr = UInt(OUTPUT, hastiAddrBits)
val hwrite = Bool(OUTPUT)
val hburst = UInt(OUTPUT, SZ_HBURST)
val hsize = UInt(OUTPUT, SZ_HSIZE)
val hprot = UInt(OUTPUT, SZ_HPROT)
val hwdata = Bits(OUTPUT, hastiDataBits)
val hrdata = Bits(INPUT, hastiDataBits)
val hready = Bool(INPUT)
val hresp = UInt(INPUT, SZ_HRESP)
def isNSeq(dummy:Int=0) = htrans === HTRANS_NONSEQ // SEQ does not start a NEW request
def isHold(dummy:Int=0) = htrans === HTRANS_BUSY || htrans === HTRANS_SEQ
def isIdle(dummy:Int=0) = htrans === HTRANS_IDLE || htrans === HTRANS_BUSY
}
class HastiSlaveIO(implicit p: Parameters) extends HastiBundle()(p) {
val htrans = UInt(INPUT, SZ_HTRANS)
val hmastlock = Bool(INPUT)
val haddr = UInt(INPUT, hastiAddrBits)
val hwrite = Bool(INPUT)
val hburst = UInt(INPUT, SZ_HBURST)
val hsize = UInt(INPUT, SZ_HSIZE)
val hprot = UInt(INPUT, SZ_HPROT)
val hwdata = Bits(INPUT, hastiDataBits)
val hrdata = Bits(OUTPUT, hastiDataBits)
val hsel = Bool(INPUT)
val hready = Bool(OUTPUT)
val hresp = UInt(OUTPUT, SZ_HRESP)
}
/* A diverted master is told hready when his address phase goes nowhere.
* In this case, we buffer his address phase request and replay it later.
* NOTE: this must optimize to nothing when divert is constantly false.
*/
class MasterDiversion(implicit p: Parameters) extends HastiModule()(p) {
val io = new Bundle {
val in = (new HastiMasterIO).flip
val out = (new HastiMasterIO)
val divert = Bool(INPUT)
}
val full = Reg(init = Bool(false))
val buffer = Reg(new HastiMasterIO)
when (io.out.hready) {
full := Bool(false)
}
when (io.divert) {
full := Bool(true)
buffer := io.in
}
// If the master is diverted, he must also have been told hready
assert (!io.divert || io.in.hready,
"Diverted but not ready");
// Replay the request we diverted
io.out.htrans := Mux(full, buffer.htrans, io.in.htrans)
io.out.hmastlock := Mux(full, buffer.hmastlock, io.in.hmastlock)
io.out.haddr := Mux(full, buffer.haddr, io.in.haddr)
io.out.hwrite := Mux(full, buffer.hwrite, io.in.hwrite)
io.out.hburst := Mux(full, buffer.hburst, io.in.hburst)
io.out.hsize := Mux(full, buffer.hsize, io.in.hsize)
io.out.hprot := Mux(full, buffer.hprot, io.in.hprot)
io.out.hwdata := Mux(full, buffer.hwdata, io.in.hwdata)
// Pass slave response back
io.in.hrdata := io.out.hrdata
io.in.hresp := io.out.hresp
io.in.hready := io.out.hready && !full // Block master while we steal his address phase
}
/* Masters with lower index have priority over higher index masters.
* However, a lower priority master will retain control of a slave when EITHER:
* 1. a burst is in progress (switching slaves mid-burst violates AHB-lite at slave)
* 2. a transfer was waited (the standard forbids changing requests in this case)
*
* If a master raises hmastlock, it will be waited until no other master has inflight
* requests; then, it acquires exclusive control of the crossbar until hmastlock is low.
*
* To implement an AHB-lite crossbar, it is important to realize that requests and
* responses are coupled. Unlike modern bus protocols where the response data has flow
* control independent of the request data, in AHB-lite, both flow at the same time at
* the sole discretion of the slave via the hready signal. The address and data are
* delivered on two back-to-back cycles, the so-called address and data phases.
*
* Masters can only be connected to a single slave at a time. If a master had two different
* slave connections on the address and data phases, there would be two independent hready
* signals. An AHB-lite slave can assume that data flows when it asserts hready. If the data
* slave deasserts hready while the address slave asserts hready, the master is put in the
* impossible position of being in data phase on two slaves at once. For this reason, when
* a master issues back-to-back accesses to distinct slaves, we inject a pipeline bubble
* between the two requests to limit the master to just a single slave at a time.
*
* Conversely, a slave CAN have two masters attached to it. This is unproblematic, because
* the only signal which governs data flow is hready. Thus, both masters can be stalled
* safely by the single slave.
*/
class HastiXbar(nMasters: Int, addressMap: Seq[UInt=>Bool])(implicit p: Parameters) extends HastiModule()(p) {
val io = new Bundle {
val masters = Vec(nMasters, new HastiMasterIO).flip
val slaves = Vec(addressMap.size, new HastiSlaveIO).flip
}
val nSlaves = addressMap.size
// Set up diversions in front of each master
val diversions = Seq.tabulate(nMasters) { m => Module(new MasterDiversion) }
(io.masters zip diversions) foreach { case (m, d) => d.io.in <> m }
// Handy short-hand
val masters = diversions map (_.io.out)
val slaves = io.slaves
// Lock status of the crossbar
val lockedM = Reg(init = Vec.fill(nMasters)(Bool(false)))
val isLocked = lockedM.reduce(_ || _)
// This matrix governs the master-slave connections in the address phase
// It is indexed by addressPhaseGrantSM(slave)(master)
// It is guaranteed to have at most one 'true' per column and per row
val addressPhaseGrantSM = Wire(Vec(nSlaves, Vec(nMasters, Bool())))
// This matrix governs the master-slave connections in the data phase
// It is guaranteed to have at most one 'true' per column and per row
val dataPhaseGrantSM = Reg (init = Vec.fill(nSlaves)(Vec.fill(nMasters)(Bool(false))))
// This matrix is the union of the address and data phases.
// It is transposed with respect to the two previous matrices.
// It is guaranteed to contain at most one 'true' per master row.
// However, two 'true's per slave column are permitted.
val unionGrantMS = Vec.tabulate(nMasters) { m => Vec.tabulate(nSlaves) { s =>
addressPhaseGrantSM(s)(m) || dataPhaseGrantSM(s)(m) } }
// Confirm the guarantees made above
def justOnce(v: Vec[Bool]) = v.fold(Bool(false)) { case (p, v) =>
assert (!p || !v)
p || v
}
addressPhaseGrantSM foreach { s => justOnce(s) }
unionGrantMS foreach { s => justOnce(s) }
// Data phase follows address phase whenever the slave is ready
(slaves zip (dataPhaseGrantSM zip addressPhaseGrantSM)) foreach { case (s, (d, a)) =>
when (s.hready) { d := a }
}
// Record the grant state from the previous cycle; needed in case we hold access
val priorAddressPhaseGrantSM = RegNext(addressPhaseGrantSM)
// If a master says BUSY or SEQ, it is in the middle of a burst.
// In this case, it MUST stay attached to the same slave as before.
// Otherwise, it would violate the AHB-lite specification as seen by
// the slave, which is guaranteed a complete burst of the promised length.
// One case where this matters is preventing preemption of low-prio masters.
// NOTE: this exposes a slave to bad addresses when a master is buggy
val holdBurstM = Vec(masters map { _.isHold() })
// Transform the burst hold requirement from master indexing to slave indexing
// We use the previous cycle's binding because the master continues the prior burst
val holdBurstS = Vec(priorAddressPhaseGrantSM map { m => Mux1H(m, holdBurstM) })
// If a slave says !hready to a request, it must retain the same master next cycle.
// The AHB-lite specification requires that a waited transfer remain unchanged.
// If we preempted a waited master, the new master's request could potentially differ.
val holdBusyS = RegNext(Vec(slaves map { s => !s.hready && s.hsel }))
// Combine the above two grounds to determine if the slave retains its prior master
val holdS = Vec((holdBurstS zip holdBusyS) map ({ case (a,b) => a||b }))
// Determine which master addresses match which slaves
val matchMS = Vec(masters map { m => Vec(addressMap map { afn => afn(m.haddr) }) })
// Detect requests to nowhere; we need to allow progress in this case
val nowhereM = Vec(matchMS map { s => !s.reduce(_ || _) })
// Detect if we need to inject a pipeline bubble between the master requests.
// Divert masters already granted a data phase different from next request.
// NOTE: if only one slave, matchMS is always true => bubble always false
// => the diversion registers are optimized away as they are unread
// NOTE: bubble => dataPhase => have an hready signal
val bubbleM =
Vec.tabulate(nMasters) { m =>
Vec.tabulate(nSlaves) { s => dataPhaseGrantSM(s)(m) && !matchMS(m)(s) }
.reduce(_ || _) }
// Block any request that requires bus ownership or conflicts with isLocked
val blockedM =
Vec((lockedM zip masters) map { case(l, m) => !l && (isLocked || m.hmastlock) })
// Requested access to slaves from masters (pre-arbitration)
// NOTE: isNSeq does NOT include SEQ; thus, masters who are midburst do not
// request access to a new slave. They stay tied to the old and do not get two.
// NOTE: if a master was waited, it must repeat the same request as last cycle;
// thus, it will request the same slave and not end up with two (unless buggy).
val NSeq = masters.map(_.isNSeq())
val requestSM = Vec.tabulate(nSlaves) { s => Vec.tabulate(nMasters) { m =>
matchMS(m)(s) && NSeq(m) && !bubbleM(m) && !blockedM(m) } }
// Select at most one master request per slave (lowest index = highest priority)
val selectedRequestSM = Vec(requestSM map { m => Vec(PriorityEncoderOH(m)) })
// Calculate new crossbar interconnect state
addressPhaseGrantSM := Vec((holdS zip (priorAddressPhaseGrantSM zip selectedRequestSM))
map { case (h, (p, r)) => Mux(h, p, r) })
for (m <- 0 until nMasters) {
// If the master is connected to a slave, the slave determines hready.
// However, if no slave is connected, report ready anyway (to allow progress) if:
// bad address (swallow request) OR idle (permit stupid masters to move FSM)
val autoready = nowhereM(m) || masters(m).isIdle()
val hready = Mux1H(unionGrantMS(m), slaves.map(_.hready ^ autoready)) ^ autoready
masters(m).hready := hready
// If we diverted a master, we need to absorb his address phase to replay later
diversions(m).io.divert := (bubbleM(m) || blockedM(m)) && NSeq(m) && hready
}
// Master muxes (address and data phase are the same)
(masters zip unionGrantMS) foreach { case (m, g) => {
m.hrdata := Mux1H(g, slaves.map(_.hrdata))
m.hresp := Mux1H(g, slaves.map(_.hresp))
} }
// Slave address phase muxes
(slaves zip addressPhaseGrantSM) foreach { case (s, g) => {
s.htrans := Mux1H(g, masters.map(_.htrans))
s.haddr := Mux1H(g, masters.map(_.haddr))
s.hmastlock := isLocked
s.hwrite := Mux1H(g, masters.map(_.hwrite))
s.hsize := Mux1H(g, masters.map(_.hsize))
s.hburst := Mux1H(g, masters.map(_.hburst))
s.hprot := Mux1H(g, masters.map(_.hprot))
s.hsel := g.reduce(_ || _)
} }
// Slave data phase muxes
(slaves zip dataPhaseGrantSM) foreach { case (s, g) => {
s.hwdata := Mux1H(g, masters.map(_.hwdata))
} }
// When no master-slave connections are active, a master can take-over the bus
val canLock = !addressPhaseGrantSM.map({ v => v.reduce(_ || _) }).reduce(_ || _)
// Lowest index highest priority for lock arbitration
val reqLock = masters.map(_.hmastlock)
val winLock = PriorityEncoderOH(reqLock)
// Lock arbitration
when (isLocked) {
lockedM := (lockedM zip reqLock) map { case (a,b) => a && b }
} .elsewhen (canLock) {
lockedM := winLock
}
}
class HastiBus(amap: Seq[UInt=>Bool])(implicit p: Parameters) extends HastiModule()(p) {
val io = new Bundle {
val master = new HastiMasterIO().flip
val slaves = Vec(amap.size, new HastiSlaveIO).flip
}
val bar = Module(new HastiXbar(1, amap))
bar.io.masters(0) <> io.master
bar.io.slaves <> io.slaves
}
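// Illustrative usage sketch (hypothetical address split): one master and
// two slaves, with the low 64KB routed to slave 0 and everything else to
// slave 1. Assumes the usual HastiKey/HastiId entries are present in p.
class TwoSlaveBus(implicit p: Parameters) extends HastiModule()(p) {
  val io = new Bundle {
    val master = new HastiMasterIO().flip
    val slaves = Vec(2, new HastiSlaveIO).flip
  }
  val amap = Seq(
    (addr: UInt) => addr < UInt(0x10000),
    (addr: UInt) => addr >= UInt(0x10000))
  val bus = Module(new HastiBus(amap))
  bus.io.master <> io.master
  bus.io.slaves <> io.slaves
}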
class HastiSlaveMux(n: Int)(implicit p: Parameters) extends HastiModule()(p) {
val io = new Bundle {
val ins = Vec(n, new HastiSlaveIO)
val out = new HastiSlaveIO().flip
}
val amap = Seq({ (_:UInt) => Bool(true)})
val bar = Module(new HastiXbar(n, amap))
io.ins <> bar.io.masters
io.out <> bar.io.slaves(0)
}
class HastiSlaveToMaster(implicit p: Parameters) extends HastiModule()(p) {
val io = new Bundle {
val in = new HastiSlaveIO
val out = new HastiMasterIO
}
io.out.htrans := Mux(io.in.hsel, io.in.htrans, HTRANS_IDLE)
io.out.hmastlock := io.in.hmastlock
io.out.haddr := io.in.haddr
io.out.hwrite := io.in.hwrite
io.out.hburst := io.in.hburst
io.out.hsize := io.in.hsize
io.out.hprot := io.in.hprot
io.out.hwdata := io.in.hwdata
io.in.hrdata := io.out.hrdata
io.in.hready := io.out.hready
io.in.hresp := io.out.hresp
}
class HastiMasterIONastiIOConverter(implicit p: Parameters) extends HastiModule()(p)
with HasNastiParameters {
val io = new Bundle {
val nasti = new NastiIO().flip
val hasti = new HastiMasterIO
}
require(hastiAddrBits == nastiXAddrBits)
require(hastiDataBits == nastiXDataBits)
val r_queue = Module(new Queue(new NastiReadDataChannel, 2, pipe = true))
val s_idle :: s_read :: s_write :: s_write_resp :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_idle)
val addr = Reg(UInt(width = hastiAddrBits))
val id = Reg(UInt(width = nastiXIdBits))
val size = Reg(UInt(width = nastiXSizeBits))
val len = Reg(UInt(width = nastiXLenBits))
val data = Reg(UInt(width = nastiXDataBits))
val first = Reg(init = Bool(false))
val is_rtrans = (state === s_read) &&
(io.hasti.htrans === HTRANS_SEQ ||
io.hasti.htrans === HTRANS_NONSEQ)
val rvalid = RegEnable(is_rtrans, Bool(false), io.hasti.hready)
io.nasti.aw.ready := (state === s_idle)
io.nasti.ar.ready := (state === s_idle) && !io.nasti.aw.valid
io.nasti.w.ready := (state === s_write) && io.hasti.hready
io.nasti.b.valid := (state === s_write_resp)
io.nasti.b.bits := NastiWriteResponseChannel(id = id)
io.nasti.r <> r_queue.io.deq
r_queue.io.enq.valid := io.hasti.hready && rvalid
r_queue.io.enq.bits := NastiReadDataChannel(
id = id,
data = io.hasti.hrdata,
last = (len === UInt(0)))
assert(!r_queue.io.enq.valid || r_queue.io.enq.ready,
"NASTI -> HASTI converter queue overflow")
// How many read requests have we not delivered a response for yet?
val pending_count = r_queue.io.count + rvalid
io.hasti.haddr := addr
io.hasti.hsize := size
io.hasti.hwrite := (state === s_write)
io.hasti.hburst := HBURST_INCR
io.hasti.hprot := UInt(0)
io.hasti.hwdata := data
io.hasti.hmastlock := Bool(false)
io.hasti.htrans := MuxLookup(state, HTRANS_IDLE, Seq(
s_write -> Mux(io.nasti.w.valid,
Mux(first, HTRANS_NONSEQ, HTRANS_SEQ),
Mux(first, HTRANS_IDLE, HTRANS_BUSY)),
s_read -> MuxCase(HTRANS_BUSY, Seq(
first -> HTRANS_NONSEQ,
(pending_count <= UInt(1)) -> HTRANS_SEQ))))
when (io.nasti.aw.fire()) {
first := Bool(true)
addr := io.nasti.aw.bits.addr
id := io.nasti.aw.bits.id
size := io.nasti.aw.bits.size
state := s_write
}
when (io.nasti.ar.fire()) {
first := Bool(true)
addr := io.nasti.ar.bits.addr
id := io.nasti.ar.bits.id
size := io.nasti.ar.bits.size
len := io.nasti.ar.bits.len
state := s_read
}
when (io.nasti.w.fire()) {
first := Bool(false)
addr := addr + (UInt(1) << size)
data := io.nasti.w.bits.data
when (io.nasti.w.bits.last) { state := s_write_resp }
}
when (io.nasti.b.fire()) { state := s_idle }
when (is_rtrans && io.hasti.hready) {
first := Bool(false)
addr := addr + (UInt(1) << size)
len := len - UInt(1)
when (len === UInt(0)) { state := s_idle }
}
}
class HastiTestSRAM(depth: Int)(implicit p: Parameters) extends HastiModule()(p) {
val io = new HastiSlaveIO
// This is a test SRAM with random delays
val ready = LFSR16(Bool(true))(0) // replace with Bool(true) to disable the random delays
// Calculate the bitmask of which bytes are being accessed
val mask_decode = Vec.tabulate(hastiAlignment+1) (UInt(_) <= io.hsize)
val mask_wide = Vec.tabulate(hastiDataBytes) { i => mask_decode(log2Up(i+1)) }
val mask_shift = if (hastiAlignment == 0) UInt(1) else
mask_wide.asUInt() << io.haddr(hastiAlignment-1,0)
// The request had better have been aligned! (AHB-lite requires this)
if (hastiAlignment >= 1) {
assert (io.htrans === HTRANS_IDLE || io.htrans === HTRANS_BUSY ||
(io.haddr & mask_decode.asUInt()(hastiAlignment,1)) === UInt(0),
"HASTI request not aligned")
}
// The mask and address during the address phase
val a_request = io.hsel && (io.htrans === HTRANS_NONSEQ || io.htrans === HTRANS_SEQ)
val a_mask = Wire(UInt(width = hastiDataBytes))
val a_address = io.haddr(depth-1, hastiAlignment)
val a_write = io.hwrite
// for backwards compatibility with chisel2, we needed a static width in definition
a_mask := mask_shift(hastiDataBytes-1, 0)
// The data phase signals
val d_read = RegEnable(a_request && !a_write, Bool(false), ready)
val d_mask = RegEnable(a_mask, ready && a_request)
val d_wdata = Vec.tabulate(hastiDataBytes) { i => io.hwdata(8*(i+1)-1, 8*i) }
// AHB writes must occur during the data phase; this poses a structural
// hazard with reads which must occur during the address phase. To solve
// this problem, we delay the writes until there is a free cycle.
//
// The idea is to record the address information from address phase and
// then as soon as possible flush the pending write. This cannot be done
// on a cycle when there is an address phase read, but on any other cycle
// the write will execute. In the case of reads following a write, the
// result must bypass data from the pending write into the read if they
// happen to have a matching address.
// Remove this once HoldUnless is in chisel3
def holdUnless[T <: Data](in : T, enable: Bool): T = Mux(!enable, RegEnable(in, enable), in)
// Pending write?
val p_valid = RegInit(Bool(false))
val p_address = Reg(a_address)
val p_mask = Reg(a_mask)
val p_latch_d = RegNext(ready && a_request && a_write, Bool(false))
val p_wdata = holdUnless(d_wdata, p_latch_d)
// Use single-ported memory with byte-write enable
val mem = SeqMem(1 << (depth-hastiAlignment), Vec(hastiDataBytes, Bits(width = 8)))
// Decide if the SRAM port is used for reading or (potentially) writing
val read = ready && a_request && !a_write
// In case we are stalled, we need to hold the read data
val d_rdata = holdUnless(mem.read(a_address, read), RegNext(read))
// Whenever the port is not needed for reading, execute pending writes
when (!read) {
when (p_valid) { mem.write(p_address, p_wdata, p_mask.toBools) }
p_valid := Bool(false)
}
// Record the request for later?
when (ready && a_request && a_write) {
p_valid := Bool(true)
p_address := a_address
p_mask := a_mask
}
// Does the read need to be muxed with the previous write?
val a_bypass = a_address === p_address && p_valid
val d_bypass = RegEnable(a_bypass, ready && a_request)
// Mux in data from the pending write
val muxdata = Vec((p_mask.toBools zip (p_wdata zip d_rdata))
map { case (m, (p, r)) => Mux(d_bypass && m, p, r) })
// Wipe out any data the master should not see (for testing)
val outdata = Vec((d_mask.toBools zip muxdata)
map { case (m, p) => Mux(d_read && ready && m, p, Bits(0)) })
// Finally, the outputs
io.hrdata := outdata.asUInt
io.hready := ready
io.hresp := HRESP_OKAY
}
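// Illustrative sketch: drive the test SRAM from a NASTI master through the
// converter above, using a single-slave HastiBus to generate hsel. The
// 16-bit depth and `NastiBackedSRAM` name are hypothetical; the usual
// Hasti/Nasti keys are assumed present in p.
class NastiBackedSRAM(implicit p: Parameters) extends HastiModule()(p) {
  val io = new Bundle {
    val nasti = (new NastiIO).flip
  }
  val conv = Module(new HastiMasterIONastiIOConverter)
  val bus = Module(new HastiBus(Seq((_: UInt) => Bool(true))))
  val sram = Module(new HastiTestSRAM(16))
  conv.io.nasti <> io.nasti
  bus.io.master <> conv.io.hasti
  bus.io.slaves(0) <> sram.io
}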

View File

@ -0,0 +1,317 @@
// See LICENSE for license details.
package junctions
import Chisel._
import scala.math._
import cde.{Parameters, Field}
case object MIFAddrBits extends Field[Int]
case object MIFDataBits extends Field[Int]
case object MIFTagBits extends Field[Int]
case object MIFDataBeats extends Field[Int]
trait HasMIFParameters {
implicit val p: Parameters
val mifTagBits = p(MIFTagBits)
val mifAddrBits = p(MIFAddrBits)
val mifDataBits = p(MIFDataBits)
val mifDataBeats = p(MIFDataBeats)
}
abstract class MIFModule(implicit val p: Parameters) extends Module with HasMIFParameters
abstract class MIFBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
with HasMIFParameters
trait HasMemData extends HasMIFParameters {
val data = Bits(width = mifDataBits)
}
trait HasMemAddr extends HasMIFParameters {
val addr = UInt(width = mifAddrBits)
}
trait HasMemTag extends HasMIFParameters {
val tag = UInt(width = mifTagBits)
}
class MemReqCmd(implicit p: Parameters) extends MIFBundle()(p) with HasMemAddr with HasMemTag {
val rw = Bool()
}
class MemTag(implicit p: Parameters) extends MIFBundle()(p) with HasMemTag
class MemData(implicit p: Parameters) extends MIFBundle()(p) with HasMemData
class MemResp(implicit p: Parameters) extends MIFBundle()(p) with HasMemData with HasMemTag
class MemIO(implicit p: Parameters) extends ParameterizedBundle()(p) {
val req_cmd = Decoupled(new MemReqCmd)
val req_data = Decoupled(new MemData)
val resp = Decoupled(new MemResp).flip
}
class MemPipeIO(implicit p: Parameters) extends ParameterizedBundle()(p) {
val req_cmd = Decoupled(new MemReqCmd)
val req_data = Decoupled(new MemData)
val resp = Valid(new MemResp).flip
}
class MemSerializedIO(w: Int)(implicit p: Parameters) extends ParameterizedBundle()(p) {
val req = Decoupled(Bits(width = w))
val resp = Valid(Bits(width = w)).flip
override def cloneType = new MemSerializedIO(w)(p).asInstanceOf[this.type]
}
class MemSerdes(w: Int)(implicit p: Parameters) extends MIFModule
{
val io = new Bundle {
val wide = new MemIO().flip
val narrow = new MemSerializedIO(w)
}
val abits = io.wide.req_cmd.bits.asUInt.getWidth
val dbits = io.wide.req_data.bits.asUInt.getWidth
val rbits = io.wide.resp.bits.getWidth
val out_buf = Reg(Bits())
val in_buf = Reg(Bits())
val s_idle :: s_read_addr :: s_write_addr :: s_write_idle :: s_write_data :: Nil = Enum(UInt(), 5)
val state = Reg(init=s_idle)
val send_cnt = Reg(init=UInt(0, log2Up((max(abits, dbits)+w-1)/w)))
val data_send_cnt = Reg(init=UInt(0, log2Up(mifDataBeats)))
val adone = io.narrow.req.ready && send_cnt === UInt((abits-1)/w)
val ddone = io.narrow.req.ready && send_cnt === UInt((dbits-1)/w)
when (io.narrow.req.valid && io.narrow.req.ready) {
send_cnt := send_cnt + UInt(1)
out_buf := out_buf >> UInt(w)
}
when (io.wide.req_cmd.valid && io.wide.req_cmd.ready) {
out_buf := io.wide.req_cmd.bits.asUInt
}
when (io.wide.req_data.valid && io.wide.req_data.ready) {
out_buf := io.wide.req_data.bits.asUInt
}
io.wide.req_cmd.ready := state === s_idle
io.wide.req_data.ready := state === s_write_idle
io.narrow.req.valid := state === s_read_addr || state === s_write_addr || state === s_write_data
io.narrow.req.bits := out_buf
when (state === s_idle && io.wide.req_cmd.valid) {
state := Mux(io.wide.req_cmd.bits.rw, s_write_addr, s_read_addr)
}
when (state === s_read_addr && adone) {
state := s_idle
send_cnt := UInt(0)
}
when (state === s_write_addr && adone) {
state := s_write_idle
send_cnt := UInt(0)
}
when (state === s_write_idle && io.wide.req_data.valid) {
state := s_write_data
}
when (state === s_write_data && ddone) {
data_send_cnt := data_send_cnt + UInt(1)
state := Mux(data_send_cnt === UInt(mifDataBeats-1), s_idle, s_write_idle)
send_cnt := UInt(0)
}
val recv_cnt = Reg(init=UInt(0, log2Up((rbits+w-1)/w)))
val data_recv_cnt = Reg(init=UInt(0, log2Up(mifDataBeats)))
val resp_val = Reg(init=Bool(false))
resp_val := Bool(false)
when (io.narrow.resp.valid) {
recv_cnt := recv_cnt + UInt(1)
when (recv_cnt === UInt((rbits-1)/w)) {
recv_cnt := UInt(0)
data_recv_cnt := data_recv_cnt + UInt(1)
resp_val := Bool(true)
}
in_buf := Cat(io.narrow.resp.bits, in_buf((rbits+w-1)/w*w-1,w))
}
io.wide.resp.valid := resp_val
io.wide.resp.bits := io.wide.resp.bits.fromBits(in_buf)
}
class MemDesserIO(w: Int)(implicit p: Parameters) extends ParameterizedBundle()(p) {
val narrow = new MemSerializedIO(w).flip
val wide = new MemIO
}
class MemDesser(w: Int)(implicit p: Parameters) extends Module // test rig side
{
val io = new MemDesserIO(w)
val abits = io.wide.req_cmd.bits.asUInt.getWidth
val dbits = io.wide.req_data.bits.asUInt.getWidth
val rbits = io.wide.resp.bits.getWidth
val mifDataBeats = p(MIFDataBeats)
require(dbits >= abits && rbits >= dbits)
val recv_cnt = Reg(init=UInt(0, log2Up((rbits+w-1)/w)))
val data_recv_cnt = Reg(init=UInt(0, log2Up(mifDataBeats)))
val adone = io.narrow.req.valid && recv_cnt === UInt((abits-1)/w)
val ddone = io.narrow.req.valid && recv_cnt === UInt((dbits-1)/w)
val rdone = io.narrow.resp.valid && recv_cnt === UInt((rbits-1)/w)
val s_cmd_recv :: s_cmd :: s_data_recv :: s_data :: s_reply :: Nil = Enum(UInt(), 5)
val state = Reg(init=s_cmd_recv)
val in_buf = Reg(Bits())
when (io.narrow.req.valid && io.narrow.req.ready || io.narrow.resp.valid) {
recv_cnt := recv_cnt + UInt(1)
in_buf := Cat(io.narrow.req.bits, in_buf((rbits+w-1)/w*w-1,w))
}
io.narrow.req.ready := state === s_cmd_recv || state === s_data_recv
when (state === s_cmd_recv && adone) {
state := s_cmd
recv_cnt := UInt(0)
}
when (state === s_cmd && io.wide.req_cmd.ready) {
state := Mux(io.wide.req_cmd.bits.rw, s_data_recv, s_reply)
}
when (state === s_data_recv && ddone) {
state := s_data
recv_cnt := UInt(0)
}
when (state === s_data && io.wide.req_data.ready) {
state := s_data_recv
when (data_recv_cnt === UInt(mifDataBeats-1)) {
state := s_cmd_recv
}
data_recv_cnt := data_recv_cnt + UInt(1)
}
when (rdone) { // state === s_reply
when (data_recv_cnt === UInt(mifDataBeats-1)) {
state := s_cmd_recv
}
recv_cnt := UInt(0)
data_recv_cnt := data_recv_cnt + UInt(1)
}
val req_cmd = in_buf >> UInt(((rbits+w-1)/w - (abits+w-1)/w)*w)
io.wide.req_cmd.valid := state === s_cmd
io.wide.req_cmd.bits := io.wide.req_cmd.bits.fromBits(req_cmd)
io.wide.req_data.valid := state === s_data
io.wide.req_data.bits.data := in_buf >> UInt(((rbits+w-1)/w - (dbits+w-1)/w)*w)
val dataq = Module(new Queue(new MemResp, mifDataBeats))
dataq.io.enq <> io.wide.resp
dataq.io.deq.ready := recv_cnt === UInt((rbits-1)/w)
io.narrow.resp.valid := dataq.io.deq.valid
io.narrow.resp.bits := dataq.io.deq.bits.asUInt >> (recv_cnt * UInt(w))
}
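// Illustrative sketch: pair a MemSerdes with a MemDesser to carry MemIO
// over a 16-bit link; responses flow back without backpressure (Valid
// only). `MemOverSerial` is a hypothetical name.
class MemOverSerial(implicit p: Parameters) extends MIFModule {
  val io = new Bundle {
    val cpu = new MemIO().flip
    val mem = new MemIO
  }
  val ser = Module(new MemSerdes(16))
  val des = Module(new MemDesser(16))
  ser.io.wide <> io.cpu
  des.io.narrow <> ser.io.narrow
  io.mem <> des.io.wide
}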
class MemIOArbiter(val arbN: Int)(implicit p: Parameters) extends MIFModule {
val io = new Bundle {
val inner = Vec(arbN, new MemIO).flip
val outer = new MemIO
}
if(arbN > 1) {
val cmd_arb = Module(new RRArbiter(new MemReqCmd, arbN))
val choice_q = Module(new Queue(cmd_arb.io.chosen, 4))
val (data_cnt, data_done) = Counter(io.outer.req_data.fire(), mifDataBeats)
io.inner.map(_.req_cmd).zipWithIndex.zip(cmd_arb.io.in).map{ case ((req, id), arb) => {
arb.valid := req.valid
arb.bits := req.bits
arb.bits.tag := Cat(req.bits.tag, UInt(id))
req.ready := arb.ready
}}
io.outer.req_cmd.bits := cmd_arb.io.out.bits
io.outer.req_cmd.valid := cmd_arb.io.out.valid && choice_q.io.enq.ready
cmd_arb.io.out.ready := io.outer.req_cmd.ready && choice_q.io.enq.ready
choice_q.io.enq.bits := cmd_arb.io.chosen
choice_q.io.enq.valid := cmd_arb.io.out.fire() && cmd_arb.io.out.bits.rw
io.outer.req_data.bits := io.inner(choice_q.io.deq.bits).req_data.bits
io.outer.req_data.valid := io.inner(choice_q.io.deq.bits).req_data.valid && choice_q.io.deq.valid
io.inner.map(_.req_data.ready).zipWithIndex.foreach {
case(r, i) => r := UInt(i) === choice_q.io.deq.bits && choice_q.io.deq.valid
}
choice_q.io.deq.ready := data_done
io.outer.resp.ready := Bool(false)
for (i <- 0 until arbN) {
io.inner(i).resp.valid := Bool(false)
when(io.outer.resp.bits.tag(log2Up(arbN)-1,0) === UInt(i)) {
io.inner(i).resp.valid := io.outer.resp.valid
io.outer.resp.ready := io.inner(i).resp.ready
}
io.inner(i).resp.bits := io.outer.resp.bits
io.inner(i).resp.bits.tag := io.outer.resp.bits.tag >> UInt(log2Up(arbN))
}
} else { io.outer <> io.inner.head }
}
object MemIOMemPipeIOConverter {
def apply(in: MemPipeIO)(implicit p: Parameters): MemIO = {
val out = Wire(new MemIO())
in.resp.valid := out.resp.valid
in.resp.bits := out.resp.bits
out.resp.ready := Bool(true)
out.req_cmd.valid := in.req_cmd.valid
out.req_cmd.bits := in.req_cmd.bits
in.req_cmd.ready := out.req_cmd.ready
out.req_data.valid := in.req_data.valid
out.req_data.bits := in.req_data.bits
in.req_data.ready := out.req_data.ready
out
}
}
class MemPipeIOMemIOConverter(numRequests: Int)(implicit p: Parameters) extends MIFModule {
val io = new Bundle {
val cpu = new MemIO().flip
val mem = new MemPipeIO
}
val numEntries = numRequests * mifDataBeats
val size = log2Down(numEntries) + 1
val inc = Wire(Bool())
val dec = Wire(Bool())
val count = Reg(init=UInt(numEntries, size))
val watermark = count >= UInt(mifDataBeats)
when (inc && !dec) {
count := count + UInt(1)
}
when (!inc && dec) {
count := count - UInt(mifDataBeats)
}
when (inc && dec) {
count := count - UInt(mifDataBeats-1)
}
val cmdq_mask = io.cpu.req_cmd.bits.rw || watermark
io.mem.req_cmd.valid := io.cpu.req_cmd.valid && cmdq_mask
io.cpu.req_cmd.ready := io.mem.req_cmd.ready && cmdq_mask
io.mem.req_cmd.bits := io.cpu.req_cmd.bits
io.mem.req_data <> io.cpu.req_data
// Have separate queues to allow for different mem implementations
val resp_data_q = Module((new HellaQueue(numEntries)) { new MemData })
resp_data_q.io.enq.valid := io.mem.resp.valid
resp_data_q.io.enq.bits.data := io.mem.resp.bits.data
val resp_tag_q = Module((new HellaQueue(numEntries)) { new MemTag })
resp_tag_q.io.enq.valid := io.mem.resp.valid
resp_tag_q.io.enq.bits.tag := io.mem.resp.bits.tag
io.cpu.resp.valid := resp_data_q.io.deq.valid && resp_tag_q.io.deq.valid
io.cpu.resp.bits.data := resp_data_q.io.deq.bits.data
io.cpu.resp.bits.tag := resp_tag_q.io.deq.bits.tag
resp_data_q.io.deq.ready := io.cpu.resp.ready
resp_tag_q.io.deq.ready := io.cpu.resp.ready
inc := resp_data_q.io.deq.fire() && resp_tag_q.io.deq.fire()
dec := io.mem.req_cmd.fire() && !io.mem.req_cmd.bits.rw
}
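// Illustrative usage sketch: share one outer MemIO between two clients;
// the arbiter widens tags by log2Up(2) = 1 bit to route responses back.
// `TwoClientMem` is a hypothetical name.
class TwoClientMem(implicit p: Parameters) extends MIFModule {
  val io = new Bundle {
    val a = new MemIO().flip
    val b = new MemIO().flip
    val mem = new MemIO
  }
  val arb = Module(new MemIOArbiter(2))
  arb.io.inner(0) <> io.a
  arb.io.inner(1) <> io.b
  io.mem <> arb.io.outer
}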

View File

@ -0,0 +1,737 @@
/// See LICENSE for license details.
package junctions
import Chisel._
import scala.math.max
import scala.collection.mutable.ArraySeq
import cde.{Parameters, Field}
case object NastiKey extends Field[NastiParameters]
case class NastiParameters(dataBits: Int, addrBits: Int, idBits: Int)
trait HasNastiParameters {
implicit val p: Parameters
val nastiExternal = p(NastiKey)
val nastiXDataBits = nastiExternal.dataBits
val nastiWStrobeBits = nastiXDataBits / 8
val nastiXAddrBits = nastiExternal.addrBits
val nastiWIdBits = nastiExternal.idBits
val nastiRIdBits = nastiExternal.idBits
val nastiXIdBits = max(nastiWIdBits, nastiRIdBits)
val nastiXUserBits = 1
val nastiAWUserBits = nastiXUserBits
val nastiWUserBits = nastiXUserBits
val nastiBUserBits = nastiXUserBits
val nastiARUserBits = nastiXUserBits
val nastiRUserBits = nastiXUserBits
val nastiXLenBits = 8
val nastiXSizeBits = 3
val nastiXBurstBits = 2
val nastiXCacheBits = 4
val nastiXProtBits = 3
val nastiXQosBits = 4
val nastiXRegionBits = 4
val nastiXRespBits = 2
def bytesToXSize(bytes: UInt) = MuxLookup(bytes, UInt("b111"), Array(
UInt(1) -> UInt(0),
UInt(2) -> UInt(1),
UInt(4) -> UInt(2),
UInt(8) -> UInt(3),
UInt(16) -> UInt(4),
UInt(32) -> UInt(5),
UInt(64) -> UInt(6),
UInt(128) -> UInt(7)))
}
abstract class NastiModule(implicit val p: Parameters) extends Module
with HasNastiParameters
abstract class NastiBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
with HasNastiParameters
abstract class NastiChannel(implicit p: Parameters) extends NastiBundle()(p)
abstract class NastiMasterToSlaveChannel(implicit p: Parameters) extends NastiChannel()(p)
abstract class NastiSlaveToMasterChannel(implicit p: Parameters) extends NastiChannel()(p)
trait HasNastiMetadata extends HasNastiParameters {
val addr = UInt(width = nastiXAddrBits)
val len = UInt(width = nastiXLenBits)
val size = UInt(width = nastiXSizeBits)
val burst = UInt(width = nastiXBurstBits)
val lock = Bool()
val cache = UInt(width = nastiXCacheBits)
val prot = UInt(width = nastiXProtBits)
val qos = UInt(width = nastiXQosBits)
val region = UInt(width = nastiXRegionBits)
}
trait HasNastiData extends HasNastiParameters {
val data = UInt(width = nastiXDataBits)
val last = Bool()
}
class NastiReadIO(implicit val p: Parameters) extends ParameterizedBundle()(p) {
val ar = Decoupled(new NastiReadAddressChannel)
val r = Decoupled(new NastiReadDataChannel).flip
}
class NastiWriteIO(implicit val p: Parameters) extends ParameterizedBundle()(p) {
val aw = Decoupled(new NastiWriteAddressChannel)
val w = Decoupled(new NastiWriteDataChannel)
val b = Decoupled(new NastiWriteResponseChannel).flip
}
class NastiIO(implicit val p: Parameters) extends ParameterizedBundle()(p) {
val aw = Decoupled(new NastiWriteAddressChannel)
val w = Decoupled(new NastiWriteDataChannel)
val b = Decoupled(new NastiWriteResponseChannel).flip
val ar = Decoupled(new NastiReadAddressChannel)
val r = Decoupled(new NastiReadDataChannel).flip
}
class NastiAddressChannel(implicit p: Parameters) extends NastiMasterToSlaveChannel()(p)
with HasNastiMetadata
class NastiResponseChannel(implicit p: Parameters) extends NastiSlaveToMasterChannel()(p) {
val resp = UInt(width = nastiXRespBits)
}
class NastiWriteAddressChannel(implicit p: Parameters) extends NastiAddressChannel()(p) {
val id = UInt(width = nastiWIdBits)
val user = UInt(width = nastiAWUserBits)
}
class NastiWriteDataChannel(implicit p: Parameters) extends NastiMasterToSlaveChannel()(p)
with HasNastiData {
val id = UInt(width = nastiWIdBits)
val strb = UInt(width = nastiWStrobeBits)
val user = UInt(width = nastiWUserBits)
}
class NastiWriteResponseChannel(implicit p: Parameters) extends NastiResponseChannel()(p) {
val id = UInt(width = nastiWIdBits)
val user = UInt(width = nastiBUserBits)
}
class NastiReadAddressChannel(implicit p: Parameters) extends NastiAddressChannel()(p) {
val id = UInt(width = nastiRIdBits)
val user = UInt(width = nastiARUserBits)
}
class NastiReadDataChannel(implicit p: Parameters) extends NastiResponseChannel()(p)
with HasNastiData {
val id = UInt(width = nastiRIdBits)
val user = UInt(width = nastiRUserBits)
}
object NastiConstants {
val BURST_FIXED = UInt("b00")
val BURST_INCR = UInt("b01")
val BURST_WRAP = UInt("b10")
val RESP_OKAY = UInt("b00")
val RESP_EXOKAY = UInt("b01")
val RESP_SLVERR = UInt("b10")
val RESP_DECERR = UInt("b11")
}
import NastiConstants._
object NastiWriteAddressChannel {
def apply(id: UInt, addr: UInt, size: UInt,
len: UInt = UInt(0), burst: UInt = BURST_INCR)
(implicit p: Parameters) = {
val aw = Wire(new NastiWriteAddressChannel)
aw.id := id
aw.addr := addr
aw.len := len
aw.size := size
aw.burst := burst
aw.lock := Bool(false)
aw.cache := UInt("b0000")
aw.prot := UInt("b000")
aw.qos := UInt("b0000")
aw.region := UInt("b0000")
aw.user := UInt(0)
aw
}
}
object NastiReadAddressChannel {
def apply(id: UInt, addr: UInt, size: UInt,
len: UInt = UInt(0), burst: UInt = BURST_INCR)
(implicit p: Parameters) = {
val ar = Wire(new NastiReadAddressChannel)
ar.id := id
ar.addr := addr
ar.len := len
ar.size := size
ar.burst := burst
ar.lock := Bool(false)
ar.cache := UInt(0)
ar.prot := UInt(0)
ar.qos := UInt(0)
ar.region := UInt(0)
ar.user := UInt(0)
ar
}
}
object NastiWriteDataChannel {
def apply(data: UInt, strb: Option[UInt] = None,
last: Bool = Bool(true), id: UInt = UInt(0))
(implicit p: Parameters): NastiWriteDataChannel = {
val w = Wire(new NastiWriteDataChannel)
w.strb := strb.getOrElse(Fill(w.nastiWStrobeBits, UInt(1, 1)))
w.data := data
w.last := last
w.id := id
w.user := UInt(0)
w
}
}
object NastiReadDataChannel {
def apply(id: UInt, data: UInt, last: Bool = Bool(true), resp: UInt = UInt(0))(
implicit p: Parameters) = {
val r = Wire(new NastiReadDataChannel)
r.id := id
r.data := data
r.last := last
r.resp := resp
r.user := UInt(0)
r
}
}
object NastiWriteResponseChannel {
def apply(id: UInt, resp: UInt = UInt(0))(implicit p: Parameters) = {
val b = Wire(new NastiWriteResponseChannel)
b.id := id
b.resp := resp
b.user := UInt(0)
b
}
}
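// Illustrative only: a minimal sketch of a master driving a single-beat
// write through the channel factories above. The module name and the fixed
// address/data are hypothetical; a real master would sequence the
// valid/ready handshakes through a state machine rather than tying valid high.
class NastiWriteExample(implicit p: Parameters) extends NastiModule {
  val io = new Bundle { val nasti = new NastiIO }
  io.nasti.aw.valid := Bool(true)
  io.nasti.aw.bits := NastiWriteAddressChannel(
    id = UInt(0), addr = UInt(0x1000), size = UInt(3)) // one 8-byte beat
  io.nasti.w.valid := Bool(true)
  io.nasti.w.bits := NastiWriteDataChannel(data = UInt(0xdeadbeefL))
  io.nasti.b.ready := Bool(true)
  // Tie off the unused read channels.
  io.nasti.ar.valid := Bool(false)
  io.nasti.ar.bits := NastiReadAddressChannel(
    id = UInt(0), addr = UInt(0), size = UInt(3))
  io.nasti.r.ready := Bool(false)
}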
class MemIONastiIOConverter(cacheBlockOffsetBits: Int)(implicit p: Parameters) extends MIFModule
with HasNastiParameters {
val io = new Bundle {
val nasti = (new NastiIO).flip
val mem = new MemIO
}
require(mifDataBits == nastiXDataBits, "Data sizes between LLC and MC don't agree")
val (mif_cnt_out, mif_wrap_out) = Counter(io.mem.resp.fire(), mifDataBeats)
assert(!io.nasti.aw.valid || io.nasti.aw.bits.size === UInt(log2Up(mifDataBits/8)),
"Nasti data size does not match MemIO data size")
assert(!io.nasti.ar.valid || io.nasti.ar.bits.size === UInt(log2Up(mifDataBits/8)),
"Nasti data size does not match MemIO data size")
assert(!io.nasti.aw.valid || io.nasti.aw.bits.len === UInt(mifDataBeats - 1),
"Nasti length does not match number of MemIO beats")
assert(!io.nasti.ar.valid || io.nasti.ar.bits.len === UInt(mifDataBeats - 1),
"Nasti length does not match number of MemIO beats")
// according to the spec, we can't send b until the last transfer on w
val b_ok = Reg(init = Bool(true))
when (io.nasti.aw.fire()) { b_ok := Bool(false) }
when (io.nasti.w.fire() && io.nasti.w.bits.last) { b_ok := Bool(true) }
val id_q = Module(new Queue(UInt(width = nastiWIdBits), 2))
id_q.io.enq.valid := io.nasti.aw.valid && io.mem.req_cmd.ready
id_q.io.enq.bits := io.nasti.aw.bits.id
id_q.io.deq.ready := io.nasti.b.ready && b_ok
io.mem.req_cmd.bits.addr := Mux(io.nasti.aw.valid, io.nasti.aw.bits.addr, io.nasti.ar.bits.addr) >>
UInt(cacheBlockOffsetBits)
io.mem.req_cmd.bits.tag := Mux(io.nasti.aw.valid, io.nasti.aw.bits.id, io.nasti.ar.bits.id)
io.mem.req_cmd.bits.rw := io.nasti.aw.valid
io.mem.req_cmd.valid := (io.nasti.aw.valid && id_q.io.enq.ready) || io.nasti.ar.valid
io.nasti.ar.ready := io.mem.req_cmd.ready && !io.nasti.aw.valid
io.nasti.aw.ready := io.mem.req_cmd.ready && id_q.io.enq.ready
io.nasti.b.valid := id_q.io.deq.valid && b_ok
io.nasti.b.bits.id := id_q.io.deq.bits
io.nasti.b.bits.resp := UInt(0)
io.nasti.w.ready := io.mem.req_data.ready
io.mem.req_data.valid := io.nasti.w.valid
io.mem.req_data.bits.data := io.nasti.w.bits.data
assert(!io.nasti.w.valid || io.nasti.w.bits.strb.andR, "MemIO must write full cache line")
io.nasti.r.valid := io.mem.resp.valid
io.nasti.r.bits.data := io.mem.resp.bits.data
io.nasti.r.bits.last := mif_wrap_out
io.nasti.r.bits.id := io.mem.resp.bits.tag
io.nasti.r.bits.resp := UInt(0)
io.mem.resp.ready := io.nasti.r.ready
}
class NastiArbiterIO(arbN: Int)(implicit p: Parameters) extends Bundle {
val master = Vec(arbN, new NastiIO).flip
val slave = new NastiIO
override def cloneType =
new NastiArbiterIO(arbN).asInstanceOf[this.type]
}
/** Arbitrate among arbN masters requesting access to a single slave */
class NastiArbiter(val arbN: Int)(implicit p: Parameters) extends NastiModule {
val io = new NastiArbiterIO(arbN)
if (arbN > 1) {
val arbIdBits = log2Up(arbN)
val ar_arb = Module(new RRArbiter(new NastiReadAddressChannel, arbN))
val aw_arb = Module(new RRArbiter(new NastiWriteAddressChannel, arbN))
val slave_r_arb_id = io.slave.r.bits.id(arbIdBits - 1, 0)
val slave_b_arb_id = io.slave.b.bits.id(arbIdBits - 1, 0)
val w_chosen = Reg(UInt(width = arbIdBits))
val w_done = Reg(init = Bool(true))
when (aw_arb.io.out.fire()) {
w_chosen := aw_arb.io.chosen
w_done := Bool(false)
}
when (io.slave.w.fire() && io.slave.w.bits.last) {
w_done := Bool(true)
}
for (i <- 0 until arbN) {
val m_ar = io.master(i).ar
val m_aw = io.master(i).aw
val m_r = io.master(i).r
val m_b = io.master(i).b
val a_ar = ar_arb.io.in(i)
val a_aw = aw_arb.io.in(i)
val m_w = io.master(i).w
a_ar <> m_ar
a_ar.bits.id := Cat(m_ar.bits.id, UInt(i, arbIdBits))
a_aw <> m_aw
a_aw.bits.id := Cat(m_aw.bits.id, UInt(i, arbIdBits))
m_r.valid := io.slave.r.valid && slave_r_arb_id === UInt(i)
m_r.bits := io.slave.r.bits
m_r.bits.id := io.slave.r.bits.id >> UInt(arbIdBits)
m_b.valid := io.slave.b.valid && slave_b_arb_id === UInt(i)
m_b.bits := io.slave.b.bits
m_b.bits.id := io.slave.b.bits.id >> UInt(arbIdBits)
m_w.ready := io.slave.w.ready && w_chosen === UInt(i) && !w_done
}
io.slave.r.ready := io.master(slave_r_arb_id).r.ready
io.slave.b.ready := io.master(slave_b_arb_id).b.ready
io.slave.w.bits := io.master(w_chosen).w.bits
io.slave.w.valid := io.master(w_chosen).w.valid && !w_done
io.slave.ar <> ar_arb.io.out
io.slave.aw.bits <> aw_arb.io.out.bits
io.slave.aw.valid := aw_arb.io.out.valid && w_done
aw_arb.io.out.ready := io.slave.aw.ready && w_done
} else { io.slave <> io.master.head }
}
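// Illustrative only: a minimal sketch of funneling two masters into a single
// slave port through the arbiter; the module and port names are hypothetical.
class TwoMasterArbExample(implicit p: Parameters) extends NastiModule {
  val io = new Bundle {
    val masters = Vec(2, new NastiIO).flip
    val slave = new NastiIO
  }
  val arb = Module(new NastiArbiter(2))
  arb.io.master <> io.masters
  io.slave <> arb.io.slave
}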
/** A slave that sends a decode error for every request it receives */
class NastiErrorSlave(implicit p: Parameters) extends NastiModule {
val io = (new NastiIO).flip
when (io.ar.fire()) { printf("Invalid read address %x\n", io.ar.bits.addr) }
when (io.aw.fire()) { printf("Invalid write address %x\n", io.aw.bits.addr) }
val r_queue = Module(new Queue(new NastiReadAddressChannel, 1))
r_queue.io.enq <> io.ar
val responding = Reg(init = Bool(false))
val beats_left = Reg(init = UInt(0, nastiXLenBits))
when (!responding && r_queue.io.deq.valid) {
responding := Bool(true)
beats_left := r_queue.io.deq.bits.len
}
io.r.valid := r_queue.io.deq.valid && responding
io.r.bits.id := r_queue.io.deq.bits.id
io.r.bits.data := UInt(0)
io.r.bits.resp := RESP_DECERR
io.r.bits.last := beats_left === UInt(0)
r_queue.io.deq.ready := io.r.fire() && io.r.bits.last
when (io.r.fire()) {
when (beats_left === UInt(0)) {
responding := Bool(false)
} .otherwise {
beats_left := beats_left - UInt(1)
}
}
val draining = Reg(init = Bool(false))
io.w.ready := draining
when (io.aw.fire()) { draining := Bool(true) }
when (io.w.fire() && io.w.bits.last) { draining := Bool(false) }
val b_queue = Module(new Queue(UInt(width = nastiWIdBits), 1))
b_queue.io.enq.valid := io.aw.valid && !draining
b_queue.io.enq.bits := io.aw.bits.id
io.aw.ready := b_queue.io.enq.ready && !draining
io.b.valid := b_queue.io.deq.valid && !draining
io.b.bits.id := b_queue.io.deq.bits
io.b.bits.resp := RESP_DECERR
b_queue.io.deq.ready := io.b.ready && !draining
}
class NastiRouterIO(nSlaves: Int)(implicit p: Parameters) extends Bundle {
val master = (new NastiIO).flip
val slave = Vec(nSlaves, new NastiIO)
override def cloneType =
new NastiRouterIO(nSlaves).asInstanceOf[this.type]
}
/** Take a single Nasti master and route its requests to various slaves
* @param nSlaves the number of slaves
* @param routeSel a function which takes an address and produces
* a one-hot encoded selection of the slave to write to */
class NastiRouter(nSlaves: Int, routeSel: UInt => UInt)(implicit p: Parameters)
extends NastiModule {
val io = new NastiRouterIO(nSlaves)
val ar_route = routeSel(io.master.ar.bits.addr)
val aw_route = routeSel(io.master.aw.bits.addr)
var ar_ready = Bool(false)
var aw_ready = Bool(false)
var w_ready = Bool(false)
io.slave.zipWithIndex.foreach { case (s, i) =>
s.ar.valid := io.master.ar.valid && ar_route(i)
s.ar.bits := io.master.ar.bits
ar_ready = ar_ready || (s.ar.ready && ar_route(i))
s.aw.valid := io.master.aw.valid && aw_route(i)
s.aw.bits := io.master.aw.bits
aw_ready = aw_ready || (s.aw.ready && aw_route(i))
val chosen = Reg(init = Bool(false))
when (s.w.fire() && s.w.bits.last) { chosen := Bool(false) }
when (s.aw.fire()) { chosen := Bool(true) }
s.w.valid := io.master.w.valid && chosen
s.w.bits := io.master.w.bits
w_ready = w_ready || (s.w.ready && chosen)
}
val r_invalid = !ar_route.orR
val w_invalid = !aw_route.orR
val err_slave = Module(new NastiErrorSlave)
err_slave.io.ar.valid := r_invalid && io.master.ar.valid
err_slave.io.ar.bits := io.master.ar.bits
err_slave.io.aw.valid := w_invalid && io.master.aw.valid
err_slave.io.aw.bits := io.master.aw.bits
err_slave.io.w.valid := io.master.w.valid
err_slave.io.w.bits := io.master.w.bits
io.master.ar.ready := ar_ready || (r_invalid && err_slave.io.ar.ready)
io.master.aw.ready := aw_ready || (w_invalid && err_slave.io.aw.ready)
io.master.w.ready := w_ready || err_slave.io.w.ready
val b_arb = Module(new RRArbiter(new NastiWriteResponseChannel, nSlaves + 1))
val r_arb = Module(new JunctionsPeekingArbiter(
new NastiReadDataChannel, nSlaves + 1,
// we can unlock if it's the last beat
(r: NastiReadDataChannel) => r.last))
for (i <- 0 until nSlaves) {
b_arb.io.in(i) <> io.slave(i).b
r_arb.io.in(i) <> io.slave(i).r
}
b_arb.io.in(nSlaves) <> err_slave.io.b
r_arb.io.in(nSlaves) <> err_slave.io.r
io.master.b <> b_arb.io.out
io.master.r <> r_arb.io.out
}
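// Illustrative only: a minimal sketch of a two-slave router. The address map
// (slave 0 below 0x1000, slave 1 at or above it) is hypothetical; routeSel
// just has to return a one-hot selection, with bit i selecting slave i.
class TwoSlaveRouterExample(implicit p: Parameters) extends NastiModule {
  val io = new Bundle {
    val master = (new NastiIO).flip
    val slaves = Vec(2, new NastiIO)
  }
  val routeSel = (addr: UInt) =>
    Cat(addr >= UInt(0x1000), addr < UInt(0x1000))
  val router = Module(new NastiRouter(2, routeSel))
  router.io.master <> io.master
  io.slaves <> router.io.slave
}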
/** Crossbar between multiple Nasti masters and slaves
* @param nMasters the number of Nasti masters
* @param nSlaves the number of Nasti slaves
* @param routeSel a function selecting the slave to route an address to */
class NastiCrossbar(nMasters: Int, nSlaves: Int, routeSel: UInt => UInt)
(implicit p: Parameters) extends NastiModule {
val io = new Bundle {
val masters = Vec(nMasters, new NastiIO).flip
val slaves = Vec(nSlaves, new NastiIO)
}
if (nMasters == 1) {
val router = Module(new NastiRouter(nSlaves, routeSel))
router.io.master <> io.masters.head
io.slaves <> router.io.slave
} else {
val routers = Vec.fill(nMasters) { Module(new NastiRouter(nSlaves, routeSel)).io }
val arbiters = Vec.fill(nSlaves) { Module(new NastiArbiter(nMasters)).io }
for (i <- 0 until nMasters) {
routers(i).master <> io.masters(i)
}
for (i <- 0 until nSlaves) {
arbiters(i).master <> Vec(routers.map(r => r.slave(i)))
io.slaves(i) <> arbiters(i).slave
}
}
}
class NastiInterconnectIO(val nMasters: Int, val nSlaves: Int)
(implicit p: Parameters) extends Bundle {
/* The interconnect acts as a slave to the masters and as a master to the
 * slaves, which is why the port directions below appear reversed. */
val masters = Vec(nMasters, new NastiIO).flip
val slaves = Vec(nSlaves, new NastiIO)
override def cloneType =
new NastiInterconnectIO(nMasters, nSlaves).asInstanceOf[this.type]
}
abstract class NastiInterconnect(implicit p: Parameters) extends NastiModule()(p) {
val nMasters: Int
val nSlaves: Int
lazy val io = new NastiInterconnectIO(nMasters, nSlaves)
}
class NastiRecursiveInterconnect(val nMasters: Int, addrMap: AddrMap)
(implicit p: Parameters) extends NastiInterconnect()(p) {
def port(name: String) = io.slaves(addrMap.port(name))
val nSlaves = addrMap.numSlaves
val routeSel = (addr: UInt) =>
Cat(addrMap.entries.map(e => addrMap(e.name).containsAddress(addr)).reverse)
val xbar = Module(new NastiCrossbar(nMasters, addrMap.length, routeSel))
xbar.io.masters <> io.masters
io.slaves <> addrMap.entries.zip(xbar.io.slaves).flatMap {
case (entry, xbarSlave) => {
entry.region match {
case submap: AddrMap if submap.entries.isEmpty =>
val err_slave = Module(new NastiErrorSlave)
err_slave.io <> xbarSlave
None
case submap: AddrMap =>
val ic = Module(new NastiRecursiveInterconnect(1, submap))
ic.io.masters.head <> xbarSlave
ic.io.slaves
case r: MemRange =>
Some(xbarSlave)
}
}
}
}
class ChannelHelper(nChannels: Int)
(implicit val p: Parameters) extends HasNastiParameters {
val dataBytes = p(MIFDataBits) * p(MIFDataBeats) / 8
val chanSelBits = log2Ceil(nChannels)
val selOffset = log2Up(dataBytes)
val blockOffset = selOffset + chanSelBits
def getSelect(addr: UInt) =
if (nChannels > 1) addr(blockOffset - 1, selOffset) else UInt(0)
def getAddr(addr: UInt) =
if (nChannels > 1)
Cat(addr(nastiXAddrBits - 1, blockOffset), addr(selOffset - 1, 0))
else addr
}
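// Worked example (illustrative): with nChannels = 2 and a 64-byte block
// (dataBytes = 64), selOffset = 6 and chanSelBits = 1, so getSelect picks
// address bit 6 as the channel index and getAddr splices that bit out,
// leaving a dense per-channel address. Consecutive 64-byte blocks therefore
// alternate between the two channels.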
class NastiMemoryInterconnect(
nBanksPerChannel: Int, nChannels: Int)
(implicit p: Parameters) extends NastiInterconnect()(p) {
val nBanks = nBanksPerChannel * nChannels
val nMasters = nBanks
val nSlaves = nChannels
val chanHelper = new ChannelHelper(nChannels)
def connectChannel(outer: NastiIO, inner: NastiIO) {
outer <> inner
outer.ar.bits.addr := chanHelper.getAddr(inner.ar.bits.addr)
outer.aw.bits.addr := chanHelper.getAddr(inner.aw.bits.addr)
}
for (i <- 0 until nChannels) {
/* Bank assignments to channels are strided so that consecutive banks
* map to different channels. That way, consecutive cache lines also
* map to different channels */
val banks = (i until nBanks by nChannels).map(j => io.masters(j))
val channelArb = Module(new NastiArbiter(nBanksPerChannel))
channelArb.io.master <> banks
connectChannel(io.slaves(i), channelArb.io.slave)
}
}
/** Allows users to switch between various memory configurations. Note that
* this is a dangerous operation: not only does switching the select input to
* this module violate Nasti, it also causes the memory of the machine to
* become garbled. It's expected that select only changes at boot time, as
* part of the memory controller configuration. */
class NastiMemorySelectorIO(val nBanks: Int, val maxMemChannels: Int, nConfigs: Int)
(implicit p: Parameters)
extends NastiInterconnectIO(nBanks, maxMemChannels) {
val select = UInt(INPUT, width = log2Up(nConfigs))
override def cloneType =
new NastiMemorySelectorIO(nMasters, nSlaves, nConfigs).asInstanceOf[this.type]
}
class NastiMemorySelector(nBanks: Int, maxMemChannels: Int, configs: Seq[Int])
(implicit p: Parameters)
extends NastiInterconnect()(p) {
val nMasters = nBanks
val nSlaves = maxMemChannels
val nConfigs = configs.size
override lazy val io = new NastiMemorySelectorIO(nBanks, maxMemChannels, nConfigs)
def muxOnSelect(up: DecoupledIO[Bundle], dn: DecoupledIO[Bundle], active: Bool): Unit = {
when (active) { dn.bits := up.bits }
when (active) { up.ready := dn.ready }
when (active) { dn.valid := up.valid }
}
def muxOnSelect(up: NastiIO, dn: NastiIO, active: Bool): Unit = {
muxOnSelect(up.aw, dn.aw, active)
muxOnSelect(up.w, dn.w, active)
muxOnSelect(dn.b, up.b, active)
muxOnSelect(up.ar, dn.ar, active)
muxOnSelect(dn.r, up.r, active)
}
def muxOnSelect(up: Vec[NastiIO], dn: Vec[NastiIO], active: Bool) : Unit = {
for (i <- 0 until up.size)
muxOnSelect(up(i), dn(i), active)
}
/* Disconnects a vector of Nasti ports by setting their valids and readies
 * to false. Chisel requires every input to be driven, so the bits are also
 * tied to zero. */
def disconnectSlave(slave: Vec[NastiIO]) = {
slave.foreach{ m =>
m.aw.valid := Bool(false)
m.aw.bits := m.aw.bits.fromBits( UInt(0) )
m.w.valid := Bool(false)
m.w.bits := m.w.bits.fromBits( UInt(0) )
m.b.ready := Bool(false)
m.ar.valid := Bool(false)
m.ar.bits := m.ar.bits.fromBits( UInt(0) )
m.r.ready := Bool(false)
}
}
def disconnectMaster(master: Vec[NastiIO]) = {
master.foreach{ m =>
m.aw.ready := Bool(false)
m.w.ready := Bool(false)
m.b.valid := Bool(false)
m.b.bits := m.b.bits.fromBits( UInt(0) )
m.ar.ready := Bool(false)
m.r.valid := Bool(false)
m.r.bits := m.r.bits.fromBits( UInt(0) )
}
}
/* Provides default wires on all our outputs. */
disconnectMaster(io.masters)
disconnectSlave(io.slaves)
/* Constructs interconnects for each of the layouts suggested by the
* configuration and switches between them based on the select input. */
configs.zipWithIndex.foreach{ case (nChannels, select) =>
val nBanksPerChannel = nBanks / nChannels
val ic = Module(new NastiMemoryInterconnect(nBanksPerChannel, nChannels))
disconnectMaster(ic.io.slaves)
disconnectSlave(ic.io.masters)
muxOnSelect( io.masters, ic.io.masters, io.select === UInt(select))
muxOnSelect(ic.io.slaves, io.slaves, io.select === UInt(select))
}
}
class NastiMemoryDemux(nRoutes: Int)(implicit p: Parameters) extends NastiModule()(p) {
val io = new Bundle {
val master = (new NastiIO).flip
val slaves = Vec(nRoutes, new NastiIO)
val select = UInt(INPUT, log2Up(nRoutes))
}
def connectReqChannel[T <: Data](idx: Int, out: DecoupledIO[T], in: DecoupledIO[T]) {
out.valid := in.valid && io.select === UInt(idx)
out.bits := in.bits
when (io.select === UInt(idx)) { in.ready := out.ready }
}
def connectRespChannel[T <: Data](idx: Int, out: DecoupledIO[T], in: DecoupledIO[T]) {
when (io.select === UInt(idx)) { out.valid := in.valid }
when (io.select === UInt(idx)) { out.bits := in.bits }
in.ready := out.ready && io.select === UInt(idx)
}
io.master.ar.ready := Bool(false)
io.master.aw.ready := Bool(false)
io.master.w.ready := Bool(false)
io.master.r.valid := Bool(false)
io.master.r.bits := NastiReadDataChannel(id = UInt(0), data = UInt(0))
io.master.b.valid := Bool(false)
io.master.b.bits := NastiWriteResponseChannel(id = UInt(0))
io.slaves.zipWithIndex.foreach { case (slave, i) =>
connectReqChannel(i, slave.ar, io.master.ar)
connectReqChannel(i, slave.aw, io.master.aw)
connectReqChannel(i, slave.w, io.master.w)
connectRespChannel(i, io.master.r, slave.r)
connectRespChannel(i, io.master.b, slave.b)
}
}
object AsyncNastiTo {
// source(master) is in our clock domain, output is in the 'to' clock domain
def apply[T <: Data](to_clock: Clock, to_reset: Bool, source: NastiIO, depth: Int = 3, sync: Int = 2)(implicit p: Parameters): NastiIO = {
val sink = Wire(new NastiIO)
sink.aw <> AsyncDecoupledTo(to_clock, to_reset, source.aw, depth, sync)
sink.ar <> AsyncDecoupledTo(to_clock, to_reset, source.ar, depth, sync)
sink.w <> AsyncDecoupledTo(to_clock, to_reset, source.w, depth, sync)
source.b <> AsyncDecoupledFrom(to_clock, to_reset, sink.b, depth, sync)
source.r <> AsyncDecoupledFrom(to_clock, to_reset, sink.r, depth, sync)
sink
}
}
object AsyncNastiFrom {
// source(master) is in the 'from' clock domain, output is in our clock domain
def apply[T <: Data](from_clock: Clock, from_reset: Bool, source: NastiIO, depth: Int = 3, sync: Int = 2)(implicit p: Parameters): NastiIO = {
val sink = Wire(new NastiIO)
sink.aw <> AsyncDecoupledFrom(from_clock, from_reset, source.aw, depth, sync)
sink.ar <> AsyncDecoupledFrom(from_clock, from_reset, source.ar, depth, sync)
sink.w <> AsyncDecoupledFrom(from_clock, from_reset, source.w, depth, sync)
source.b <> AsyncDecoupledTo(from_clock, from_reset, sink.b, depth, sync)
source.r <> AsyncDecoupledTo(from_clock, from_reset, sink.r, depth, sync)
sink
}
}
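// Illustrative only: sketches of crossing a NastiIO between clock domains.
// ioClock, ioReset, coreMaster, and ioSideMaster are hypothetical signals
// from the surrounding design, not part of this library.
//
//   // Move a master in our clock domain out to the io clock domain:
//   val ioSideNasti = AsyncNastiTo(ioClock, ioReset, coreMaster)
//   // Bring a master from the io clock domain into ours:
//   val coreSideNasti = AsyncNastiFrom(ioClock, ioReset, ioSideMaster)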

View File

@ -0,0 +1 @@
package object junctions

View File

@ -0,0 +1,82 @@
package junctions
import Chisel._
import cde.{Parameters, Field}
class PociIO(implicit p: Parameters) extends HastiBundle()(p)
{
val paddr = UInt(OUTPUT, hastiAddrBits)
val pwrite = Bool(OUTPUT)
val psel = Bool(OUTPUT)
val penable = Bool(OUTPUT)
val pwdata = UInt(OUTPUT, hastiDataBits)
val prdata = UInt(INPUT, hastiDataBits)
val pready = Bool(INPUT)
val pslverr = Bool(INPUT)
}
class HastiToPociBridge(implicit p: Parameters) extends HastiModule()(p) {
val io = new Bundle {
val in = new HastiSlaveIO
val out = new PociIO
}
val s_idle :: s_setup :: s_access :: Nil = Enum(UInt(), 3)
val state = Reg(init = s_idle)
val transfer = io.in.hsel & io.in.htrans(1)
switch (state) {
is (s_idle) {
when (transfer) { state := s_setup }
}
is (s_setup) {
state := s_access
}
is (s_access) {
when (io.out.pready & ~transfer) { state := s_idle }
when (io.out.pready & transfer) { state := s_setup }
when (~io.out.pready) { state := s_access }
}
}
val haddr_reg = Reg(UInt(width = hastiAddrBits))
val hwrite_reg = Reg(UInt(width = 1))
when (transfer) {
haddr_reg := io.in.haddr
hwrite_reg := io.in.hwrite
}
io.out.paddr := haddr_reg
io.out.pwrite := hwrite_reg(0)
io.out.psel := (state =/= s_idle)
io.out.penable := (state === s_access)
io.out.pwdata := io.in.hwdata
io.in.hrdata := io.out.prdata
io.in.hready := ((state === s_access) & io.out.pready) | (state === s_idle)
io.in.hresp := io.out.pslverr
}
class PociBus(amap: Seq[UInt=>Bool])(implicit p: Parameters) extends HastiModule()(p)
{
val io = new Bundle {
val master = new PociIO().flip
val slaves = Vec(amap.size, new PociIO)
}
val psels = PriorityEncoderOH(
(io.slaves zip amap) map { case (s, afn) => {
s.paddr := io.master.paddr
s.pwrite := io.master.pwrite
s.pwdata := io.master.pwdata
afn(io.master.paddr) && io.master.psel
}})
(io.slaves zip psels) foreach { case (s, psel) => {
s.psel := psel
s.penable := io.master.penable && psel
} }
io.master.prdata := Mux1H(psels, io.slaves.map(_.prdata))
io.master.pready := Mux1H(psels, io.slaves.map(_.pready))
io.master.pslverr := Mux1H(psels, io.slaves.map(_.pslverr))
}

View File

@ -0,0 +1,70 @@
// See LICENSE for license details.
package junctions
import Chisel._
class SlowIO[T <: Data](val divisor_max: Int)(data: => T) extends Module
{
val io = new Bundle {
val out_fast = Decoupled(data).flip
val out_slow = Decoupled(data)
val in_fast = Decoupled(data)
val in_slow = Decoupled(data).flip
val clk_slow = Bool(OUTPUT)
val set_divisor = Valid(Bits(width = 32)).flip
val divisor = Bits(OUTPUT, 32)
}
require(divisor_max >= 8 && divisor_max <= 65536 && isPow2(divisor_max))
val divisor = Reg(init=UInt(divisor_max-1))
val d_shadow = Reg(init=UInt(divisor_max-1))
val hold = Reg(init=UInt(divisor_max/4-1))
val h_shadow = Reg(init=UInt(divisor_max/4-1))
when (io.set_divisor.valid) {
d_shadow := io.set_divisor.bits(log2Up(divisor_max)-1, 0)
h_shadow := io.set_divisor.bits(log2Up(divisor_max)-1+16, 16)
}
io.divisor := (hold << 16) | divisor
val count = Reg{UInt(width = log2Up(divisor_max))}
val myclock = Reg{Bool()}
count := count + UInt(1)
val rising = count === (divisor >> 1)
val falling = count === divisor
val held = count === (divisor >> 1) + hold
when (falling) {
divisor := d_shadow
hold := h_shadow
count := UInt(0)
myclock := Bool(false)
}
when (rising) {
myclock := Bool(true)
}
val in_slow_rdy = Reg(init=Bool(false))
val out_slow_val = Reg(init=Bool(false))
val out_slow_bits = Reg(data)
val fromhost_q = Module(new Queue(data,1))
fromhost_q.io.enq.valid := rising && (io.in_slow.valid && in_slow_rdy || this.reset)
fromhost_q.io.enq.bits := io.in_slow.bits
io.in_fast <> fromhost_q.io.deq
val tohost_q = Module(new Queue(data,1))
tohost_q.io.enq <> io.out_fast
tohost_q.io.deq.ready := rising && io.out_slow.ready && out_slow_val
when (held) {
in_slow_rdy := fromhost_q.io.enq.ready
out_slow_val := tohost_q.io.deq.valid
out_slow_bits := Mux(this.reset, fromhost_q.io.deq.bits, tohost_q.io.deq.bits)
}
io.in_slow.ready := in_slow_rdy
io.out_slow.valid := out_slow_val
io.out_slow.bits := out_slow_bits
io.clk_slow := myclock
}
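// Worked example (illustrative): set_divisor packs the hold count in bits
// 31:16 and the divisor in bits 15:0. With divisor_max = 32, writing
// 0x0003000f requests divisor = 15 (a slow-clock period of 16 fast cycles)
// and hold = 3; both values are loaded from their shadow registers at the
// next falling edge of the slow clock.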

View File

@ -0,0 +1,281 @@
package junctions
import Chisel._
import cde.Parameters
class SmiReq(val dataWidth: Int, val addrWidth: Int) extends Bundle {
val rw = Bool()
val addr = UInt(width = addrWidth)
val data = Bits(width = dataWidth)
override def cloneType =
new SmiReq(dataWidth, addrWidth).asInstanceOf[this.type]
}
/** Simple Memory Interface IO. Used to communicate with PCR and SCR
* @param dataWidth the width in bits of the data field
* @param addrWidth the width in bits of the addr field */
class SmiIO(val dataWidth: Int, val addrWidth: Int) extends Bundle {
val req = Decoupled(new SmiReq(dataWidth, addrWidth))
val resp = Decoupled(Bits(width = dataWidth)).flip
override def cloneType =
new SmiIO(dataWidth, addrWidth).asInstanceOf[this.type]
}
abstract class SmiPeripheral extends Module {
val dataWidth: Int
val addrWidth: Int
lazy val io = new SmiIO(dataWidth, addrWidth).flip
}
/** A simple sequential memory accessed through Smi */
class SmiMem(val dataWidth: Int, val memDepth: Int) extends SmiPeripheral {
// overrides the abstract addrWidth from SmiPeripheral
val addrWidth = log2Up(memDepth)
val mem = SeqMem(memDepth, Bits(width = dataWidth))
val ren = io.req.fire() && !io.req.bits.rw
val wen = io.req.fire() && io.req.bits.rw
when (wen) { mem.write(io.req.bits.addr, io.req.bits.data) }
val resp_valid = Reg(init = Bool(false))
when (io.resp.fire()) { resp_valid := Bool(false) }
when (io.req.fire()) { resp_valid := Bool(true) }
io.resp.valid := resp_valid
io.resp.bits := mem.read(io.req.bits.addr, ren)
io.req.ready := !resp_valid
}
/** Arbitrate among several Smi clients
* @param n the number of clients
* @param dataWidth Smi data width
* @param addrWidth Smi address width */
class SmiArbiter(val n: Int, val dataWidth: Int, val addrWidth: Int)
extends Module {
val io = new Bundle {
val in = Vec(n, new SmiIO(dataWidth, addrWidth)).flip
val out = new SmiIO(dataWidth, addrWidth)
}
val wait_resp = Reg(init = Bool(false))
val choice = Reg(UInt(width = log2Up(n)))
val req_arb = Module(new RRArbiter(new SmiReq(dataWidth, addrWidth), n))
req_arb.io.in <> io.in.map(_.req)
req_arb.io.out.ready := io.out.req.ready && !wait_resp
io.out.req.bits := req_arb.io.out.bits
io.out.req.valid := req_arb.io.out.valid && !wait_resp
when (io.out.req.fire()) {
choice := req_arb.io.chosen
wait_resp := Bool(true)
}
when (io.out.resp.fire()) { wait_resp := Bool(false) }
for ((resp, i) <- io.in.map(_.resp).zipWithIndex) {
resp.bits := io.out.resp.bits
resp.valid := io.out.resp.valid && choice === UInt(i)
}
io.out.resp.ready := io.in(choice).resp.ready
}
class SmiIONastiReadIOConverter(val dataWidth: Int, val addrWidth: Int)
(implicit p: Parameters) extends NastiModule()(p) {
val io = new Bundle {
val nasti = new NastiReadIO().flip
val smi = new SmiIO(dataWidth, addrWidth)
}
private val maxWordsPerBeat = nastiXDataBits / dataWidth
private val wordCountBits = log2Up(maxWordsPerBeat)
private val byteOffBits = log2Up(dataWidth / 8)
private val addrOffBits = addrWidth + byteOffBits
private def calcWordCount(size: UInt): UInt =
(UInt(1) << (size - UInt(byteOffBits))) - UInt(1)
val (s_idle :: s_read :: s_resp :: Nil) = Enum(Bits(), 3)
val state = Reg(init = s_idle)
val nWords = Reg(UInt(width = wordCountBits))
val nBeats = Reg(UInt(width = nastiXLenBits))
val addr = Reg(UInt(width = addrWidth))
val id = Reg(UInt(width = nastiRIdBits))
val byteOff = Reg(UInt(width = byteOffBits))
val recvInd = Reg(init = UInt(0, wordCountBits))
val sendDone = Reg(init = Bool(false))
val buffer = Reg(init = Vec.fill(maxWordsPerBeat) { Bits(0, dataWidth) })
io.nasti.ar.ready := (state === s_idle)
io.smi.req.valid := (state === s_read) && !sendDone
io.smi.req.bits.rw := Bool(false)
io.smi.req.bits.addr := addr
io.smi.resp.ready := (state === s_read)
io.nasti.r.valid := (state === s_resp)
io.nasti.r.bits := NastiReadDataChannel(
id = id,
data = buffer.asUInt,
last = (nBeats === UInt(0)))
when (io.nasti.ar.fire()) {
when (io.nasti.ar.bits.size < UInt(byteOffBits)) {
nWords := UInt(0)
} .otherwise {
nWords := calcWordCount(io.nasti.ar.bits.size)
}
nBeats := io.nasti.ar.bits.len
addr := io.nasti.ar.bits.addr(addrOffBits - 1, byteOffBits)
if (maxWordsPerBeat > 1)
recvInd := io.nasti.ar.bits.addr(wordCountBits + byteOffBits - 1, byteOffBits)
else
recvInd := UInt(0)
id := io.nasti.ar.bits.id
state := s_read
}
when (io.smi.req.fire()) {
addr := addr + UInt(1)
sendDone := (nWords === UInt(0))
}
when (io.smi.resp.fire()) {
recvInd := recvInd + UInt(1)
nWords := nWords - UInt(1)
buffer(recvInd) := io.smi.resp.bits
when (nWords === UInt(0)) { state := s_resp }
}
when (io.nasti.r.fire()) {
recvInd := UInt(0)
sendDone := Bool(false)
// clear all the registers in the buffer
buffer.foreach(_ := Bits(0))
nBeats := nBeats - UInt(1)
state := Mux(io.nasti.r.bits.last, s_idle, s_read)
}
}
class SmiIONastiWriteIOConverter(val dataWidth: Int, val addrWidth: Int)
(implicit p: Parameters) extends NastiModule()(p) {
val io = new Bundle {
val nasti = new NastiWriteIO().flip
val smi = new SmiIO(dataWidth, addrWidth)
}
private val dataBytes = dataWidth / 8
private val maxWordsPerBeat = nastiXDataBits / dataWidth
private val byteOffBits = log2Floor(dataBytes)
private val addrOffBits = addrWidth + byteOffBits
private val nastiByteOffBits = log2Ceil(nastiXDataBits / 8)
assert(!io.nasti.aw.valid || io.nasti.aw.bits.size >= UInt(byteOffBits),
"Nasti size must be >= Smi size")
val id = Reg(UInt(width = nastiWIdBits))
val addr = Reg(UInt(width = addrWidth))
val offset = Reg(UInt(width = nastiByteOffBits))
def makeStrobe(offset: UInt, size: UInt, strb: UInt) = {
val sizemask = (UInt(1) << (UInt(1) << size)) - UInt(1)
val bytemask = strb & (sizemask << offset)
Vec.tabulate(maxWordsPerBeat){i => bytemask(dataBytes * i)}.asUInt
}
val size = Reg(UInt(width = nastiXSizeBits))
val strb = Reg(UInt(width = maxWordsPerBeat))
val data = Reg(UInt(width = nastiXDataBits))
val last = Reg(Bool())
val s_idle :: s_data :: s_send :: s_ack :: s_resp :: Nil = Enum(Bits(), 5)
val state = Reg(init = s_idle)
io.nasti.aw.ready := (state === s_idle)
io.nasti.w.ready := (state === s_data)
io.smi.req.valid := (state === s_send) && strb(0)
io.smi.req.bits.rw := Bool(true)
io.smi.req.bits.addr := addr
io.smi.req.bits.data := data(dataWidth - 1, 0)
io.smi.resp.ready := (state === s_ack)
io.nasti.b.valid := (state === s_resp)
io.nasti.b.bits := NastiWriteResponseChannel(id)
val jump = if (maxWordsPerBeat > 1)
PriorityMux(strb(maxWordsPerBeat - 1, 1),
(1 until maxWordsPerBeat).map(UInt(_)))
else UInt(1)
when (io.nasti.aw.fire()) {
if (dataWidth == nastiXDataBits) {
addr := io.nasti.aw.bits.addr(addrOffBits - 1, byteOffBits)
} else {
addr := Cat(io.nasti.aw.bits.addr(addrOffBits - 1, nastiByteOffBits),
UInt(0, nastiByteOffBits - byteOffBits))
}
offset := io.nasti.aw.bits.addr(nastiByteOffBits - 1, 0)
id := io.nasti.aw.bits.id
size := io.nasti.aw.bits.size
last := Bool(false)
state := s_data
}
when (io.nasti.w.fire()) {
last := io.nasti.w.bits.last
strb := makeStrobe(offset, size, io.nasti.w.bits.strb)
data := io.nasti.w.bits.data
state := s_send
}
when (state === s_send) {
when (io.smi.req.ready || !strb(0)) {
strb := strb >> jump
data := data >> Cat(jump, UInt(0, log2Up(dataWidth)))
addr := addr + jump
when (strb(0)) { state := s_ack }
}
}
when (io.smi.resp.fire()) {
state := Mux(strb === UInt(0),
Mux(last, s_resp, s_data), s_send)
}
when (io.nasti.b.fire()) { state := s_idle }
}
/** Convert Nasti protocol to Smi protocol */
class SmiIONastiIOConverter(val dataWidth: Int, val addrWidth: Int)
(implicit p: Parameters) extends NastiModule()(p) {
val io = new Bundle {
val nasti = (new NastiIO).flip
val smi = new SmiIO(dataWidth, addrWidth)
}
require(isPow2(dataWidth), "SMI data width must be power of 2")
require(dataWidth <= nastiXDataBits,
"SMI data width must be less than or equal to NASTI data width")
val reader = Module(new SmiIONastiReadIOConverter(dataWidth, addrWidth))
reader.io.nasti <> io.nasti
val writer = Module(new SmiIONastiWriteIOConverter(dataWidth, addrWidth))
writer.io.nasti <> io.nasti
val arb = Module(new SmiArbiter(2, dataWidth, addrWidth))
arb.io.in(0) <> reader.io.smi
arb.io.in(1) <> writer.io.smi
io.smi <> arb.io.out
}
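// Illustrative only: a minimal sketch of putting a 64-entry SmiMem behind a
// NASTI port via the converter, assuming a 64-bit NASTI data width; the
// module and port names are hypothetical.
class SmiMemExample(implicit p: Parameters) extends NastiModule {
  val io = new Bundle { val nasti = (new NastiIO).flip }
  val mem = Module(new SmiMem(dataWidth = 64, memDepth = 64))
  val conv = Module(new SmiIONastiIOConverter(64, log2Up(64)))
  conv.io.nasti <> io.nasti
  mem.io <> conv.io.smi
}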

View File

@ -0,0 +1,187 @@
package junctions
import Chisel._
import NastiConstants._
import cde.Parameters
class StreamChannel(w: Int) extends Bundle {
val data = UInt(width = w)
val last = Bool()
override def cloneType = new StreamChannel(w).asInstanceOf[this.type]
}
class StreamIO(w: Int) extends Bundle {
val out = Decoupled(new StreamChannel(w))
val in = Decoupled(new StreamChannel(w)).flip
override def cloneType = new StreamIO(w).asInstanceOf[this.type]
}
class NastiIOStreamIOConverter(w: Int)(implicit p: Parameters) extends Module {
val io = new Bundle {
val nasti = (new NastiIO).flip
val stream = new StreamIO(w)
}
val streamSize = UInt(log2Up(w / 8))
assert(!io.nasti.ar.valid || io.nasti.ar.bits.size === streamSize,
"read channel wrong size on stream")
assert(!io.nasti.ar.valid || io.nasti.ar.bits.len === UInt(0) ||
io.nasti.ar.bits.burst === BURST_FIXED,
"read channel wrong burst type on stream")
assert(!io.nasti.aw.valid || io.nasti.aw.bits.size === streamSize,
"write channel wrong size on stream")
assert(!io.nasti.aw.valid || io.nasti.aw.bits.len === UInt(0) ||
io.nasti.aw.bits.burst === BURST_FIXED,
"write channel wrong burst type on stream")
assert(!io.nasti.w.valid || io.nasti.w.bits.strb.andR,
"write channel cannot take partial writes")
val read_id = Reg(io.nasti.ar.bits.id)
val read_cnt = Reg(io.nasti.ar.bits.len)
val reading = Reg(init = Bool(false))
io.nasti.ar.ready := !reading
io.nasti.r.valid := reading && io.stream.in.valid
io.nasti.r.bits := io.stream.in.bits
io.nasti.r.bits.resp := UInt(0)
io.nasti.r.bits.id := read_id
io.stream.in.ready := reading && io.nasti.r.ready
when (io.nasti.ar.fire()) {
read_id := io.nasti.ar.bits.id
read_cnt := io.nasti.ar.bits.len
reading := Bool(true)
}
when (io.nasti.r.fire()) {
when (read_cnt === UInt(0)) {
reading := Bool(false)
} .otherwise {
read_cnt := read_cnt - UInt(1)
}
}
val write_id = Reg(io.nasti.aw.bits.id)
val writing = Reg(init = Bool(false))
val write_resp = Reg(init = Bool(false))
io.nasti.aw.ready := !writing && !write_resp
io.nasti.w.ready := writing && io.stream.out.ready
io.stream.out.valid := writing && io.nasti.w.valid
io.stream.out.bits := io.nasti.w.bits
io.nasti.b.valid := write_resp
io.nasti.b.bits.resp := UInt(0)
io.nasti.b.bits.id := write_id
when (io.nasti.aw.fire()) {
write_id := io.nasti.aw.bits.id
writing := Bool(true)
}
when (io.nasti.w.fire() && io.nasti.w.bits.last) {
writing := Bool(false)
write_resp := Bool(true)
}
when (io.nasti.b.fire()) { write_resp := Bool(false) }
}
class StreamNarrower(win: Int, wout: Int) extends Module {
require(win > wout, "Stream narrower input width must be larger than output width")
require(win % wout == 0, "Stream narrower input width must be a multiple of output width")
val io = new Bundle {
val in = Decoupled(new StreamChannel(win)).flip
val out = Decoupled(new StreamChannel(wout))
}
val n_pieces = win / wout
val buffer = Reg(Bits(width = win))
val (piece_idx, pkt_done) = Counter(io.out.fire(), n_pieces)
val pieces = Vec.tabulate(n_pieces) { i => buffer(wout * (i + 1) - 1, wout * i) }
val last_piece = (piece_idx === UInt(n_pieces - 1))
val sending = Reg(init = Bool(false))
val in_last = Reg(Bool())
when (io.in.fire()) {
buffer := io.in.bits.data
in_last := io.in.bits.last
sending := Bool(true)
}
when (pkt_done) { sending := Bool(false) }
io.out.valid := sending
io.out.bits.data := pieces(piece_idx)
io.out.bits.last := in_last && last_piece
io.in.ready := !sending
}
class StreamExpander(win: Int, wout: Int) extends Module {
require(win < wout, "Stream expander input width must be smaller than output width")
require(wout % win == 0, "Stream expander output width must be a multiple of input width")
val io = new Bundle {
val in = Decoupled(new StreamChannel(win)).flip
val out = Decoupled(new StreamChannel(wout))
}
val n_pieces = wout / win
val buffer = Reg(Vec(n_pieces, UInt(width = win)))
val last = Reg(Bool())
val collecting = Reg(init = Bool(true))
val (piece_idx, pkt_done) = Counter(io.in.fire(), n_pieces)
when (io.in.fire()) { buffer(piece_idx) := io.in.bits.data }
when (pkt_done) { last := io.in.bits.last; collecting := Bool(false) }
when (io.out.fire()) { collecting := Bool(true) }
io.in.ready := collecting
io.out.valid := !collecting
io.out.bits.data := buffer.asUInt
io.out.bits.last := last
}
object StreamUtils {
def connectStreams(a: StreamIO, b: StreamIO) {
a.in <> b.out
b.in <> a.out
}
}
trait Serializable {
def nbits: Int
}
class Serializer[T <: Data with Serializable](w: Int, typ: T) extends Module {
val io = new Bundle {
val in = Decoupled(typ).flip
val out = Decoupled(Bits(width = w))
}
val narrower = Module(new StreamNarrower(typ.nbits, w))
narrower.io.in.bits.data := io.in.bits.asUInt
narrower.io.in.bits.last := Bool(true)
narrower.io.in.valid := io.in.valid
io.in.ready := narrower.io.in.ready
io.out.valid := narrower.io.out.valid
io.out.bits := narrower.io.out.bits.data
narrower.io.out.ready := io.out.ready
}
class Deserializer[T <: Data with Serializable](w: Int, typ: T) extends Module {
val io = new Bundle {
val in = Decoupled(Bits(width = w)).flip
val out = Decoupled(typ)
}
val expander = Module(new StreamExpander(w, typ.nbits))
expander.io.in.valid := io.in.valid
expander.io.in.bits.data := io.in.bits
expander.io.in.bits.last := Bool(true)
io.in.ready := expander.io.in.ready
io.out.valid := expander.io.out.valid
io.out.bits := typ.cloneType.fromBits(expander.io.out.bits.data)
expander.io.out.ready := io.out.ready
}
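// Illustrative only: a minimal sketch of a user-defined bundle that can pass
// through Serializer/Deserializer; the field names and widths are
// hypothetical. nbits must report the packed width of the bundle.
class ExamplePacket extends Bundle with Serializable {
  val addr = UInt(width = 16)
  val data = UInt(width = 32)
  def nbits = 48
  override def cloneType = (new ExamplePacket).asInstanceOf[this.type]
}
// A 48-bit packet streamed over an 8-bit link:
//   val ser = Module(new Serializer(8, new ExamplePacket))
//   val des = Module(new Deserializer(8, new ExamplePacket))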

View File

@ -0,0 +1,163 @@
package junctions.unittests
import Chisel._
import junctions._
import junctions.NastiConstants._
import cde.Parameters
class NastiDriver(dataWidth: Int, burstLen: Int, nBursts: Int)
(implicit p: Parameters) extends NastiModule {
val io = new Bundle {
val nasti = new NastiIO
val finished = Bool(OUTPUT)
val start = Bool(INPUT)
}
val dataBytes = dataWidth / 8
val nastiDataBytes = nastiXDataBits / 8
val (write_cnt, write_done) = Counter(io.nasti.w.fire(), burstLen)
val (read_cnt, read_done) = Counter(io.nasti.r.fire(), burstLen)
val (req_cnt, reqs_done) = Counter(read_done, nBursts)
val req_addr = Cat(req_cnt, UInt(0, log2Up(burstLen * dataBytes)))
val write_data = UInt(0x10000000L, dataWidth) | Cat(req_cnt, write_cnt)
val expected_data = UInt(0x10000000L, dataWidth) | Cat(req_cnt, read_cnt)
val (s_idle :: s_write_addr :: s_write_data :: s_write_stall :: s_write_resp ::
s_read_addr :: s_read_data :: s_read_stall :: s_done :: Nil) = Enum(Bits(), 9)
val state = Reg(init = s_idle)
val (stall_cnt, stall_done) = Counter(state === s_read_stall, 2)
io.nasti.aw.valid := (state === s_write_addr)
io.nasti.aw.bits := NastiWriteAddressChannel(
id = UInt(0),
addr = req_addr,
size = UInt(log2Up(dataBytes)),
len = UInt(burstLen - 1))
io.nasti.w.valid := (state === s_write_data)
io.nasti.w.bits := NastiWriteDataChannel(
data = Cat(write_data, write_data),
last = (write_cnt === UInt(burstLen - 1)))
io.nasti.b.ready := (state === s_write_resp)
io.nasti.ar.valid := (state === s_read_addr)
io.nasti.ar.bits := NastiReadAddressChannel(
id = UInt(0),
addr = req_addr,
size = UInt(log2Up(dataBytes)),
len = UInt(burstLen - 1))
io.nasti.r.ready := (state === s_read_data)
io.finished := (state === s_done)
when (state === s_idle && io.start) { state := s_write_addr }
when (io.nasti.aw.fire()) { state := s_write_data }
when (io.nasti.w.fire()) { state := s_write_stall }
when (state === s_write_stall) { state := s_write_data }
when (write_done) { state := s_write_resp }
when (io.nasti.b.fire()) { state := s_read_addr }
when (io.nasti.ar.fire()) { state := s_read_data }
when (io.nasti.r.fire()) { state := s_read_stall }
when (stall_done) { state := s_read_data }
when (read_done) { state := s_write_addr }
when (reqs_done) { state := s_done }
val full_addr = req_addr + (read_cnt << UInt(log2Up(dataBytes)))
val byteshift = full_addr(log2Up(nastiDataBytes) - 1, 0)
val bitshift = Cat(byteshift, UInt(0, 3))
val read_data = (io.nasti.r.bits.data >> bitshift) & Fill(dataWidth, UInt(1, 1))
assert(!io.nasti.r.valid || read_data === expected_data,
s"NastiDriver got wrong data")
}
class AtosConverterTestBackend(implicit p: Parameters) extends NastiModule()(p) {
val io = new Bundle {
val nasti = (new NastiIO).flip
val finished = Bool(OUTPUT)
}
val (s_waddr :: s_wdata :: s_wresp ::
s_raddr :: s_rresp :: s_done :: Nil) = Enum(Bits(), 6)
val state = Reg(init = s_waddr)
val n_words = 4
val test_data = Reg(Vec(n_words, UInt(width = nastiXDataBits)))
val req_id = Reg(UInt(width = nastiXIdBits))
val (w_count, w_last) = Counter(io.nasti.w.fire(), n_words)
val (r_count, r_last) = Counter(io.nasti.r.fire(), n_words)
when (io.nasti.aw.fire()) {
req_id := io.nasti.aw.bits.id
state := s_wdata
}
when (io.nasti.w.fire()) {
test_data(w_count) := io.nasti.w.bits.data
when (io.nasti.w.bits.last) { state := s_wresp }
}
when (io.nasti.b.fire()) { state := s_raddr }
when (io.nasti.ar.fire()) {
req_id := io.nasti.ar.bits.id
state := s_rresp
}
when (io.nasti.r.fire() && io.nasti.r.bits.last) { state := s_done }
io.nasti.aw.ready := (state === s_waddr)
io.nasti.w.ready := (state === s_wdata)
io.nasti.ar.ready := (state === s_raddr)
io.nasti.b.valid := (state === s_wresp)
io.nasti.b.bits := NastiWriteResponseChannel(id = req_id)
io.nasti.r.valid := (state === s_rresp)
io.nasti.r.bits := NastiReadDataChannel(
id = req_id,
data = test_data(r_count),
last = r_last)
io.finished := (state === s_done)
}
class AtosConverterTest(implicit val p: Parameters) extends UnitTest
with HasNastiParameters {
val frontend = Module(new NastiDriver(nastiXDataBits, 4, 1))
val backend = Module(new AtosConverterTestBackend)
val serdes = Module(new AtosSerdes(8))
val desser = Module(new AtosDesser(8))
val client_conv = Module(new AtosClientConverter)
val manager_conv = Module(new AtosManagerConverter)
client_conv.io.nasti <> frontend.io.nasti
serdes.io.wide <> client_conv.io.atos
desser.io.narrow <> serdes.io.narrow
manager_conv.io.atos <> desser.io.wide
backend.io.nasti <> manager_conv.io.nasti
frontend.io.start := io.start
io.finished := frontend.io.finished && backend.io.finished
}
class HastiTest(implicit p: Parameters) extends UnitTest {
val sram = Module(new HastiTestSRAM(8))
val bus = Module(new HastiBus(Seq(a => Bool(true))))
val conv = Module(new HastiMasterIONastiIOConverter)
val driver = Module(new NastiDriver(32, 8, 2))
bus.io.slaves(0) <> sram.io
bus.io.master <> conv.io.hasti
conv.io.nasti <> driver.io.nasti
io.finished := driver.io.finished
driver.io.start := io.start
}

View File

@ -0,0 +1,85 @@
package junctions.unittests
import Chisel._
import junctions._
import junctions.NastiConstants._
class MultiWidthFifoTest extends UnitTest {
val big2little = Module(new MultiWidthFifo(16, 8, 8))
val little2big = Module(new MultiWidthFifo(8, 16, 4))
val bl_send = Reg(init = Bool(false))
val lb_send = Reg(init = Bool(false))
val bl_recv = Reg(init = Bool(false))
val lb_recv = Reg(init = Bool(false))
val bl_finished = Reg(init = Bool(false))
val lb_finished = Reg(init = Bool(false))
val bl_data = Vec.tabulate(4){i => UInt((2 * i + 1) * 256 + 2 * i, 16)}
val lb_data = Vec.tabulate(8){i => UInt(i, 8)}
val (bl_send_cnt, bl_send_done) = Counter(big2little.io.in.fire(), 4)
val (lb_send_cnt, lb_send_done) = Counter(little2big.io.in.fire(), 8)
val (bl_recv_cnt, bl_recv_done) = Counter(big2little.io.out.fire(), 8)
val (lb_recv_cnt, lb_recv_done) = Counter(little2big.io.out.fire(), 4)
big2little.io.in.valid := bl_send
big2little.io.in.bits := bl_data(bl_send_cnt)
big2little.io.out.ready := bl_recv
little2big.io.in.valid := lb_send
little2big.io.in.bits := lb_data(lb_send_cnt)
little2big.io.out.ready := lb_recv
val bl_recv_data_idx = bl_recv_cnt >> UInt(1)
val bl_recv_data = Mux(bl_recv_cnt(0),
bl_data(bl_recv_data_idx)(15, 8),
bl_data(bl_recv_data_idx)(7, 0))
val lb_recv_data = Cat(
lb_data(Cat(lb_recv_cnt, UInt(1, 1))),
lb_data(Cat(lb_recv_cnt, UInt(0, 1))))
when (io.start) {
bl_send := Bool(true)
lb_send := Bool(true)
}
when (bl_send_done) {
bl_send := Bool(false)
bl_recv := Bool(true)
}
when (lb_send_done) {
lb_send := Bool(false)
lb_recv := Bool(true)
}
when (bl_recv_done) {
bl_recv := Bool(false)
bl_finished := Bool(true)
}
when (lb_recv_done) {
lb_recv := Bool(false)
lb_finished := Bool(true)
}
io.finished := bl_finished && lb_finished
val bl_start_recv = Reg(next = bl_send_done)
val lb_start_recv = Reg(next = lb_send_done)
assert(!little2big.io.out.valid || little2big.io.out.bits === lb_recv_data,
"Little to Big data mismatch")
assert(!big2little.io.out.valid || big2little.io.out.bits === bl_recv_data,
"Bit to Little data mismatch")
assert(!lb_start_recv || little2big.io.count === UInt(4),
"Little to Big count incorrect")
assert(!bl_start_recv || big2little.io.count === UInt(8),
"Big to Little count incorrect")
}

View File

@ -0,0 +1,111 @@
package junctions.unittests
import Chisel._
import junctions._
import junctions.NastiConstants._
import cde.Parameters
class NastiDemuxDriver(n: Int)(implicit p: Parameters) extends Module {
val io = new Bundle {
val start = Bool(INPUT)
val finished = Bool(OUTPUT)
val nasti = new NastiIO
val select = UInt(OUTPUT, log2Up(n))
}
val (s_idle :: s_write_addr :: s_write_data :: s_write_resp ::
s_read_addr :: s_read_resp :: s_done :: Nil) = Enum(Bits(), 7)
val state = Reg(init = s_idle)
val select = Reg(init = UInt(0, log2Up(n)))
when (state === s_idle && io.start) { state := s_write_addr }
when (io.nasti.aw.fire()) { state := s_write_data }
when (io.nasti.w.fire()) { state := s_write_resp }
when (io.nasti.b.fire()) { state := s_read_addr }
when (io.nasti.ar.fire()) { state := s_read_resp }
when (io.nasti.r.fire()) {
when (select === UInt(n - 1)) {
state := s_done
} .otherwise {
select := select + UInt(1)
state := s_write_addr
}
}
io.nasti.aw.valid := (state === s_write_addr)
io.nasti.aw.bits := NastiWriteAddressChannel(
id = UInt(0),
addr = UInt(0),
size = UInt("b011"))
io.nasti.w.valid := (state === s_write_data)
io.nasti.w.bits := NastiWriteDataChannel(data = select)
io.nasti.b.ready := (state === s_write_resp)
io.nasti.ar.valid := (state === s_read_addr)
io.nasti.ar.bits := NastiReadAddressChannel(
id = UInt(0),
addr = UInt(0),
size = UInt("b011"))
io.nasti.r.ready := (state === s_read_resp)
io.finished := (state === s_done)
io.select := select
assert(!io.nasti.r.valid || io.nasti.r.bits.data === select,
"NASTI DeMux test: Read data did not match")
}
class NastiDemuxSlave(implicit p: Parameters) extends NastiModule()(p) {
val io = (new NastiIO).flip
val (s_write_wait :: s_write_data :: s_write_resp ::
s_read_wait :: s_read_resp :: s_done :: Nil) = Enum(Bits(), 6)
val state = Reg(init = s_write_wait)
val value = Reg(UInt(width = 64))
val id = Reg(UInt(width = nastiXIdBits))
when (io.aw.fire()) {
id := io.aw.bits.id
state := s_write_data
}
when (io.w.fire()) {
value := io.w.bits.data
state := s_write_resp
}
when (io.b.fire()) { state := s_read_wait }
when (io.ar.fire()) {
id := io.ar.bits.id
state := s_read_resp
}
when (io.r.fire()) { state := s_done }
io.aw.ready := (state === s_write_wait)
io.w.ready := (state === s_write_data)
io.b.valid := (state === s_write_resp)
io.b.bits := NastiWriteResponseChannel(id = id)
io.ar.ready := (state === s_read_wait)
io.r.valid := (state === s_read_resp)
io.r.bits := NastiReadDataChannel(id = id, data = value)
}
class NastiMemoryDemuxTest(implicit p: Parameters) extends UnitTest {
val nSlaves = 4
val driver = Module(new NastiDemuxDriver(nSlaves))
driver.io.start := io.start
io.finished := driver.io.finished
val demux = Module(new NastiMemoryDemux(nSlaves))
demux.io.master <> driver.io.nasti
demux.io.select := driver.io.select
for (i <- 0 until nSlaves) {
val slave = Module(new NastiDemuxSlave)
slave.io <> demux.io.slaves(i)
}
}

View File

@ -0,0 +1,65 @@
package junctions.unittests
import Chisel._
import junctions._
import cde.{Field, Parameters}
abstract class UnitTest extends Module {
val io = new Bundle {
val finished = Bool(OUTPUT)
val start = Bool(INPUT)
}
when (io.start) {
printf(s"Started UnitTest ${this.getClass.getSimpleName}\n")
}
}
case object UnitTests extends Field[Parameters => Seq[UnitTest]]
class UnitTestSuite(implicit p: Parameters) extends Module {
val io = new Bundle {
val finished = Bool(OUTPUT)
}
val tests = p(UnitTests)(p)
val s_idle :: s_start :: s_wait :: s_done :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_idle)
val test_idx = Reg(init = UInt(0, log2Up(tests.size)))
val test_finished = Vec(tests.map(_.io.finished))
when (state === s_idle) { state := s_start }
when (state === s_start) { state := s_wait }
when (state === s_wait && test_finished(test_idx)) {
state := s_start
test_idx := test_idx + UInt(1)
state := Mux(test_idx === UInt(tests.size - 1), s_done, s_start)
}
val timer = Module(new Timer(1000, tests.size))
timer.io.start.valid := Bool(false)
timer.io.stop.valid := Bool(false)
tests.zipWithIndex.foreach { case (mod, i) =>
mod.io.start := (state === s_start) && test_idx === UInt(i)
when (test_idx === UInt(i)) {
timer.io.start.valid := mod.io.start
timer.io.start.bits := UInt(i)
timer.io.stop.valid := mod.io.finished
timer.io.stop.bits := UInt(i)
}
}
io.finished := (state === s_done)
assert(!timer.io.timeout.valid, "UnitTest timed out")
}
object JunctionsUnitTests {
def apply(implicit p: Parameters): Seq[UnitTest] =
Seq(
Module(new MultiWidthFifoTest),
Module(new AtosConverterTest),
Module(new NastiMemoryDemuxTest),
Module(new HastiTest))
}

View File

@ -0,0 +1,365 @@
// See LICENSE for license details.
package junctions
import Chisel._
import cde.Parameters
class ParameterizedBundle(implicit p: Parameters) extends Bundle {
override def cloneType = {
try {
this.getClass.getConstructors.head.newInstance(p).asInstanceOf[this.type]
} catch {
case e: java.lang.IllegalArgumentException =>
throwException("Unable to use ParamaterizedBundle.cloneType on " +
this.getClass + ", probably because " + this.getClass +
"() takes more than one argument. Consider overriding " +
"cloneType() on " + this.getClass, e)
}
}
}
class HellaFlowQueue[T <: Data](val entries: Int)(data: => T) extends Module {
val io = new QueueIO(data, entries)
require(entries > 1)
val do_flow = Wire(Bool())
val do_enq = io.enq.fire() && !do_flow
val do_deq = io.deq.fire() && !do_flow
val maybe_full = Reg(init=Bool(false))
val enq_ptr = Counter(do_enq, entries)._1
val (deq_ptr, deq_done) = Counter(do_deq, entries)
when (do_enq =/= do_deq) { maybe_full := do_enq }
val ptr_match = enq_ptr === deq_ptr
val empty = ptr_match && !maybe_full
val full = ptr_match && maybe_full
val atLeastTwo = full || enq_ptr - deq_ptr >= UInt(2)
do_flow := empty && io.deq.ready
val ram = SeqMem(entries, data)
when (do_enq) { ram.write(enq_ptr, io.enq.bits) }
val ren = io.deq.ready && (atLeastTwo || !io.deq.valid && !empty)
val raddr = Mux(io.deq.valid, Mux(deq_done, UInt(0), deq_ptr + UInt(1)), deq_ptr)
val ram_out_valid = Reg(next = ren)
io.deq.valid := Mux(empty, io.enq.valid, ram_out_valid)
io.enq.ready := !full
io.deq.bits := Mux(empty, io.enq.bits, ram.read(raddr, ren))
}
class HellaQueue[T <: Data](val entries: Int)(data: => T) extends Module {
val io = new QueueIO(data, entries)
val fq = Module(new HellaFlowQueue(entries)(data))
fq.io.enq <> io.enq
io.deq <> Queue(fq.io.deq, 1, pipe = true)
}
object HellaQueue {
def apply[T <: Data](enq: DecoupledIO[T], entries: Int) = {
val q = Module((new HellaQueue(entries)) { enq.bits })
q.io.enq.valid := enq.valid // not using <> so that override is allowed
q.io.enq.bits := enq.bits
enq.ready := q.io.enq.ready
q.io.deq
}
}
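// Illustrative only: HellaQueue.apply wraps a producer in an entries-deep
// flow-through queue and hands back the dequeue side; names are hypothetical.
class BufferedPipeExample extends Module {
  val io = new Bundle {
    val in = Decoupled(UInt(width = 32)).flip
    val out = Decoupled(UInt(width = 32))
  }
  io.out <> HellaQueue(io.in, 16)
}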
/** A generalized locking RR arbiter that addresses the limitations of the
* version in the Chisel standard library */
abstract class JunctionsAbstractLockingArbiter[T <: Data](typ: T, arbN: Int)
extends Module {
val io = new Bundle {
val in = Vec(arbN, Decoupled(typ.cloneType)).flip
val out = Decoupled(typ.cloneType)
}
def rotateLeft[T <: Data](norm: Vec[T], rot: UInt): Vec[T] = {
val n = norm.size
Vec.tabulate(n) { i =>
Mux(rot < UInt(n - i), norm(UInt(i) + rot), norm(rot - UInt(n - i)))
}
}
val lockIdx = Reg(init = UInt(0, log2Up(arbN)))
val locked = Reg(init = Bool(false))
val choice = PriorityMux(
rotateLeft(Vec(io.in.map(_.valid)), lockIdx + UInt(1)),
rotateLeft(Vec((0 until arbN).map(UInt(_))), lockIdx + UInt(1)))
val chosen = Mux(locked, lockIdx, choice)
for (i <- 0 until arbN) {
io.in(i).ready := io.out.ready && chosen === UInt(i)
}
io.out.valid := io.in(chosen).valid
io.out.bits := io.in(chosen).bits
}
/** This locking arbiter determines when it is safe to unlock
* by peeking at the data */
class JunctionsPeekingArbiter[T <: Data](
typ: T, arbN: Int,
canUnlock: T => Bool,
needsLock: Option[T => Bool] = None)
extends JunctionsAbstractLockingArbiter(typ, arbN) {
def realNeedsLock(data: T): Bool =
needsLock.map(_(data)).getOrElse(Bool(true))
when (io.out.fire()) {
when (!locked && realNeedsLock(io.out.bits)) {
lockIdx := choice
locked := Bool(true)
}
// the unlock statement takes precedence
when (canUnlock(io.out.bits)) {
locked := Bool(false)
}
}
}
/** This arbiter determines when it is safe to unlock by counting transactions */
class JunctionsCountingArbiter[T <: Data](
typ: T, arbN: Int, count: Int,
val needsLock: Option[T => Bool] = None)
extends JunctionsAbstractLockingArbiter(typ, arbN) {
def realNeedsLock(data: T): Bool =
needsLock.map(_(data)).getOrElse(Bool(true))
// if count is 1, you should use a non-locking arbiter
require(count > 1, "CountingArbiter cannot have count <= 1")
val lock_ctr = Counter(count)
when (io.out.fire()) {
when (!locked && realNeedsLock(io.out.bits)) {
lockIdx := choice
locked := Bool(true)
lock_ctr.inc()
}
when (locked) {
when (lock_ctr.inc()) { locked := Bool(false) }
}
}
}
class ReorderQueueWrite[T <: Data](dType: T, tagWidth: Int) extends Bundle {
val data = dType.cloneType
val tag = UInt(width = tagWidth)
override def cloneType =
new ReorderQueueWrite(dType, tagWidth).asInstanceOf[this.type]
}
class ReorderEnqueueIO[T <: Data](dType: T, tagWidth: Int)
extends DecoupledIO(new ReorderQueueWrite(dType, tagWidth)) {
override def cloneType =
new ReorderEnqueueIO(dType, tagWidth).asInstanceOf[this.type]
}
class ReorderDequeueIO[T <: Data](dType: T, tagWidth: Int) extends Bundle {
val valid = Bool(INPUT)
val tag = UInt(INPUT, tagWidth)
val data = dType.cloneType.asOutput
val matches = Bool(OUTPUT)
override def cloneType =
new ReorderDequeueIO(dType, tagWidth).asInstanceOf[this.type]
}
class ReorderQueue[T <: Data](dType: T, tagWidth: Int, size: Option[Int] = None)
extends Module {
val io = new Bundle {
val enq = new ReorderEnqueueIO(dType, tagWidth).flip
val deq = new ReorderDequeueIO(dType, tagWidth)
}
val tagSpaceSize = 1 << tagWidth
val actualSize = size.getOrElse(tagSpaceSize)
if (tagSpaceSize > actualSize) {
val roq_data = Reg(Vec(actualSize, dType))
val roq_tags = Reg(Vec(actualSize, UInt(width = tagWidth)))
val roq_free = Reg(init = Vec.fill(actualSize)(Bool(true)))
val roq_enq_addr = PriorityEncoder(roq_free)
val roq_matches = roq_tags.zip(roq_free)
.map { case (tag, free) => tag === io.deq.tag && !free }
val roq_deq_onehot = PriorityEncoderOH(roq_matches)
io.enq.ready := roq_free.reduce(_ || _)
io.deq.data := Mux1H(roq_deq_onehot, roq_data)
io.deq.matches := roq_matches.reduce(_ || _)
when (io.enq.valid && io.enq.ready) {
roq_data(roq_enq_addr) := io.enq.bits.data
roq_tags(roq_enq_addr) := io.enq.bits.tag
roq_free(roq_enq_addr) := Bool(false)
}
when (io.deq.valid) {
roq_free(OHToUInt(roq_deq_onehot)) := Bool(true)
}
println(s"Warning - using a CAM for ReorderQueue, tagBits: ${tagWidth} size: ${actualSize}")
} else {
val roq_data = Mem(tagSpaceSize, dType)
val roq_free = Reg(init = Vec.fill(tagSpaceSize)(Bool(true)))
io.enq.ready := roq_free(io.enq.bits.tag)
io.deq.data := roq_data(io.deq.tag)
io.deq.matches := !roq_free(io.deq.tag)
when (io.enq.valid && io.enq.ready) {
roq_data(io.enq.bits.tag) := io.enq.bits.data
roq_free(io.enq.bits.tag) := Bool(false)
}
when (io.deq.valid) {
roq_free(io.deq.tag) := Bool(true)
}
}
}
object DecoupledHelper {
def apply(rvs: Bool*) = new DecoupledHelper(rvs)
}
class DecoupledHelper(val rvs: Seq[Bool]) {
def fire(exclude: Bool, includes: Bool*) = {
(rvs.filter(_ ne exclude) ++ includes).reduce(_ && _)
}
}
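// Illustrative only: a minimal sketch of the DecoupledHelper rendezvous
// idiom, fanning one input out to two outputs; names are hypothetical. Each
// endpoint fires only when all the others are ready/valid, and excluding an
// endpoint's own signal from its fire() avoids a combinational loop.
class FanOutExample extends Module {
  val io = new Bundle {
    val in = Decoupled(UInt(width = 8)).flip
    val outA = Decoupled(UInt(width = 8))
    val outB = Decoupled(UInt(width = 8))
  }
  val helper = DecoupledHelper(io.in.valid, io.outA.ready, io.outB.ready)
  io.in.ready := helper.fire(io.in.valid)
  io.outA.valid := helper.fire(io.outA.ready)
  io.outB.valid := helper.fire(io.outB.ready)
  io.outA.bits := io.in.bits
  io.outB.bits := io.in.bits
}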
class MultiWidthFifo(inW: Int, outW: Int, n: Int) extends Module {
val io = new Bundle {
val in = Decoupled(Bits(width = inW)).flip
val out = Decoupled(Bits(width = outW))
val count = UInt(OUTPUT, log2Up(n + 1))
}
if (inW == outW) {
val q = Module(new Queue(Bits(width = inW), n))
q.io.enq <> io.in
io.out <> q.io.deq
io.count := q.io.count
} else if (inW > outW) {
val nBeats = inW / outW
require(inW % outW == 0, s"MultiWidthFifo: in: $inW not divisible by out: $outW")
require(n % nBeats == 0, s"Cannot store $n output words when output beats is $nBeats")
val wdata = Reg(Vec(n / nBeats, Bits(width = inW)))
val rdata = Vec(wdata.flatMap { indat =>
(0 until nBeats).map(i => indat(outW * (i + 1) - 1, outW * i)) })
val head = Reg(init = UInt(0, log2Up(n / nBeats)))
val tail = Reg(init = UInt(0, log2Up(n)))
val size = Reg(init = UInt(0, log2Up(n + 1)))
when (io.in.fire()) {
wdata(head) := io.in.bits
head := head + UInt(1)
}
when (io.out.fire()) { tail := tail + UInt(1) }
size := MuxCase(size, Seq(
(io.in.fire() && io.out.fire()) -> (size + UInt(nBeats - 1)),
io.in.fire() -> (size + UInt(nBeats)),
io.out.fire() -> (size - UInt(1))))
io.out.valid := size > UInt(0)
io.out.bits := rdata(tail)
io.in.ready := size < UInt(n)
io.count := size
} else {
val nBeats = outW / inW
require(outW % inW == 0, s"MultiWidthFifo: out: $outW not divisible by in: $inW")
val wdata = Reg(Vec(n * nBeats, Bits(width = inW)))
val rdata = Vec.tabulate(n) { i =>
Cat(wdata.slice(i * nBeats, (i + 1) * nBeats).reverse)}
val head = Reg(init = UInt(0, log2Up(n * nBeats)))
val tail = Reg(init = UInt(0, log2Up(n)))
val size = Reg(init = UInt(0, log2Up(n * nBeats + 1)))
when (io.in.fire()) {
wdata(head) := io.in.bits
head := head + UInt(1)
}
when (io.out.fire()) { tail := tail + UInt(1) }
size := MuxCase(size, Seq(
(io.in.fire() && io.out.fire()) -> (size - UInt(nBeats - 1)),
io.in.fire() -> (size + UInt(1)),
io.out.fire() -> (size - UInt(nBeats))))
io.count := size >> UInt(log2Up(nBeats))
io.out.valid := io.count > UInt(0)
io.out.bits := rdata(tail)
io.in.ready := size < UInt(n * nBeats)
}
}
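// Illustrative usage (not in the original source): narrow 64-bit beats to
// 16-bit beats; n = 8 output words buffers two full input words.
//   val fifo = Module(new MultiWidthFifo(inW = 64, outW = 16, n = 8))
//   fifo.io.in <> wideSource    // hypothetical Decoupled(Bits(width = 64))
//   narrowSink <> fifo.io.out   // hypothetical Decoupled(Bits(width = 16))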
// ============
// Static timer
// ============
// Timer with a statically-specified period.
// Can track multiple in-flight start/stop events, distinguished by ID.
// Continues to count down as long as at least one event is in flight.
class Timer(initCount: Int, maxInflight: Int) extends Module {
val io = new Bundle {
val start = Valid(UInt(width = log2Up(maxInflight))).flip
val stop = Valid(UInt(width = log2Up(maxInflight))).flip
val timeout = Valid(UInt(width = log2Up(maxInflight)))
}
val inflight = Reg(init = Vec.fill(maxInflight) { Bool(false) })
val countdown = Reg(UInt(width = log2Up(initCount)))
val active = inflight.reduce(_ || _)
when (active) {
countdown := countdown - UInt(1)
}
when (io.start.valid) {
inflight(io.start.bits) := Bool(true)
countdown := UInt(initCount - 1)
}
when (io.stop.valid) {
inflight(io.stop.bits) := Bool(false)
}
io.timeout.valid := countdown === UInt(0) && active
io.timeout.bits := PriorityEncoder(inflight)
assert(!io.stop.valid || inflight(io.stop.bits),
"Timer stop for transaction that's not inflight")
}
object Timer {
def apply(initCount: Int, start: Bool, stop: Bool): Bool = {
val timer = Module(new Timer(initCount, 1))
timer.io.start.valid := start
timer.io.start.bits := UInt(0)
timer.io.stop.valid := stop
timer.io.stop.bits := UInt(0)
timer.io.timeout.valid
}
}
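// Illustrative usage (not in the original source): detect a response that
// has not arrived within 1000 cycles of its request. `req` and `resp` are
// hypothetical decoupled channels.
//   val timed_out = Timer(1000, req.fire(), resp.fire())
//   assert(!timed_out, "no response within 1000 cycles")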


@@ -0,0 +1,113 @@
// See LICENSE for license details.
package rocket
import Chisel._
import cde.{Parameters, Field}
import junctions.{ParameterizedBundle, DecoupledHelper}
class HellaCacheArbiter(n: Int)(implicit p: Parameters) extends Module
{
val io = new Bundle {
val requestor = Vec(n, new HellaCacheIO).flip
val mem = new HellaCacheIO
}
if (n == 1) {
io.mem <> io.requestor.head
} else {
val s1_id = Reg(UInt())
val s2_id = Reg(next=s1_id)
io.mem.invalidate_lr := io.requestor.map(_.invalidate_lr).reduce(_||_)
io.mem.req.valid := io.requestor.map(_.req.valid).reduce(_||_)
io.requestor(0).req.ready := io.mem.req.ready
for (i <- 1 until n)
io.requestor(i).req.ready := io.requestor(i-1).req.ready && !io.requestor(i-1).req.valid
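// Priority is by index: requestor 0 sees mem.req.ready directly, and since
// the loop below runs from n-1 down to 0, the lowest-index valid requestor's
// connect_s0() is elaborated last and wins the port. The requestor index is
// appended in the low log2Up(n) tag bits so the response can be routed back
// (and stripped off again in the response loop).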
for (i <- n-1 to 0 by -1) {
val req = io.requestor(i).req
def connect_s0() = {
io.mem.req.bits.cmd := req.bits.cmd
io.mem.req.bits.typ := req.bits.typ
io.mem.req.bits.addr := req.bits.addr
io.mem.req.bits.phys := req.bits.phys
io.mem.req.bits.tag := Cat(req.bits.tag, UInt(i, log2Up(n)))
s1_id := UInt(i)
}
def connect_s1() = {
io.mem.s1_kill := io.requestor(i).s1_kill
io.mem.s1_data := io.requestor(i).s1_data
}
if (i == n-1) {
connect_s0()
connect_s1()
} else {
when (req.valid) { connect_s0() }
when (s1_id === UInt(i)) { connect_s1() }
}
}
for (i <- 0 until n) {
val resp = io.requestor(i).resp
val tag_hit = io.mem.resp.bits.tag(log2Up(n)-1,0) === UInt(i)
resp.valid := io.mem.resp.valid && tag_hit
io.requestor(i).xcpt := io.mem.xcpt
io.requestor(i).ordered := io.mem.ordered
io.requestor(i).s2_nack := io.mem.s2_nack && s2_id === UInt(i)
resp.bits := io.mem.resp.bits
resp.bits.tag := io.mem.resp.bits.tag >> log2Up(n)
io.requestor(i).replay_next := io.mem.replay_next
}
}
}
class InOrderArbiter[T <: Data, U <: Data](reqTyp: T, respTyp: U, n: Int)
(implicit p: Parameters) extends Module {
val io = new Bundle {
val in_req = Vec(n, Decoupled(reqTyp)).flip
val in_resp = Vec(n, Decoupled(respTyp))
val out_req = Decoupled(reqTyp)
val out_resp = Decoupled(respTyp).flip
}
if (n > 1) {
val route_q = Module(new Queue(UInt(width = log2Up(n)), 2))
val req_arb = Module(new RRArbiter(reqTyp, n))
req_arb.io.in <> io.in_req
val req_helper = DecoupledHelper(
req_arb.io.out.valid,
route_q.io.enq.ready,
io.out_req.ready)
io.out_req.bits := req_arb.io.out.bits
io.out_req.valid := req_helper.fire(io.out_req.ready)
route_q.io.enq.bits := req_arb.io.chosen
route_q.io.enq.valid := req_helper.fire(route_q.io.enq.ready)
req_arb.io.out.ready := req_helper.fire(req_arb.io.out.valid)
val resp_sel = route_q.io.deq.bits
val resp_ready = io.in_resp(resp_sel).ready
val resp_helper = DecoupledHelper(
resp_ready,
route_q.io.deq.valid,
io.out_resp.valid)
val resp_valid = resp_helper.fire(resp_ready)
for (i <- 0 until n) {
io.in_resp(i).bits := io.out_resp.bits
io.in_resp(i).valid := resp_valid && resp_sel === UInt(i)
}
route_q.io.deq.ready := resp_helper.fire(route_q.io.deq.valid)
io.out_resp.ready := resp_helper.fire(io.out_resp.valid)
} else {
io.out_req <> io.in_req.head
io.in_resp.head <> io.out_resp
}
}


@@ -0,0 +1,82 @@
// See LICENSE for license details.
package rocket
import Chisel._
import Util._
import cde.Parameters
class TDRSelect(implicit p: Parameters) extends CoreBundle()(p) {
val tdrmode = Bool()
val reserved = UInt(width = xLen - 1 - log2Up(nTDR))
val tdrindex = UInt(width = log2Up(nTDR))
def nTDR = p(NBreakpoints)
}
class BPControl(implicit p: Parameters) extends CoreBundle()(p) {
val tdrtype = UInt(width = 4)
val bpamaskmax = UInt(width = 5)
val reserved = UInt(width = xLen-28)
val bpaction = UInt(width = 8)
val bpmatch = UInt(width = 4)
val m = Bool()
val h = Bool()
val s = Bool()
val u = Bool()
val r = Bool()
val w = Bool()
val x = Bool()
def tdrType = 1
def bpaMaskMax = 4
def enabled(mstatus: MStatus) = Cat(m, h, s, u)(mstatus.prv)
}
class BP(implicit p: Parameters) extends CoreBundle()(p) {
val control = new BPControl
val address = UInt(width = vaddrBits)
def mask(dummy: Int = 0) = {
var mask: UInt = control.bpmatch(1)
for (i <- 1 until control.bpaMaskMax)
mask = Cat(mask(i-1) && address(i-1), mask)
mask
}
def pow2AddressMatch(x: UInt) =
(~x | mask()) === (~address | mask())
}
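// Note (added): mask() extends bpmatch(1) through the trailing set bits of
// `address`, yielding a NAPOT (naturally-aligned power-of-two) region. For
// example, with bpmatch(1) = 1 and an address ending in binary ...011, the
// low three bits become don't-cares, so pow2AddressMatch covers an
// 8-byte-aligned region.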
class BreakpointUnit(implicit p: Parameters) extends CoreModule()(p) {
val io = new Bundle {
val status = new MStatus().asInput
val bp = Vec(p(NBreakpoints), new BP).asInput
val pc = UInt(INPUT, vaddrBits)
val ea = UInt(INPUT, vaddrBits)
val xcpt_if = Bool(OUTPUT)
val xcpt_ld = Bool(OUTPUT)
val xcpt_st = Bool(OUTPUT)
}
io.xcpt_if := false
io.xcpt_ld := false
io.xcpt_st := false
for (bp <- io.bp) {
when (bp.control.enabled(io.status)) {
when (bp.pow2AddressMatch(io.pc) && bp.control.x) { io.xcpt_if := true }
when (bp.pow2AddressMatch(io.ea) && bp.control.r) { io.xcpt_ld := true }
when (bp.pow2AddressMatch(io.ea) && bp.control.w) { io.xcpt_st := true }
}
}
if (!io.bp.isEmpty) for ((bpl, bph) <- io.bp zip io.bp.tail) {
def matches(x: UInt) = !(x < bpl.address) && x < bph.address
when (bph.control.enabled(io.status) && bph.control.bpmatch === 1) {
when (matches(io.pc) && bph.control.x) { io.xcpt_if := true }
when (matches(io.ea) && bph.control.r) { io.xcpt_ld := true }
when (matches(io.ea) && bph.control.w) { io.xcpt_st := true }
}
}
}


@@ -0,0 +1,269 @@
// See LICENSE for license details.
package rocket
import Chisel._
import junctions._
import cde.{Parameters, Field}
import Util._
import uncore.util._
case object BtbKey extends Field[BtbParameters]
case class BtbParameters(
nEntries: Int = 62,
nRAS: Int = 2,
updatesOutOfOrder: Boolean = false)
abstract trait HasBtbParameters extends HasCoreParameters {
val matchBits = pgIdxBits
val entries = p(BtbKey).nEntries
val nRAS = p(BtbKey).nRAS
val updatesOutOfOrder = p(BtbKey).updatesOutOfOrder
val nPages = ((1 max(log2Up(entries)))+1)/2*2 // control logic assumes 2 divides pages
val opaqueBits = log2Up(entries)
val nBHT = 1 << log2Up(entries*2)
}
abstract class BtbModule(implicit val p: Parameters) extends Module with HasBtbParameters
abstract class BtbBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
with HasBtbParameters
class RAS(nras: Int) {
def push(addr: UInt): Unit = {
when (count < nras) { count := count + 1 }
val nextPos = Mux(Bool(isPow2(nras)) || pos < nras-1, pos+1, UInt(0))
stack(nextPos) := addr
pos := nextPos
}
def peek: UInt = stack(pos)
def pop(): Unit = when (!isEmpty) {
count := count - 1
pos := Mux(Bool(isPow2(nras)) || pos > 0, pos-1, UInt(nras-1))
}
def clear(): Unit = count := UInt(0)
def isEmpty: Bool = count === UInt(0)
private val count = Reg(UInt(width = log2Up(nras+1)))
private val pos = Reg(UInt(width = log2Up(nras)))
private val stack = Reg(Vec(nras, UInt()))
}
class BHTResp(implicit p: Parameters) extends BtbBundle()(p) {
val history = UInt(width = log2Up(nBHT).max(1))
val value = UInt(width = 2)
}
// BHT contains table of 2-bit counters and a global history register.
// The BHT only predicts and updates when there is a BTB hit.
// The global history:
// - updated speculatively in fetch (if there's a BTB hit).
// - on a mispredict, the history register is reset (again, only if BTB hit).
// The counter table:
// - each counter corresponds to the address of the fetch packet ("fetch pc").
// - updated when a branch resolves (and BTB was a hit for that branch).
// The updating branch must provide its "fetch pc".
class BHT(nbht: Int)(implicit val p: Parameters) extends HasCoreParameters {
val nbhtbits = log2Up(nbht)
def get(addr: UInt, update: Bool): BHTResp = {
val res = Wire(new BHTResp)
val index = addr(nbhtbits+1, log2Up(coreInstBytes)) ^ history
res.value := table(index)
res.history := history
val taken = res.value(0)
when (update) { history := Cat(taken, history(nbhtbits-1,1)) }
res
}
def update(addr: UInt, d: BHTResp, taken: Bool, mispredict: Bool): Unit = {
val index = addr(nbhtbits+1, log2Up(coreInstBytes)) ^ d.history
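// Next state of the 2-bit counter: the MSB becomes the resolved direction,
// and the LSB is a hysteresis bit, set when the old value was strong (both
// bits set) or had any strength and the branch was taken.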
table(index) := Cat(taken, (d.value(1) & d.value(0)) | ((d.value(1) | d.value(0)) & taken))
when (mispredict) { history := Cat(taken, d.history(nbhtbits-1,1)) }
}
private val table = Mem(nbht, UInt(width = 2))
val history = Reg(UInt(width = nbhtbits))
}
// BTB update occurs during branch resolution (and only on a mispredict).
// - "pc" is what future fetch PCs will tag match against.
// - "br_pc" is the PC of the branch instruction.
class BTBUpdate(implicit p: Parameters) extends BtbBundle()(p) {
val prediction = Valid(new BTBResp)
val pc = UInt(width = vaddrBits)
val target = UInt(width = vaddrBits)
val taken = Bool()
val isValid = Bool()
val isJump = Bool()
val isReturn = Bool()
val br_pc = UInt(width = vaddrBits)
}
// BHT update occurs during branch resolution on all conditional branches.
// - "pc" is what future fetch PCs will tag match against.
class BHTUpdate(implicit p: Parameters) extends BtbBundle()(p) {
val prediction = Valid(new BTBResp)
val pc = UInt(width = vaddrBits)
val taken = Bool()
val mispredict = Bool()
}
class RASUpdate(implicit p: Parameters) extends BtbBundle()(p) {
val isCall = Bool()
val isReturn = Bool()
val returnAddr = UInt(width = vaddrBits)
val prediction = Valid(new BTBResp)
}
// - "bridx" is the low-order PC bits of the predicted branch (after
// shifting off the lowest log(inst_bytes) bits).
// - "mask" provides a mask of valid instructions (instructions are
// masked off by the predicted taken branch from the BTB).
class BTBResp(implicit p: Parameters) extends BtbBundle()(p) {
val taken = Bool()
val mask = Bits(width = fetchWidth)
val bridx = Bits(width = log2Up(fetchWidth))
val target = UInt(width = vaddrBits)
val entry = UInt(width = opaqueBits)
val bht = new BHTResp
}
class BTBReq(implicit p: Parameters) extends BtbBundle()(p) {
val addr = UInt(width = vaddrBits)
}
// fully-associative branch target buffer
// Higher-performance processors may cause BTB updates to occur out-of-order,
// which requires an extra CAM port for updates (to ensure no duplicates get
placed in the BTB).
class BTB(implicit p: Parameters) extends BtbModule {
val io = new Bundle {
val req = Valid(new BTBReq).flip
val resp = Valid(new BTBResp)
val btb_update = Valid(new BTBUpdate).flip
val bht_update = Valid(new BHTUpdate).flip
val ras_update = Valid(new RASUpdate).flip
}
val idxs = Reg(Vec(entries, UInt(width=matchBits - log2Up(coreInstBytes))))
val idxPages = Reg(Vec(entries, UInt(width=log2Up(nPages))))
val tgts = Reg(Vec(entries, UInt(width=matchBits - log2Up(coreInstBytes))))
val tgtPages = Reg(Vec(entries, UInt(width=log2Up(nPages))))
val pages = Reg(Vec(nPages, UInt(width=vaddrBits - matchBits)))
val pageValid = Reg(init = UInt(0, nPages))
val idxPagesOH = idxPages.map(UIntToOH(_)(nPages-1,0))
val tgtPagesOH = tgtPages.map(UIntToOH(_)(nPages-1,0))
val isValid = Reg(init = UInt(0, entries))
val isReturn = Reg(UInt(width = entries))
val isJump = Reg(UInt(width = entries))
val brIdx = Reg(Vec(entries, UInt(width=log2Up(fetchWidth))))
private def page(addr: UInt) = addr >> matchBits
private def pageMatch(addr: UInt) = {
val p = page(addr)
pageValid & pages.map(_ === p).asUInt
}
private def tagMatch(addr: UInt, pgMatch: UInt) = {
val idxMatch = idxs.map(_ === addr(matchBits-1, log2Up(coreInstBytes))).asUInt
val idxPageMatch = idxPagesOH.map(_ & pgMatch).map(_.orR).asUInt
idxMatch & idxPageMatch & isValid
}
val r_btb_update = Pipe(io.btb_update)
val update_target = io.req.bits.addr
val pageHit = pageMatch(io.req.bits.addr)
val hitsVec = tagMatch(io.req.bits.addr, pageHit)
val hits = hitsVec.asUInt
val updatePageHit = pageMatch(r_btb_update.bits.pc)
val updateHits = tagMatch(r_btb_update.bits.pc, updatePageHit)
val updateHit = if (updatesOutOfOrder) updateHits.orR else r_btb_update.bits.prediction.valid
val updateHitAddr = if (updatesOutOfOrder) OHToUInt(updateHits) else r_btb_update.bits.prediction.bits.entry
val nextRepl = Counter(r_btb_update.valid && !updateHit, entries)._1
val useUpdatePageHit = updatePageHit.orR
val usePageHit = pageHit.orR
val doIdxPageRepl = !useUpdatePageHit
val nextPageRepl = Reg(UInt(width = log2Ceil(nPages)))
val idxPageRepl = Mux(usePageHit, Cat(pageHit(nPages-2,0), pageHit(nPages-1)), UIntToOH(nextPageRepl))
val idxPageUpdateOH = Mux(useUpdatePageHit, updatePageHit, idxPageRepl)
val idxPageUpdate = OHToUInt(idxPageUpdateOH)
val idxPageReplEn = Mux(doIdxPageRepl, idxPageRepl, UInt(0))
val samePage = page(r_btb_update.bits.pc) === page(update_target)
val doTgtPageRepl = !samePage && !usePageHit
val tgtPageRepl = Mux(samePage, idxPageUpdateOH, Cat(idxPageUpdateOH(nPages-2,0), idxPageUpdateOH(nPages-1)))
val tgtPageUpdate = OHToUInt(Mux(usePageHit, pageHit, tgtPageRepl))
val tgtPageReplEn = Mux(doTgtPageRepl, tgtPageRepl, UInt(0))
when (r_btb_update.valid && (doIdxPageRepl || doTgtPageRepl)) {
val both = doIdxPageRepl && doTgtPageRepl
val next = nextPageRepl + Mux[UInt](both, 2, 1)
nextPageRepl := Mux(next >= nPages, next(0), next)
}
when (r_btb_update.valid) {
val waddr = Mux(updateHit, updateHitAddr, nextRepl)
val mask = UIntToOH(waddr)
idxs(waddr) := r_btb_update.bits.pc(matchBits-1, log2Up(coreInstBytes))
tgts(waddr) := update_target(matchBits-1, log2Up(coreInstBytes))
idxPages(waddr) := idxPageUpdate
tgtPages(waddr) := tgtPageUpdate
isValid := Mux(r_btb_update.bits.isValid, isValid | mask, isValid & ~mask)
isReturn := Mux(r_btb_update.bits.isReturn, isReturn | mask, isReturn & ~mask)
isJump := Mux(r_btb_update.bits.isJump, isJump | mask, isJump & ~mask)
if (fetchWidth > 1)
brIdx(waddr) := r_btb_update.bits.br_pc >> log2Up(coreInstBytes)
require(nPages % 2 == 0)
val idxWritesEven = !idxPageUpdate(0)
def writeBank(i: Int, mod: Int, en: UInt, data: UInt) =
for (i <- i until nPages by mod)
when (en(i)) { pages(i) := data }
writeBank(0, 2, Mux(idxWritesEven, idxPageReplEn, tgtPageReplEn),
Mux(idxWritesEven, page(r_btb_update.bits.pc), page(update_target)))
writeBank(1, 2, Mux(idxWritesEven, tgtPageReplEn, idxPageReplEn),
Mux(idxWritesEven, page(update_target), page(r_btb_update.bits.pc)))
pageValid := pageValid | tgtPageReplEn | idxPageReplEn
}
io.resp.valid := hits.orR
io.resp.bits.taken := true
io.resp.bits.target := Cat(Mux1H(Mux1H(hitsVec, tgtPagesOH), pages), Mux1H(hitsVec, tgts) << log2Up(coreInstBytes))
io.resp.bits.entry := OHToUInt(hits)
io.resp.bits.bridx := (if (fetchWidth > 1) Mux1H(hitsVec, brIdx) else UInt(0))
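// Valid-instruction mask: when the prediction is taken, set the low bits up
// to and including the predicted branch slot (bridx); when not taken, the
// shift saturates and the whole fetch packet is marked valid.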
io.resp.bits.mask := Cat((UInt(1) << ~Mux(io.resp.bits.taken, ~io.resp.bits.bridx, UInt(0)))-1, UInt(1))
if (nBHT > 0) {
val bht = new BHT(nBHT)
val isBranch = !(hits & isJump).orR
val res = bht.get(io.req.bits.addr, io.req.valid && io.resp.valid && isBranch)
val update_btb_hit = io.bht_update.bits.prediction.valid
when (io.bht_update.valid && update_btb_hit) {
bht.update(io.bht_update.bits.pc, io.bht_update.bits.prediction.bits.bht, io.bht_update.bits.taken, io.bht_update.bits.mispredict)
}
when (!res.value(0) && isBranch) { io.resp.bits.taken := false }
io.resp.bits.bht := res
}
if (nRAS > 0) {
val ras = new RAS(nRAS)
val doPeek = (hits & isReturn).orR
when (!ras.isEmpty && doPeek) {
io.resp.bits.target := ras.peek
}
when (io.ras_update.valid) {
when (io.ras_update.bits.isCall) {
ras.push(io.ras_update.bits.returnAddr)
when (doPeek) {
io.resp.bits.target := io.ras_update.bits.returnAddr
}
}.elsewhen (io.ras_update.bits.isReturn && io.ras_update.bits.prediction.valid) {
ras.pop()
}
}
}
}


@@ -0,0 +1,61 @@
// See LICENSE for license details.
package rocket
package constants
import Chisel._
import scala.math._
trait ScalarOpConstants {
val MT_SZ = 3
val MT_X = BitPat("b???")
val MT_B = UInt("b000")
val MT_H = UInt("b001")
val MT_W = UInt("b010")
val MT_D = UInt("b011")
val MT_BU = UInt("b100")
val MT_HU = UInt("b101")
val MT_WU = UInt("b110")
def mtSize(mt: UInt) = mt(MT_SZ-2, 0)
def mtSigned(mt: UInt) = !mt(MT_SZ-1)
val SZ_BR = 3
val BR_X = BitPat("b???")
val BR_EQ = UInt(0, 3)
val BR_NE = UInt(1, 3)
val BR_J = UInt(2, 3)
val BR_N = UInt(3, 3)
val BR_LT = UInt(4, 3)
val BR_GE = UInt(5, 3)
val BR_LTU = UInt(6, 3)
val BR_GEU = UInt(7, 3)
val A1_X = BitPat("b??")
val A1_ZERO = UInt(0, 2)
val A1_RS1 = UInt(1, 2)
val A1_PC = UInt(2, 2)
val IMM_X = BitPat("b???")
val IMM_S = UInt(0, 3)
val IMM_SB = UInt(1, 3)
val IMM_U = UInt(2, 3)
val IMM_UJ = UInt(3, 3)
val IMM_I = UInt(4, 3)
val IMM_Z = UInt(5, 3)
val A2_X = BitPat("b??")
val A2_ZERO = UInt(0, 2)
val A2_SIZE = UInt(1, 2)
val A2_RS2 = UInt(2, 2)
val A2_IMM = UInt(3, 2)
val X = BitPat("b?")
val N = BitPat("b0")
val Y = BitPat("b1")
val SZ_DW = 1
val DW_X = X
val DW_32 = Bool(false)
val DW_64 = Bool(true)
val DW_XPR = DW_64
}


@@ -0,0 +1,589 @@
// See LICENSE for license details.
package rocket
import Chisel._
import Util._
import Instructions._
import cde.{Parameters, Field}
import uncore.devices._
import uncore.util._
import junctions.AddrMap
class MStatus extends Bundle {
val debug = Bool() // not truly part of mstatus, but convenient
val prv = UInt(width = PRV.SZ) // not truly part of mstatus, but convenient
val sd = Bool()
val zero3 = UInt(width = 31)
val sd_rv32 = Bool()
val zero2 = UInt(width = 2)
val vm = UInt(width = 5)
val zero1 = UInt(width = 4)
val mxr = Bool()
val pum = Bool()
val mprv = Bool()
val xs = UInt(width = 2)
val fs = UInt(width = 2)
val mpp = UInt(width = 2)
val hpp = UInt(width = 2)
val spp = UInt(width = 1)
val mpie = Bool()
val hpie = Bool()
val spie = Bool()
val upie = Bool()
val mie = Bool()
val hie = Bool()
val sie = Bool()
val uie = Bool()
}
class DCSR extends Bundle {
val xdebugver = UInt(width = 2)
val ndreset = Bool()
val fullreset = Bool()
val hwbpcount = UInt(width = 12)
val ebreakm = Bool()
val ebreakh = Bool()
val ebreaks = Bool()
val ebreaku = Bool()
val zero2 = Bool()
val stopcycle = Bool()
val stoptime = Bool()
val cause = UInt(width = 3)
val debugint = Bool()
val zero1 = Bool()
val halt = Bool()
val step = Bool()
val prv = UInt(width = PRV.SZ)
}
class MIP extends Bundle {
val rocc = Bool()
val meip = Bool()
val heip = Bool()
val seip = Bool()
val ueip = Bool()
val mtip = Bool()
val htip = Bool()
val stip = Bool()
val utip = Bool()
val msip = Bool()
val hsip = Bool()
val ssip = Bool()
val usip = Bool()
}
class PTBR(implicit p: Parameters) extends CoreBundle()(p) {
require(maxPAddrBits - pgIdxBits + asIdBits <= xLen)
val asid = UInt(width = asIdBits)
val ppn = UInt(width = maxPAddrBits - pgIdxBits)
}
object PRV
{
val SZ = 2
val U = 0
val S = 1
val H = 2
val M = 3
}
object CSR
{
// commands
val SZ = 3
val X = BitPat.dontCare(SZ)
val N = UInt(0,SZ)
val W = UInt(1,SZ)
val S = UInt(2,SZ)
val C = UInt(3,SZ)
val I = UInt(4,SZ)
val R = UInt(5,SZ)
val ADDRSZ = 12
}
class CSRFileIO(implicit p: Parameters) extends CoreBundle {
val prci = new PRCITileIO().flip
val rw = new Bundle {
val addr = UInt(INPUT, CSR.ADDRSZ)
val cmd = Bits(INPUT, CSR.SZ)
val rdata = Bits(OUTPUT, xLen)
val wdata = Bits(INPUT, xLen)
}
val csr_stall = Bool(OUTPUT)
val csr_xcpt = Bool(OUTPUT)
val eret = Bool(OUTPUT)
val singleStep = Bool(OUTPUT)
val status = new MStatus().asOutput
val ptbr = new PTBR().asOutput
val evec = UInt(OUTPUT, vaddrBitsExtended)
val exception = Bool(INPUT)
val retire = UInt(INPUT, log2Up(1+retireWidth))
val custom_mrw_csrs = Vec(nCustomMrwCsrs, UInt(INPUT, xLen))
val cause = UInt(INPUT, xLen)
val pc = UInt(INPUT, vaddrBitsExtended)
val badaddr = UInt(INPUT, vaddrBitsExtended)
val fatc = Bool(OUTPUT)
val time = UInt(OUTPUT, xLen)
val fcsr_rm = Bits(OUTPUT, FPConstants.RM_SZ)
val fcsr_flags = Valid(Bits(width = FPConstants.FLAGS_SZ)).flip
val rocc = new RoCCInterface().flip
val interrupt = Bool(OUTPUT)
val interrupt_cause = UInt(OUTPUT, xLen)
val bp = Vec(p(NBreakpoints), new BP).asOutput
}
class CSRFile(implicit p: Parameters) extends CoreModule()(p)
{
val io = new CSRFileIO
val reset_mstatus = Wire(init=new MStatus().fromBits(0))
reset_mstatus.mpp := PRV.M
reset_mstatus.prv := PRV.M
val reg_mstatus = Reg(init=reset_mstatus)
val new_prv = Wire(init = reg_mstatus.prv)
reg_mstatus.prv := legalizePrivilege(new_prv)
val reset_dcsr = Wire(init=new DCSR().fromBits(0))
reset_dcsr.xdebugver := 1
reset_dcsr.prv := PRV.M
val reg_dcsr = Reg(init=reset_dcsr)
val (supported_interrupts, delegable_interrupts) = {
val sup = Wire(init=new MIP().fromBits(0))
sup.ssip := Bool(p(UseVM))
sup.msip := true
sup.stip := Bool(p(UseVM))
sup.mtip := true
sup.meip := true
sup.seip := Bool(p(UseVM))
sup.rocc := usingRoCC
val del = Wire(init=sup)
del.msip := false
del.mtip := false
del.meip := false
(sup.asUInt, del.asUInt)
}
val delegable_exceptions = UInt(Seq(
Causes.misaligned_fetch,
Causes.fault_fetch,
Causes.breakpoint,
Causes.fault_load,
Causes.fault_store,
Causes.user_ecall).map(1 << _).sum)
val exception = io.exception || io.csr_xcpt
val reg_debug = Reg(init=Bool(false))
val reg_dpc = Reg(UInt(width = vaddrBitsExtended))
val reg_dscratch = Reg(UInt(width = xLen))
val reg_singleStepped = Reg(Bool())
when (io.retire(0) || exception) { reg_singleStepped := true }
when (!io.singleStep) { reg_singleStepped := false }
assert(!io.singleStep || io.retire <= UInt(1))
assert(!reg_singleStepped || io.retire === UInt(0))
val reg_tdrselect = Reg(new TDRSelect)
val reg_bp = Reg(Vec(1 << log2Up(p(NBreakpoints)), new BP))
val reg_mie = Reg(init=UInt(0, xLen))
val reg_mideleg = Reg(init=UInt(0, xLen))
val reg_medeleg = Reg(init=UInt(0, xLen))
val reg_mip = Reg(new MIP)
val reg_mepc = Reg(UInt(width = vaddrBitsExtended))
val reg_mcause = Reg(Bits(width = xLen))
val reg_mbadaddr = Reg(UInt(width = vaddrBitsExtended))
val reg_mscratch = Reg(Bits(width = xLen))
val reg_mtvec = Reg(init=UInt(p(MtvecInit), paddrBits min xLen))
val reg_sepc = Reg(UInt(width = vaddrBitsExtended))
val reg_scause = Reg(Bits(width = xLen))
val reg_sbadaddr = Reg(UInt(width = vaddrBitsExtended))
val reg_sscratch = Reg(Bits(width = xLen))
val reg_stvec = Reg(UInt(width = vaddrBits))
val reg_sptbr = Reg(new PTBR)
val reg_wfi = Reg(init=Bool(false))
val reg_fflags = Reg(UInt(width = 5))
val reg_frm = Reg(UInt(width = 3))
val reg_instret = WideCounter(64, io.retire)
val reg_cycle: UInt = if (enableCommitLog) { reg_instret } else { WideCounter(64) }
val mip = Wire(init=reg_mip)
mip.rocc := io.rocc.interrupt
val read_mip = mip.asUInt & supported_interrupts
val pending_interrupts = read_mip & reg_mie
val m_interrupts = Mux(!reg_debug && (reg_mstatus.prv < PRV.M || (reg_mstatus.prv === PRV.M && reg_mstatus.mie)), pending_interrupts & ~reg_mideleg, UInt(0))
val s_interrupts = Mux(!reg_debug && (reg_mstatus.prv < PRV.S || (reg_mstatus.prv === PRV.S && reg_mstatus.sie)), pending_interrupts & reg_mideleg, UInt(0))
val all_interrupts = m_interrupts | s_interrupts
val interruptMSB = BigInt(1) << (xLen-1)
val interruptCause = interruptMSB + PriorityEncoder(all_interrupts)
io.interrupt := all_interrupts.orR && !io.singleStep || reg_singleStepped
io.interrupt_cause := interruptCause
io.bp := reg_bp take p(NBreakpoints)
val debugIntCause = reg_mip.getWidth
// debug interrupts are only masked by being in debug mode
when (Bool(usingDebug) && reg_dcsr.debugint && !reg_debug) {
io.interrupt := true
io.interrupt_cause := interruptMSB + debugIntCause
}
val system_insn = io.rw.cmd === CSR.I
val cpu_ren = io.rw.cmd =/= CSR.N && !system_insn
val isa_string = "IM" +
(if (usingVM) "S" else "") +
(if (usingUser) "U" else "") +
(if (usingAtomics) "A" else "") +
(if (usingFPU) "FD" else "") +
(if (usingRoCC) "X" else "")
val isa = (BigInt(log2Ceil(xLen) - 4) << (xLen-2)) |
isa_string.map(x => 1 << (x - 'A')).reduce(_|_)
val read_mstatus = io.status.asUInt()(xLen-1,0)
val read_mapping = collection.mutable.LinkedHashMap[Int,Bits](
CSRs.tdrselect -> reg_tdrselect.asUInt,
CSRs.tdrdata1 -> reg_bp(reg_tdrselect.tdrindex).control.asUInt,
CSRs.tdrdata2 -> reg_bp(reg_tdrselect.tdrindex).address,
CSRs.mimpid -> UInt(0),
CSRs.marchid -> UInt(0),
CSRs.mvendorid -> UInt(0),
CSRs.mcycle -> reg_cycle,
CSRs.minstret -> reg_instret,
CSRs.mucounteren -> UInt(0),
CSRs.mutime_delta -> UInt(0),
CSRs.mucycle_delta -> UInt(0),
CSRs.muinstret_delta -> UInt(0),
CSRs.misa -> UInt(isa),
CSRs.mstatus -> read_mstatus,
CSRs.mtvec -> reg_mtvec,
CSRs.mip -> read_mip,
CSRs.mie -> reg_mie,
CSRs.mideleg -> reg_mideleg,
CSRs.medeleg -> reg_medeleg,
CSRs.mscratch -> reg_mscratch,
CSRs.mepc -> reg_mepc.sextTo(xLen),
CSRs.mbadaddr -> reg_mbadaddr.sextTo(xLen),
CSRs.mcause -> reg_mcause,
CSRs.mhartid -> io.prci.id)
if (usingDebug) {
read_mapping += CSRs.dcsr -> reg_dcsr.asUInt
read_mapping += CSRs.dpc -> reg_dpc.asUInt
read_mapping += CSRs.dscratch -> reg_dscratch.asUInt
}
if (usingFPU) {
read_mapping += CSRs.fflags -> reg_fflags
read_mapping += CSRs.frm -> reg_frm
read_mapping += CSRs.fcsr -> Cat(reg_frm, reg_fflags)
}
if (usingVM) {
val read_sie = reg_mie & reg_mideleg
val read_sip = read_mip & reg_mideleg
val read_sstatus = Wire(init=io.status)
read_sstatus.vm := 0
read_sstatus.mprv := 0
read_sstatus.mpp := 0
read_sstatus.hpp := 0
read_sstatus.mpie := 0
read_sstatus.hpie := 0
read_sstatus.mie := 0
read_sstatus.hie := 0
read_mapping += CSRs.sstatus -> (read_sstatus.asUInt())(xLen-1,0)
read_mapping += CSRs.sip -> read_sip.asUInt
read_mapping += CSRs.sie -> read_sie.asUInt
read_mapping += CSRs.sscratch -> reg_sscratch
read_mapping += CSRs.scause -> reg_scause
read_mapping += CSRs.sbadaddr -> reg_sbadaddr.sextTo(xLen)
read_mapping += CSRs.sptbr -> reg_sptbr.asUInt
read_mapping += CSRs.sepc -> reg_sepc.sextTo(xLen)
read_mapping += CSRs.stvec -> reg_stvec.sextTo(xLen)
read_mapping += CSRs.mscounteren -> UInt(0)
read_mapping += CSRs.mstime_delta -> UInt(0)
read_mapping += CSRs.mscycle_delta -> UInt(0)
read_mapping += CSRs.msinstret_delta -> UInt(0)
}
if (xLen == 32) {
read_mapping += CSRs.mcycleh -> (reg_cycle >> 32)
read_mapping += CSRs.minstreth -> (reg_instret >> 32)
read_mapping += CSRs.mutime_deltah -> UInt(0)
read_mapping += CSRs.mucycle_deltah -> UInt(0)
read_mapping += CSRs.muinstret_deltah -> UInt(0)
if (usingVM) {
read_mapping += CSRs.mstime_deltah -> UInt(0)
read_mapping += CSRs.mscycle_deltah -> UInt(0)
read_mapping += CSRs.msinstret_deltah -> UInt(0)
}
}
for (i <- 0 until nCustomMrwCsrs) {
val addr = 0xff0 + i
require(addr < (1 << CSR.ADDRSZ))
require(!read_mapping.contains(addr), "custom MRW CSR address " + i + " is already in use")
read_mapping += addr -> io.custom_mrw_csrs(i)
}
val decoded_addr = read_mapping map { case (k, v) => k -> (io.rw.addr === k) }
val addr_valid = decoded_addr.values.reduce(_||_)
val fp_csr =
if (usingFPU) decoded_addr(CSRs.fflags) || decoded_addr(CSRs.frm) || decoded_addr(CSRs.fcsr)
else Bool(false)
val csr_debug = Bool(usingDebug) && io.rw.addr(5)
val csr_addr_priv = Cat(io.rw.addr(6,5).andR, io.rw.addr(9,8))
val priv_sufficient = Cat(reg_debug, reg_mstatus.prv) >= csr_addr_priv
val read_only = io.rw.addr(11,10).andR
val cpu_wen = cpu_ren && io.rw.cmd =/= CSR.R && priv_sufficient
val wen = cpu_wen && !read_only
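// One expression implements csrrw/csrrs/csrrc: for set (CSR.S) and clear
// (CSR.C) the old rdata is the base; rs1 bits are then ORed in unless the
// command is clear, and ANDed out when it is.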
val wdata = (Mux(io.rw.cmd.isOneOf(CSR.S, CSR.C), io.rw.rdata, UInt(0)) |
Mux(io.rw.cmd =/= CSR.C, io.rw.wdata, UInt(0))) &
~Mux(io.rw.cmd === CSR.C, io.rw.wdata, UInt(0))
val do_system_insn = priv_sufficient && system_insn
val opcode = UInt(1) << io.rw.addr(2,0)
val insn_call = do_system_insn && opcode(0)
val insn_break = do_system_insn && opcode(1)
val insn_ret = do_system_insn && opcode(2)
val insn_sfence_vm = do_system_insn && opcode(4)
val insn_wfi = do_system_insn && opcode(5)
io.csr_xcpt := (cpu_wen && read_only) ||
(cpu_ren && (!priv_sufficient || !addr_valid || fp_csr && !io.status.fs.orR)) ||
(system_insn && !priv_sufficient) ||
insn_call || insn_break
when (insn_wfi) { reg_wfi := true }
when (pending_interrupts.orR) { reg_wfi := false }
val cause =
Mux(!io.csr_xcpt, io.cause,
Mux(insn_call, reg_mstatus.prv + Causes.user_ecall,
Mux[UInt](insn_break, Causes.breakpoint, Causes.illegal_instruction)))
val cause_lsbs = cause(log2Up(xLen)-1,0)
val causeIsDebugInt = cause(xLen-1) && cause_lsbs === debugIntCause
val causeIsDebugBreak = cause === Causes.breakpoint && Cat(reg_dcsr.ebreakm, reg_dcsr.ebreakh, reg_dcsr.ebreaks, reg_dcsr.ebreaku)(reg_mstatus.prv)
val trapToDebug = Bool(usingDebug) && (reg_singleStepped || causeIsDebugInt || causeIsDebugBreak || reg_debug)
val delegate = Bool(p(UseVM)) && reg_mstatus.prv < PRV.M && Mux(cause(xLen-1), reg_mideleg(cause_lsbs), reg_medeleg(cause_lsbs))
val debugTVec = Mux(reg_debug, UInt(0x808), UInt(0x800))
val tvec = Mux(trapToDebug, debugTVec, Mux(delegate, reg_stvec.sextTo(vaddrBitsExtended), reg_mtvec))
val epc = Mux(csr_debug, reg_dpc, Mux(Bool(p(UseVM)) && !csr_addr_priv(1), reg_sepc, reg_mepc))
io.fatc := insn_sfence_vm
io.evec := Mux(exception, tvec, epc)
io.ptbr := reg_sptbr
io.eret := insn_ret
io.singleStep := reg_dcsr.step && !reg_debug
io.status := reg_mstatus
io.status.sd := io.status.fs.andR || io.status.xs.andR
io.status.debug := reg_debug
if (xLen == 32)
io.status.sd_rv32 := io.status.sd
when (exception) {
val epc = ~(~io.pc | (coreInstBytes-1))
val pie = read_mstatus(reg_mstatus.prv)
val write_badaddr = cause isOneOf (Causes.breakpoint,
Causes.misaligned_load, Causes.misaligned_store, Causes.misaligned_fetch,
Causes.fault_load, Causes.fault_store, Causes.fault_fetch)
when (trapToDebug) {
reg_debug := true
reg_dpc := epc
reg_dcsr.cause := Mux(reg_singleStepped, UInt(4), Mux(causeIsDebugInt, UInt(3), UInt(1)))
reg_dcsr.prv := trimPrivilege(reg_mstatus.prv)
}.elsewhen (delegate) {
reg_sepc := epc
reg_scause := cause
when (write_badaddr) { reg_sbadaddr := io.badaddr }
reg_mstatus.spie := pie
reg_mstatus.spp := reg_mstatus.prv
reg_mstatus.sie := false
new_prv := PRV.S
}.otherwise {
reg_mepc := epc
reg_mcause := cause
when (write_badaddr) { reg_mbadaddr := io.badaddr }
reg_mstatus.mpie := pie
reg_mstatus.mpp := trimPrivilege(reg_mstatus.prv)
reg_mstatus.mie := false
new_prv := PRV.M
}
}
when (insn_ret) {
when (Bool(p(UseVM)) && !csr_addr_priv(1)) {
when (reg_mstatus.spp.toBool) { reg_mstatus.sie := reg_mstatus.spie }
reg_mstatus.spie := false
reg_mstatus.spp := PRV.U
new_prv := reg_mstatus.spp
}.elsewhen (csr_debug) {
new_prv := reg_dcsr.prv
reg_debug := false
}.otherwise {
when (reg_mstatus.mpp(1)) { reg_mstatus.mie := reg_mstatus.mpie }
.elsewhen (Bool(usingVM) && reg_mstatus.mpp(0)) { reg_mstatus.sie := reg_mstatus.mpie }
reg_mstatus.mpie := false
reg_mstatus.mpp := legalizePrivilege(PRV.U)
new_prv := reg_mstatus.mpp
}
}
assert(PopCount(insn_ret :: io.exception :: io.csr_xcpt :: Nil) <= 1, "these conditions must be mutually exclusive")
io.time := reg_cycle
io.csr_stall := reg_wfi
io.rw.rdata := Mux1H(for ((k, v) <- read_mapping) yield decoded_addr(k) -> v)
io.fcsr_rm := reg_frm
when (io.fcsr_flags.valid) {
reg_fflags := reg_fflags | io.fcsr_flags.bits
}
when (wen) {
when (decoded_addr(CSRs.mstatus)) {
val new_mstatus = new MStatus().fromBits(wdata)
reg_mstatus.mie := new_mstatus.mie
reg_mstatus.mpie := new_mstatus.mpie
if (usingUser) {
reg_mstatus.mprv := new_mstatus.mprv
reg_mstatus.mpp := trimPrivilege(new_mstatus.mpp)
if (usingVM) {
reg_mstatus.mxr := new_mstatus.mxr
reg_mstatus.pum := new_mstatus.pum
reg_mstatus.spp := new_mstatus.spp
reg_mstatus.spie := new_mstatus.spie
reg_mstatus.sie := new_mstatus.sie
}
}
if (usingVM) {
require(if (xLen == 32) pgLevels == 2 else pgLevels > 2 && pgLevels < 6)
val vm_on = 6 + pgLevels // TODO Sv48 support should imply Sv39 support
when (new_mstatus.vm === 0) { reg_mstatus.vm := 0 }
when (new_mstatus.vm === vm_on) { reg_mstatus.vm := vm_on }
}
if (usingVM || usingFPU) reg_mstatus.fs := Fill(2, new_mstatus.fs.orR)
if (usingRoCC) reg_mstatus.xs := Fill(2, new_mstatus.xs.orR)
}
when (decoded_addr(CSRs.mip)) {
val new_mip = new MIP().fromBits(wdata)
if (usingVM) {
reg_mip.ssip := new_mip.ssip
reg_mip.stip := new_mip.stip
}
}
when (decoded_addr(CSRs.mie)) { reg_mie := wdata & supported_interrupts }
when (decoded_addr(CSRs.mepc)) { reg_mepc := ~(~wdata | (coreInstBytes-1)) }
when (decoded_addr(CSRs.mscratch)) { reg_mscratch := wdata }
if (p(MtvecWritable))
when (decoded_addr(CSRs.mtvec)) { reg_mtvec := wdata >> 2 << 2 }
when (decoded_addr(CSRs.mcause)) { reg_mcause := wdata & UInt((BigInt(1) << (xLen-1)) + 31) /* only implement 5 LSBs and MSB */ }
when (decoded_addr(CSRs.mbadaddr)) { reg_mbadaddr := wdata(vaddrBitsExtended-1,0) }
if (usingFPU) {
when (decoded_addr(CSRs.fflags)) { reg_fflags := wdata }
when (decoded_addr(CSRs.frm)) { reg_frm := wdata }
when (decoded_addr(CSRs.fcsr)) { reg_fflags := wdata; reg_frm := wdata >> reg_fflags.getWidth }
}
if (usingDebug) {
when (decoded_addr(CSRs.dcsr)) {
val new_dcsr = new DCSR().fromBits(wdata)
reg_dcsr.halt := new_dcsr.halt
reg_dcsr.step := new_dcsr.step
reg_dcsr.ebreakm := new_dcsr.ebreakm
if (usingVM) reg_dcsr.ebreaks := new_dcsr.ebreaks
if (usingUser) reg_dcsr.ebreaku := new_dcsr.ebreaku
if (usingUser) reg_dcsr.prv := trimPrivilege(new_dcsr.prv)
}
when (decoded_addr(CSRs.dpc)) { reg_dpc := ~(~wdata | (coreInstBytes-1)) }
when (decoded_addr(CSRs.dscratch)) { reg_dscratch := wdata }
}
if (usingVM) {
when (decoded_addr(CSRs.sstatus)) {
val new_sstatus = new MStatus().fromBits(wdata)
reg_mstatus.sie := new_sstatus.sie
reg_mstatus.spie := new_sstatus.spie
reg_mstatus.spp := new_sstatus.spp
reg_mstatus.pum := new_sstatus.pum
reg_mstatus.fs := Fill(2, new_sstatus.fs.orR) // even without an FPU
if (usingRoCC) reg_mstatus.xs := Fill(2, new_sstatus.xs.orR)
}
when (decoded_addr(CSRs.sip)) {
val new_sip = new MIP().fromBits(wdata)
reg_mip.ssip := new_sip.ssip
}
when (decoded_addr(CSRs.sie)) { reg_mie := (reg_mie & ~reg_mideleg) | (wdata & reg_mideleg) }
when (decoded_addr(CSRs.sscratch)) { reg_sscratch := wdata }
when (decoded_addr(CSRs.sptbr)) { reg_sptbr.ppn := wdata(ppnBits-1,0) }
when (decoded_addr(CSRs.sepc)) { reg_sepc := ~(~wdata | (coreInstBytes-1)) }
when (decoded_addr(CSRs.stvec)) { reg_stvec := wdata >> 2 << 2 }
when (decoded_addr(CSRs.scause)) { reg_scause := wdata & UInt((BigInt(1) << (xLen-1)) + 31) /* only implement 5 LSBs and MSB */ }
when (decoded_addr(CSRs.sbadaddr)) { reg_sbadaddr := wdata(vaddrBitsExtended-1,0) }
when (decoded_addr(CSRs.mideleg)) { reg_mideleg := wdata & delegable_interrupts }
when (decoded_addr(CSRs.medeleg)) { reg_medeleg := wdata & delegable_exceptions }
}
if (p(NBreakpoints) > 0) {
val newTDR = new TDRSelect().fromBits(wdata)
when (decoded_addr(CSRs.tdrselect)) { reg_tdrselect.tdrindex := newTDR.tdrindex }
when (reg_tdrselect.tdrmode || reg_debug) {
when (decoded_addr(CSRs.tdrdata1)) {
val newBPC = new BPControl().fromBits(wdata)
reg_bp(reg_tdrselect.tdrindex).control := newBPC
reg_bp(reg_tdrselect.tdrindex).control.bpmatch := newBPC.bpmatch & 2 /* exact/NAPOT only */
}
when (decoded_addr(CSRs.tdrdata2)) { reg_bp(reg_tdrselect.tdrindex).address := wdata }
}
}
}
reg_mip := io.prci.interrupts
reg_dcsr.debugint := io.prci.interrupts.debug
reg_dcsr.hwbpcount := UInt(p(NBreakpoints))
reg_sptbr.asid := 0
reg_tdrselect.reserved := 0
reg_tdrselect.tdrmode := true // TODO support D-mode breakpoint theft
if (reg_bp.isEmpty) reg_tdrselect.tdrindex := 0
for (bpc <- reg_bp map {_.control}) {
bpc.tdrtype := bpc.tdrType
bpc.bpamaskmax := bpc.bpaMaskMax
bpc.reserved := 0
bpc.bpaction := 0
bpc.h := false
if (!usingVM) bpc.s := false
if (!usingUser) bpc.u := false
if (!usingVM && !usingUser) bpc.m := true
when (reset) {
bpc.r := false
bpc.w := false
bpc.x := false
}
}
for (bp <- reg_bp drop p(NBreakpoints))
bp := new BP().fromBits(0)
def legalizePrivilege(priv: UInt): UInt =
if (usingVM) Mux(priv === PRV.H, PRV.U, priv)
else if (usingUser) Fill(2, priv(0))
else PRV.M
def trimPrivilege(priv: UInt): UInt =
if (usingVM) priv
else legalizePrivilege(priv)
}


@@ -0,0 +1,444 @@
// See LICENSE for license details.
package rocket
import Chisel._
import junctions._
import uncore.tilelink._
import uncore.agents._
import uncore.coherence._
import uncore.util._
import uncore.constants._
import cde.{Parameters, Field}
import Util._
class DCacheDataReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) {
val addr = Bits(width = untagBits)
val write = Bool()
val wdata = Bits(width = rowBits)
val wmask = Bits(width = rowBytes)
val way_en = Bits(width = nWays)
}
class DCacheDataArray(implicit p: Parameters) extends L1HellaCacheModule()(p) {
val io = new Bundle {
val req = Valid(new DCacheDataReq).flip
val resp = Vec(nWays, Bits(OUTPUT, rowBits))
}
val addr = io.req.bits.addr >> rowOffBits
for (w <- 0 until nWays) {
val array = SeqMem(nSets*refillCycles, Vec(rowBytes, Bits(width=8)))
val valid = io.req.valid && (Bool(nWays == 1) || io.req.bits.way_en(w))
when (valid && io.req.bits.write) {
val data = Vec.tabulate(rowBytes)(i => io.req.bits.wdata(8*(i+1)-1, 8*i))
array.write(addr, data, io.req.bits.wmask.toBools)
}
io.resp(w) := array.read(addr, valid && !io.req.bits.write).asUInt
}
}
class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) {
val io = new Bundle {
val cpu = (new HellaCacheIO).flip
val ptw = new TLBPTWIO()
val mem = new ClientTileLinkIO
}
val fq = Module(new FinishQueue(1))
require(rowBits == encRowBits) // no ECC
require(refillCyclesPerBeat == 1)
require(rowBits >= coreDataBits)
// tags
val replacer = p(Replacer)()
def onReset = L1Metadata(UInt(0), ClientMetadata.onReset)
val meta = Module(new MetadataArray(onReset _))
val metaReadArb = Module(new Arbiter(new MetaReadReq, 3))
val metaWriteArb = Module(new Arbiter(new L1MetaWriteReq, 3))
meta.io.read <> metaReadArb.io.out
meta.io.write <> metaWriteArb.io.out
// data
val data = Module(new DCacheDataArray)
val dataArb = Module(new Arbiter(new DCacheDataReq, 4))
data.io.req <> dataArb.io.out
dataArb.io.out.ready := true
val s1_valid = Reg(next=io.cpu.req.fire(), init=Bool(false))
val s1_probe = Reg(next=io.mem.probe.fire(), init=Bool(false))
val probe_bits = RegEnable(io.mem.probe.bits, io.mem.probe.fire())
val s1_nack = Wire(init=Bool(false))
val s1_valid_masked = s1_valid && !io.cpu.s1_kill && !io.cpu.xcpt.asUInt.orR
val s1_valid_not_nacked = s1_valid_masked && !s1_nack
val s1_req = Reg(io.cpu.req.bits)
when (metaReadArb.io.out.valid) {
s1_req := io.cpu.req.bits
s1_req.addr := Cat(io.cpu.req.bits.addr >> untagBits, metaReadArb.io.out.bits.idx, io.cpu.req.bits.addr(blockOffBits-1,0))
}
val s1_read = isRead(s1_req.cmd)
val s1_write = isWrite(s1_req.cmd)
val s1_readwrite = s1_read || s1_write
val s1_flush_valid = Reg(Bool())
val s_ready :: s_grant_wait :: s_voluntary_writeback :: s_probe_rep_dirty :: s_probe_rep_clean :: s_probe_rep_miss :: s_voluntary_write_meta :: s_probe_write_meta :: Nil = Enum(UInt(), 8)
val grant_wait = Reg(init=Bool(false))
val release_ack_wait = Reg(init=Bool(false))
val release_state = Reg(init=s_ready)
val pstore1_valid = Wire(Bool())
val pstore2_valid = Reg(Bool())
val inWriteback = release_state.isOneOf(s_voluntary_writeback, s_probe_rep_dirty)
val releaseWay = Wire(UInt())
io.cpu.req.ready := (release_state === s_ready) && !grant_wait && !s1_nack
// hit initiation path
dataArb.io.in(3).valid := io.cpu.req.valid && isRead(io.cpu.req.bits.cmd)
dataArb.io.in(3).bits.write := false
dataArb.io.in(3).bits.addr := io.cpu.req.bits.addr
dataArb.io.in(3).bits.way_en := ~UInt(0, nWays)
when (!dataArb.io.in(3).ready && isRead(io.cpu.req.bits.cmd)) { io.cpu.req.ready := false }
metaReadArb.io.in(2).valid := io.cpu.req.valid
metaReadArb.io.in(2).bits.idx := io.cpu.req.bits.addr(idxMSB, idxLSB)
metaReadArb.io.in(2).bits.way_en := ~UInt(0, nWays)
when (!metaReadArb.io.in(2).ready) { io.cpu.req.ready := false }
// address translation
val tlb = Module(new TLB)
io.ptw <> tlb.io.ptw
tlb.io.req.valid := s1_valid_masked && s1_readwrite
tlb.io.req.bits.passthrough := s1_req.phys
tlb.io.req.bits.vpn := s1_req.addr >> pgIdxBits
tlb.io.req.bits.instruction := false
tlb.io.req.bits.store := s1_write
when (!tlb.io.req.ready && !io.cpu.req.bits.phys) { io.cpu.req.ready := false }
when (s1_valid && s1_readwrite && tlb.io.resp.miss) { s1_nack := true }
val s1_paddr = Cat(tlb.io.resp.ppn, s1_req.addr(pgIdxBits-1,0))
val s1_tag = Mux(s1_probe, probe_bits.addr_block >> idxBits, s1_paddr(paddrBits-1, untagBits))
val s1_hit_way = meta.io.resp.map(r => r.coh.isValid() && r.tag === s1_tag).asUInt
val s1_hit_state = ClientMetadata.onReset.fromBits(
meta.io.resp.map(r => Mux(r.tag === s1_tag, r.coh.asUInt, UInt(0)))
.reduce (_|_))
val s1_data_way = Mux(inWriteback, releaseWay, s1_hit_way)
val s1_data = Mux1H(s1_data_way, data.io.resp) // retime into s2 if critical
val s1_victim_way = Wire(init = replacer.way)
val s2_valid = Reg(next=s1_valid_masked, init=Bool(false))
val s2_probe = Reg(next=s1_probe, init=Bool(false))
val releaseInFlight = s1_probe || s2_probe || release_state =/= s_ready
val s2_valid_masked = s2_valid && Reg(next = !s1_nack)
val s2_req = Reg(io.cpu.req.bits)
val s2_uncached = Reg(Bool())
when (s1_valid_not_nacked || s1_flush_valid) {
s2_req := s1_req
s2_req.addr := s1_paddr
s2_uncached := !tlb.io.resp.cacheable
}
val s2_read = isRead(s2_req.cmd)
val s2_write = isWrite(s2_req.cmd)
val s2_readwrite = s2_read || s2_write
val s2_flush_valid = RegNext(s1_flush_valid)
val s2_data = RegEnable(s1_data, s1_valid || inWriteback)
val s2_probe_way = RegEnable(s1_hit_way, s1_probe)
val s2_probe_state = RegEnable(s1_hit_state, s1_probe)
val s2_hit_way = RegEnable(s1_hit_way, s1_valid_not_nacked)
val s2_hit_state = RegEnable(s1_hit_state, s1_valid_not_nacked)
val s2_hit = s2_hit_state.isHit(s2_req.cmd)
val s2_valid_hit = s2_valid_masked && s2_readwrite && s2_hit
val s2_valid_miss = s2_valid_masked && s2_readwrite && !s2_hit && !(pstore1_valid || pstore2_valid) && !release_ack_wait
val s2_valid_cached_miss = s2_valid_miss && !s2_uncached
val s2_victimize = s2_valid_cached_miss || s2_flush_valid
val s2_valid_uncached = s2_valid_miss && s2_uncached
val s2_victim_way = Mux(s2_hit_state.isValid() && !s2_flush_valid, s2_hit_way, UIntToOH(RegEnable(s1_victim_way, s1_valid_not_nacked || s1_flush_valid)))
val s2_victim_tag = RegEnable(meta.io.resp(s1_victim_way).tag, s1_valid_not_nacked || s1_flush_valid)
val s2_victim_state = Mux(s2_hit_state.isValid() && !s2_flush_valid, s2_hit_state, RegEnable(meta.io.resp(s1_victim_way).coh, s1_valid_not_nacked || s1_flush_valid))
val s2_victim_valid = s2_victim_state.isValid()
val s2_victim_dirty = s2_victim_state.requiresVoluntaryWriteback()
val s2_new_hit_state = s2_hit_state.onHit(s2_req.cmd)
val s2_update_meta = s2_hit_state =/= s2_new_hit_state
io.cpu.s2_nack := s2_valid && !s2_valid_hit && !(s2_valid_uncached && io.mem.acquire.ready)
when (s2_valid && (!s2_valid_hit || s2_update_meta)) { s1_nack := true }
// exceptions
val misaligned = new StoreGen(s1_req.typ, s1_req.addr, UInt(0), wordBytes).misaligned
io.cpu.xcpt.ma.ld := s1_read && misaligned
io.cpu.xcpt.ma.st := s1_write && misaligned
io.cpu.xcpt.pf.ld := s1_read && tlb.io.resp.xcpt_ld
io.cpu.xcpt.pf.st := s1_write && tlb.io.resp.xcpt_st
// load reservations
val s2_lr = Bool(usingAtomics) && s2_req.cmd === M_XLR
val s2_sc = Bool(usingAtomics) && s2_req.cmd === M_XSC
val lrscCount = Reg(init=UInt(0))
val lrscValid = lrscCount > 0
val lrscAddr = Reg(UInt())
val s2_sc_fail = s2_sc && !(lrscValid && lrscAddr === (s2_req.addr >> blockOffBits))
when (s2_valid_hit && s2_lr) {
lrscCount := lrscCycles - 1
lrscAddr := s2_req.addr >> blockOffBits
}
when (lrscValid) { lrscCount := lrscCount - 1 }
when ((s2_valid_hit && s2_sc) || io.cpu.invalidate_lr) { lrscCount := 0 }
// pending store buffer
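// Two entries: pstore1 captures a store as it hits in s2, and pstore2 holds
// one more while the data array write port is contended. The drain logic
// below retires them when structurally required (both entries full and
// another store arriving, or pstore1 holding an AMO), opportunistically when
// no incoming load needs the write port, or on a miss/probe/nack.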
val pstore1_cmd = RegEnable(s1_req.cmd, s1_valid_not_nacked && s1_write)
val pstore1_typ = RegEnable(s1_req.typ, s1_valid_not_nacked && s1_write)
val pstore1_addr = RegEnable(s1_paddr, s1_valid_not_nacked && s1_write)
val pstore1_data = RegEnable(io.cpu.s1_data, s1_valid_not_nacked && s1_write)
val pstore1_way = RegEnable(s1_hit_way, s1_valid_not_nacked && s1_write)
val pstore1_storegen = new StoreGen(pstore1_typ, pstore1_addr, pstore1_data, wordBytes)
val pstore1_storegen_data = Wire(init = pstore1_storegen.data)
val pstore1_amo = Bool(usingAtomics) && isRead(pstore1_cmd)
val pstore_drain_structural = pstore1_valid && pstore2_valid && ((s1_valid && s1_write) || pstore1_amo)
val pstore_drain_opportunistic = !(io.cpu.req.valid && isRead(io.cpu.req.bits.cmd))
val pstore_drain_on_miss = releaseInFlight || io.cpu.s2_nack
val pstore_drain =
Bool(usingAtomics) && pstore_drain_structural ||
(((pstore1_valid && !pstore1_amo) || pstore2_valid) && (pstore_drain_opportunistic || pstore_drain_on_miss))
pstore1_valid := {
val s2_store_valid = s2_valid_hit && s2_write && !s2_sc_fail
val pstore1_held = Reg(Bool())
assert(!s2_store_valid || !pstore1_held)
pstore1_held := (s2_store_valid || pstore1_held) && pstore2_valid && !pstore_drain
s2_store_valid || pstore1_held
}
val advance_pstore1 = pstore1_valid && (pstore2_valid === pstore_drain)
pstore2_valid := pstore2_valid && !pstore_drain || advance_pstore1
val pstore2_addr = RegEnable(pstore1_addr, advance_pstore1)
val pstore2_way = RegEnable(pstore1_way, advance_pstore1)
val pstore2_storegen_data = RegEnable(pstore1_storegen_data, advance_pstore1)
val pstore2_storegen_mask = RegEnable(pstore1_storegen.mask, advance_pstore1)
dataArb.io.in(0).valid := pstore_drain
dataArb.io.in(0).bits.write := true
dataArb.io.in(0).bits.addr := Mux(pstore2_valid, pstore2_addr, pstore1_addr)
dataArb.io.in(0).bits.way_en := Mux(pstore2_valid, pstore2_way, pstore1_way)
dataArb.io.in(0).bits.wdata := Fill(rowWords, Mux(pstore2_valid, pstore2_storegen_data, pstore1_storegen_data))
val pstore_mask_shift = Mux(pstore2_valid, pstore2_addr, pstore1_addr).extract(rowOffBits-1,offsetlsb) << wordOffBits
dataArb.io.in(0).bits.wmask := Mux(pstore2_valid, pstore2_storegen_mask, pstore1_storegen.mask) << pstore_mask_shift
// store->load RAW hazard detection
val s1_idx = s1_req.addr(idxMSB, wordOffBits)
val s1_raw_hazard = s1_read &&
((pstore1_valid && pstore1_addr(idxMSB, wordOffBits) === s1_idx) ||
(pstore2_valid && pstore2_addr(idxMSB, wordOffBits) === s1_idx))
when (s1_valid && s1_raw_hazard) { s1_nack := true }
metaWriteArb.io.in(0).valid := (s2_valid_hit && s2_update_meta) || (s2_victimize && !s2_victim_dirty)
metaWriteArb.io.in(0).bits.way_en := s2_victim_way
metaWriteArb.io.in(0).bits.idx := s2_req.addr(idxMSB, idxLSB)
metaWriteArb.io.in(0).bits.data.coh := Mux(s2_hit, s2_new_hit_state, ClientMetadata.onReset)
metaWriteArb.io.in(0).bits.data.tag := s2_req.addr(paddrBits-1, untagBits)
// acquire
val cachedGetMessage = s2_hit_state.makeAcquire(
client_xact_id = UInt(0),
addr_block = s2_req.addr(paddrBits-1, blockOffBits),
op_code = s2_req.cmd)
val uncachedGetMessage = Get(
client_xact_id = UInt(0),
addr_block = s2_req.addr(paddrBits-1, blockOffBits),
addr_beat = s2_req.addr(blockOffBits-1, beatOffBits),
addr_byte = s2_req.addr(beatOffBits-1, 0),
operand_size = s2_req.typ,
alloc = Bool(false))
val uncachedPutOffset = s2_req.addr.extract(beatOffBits-1, wordOffBits)
val uncachedPutMessage = Put(
client_xact_id = UInt(0),
addr_block = s2_req.addr(paddrBits-1, blockOffBits),
addr_beat = s2_req.addr(blockOffBits-1, beatOffBits),
data = Fill(beatWords, pstore1_storegen.data),
wmask = Some(pstore1_storegen.mask << (uncachedPutOffset << wordOffBits)),
alloc = Bool(false))
val uncachedPutAtomicMessage = PutAtomic(
client_xact_id = UInt(0),
addr_block = s2_req.addr(paddrBits-1, blockOffBits),
addr_beat = s2_req.addr(blockOffBits-1, beatOffBits),
addr_byte = s2_req.addr(beatOffBits-1, 0),
atomic_opcode = s2_req.cmd,
operand_size = s2_req.typ,
data = Fill(beatWords, pstore1_storegen.data))
io.mem.acquire.valid := ((s2_valid_cached_miss && !s2_victim_dirty) || s2_valid_uncached) && fq.io.enq.ready
io.mem.acquire.bits := cachedGetMessage
when (s2_uncached) {
assert(!s2_valid_masked || !s2_hit_state.isValid(), "cache hit on uncached access")
io.mem.acquire.bits := uncachedGetMessage
when (s2_write) {
io.mem.acquire.bits := uncachedPutMessage
when (pstore1_amo) {
io.mem.acquire.bits := uncachedPutAtomicMessage
}
}
}
when (io.mem.acquire.fire()) { grant_wait := true }
// grant
val grantIsRefill = io.mem.grant.bits.hasMultibeatData()
val grantIsVoluntary = io.mem.grant.bits.isVoluntary()
val grantIsUncached = !grantIsRefill && !grantIsVoluntary
when (io.mem.grant.valid) {
assert(grant_wait || grantIsVoluntary && release_ack_wait, "unexpected grant")
when (grantIsUncached) { s2_data := io.mem.grant.bits.data }
when (grantIsVoluntary) { release_ack_wait := false }
}
val (refillCount, refillDone) = Counter(io.mem.grant.fire() && grantIsRefill, refillCycles)
val grantDone = refillDone || grantIsUncached
when (io.mem.grant.fire() && grantDone) { grant_wait := false }
// data refill
dataArb.io.in(1).valid := grantIsRefill && io.mem.grant.valid
io.mem.grant.ready := true
assert(dataArb.io.in(1).ready || !dataArb.io.in(1).valid)
dataArb.io.in(1).bits.write := true
dataArb.io.in(1).bits.addr := Cat(s2_req.addr(paddrBits-1, blockOffBits), io.mem.grant.bits.addr_beat) << beatOffBits
dataArb.io.in(1).bits.way_en := s2_victim_way
dataArb.io.in(1).bits.wdata := io.mem.grant.bits.data
dataArb.io.in(1).bits.wmask := ~UInt(0, rowBytes)
// tag updates on refill
metaWriteArb.io.in(1).valid := refillDone
assert(!metaWriteArb.io.in(1).valid || metaWriteArb.io.in(1).ready)
metaWriteArb.io.in(1).bits.way_en := s2_victim_way
metaWriteArb.io.in(1).bits.idx := s2_req.addr(idxMSB, idxLSB)
metaWriteArb.io.in(1).bits.data.coh := s2_hit_state.onGrant(io.mem.grant.bits, s2_req.cmd)
metaWriteArb.io.in(1).bits.data.tag := s2_req.addr(paddrBits-1, untagBits)
// finish
fq.io.enq.valid := io.mem.grant.fire() && io.mem.grant.bits.requiresAck() && (!grantIsRefill || refillDone)
fq.io.enq.bits := io.mem.grant.bits.makeFinish()
io.mem.finish <> fq.io.deq
when (fq.io.enq.valid) { assert(fq.io.enq.ready) }
when (refillDone) { replacer.miss }
// probe
val block_probe = releaseInFlight || lrscValid || (s2_valid_hit && s2_lr)
metaReadArb.io.in(1).valid := io.mem.probe.valid && !block_probe
io.mem.probe.ready := metaReadArb.io.in(1).ready && !block_probe && !s1_valid && (!s2_valid || s2_valid_hit)
metaReadArb.io.in(1).bits.idx := io.mem.probe.bits.addr_block
metaReadArb.io.in(1).bits.way_en := ~UInt(0, nWays)
// release
val (writebackCount, writebackDone) = Counter(io.mem.release.fire() && inWriteback, refillCycles)
val releaseDone = writebackDone || (io.mem.release.fire() && !inWriteback)
val releaseRejected = io.mem.release.valid && !io.mem.release.ready
val s1_release_data_valid = Reg(next = dataArb.io.in(2).fire())
val s2_release_data_valid = Reg(next = s1_release_data_valid && !releaseRejected)
val releaseDataBeat = Cat(UInt(0), writebackCount) + Mux(releaseRejected, UInt(0), s1_release_data_valid + Cat(UInt(0), s2_release_data_valid))
io.mem.release.valid := s2_release_data_valid
io.mem.release.bits := ClientMetadata.onReset.makeRelease(probe_bits)
val voluntaryReleaseMessage = s2_victim_state.makeVoluntaryWriteback(UInt(0), UInt(0))
val voluntaryNewCoh = s2_victim_state.onCacheControl(M_FLUSH)
val probeResponseMessage = s2_probe_state.makeRelease(probe_bits)
val probeNewCoh = s2_probe_state.onProbe(probe_bits)
val newCoh = Wire(init = probeNewCoh)
releaseWay := s2_probe_way
when (s2_victimize && s2_victim_dirty) {
assert(!s2_hit_state.isValid())
release_state := s_voluntary_writeback
probe_bits.addr_block := Cat(s2_victim_tag, s2_req.addr(idxMSB, idxLSB))
}
when (s2_probe) {
when (s2_probe_state.requiresVoluntaryWriteback()) { release_state := s_probe_rep_dirty }
.elsewhen (s2_probe_state.isValid()) { release_state := s_probe_rep_clean }
.otherwise {
io.mem.release.valid := true
release_state := s_probe_rep_miss
}
}
when (releaseDone) { release_state := s_ready }
when (release_state.isOneOf(s_probe_rep_miss, s_probe_rep_clean)) {
io.mem.release.valid := true
}
when (release_state.isOneOf(s_probe_rep_clean, s_probe_rep_dirty)) {
io.mem.release.bits := probeResponseMessage
when (releaseDone) { release_state := s_probe_write_meta }
}
when (release_state.isOneOf(s_voluntary_writeback, s_voluntary_write_meta)) {
io.mem.release.bits := voluntaryReleaseMessage
newCoh := voluntaryNewCoh
releaseWay := s2_victim_way
when (releaseDone) {
release_state := s_voluntary_write_meta
release_ack_wait := true
}
}
when (s2_probe && !io.mem.release.fire()) { s1_nack := true }
io.mem.release.bits.addr_block := probe_bits.addr_block
io.mem.release.bits.addr_beat := writebackCount
io.mem.release.bits.data := s2_data
dataArb.io.in(2).valid := inWriteback && releaseDataBeat < refillCycles
dataArb.io.in(2).bits.write := false
dataArb.io.in(2).bits.addr := Cat(io.mem.release.bits.addr_block, releaseDataBeat(log2Up(refillCycles)-1,0)) << rowOffBits
dataArb.io.in(2).bits.way_en := ~UInt(0, nWays)
metaWriteArb.io.in(2).valid := release_state.isOneOf(s_voluntary_write_meta, s_probe_write_meta)
metaWriteArb.io.in(2).bits.way_en := releaseWay
metaWriteArb.io.in(2).bits.idx := io.mem.release.bits.full_addr()(idxMSB, idxLSB)
metaWriteArb.io.in(2).bits.data.coh := newCoh
metaWriteArb.io.in(2).bits.data.tag := io.mem.release.bits.full_addr()(paddrBits-1, untagBits)
when (metaWriteArb.io.in(2).fire()) { release_state := s_ready }
// cached response
io.cpu.resp.valid := s2_valid_hit
io.cpu.resp.bits := s2_req
io.cpu.resp.bits.has_data := s2_read
io.cpu.resp.bits.replay := false
io.cpu.ordered := !(s1_valid || s2_valid || grant_wait)
// uncached response
io.cpu.replay_next := io.mem.grant.valid && grantIsUncached
val doUncachedResp = Reg(next = io.cpu.replay_next)
when (doUncachedResp) {
assert(!s2_valid_hit)
io.cpu.resp.valid := true
io.cpu.resp.bits.replay := true
}
// load data subword mux/sign extension
val s2_word_idx = s2_req.addr.extract(log2Up(rowBits/8)-1, log2Up(wordBytes))
val s2_data_word = s2_data >> Cat(s2_word_idx, UInt(0, log2Up(coreDataBits)))
val loadgen = new LoadGen(s2_req.typ, mtSigned(s2_req.typ), s2_req.addr, s2_data_word, s2_sc, wordBytes)
io.cpu.resp.bits.data := loadgen.data | s2_sc_fail
io.cpu.resp.bits.data_word_bypass := loadgen.wordData
io.cpu.resp.bits.store_data := pstore1_data
// AMOs
if (usingAtomics) {
val amoalu = Module(new AMOALU(xLen))
amoalu.io.addr := pstore1_addr
amoalu.io.cmd := pstore1_cmd
amoalu.io.typ := pstore1_typ
amoalu.io.lhs := s2_data_word
amoalu.io.rhs := pstore1_data
pstore1_storegen_data := amoalu.io.out
} else {
assert(!(s1_valid_masked && s1_read && s1_write), "unsupported D$ operation")
}
// flushes
val flushed = Reg(init=Bool(true))
val flushing = Reg(init=Bool(false))
val flushCounter = Counter(nSets * nWays)
when (io.mem.acquire.fire()) { flushed := false }
when (s2_valid_masked && s2_req.cmd === M_FLUSH_ALL) {
io.cpu.s2_nack := !flushed
when (!flushed) {
flushing := !release_ack_wait
}
}
s1_flush_valid := metaReadArb.io.in(0).fire() && !s1_flush_valid && !s2_flush_valid && release_state === s_ready && !release_ack_wait
metaReadArb.io.in(0).valid := flushing
metaReadArb.io.in(0).bits.idx := flushCounter.value
metaReadArb.io.in(0).bits.way_en := ~UInt(0, nWays)
when (flushing) {
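// A worked decomposition (assuming nSets and nWays are powers of two): the
// flush counter enumerates every (way, set) pair, with the low log2(nSets)
// bits indexing the set and the high bits selecting the way, so with
// nSets = 64 and nWays = 4 a count of 0x47 touches set 7 of way 1.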
s1_victim_way := flushCounter.value >> log2Up(nSets)
when (s2_flush_valid) {
when (flushCounter.inc()) {
flushed := true
}
}
when (flushed && release_state === s_ready && !release_ack_wait) {
flushing := false
}
}
}

@@ -0,0 +1,203 @@
// See LICENSE for license details.
package rocket
import Chisel._
object DecodeLogic
{
def term(lit: BitPat) =
new Term(lit.value, BigInt(2).pow(lit.getWidth)-(lit.mask+1))
def logic(addr: UInt, addrWidth: Int, cache: scala.collection.mutable.Map[Term,Bool], terms: Seq[Term]) = {
terms.map { t =>
cache.getOrElseUpdate(t, (if (t.mask == 0) addr else addr & Bits(BigInt(2).pow(addrWidth)-(t.mask+1), addrWidth)) === Bits(t.value, addrWidth))
}.foldLeft(Bool(false))(_||_)
}
def apply(addr: UInt, default: BitPat, mapping: Iterable[(BitPat, BitPat)]): UInt = {
val cache = caches.getOrElseUpdate(addr, collection.mutable.Map[Term,Bool]())
val dterm = term(default)
val (keys, values) = mapping.unzip
val addrWidth = keys.map(_.getWidth).max
val terms = keys.toList.map(k => term(k))
val termvalues = terms zip values.toList.map(term(_))
for (t <- keys.zip(terms).tails; if !t.isEmpty)
for (u <- t.tail)
assert(!t.head._2.intersects(u._2), "DecodeLogic: keys " + t.head + " and " + u + " overlap")
Cat((0 until default.getWidth.max(values.map(_.getWidth).max)).map({ case (i: Int) =>
val mint = termvalues.filter { case (k,t) => ((t.mask >> i) & 1) == 0 && ((t.value >> i) & 1) == 1 }.map(_._1)
val maxt = termvalues.filter { case (k,t) => ((t.mask >> i) & 1) == 0 && ((t.value >> i) & 1) == 0 }.map(_._1)
val dc = termvalues.filter { case (k,t) => ((t.mask >> i) & 1) == 1 }.map(_._1)
if (((dterm.mask >> i) & 1) != 0) {
logic(addr, addrWidth, cache, SimplifyDC(mint, maxt, addrWidth))
} else {
val defbit = (dterm.value.toInt >> i) & 1
val t = if (defbit == 0) mint else maxt
val bit = logic(addr, addrWidth, cache, Simplify(t, dc, addrWidth))
if (defbit == 0) bit else ~bit
}
}).reverse)
}
def apply(addr: UInt, default: Seq[BitPat], mappingIn: Iterable[(BitPat, Seq[BitPat])]): Seq[UInt] = {
val mapping = collection.mutable.ArrayBuffer.fill(default.size)(collection.mutable.ArrayBuffer[(BitPat, BitPat)]())
for ((key, values) <- mappingIn)
for ((value, i) <- values zipWithIndex)
mapping(i) += key -> value
for ((thisDefault, thisMapping) <- default zip mapping)
yield apply(addr, thisDefault, thisMapping)
}
def apply(addr: UInt, default: Seq[BitPat], mappingIn: List[(UInt, Seq[BitPat])]): Seq[UInt] =
apply(addr, default, mappingIn.map(m => (BitPat(m._1), m._2)).asInstanceOf[Iterable[(BitPat, Seq[BitPat])]])
def apply(addr: UInt, trues: Iterable[UInt], falses: Iterable[UInt]): Bool =
apply(addr, BitPat.dontCare(1), trues.map(BitPat(_) -> BitPat("b1")) ++ falses.map(BitPat(_) -> BitPat("b0"))).toBool
private val caches = collection.mutable.Map[UInt,collection.mutable.Map[Term,Bool]]()
}
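// A minimal usage sketch (illustrative only; `DecodeExample` is not part of
// the original source): map a 2-bit opcode onto a 3-bit control word and let
// DecodeLogic minimize the resulting gates. Keys must not overlap, which the
// assertion in apply checks.
class DecodeExample extends Module {
  val io = new Bundle {
    val opcode = UInt(INPUT, 2)
    val ctrl = UInt(OUTPUT, 3)
  }
  io.ctrl := DecodeLogic(io.opcode, BitPat("b???"), Seq(
    BitPat("b00") -> BitPat("b001"),
    BitPat("b01") -> BitPat("b010"),
    BitPat("b1?") -> BitPat("b100")))
}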
class Term(val value: BigInt, val mask: BigInt = 0)
{
var prime = true
def covers(x: Term) = ((value ^ x.value) &~ mask | x.mask &~ mask) == 0
def intersects(x: Term) = ((value ^ x.value) &~ mask &~ x.mask) == 0
override def equals(that: Any) = that match {
case x: Term => x.value == value && x.mask == mask
case _ => false
}
override def hashCode = value.toInt
def < (that: Term) = value < that.value || value == that.value && mask < that.mask
def similar(x: Term) = {
val diff = value - x.value
mask == x.mask && value > x.value && (diff & diff-1) == 0
}
def merge(x: Term) = {
prime = false
x.prime = false
val bit = value - x.value
new Term(value &~ bit, mask | bit)
}
override def toString = value.toString(16) + "-" + mask.toString(16) + (if (prime) "p" else "")
}
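// Worked example (illustrative): Term(value = 0b100, mask = 0b010) is the
// cube "1?0". It covers Term(0b110, 0) and Term(0b100, 0) but not
// Term(0b101, 0). Terms "100" and "110" are `similar` (same mask, exactly
// one differing bit); merging them clears that bit into the mask, yielding "1?0".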
object Simplify
{
def getPrimeImplicants(implicants: Seq[Term], bits: Int) = {
var prime = List[Term]()
implicants.foreach(_.prime = true)
val cols = (0 to bits).map(b => implicants.filter(b == _.mask.bitCount))
val table = cols.map(c => (0 to bits).map(b => collection.mutable.Set(c.filter(b == _.value.bitCount):_*)))
for (i <- 0 to bits) {
for (j <- 0 until bits-i)
table(i)(j).foreach(a => table(i+1)(j) ++= table(i)(j+1).filter(_.similar(a)).map(_.merge(a)))
for (r <- table(i))
for (p <- r; if p.prime)
prime = p :: prime
}
prime.sortWith(_<_)
}
def getEssentialPrimeImplicants(prime: Seq[Term], minterms: Seq[Term]): (Seq[Term],Seq[Term],Seq[Term]) = {
for (i <- 0 until prime.size) {
val icover = minterms.filter(prime(i) covers _)
for (j <- 0 until prime.size) {
val jcover = minterms.filter(prime(j) covers _)
if (icover.size > jcover.size && jcover.forall(prime(i) covers _))
return getEssentialPrimeImplicants(prime.filter(_ != prime(j)), minterms)
}
}
val essentiallyCovered = minterms.filter(t => prime.count(_ covers t) == 1)
val essential = prime.filter(p => essentiallyCovered.exists(p covers _))
val nonessential = prime.filterNot(essential contains _)
val uncovered = minterms.filterNot(t => essential.exists(_ covers t))
if (essential.isEmpty || uncovered.isEmpty)
(essential, nonessential, uncovered)
else {
val (a, b, c) = getEssentialPrimeImplicants(nonessential, uncovered)
(essential ++ a, b, c)
}
}
def getCost(cover: Seq[Term], bits: Int) = cover.map(bits - _.mask.bitCount).sum
def cheaper(a: List[Term], b: List[Term], bits: Int) = {
val ca = getCost(a, bits)
val cb = getCost(b, bits)
def listLess(a: List[Term], b: List[Term]): Boolean = !b.isEmpty && (a.isEmpty || a.head < b.head || a.head == b.head && listLess(a.tail, b.tail))
ca < cb || ca == cb && listLess(a.sortWith(_<_), b.sortWith(_<_))
}
def getCover(implicants: Seq[Term], minterms: Seq[Term], bits: Int) = {
if (minterms.nonEmpty) {
val cover = minterms.map(m => implicants.filter(_.covers(m)).map(i => collection.mutable.Set(i)))
val all = cover.reduceLeft((c0, c1) => c0.map(a => c1.map(_ ++ a)).reduceLeft(_++_))
all.map(_.toList).reduceLeft((a, b) => if (cheaper(a, b, bits)) a else b)
} else
Seq[Term]()
}
def stringify(s: Seq[Term], bits: Int) = s.map(t => (0 until bits).map(i => if ((t.mask & (1 << i)) != 0) "x" else ((t.value >> i) & 1).toString).reduceLeft(_+_).reverse).reduceLeft(_+" + "+_)
def apply(minterms: Seq[Term], dontcares: Seq[Term], bits: Int) = {
val prime = getPrimeImplicants(minterms ++ dontcares, bits)
minterms.foreach(t => assert(prime.exists(_.covers(t))))
val (eprime, prime2, uncovered) = getEssentialPrimeImplicants(prime, minterms)
val cover = eprime ++ getCover(prime2, uncovered, bits)
minterms.foreach(t => assert(cover.exists(_.covers(t)))) // sanity check
cover
}
}
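// Illustrative run over 2 bits: minterms {00, 01}, don't-care {11}.
// getPrimeImplicants merges 00/01 into "0?" and 01/11 into "?1"; "0?" is
// essential (only it covers 00) and also covers 01, so the cover is just
// "0?", i.e. the function reduces to !bit1.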
object SimplifyDC
{
def getImplicitDC(maxterms: Seq[Term], term: Term, bits: Int, above: Boolean): Term = {
for (i <- 0 until bits) {
var t: Term = null
if (above && ((term.value | term.mask) & (BigInt(1) << i)) == 0)
t = new Term(term.value | (BigInt(1) << i), term.mask)
else if (!above && (term.value & (BigInt(1) << i)) != 0)
t = new Term(term.value & ~(BigInt(1) << i), term.mask)
if (t != null && !maxterms.exists(_.intersects(t)))
return t
}
null
}
def getPrimeImplicants(minterms: Seq[Term], maxterms: Seq[Term], bits: Int) = {
var prime = List[Term]()
minterms.foreach(_.prime = true)
var mint = minterms.map(t => new Term(t.value, t.mask))
val cols = (0 to bits).map(b => mint.filter(b == _.mask.bitCount))
val table = cols.map(c => (0 to bits).map(b => collection.mutable.Set(c.filter(b == _.value.bitCount):_*)))
for (i <- 0 to bits) {
for (j <- 0 until bits-i) {
table(i)(j).foreach(a => table(i+1)(j) ++= table(i)(j+1).filter(_ similar a).map(_ merge a))
}
for (j <- 0 until bits-i) {
for (a <- table(i)(j).filter(_.prime)) {
val dc = getImplicitDC(maxterms, a, bits, true)
if (dc != null)
table(i+1)(j) += dc merge a
}
for (a <- table(i)(j+1).filter(_.prime)) {
val dc = getImplicitDC(maxterms, a, bits, false)
if (dc != null)
table(i+1)(j) += a merge dc
}
}
for (r <- table(i))
for (p <- r; if p.prime)
prime = p :: prime
}
prime.sortWith(_<_)
}
def verify(cover: Seq[Term], minterms: Seq[Term], maxterms: Seq[Term]) = {
assert(minterms.forall(t => cover.exists(_ covers t)))
assert(maxterms.forall(t => !cover.exists(_ intersects t)))
}
def apply(minterms: Seq[Term], maxterms: Seq[Term], bits: Int) = {
val prime = getPrimeImplicants(minterms, maxterms, bits)
val (eprime, prime2, uncovered) = Simplify.getEssentialPrimeImplicants(prime, minterms)
val cover = eprime ++ Simplify.getCover(prime2, uncovered, bits)
verify(cover, minterms, maxterms)
cover
}
}

@@ -0,0 +1,96 @@
// See LICENSE for license details.
package rocket
import Chisel._
import cde.{Parameters, Field}
import Instructions._
object ALU
{
val SZ_ALU_FN = 4
val FN_X = BitPat("b????")
val FN_ADD = UInt(0)
val FN_SL = UInt(1)
val FN_SEQ = UInt(2)
val FN_SNE = UInt(3)
val FN_XOR = UInt(4)
val FN_SR = UInt(5)
val FN_OR = UInt(6)
val FN_AND = UInt(7)
val FN_SUB = UInt(10)
val FN_SRA = UInt(11)
val FN_SLT = UInt(12)
val FN_SGE = UInt(13)
val FN_SLTU = UInt(14)
val FN_SGEU = UInt(15)
val FN_DIV = FN_XOR
val FN_DIVU = FN_SR
val FN_REM = FN_OR
val FN_REMU = FN_AND
val FN_MUL = FN_ADD
val FN_MULH = FN_SL
val FN_MULHSU = FN_SLT
val FN_MULHU = FN_SLTU
def isMulFN(fn: UInt, cmp: UInt) = fn(1,0) === cmp(1,0)
def isSub(cmd: UInt) = cmd(3)
def isCmp(cmd: UInt) = cmd === FN_SEQ || cmd === FN_SNE || cmd >= FN_SLT
def cmpUnsigned(cmd: UInt) = cmd(1)
def cmpInverted(cmd: UInt) = cmd(0)
def cmpEq(cmd: UInt) = !cmd(3)
}
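// The encodings make these properties fall out of single bit tests:
// FN_SUB = 10 = 0b1010 has bit 3 set (isSub); FN_SLTU = 14 = 0b1110 has
// bit 1 set (unsigned compare) and bit 0 clear, while FN_SGEU = 15 flips
// bit 0 to invert FN_SLTU's result.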
import ALU._
class ALU(implicit p: Parameters) extends CoreModule()(p) {
val io = new Bundle {
val dw = Bits(INPUT, SZ_DW)
val fn = Bits(INPUT, SZ_ALU_FN)
val in2 = UInt(INPUT, xLen)
val in1 = UInt(INPUT, xLen)
val out = UInt(OUTPUT, xLen)
val adder_out = UInt(OUTPUT, xLen)
val cmp_out = Bool(OUTPUT)
}
// ADD, SUB
val in2_inv = Mux(isSub(io.fn), ~io.in2, io.in2)
val in1_xor_in2 = io.in1 ^ in2_inv
io.adder_out := io.in1 + in2_inv + isSub(io.fn)
// SLT, SLTU
io.cmp_out := cmpInverted(io.fn) ^
Mux(cmpEq(io.fn), in1_xor_in2 === UInt(0),
Mux(io.in1(xLen-1) === io.in2(xLen-1), io.adder_out(xLen-1),
Mux(cmpUnsigned(io.fn), io.in2(xLen-1), io.in1(xLen-1))))
// SLL, SRL, SRA
val (shamt, shin_r) =
if (xLen == 32) (io.in2(4,0), io.in1)
else {
require(xLen == 64)
val shin_hi_32 = Fill(32, isSub(io.fn) && io.in1(31))
val shin_hi = Mux(io.dw === DW_64, io.in1(63,32), shin_hi_32)
val shamt = Cat(io.in2(5) & (io.dw === DW_64), io.in2(4,0))
(shamt, Cat(shin_hi, io.in1(31,0)))
}
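// One funnel shifter serves SLL, SRL and SRA: for left shifts the operand
// is bit-reversed, shifted right, and reversed back. E.g. over 4 bits,
// 0b0001 << 2 becomes Reverse(0b0001) = 0b1000, then 0b1000 >> 2 = 0b0010,
// reversed again to 0b0100.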
val shin = Mux(io.fn === FN_SR || io.fn === FN_SRA, shin_r, Reverse(shin_r))
val shout_r = (Cat(isSub(io.fn) & shin(xLen-1), shin).asSInt >> shamt)(xLen-1,0)
val shout_l = Reverse(shout_r)
val shout = Mux(io.fn === FN_SR || io.fn === FN_SRA, shout_r, UInt(0)) |
Mux(io.fn === FN_SL, shout_l, UInt(0))
// AND, OR, XOR
val logic = Mux(io.fn === FN_XOR || io.fn === FN_OR, in1_xor_in2, UInt(0)) |
Mux(io.fn === FN_OR || io.fn === FN_AND, io.in1 & io.in2, UInt(0))
val shift_logic = (isCmp(io.fn) && io.cmp_out) | logic | shout
val out = Mux(io.fn === FN_ADD || io.fn === FN_SUB, io.adder_out, shift_logic)
io.out := out
if (xLen > 32) {
require(xLen == 64)
when (io.dw === DW_32) { io.out := Cat(Fill(32, out(31)), out(31,0)) }
}
}

@@ -0,0 +1,654 @@
// See LICENSE for license details.
package rocket
import Chisel._
import Instructions._
import Util._
import FPConstants._
import uncore.constants.MemoryOpConstants._
import uncore.util._
import cde.{Parameters, Field}
case class FPUConfig(
divSqrt: Boolean = true,
sfmaLatency: Int = 2,
dfmaLatency: Int = 3
)
object FPConstants
{
val FCMD_ADD = BitPat("b0??00")
val FCMD_SUB = BitPat("b0??01")
val FCMD_MUL = BitPat("b0??10")
val FCMD_MADD = BitPat("b1??00")
val FCMD_MSUB = BitPat("b1??01")
val FCMD_NMSUB = BitPat("b1??10")
val FCMD_NMADD = BitPat("b1??11")
val FCMD_DIV = BitPat("b?0011")
val FCMD_SQRT = BitPat("b?1011")
val FCMD_SGNJ = BitPat("b??1?0")
val FCMD_MINMAX = BitPat("b?01?1")
val FCMD_CVT_FF = BitPat("b??0??")
val FCMD_CVT_IF = BitPat("b?10??")
val FCMD_CMP = BitPat("b?01??")
val FCMD_MV_XF = BitPat("b?11??")
val FCMD_CVT_FI = BitPat("b??0??")
val FCMD_MV_FX = BitPat("b??1??")
val FCMD_X = BitPat("b?????")
val FCMD_WIDTH = 5
val RM_SZ = 3
val FLAGS_SZ = 5
}
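// Note that several FCMD patterns alias (e.g. FCMD_CVT_FF and FCMD_CVT_FI
// are both b??0??); a cmd value is only examined by the functional unit it
// was dispatched to, so disambiguation comes from the unit, not the pattern.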
class FPUCtrlSigs extends Bundle
{
val cmd = Bits(width = FCMD_WIDTH)
val ldst = Bool()
val wen = Bool()
val ren1 = Bool()
val ren2 = Bool()
val ren3 = Bool()
val swap12 = Bool()
val swap23 = Bool()
val single = Bool()
val fromint = Bool()
val toint = Bool()
val fastpipe = Bool()
val fma = Bool()
val div = Bool()
val sqrt = Bool()
val round = Bool()
val wflags = Bool()
}
class FPUDecoder extends Module
{
val io = new Bundle {
val inst = Bits(INPUT, 32)
val sigs = new FPUCtrlSigs().asOutput
}
val decoder = DecodeLogic(io.inst,
List (FCMD_X, X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X),
Array(FLW -> List(FCMD_X, Y,Y,N,N,N,X,X,Y,N,N,N,N,N,N,N,N),
FLD -> List(FCMD_X, Y,Y,N,N,N,X,X,N,N,N,N,N,N,N,N,N),
FSW -> List(FCMD_MV_XF, Y,N,N,Y,N,Y,X,Y,N,Y,N,N,N,N,N,N),
FSD -> List(FCMD_MV_XF, Y,N,N,Y,N,Y,X,N,N,Y,N,N,N,N,N,N),
FMV_S_X -> List(FCMD_MV_FX, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,N),
FMV_D_X -> List(FCMD_MV_FX, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,N),
FCVT_S_W -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y),
FCVT_S_WU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y),
FCVT_S_L -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y),
FCVT_S_LU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y),
FCVT_D_W -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y),
FCVT_D_WU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y),
FCVT_D_L -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y),
FCVT_D_LU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y),
FMV_X_S -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,N),
FMV_X_D -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,N),
FCLASS_S -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,N),
FCLASS_D -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,N),
FCVT_W_S -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y),
FCVT_WU_S-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y),
FCVT_L_S -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y),
FCVT_LU_S-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y),
FCVT_W_D -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y),
FCVT_WU_D-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y),
FCVT_L_D -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y),
FCVT_LU_D-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y),
FCVT_S_D -> List(FCMD_CVT_FF, N,Y,Y,N,N,N,X,Y,N,N,Y,N,N,N,Y,Y),
FCVT_D_S -> List(FCMD_CVT_FF, N,Y,Y,N,N,N,X,N,N,N,Y,N,N,N,Y,Y),
FEQ_S -> List(FCMD_CMP, N,N,Y,Y,N,N,N,Y,N,Y,N,N,N,N,N,Y),
FLT_S -> List(FCMD_CMP, N,N,Y,Y,N,N,N,Y,N,Y,N,N,N,N,N,Y),
FLE_S -> List(FCMD_CMP, N,N,Y,Y,N,N,N,Y,N,Y,N,N,N,N,N,Y),
FEQ_D -> List(FCMD_CMP, N,N,Y,Y,N,N,N,N,N,Y,N,N,N,N,N,Y),
FLT_D -> List(FCMD_CMP, N,N,Y,Y,N,N,N,N,N,Y,N,N,N,N,N,Y),
FLE_D -> List(FCMD_CMP, N,N,Y,Y,N,N,N,N,N,Y,N,N,N,N,N,Y),
FSGNJ_S -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,N),
FSGNJN_S -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,N),
FSGNJX_S -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,N),
FSGNJ_D -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,N),
FSGNJN_D -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,N),
FSGNJX_D -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,N),
FMIN_S -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,Y),
FMAX_S -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,Y),
FMIN_D -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,Y),
FMAX_D -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,Y),
FADD_S -> List(FCMD_ADD, N,Y,Y,Y,N,N,Y,Y,N,N,N,Y,N,N,Y,Y),
FSUB_S -> List(FCMD_SUB, N,Y,Y,Y,N,N,Y,Y,N,N,N,Y,N,N,Y,Y),
FMUL_S -> List(FCMD_MUL, N,Y,Y,Y,N,N,N,Y,N,N,N,Y,N,N,Y,Y),
FADD_D -> List(FCMD_ADD, N,Y,Y,Y,N,N,Y,N,N,N,N,Y,N,N,Y,Y),
FSUB_D -> List(FCMD_SUB, N,Y,Y,Y,N,N,Y,N,N,N,N,Y,N,N,Y,Y),
FMUL_D -> List(FCMD_MUL, N,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,Y,Y),
FMADD_S -> List(FCMD_MADD, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y),
FMSUB_S -> List(FCMD_MSUB, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y),
FNMADD_S -> List(FCMD_NMADD, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y),
FNMSUB_S -> List(FCMD_NMSUB, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y),
FMADD_D -> List(FCMD_MADD, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y),
FMSUB_D -> List(FCMD_MSUB, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y),
FNMADD_D -> List(FCMD_NMADD, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y),
FNMSUB_D -> List(FCMD_NMSUB, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y),
FDIV_S -> List(FCMD_DIV, N,Y,Y,Y,N,N,N,Y,N,N,N,N,Y,N,Y,Y),
FSQRT_S -> List(FCMD_SQRT, N,Y,Y,N,N,Y,X,Y,N,N,N,N,N,Y,Y,Y),
FDIV_D -> List(FCMD_DIV, N,Y,Y,Y,N,N,N,N,N,N,N,N,Y,N,Y,Y),
FSQRT_D -> List(FCMD_SQRT, N,Y,Y,N,N,Y,X,N,N,N,N,N,N,Y,Y,Y)
))
val s = io.sigs
val sigs = Seq(s.cmd, s.ldst, s.wen, s.ren1, s.ren2, s.ren3, s.swap12,
s.swap23, s.single, s.fromint, s.toint, s.fastpipe, s.fma,
s.div, s.sqrt, s.round, s.wflags)
sigs zip decoder map {case(s,d) => s := d}
}
class FPUIO(implicit p: Parameters) extends CoreBundle {
val inst = Bits(INPUT, 32)
val fromint_data = Bits(INPUT, xLen)
val fcsr_rm = Bits(INPUT, FPConstants.RM_SZ)
val fcsr_flags = Valid(Bits(width = FPConstants.FLAGS_SZ))
val store_data = Bits(OUTPUT, 64)
val toint_data = Bits(OUTPUT, xLen)
val dmem_resp_val = Bool(INPUT)
val dmem_resp_type = Bits(INPUT, 3)
val dmem_resp_tag = UInt(INPUT, 5)
val dmem_resp_data = Bits(INPUT, 64)
val valid = Bool(INPUT)
val fcsr_rdy = Bool(OUTPUT)
val nack_mem = Bool(OUTPUT)
val illegal_rm = Bool(OUTPUT)
val killx = Bool(INPUT)
val killm = Bool(INPUT)
val dec = new FPUCtrlSigs().asOutput
val sboard_set = Bool(OUTPUT)
val sboard_clr = Bool(OUTPUT)
val sboard_clra = UInt(OUTPUT, 5)
val cp_req = Decoupled(new FPInput()).flip // the coprocessor port doesn't respect kill signals
val cp_resp = Decoupled(new FPResult())
}
class FPResult extends Bundle
{
val data = Bits(width = 65)
val exc = Bits(width = 5)
}
class FPInput extends FPUCtrlSigs {
val rm = Bits(width = 3)
val typ = Bits(width = 2)
val in1 = Bits(width = 65)
val in2 = Bits(width = 65)
val in3 = Bits(width = 65)
}
object ClassifyRecFN {
def apply(expWidth: Int, sigWidth: Int, in: UInt) = {
val sign = in(sigWidth + expWidth)
val exp = in(sigWidth + expWidth - 1, sigWidth - 1)
val sig = in(sigWidth - 2, 0)
val code = exp(expWidth,expWidth-2)
val codeHi = code(2, 1)
val isSpecial = codeHi === UInt(3)
val isHighSubnormalIn = exp(expWidth-2, 0) < UInt(2)
val isSubnormal = code === UInt(1) || codeHi === UInt(1) && isHighSubnormalIn
val isNormal = codeHi === UInt(1) && !isHighSubnormalIn || codeHi === UInt(2)
val isZero = code === UInt(0)
val isInf = isSpecial && !exp(expWidth-2)
val isNaN = code.andR
val isSNaN = isNaN && !sig(sigWidth-2)
val isQNaN = isNaN && sig(sigWidth-2)
Cat(isQNaN, isSNaN, isInf && !sign, isNormal && !sign,
isSubnormal && !sign, isZero && !sign, isZero && sign,
isSubnormal && sign, isNormal && sign, isInf && sign)
}
}
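// The ten result bits follow the RISC-V FCLASS encoding, from bit 9 down to
// bit 0: quiet NaN, signaling NaN, +inf, +normal, +subnormal, +0, -0,
// -subnormal, -normal, -inf.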
class FPToInt extends Module
{
val io = new Bundle {
val in = Valid(new FPInput).flip
val as_double = new FPInput().asOutput
val out = Valid(new Bundle {
val lt = Bool()
val store = Bits(width = 64)
val toint = Bits(width = 64)
val exc = Bits(width = 5)
})
}
val in = Reg(new FPInput)
val valid = Reg(next=io.in.valid)
def upconvert(x: UInt) = {
val s2d = Module(new hardfloat.RecFNToRecFN(8, 24, 11, 53))
s2d.io.in := x
s2d.io.roundingMode := UInt(0)
s2d.io.out
}
val in1_upconvert = upconvert(io.in.bits.in1)
val in2_upconvert = upconvert(io.in.bits.in2)
when (io.in.valid) {
in := io.in.bits
when (io.in.bits.single && !io.in.bits.ldst && io.in.bits.cmd =/= FCMD_MV_XF) {
in.in1 := in1_upconvert
in.in2 := in2_upconvert
}
}
val unrec_s = hardfloat.fNFromRecFN(8, 24, in.in1)
val unrec_d = hardfloat.fNFromRecFN(11, 53, in.in1)
val unrec_out = Mux(in.single, Cat(Fill(32, unrec_s(31)), unrec_s), unrec_d)
val classify_s = ClassifyRecFN(8, 24, in.in1)
val classify_d = ClassifyRecFN(11, 53, in.in1)
val classify_out = Mux(in.single, classify_s, classify_d)
val dcmp = Module(new hardfloat.CompareRecFN(11, 53))
dcmp.io.a := in.in1
dcmp.io.b := in.in2
dcmp.io.signaling := Bool(true)
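// The low rm bits select the comparison predicate: ~rm masks the (lt, eq)
// pair before OR-reduction, so rm = 0 yields lt|eq (FLE), rm = 1 yields lt
// (FLT), and rm = 2 yields eq (FEQ).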
val dcmp_out = (~in.rm & Cat(dcmp.io.lt, dcmp.io.eq)).orR
val dcmp_exc = dcmp.io.exceptionFlags
val d2l = Module(new hardfloat.RecFNToIN(11, 53, 64))
val d2w = Module(new hardfloat.RecFNToIN(11, 53, 32))
d2l.io.in := in.in1
d2l.io.roundingMode := in.rm
d2l.io.signedOut := ~in.typ(0)
d2w.io.in := in.in1
d2w.io.roundingMode := in.rm
d2w.io.signedOut := ~in.typ(0)
io.out.bits.toint := Mux(in.rm(0), classify_out, unrec_out)
io.out.bits.store := unrec_out
io.out.bits.exc := Bits(0)
when (in.cmd === FCMD_CMP) {
io.out.bits.toint := dcmp_out
io.out.bits.exc := dcmp_exc
}
when (in.cmd === FCMD_CVT_IF) {
io.out.bits.toint := Mux(in.typ(1), d2l.io.out.asSInt, d2w.io.out.asSInt).asUInt
val dflags = Mux(in.typ(1), d2l.io.intExceptionFlags, d2w.io.intExceptionFlags)
io.out.bits.exc := Cat(dflags(2, 1).orR, UInt(0, 3), dflags(0))
}
io.out.valid := valid
io.out.bits.lt := dcmp.io.lt
io.as_double := in
}
class IntToFP(val latency: Int) extends Module
{
val io = new Bundle {
val in = Valid(new FPInput).flip
val out = Valid(new FPResult)
}
val in = Pipe(io.in)
val mux = Wire(new FPResult)
mux.exc := Bits(0)
mux.data := hardfloat.recFNFromFN(11, 53, in.bits.in1)
when (in.bits.single) {
mux.data := Cat(SInt(-1, 32), hardfloat.recFNFromFN(8, 24, in.bits.in1))
}
val longValue =
Mux(in.bits.typ(1), in.bits.in1.asSInt,
Mux(in.bits.typ(0), in.bits.in1(31,0).zext, in.bits.in1(31,0).asSInt))
val l2s = Module(new hardfloat.INToRecFN(64, 8, 24))
l2s.io.signedIn := ~in.bits.typ(0)
l2s.io.in := longValue.asUInt
l2s.io.roundingMode := in.bits.rm
val l2d = Module(new hardfloat.INToRecFN(64, 11, 53))
l2d.io.signedIn := ~in.bits.typ(0)
l2d.io.in := longValue.asUInt
l2d.io.roundingMode := in.bits.rm
when (in.bits.cmd === FCMD_CVT_FI) {
when (in.bits.single) {
mux.data := Cat(SInt(-1, 32), l2s.io.out)
mux.exc := l2s.io.exceptionFlags
}.otherwise {
mux.data := l2d.io.out
mux.exc := l2d.io.exceptionFlags
}
}
io.out <> Pipe(in.valid, mux, latency-1)
}
class FPToFP(val latency: Int) extends Module
{
val io = new Bundle {
val in = Valid(new FPInput).flip
val out = Valid(new FPResult)
val lt = Bool(INPUT) // from FPToInt
}
val in = Pipe(io.in)
// fp->fp units
val isSgnj = in.bits.cmd === FCMD_SGNJ
def fsgnjSign(in1: Bits, in2: Bits, pos: Int, en: Bool, rm: Bits) =
Mux(rm(1) || !en, in1(pos), rm(0)) ^ (en && in2(pos))
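// rm distinguishes the sign-injection flavors: rm = 00 copies in2's sign
// (FSGNJ), rm = 01 copies its complement (FSGNJN), and rm(1) = 1 XORs the
// two signs (FSGNJX); with en low the operand keeps its own sign.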
val sign_s = fsgnjSign(in.bits.in1, in.bits.in2, 32, in.bits.single && isSgnj, in.bits.rm)
val sign_d = fsgnjSign(in.bits.in1, in.bits.in2, 64, !in.bits.single && isSgnj, in.bits.rm)
val fsgnj = Cat(sign_d, in.bits.in1(63,33), sign_s, in.bits.in1(31,0))
val s2d = Module(new hardfloat.RecFNToRecFN(8, 24, 11, 53))
val d2s = Module(new hardfloat.RecFNToRecFN(11, 53, 8, 24))
s2d.io.in := in.bits.in1
s2d.io.roundingMode := in.bits.rm
d2s.io.in := in.bits.in1
d2s.io.roundingMode := in.bits.rm
val isnan1 = Mux(in.bits.single, in.bits.in1(31,29).andR, in.bits.in1(63,61).andR)
val isnan2 = Mux(in.bits.single, in.bits.in2(31,29).andR, in.bits.in2(63,61).andR)
val issnan1 = isnan1 && ~Mux(in.bits.single, in.bits.in1(22), in.bits.in1(51))
val issnan2 = isnan2 && ~Mux(in.bits.single, in.bits.in2(22), in.bits.in2(51))
val minmax_exc = Cat(issnan1 || issnan2, Bits(0,4))
val isMax = in.bits.rm(0)
val isLHS = isnan2 || isMax =/= io.lt && !isnan1
val mux = Wire(new FPResult)
mux.exc := minmax_exc
mux.data := in.bits.in2
when (isSgnj) { mux.exc := UInt(0) }
when (isSgnj || isLHS) { mux.data := fsgnj }
when (in.bits.cmd === FCMD_CVT_FF) {
when (in.bits.single) {
mux.data := Cat(SInt(-1, 32), d2s.io.out)
mux.exc := d2s.io.exceptionFlags
}.otherwise {
mux.data := s2d.io.out
mux.exc := s2d.io.exceptionFlags
}
}
io.out <> Pipe(in.valid, mux, latency-1)
}
class FPUFMAPipe(val latency: Int, expWidth: Int, sigWidth: Int) extends Module
{
val io = new Bundle {
val in = Valid(new FPInput).flip
val out = Valid(new FPResult)
}
val width = sigWidth + expWidth
val one = UInt(1) << (width-1)
val zero = (io.in.bits.in1(width) ^ io.in.bits.in2(width)) << width
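// In the recoded format 1.0 is a lone bit at position width-1, and `zero`
// is a zero carrying the product's sign, so FADD/FSUB become a*1 + c and
// FMUL becomes a*b + (signed) 0, reusing the single fused datapath.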
val valid = Reg(next=io.in.valid)
val in = Reg(new FPInput)
when (io.in.valid) {
in := io.in.bits
val cmd_fma = io.in.bits.ren3
val cmd_addsub = io.in.bits.swap23
in.cmd := Cat(io.in.bits.cmd(1) & (cmd_fma || cmd_addsub), io.in.bits.cmd(0))
when (cmd_addsub) { in.in2 := one }
unless (cmd_fma || cmd_addsub) { in.in3 := zero }
}
val fma = Module(new hardfloat.MulAddRecFN(expWidth, sigWidth))
fma.io.op := in.cmd
fma.io.roundingMode := in.rm
fma.io.a := in.in1
fma.io.b := in.in2
fma.io.c := in.in3
val res = Wire(new FPResult)
res.data := Cat(SInt(-1, 32), fma.io.out)
res.exc := fma.io.exceptionFlags
io.out := Pipe(valid, res, latency-1)
}
class FPU(cfg: FPUConfig)(implicit p: Parameters) extends CoreModule()(p) {
require(xLen == 64, "RV32 Rocket FP support missing")
val io = new FPUIO
val ex_reg_valid = Reg(next=io.valid, init=Bool(false))
val req_valid = ex_reg_valid || io.cp_req.valid
val ex_reg_inst = RegEnable(io.inst, io.valid)
val ex_cp_valid = io.cp_req.valid && !ex_reg_valid
val mem_reg_valid = Reg(next=ex_reg_valid && !io.killx || ex_cp_valid, init=Bool(false))
val mem_reg_inst = RegEnable(ex_reg_inst, ex_reg_valid)
val mem_cp_valid = Reg(next=ex_cp_valid, init=Bool(false))
val killm = (io.killm || io.nack_mem) && !mem_cp_valid
val wb_reg_valid = Reg(next=mem_reg_valid && (!killm || mem_cp_valid), init=Bool(false))
val wb_cp_valid = Reg(next=mem_cp_valid, init=Bool(false))
val fp_decoder = Module(new FPUDecoder)
fp_decoder.io.inst := io.inst
val cp_ctrl = Wire(new FPUCtrlSigs)
cp_ctrl <> io.cp_req.bits
io.cp_resp.valid := Bool(false)
io.cp_resp.bits.data := UInt(0)
val id_ctrl = fp_decoder.io.sigs
val ex_ctrl = Mux(ex_reg_valid, RegEnable(id_ctrl, io.valid), cp_ctrl)
val mem_ctrl = RegEnable(ex_ctrl, req_valid)
val wb_ctrl = RegEnable(mem_ctrl, mem_reg_valid)
// load response
val load_wb = Reg(next=io.dmem_resp_val)
val load_wb_single = RegEnable(!io.dmem_resp_type(0), io.dmem_resp_val)
val load_wb_data = RegEnable(io.dmem_resp_data, io.dmem_resp_val)
val load_wb_tag = RegEnable(io.dmem_resp_tag, io.dmem_resp_val)
val rec_s = hardfloat.recFNFromFN(8, 24, load_wb_data)
val rec_d = hardfloat.recFNFromFN(11, 53, load_wb_data)
val load_wb_data_recoded = Mux(load_wb_single, Cat(SInt(-1, 32), rec_s), rec_d)
// regfile
val regfile = Mem(32, Bits(width = 65))
when (load_wb) {
regfile(load_wb_tag) := load_wb_data_recoded
if (enableCommitLog) {
printf ("f%d p%d 0x%x\n", load_wb_tag, load_wb_tag + UInt(32),
Mux(load_wb_single, load_wb_data(31,0), load_wb_data))
}
}
val ex_ra1::ex_ra2::ex_ra3::Nil = List.fill(3)(Reg(UInt()))
when (io.valid) {
when (id_ctrl.ren1) {
when (!id_ctrl.swap12) { ex_ra1 := io.inst(19,15) }
when (id_ctrl.swap12) { ex_ra2 := io.inst(19,15) }
}
when (id_ctrl.ren2) {
when (id_ctrl.swap12) { ex_ra1 := io.inst(24,20) }
when (id_ctrl.swap23) { ex_ra3 := io.inst(24,20) }
when (!id_ctrl.swap12 && !id_ctrl.swap23) { ex_ra2 := io.inst(24,20) }
}
when (id_ctrl.ren3) { ex_ra3 := io.inst(31,27) }
}
val ex_rs1::ex_rs2::ex_rs3::Nil = Seq(ex_ra1, ex_ra2, ex_ra3).map(regfile(_))
val ex_rm = Mux(ex_reg_inst(14,12) === Bits(7), io.fcsr_rm, ex_reg_inst(14,12))
val cp_rs1 = io.cp_req.bits.in1
val cp_rs2 = Mux(io.cp_req.bits.swap23, io.cp_req.bits.in3, io.cp_req.bits.in2)
val cp_rs3 = Mux(io.cp_req.bits.swap23, io.cp_req.bits.in2, io.cp_req.bits.in3)
val req = Wire(new FPInput)
req := ex_ctrl
req.rm := Mux(ex_reg_valid, ex_rm, io.cp_req.bits.rm)
req.in1 := Mux(ex_reg_valid, ex_rs1, cp_rs1)
req.in2 := Mux(ex_reg_valid, ex_rs2, cp_rs2)
req.in3 := Mux(ex_reg_valid, ex_rs3, cp_rs3)
req.typ := Mux(ex_reg_valid, ex_reg_inst(21,20), io.cp_req.bits.typ)
val sfma = Module(new FPUFMAPipe(cfg.sfmaLatency, 8, 24))
sfma.io.in.valid := req_valid && ex_ctrl.fma && ex_ctrl.single
sfma.io.in.bits := req
val dfma = Module(new FPUFMAPipe(cfg.dfmaLatency, 11, 53))
dfma.io.in.valid := req_valid && ex_ctrl.fma && !ex_ctrl.single
dfma.io.in.bits := req
val fpiu = Module(new FPToInt)
fpiu.io.in.valid := req_valid && (ex_ctrl.toint || ex_ctrl.div || ex_ctrl.sqrt || ex_ctrl.cmd === FCMD_MINMAX)
fpiu.io.in.bits := req
io.store_data := fpiu.io.out.bits.store
io.toint_data := fpiu.io.out.bits.toint
when(fpiu.io.out.valid && mem_cp_valid && mem_ctrl.toint){
io.cp_resp.bits.data := fpiu.io.out.bits.toint
io.cp_resp.valid := Bool(true)
}
val ifpu = Module(new IntToFP(3))
ifpu.io.in.valid := req_valid && ex_ctrl.fromint
ifpu.io.in.bits := req
ifpu.io.in.bits.in1 := Mux(ex_reg_valid, io.fromint_data, cp_rs1)
val fpmu = Module(new FPToFP(2))
fpmu.io.in.valid := req_valid && ex_ctrl.fastpipe
fpmu.io.in.bits := req
fpmu.io.lt := fpiu.io.out.bits.lt
val divSqrt_wen = Reg(next=Bool(false))
val divSqrt_inReady = Wire(init=Bool(false))
val divSqrt_waddr = Reg(Bits())
val divSqrt_wdata = Wire(Bits())
val divSqrt_flags = Wire(Bits())
val divSqrt_in_flight = Reg(init=Bool(false))
val divSqrt_killed = Reg(Bool())
// writeback arbitration
case class Pipe(p: Module, lat: Int, cond: (FPUCtrlSigs) => Bool, res: FPResult)
val pipes = List(
Pipe(fpmu, fpmu.latency, (c: FPUCtrlSigs) => c.fastpipe, fpmu.io.out.bits),
Pipe(ifpu, ifpu.latency, (c: FPUCtrlSigs) => c.fromint, ifpu.io.out.bits),
Pipe(sfma, sfma.latency, (c: FPUCtrlSigs) => c.fma && c.single, sfma.io.out.bits),
Pipe(dfma, dfma.latency, (c: FPUCtrlSigs) => c.fma && !c.single, dfma.io.out.bits))
def latencyMask(c: FPUCtrlSigs, offset: Int) = {
require(pipes.forall(_.lat >= offset))
pipes.map(p => Mux(p.cond(c), UInt(1 << p.lat-offset), UInt(0))).reduce(_|_)
}
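// Example (default FPUConfig): a double-precision FMA has latency 3, so
// memLatencyMask = latencyMask(mem_ctrl, 2) sets bit 1 of the `wen` shift
// register below; one cycle later that bit reaches bit 0 and the result is
// written back.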
def pipeid(c: FPUCtrlSigs) = pipes.zipWithIndex.map(p => Mux(p._1.cond(c), UInt(p._2), UInt(0))).reduce(_|_)
val maxLatency = pipes.map(_.lat).max
val memLatencyMask = latencyMask(mem_ctrl, 2)
class WBInfo extends Bundle {
val rd = UInt(width = 5)
val single = Bool()
val cp = Bool()
val pipeid = UInt(width = log2Ceil(pipes.size))
override def cloneType: this.type = new WBInfo().asInstanceOf[this.type]
}
val wen = Reg(init=Bits(0, maxLatency-1))
val wbInfo = Reg(Vec(maxLatency-1, new WBInfo))
val mem_wen = mem_reg_valid && (mem_ctrl.fma || mem_ctrl.fastpipe || mem_ctrl.fromint)
val write_port_busy = RegEnable(mem_wen && (memLatencyMask & latencyMask(ex_ctrl, 1)).orR || (wen & latencyMask(ex_ctrl, 0)).orR, req_valid)
for (i <- 0 until maxLatency-2) {
when (wen(i+1)) { wbInfo(i) := wbInfo(i+1) }
}
wen := wen >> 1
when (mem_wen) {
when (!killm) {
wen := wen >> 1 | memLatencyMask
}
for (i <- 0 until maxLatency-1) {
when (!write_port_busy && memLatencyMask(i)) {
wbInfo(i).cp := mem_cp_valid
wbInfo(i).single := mem_ctrl.single
wbInfo(i).pipeid := pipeid(mem_ctrl)
wbInfo(i).rd := mem_reg_inst(11,7)
}
}
}
val waddr = Mux(divSqrt_wen, divSqrt_waddr, wbInfo(0).rd)
val wdata = Mux(divSqrt_wen, divSqrt_wdata, (pipes.map(_.res.data): Seq[UInt])(wbInfo(0).pipeid))
val wexc = (pipes.map(_.res.exc): Seq[UInt])(wbInfo(0).pipeid)
when ((!wbInfo(0).cp && wen(0)) || divSqrt_wen) {
regfile(waddr) := wdata
if (enableCommitLog) {
val wdata_unrec_s = hardfloat.fNFromRecFN(8, 24, wdata(64,0))
val wdata_unrec_d = hardfloat.fNFromRecFN(11, 53, wdata(64,0))
printf ("f%d p%d 0x%x\n", waddr, waddr+ UInt(32),
Mux(wbInfo(0).single, Cat(UInt(0,32), wdata_unrec_s), wdata_unrec_d))
}
}
when (wbInfo(0).cp && wen(0)) {
io.cp_resp.bits.data := wdata
io.cp_resp.valid := Bool(true)
}
io.cp_req.ready := !ex_reg_valid
val wb_toint_valid = wb_reg_valid && wb_ctrl.toint
val wb_toint_exc = RegEnable(fpiu.io.out.bits.exc, mem_ctrl.toint)
io.fcsr_flags.valid := wb_toint_valid || divSqrt_wen || wen(0)
io.fcsr_flags.bits :=
Mux(wb_toint_valid, wb_toint_exc, UInt(0)) |
Mux(divSqrt_wen, divSqrt_flags, UInt(0)) |
Mux(wen(0), wexc, UInt(0))
val units_busy = mem_reg_valid && (mem_ctrl.div || mem_ctrl.sqrt) && (!divSqrt_inReady || wen.orR) // || mem_reg_valid && mem_ctrl.fma && Reg(next=Mux(ex_ctrl.single, io.sfma.valid, io.dfma.valid))
io.fcsr_rdy := !(ex_reg_valid && ex_ctrl.wflags || mem_reg_valid && mem_ctrl.wflags || wb_reg_valid && wb_ctrl.toint || wen.orR || divSqrt_in_flight)
io.nack_mem := units_busy || write_port_busy || divSqrt_in_flight
io.dec <> fp_decoder.io.sigs
def useScoreboard(f: ((Pipe, Int)) => Bool) = pipes.zipWithIndex.filter(_._1.lat > 3).map(x => f(x)).fold(Bool(false))(_||_)
io.sboard_set := wb_reg_valid && !wb_cp_valid && Reg(next=useScoreboard(_._1.cond(mem_ctrl)) || mem_ctrl.div || mem_ctrl.sqrt)
io.sboard_clr := !wb_cp_valid && (divSqrt_wen || (wen(0) && useScoreboard(x => wbInfo(0).pipeid === UInt(x._2))))
io.sboard_clra := waddr
// we don't currently support round-max-magnitude (rm=4)
io.illegal_rm := ex_rm(2) && ex_ctrl.round
divSqrt_wdata := 0
divSqrt_flags := 0
if (cfg.divSqrt) {
val divSqrt_single = Reg(Bool())
val divSqrt_rm = Reg(Bits())
val divSqrt_flags_double = Reg(Bits())
val divSqrt_wdata_double = Reg(Bits())
val divSqrt = Module(new hardfloat.DivSqrtRecF64)
divSqrt_inReady := Mux(divSqrt.io.sqrtOp, divSqrt.io.inReady_sqrt, divSqrt.io.inReady_div)
val divSqrt_outValid = divSqrt.io.outValid_div || divSqrt.io.outValid_sqrt
divSqrt.io.inValid := mem_reg_valid && (mem_ctrl.div || mem_ctrl.sqrt) && !divSqrt_in_flight
divSqrt.io.sqrtOp := mem_ctrl.sqrt
divSqrt.io.a := fpiu.io.as_double.in1
divSqrt.io.b := fpiu.io.as_double.in2
divSqrt.io.roundingMode := fpiu.io.as_double.rm
when (divSqrt.io.inValid && divSqrt_inReady) {
divSqrt_in_flight := true
divSqrt_killed := killm
divSqrt_single := mem_ctrl.single
divSqrt_waddr := mem_reg_inst(11,7)
divSqrt_rm := divSqrt.io.roundingMode
}
when (divSqrt_outValid) {
divSqrt_wen := !divSqrt_killed
divSqrt_wdata_double := divSqrt.io.out
divSqrt_in_flight := false
divSqrt_flags_double := divSqrt.io.exceptionFlags
}
val divSqrt_toSingle = Module(new hardfloat.RecFNToRecFN(11, 53, 8, 24))
divSqrt_toSingle.io.in := divSqrt_wdata_double
divSqrt_toSingle.io.roundingMode := divSqrt_rm
divSqrt_wdata := Mux(divSqrt_single, divSqrt_toSingle.io.out, divSqrt_wdata_double)
divSqrt_flags := divSqrt_flags_double | Mux(divSqrt_single, divSqrt_toSingle.io.exceptionFlags, Bits(0))
} else {
when (ex_ctrl.div || ex_ctrl.sqrt) { io.illegal_rm := true }
}
}

@@ -0,0 +1,133 @@
package rocket
import Chisel._
import uncore.tilelink._
import Util._
import cde.{Parameters, Field}
class FrontendReq(implicit p: Parameters) extends CoreBundle()(p) {
val pc = UInt(width = vaddrBitsExtended)
val speculative = Bool()
}
class FrontendResp(implicit p: Parameters) extends CoreBundle()(p) {
val btb = Valid(new BTBResp)
val pc = UInt(width = vaddrBitsExtended) // ID stage PC
val data = UInt(width = fetchWidth * coreInstBits)
val mask = Bits(width = fetchWidth)
val xcpt_if = Bool()
val replay = Bool()
}
class FrontendIO(implicit p: Parameters) extends CoreBundle()(p) {
val req = Valid(new FrontendReq)
val resp = Decoupled(new FrontendResp).flip
val btb_update = Valid(new BTBUpdate)
val bht_update = Valid(new BHTUpdate)
val ras_update = Valid(new RASUpdate)
val flush_icache = Bool(OUTPUT)
val flush_tlb = Bool(OUTPUT)
val npc = UInt(INPUT, width = vaddrBitsExtended)
}
class Frontend(implicit p: Parameters) extends CoreModule()(p) with HasL1CacheParameters {
val io = new Bundle {
val cpu = new FrontendIO().flip
val ptw = new TLBPTWIO()
val mem = new ClientUncachedTileLinkIO
}
val icache = Module(new ICache(latency = 2))
val tlb = Module(new TLB)
val s1_pc_ = Reg(UInt(width=vaddrBitsExtended))
val s1_pc = ~(~s1_pc_ | (coreInstBytes-1)) // discard PC LSBs (this propagates down the pipeline)
val s1_speculative = Reg(Bool())
val s1_same_block = Reg(Bool())
val s2_valid = Reg(init=Bool(true))
val s2_pc = Reg(init=UInt(p(ResetVector)))
val s2_btb_resp_valid = Reg(init=Bool(false))
val s2_btb_resp_bits = Reg(new BTBResp)
val s2_xcpt_if = Reg(init=Bool(false))
val s2_speculative = Reg(init=Bool(false))
val s2_cacheable = Reg(init=Bool(false))
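// ntpc rounds s1_pc down to the fetch-group boundary and adds one group:
// with fetchWidth = 2 and 4-byte instructions, pc = 0x1004 gives
// 0x1004 & ~7 = 0x1000, so ntpc = 0x1008.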
val ntpc = ~(~s1_pc | (coreInstBytes*fetchWidth-1)) + UInt(coreInstBytes*fetchWidth)
val ntpc_same_block = (ntpc & rowBytes) === (s1_pc & rowBytes)
val predicted_npc = Wire(init = ntpc)
val predicted_taken = Wire(init = Bool(false))
val icmiss = s2_valid && !icache.io.resp.valid
val npc = Mux(icmiss, s2_pc, predicted_npc)
val s0_same_block = !predicted_taken && !icmiss && !io.cpu.req.valid && ntpc_same_block
val stall = io.cpu.resp.valid && !io.cpu.resp.ready
when (!stall) {
s1_same_block := s0_same_block && !tlb.io.resp.miss
s1_pc_ := io.cpu.npc
// consider RVC fetches across blocks to be non-speculative if the first
// part was non-speculative
val s0_speculative =
if (usingCompressed) s1_speculative || s2_valid && !s2_speculative || predicted_taken
else Bool(true)
s1_speculative := Mux(icmiss, s2_speculative, s0_speculative)
s2_valid := !icmiss
when (!icmiss) {
s2_pc := s1_pc
s2_speculative := s1_speculative
s2_cacheable := tlb.io.resp.cacheable
s2_xcpt_if := tlb.io.resp.xcpt_if
}
}
when (io.cpu.req.valid) {
s1_same_block := Bool(false)
s1_pc_ := io.cpu.npc
s1_speculative := io.cpu.req.bits.speculative
s2_valid := Bool(false)
}
if (p(BtbKey).nEntries > 0) {
val btb = Module(new BTB)
btb.io.req.valid := false
btb.io.req.bits.addr := s1_pc_
btb.io.btb_update := io.cpu.btb_update
btb.io.bht_update := io.cpu.bht_update
btb.io.ras_update := io.cpu.ras_update
when (!stall && !icmiss) {
btb.io.req.valid := true
s2_btb_resp_valid := btb.io.resp.valid
s2_btb_resp_bits := btb.io.resp.bits
}
when (btb.io.resp.valid && btb.io.resp.bits.taken) {
predicted_npc := btb.io.resp.bits.target.sextTo(vaddrBitsExtended)
predicted_taken := Bool(true)
}
}
io.ptw <> tlb.io.ptw
tlb.io.req.valid := !stall && !icmiss
tlb.io.req.bits.vpn := s1_pc >> pgIdxBits
tlb.io.req.bits.passthrough := Bool(false)
tlb.io.req.bits.instruction := Bool(true)
tlb.io.req.bits.store := Bool(false)
io.mem <> icache.io.mem
icache.io.req.valid := !stall && !s0_same_block
icache.io.req.bits.addr := io.cpu.npc
icache.io.invalidate := io.cpu.flush_icache
icache.io.s1_ppn := tlb.io.resp.ppn
icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || tlb.io.resp.xcpt_if || icmiss || io.cpu.flush_tlb
icache.io.s2_kill := s2_speculative && !s2_cacheable
icache.io.resp.ready := !stall && !s1_same_block
io.cpu.resp.valid := s2_valid && (icache.io.resp.valid || icache.io.s2_kill || s2_xcpt_if)
io.cpu.resp.bits.pc := s2_pc
io.cpu.npc := Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc)
require(fetchWidth * coreInstBytes <= rowBytes && isPow2(fetchWidth))
io.cpu.resp.bits.data := icache.io.resp.bits.datablock >> (s2_pc.extract(log2Ceil(rowBytes)-1,log2Ceil(fetchWidth*coreInstBytes)) << log2Ceil(fetchWidth*coreInstBits))
io.cpu.resp.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes))
io.cpu.resp.bits.xcpt_if := s2_xcpt_if
io.cpu.resp.bits.replay := icache.io.s2_kill && !icache.io.resp.valid && !s2_xcpt_if
io.cpu.resp.bits.btb.valid := s2_btb_resp_valid
io.cpu.resp.bits.btb.bits := s2_btb_resp_bits
}

@@ -0,0 +1,132 @@
// See LICENSE for license details.
package rocket
import Chisel._
import Util._
import cde.{Parameters, Field}
import junctions._
class Instruction(implicit val p: Parameters) extends ParameterizedBundle with HasCoreParameters {
val pf0 = Bool() // page fault on first half of instruction
val pf1 = Bool() // page fault on second half of instruction
val replay = Bool()
val btb_hit = Bool()
val rvc = Bool()
val inst = new ExpandedInstruction
require(coreInstBits == (if (usingCompressed) 16 else 32))
}
class IBuf(implicit p: Parameters) extends CoreModule {
val io = new Bundle {
val imem = Decoupled(new FrontendResp).flip
val kill = Bool(INPUT)
val pc = UInt(width = vaddrBitsExtended)
val btb_resp = new BTBResp().asOutput
val inst = Vec(retireWidth, Decoupled(new Instruction))
}
// This module is meant to be more general, but it's not there yet
require(decodeWidth == 1)
val n = fetchWidth - 1
val nBufValid = if (n == 0) UInt(0) else Reg(init=UInt(0, log2Ceil(fetchWidth)))
val buf = Reg(io.imem.bits)
val ibufBTBHit = Reg(Bool())
val ibufBTBResp = Reg(new BTBResp)
val pcWordMask = UInt(coreInstBytes*fetchWidth-1, vaddrBitsExtended)
val pcWordBits = io.imem.bits.pc.extract(log2Ceil(fetchWidth*coreInstBytes)-1, log2Ceil(coreInstBytes))
val nReady = Wire(init = UInt(0, log2Ceil(fetchWidth+1)))
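// nIC counts the instruction slots the I$ can supply this cycle (fewer if
// the BTB redirects mid-group); nICReady is how many of those the pipeline
// can consume once the buffer drains. imem is ready only when the buffer
// empties and any leftover I$ slots fit in the n-entry buffer.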
val nIC = Mux(io.imem.bits.btb.valid && io.imem.bits.btb.bits.taken, io.imem.bits.btb.bits.bridx +& 1, UInt(fetchWidth)) - pcWordBits
val nICReady = nReady - nBufValid
val nValid = Mux(io.imem.valid, nIC, UInt(0)) + nBufValid
io.imem.ready := nReady >= nBufValid && (nICReady >= nIC || n >= nIC - nICReady)
if (n > 0) {
nBufValid := Mux(nReady >= nBufValid, UInt(0), nBufValid - nReady)
if (n > 1) when (nReady > 0 && nReady < nBufValid) {
val shiftedBuf = shiftInsnRight(buf.data(n*coreInstBits-1, coreInstBits), (nReady-1)(log2Ceil(n-1)-1,0))
buf.data := Cat(buf.data(n*coreInstBits-1, (n-1)*coreInstBits), shiftedBuf((n-1)*coreInstBits-1, 0))
buf.pc := buf.pc & ~pcWordMask | (buf.pc + (nReady << log2Ceil(coreInstBytes))) & pcWordMask
ibufBTBResp.bridx := ibufBTBResp.bridx - nReady
}
when (io.imem.valid && nReady >= nBufValid && nICReady < nIC && n >= nIC - nICReady) {
val shamt = pcWordBits + nICReady
nBufValid := nIC - nICReady
buf := io.imem.bits
buf.data := shiftInsnRight(io.imem.bits.data, shamt)(n*coreInstBits-1,0)
buf.pc := io.imem.bits.pc & ~pcWordMask | (io.imem.bits.pc + (nICReady << log2Ceil(coreInstBytes))) & pcWordMask
ibufBTBHit := io.imem.bits.btb.valid
when (io.imem.bits.btb.valid) {
ibufBTBResp := io.imem.bits.btb.bits
ibufBTBResp.bridx := io.imem.bits.btb.bits.bridx + nICReady
}
}
when (io.kill) {
nBufValid := 0
}
}
val icShiftAmt = (fetchWidth + nBufValid - pcWordBits)(log2Ceil(fetchWidth), 0)
val icData = shiftInsnLeft(Cat(io.imem.bits.data, Fill(fetchWidth, io.imem.bits.data(coreInstBits-1, 0))), icShiftAmt)
.extract(3*fetchWidth*coreInstBits-1, 2*fetchWidth*coreInstBits)
val icMask = (~UInt(0, fetchWidth*coreInstBits) << (nBufValid << log2Ceil(coreInstBits)))(fetchWidth*coreInstBits-1,0)
val inst = icData & icMask | buf.data & ~icMask
val valid = (UIntToOH(nValid) - 1)(fetchWidth-1, 0)
val bufMask = UIntToOH(nBufValid) - 1
val xcpt_if = valid & (Mux(buf.xcpt_if, bufMask, UInt(0)) | Mux(io.imem.bits.xcpt_if, ~bufMask, UInt(0)))
val ic_replay = valid & (Mux(buf.replay, bufMask, UInt(0)) | Mux(io.imem.bits.replay, ~bufMask, UInt(0)))
val ibufBTBHitMask = Mux(ibufBTBHit, UIntToOH(ibufBTBResp.bridx), UInt(0))
val icBTBHitMask = Mux(io.imem.bits.btb.valid, UIntToOH(io.imem.bits.btb.bits.bridx +& nBufValid - pcWordBits), UInt(0))
val btbHitMask = ibufBTBHitMask & bufMask | icBTBHitMask & ~bufMask
io.btb_resp := Mux((ibufBTBHitMask & bufMask).orR, ibufBTBResp, io.imem.bits.btb.bits)
io.pc := Mux(nBufValid > 0, buf.pc, io.imem.bits.pc)
expand(0, 0, inst)
def expand(i: Int, j: UInt, curInst: UInt): Unit = if (i < retireWidth) {
val exp = Module(new RVCExpander)
exp.io.in := curInst
io.inst(i).bits.inst := exp.io.out
if (usingCompressed) {
val replay = ic_replay(j) || (!exp.io.rvc && (btbHitMask(j) || ic_replay(j+1)))
io.inst(i).valid := valid(j) && (exp.io.rvc || valid(j+1) || xcpt_if(j+1) || replay)
io.inst(i).bits.pf0 := xcpt_if(j)
io.inst(i).bits.pf1 := !exp.io.rvc && xcpt_if(j+1)
io.inst(i).bits.replay := replay
io.inst(i).bits.btb_hit := btbHitMask(j) || (!exp.io.rvc && btbHitMask(j+1))
io.inst(i).bits.rvc := exp.io.rvc
when (io.inst(i).fire()) { nReady := Mux(exp.io.rvc, j+1, j+2) }
expand(i+1, Mux(exp.io.rvc, j+1, j+2), Mux(exp.io.rvc, curInst >> 16, curInst >> 32))
} else {
when (io.inst(i).ready) { nReady := i+1 }
io.inst(i).valid := valid(i)
io.inst(i).bits.pf0 := xcpt_if(i)
io.inst(i).bits.pf1 := false
io.inst(i).bits.replay := ic_replay(i)
io.inst(i).bits.rvc := false
io.inst(i).bits.btb_hit := btbHitMask(i)
expand(i+1, null, curInst >> 32)
}
}
def shiftInsnLeft(in: UInt, dist: UInt) = {
val r = in.getWidth/coreInstBits
require(in.getWidth % coreInstBits == 0)
val data = Cat(Fill((1 << (log2Ceil(r) + 1)) - r, in >> (r-1)*coreInstBits), in)
data << (dist << log2Ceil(coreInstBits))
}
def shiftInsnRight(in: UInt, dist: UInt) = {
val r = in.getWidth/coreInstBits
require(in.getWidth % coreInstBits == 0)
val data = Cat(Fill((1 << (log2Ceil(r) + 1)) - r, in >> (r-1)*coreInstBits), in)
data >> (dist << log2Ceil(coreInstBits))
}
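// Both shifters pad the vector to a power-of-two number of instruction
// slots by replicating the top slot, so `dist` (counted in instruction
// slots, not bits) never shifts in undefined data.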
}

@@ -0,0 +1,157 @@
package rocket
import Chisel._
import uncore.agents._
import uncore.tilelink._
import uncore.util._
import Util._
import cde.{Parameters, Field}
trait HasL1CacheParameters extends HasCacheParameters with HasCoreParameters {
val outerDataBeats = p(TLKey(p(TLId))).dataBeats
val outerDataBits = p(TLKey(p(TLId))).dataBitsPerBeat
val refillCyclesPerBeat = outerDataBits/rowBits
val refillCycles = refillCyclesPerBeat*outerDataBeats
}
class ICacheReq(implicit p: Parameters) extends CoreBundle()(p) with HasL1CacheParameters {
val addr = UInt(width = vaddrBits)
}
class ICacheResp(implicit p: Parameters) extends CoreBundle()(p) with HasL1CacheParameters {
val data = Bits(width = coreInstBits)
val datablock = Bits(width = rowBits)
}
class ICache(latency: Int)(implicit p: Parameters) extends CoreModule()(p) with HasL1CacheParameters {
val io = new Bundle {
val req = Valid(new ICacheReq).flip
val s1_ppn = UInt(INPUT, ppnBits) // delayed one cycle w.r.t. req
val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req
val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission
val resp = Decoupled(new ICacheResp)
val invalidate = Bool(INPUT)
val mem = new ClientUncachedTileLinkIO
}
require(isPow2(nSets) && isPow2(nWays))
require(isPow2(coreInstBytes))
require(!usingVM || pgIdxBits >= untagBits)
val s_ready :: s_request :: s_refill_wait :: s_refill :: Nil = Enum(UInt(), 4)
val state = Reg(init=s_ready)
val invalidated = Reg(Bool())
val stall = !io.resp.ready
val rdy = Wire(Bool())
val refill_addr = Reg(UInt(width = paddrBits))
val s1_any_tag_hit = Wire(Bool())
val s1_valid = Reg(init=Bool(false))
val s1_vaddr = Reg(UInt())
val s1_paddr = Cat(io.s1_ppn, s1_vaddr(pgIdxBits-1,0))
val s1_tag = s1_paddr(tagBits+untagBits-1,untagBits)
val s0_valid = io.req.valid || s1_valid && stall
val s0_vaddr = Mux(s1_valid && stall, s1_vaddr, io.req.bits.addr)
s1_valid := io.req.valid && rdy || s1_valid && stall && !io.s1_kill
when (io.req.valid && rdy) {
s1_vaddr := io.req.bits.addr
}
val out_valid = s1_valid && !io.s1_kill && state === s_ready
val s1_idx = s1_vaddr(untagBits-1,blockOffBits)
val s1_hit = out_valid && s1_any_tag_hit
val s1_miss = out_valid && !s1_any_tag_hit
rdy := state === s_ready && !s1_miss
when (s1_miss && state === s_ready) {
refill_addr := s1_paddr
}
val refill_tag = refill_addr(tagBits+untagBits-1,untagBits)
val narrow_grant = FlowThroughSerializer(io.mem.grant, refillCyclesPerBeat)
val (refill_cnt, refill_wrap) = Counter(narrow_grant.fire(), refillCycles)
val refill_done = state === s_refill && refill_wrap
narrow_grant.ready := Bool(true)
val repl_way = if (isDM) UInt(0) else LFSR16(s1_miss)(log2Up(nWays)-1,0)
val entagbits = code.width(tagBits)
val tag_array = SeqMem(nSets, Vec(nWays, Bits(width = entagbits)))
val tag_rdata = tag_array.read(s0_vaddr(untagBits-1,blockOffBits), !refill_done && s0_valid)
when (refill_done) {
val tag = code.encode(refill_tag)
tag_array.write(s1_idx, Vec.fill(nWays)(tag), Vec.tabulate(nWays)(repl_way === _))
}
val vb_array = Reg(init=Bits(0, nSets*nWays))
when (refill_done && !invalidated) {
vb_array := vb_array.bitSet(Cat(repl_way, s1_idx), Bool(true))
}
when (io.invalidate) {
vb_array := Bits(0)
invalidated := Bool(true)
}
val s1_disparity = Wire(Vec(nWays, Bool()))
for (i <- 0 until nWays)
when (s1_valid && s1_disparity(i)) { vb_array := vb_array.bitSet(Cat(UInt(i), s1_idx), Bool(false)) }
val s1_tag_match = Wire(Vec(nWays, Bool()))
val s1_tag_hit = Wire(Vec(nWays, Bool()))
val s1_dout = Wire(Vec(nWays, Bits(width = rowBits)))
for (i <- 0 until nWays) {
val s1_vb = !io.invalidate && vb_array(Cat(UInt(i), s1_vaddr(untagBits-1,blockOffBits))).toBool
val tag_out = tag_rdata(i)
val s1_tag_disparity = code.decode(tag_out).error
s1_tag_match(i) := tag_out(tagBits-1,0) === s1_tag
s1_tag_hit(i) := s1_vb && s1_tag_match(i)
s1_disparity(i) := s1_vb && (s1_tag_disparity || code.decode(s1_dout(i)).error)
}
s1_any_tag_hit := s1_tag_hit.reduceLeft(_||_) && !s1_disparity.reduceLeft(_||_)
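// An ECC decode error in either the tag or the data row ("disparity")
// clears the line's valid bit and suppresses the hit, so corrupted lines
// are refilled rather than corrected in place.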
for (i <- 0 until nWays) {
val data_array = SeqMem(nSets * refillCycles, Bits(width = code.width(rowBits)))
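// One row per refill beat: the block at set s occupies rows s*refillCycles
// through s*refillCycles + refillCycles - 1, matching the
// (s1_idx << log2Ceil(refillCycles)) | refill_cnt write index below.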
val wen = narrow_grant.valid && repl_way === UInt(i)
when (wen) {
val e_d = code.encode(narrow_grant.bits.data)
data_array.write((s1_idx << log2Ceil(refillCycles)) | refill_cnt, e_d)
}
val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles))
s1_dout(i) := data_array.read(s0_raddr, !wen && s0_valid)
}
// output signals
latency match {
case 1 =>
io.resp.bits.datablock := Mux1H(s1_tag_hit, s1_dout)
io.resp.valid := s1_hit
case 2 =>
val s2_hit = RegEnable(s1_hit, !stall)
val s2_tag_hit = RegEnable(s1_tag_hit, !stall)
val s2_dout = RegEnable(s1_dout, !stall)
io.resp.bits.datablock := Mux1H(s2_tag_hit, s2_dout)
io.resp.valid := s2_hit
}
io.mem.acquire.valid := state === s_request && !io.s2_kill
io.mem.acquire.bits := GetBlock(addr_block = refill_addr >> blockOffBits)
// control state machine
switch (state) {
is (s_ready) {
when (s1_miss) { state := s_request }
invalidated := Bool(false)
}
is (s_request) {
when (io.mem.acquire.ready) { state := s_refill_wait }
when (io.s2_kill) { state := s_ready }
}
is (s_refill_wait) {
when (io.mem.grant.valid) { state := s_refill }
}
is (s_refill) {
when (refill_done) { state := s_ready }
}
}
}

@@ -0,0 +1,314 @@
// See LICENSE for license details
package rocket
import Chisel._
import Instructions._
import uncore.constants.MemoryOpConstants._
import ALU._
import cde.Parameters
import Util._
abstract trait DecodeConstants extends HasCoreParameters
{
val table: Array[(BitPat, List[BitPat])]
}
class IntCtrlSigs extends Bundle {
val legal = Bool()
val fp = Bool()
val rocc = Bool()
val branch = Bool()
val jal = Bool()
val jalr = Bool()
val rxs2 = Bool()
val rxs1 = Bool()
val sel_alu2 = Bits(width = A2_X.getWidth)
val sel_alu1 = Bits(width = A1_X.getWidth)
val sel_imm = Bits(width = IMM_X.getWidth)
val alu_dw = Bool()
val alu_fn = Bits(width = FN_X.getWidth)
val mem = Bool()
val mem_cmd = Bits(width = M_SZ)
val mem_type = Bits(width = MT_SZ)
val rfs1 = Bool()
val rfs2 = Bool()
val rfs3 = Bool()
val wfd = Bool()
val div = Bool()
val wxd = Bool()
val csr = Bits(width = CSR.SZ)
val fence_i = Bool()
val fence = Bool()
val amo = Bool()
def default: List[BitPat] =
// jal renf1 fence.i
// val | jalr | renf2 |
// | fp_val| | renx2 | | renf3 |
// | | rocc| | | renx1 s_alu1 mem_val | | | wfd |
// | | | br| | | | s_alu2 | imm dw alu | mem_cmd mem_type| | | | div |
// | | | | | | | | | | | | | | | | | | | | | wxd | fence
// | | | | | | | | | | | | | | | | | | | | | | csr | | amo
// | | | | | | | | | | | | | | | | | | | | | | | | | |
List(N,X,X,X,X,X,X,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, X,X,X,X,X,X,CSR.X,X,X,X)
def decode(inst: UInt, table: Iterable[(BitPat, List[BitPat])]) = {
val decoder = DecodeLogic(inst, default, table)
val sigs = Seq(legal, fp, rocc, branch, jal, jalr, rxs2, rxs1, sel_alu2,
sel_alu1, sel_imm, alu_dw, alu_fn, mem, mem_cmd, mem_type,
rfs1, rfs2, rfs3, wfd, div, wxd, csr, fence_i, fence, amo)
sigs zip decoder map {case(s,d) => s := d}
this
}
}
class IDecode(implicit val p: Parameters) extends DecodeConstants
{
val table: Array[(BitPat, List[BitPat])] = Array(
BNE-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SNE, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
BEQ-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SEQ, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
BLT-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SLT, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
BLTU-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SLTU, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
BGE-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SGE, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
BGEU-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SGEU, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
JAL-> List(Y,N,N,N,Y,N,N,N,A2_SIZE,A1_PC, IMM_UJ,DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
JALR-> List(Y,N,N,N,N,Y,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
AUIPC-> List(Y,N,N,N,N,N,N,N,A2_IMM, A1_PC, IMM_U, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
LB-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_B, N,N,N,N,N,Y,CSR.N,N,N,N),
LH-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_H, N,N,N,N,N,Y,CSR.N,N,N,N),
LW-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_W, N,N,N,N,N,Y,CSR.N,N,N,N),
LBU-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_BU,N,N,N,N,N,Y,CSR.N,N,N,N),
LHU-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_HU,N,N,N,N,N,Y,CSR.N,N,N,N),
SB-> List(Y,N,N,N,N,N,Y,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_B, N,N,N,N,N,N,CSR.N,N,N,N),
SH-> List(Y,N,N,N,N,N,Y,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_H, N,N,N,N,N,N,CSR.N,N,N,N),
SW-> List(Y,N,N,N,N,N,Y,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_W, N,N,N,N,N,N,CSR.N,N,N,N),
LUI-> List(Y,N,N,N,N,N,N,N,A2_IMM, A1_ZERO,IMM_U, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
ADDI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SLTI -> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_SLT, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SLTIU-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_SLTU, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
ANDI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_AND, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
ORI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_OR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
XORI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_XOR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SLLI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_SL, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SRLI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_SR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SRAI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_SRA, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
ADD-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SUB-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SUB, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SLT-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SLT, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SLTU-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SLTU, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
AND-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_AND, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
OR-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_OR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
XOR-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_XOR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SLL-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SL, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SRL-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SRA-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SRA, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
FENCE-> List(Y,N,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,Y,N),
FENCE_I-> List(Y,N,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, Y,M_FLUSH_ALL,MT_X, N,N,N,N,N,N,CSR.N,Y,N,N),
SCALL-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N),
SBREAK-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N),
MRET-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N),
WFI-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N),
CSRRW-> List(Y,N,N,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.W,N,N,N),
CSRRS-> List(Y,N,N,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.S,N,N,N),
CSRRC-> List(Y,N,N,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.C,N,N,N),
CSRRWI-> List(Y,N,N,N,N,N,N,N,A2_IMM, A1_ZERO,IMM_Z, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.W,N,N,N),
CSRRSI-> List(Y,N,N,N,N,N,N,N,A2_IMM, A1_ZERO,IMM_Z, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.S,N,N,N),
CSRRCI-> List(Y,N,N,N,N,N,N,N,A2_IMM, A1_ZERO,IMM_Z, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.C,N,N,N))
}
class SDecode(implicit val p: Parameters) extends DecodeConstants
{
val table: Array[(BitPat, List[BitPat])] = Array(
SFENCE_VM-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N),
SRET-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N))
}
class DebugDecode(implicit val p: Parameters) extends DecodeConstants
{
val table: Array[(BitPat, List[BitPat])] = Array(
DRET-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N))
}
class I64Decode(implicit val p: Parameters) extends DecodeConstants
{
val table: Array[(BitPat, List[BitPat])] = Array(
LD-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_D, N,N,N,N,N,Y,CSR.N,N,N,N),
LWU-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_WU,N,N,N,N,N,Y,CSR.N,N,N,N),
SD-> List(Y,N,N,N,N,N,Y,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_D, N,N,N,N,N,N,CSR.N,N,N,N),
ADDIW-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_32,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SLLIW-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_32,FN_SL, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SRLIW-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_32,FN_SR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SRAIW-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_32,FN_SRA, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
ADDW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SUBW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32,FN_SUB, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SLLW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32,FN_SL, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SRLW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32,FN_SR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
SRAW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32,FN_SRA, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N))
}
class MDecode(implicit val p: Parameters) extends DecodeConstants
{
val table: Array[(BitPat, List[BitPat])] = Array(
MUL-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_MUL, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N),
MULH-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_MULH, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N),
MULHU-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_MULHU, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N),
MULHSU-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_MULHSU,N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N),
DIV-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_DIV, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N),
DIVU-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_DIVU, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N),
REM-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_REM, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N),
REMU-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_REMU, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N))
}
class M64Decode(implicit val p: Parameters) extends DecodeConstants
{
val table: Array[(BitPat, List[BitPat])] = Array(
MULW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32, FN_MUL, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N),
DIVW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32, FN_DIV, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N),
DIVUW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32, FN_DIVU, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N),
REMW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32, FN_REM, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N),
REMUW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32, FN_REMU, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N))
}
class ADecode(implicit val p: Parameters) extends DecodeConstants
{
val table: Array[(BitPat, List[BitPat])] = Array(
AMOADD_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_ADD, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOXOR_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_XOR, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOSWAP_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_SWAP, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOAND_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_AND, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOOR_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_OR, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOMIN_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MIN, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOMINU_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MINU, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOMAX_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MAX, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOMAXU_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MAXU, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y),
LR_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XLR, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y),
SC_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XSC, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y))
}
class A64Decode(implicit val p: Parameters) extends DecodeConstants
{
val table: Array[(BitPat, List[BitPat])] = Array(
AMOADD_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_ADD, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOSWAP_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_SWAP, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOXOR_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_XOR, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOAND_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_AND, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOOR_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_OR, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOMIN_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MIN, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOMINU_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MINU, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOMAX_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MAX, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y),
AMOMAXU_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MAXU, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y),
LR_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XLR, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y),
SC_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XSC, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y))
}
class FDecode(implicit val p: Parameters) extends DecodeConstants
{
val table: Array[(BitPat, List[BitPat])] = Array(
FCVT_S_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,Y,N,N,CSR.N,N,N,N),
FCVT_D_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,Y,N,N,CSR.N,N,N,N),
FSGNJ_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FSGNJ_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FSGNJX_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FSGNJX_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FSGNJN_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FSGNJN_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FMIN_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FMIN_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FMAX_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FMAX_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FADD_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FADD_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FSUB_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FSUB_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FMUL_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FMUL_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FMADD_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N),
FMADD_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N),
FMSUB_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N),
FMSUB_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N),
FNMADD_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N),
FNMADD_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N),
FNMSUB_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N),
FNMSUB_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N),
FCLASS_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FCLASS_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FMV_X_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FCVT_W_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FCVT_W_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FCVT_WU_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FCVT_WU_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FEQ_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N),
FEQ_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N),
FLT_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N),
FLT_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N),
FLE_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N),
FLE_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N),
FMV_S_X-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N),
FCVT_S_W-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N),
FCVT_D_W-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N),
FCVT_S_WU-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N),
FCVT_D_WU-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N),
FLW-> List(Y,Y,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_W, N,N,N,Y,N,N,CSR.N,N,N,N),
FLD-> List(Y,Y,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_D, N,N,N,Y,N,N,CSR.N,N,N,N),
FSW-> List(Y,Y,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_W, N,Y,N,N,N,N,CSR.N,N,N,N),
FSD-> List(Y,Y,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_D, N,Y,N,N,N,N,CSR.N,N,N,N))
}
class F64Decode(implicit val p: Parameters) extends DecodeConstants
{
val table: Array[(BitPat, List[BitPat])] = Array(
FMV_X_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FCVT_L_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FCVT_L_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FCVT_LU_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FCVT_LU_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N),
FMV_D_X-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N),
FCVT_S_L-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N),
FCVT_D_L-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N),
FCVT_S_LU-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N),
FCVT_D_LU-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N),
FDIV_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FDIV_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FSQRT_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N),
FSQRT_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N))
}
class RoCCDecode(implicit val p: Parameters) extends DecodeConstants
{
val table: Array[(BitPat, List[BitPat])] = Array(
CUSTOM0-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM0_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM0_RS1_RS2-> List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM0_RD-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
CUSTOM0_RD_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
CUSTOM0_RD_RS1_RS2->List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
CUSTOM1-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM1_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM1_RS1_RS2-> List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM1_RD-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
CUSTOM1_RD_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
CUSTOM1_RD_RS1_RS2->List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
CUSTOM2-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM2_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM2_RS1_RS2-> List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM2_RD-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
CUSTOM2_RD_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
CUSTOM2_RD_RS1_RS2->List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
CUSTOM3-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM3_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM3_RS1_RS2-> List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N),
CUSTOM3_RD-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
CUSTOM3_RD_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N),
CUSTOM3_RD_RS1_RS2->List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N))
}


@@ -0,0 +1,383 @@
// See LICENSE for license details.
package rocket
import Chisel._
/* Automatically generated by parse-opcodes */
object Instructions {
def BEQ = BitPat("b?????????????????000?????1100011")
def BNE = BitPat("b?????????????????001?????1100011")
def BLT = BitPat("b?????????????????100?????1100011")
def BGE = BitPat("b?????????????????101?????1100011")
def BLTU = BitPat("b?????????????????110?????1100011")
def BGEU = BitPat("b?????????????????111?????1100011")
def JALR = BitPat("b?????????????????000?????1100111")
def JAL = BitPat("b?????????????????????????1101111")
def LUI = BitPat("b?????????????????????????0110111")
def AUIPC = BitPat("b?????????????????????????0010111")
def ADDI = BitPat("b?????????????????000?????0010011")
def SLLI = BitPat("b000000???????????001?????0010011")
def SLTI = BitPat("b?????????????????010?????0010011")
def SLTIU = BitPat("b?????????????????011?????0010011")
def XORI = BitPat("b?????????????????100?????0010011")
def SRLI = BitPat("b000000???????????101?????0010011")
def SRAI = BitPat("b010000???????????101?????0010011")
def ORI = BitPat("b?????????????????110?????0010011")
def ANDI = BitPat("b?????????????????111?????0010011")
def ADD = BitPat("b0000000??????????000?????0110011")
def SUB = BitPat("b0100000??????????000?????0110011")
def SLL = BitPat("b0000000??????????001?????0110011")
def SLT = BitPat("b0000000??????????010?????0110011")
def SLTU = BitPat("b0000000??????????011?????0110011")
def XOR = BitPat("b0000000??????????100?????0110011")
def SRL = BitPat("b0000000??????????101?????0110011")
def SRA = BitPat("b0100000??????????101?????0110011")
def OR = BitPat("b0000000??????????110?????0110011")
def AND = BitPat("b0000000??????????111?????0110011")
def ADDIW = BitPat("b?????????????????000?????0011011")
def SLLIW = BitPat("b0000000??????????001?????0011011")
def SRLIW = BitPat("b0000000??????????101?????0011011")
def SRAIW = BitPat("b0100000??????????101?????0011011")
def ADDW = BitPat("b0000000??????????000?????0111011")
def SUBW = BitPat("b0100000??????????000?????0111011")
def SLLW = BitPat("b0000000??????????001?????0111011")
def SRLW = BitPat("b0000000??????????101?????0111011")
def SRAW = BitPat("b0100000??????????101?????0111011")
def LB = BitPat("b?????????????????000?????0000011")
def LH = BitPat("b?????????????????001?????0000011")
def LW = BitPat("b?????????????????010?????0000011")
def LD = BitPat("b?????????????????011?????0000011")
def LBU = BitPat("b?????????????????100?????0000011")
def LHU = BitPat("b?????????????????101?????0000011")
def LWU = BitPat("b?????????????????110?????0000011")
def SB = BitPat("b?????????????????000?????0100011")
def SH = BitPat("b?????????????????001?????0100011")
def SW = BitPat("b?????????????????010?????0100011")
def SD = BitPat("b?????????????????011?????0100011")
def FENCE = BitPat("b?????????????????000?????0001111")
def FENCE_I = BitPat("b?????????????????001?????0001111")
def MUL = BitPat("b0000001??????????000?????0110011")
def MULH = BitPat("b0000001??????????001?????0110011")
def MULHSU = BitPat("b0000001??????????010?????0110011")
def MULHU = BitPat("b0000001??????????011?????0110011")
def DIV = BitPat("b0000001??????????100?????0110011")
def DIVU = BitPat("b0000001??????????101?????0110011")
def REM = BitPat("b0000001??????????110?????0110011")
def REMU = BitPat("b0000001??????????111?????0110011")
def MULW = BitPat("b0000001??????????000?????0111011")
def DIVW = BitPat("b0000001??????????100?????0111011")
def DIVUW = BitPat("b0000001??????????101?????0111011")
def REMW = BitPat("b0000001??????????110?????0111011")
def REMUW = BitPat("b0000001??????????111?????0111011")
def AMOADD_W = BitPat("b00000????????????010?????0101111")
def AMOXOR_W = BitPat("b00100????????????010?????0101111")
def AMOOR_W = BitPat("b01000????????????010?????0101111")
def AMOAND_W = BitPat("b01100????????????010?????0101111")
def AMOMIN_W = BitPat("b10000????????????010?????0101111")
def AMOMAX_W = BitPat("b10100????????????010?????0101111")
def AMOMINU_W = BitPat("b11000????????????010?????0101111")
def AMOMAXU_W = BitPat("b11100????????????010?????0101111")
def AMOSWAP_W = BitPat("b00001????????????010?????0101111")
def LR_W = BitPat("b00010??00000?????010?????0101111")
def SC_W = BitPat("b00011????????????010?????0101111")
def AMOADD_D = BitPat("b00000????????????011?????0101111")
def AMOXOR_D = BitPat("b00100????????????011?????0101111")
def AMOOR_D = BitPat("b01000????????????011?????0101111")
def AMOAND_D = BitPat("b01100????????????011?????0101111")
def AMOMIN_D = BitPat("b10000????????????011?????0101111")
def AMOMAX_D = BitPat("b10100????????????011?????0101111")
def AMOMINU_D = BitPat("b11000????????????011?????0101111")
def AMOMAXU_D = BitPat("b11100????????????011?????0101111")
def AMOSWAP_D = BitPat("b00001????????????011?????0101111")
def LR_D = BitPat("b00010??00000?????011?????0101111")
def SC_D = BitPat("b00011????????????011?????0101111")
def ECALL = BitPat("b00000000000000000000000001110011")
def EBREAK = BitPat("b00000000000100000000000001110011")
def URET = BitPat("b00000000001000000000000001110011")
def SRET = BitPat("b00010000001000000000000001110011")
def HRET = BitPat("b00100000001000000000000001110011")
def MRET = BitPat("b00110000001000000000000001110011")
def DRET = BitPat("b01111011001000000000000001110011")
def SFENCE_VM = BitPat("b000100000100?????000000001110011")
def WFI = BitPat("b00010000010100000000000001110011")
def CSRRW = BitPat("b?????????????????001?????1110011")
def CSRRS = BitPat("b?????????????????010?????1110011")
def CSRRC = BitPat("b?????????????????011?????1110011")
def CSRRWI = BitPat("b?????????????????101?????1110011")
def CSRRSI = BitPat("b?????????????????110?????1110011")
def CSRRCI = BitPat("b?????????????????111?????1110011")
def FADD_S = BitPat("b0000000??????????????????1010011")
def FSUB_S = BitPat("b0000100??????????????????1010011")
def FMUL_S = BitPat("b0001000??????????????????1010011")
def FDIV_S = BitPat("b0001100??????????????????1010011")
def FSGNJ_S = BitPat("b0010000??????????000?????1010011")
def FSGNJN_S = BitPat("b0010000??????????001?????1010011")
def FSGNJX_S = BitPat("b0010000??????????010?????1010011")
def FMIN_S = BitPat("b0010100??????????000?????1010011")
def FMAX_S = BitPat("b0010100??????????001?????1010011")
def FSQRT_S = BitPat("b010110000000?????????????1010011")
def FADD_D = BitPat("b0000001??????????????????1010011")
def FSUB_D = BitPat("b0000101??????????????????1010011")
def FMUL_D = BitPat("b0001001??????????????????1010011")
def FDIV_D = BitPat("b0001101??????????????????1010011")
def FSGNJ_D = BitPat("b0010001??????????000?????1010011")
def FSGNJN_D = BitPat("b0010001??????????001?????1010011")
def FSGNJX_D = BitPat("b0010001??????????010?????1010011")
def FMIN_D = BitPat("b0010101??????????000?????1010011")
def FMAX_D = BitPat("b0010101??????????001?????1010011")
def FCVT_S_D = BitPat("b010000000001?????????????1010011")
def FCVT_D_S = BitPat("b010000100000?????????????1010011")
def FSQRT_D = BitPat("b010110100000?????????????1010011")
def FLE_S = BitPat("b1010000??????????000?????1010011")
def FLT_S = BitPat("b1010000??????????001?????1010011")
def FEQ_S = BitPat("b1010000??????????010?????1010011")
def FLE_D = BitPat("b1010001??????????000?????1010011")
def FLT_D = BitPat("b1010001??????????001?????1010011")
def FEQ_D = BitPat("b1010001??????????010?????1010011")
def FCVT_W_S = BitPat("b110000000000?????????????1010011")
def FCVT_WU_S = BitPat("b110000000001?????????????1010011")
def FCVT_L_S = BitPat("b110000000010?????????????1010011")
def FCVT_LU_S = BitPat("b110000000011?????????????1010011")
def FMV_X_S = BitPat("b111000000000?????000?????1010011")
def FCLASS_S = BitPat("b111000000000?????001?????1010011")
def FCVT_W_D = BitPat("b110000100000?????????????1010011")
def FCVT_WU_D = BitPat("b110000100001?????????????1010011")
def FCVT_L_D = BitPat("b110000100010?????????????1010011")
def FCVT_LU_D = BitPat("b110000100011?????????????1010011")
def FMV_X_D = BitPat("b111000100000?????000?????1010011")
def FCLASS_D = BitPat("b111000100000?????001?????1010011")
def FCVT_S_W = BitPat("b110100000000?????????????1010011")
def FCVT_S_WU = BitPat("b110100000001?????????????1010011")
def FCVT_S_L = BitPat("b110100000010?????????????1010011")
def FCVT_S_LU = BitPat("b110100000011?????????????1010011")
def FMV_S_X = BitPat("b111100000000?????000?????1010011")
def FCVT_D_W = BitPat("b110100100000?????????????1010011")
def FCVT_D_WU = BitPat("b110100100001?????????????1010011")
def FCVT_D_L = BitPat("b110100100010?????????????1010011")
def FCVT_D_LU = BitPat("b110100100011?????????????1010011")
def FMV_D_X = BitPat("b111100100000?????000?????1010011")
def FLW = BitPat("b?????????????????010?????0000111")
def FLD = BitPat("b?????????????????011?????0000111")
def FSW = BitPat("b?????????????????010?????0100111")
def FSD = BitPat("b?????????????????011?????0100111")
def FMADD_S = BitPat("b?????00??????????????????1000011")
def FMSUB_S = BitPat("b?????00??????????????????1000111")
def FNMSUB_S = BitPat("b?????00??????????????????1001011")
def FNMADD_S = BitPat("b?????00??????????????????1001111")
def FMADD_D = BitPat("b?????01??????????????????1000011")
def FMSUB_D = BitPat("b?????01??????????????????1000111")
def FNMSUB_D = BitPat("b?????01??????????????????1001011")
def FNMADD_D = BitPat("b?????01??????????????????1001111")
def CUSTOM0 = BitPat("b?????????????????000?????0001011")
def CUSTOM0_RS1 = BitPat("b?????????????????010?????0001011")
def CUSTOM0_RS1_RS2 = BitPat("b?????????????????011?????0001011")
def CUSTOM0_RD = BitPat("b?????????????????100?????0001011")
def CUSTOM0_RD_RS1 = BitPat("b?????????????????110?????0001011")
def CUSTOM0_RD_RS1_RS2 = BitPat("b?????????????????111?????0001011")
def CUSTOM1 = BitPat("b?????????????????000?????0101011")
def CUSTOM1_RS1 = BitPat("b?????????????????010?????0101011")
def CUSTOM1_RS1_RS2 = BitPat("b?????????????????011?????0101011")
def CUSTOM1_RD = BitPat("b?????????????????100?????0101011")
def CUSTOM1_RD_RS1 = BitPat("b?????????????????110?????0101011")
def CUSTOM1_RD_RS1_RS2 = BitPat("b?????????????????111?????0101011")
def CUSTOM2 = BitPat("b?????????????????000?????1011011")
def CUSTOM2_RS1 = BitPat("b?????????????????010?????1011011")
def CUSTOM2_RS1_RS2 = BitPat("b?????????????????011?????1011011")
def CUSTOM2_RD = BitPat("b?????????????????100?????1011011")
def CUSTOM2_RD_RS1 = BitPat("b?????????????????110?????1011011")
def CUSTOM2_RD_RS1_RS2 = BitPat("b?????????????????111?????1011011")
def CUSTOM3 = BitPat("b?????????????????000?????1111011")
def CUSTOM3_RS1 = BitPat("b?????????????????010?????1111011")
def CUSTOM3_RS1_RS2 = BitPat("b?????????????????011?????1111011")
def CUSTOM3_RD = BitPat("b?????????????????100?????1111011")
def CUSTOM3_RD_RS1 = BitPat("b?????????????????110?????1111011")
def CUSTOM3_RD_RS1_RS2 = BitPat("b?????????????????111?????1111011")
def SLLI_RV32 = BitPat("b0000000??????????001?????0010011")
def SRLI_RV32 = BitPat("b0000000??????????101?????0010011")
def SRAI_RV32 = BitPat("b0100000??????????101?????0010011")
def FRFLAGS = BitPat("b00000000000100000010?????1110011")
def FSFLAGS = BitPat("b000000000001?????001?????1110011")
def FSFLAGSI = BitPat("b000000000001?????101?????1110011")
def FRRM = BitPat("b00000000001000000010?????1110011")
def FSRM = BitPat("b000000000010?????001?????1110011")
def FSRMI = BitPat("b000000000010?????101?????1110011")
def FSCSR = BitPat("b000000000011?????001?????1110011")
def FRCSR = BitPat("b00000000001100000010?????1110011")
def RDCYCLE = BitPat("b11000000000000000010?????1110011")
def RDTIME = BitPat("b11000000000100000010?????1110011")
def RDINSTRET = BitPat("b11000000001000000010?????1110011")
def RDCYCLEH = BitPat("b11001000000000000010?????1110011")
def RDTIMEH = BitPat("b11001000000100000010?????1110011")
def RDINSTRETH = BitPat("b11001000001000000010?????1110011")
def SCALL = BitPat("b00000000000000000000000001110011")
def SBREAK = BitPat("b00000000000100000000000001110011")
}
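// Each pattern above is ternary: '?' bits are don't-cares, so a BitPat
// reduces to a (mask, value) pair and a word matches when
// (word & mask) == value. A minimal plain-Scala sketch of that rule
// (illustrative only, not part of the rocket sources):
object BitPatSketch extends App {
  // Parse a "b..." pattern string, MSB first, into (mask, value).
  def parse(pat: String): (BigInt, BigInt) =
    pat.drop(1).foldLeft((BigInt(0), BigInt(0))) { case ((mask, value), c) => c match {
      case '0' => ((mask << 1) | 1, value << 1)
      case '1' => ((mask << 1) | 1, (value << 1) | 1)
      case '?' => (mask << 1, value << 1)
    }}
  def matches(pat: String, word: BigInt): Boolean = {
    val (mask, value) = parse(pat)
    (word & mask) == value
  }
  // ADDI x1, x0, 1 assembles to 0x00100093 and matches the ADDI pattern above.
  println(matches("b?????????????????000?????0010011", BigInt("00100093", 16))) // true
}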
object Causes {
val misaligned_fetch = 0x0
val fault_fetch = 0x1
val illegal_instruction = 0x2
val breakpoint = 0x3
val misaligned_load = 0x4
val fault_load = 0x5
val misaligned_store = 0x6
val fault_store = 0x7
val user_ecall = 0x8
val supervisor_ecall = 0x9
val hypervisor_ecall = 0xa
val machine_ecall = 0xb
val all = {
val res = collection.mutable.ArrayBuffer[Int]()
res += misaligned_fetch
res += fault_fetch
res += illegal_instruction
res += breakpoint
res += misaligned_load
res += fault_load
res += misaligned_store
res += fault_store
res += user_ecall
res += supervisor_ecall
res += hypervisor_ecall
res += machine_ecall
res.toArray
}
}
object CSRs {
val fflags = 0x1
val frm = 0x2
val fcsr = 0x3
val cycle = 0xc00
val time = 0xc01
val instret = 0xc02
val sstatus = 0x100
val sie = 0x104
val stvec = 0x105
val sscratch = 0x140
val sepc = 0x141
val scause = 0x142
val sbadaddr = 0x143
val sip = 0x144
val sptbr = 0x180
val scycle = 0xd00
val stime = 0xd01
val sinstret = 0xd02
val mstatus = 0x300
val medeleg = 0x302
val mideleg = 0x303
val mie = 0x304
val mtvec = 0x305
val mscratch = 0x340
val mepc = 0x341
val mcause = 0x342
val mbadaddr = 0x343
val mip = 0x344
val mucounteren = 0x310
val mscounteren = 0x311
val mucycle_delta = 0x700
val mutime_delta = 0x701
val muinstret_delta = 0x702
val mscycle_delta = 0x704
val mstime_delta = 0x705
val msinstret_delta = 0x706
val tdrselect = 0x7a0
val tdrdata1 = 0x7a1
val tdrdata2 = 0x7a2
val tdrdata3 = 0x7a3
val dcsr = 0x7b0
val dpc = 0x7b1
val dscratch = 0x7b2
val mcycle = 0xf00
val mtime = 0xf01
val minstret = 0xf02
val misa = 0xf10
val mvendorid = 0xf11
val marchid = 0xf12
val mimpid = 0xf13
val mhartid = 0xf14
val mreset = 0x7c2
val cycleh = 0xc80
val timeh = 0xc81
val instreth = 0xc82
val mucycle_deltah = 0x780
val mutime_deltah = 0x781
val muinstret_deltah = 0x782
val mscycle_deltah = 0x784
val mstime_deltah = 0x785
val msinstret_deltah = 0x786
val mcycleh = 0xf80
val mtimeh = 0xf81
val minstreth = 0xf82
val all = {
val res = collection.mutable.ArrayBuffer[Int]()
res += fflags
res += frm
res += fcsr
res += cycle
res += time
res += instret
res += sstatus
res += sie
res += stvec
res += sscratch
res += sepc
res += scause
res += sbadaddr
res += sip
res += sptbr
res += scycle
res += stime
res += sinstret
res += mstatus
res += medeleg
res += mideleg
res += mie
res += mtvec
res += mscratch
res += mepc
res += mcause
res += mbadaddr
res += mip
res += mucounteren
res += mscounteren
res += mucycle_delta
res += mutime_delta
res += muinstret_delta
res += mscycle_delta
res += mstime_delta
res += msinstret_delta
res += tdrselect
res += tdrdata1
res += tdrdata2
res += tdrdata3
res += dcsr
res += dpc
res += dscratch
res += mcycle
res += mtime
res += minstret
res += misa
res += mvendorid
res += marchid
res += mimpid
res += mhartid
res += mreset
res.toArray
}
val all32 = {
val res = collection.mutable.ArrayBuffer(all:_*)
res += cycleh
res += timeh
res += instreth
res += mucycle_deltah
res += mutime_deltah
res += muinstret_deltah
res += mscycle_deltah
res += mstime_deltah
res += msinstret_deltah
res += mcycleh
res += mtimeh
res += minstreth
res.toArray
}
}
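// The numbering above follows the privileged-spec CSR address convention (a
// property of the ISA, not of this file): bits [11:10] of the address encode
// read/write vs. read-only, and bits [9:8] the lowest privilege level that
// may access the register. A small sketch, illustrative only:
object CsrAddrSketch extends App {
  def readOnly(addr: Int): Boolean = ((addr >> 10) & 0x3) == 0x3
  def minPriv(addr: Int): Int = (addr >> 8) & 0x3 // 0 = user, 1 = supervisor, 3 = machine
  println(s"cycle:   readOnly=${readOnly(0xc00)} minPriv=${minPriv(0xc00)}") // true, 0
  println(s"mstatus: readOnly=${readOnly(0x300)} minPriv=${minPriv(0x300)}") // false, 3
}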


@@ -0,0 +1,154 @@
// See LICENSE for license details.
package rocket
import Chisel._
import ALU._
import Util._
class MultiplierReq(dataBits: Int, tagBits: Int) extends Bundle {
val fn = Bits(width = SZ_ALU_FN)
val dw = Bits(width = SZ_DW)
val in1 = Bits(width = dataBits)
val in2 = Bits(width = dataBits)
val tag = UInt(width = tagBits)
override def cloneType = new MultiplierReq(dataBits, tagBits).asInstanceOf[this.type]
}
class MultiplierResp(dataBits: Int, tagBits: Int) extends Bundle {
val data = Bits(width = dataBits)
val tag = UInt(width = tagBits)
override def cloneType = new MultiplierResp(dataBits, tagBits).asInstanceOf[this.type]
}
class MultiplierIO(dataBits: Int, tagBits: Int) extends Bundle {
val req = Decoupled(new MultiplierReq(dataBits, tagBits)).flip
val kill = Bool(INPUT)
val resp = Decoupled(new MultiplierResp(dataBits, tagBits))
}
case class MulDivConfig(
mulUnroll: Int = 1,
mulEarlyOut: Boolean = false,
divEarlyOut: Boolean = false
)
class MulDiv(cfg: MulDivConfig, width: Int, nXpr: Int = 32) extends Module {
val io = new MultiplierIO(width, log2Up(nXpr))
val w = io.req.bits.in1.getWidth
val mulw = (w + cfg.mulUnroll - 1) / cfg.mulUnroll * cfg.mulUnroll
val s_ready :: s_neg_inputs :: s_busy :: s_move_rem :: s_neg_output :: s_done :: Nil = Enum(UInt(), 6)
val state = Reg(init=s_ready)
val req = Reg(io.req.bits)
val count = Reg(UInt(width = log2Up(w+1)))
val neg_out = Reg(Bool())
val isMul = Reg(Bool())
val isHi = Reg(Bool())
val divisor = Reg(Bits(width = w+1)) // div only needs w bits
val remainder = Reg(Bits(width = 2*mulw+2)) // div only needs 2*w+1 bits
val cmdMul :: cmdHi :: lhsSigned :: rhsSigned :: Nil =
DecodeLogic(io.req.bits.fn, List(X, X, X, X), List(
FN_DIV -> List(N, N, Y, Y),
FN_REM -> List(N, Y, Y, Y),
FN_DIVU -> List(N, N, N, N),
FN_REMU -> List(N, Y, N, N),
FN_MUL -> List(Y, N, X, X),
FN_MULH -> List(Y, Y, Y, Y),
FN_MULHU -> List(Y, Y, N, N),
FN_MULHSU -> List(Y, Y, Y, N))).map(_ toBool)
require(w == 32 || w == 64)
def halfWidth(req: MultiplierReq) = Bool(w > 32) && req.dw === DW_32
def sext(x: Bits, halfW: Bool, signed: Bool) = {
val sign = signed && Mux(halfW, x(w/2-1), x(w-1))
val hi = Mux(halfW, Fill(w/2, sign), x(w-1,w/2))
(Cat(hi, x(w/2-1,0)), sign)
}
val (lhs_in, lhs_sign) = sext(io.req.bits.in1, halfWidth(io.req.bits), lhsSigned)
val (rhs_in, rhs_sign) = sext(io.req.bits.in2, halfWidth(io.req.bits), rhsSigned)
val subtractor = remainder(2*w,w) - divisor(w,0)
val less = subtractor(w)
val negated_remainder = -remainder(w-1,0)
when (state === s_neg_inputs) {
when (remainder(w-1) || isMul) {
remainder := negated_remainder
}
when (divisor(w-1) || isMul) {
divisor := subtractor
}
state := s_busy
}
when (state === s_neg_output) {
remainder := negated_remainder
state := s_done
}
when (state === s_move_rem) {
remainder := remainder(2*w, w+1)
state := Mux(neg_out, s_neg_output, s_done)
}
when (state === s_busy && isMul) {
val mulReg = Cat(remainder(2*mulw+1,w+1),remainder(w-1,0))
val mplier = mulReg(mulw-1,0)
val accum = mulReg(2*mulw,mulw).asSInt
val mpcand = divisor.asSInt
val prod = mplier(cfg.mulUnroll-1, 0) * mpcand + accum
val nextMulReg = Cat(prod, mplier(mulw-1, cfg.mulUnroll))
val eOutMask = (SInt(BigInt(-1) << mulw) >> (count * cfg.mulUnroll)(log2Up(mulw)-1,0))(mulw-1,0)
val eOut = Bool(cfg.mulEarlyOut) && count =/= mulw/cfg.mulUnroll-1 && count =/= 0 &&
!isHi && (mplier & ~eOutMask) === UInt(0)
val eOutRes = (mulReg >> (mulw - count * cfg.mulUnroll)(log2Up(mulw)-1,0))
val nextMulReg1 = Cat(nextMulReg(2*mulw,mulw), Mux(eOut, eOutRes, nextMulReg)(mulw-1,0))
remainder := Cat(nextMulReg1 >> w, Bool(false), nextMulReg1(w-1,0))
count := count + 1
when (eOut || count === mulw/cfg.mulUnroll-1) {
state := Mux(isHi, s_move_rem, s_done)
}
}
when (state === s_busy && !isMul) {
when (count === w) {
state := Mux(isHi, s_move_rem, Mux(neg_out, s_neg_output, s_done))
}
count := count + 1
remainder := Cat(Mux(less, remainder(2*w-1,w), subtractor(w-1,0)), remainder(w-1,0), !less)
val divisorMSB = Log2(divisor(w-1,0), w)
val dividendMSB = Log2(remainder(w-1,0), w)
val eOutPos = UInt(w-1) + divisorMSB - dividendMSB
val eOutZero = divisorMSB > dividendMSB
val eOut = count === 0 && less /* not divby0 */ && (eOutPos > 0 || eOutZero)
when (Bool(cfg.divEarlyOut) && eOut) {
val shift = Mux(eOutZero, UInt(w-1), eOutPos(log2Up(w)-1,0))
remainder := remainder(w-1,0) << shift
count := shift
}
when (count === 0 && !less /* divby0 */ && !isHi) { neg_out := false }
}
when (io.resp.fire() || io.kill) {
state := s_ready
}
when (io.req.fire()) {
state := Mux(lhs_sign || rhs_sign && !cmdMul, s_neg_inputs, s_busy)
isMul := cmdMul
isHi := cmdHi
count := 0
neg_out := !cmdMul && Mux(cmdHi, lhs_sign, lhs_sign =/= rhs_sign)
divisor := Cat(rhs_sign, rhs_in)
remainder := lhs_in
req := io.req.bits
}
io.resp.bits := req
io.resp.bits.data := Mux(halfWidth(req), Cat(Fill(w/2, remainder(w/2-1)), remainder(w/2-1,0)), remainder(w-1,0))
io.resp.valid := state === s_done
io.req.ready := state === s_ready
}
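// In the divide path above, `remainder` holds the partial remainder in its
// upper half and collects quotient bits (!less) in the low bits as it shifts
// left once per cycle: a restoring divider. The same recurrence with the
// register bookkeeping simplified, as a plain-Scala sketch (illustrative
// only; unsigned operands, no early out):
object RestoringDivSketch extends App {
  def divrem(x: BigInt, d: BigInt, w: Int): (BigInt, BigInt) = {
    var rem = BigInt(0); var quo = BigInt(0)
    for (i <- (w - 1) to 0 by -1) {
      rem = (rem << 1) | ((x >> i) & 1)       // shift in the next dividend bit
      val sub = rem - d                        // the `subtractor` above
      val less = sub < 0                       // ... and its sign bit, `less`
      if (!less) rem = sub                     // commit the subtraction; on underflow, "restore" by keeping rem
      quo = (quo << 1) | (if (less) 0 else 1)  // !less becomes the next quotient bit
    }
    (quo, rem)
  }
  println(divrem(BigInt(100), BigInt(7), 32)) // (14, 2)
}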

File diff suppressed because it is too large


@@ -0,0 +1,4 @@
// See LICENSE for license details.
package object rocket extends
rocket.constants.ScalarOpConstants


@@ -0,0 +1,218 @@
// See LICENSE for license details.
package rocket
import Chisel._
import uncore.agents._
import uncore.constants._
import Util._
import uncore.util._
import cde.{Parameters, Field}
class PTWReq(implicit p: Parameters) extends CoreBundle()(p) {
val prv = Bits(width = 2)
val pum = Bool()
val mxr = Bool()
val addr = UInt(width = vpnBits)
val store = Bool()
val fetch = Bool()
}
class PTWResp(implicit p: Parameters) extends CoreBundle()(p) {
val pte = new PTE
}
class TLBPTWIO(implicit p: Parameters) extends CoreBundle()(p) {
val req = Decoupled(new PTWReq)
val resp = Valid(new PTWResp).flip
val ptbr = new PTBR().asInput
val invalidate = Bool(INPUT)
val status = new MStatus().asInput
}
class DatapathPTWIO(implicit p: Parameters) extends CoreBundle()(p) {
val ptbr = new PTBR().asInput
val invalidate = Bool(INPUT)
val status = new MStatus().asInput
}
class PTE(implicit p: Parameters) extends CoreBundle()(p) {
val reserved_for_hardware = Bits(width = 16)
val ppn = UInt(width = 38)
val reserved_for_software = Bits(width = 2)
val d = Bool()
val a = Bool()
val g = Bool()
val u = Bool()
val x = Bool()
val w = Bool()
val r = Bool()
val v = Bool()
def table(dummy: Int = 0) = v && !r && !w && !x
def leaf(dummy: Int = 0) = v && (r || (x && !w))
def ur(dummy: Int = 0) = sr() && u
def uw(dummy: Int = 0) = sw() && u
def ux(dummy: Int = 0) = sx() && u
def sr(dummy: Int = 0) = leaf() && r
def sw(dummy: Int = 0) = leaf() && w
def sx(dummy: Int = 0) = leaf() && x
def access_ok(req: PTWReq) = {
val perm_ok = Mux(req.fetch, x, Mux(req.store, w, r || (x && req.mxr)))
val priv_ok = Mux(u, !req.pum, req.prv(0))
leaf() && priv_ok && perm_ok
}
}
class PTW(n: Int)(implicit p: Parameters) extends CoreModule()(p) {
val io = new Bundle {
val requestor = Vec(n, new TLBPTWIO).flip
val mem = new HellaCacheIO
val dpath = new DatapathPTWIO
}
require(usingAtomics, "PTW requires atomic memory operations")
val s_ready :: s_req :: s_wait1 :: s_wait2 :: s_set_dirty :: s_wait1_dirty :: s_wait2_dirty :: s_done :: Nil = Enum(UInt(), 8)
val state = Reg(init=s_ready)
val count = Reg(UInt(width = log2Up(pgLevels)))
val s1_kill = Reg(next = Bool(false))
val r_req = Reg(new PTWReq)
val r_req_dest = Reg(Bits())
val r_pte = Reg(new PTE)
val vpn_idxs = (0 until pgLevels).map(i => (r_req.addr >> (pgLevels-i-1)*pgLevelBits)(pgLevelBits-1,0))
val vpn_idx = vpn_idxs(count)
val arb = Module(new RRArbiter(new PTWReq, n))
arb.io.in <> io.requestor.map(_.req)
arb.io.out.ready := state === s_ready
val pte = {
val tmp = new PTE().fromBits(io.mem.resp.bits.data)
val res = Wire(init = new PTE().fromBits(io.mem.resp.bits.data))
res.ppn := tmp.ppn(ppnBits-1, 0)
when ((tmp.ppn >> ppnBits) =/= 0) { res.v := false }
res
}
val pte_addr = Cat(r_pte.ppn, vpn_idx) << log2Ceil(xLen/8)
when (arb.io.out.fire()) {
r_req := arb.io.out.bits
r_req_dest := arb.io.chosen
r_pte.ppn := io.dpath.ptbr.ppn
}
val (pte_cache_hit, pte_cache_data) = {
val size = 1 << log2Up(pgLevels * 2)
val plru = new PseudoLRU(size)
val valid = Reg(init = UInt(0, size))
val tags = Reg(Vec(size, UInt(width = paddrBits)))
val data = Reg(Vec(size, UInt(width = ppnBits)))
val hits = tags.map(_ === pte_addr).asUInt & valid
val hit = hits.orR
when (io.mem.resp.valid && pte.table() && !hit) {
val r = Mux(valid.andR, plru.replace, PriorityEncoder(~valid))
valid := valid | UIntToOH(r)
tags(r) := pte_addr
data(r) := pte.ppn
}
when (hit && state === s_req) { plru.access(OHToUInt(hits)) }
when (io.dpath.invalidate) { valid := 0 }
(hit && count < pgLevels-1, Mux1H(hits, data))
}
val pte_wdata = Wire(init=new PTE().fromBits(0))
pte_wdata.a := true
pte_wdata.d := r_req.store
io.mem.req.valid := state.isOneOf(s_req, s_set_dirty)
io.mem.req.bits.phys := Bool(true)
io.mem.req.bits.cmd := Mux(state === s_set_dirty, M_XA_OR, M_XRD)
io.mem.req.bits.typ := log2Ceil(xLen/8)
io.mem.req.bits.addr := pte_addr
io.mem.s1_data := pte_wdata.asUInt
io.mem.s1_kill := s1_kill
io.mem.invalidate_lr := Bool(false)
val resp_ppns = (0 until pgLevels-1).map(i => Cat(pte_addr >> (pgIdxBits + pgLevelBits*(pgLevels-i-1)), r_req.addr(pgLevelBits*(pgLevels-i-1)-1,0))) :+ (pte_addr >> pgIdxBits)
for (i <- 0 until io.requestor.size) {
io.requestor(i).resp.valid := state === s_done && (r_req_dest === i)
io.requestor(i).resp.bits.pte := r_pte
io.requestor(i).resp.bits.pte.ppn := resp_ppns(count)
io.requestor(i).ptbr := io.dpath.ptbr
io.requestor(i).invalidate := io.dpath.invalidate
io.requestor(i).status := io.dpath.status
}
// control state machine
switch (state) {
is (s_ready) {
when (arb.io.out.valid) {
state := s_req
}
count := UInt(0)
}
is (s_req) {
when (pte_cache_hit) {
s1_kill := true
state := s_req
count := count + 1
r_pte.ppn := pte_cache_data
}.elsewhen (io.mem.req.ready) {
state := s_wait1
}
}
is (s_wait1) {
state := s_wait2
when (io.mem.xcpt.pf.ld) {
r_pte.v := false
state := s_done
}
}
is (s_wait2) {
when (io.mem.s2_nack) {
state := s_req
}
when (io.mem.resp.valid) {
state := s_done
when (pte.access_ok(r_req) && (!pte.a || (r_req.store && !pte.d))) {
state := s_set_dirty
}.otherwise {
r_pte := pte
}
when (pte.table() && count < pgLevels-1) {
state := s_req
count := count + 1
}
}
}
is (s_set_dirty) {
when (io.mem.req.ready) {
state := s_wait1_dirty
}
}
is (s_wait1_dirty) {
state := s_wait2_dirty
when (io.mem.xcpt.pf.st) {
r_pte.v := false
state := s_done
}
}
is (s_wait2_dirty) {
when (io.mem.s2_nack) {
state := s_set_dirty
}
when (io.mem.resp.valid) {
state := s_req
}
}
is (s_done) {
state := s_ready
}
}
}
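// The walk above slices the VPN into pgLevels chunks (`vpn_idxs`) and forms
// each PTE address as Cat(ppn, vpn_idx) << log2Ceil(xLen/8). A plain-Scala
// sketch of that arithmetic, assuming Sv39-style parameters (3 levels,
// 9 bits per level, 8-byte PTEs); `readPte` stands in for the HellaCache
// access, and the leaf/permission checks are elided (illustrative only):
object PtwAddrSketch extends App {
  val pgLevels = 3; val pgLevelBits = 9; val pteBytes = 8
  def walk(rootPpn: BigInt, vpn: BigInt, readPte: BigInt => BigInt): BigInt = {
    var ppn = rootPpn
    for (count <- 0 until pgLevels) {
      val vpnIdx  = (vpn >> ((pgLevels - count - 1) * pgLevelBits)) & ((1 << pgLevelBits) - 1)
      val pteAddr = ((ppn << pgLevelBits) | vpnIdx) * pteBytes  // Cat(ppn, vpn_idx) << 3
      ppn = readPte(pteAddr)  // next-level ppn from the fetched PTE
    }
    ppn
  }
  println(walk(BigInt(0x1000), BigInt(0x12345), _ => BigInt(0x2000))) // stub PTE reader
}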


@@ -0,0 +1,290 @@
// See LICENSE for license details.
package rocket
import Chisel._
import uncore.tilelink._
import uncore.constants._
import uncore.agents.CacheName
import Util._
import cde.{Parameters, Field}
case object RoccMaxTaggedMemXacts extends Field[Int]
case object RoccNMemChannels extends Field[Int]
case object RoccNPTWPorts extends Field[Int]
class RoCCInstruction extends Bundle
{
val funct = Bits(width = 7)
val rs2 = Bits(width = 5)
val rs1 = Bits(width = 5)
val xd = Bool()
val xs1 = Bool()
val xs2 = Bool()
val rd = Bits(width = 5)
val opcode = Bits(width = 7)
}
class RoCCCommand(implicit p: Parameters) extends CoreBundle()(p) {
val inst = new RoCCInstruction
val rs1 = Bits(width = xLen)
val rs2 = Bits(width = xLen)
val status = new MStatus
}
class RoCCResponse(implicit p: Parameters) extends CoreBundle()(p) {
val rd = Bits(width = 5)
val data = Bits(width = xLen)
}
class RoCCInterface(implicit p: Parameters) extends CoreBundle()(p) {
val cmd = Decoupled(new RoCCCommand).flip
val resp = Decoupled(new RoCCResponse)
val mem = new HellaCacheIO()(p.alterPartial({ case CacheName => "L1D" }))
val busy = Bool(OUTPUT)
val interrupt = Bool(OUTPUT)
// These should be handled differently, eventually
val autl = new ClientUncachedTileLinkIO
val utl = Vec(p(RoccNMemChannels), new ClientUncachedTileLinkIO)
val ptw = Vec(p(RoccNPTWPorts), new TLBPTWIO)
val fpu_req = Decoupled(new FPInput)
val fpu_resp = Decoupled(new FPResult).flip
val exception = Bool(INPUT)
override def cloneType = new RoCCInterface().asInstanceOf[this.type]
}
abstract class RoCC(implicit p: Parameters) extends CoreModule()(p) {
val io = new RoCCInterface
io.mem.req.bits.phys := Bool(true) // don't perform address translation
}
class AccumulatorExample(n: Int = 4)(implicit p: Parameters) extends RoCC()(p) {
val regfile = Mem(n, UInt(width = xLen))
val busy = Reg(init = Vec.fill(n){Bool(false)})
val cmd = Queue(io.cmd)
val funct = cmd.bits.inst.funct
val addr = cmd.bits.rs2(log2Up(n)-1,0)
val doWrite = funct === UInt(0)
val doRead = funct === UInt(1)
val doLoad = funct === UInt(2)
val doAccum = funct === UInt(3)
val memRespTag = io.mem.resp.bits.tag(log2Up(n)-1,0)
// datapath
val addend = cmd.bits.rs1
val accum = regfile(addr)
val wdata = Mux(doWrite, addend, accum + addend)
when (cmd.fire() && (doWrite || doAccum)) {
regfile(addr) := wdata
}
when (io.mem.resp.valid) {
regfile(memRespTag) := io.mem.resp.bits.data
busy(memRespTag) := Bool(false)
}
// control
when (io.mem.req.fire()) {
busy(addr) := Bool(true)
}
val doResp = cmd.bits.inst.xd
val stallReg = busy(addr)
val stallLoad = doLoad && !io.mem.req.ready
val stallResp = doResp && !io.resp.ready
cmd.ready := !stallReg && !stallLoad && !stallResp
// command resolved if no stalls AND not issuing a load that will need a request
// PROC RESPONSE INTERFACE
io.resp.valid := cmd.valid && doResp && !stallReg && !stallLoad
// valid response if the command is valid, needs a response, and is not stalled
io.resp.bits.rd := cmd.bits.inst.rd
// Must respond with the appropriate tag, or undefined behavior results
io.resp.bits.data := accum
// The semantics are to always send out the prior accumulator register value
io.busy := cmd.valid || busy.reduce(_||_)
// Be busy when we have pending memory requests or have committed to the possibility of pending requests
io.interrupt := Bool(false)
// Set this true to trigger an interrupt on the processor (please refer to supervisor documentation)
// MEMORY REQUEST INTERFACE
io.mem.req.valid := cmd.valid && doLoad && !stallReg && !stallResp
io.mem.req.bits.addr := addend
io.mem.req.bits.tag := addr
io.mem.req.bits.cmd := M_XRD // perform a load (M_XWR for stores)
io.mem.req.bits.typ := MT_D // D = 8 bytes, W = 4, H = 2, B = 1
io.mem.req.bits.data := Bits(0) // we're not performing any stores...
io.mem.invalidate_lr := false
io.autl.acquire.valid := false
io.autl.grant.ready := false
}
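// The RoCCInstruction bundle above implies this MSB-first word layout:
// funct[31:25], rs2[24:20], rs1[19:15], xd[14], xs1[13], xs2[12], rd[11:7],
// opcode[6:0]. A plain-Scala encoder sketch (illustrative only) for driving
// the accumulator through the custom0 opcode:
object RoccEncodeSketch extends App {
  def encode(funct: Int, rd: Int, rs1: Int, rs2: Int,
             xd: Boolean, xs1: Boolean, xs2: Boolean, opcode: Int): Long =
    (funct.toLong << 25) | (rs2.toLong << 20) | (rs1.toLong << 15) |
      ((if (xd) 1L else 0L) << 14) | ((if (xs1) 1L else 0L) << 13) |
      ((if (xs2) 1L else 0L) << 12) | (rd.toLong << 7) | opcode.toLong
  // funct = 3 is doAccum above: the value in rs1 is added into the
  // accumulator register selected by the value in rs2, with rd written back.
  val custom0 = Integer.parseInt("0001011", 2)
  println(f"${encode(3, 5, 10, 11, xd = true, xs1 = true, xs2 = true, custom0)}%08x")
}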
class TranslatorExample(implicit p: Parameters) extends RoCC()(p) {
val req_addr = Reg(UInt(width = coreMaxAddrBits))
val req_rd = Reg(io.resp.bits.rd)
val req_offset = req_addr(pgIdxBits - 1, 0)
val req_vpn = req_addr(coreMaxAddrBits - 1, pgIdxBits)
val pte = Reg(new PTE)
val s_idle :: s_ptw_req :: s_ptw_resp :: s_resp :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_idle)
io.cmd.ready := (state === s_idle)
when (io.cmd.fire()) {
req_rd := io.cmd.bits.inst.rd
req_addr := io.cmd.bits.rs1
state := s_ptw_req
}
private val ptw = io.ptw(0)
when (ptw.req.fire()) { state := s_ptw_resp }
when (state === s_ptw_resp && ptw.resp.valid) {
pte := ptw.resp.bits.pte
state := s_resp
}
when (io.resp.fire()) { state := s_idle }
ptw.req.valid := (state === s_ptw_req)
ptw.req.bits.addr := req_vpn
ptw.req.bits.store := Bool(false)
ptw.req.bits.fetch := Bool(false)
io.resp.valid := (state === s_resp)
io.resp.bits.rd := req_rd
io.resp.bits.data := Mux(pte.leaf(), Cat(pte.ppn, req_offset), SInt(-1, xLen).asUInt)
io.busy := (state =/= s_idle)
io.interrupt := Bool(false)
io.mem.req.valid := Bool(false)
io.mem.invalidate_lr := Bool(false)
io.autl.acquire.valid := Bool(false)
io.autl.grant.ready := Bool(false)
}
class CharacterCountExample(implicit p: Parameters) extends RoCC()(p)
with HasTileLinkParameters {
private val blockOffset = tlBeatAddrBits + tlByteAddrBits
val needle = Reg(UInt(width = 8))
val addr = Reg(UInt(width = coreMaxAddrBits))
val count = Reg(UInt(width = xLen))
val resp_rd = Reg(io.resp.bits.rd)
val addr_block = addr(coreMaxAddrBits - 1, blockOffset)
val offset = addr(blockOffset - 1, 0)
val next_addr = (addr_block + UInt(1)) << UInt(blockOffset)
val s_idle :: s_acq :: s_gnt :: s_check :: s_resp :: Nil = Enum(Bits(), 5)
val state = Reg(init = s_idle)
val gnt = io.autl.grant.bits
val recv_data = Reg(UInt(width = tlDataBits))
val recv_beat = Reg(UInt(width = tlBeatAddrBits))
val data_bytes = Vec.tabulate(tlDataBytes) { i => recv_data(8 * (i + 1) - 1, 8 * i) }
val zero_match = data_bytes.map(_ === UInt(0))
val needle_match = data_bytes.map(_ === needle)
val first_zero = PriorityEncoder(zero_match)
val chars_found = PopCount(needle_match.zipWithIndex.map {
case (matches, i) =>
val idx = Cat(recv_beat, UInt(i, tlByteAddrBits))
matches && idx >= offset && UInt(i) <= first_zero
})
val zero_found = zero_match.reduce(_ || _)
val finished = Reg(Bool())
io.cmd.ready := (state === s_idle)
io.resp.valid := (state === s_resp)
io.resp.bits.rd := resp_rd
io.resp.bits.data := count
io.autl.acquire.valid := (state === s_acq)
io.autl.acquire.bits := GetBlock(addr_block = addr_block)
io.autl.grant.ready := (state === s_gnt)
when (io.cmd.fire()) {
addr := io.cmd.bits.rs1
needle := io.cmd.bits.rs2
resp_rd := io.cmd.bits.inst.rd
count := UInt(0)
finished := Bool(false)
state := s_acq
}
when (io.autl.acquire.fire()) { state := s_gnt }
when (io.autl.grant.fire()) {
recv_beat := gnt.addr_beat
recv_data := gnt.data
state := s_check
}
when (state === s_check) {
when (!finished) {
count := count + chars_found
}
when (zero_found) { finished := Bool(true) }
when (recv_beat === UInt(tlDataBeats - 1)) {
addr := next_addr
state := Mux(zero_found || finished, s_resp, s_acq)
} .otherwise {
state := s_gnt
}
}
when (io.resp.fire()) { state := s_idle }
io.busy := (state =/= s_idle)
io.interrupt := Bool(false)
io.mem.req.valid := Bool(false)
io.mem.invalidate_lr := Bool(false)
}
class OpcodeSet(val opcodes: Seq[UInt]) {
def |(set: OpcodeSet) =
new OpcodeSet(this.opcodes ++ set.opcodes)
def matches(oc: UInt) = opcodes.map(_ === oc).reduce(_ || _)
}
object OpcodeSet {
val custom0 = new OpcodeSet(Seq(Bits("b0001011")))
val custom1 = new OpcodeSet(Seq(Bits("b0101011")))
val custom2 = new OpcodeSet(Seq(Bits("b1011011")))
val custom3 = new OpcodeSet(Seq(Bits("b1111011")))
val all = custom0 | custom1 | custom2 | custom3
}
class RoccCommandRouter(opcodes: Seq[OpcodeSet])(implicit p: Parameters)
extends CoreModule()(p) {
val io = new Bundle {
val in = Decoupled(new RoCCCommand).flip
val out = Vec(opcodes.size, Decoupled(new RoCCCommand))
val busy = Bool(OUTPUT)
}
val cmd = Queue(io.in)
val cmdReadys = io.out.zip(opcodes).map { case (out, opcode) =>
val me = opcode.matches(cmd.bits.inst.opcode)
out.valid := cmd.valid && me
out.bits := cmd.bits
out.ready && me
}
cmd.ready := cmdReadys.reduce(_ || _)
io.busy := cmd.valid
assert(PopCount(cmdReadys) <= UInt(1),
"Custom opcode matched for more than one accelerator")
}
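// Dispatch rule implemented above: a command goes to the single output whose
// OpcodeSet contains the instruction's 7-bit opcode field; overlapping sets
// would trip the assertion. A plain-Scala sketch (illustrative only):
object RouteSketch extends App {
  val custom0 = Set(Integer.parseInt("0001011", 2))
  val custom1 = Set(Integer.parseInt("0101011", 2))
  def route(inst: Long, sets: Seq[Set[Int]]): Option[Int] =
    sets.zipWithIndex.collect { case (s, i) if s.contains((inst & 0x7f).toInt) => i } match {
      case Seq(i) => Some(i)
      case Seq()  => None  // no accelerator claims this opcode
      case _      => sys.error("opcode matched more than one accelerator")
    }
  println(route(0x0000000bL, Seq(custom0, custom1))) // Some(0): custom0
}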


@@ -0,0 +1,702 @@
// See LICENSE for license details.
package rocket
import Chisel._
import junctions._
import uncore.devices._
import uncore.agents.CacheName
import uncore.constants._
import Util._
import cde.{Parameters, Field}
case object XLen extends Field[Int]
case object FetchWidth extends Field[Int]
case object RetireWidth extends Field[Int]
case object FPUKey extends Field[Option[FPUConfig]]
case object MulDivKey extends Field[Option[MulDivConfig]]
case object UseVM extends Field[Boolean]
case object UseUser extends Field[Boolean]
case object UseDebug extends Field[Boolean]
case object UseAtomics extends Field[Boolean]
case object UseCompressed extends Field[Boolean]
case object FastLoadWord extends Field[Boolean]
case object FastLoadByte extends Field[Boolean]
case object CoreInstBits extends Field[Int]
case object NCustomMRWCSRs extends Field[Int]
case object MtvecWritable extends Field[Boolean]
case object MtvecInit extends Field[BigInt]
case object ResetVector extends Field[BigInt]
case object NBreakpoints extends Field[Int]
trait HasCoreParameters extends HasAddrMapParameters {
implicit val p: Parameters
val xLen = p(XLen)
val usingVM = p(UseVM)
val usingUser = p(UseUser) || usingVM
val usingDebug = p(UseDebug)
val usingMulDiv = p(MulDivKey).nonEmpty
val usingFPU = p(FPUKey).nonEmpty
val usingAtomics = p(UseAtomics)
val usingCompressed = p(UseCompressed)
val usingRoCC = !p(BuildRoCC).isEmpty
val fastLoadWord = p(FastLoadWord)
val fastLoadByte = p(FastLoadByte)
val retireWidth = p(RetireWidth)
val fetchWidth = p(FetchWidth)
val coreInstBits = p(CoreInstBits)
val coreInstBytes = coreInstBits/8
val coreDataBits = xLen
val coreDataBytes = coreDataBits/8
val dcacheArbPorts = 1 + (if (usingVM) 1 else 0) + p(BuildRoCC).size
val coreDCacheReqTagBits = 6
val dcacheReqTagBits = coreDCacheReqTagBits + log2Ceil(dcacheArbPorts)
def pgIdxBits = 12
def pgLevelBits = 10 - log2Ceil(xLen / 32)
def vaddrBits = pgIdxBits + pgLevels * pgLevelBits
def ppnBits = paddrBits - pgIdxBits
def vpnBits = vaddrBits - pgIdxBits
val pgLevels = p(PgLevels)
val asIdBits = p(ASIdBits)
val vpnBitsExtended = vpnBits + (vaddrBits < xLen).toInt
val vaddrBitsExtended = vpnBitsExtended + pgIdxBits
val coreMaxAddrBits = paddrBits max vaddrBitsExtended
val nCustomMrwCsrs = p(NCustomMRWCSRs)
val nCores = p(NTiles)
// fetchWidth doubled, but coreInstBytes halved, for RVC
val decodeWidth = fetchWidth / (if (usingCompressed) 2 else 1)
// Print out log of committed instructions and their writeback values.
// Requires post-processing due to out-of-order writebacks.
val enableCommitLog = false
val maxPAddrBits = xLen match {
case 32 => 34
case 64 => 50
}
require(paddrBits <= maxPAddrBits)
require(!fastLoadByte || fastLoadWord)
}
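// Worked instance of the paging geometry these definitions imply: with
// xLen = 64 and pgLevels = 3, pgLevelBits = 10 - log2Ceil(64/32) = 9 and
// vaddrBits = 12 + 3*9 = 39 (Sv39); with xLen = 32 and pgLevels = 2,
// pgLevelBits = 10 and vaddrBits = 12 + 2*10 = 32 (Sv32).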
abstract class CoreModule(implicit val p: Parameters) extends Module
with HasCoreParameters
abstract class CoreBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
with HasCoreParameters
class RegFile(n: Int, w: Int, zero: Boolean = false) {
private val rf = Mem(n, UInt(width = w))
private def access(addr: UInt) = rf(~addr(log2Up(n)-1,0))
private val reads = collection.mutable.ArrayBuffer[(UInt,UInt)]()
private var canRead = true
def read(addr: UInt) = {
require(canRead)
reads += addr -> Wire(UInt())
reads.last._2 := Mux(Bool(zero) && addr === UInt(0), UInt(0), access(addr))
reads.last._2
}
def write(addr: UInt, data: UInt) = {
canRead = false
when (addr =/= UInt(0)) {
access(addr) := data
for ((raddr, rdata) <- reads)
when (addr === raddr) { rdata := data }
}
}
}
object ImmGen {
def apply(sel: UInt, inst: UInt) = {
val sign = Mux(sel === IMM_Z, SInt(0), inst(31).asSInt)
val b30_20 = Mux(sel === IMM_U, inst(30,20).asSInt, sign)
val b19_12 = Mux(sel =/= IMM_U && sel =/= IMM_UJ, sign, inst(19,12).asSInt)
val b11 = Mux(sel === IMM_U || sel === IMM_Z, SInt(0),
Mux(sel === IMM_UJ, inst(20).asSInt,
Mux(sel === IMM_SB, inst(7).asSInt, sign)))
val b10_5 = Mux(sel === IMM_U || sel === IMM_Z, Bits(0), inst(30,25))
val b4_1 = Mux(sel === IMM_U, Bits(0),
Mux(sel === IMM_S || sel === IMM_SB, inst(11,8),
Mux(sel === IMM_Z, inst(19,16), inst(24,21))))
val b0 = Mux(sel === IMM_S, inst(7),
Mux(sel === IMM_I, inst(20),
Mux(sel === IMM_Z, inst(15), Bits(0))))
Cat(sign, b30_20, b19_12, b11, b10_5, b4_1, b0).asSInt
}
}
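// ImmGen reassembles the immediate bits that the encodings scatter across
// the instruction word. A plain-Scala check of two of the formats (I and S),
// following the standard RISC-V encodings (illustrative only):
object ImmSketch extends App {
  def bits(x: Long, hi: Int, lo: Int): Long = (x >> lo) & ((1L << (hi - lo + 1)) - 1)
  def sext(x: Long, w: Int): Long = (x << (64 - w)) >> (64 - w)  // arithmetic shift
  def immI(inst: Long): Long = sext(bits(inst, 31, 20), 12)
  def immS(inst: Long): Long = sext((bits(inst, 31, 25) << 5) | bits(inst, 11, 7), 12)
  println(immI(0xfff00093L)) // ADDI x1, x0, -1 -> -1
  println(immS(0xfe112e23L)) // SW x1, -4(x2)   -> -4
}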
class Rocket(implicit p: Parameters) extends CoreModule()(p) {
val io = new Bundle {
val prci = new PRCITileIO().flip
val imem = new FrontendIO()(p.alterPartial({case CacheName => "L1I" }))
val dmem = new HellaCacheIO()(p.alterPartial({ case CacheName => "L1D" }))
val ptw = new DatapathPTWIO().flip
val fpu = new FPUIO().flip
val rocc = new RoCCInterface().flip
}
val decode_table = {
(if (usingMulDiv) new MDecode +: (if (xLen > 32) Seq(new M64Decode) else Nil) else Nil) ++:
(if (usingAtomics) new ADecode +: (if (xLen > 32) Seq(new A64Decode) else Nil) else Nil) ++:
(if (usingFPU) new FDecode +: (if (xLen > 32) Seq(new F64Decode) else Nil) else Nil) ++:
(if (usingRoCC) Some(new RoCCDecode) else None) ++:
(if (xLen > 32) Some(new I64Decode) else None) ++:
(if (usingVM) Some(new SDecode) else None) ++:
(if (usingDebug) Some(new DebugDecode) else None) ++:
Seq(new IDecode)
} flatMap(_.table)
val ex_ctrl = Reg(new IntCtrlSigs)
val mem_ctrl = Reg(new IntCtrlSigs)
val wb_ctrl = Reg(new IntCtrlSigs)
val ex_reg_xcpt_interrupt = Reg(Bool())
val ex_reg_valid = Reg(Bool())
val ex_reg_rvc = Reg(Bool())
val ex_reg_btb_hit = Reg(Bool())
val ex_reg_btb_resp = Reg(new BTBResp)
val ex_reg_xcpt = Reg(Bool())
val ex_reg_flush_pipe = Reg(Bool())
val ex_reg_load_use = Reg(Bool())
val ex_reg_cause = Reg(UInt())
val ex_reg_replay = Reg(Bool())
val ex_reg_pc = Reg(UInt())
val ex_reg_inst = Reg(Bits())
val mem_reg_xcpt_interrupt = Reg(Bool())
val mem_reg_valid = Reg(Bool())
val mem_reg_rvc = Reg(Bool())
val mem_reg_btb_hit = Reg(Bool())
val mem_reg_btb_resp = Reg(new BTBResp)
val mem_reg_xcpt = Reg(Bool())
val mem_reg_replay = Reg(Bool())
val mem_reg_flush_pipe = Reg(Bool())
val mem_reg_cause = Reg(UInt())
val mem_reg_slow_bypass = Reg(Bool())
val mem_reg_load = Reg(Bool())
val mem_reg_store = Reg(Bool())
val mem_reg_pc = Reg(UInt())
val mem_reg_inst = Reg(Bits())
val mem_reg_wdata = Reg(Bits())
val mem_reg_rs2 = Reg(Bits())
val take_pc_mem = Wire(Bool())
val wb_reg_valid = Reg(Bool())
val wb_reg_xcpt = Reg(Bool())
val wb_reg_replay = Reg(Bool())
val wb_reg_cause = Reg(UInt())
val wb_reg_pc = Reg(UInt())
val wb_reg_inst = Reg(Bits())
val wb_reg_wdata = Reg(Bits())
val wb_reg_rs2 = Reg(Bits())
val take_pc_wb = Wire(Bool())
val take_pc_mem_wb = take_pc_wb || take_pc_mem
val take_pc = take_pc_mem_wb
// decode stage
val ibuf = Module(new IBuf)
val id_expanded_inst = ibuf.io.inst.map(_.bits.inst)
val id_inst = id_expanded_inst.map(_.bits)
ibuf.io.imem <> io.imem.resp
ibuf.io.kill := take_pc
require(decodeWidth == 1 /* TODO */ && retireWidth == decodeWidth)
val id_ctrl = Wire(new IntCtrlSigs()).decode(id_inst(0), decode_table)
val id_raddr3 = id_expanded_inst(0).rs3
val id_raddr2 = id_expanded_inst(0).rs2
val id_raddr1 = id_expanded_inst(0).rs1
val id_waddr = id_expanded_inst(0).rd
val id_load_use = Wire(Bool())
val id_reg_fence = Reg(init=Bool(false))
val id_ren = IndexedSeq(id_ctrl.rxs1, id_ctrl.rxs2)
val id_raddr = IndexedSeq(id_raddr1, id_raddr2)
val rf = new RegFile(31, xLen)
val id_rs = id_raddr.map(rf.read _)
val ctrl_killd = Wire(Bool())
val csr = Module(new CSRFile)
val id_csr_en = id_ctrl.csr =/= CSR.N
val id_system_insn = id_ctrl.csr === CSR.I
val id_csr_ren = (id_ctrl.csr === CSR.S || id_ctrl.csr === CSR.C) && id_raddr1 === UInt(0)
val id_csr = Mux(id_csr_ren, CSR.R, id_ctrl.csr)
val id_csr_addr = id_inst(0)(31,20)
// this is overly conservative
val safe_csrs = CSRs.sscratch :: CSRs.sepc :: CSRs.mscratch :: CSRs.mepc :: CSRs.mcause :: CSRs.mbadaddr :: Nil
val legal_csrs = collection.mutable.LinkedHashSet(CSRs.all:_*)
val id_csr_flush = id_system_insn || (id_csr_en && !id_csr_ren && !DecodeLogic(id_csr_addr, safe_csrs.map(UInt(_)), (legal_csrs -- safe_csrs).toList.map(UInt(_))))
val id_illegal_insn = !id_ctrl.legal ||
id_ctrl.fp && !csr.io.status.fs.orR ||
id_ctrl.rocc && !csr.io.status.xs.orR
// stall decode for fences (now, for AMO.aq; later, for AMO.rl and FENCE)
val id_amo_aq = id_inst(0)(26)
val id_amo_rl = id_inst(0)(25)
val id_fence_next = id_ctrl.fence || id_ctrl.amo && id_amo_rl
val id_mem_busy = !io.dmem.ordered || io.dmem.req.valid
val id_rocc_busy = Bool(usingRoCC) &&
(io.rocc.busy || ex_reg_valid && ex_ctrl.rocc ||
mem_reg_valid && mem_ctrl.rocc || wb_reg_valid && wb_ctrl.rocc)
id_reg_fence := id_fence_next || id_reg_fence && id_mem_busy
val id_do_fence = id_rocc_busy && id_ctrl.fence ||
id_mem_busy && (id_ctrl.amo && id_amo_aq || id_ctrl.fence_i || id_reg_fence && (id_ctrl.mem || id_ctrl.rocc) || id_csr_en)
val bpu = Module(new BreakpointUnit)
bpu.io.status := csr.io.status
bpu.io.bp := csr.io.bp
bpu.io.pc := ibuf.io.pc
bpu.io.ea := mem_reg_wdata
val id_xcpt_if = ibuf.io.inst(0).bits.pf0 || ibuf.io.inst(0).bits.pf1
val (id_xcpt, id_cause) = checkExceptions(List(
(csr.io.interrupt, csr.io.interrupt_cause),
(bpu.io.xcpt_if, UInt(Causes.breakpoint)),
(id_xcpt_if, UInt(Causes.fault_fetch)),
(id_illegal_insn, UInt(Causes.illegal_instruction))))
val dcache_bypass_data =
if (fastLoadByte) io.dmem.resp.bits.data
else if (fastLoadWord) io.dmem.resp.bits.data_word_bypass
else wb_reg_wdata
// detect bypass opportunities
val ex_waddr = ex_reg_inst(11,7)
val mem_waddr = mem_reg_inst(11,7)
val wb_waddr = wb_reg_inst(11,7)
val bypass_sources = IndexedSeq(
(Bool(true), UInt(0), UInt(0)), // treat reading x0 as a bypass
(ex_reg_valid && ex_ctrl.wxd, ex_waddr, mem_reg_wdata),
(mem_reg_valid && mem_ctrl.wxd && !mem_ctrl.mem, mem_waddr, wb_reg_wdata),
(mem_reg_valid && mem_ctrl.wxd, mem_waddr, dcache_bypass_data))
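// listed highest-priority first: x0, then EX (the youngest value), then MEM;
// the PriorityEncoder below picks the first matching source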
val id_bypass_src = id_raddr.map(raddr => bypass_sources.map(s => s._1 && s._2 === raddr))
// execute stage
val bypass_mux = Vec(bypass_sources.map(_._3))
val ex_reg_rs_bypass = Reg(Vec(id_raddr.size, Bool()))
val ex_reg_rs_lsb = Reg(Vec(id_raddr.size, UInt()))
val ex_reg_rs_msb = Reg(Vec(id_raddr.size, UInt()))
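// when bypassing, ex_reg_rs_lsb holds the bypass_mux select instead of data;
// otherwise the register-file value is split across the lsb/msb pair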
val ex_rs = for (i <- 0 until id_raddr.size)
yield Mux(ex_reg_rs_bypass(i), bypass_mux(ex_reg_rs_lsb(i)), Cat(ex_reg_rs_msb(i), ex_reg_rs_lsb(i)))
val ex_imm = ImmGen(ex_ctrl.sel_imm, ex_reg_inst)
val ex_op1 = MuxLookup(ex_ctrl.sel_alu1, SInt(0), Seq(
A1_RS1 -> ex_rs(0).asSInt,
A1_PC -> ex_reg_pc.asSInt))
val ex_op2 = MuxLookup(ex_ctrl.sel_alu2, SInt(0), Seq(
A2_RS2 -> ex_rs(1).asSInt,
A2_IMM -> ex_imm,
A2_SIZE -> Mux(ex_reg_rvc, SInt(2), SInt(4))))
val alu = Module(new ALU)
alu.io.dw := ex_ctrl.alu_dw
alu.io.fn := ex_ctrl.alu_fn
alu.io.in2 := ex_op2.asUInt
alu.io.in1 := ex_op1.asUInt
// multiplier and divider
val div = Module(new MulDiv(p(MulDivKey).getOrElse(MulDivConfig()), width = xLen))
div.io.req.valid := ex_reg_valid && ex_ctrl.div
div.io.req.bits.dw := ex_ctrl.alu_dw
div.io.req.bits.fn := ex_ctrl.alu_fn
div.io.req.bits.in1 := ex_rs(0)
div.io.req.bits.in2 := ex_rs(1)
div.io.req.bits.tag := ex_waddr
ex_reg_valid := !ctrl_killd
ex_reg_replay := !take_pc && ibuf.io.inst(0).valid && ibuf.io.inst(0).bits.replay
ex_reg_xcpt := !ctrl_killd && id_xcpt
ex_reg_xcpt_interrupt := !take_pc && ibuf.io.inst(0).valid && csr.io.interrupt
when (id_xcpt) { ex_reg_cause := id_cause }
ex_reg_btb_hit := ibuf.io.inst(0).bits.btb_hit
when (ibuf.io.inst(0).bits.btb_hit) { ex_reg_btb_resp := ibuf.io.btb_resp }
when (!ctrl_killd) {
ex_ctrl := id_ctrl
ex_reg_rvc := ibuf.io.inst(0).bits.rvc
ex_ctrl.csr := id_csr
when (id_xcpt) { // pass PC down ALU writeback pipeline for badaddr
ex_ctrl.alu_fn := ALU.FN_ADD
ex_ctrl.alu_dw := DW_XPR
ex_ctrl.sel_alu1 := A1_PC
ex_ctrl.sel_alu2 := A2_ZERO
when (!bpu.io.xcpt_if && !ibuf.io.inst(0).bits.pf0 && ibuf.io.inst(0).bits.pf1) { // PC+2
ex_ctrl.sel_alu2 := A2_SIZE
ex_reg_rvc := true
}
}
ex_reg_flush_pipe := id_ctrl.fence_i || id_csr_flush || csr.io.singleStep
ex_reg_load_use := id_load_use
when (id_ctrl.jalr && csr.io.status.debug) {
ex_reg_flush_pipe := true
ex_ctrl.fence_i := true
}
for (i <- 0 until id_raddr.size) {
val do_bypass = id_bypass_src(i).reduce(_||_)
val bypass_src = PriorityEncoder(id_bypass_src(i))
ex_reg_rs_bypass(i) := do_bypass
ex_reg_rs_lsb(i) := bypass_src
when (id_ren(i) && !do_bypass) {
ex_reg_rs_lsb(i) := id_rs(i)(bypass_src.getWidth-1,0)
ex_reg_rs_msb(i) := id_rs(i) >> bypass_src.getWidth
}
}
}
when (!ctrl_killd || csr.io.interrupt || ibuf.io.inst(0).bits.replay) {
ex_reg_inst := id_inst(0)
ex_reg_pc := ibuf.io.pc
}
// replay inst in ex stage?
val ex_pc_valid = ex_reg_valid || ex_reg_replay || ex_reg_xcpt_interrupt
val wb_dcache_miss = wb_ctrl.mem && !io.dmem.resp.valid
val replay_ex_structural = ex_ctrl.mem && !io.dmem.req.ready ||
ex_ctrl.div && !div.io.req.ready
val replay_ex_load_use = wb_dcache_miss && ex_reg_load_use
val replay_ex = ex_reg_replay || (ex_reg_valid && (replay_ex_structural || replay_ex_load_use))
val ctrl_killx = take_pc_mem_wb || replay_ex || !ex_reg_valid
// detect 2-cycle load-use delay for LB/LH/SC
val ex_slow_bypass = ex_ctrl.mem_cmd === M_XSC || Vec(MT_B, MT_BU, MT_H, MT_HU).contains(ex_ctrl.mem_type)
val (ex_xcpt, ex_cause) = checkExceptions(List(
(ex_reg_xcpt_interrupt || ex_reg_xcpt, ex_reg_cause),
(ex_ctrl.fp && io.fpu.illegal_rm, UInt(Causes.illegal_instruction))))
// memory stage
val mem_br_taken = mem_reg_wdata(0)
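// branch/jump target, or the next sequential PC (+2/+4) for the not-taken case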
val mem_br_target = mem_reg_pc.asSInt +
Mux(mem_ctrl.branch && mem_br_taken, ImmGen(IMM_SB, mem_reg_inst),
Mux(mem_ctrl.jal, ImmGen(IMM_UJ, mem_reg_inst),
Mux(mem_reg_rvc, SInt(2), SInt(4))))
val mem_npc = (Mux(mem_ctrl.jalr, encodeVirtualAddress(mem_reg_wdata, mem_reg_wdata).asSInt, mem_br_target) & SInt(-2)).asUInt
val mem_wrong_npc = Mux(ex_pc_valid, mem_npc =/= ex_reg_pc, Mux(ibuf.io.inst(0).valid, mem_npc =/= ibuf.io.pc, Bool(true)))
val mem_npc_misaligned = if (usingCompressed) Bool(false) else mem_npc(1)
val mem_int_wdata = Mux(!mem_reg_xcpt && (mem_ctrl.jalr ^ mem_npc_misaligned), mem_br_target, mem_reg_wdata.asSInt).asUInt
val mem_cfi = mem_ctrl.branch || mem_ctrl.jalr || mem_ctrl.jal
val mem_cfi_taken = (mem_ctrl.branch && mem_br_taken) || mem_ctrl.jalr || mem_ctrl.jal
val mem_misprediction =
if (p(BtbKey).nEntries == 0) mem_cfi_taken
else mem_wrong_npc
take_pc_mem := mem_reg_valid && (mem_misprediction || mem_reg_flush_pipe)
mem_reg_valid := !ctrl_killx
mem_reg_replay := !take_pc_mem_wb && replay_ex
mem_reg_xcpt := !ctrl_killx && ex_xcpt
mem_reg_xcpt_interrupt := !take_pc_mem_wb && ex_reg_xcpt_interrupt
when (ex_xcpt) { mem_reg_cause := ex_cause }
when (ex_pc_valid) {
mem_ctrl := ex_ctrl
mem_reg_rvc := ex_reg_rvc
mem_reg_load := ex_ctrl.mem && isRead(ex_ctrl.mem_cmd)
mem_reg_store := ex_ctrl.mem && isWrite(ex_ctrl.mem_cmd)
mem_reg_btb_hit := ex_reg_btb_hit
when (ex_reg_btb_hit) { mem_reg_btb_resp := ex_reg_btb_resp }
mem_reg_flush_pipe := ex_reg_flush_pipe
mem_reg_slow_bypass := ex_slow_bypass
mem_reg_inst := ex_reg_inst
mem_reg_pc := ex_reg_pc
mem_reg_wdata := alu.io.out
when (ex_ctrl.rxs2 && (ex_ctrl.mem || ex_ctrl.rocc)) {
mem_reg_rs2 := ex_rs(1)
}
}
val mem_breakpoint = (mem_reg_load && bpu.io.xcpt_ld) || (mem_reg_store && bpu.io.xcpt_st)
val (mem_new_xcpt, mem_new_cause) = checkExceptions(List(
(mem_breakpoint, UInt(Causes.breakpoint)),
(mem_npc_misaligned, UInt(Causes.misaligned_fetch)),
(mem_ctrl.mem && io.dmem.xcpt.ma.st, UInt(Causes.misaligned_store)),
(mem_ctrl.mem && io.dmem.xcpt.ma.ld, UInt(Causes.misaligned_load)),
(mem_ctrl.mem && io.dmem.xcpt.pf.st, UInt(Causes.fault_store)),
(mem_ctrl.mem && io.dmem.xcpt.pf.ld, UInt(Causes.fault_load))))
val (mem_xcpt, mem_cause) = checkExceptions(List(
(mem_reg_xcpt_interrupt || mem_reg_xcpt, mem_reg_cause),
(mem_reg_valid && mem_new_xcpt, mem_new_cause)))
val dcache_kill_mem = mem_reg_valid && mem_ctrl.wxd && io.dmem.replay_next // structural hazard on writeback port
val fpu_kill_mem = mem_reg_valid && mem_ctrl.fp && io.fpu.nack_mem
val replay_mem = dcache_kill_mem || mem_reg_replay || fpu_kill_mem
val killm_common = dcache_kill_mem || take_pc_wb || mem_reg_xcpt || !mem_reg_valid
div.io.kill := killm_common && Reg(next = div.io.req.fire())
val ctrl_killm = killm_common || mem_xcpt || fpu_kill_mem
// writeback stage
wb_reg_valid := !ctrl_killm
wb_reg_replay := replay_mem && !take_pc_wb
wb_reg_xcpt := mem_xcpt && !take_pc_wb
when (mem_xcpt) { wb_reg_cause := mem_cause }
when (mem_reg_valid || mem_reg_replay || mem_reg_xcpt_interrupt) {
wb_ctrl := mem_ctrl
wb_reg_wdata := Mux(!mem_reg_xcpt && mem_ctrl.fp && mem_ctrl.wxd, io.fpu.toint_data, mem_int_wdata)
when (mem_ctrl.rocc) {
wb_reg_rs2 := mem_reg_rs2
}
wb_reg_inst := mem_reg_inst
wb_reg_pc := mem_reg_pc
}
val wb_set_sboard = wb_ctrl.div || wb_dcache_miss || wb_ctrl.rocc
val replay_wb_common = io.dmem.s2_nack || wb_reg_replay
val replay_wb_rocc = wb_reg_valid && wb_ctrl.rocc && !io.rocc.cmd.ready
val replay_wb = replay_wb_common || replay_wb_rocc
val wb_xcpt = wb_reg_xcpt || csr.io.csr_xcpt
take_pc_wb := replay_wb || wb_xcpt || csr.io.eret
// writeback arbitration
val dmem_resp_xpu = !io.dmem.resp.bits.tag(0).toBool
val dmem_resp_fpu = io.dmem.resp.bits.tag(0).toBool
val dmem_resp_waddr = io.dmem.resp.bits.tag(5, 1)
val dmem_resp_valid = io.dmem.resp.valid && io.dmem.resp.bits.has_data
val dmem_resp_replay = dmem_resp_valid && io.dmem.resp.bits.replay
div.io.resp.ready := !(wb_reg_valid && wb_ctrl.wxd)
val ll_wdata = Wire(init = div.io.resp.bits.data)
val ll_waddr = Wire(init = div.io.resp.bits.tag)
val ll_wen = Wire(init = div.io.resp.fire())
if (usingRoCC) {
io.rocc.resp.ready := !(wb_reg_valid && wb_ctrl.wxd)
when (io.rocc.resp.fire()) {
div.io.resp.ready := Bool(false)
ll_wdata := io.rocc.resp.bits.data
ll_waddr := io.rocc.resp.bits.rd
ll_wen := Bool(true)
}
}
when (dmem_resp_replay && dmem_resp_xpu) {
div.io.resp.ready := Bool(false)
if (usingRoCC)
io.rocc.resp.ready := Bool(false)
ll_waddr := dmem_resp_waddr
ll_wen := Bool(true)
}
val wb_valid = wb_reg_valid && !replay_wb && !wb_xcpt
val wb_wen = wb_valid && wb_ctrl.wxd
val rf_wen = wb_wen || ll_wen
val rf_waddr = Mux(ll_wen, ll_waddr, wb_waddr)
val rf_wdata = Mux(dmem_resp_valid && dmem_resp_xpu, io.dmem.resp.bits.data,
Mux(ll_wen, ll_wdata,
Mux(wb_ctrl.csr =/= CSR.N, csr.io.rw.rdata,
wb_reg_wdata)))
when (rf_wen) { rf.write(rf_waddr, rf_wdata) }
// hook up control/status regfile
csr.io.exception := wb_reg_xcpt
csr.io.cause := wb_reg_cause
csr.io.retire := wb_valid
csr.io.prci <> io.prci
io.fpu.fcsr_rm := csr.io.fcsr_rm
csr.io.fcsr_flags := io.fpu.fcsr_flags
csr.io.rocc.interrupt <> io.rocc.interrupt
csr.io.pc := wb_reg_pc
csr.io.badaddr := encodeVirtualAddress(wb_reg_wdata, wb_reg_wdata)
io.ptw.ptbr := csr.io.ptbr
io.ptw.invalidate := csr.io.fatc
io.ptw.status := csr.io.status
csr.io.rw.addr := wb_reg_inst(31,20)
csr.io.rw.cmd := Mux(wb_reg_valid, wb_ctrl.csr, CSR.N)
csr.io.rw.wdata := wb_reg_wdata
val hazard_targets = Seq((id_ctrl.rxs1 && id_raddr1 =/= UInt(0), id_raddr1),
(id_ctrl.rxs2 && id_raddr2 =/= UInt(0), id_raddr2),
(id_ctrl.wxd && id_waddr =/= UInt(0), id_waddr))
val fp_hazard_targets = Seq((io.fpu.dec.ren1, id_raddr1),
(io.fpu.dec.ren2, id_raddr2),
(io.fpu.dec.ren3, id_raddr3),
(io.fpu.dec.wen, id_waddr))
val sboard = new Scoreboard(32, true)
sboard.clear(ll_wen, ll_waddr)
val id_sboard_hazard = checkHazards(hazard_targets, sboard.read _)
sboard.set(wb_set_sboard && wb_wen, wb_waddr)
// stall for RAW/WAW hazards on CSRs, loads, AMOs, and mul/div in execute stage.
val ex_cannot_bypass = ex_ctrl.csr =/= CSR.N || ex_ctrl.jalr || ex_ctrl.mem || ex_ctrl.div || ex_ctrl.fp || ex_ctrl.rocc
val data_hazard_ex = ex_ctrl.wxd && checkHazards(hazard_targets, _ === ex_waddr)
val fp_data_hazard_ex = ex_ctrl.wfd && checkHazards(fp_hazard_targets, _ === ex_waddr)
val id_ex_hazard = ex_reg_valid && (data_hazard_ex && ex_cannot_bypass || fp_data_hazard_ex)
// stall for RAW/WAW hazards on CSRs, LB/LH, and mul/div in memory stage.
val mem_mem_cmd_bh =
if (fastLoadWord) Bool(!fastLoadByte) && mem_reg_slow_bypass
else Bool(true)
val mem_cannot_bypass = mem_ctrl.csr =/= CSR.N || mem_ctrl.mem && mem_mem_cmd_bh || mem_ctrl.div || mem_ctrl.fp || mem_ctrl.rocc
val data_hazard_mem = mem_ctrl.wxd && checkHazards(hazard_targets, _ === mem_waddr)
val fp_data_hazard_mem = mem_ctrl.wfd && checkHazards(fp_hazard_targets, _ === mem_waddr)
val id_mem_hazard = mem_reg_valid && (data_hazard_mem && mem_cannot_bypass || fp_data_hazard_mem)
id_load_use := mem_reg_valid && data_hazard_mem && mem_ctrl.mem
// stall for RAW/WAW hazards on load/AMO misses and mul/div in writeback.
val data_hazard_wb = wb_ctrl.wxd && checkHazards(hazard_targets, _ === wb_waddr)
val fp_data_hazard_wb = wb_ctrl.wfd && checkHazards(fp_hazard_targets, _ === wb_waddr)
val id_wb_hazard = wb_reg_valid && (data_hazard_wb && wb_set_sboard || fp_data_hazard_wb)
val id_stall_fpu = if (usingFPU) {
val fp_sboard = new Scoreboard(32)
fp_sboard.set((wb_dcache_miss && wb_ctrl.wfd || io.fpu.sboard_set) && wb_valid, wb_waddr)
fp_sboard.clear(dmem_resp_replay && dmem_resp_fpu, dmem_resp_waddr)
fp_sboard.clear(io.fpu.sboard_clr, io.fpu.sboard_clra)
id_csr_en && !io.fpu.fcsr_rdy || checkHazards(fp_hazard_targets, fp_sboard.read _)
} else Bool(false)
val dcache_blocked = Reg(Bool())
dcache_blocked := !io.dmem.req.ready && (io.dmem.req.valid || dcache_blocked)
val rocc_blocked = Reg(Bool())
rocc_blocked := !wb_reg_xcpt && !io.rocc.cmd.ready && (io.rocc.cmd.valid || rocc_blocked)
val ctrl_stalld =
id_ex_hazard || id_mem_hazard || id_wb_hazard || id_sboard_hazard ||
id_ctrl.fp && id_stall_fpu ||
id_ctrl.mem && dcache_blocked || // reduce activity during D$ misses
id_ctrl.rocc && rocc_blocked || // reduce activity while RoCC is busy
id_do_fence ||
csr.io.csr_stall
ctrl_killd := !ibuf.io.inst(0).valid || ibuf.io.inst(0).bits.replay || take_pc || ctrl_stalld || csr.io.interrupt
io.imem.req.valid := take_pc
io.imem.req.bits.speculative := !take_pc_wb
io.imem.req.bits.pc :=
Mux(wb_xcpt || csr.io.eret, csr.io.evec, // exception or [m|s]ret
Mux(replay_wb, wb_reg_pc, // replay
mem_npc)) // mispredicted branch
io.imem.flush_icache := wb_reg_valid && wb_ctrl.fence_i && !io.dmem.s2_nack
io.imem.flush_tlb := csr.io.fatc
ibuf.io.inst(0).ready := !ctrl_stalld || csr.io.interrupt
io.imem.btb_update.valid := (mem_reg_replay && mem_reg_btb_hit) || (mem_reg_valid && !take_pc_wb && (mem_cfi_taken || !mem_cfi) && mem_wrong_npc)
io.imem.btb_update.bits.isValid := !mem_reg_replay && mem_cfi
io.imem.btb_update.bits.isJump := mem_ctrl.jal || mem_ctrl.jalr
io.imem.btb_update.bits.isReturn := mem_ctrl.jalr && mem_reg_inst(19,15) === BitPat("b00??1")
io.imem.btb_update.bits.target := io.imem.req.bits.pc
io.imem.btb_update.bits.br_pc := (if (usingCompressed) mem_reg_pc + Mux(mem_reg_rvc, UInt(0), UInt(2)) else mem_reg_pc)
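// align br_pc down to the fetch-packet boundary (x & ~mask) for the BTB index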
io.imem.btb_update.bits.pc := ~(~io.imem.btb_update.bits.br_pc | (coreInstBytes*fetchWidth-1))
io.imem.btb_update.bits.prediction.valid := mem_reg_btb_hit
io.imem.btb_update.bits.prediction.bits := mem_reg_btb_resp
io.imem.bht_update.valid := mem_reg_valid && !take_pc_wb && mem_ctrl.branch
io.imem.bht_update.bits.pc := io.imem.btb_update.bits.pc
io.imem.bht_update.bits.taken := mem_br_taken
io.imem.bht_update.bits.mispredict := mem_wrong_npc
io.imem.bht_update.bits.prediction := io.imem.btb_update.bits.prediction
io.imem.ras_update.valid := mem_reg_valid && !take_pc_wb
io.imem.ras_update.bits.returnAddr := mem_int_wdata
io.imem.ras_update.bits.isCall := io.imem.btb_update.bits.isJump && mem_waddr(0)
io.imem.ras_update.bits.isReturn := io.imem.btb_update.bits.isReturn
io.imem.ras_update.bits.prediction := io.imem.btb_update.bits.prediction
io.fpu.valid := !ctrl_killd && id_ctrl.fp
io.fpu.killx := ctrl_killx
io.fpu.killm := killm_common
io.fpu.inst := id_inst(0)
io.fpu.fromint_data := ex_rs(0)
io.fpu.dmem_resp_val := dmem_resp_valid && dmem_resp_fpu
io.fpu.dmem_resp_data := io.dmem.resp.bits.data_word_bypass
io.fpu.dmem_resp_type := io.dmem.resp.bits.typ
io.fpu.dmem_resp_tag := dmem_resp_waddr
io.dmem.req.valid := ex_reg_valid && ex_ctrl.mem
val ex_dcache_tag = Cat(ex_waddr, ex_ctrl.fp)
require(coreDCacheReqTagBits >= ex_dcache_tag.getWidth)
io.dmem.req.bits.tag := ex_dcache_tag
io.dmem.req.bits.cmd := ex_ctrl.mem_cmd
io.dmem.req.bits.typ := ex_ctrl.mem_type
io.dmem.req.bits.phys := Bool(false)
io.dmem.req.bits.addr := encodeVirtualAddress(ex_rs(0), alu.io.adder_out)
io.dmem.invalidate_lr := wb_xcpt
io.dmem.s1_data := Mux(mem_ctrl.fp, io.fpu.store_data, mem_reg_rs2)
io.dmem.s1_kill := killm_common || mem_breakpoint
when (mem_xcpt && !io.dmem.s1_kill) {
assert(io.dmem.xcpt.asUInt.orR) // make sure s1_kill is exhaustive
}
io.rocc.cmd.valid := wb_reg_valid && wb_ctrl.rocc && !replay_wb_common
io.rocc.exception := wb_xcpt && csr.io.status.xs.orR
io.rocc.cmd.bits.status := csr.io.status
io.rocc.cmd.bits.inst := new RoCCInstruction().fromBits(wb_reg_inst)
io.rocc.cmd.bits.rs1 := wb_reg_wdata
io.rocc.cmd.bits.rs2 := wb_reg_rs2
if (enableCommitLog) {
val pc = Wire(SInt(width=xLen))
pc := wb_reg_pc
val inst = wb_reg_inst
val rd = RegNext(RegNext(RegNext(id_waddr)))
val wfd = wb_ctrl.wfd
val wxd = wb_ctrl.wxd
val has_data = wb_wen && !wb_set_sboard
val priv = csr.io.status.prv
when (wb_valid) {
when (wfd) {
printf ("%d 0x%x (0x%x) f%d p%d 0xXXXXXXXXXXXXXXXX\n", priv, pc, inst, rd, rd+UInt(32))
}
.elsewhen (wxd && rd =/= UInt(0) && has_data) {
printf ("%d 0x%x (0x%x) x%d 0x%x\n", priv, pc, inst, rd, rf_wdata)
}
.elsewhen (wxd && rd =/= UInt(0) && !has_data) {
printf ("%d 0x%x (0x%x) x%d p%d 0xXXXXXXXXXXXXXXXX\n", priv, pc, inst, rd, rd)
}
.otherwise {
printf ("%d 0x%x (0x%x)\n", priv, pc, inst)
}
}
when (ll_wen && rf_waddr =/= UInt(0)) {
printf ("x%d p%d 0x%x\n", rf_waddr, rf_waddr, rf_wdata)
}
}
else {
printf("C%d: %d [%d] pc=[%x] W[r%d=%x][%d] R[r%d=%x] R[r%d=%x] inst=[%x] DASM(%x)\n",
io.prci.id, csr.io.time(31,0), wb_valid, wb_reg_pc,
Mux(rf_wen, rf_waddr, UInt(0)), rf_wdata, rf_wen,
wb_reg_inst(19,15), Reg(next=Reg(next=ex_rs(0))),
wb_reg_inst(24,20), Reg(next=Reg(next=ex_rs(1))),
wb_reg_inst, wb_reg_inst)
}
def checkExceptions(x: Seq[(Bool, UInt)]) =
(x.map(_._1).reduce(_||_), PriorityMux(x))
def checkHazards(targets: Seq[(Bool, UInt)], cond: UInt => Bool) =
targets.map(h => h._1 && cond(h._2)).reduce(_||_)
def encodeVirtualAddress(a0: UInt, ea: UInt) = if (vaddrBitsExtended == vaddrBits) ea else {
// efficient means to compress 64-bit VA into vaddrBits+1 bits
// (VA is bad if VA(vaddrBits) != VA(vaddrBits-1))
val a = a0 >> vaddrBits-1
val e = ea(vaddrBits,vaddrBits-1).asSInt
val msb =
Mux(a === UInt(0) || a === UInt(1), e =/= SInt(0),
Mux(a.asSInt === SInt(-1) || a.asSInt === SInt(-2), e === SInt(-1), e(0)))
Cat(msb, ea(vaddrBits-1,0))
}
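// Illustrative only (assuming Sv39, vaddrBits = 39): a canonical user VA has
// bits 63..38 clear, so a === 0 and e === 0 give msb = 0; a canonical kernel
// VA has them all set (a === -1, e === -1) giving msb = 1. Other patterns
// force msb to disagree with bit vaddrBits-1, which marks the VA as bad.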
class Scoreboard(n: Int, zero: Boolean = false)
{
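// set and clear may each be called several times per cycle: every call
// rebuilds _next and re-emits when (ens) { _r := _next }, and last-connect
// semantics leave the register holding the union of all of this cycle's updates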
def set(en: Bool, addr: UInt): Unit = update(en, _next | mask(en, addr))
def clear(en: Bool, addr: UInt): Unit = update(en, _next & ~mask(en, addr))
def read(addr: UInt): Bool = r(addr)
def readBypassed(addr: UInt): Bool = _next(addr)
private val _r = Reg(init=Bits(0, n))
private val r = if (zero) (_r >> 1 << 1) else _r
private var _next = r
private var ens = Bool(false)
private def mask(en: Bool, addr: UInt) = Mux(en, UInt(1) << addr, UInt(0))
private def update(en: Bool, update: UInt) = {
_next = update
ens = ens || en
when (ens) { _r := _next }
}
}
}

View File

@ -0,0 +1,166 @@
package rocket
import Chisel._
import Chisel.ImplicitConversions._
import Util._
import cde.Parameters
import uncore.util._
class ExpandedInstruction extends Bundle {
val bits = UInt(width = 32)
val rd = UInt(width = 5)
val rs1 = UInt(width = 5)
val rs2 = UInt(width = 5)
val rs3 = UInt(width = 5)
}
class RVCDecoder(x: UInt)(implicit p: Parameters) {
def inst(bits: UInt, rd: UInt = x(11,7), rs1: UInt = x(19,15), rs2: UInt = x(24,20), rs3: UInt = x(31,27)) = {
val res = Wire(new ExpandedInstruction)
res.bits := bits
res.rd := rd
res.rs1 := rs1
res.rs2 := rs2
res.rs3 := rs3
res
}
def rs1p = Cat(UInt(1,2), x(9,7))
def rs2p = Cat(UInt(1,2), x(4,2))
def rs2 = x(6,2)
def rd = x(11,7)
def addi4spnImm = Cat(x(10,7), x(12,11), x(5), x(6), UInt(0,2))
def lwImm = Cat(x(5), x(12,10), x(6), UInt(0,2))
def ldImm = Cat(x(6,5), x(12,10), UInt(0,3))
def lwspImm = Cat(x(3,2), x(12), x(6,4), UInt(0,2))
def ldspImm = Cat(x(4,2), x(12), x(6,5), UInt(0,3))
def swspImm = Cat(x(8,7), x(12,9), UInt(0,2))
def sdspImm = Cat(x(9,7), x(12,10), UInt(0,3))
def luiImm = Cat(Fill(15, x(12)), x(6,2), UInt(0,12))
def addi16spImm = Cat(Fill(3, x(12)), x(4,3), x(5), x(2), x(6), UInt(0,4))
def addiImm = Cat(Fill(7, x(12)), x(6,2))
def jImm = Cat(Fill(10, x(12)), x(8), x(10,9), x(6), x(7), x(2), x(11), x(5,3), UInt(0,1))
def bImm = Cat(Fill(5, x(12)), x(6,5), x(2), x(11,10), x(4,3), UInt(0,1))
def shamt = Cat(x(12), x(6,2))
def x0 = UInt(0,5)
def ra = UInt(1,5)
def sp = UInt(2,5)
def q0 = {
def addi4spn = {
val opc = Mux(x(12,5).orR, UInt(0x13,7), UInt(0x1F,7))
inst(Cat(addi4spnImm, sp, UInt(0,3), rs2p, opc), rs2p, sp, rs2p)
}
def ld = inst(Cat(ldImm, rs1p, UInt(3,3), rs2p, UInt(0x03,7)), rs2p, rs1p, rs2p)
def lw = inst(Cat(lwImm, rs1p, UInt(2,3), rs2p, UInt(0x03,7)), rs2p, rs1p, rs2p)
def fld = inst(Cat(ldImm, rs1p, UInt(3,3), rs2p, UInt(0x07,7)), rs2p, rs1p, rs2p)
def flw = {
if (p(XLen) == 32) inst(Cat(lwImm, rs1p, UInt(2,3), rs2p, UInt(0x07,7)), rs2p, rs1p, rs2p)
else ld
}
def unimp = inst(Cat(lwImm >> 5, rs2p, rs1p, UInt(2,3), lwImm(4,0), UInt(0x2F,7)), rs2p, rs1p, rs2p)
def sd = inst(Cat(ldImm >> 5, rs2p, rs1p, UInt(3,3), ldImm(4,0), UInt(0x23,7)), rs2p, rs1p, rs2p)
def sw = inst(Cat(lwImm >> 5, rs2p, rs1p, UInt(2,3), lwImm(4,0), UInt(0x23,7)), rs2p, rs1p, rs2p)
def fsd = inst(Cat(ldImm >> 5, rs2p, rs1p, UInt(3,3), ldImm(4,0), UInt(0x27,7)), rs2p, rs1p, rs2p)
def fsw = {
if (p(XLen) == 32) inst(Cat(lwImm >> 5, rs2p, rs1p, UInt(2,3), lwImm(4,0), UInt(0x27,7)), rs2p, rs1p, rs2p)
else sd
}
Seq(addi4spn, fld, lw, flw, unimp, fsd, sw, fsw)
}
def q1 = {
def addi = inst(Cat(addiImm, rd, UInt(0,3), rd, UInt(0x13,7)), rd, rd, rs2p)
def addiw = {
val opc = Mux(rd.orR, UInt(0x1B,7), UInt(0x1F,7))
inst(Cat(addiImm, rd, UInt(0,3), rd, opc), rd, rd, rs2p)
}
def jal = {
if (p(XLen) == 32) inst(Cat(jImm(20), jImm(10,1), jImm(11), jImm(19,12), ra, UInt(0x6F,7)), ra, rd, rs2p)
else addiw
}
def li = inst(Cat(addiImm, x0, UInt(0,3), rd, UInt(0x13,7)), rd, x0, rs2p)
def addi16sp = {
val opc = Mux(addiImm.orR, UInt(0x13,7), UInt(0x1F,7))
inst(Cat(addi16spImm, rd, UInt(0,3), rd, opc), rd, rd, rs2p)
}
def lui = {
val opc = Mux(addiImm.orR, UInt(0x37,7), UInt(0x3F,7))
val me = inst(Cat(luiImm(31,12), rd, opc), rd, rd, rs2p)
Mux(rd === x0 || rd === sp, addi16sp, me)
}
def j = inst(Cat(jImm(20), jImm(10,1), jImm(11), jImm(19,12), x0, UInt(0x6F,7)), x0, rs1p, rs2p)
def beqz = inst(Cat(bImm(12), bImm(10,5), x0, rs1p, UInt(0,3), bImm(4,1), bImm(11), UInt(0x63,7)), rs1p, rs1p, x0)
def bnez = inst(Cat(bImm(12), bImm(10,5), x0, rs1p, UInt(1,3), bImm(4,1), bImm(11), UInt(0x63,7)), x0, rs1p, x0)
def arith = {
def srli = Cat(shamt, rs1p, UInt(5,3), rs1p, UInt(0x13,7))
def srai = srli | UInt(1 << 30)
def andi = Cat(addiImm, rs1p, UInt(7,3), rs1p, UInt(0x13,7))
def rtype = {
val funct = Seq(0.U, 4.U, 6.U, 7.U, 0.U, 0.U, 2.U, 3.U)(Cat(x(12), x(6,5)))
val sub = Mux(x(6,5) === UInt(0), UInt(1 << 30), UInt(0))
val opc = Mux(x(12), UInt(0x3B,7), UInt(0x33,7))
Cat(rs2p, rs1p, funct, rs1p, opc) | sub
}
inst(Seq(srli, srai, andi, rtype)(x(11,10)), rs1p, rs1p, rs2p)
}
Seq(addi, jal, li, lui, arith, j, beqz, bnez)
}
def q2 = {
def slli = inst(Cat(shamt, rd, UInt(1,3), rd, UInt(0x13,7)), rd, rd, rs2)
def ldsp = inst(Cat(ldspImm, sp, UInt(3,3), rd, UInt(0x03,7)), rd, sp, rs2)
def lwsp = inst(Cat(lwspImm, sp, UInt(2,3), rd, UInt(0x03,7)), rd, sp, rs2)
def fldsp = inst(Cat(ldspImm, sp, UInt(3,3), rd, UInt(0x07,7)), rd, sp, rs2)
def flwsp = {
if (p(XLen) == 32) inst(Cat(lwspImm, sp, UInt(2,3), rd, UInt(0x07,7)), rd, sp, rs2)
else ldsp
}
def sdsp = inst(Cat(sdspImm >> 5, rs2, sp, UInt(3,3), sdspImm(4,0), UInt(0x23,7)), rd, sp, rs2)
def swsp = inst(Cat(swspImm >> 5, rs2, sp, UInt(2,3), swspImm(4,0), UInt(0x23,7)), rd, sp, rs2)
def fsdsp = inst(Cat(sdspImm >> 5, rs2, sp, UInt(3,3), sdspImm(4,0), UInt(0x27,7)), rd, sp, rs2)
def fswsp = {
if (p(XLen) == 32) inst(Cat(swspImm >> 5, rs2, sp, UInt(2,3), swspImm(4,0), UInt(0x27,7)), rd, sp, rs2)
else sdsp
}
def jalr = {
val mv = inst(Cat(rs2, x0, UInt(0,3), rd, UInt(0x33,7)), rd, x0, rs2)
val add = inst(Cat(rs2, rd, UInt(0,3), rd, UInt(0x33,7)), rd, rd, rs2)
val jr = Cat(rs2, rd, UInt(0,3), x0, UInt(0x67,7))
val reserved = Cat(jr >> 7, UInt(0x1F,7))
val jr_reserved = inst(Mux(rd.orR, jr, reserved), x0, rd, rs2)
val jr_mv = Mux(rs2.orR, mv, jr_reserved)
val jalr = Cat(rs2, rd, UInt(0,3), ra, UInt(0x67,7))
val ebreak = Cat(jr >> 7, UInt(0x73,7)) | UInt(1 << 20)
val jalr_ebreak = inst(Mux(rd.orR, jalr, ebreak), ra, rd, rs2)
val jalr_add = Mux(rs2.orR, add, jalr_ebreak)
Mux(x(12), jalr_add, jr_mv)
}
Seq(slli, fldsp, lwsp, flwsp, jalr, fsdsp, swsp, fswsp)
}
def q3 = Seq.fill(8)(passthrough)
def passthrough = inst(x)
def decode = {
val s = q0 ++ q1 ++ q2 ++ q3
s(Cat(x(1,0), x(15,13)))
}
}
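// Worked example (illustrative, not in the original file): C.LI rd, imm sits
// in quadrant 1 with funct3 = 010, so decode indexes s(0b01010) = q1's li,
// which emits ADDI rd, x0, imm with imm = sext(x(12)) ## x(6,2).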
class RVCExpander(implicit p: Parameters) extends Module {
val io = new Bundle {
val in = UInt(INPUT, 32)
val out = new ExpandedInstruction
val rvc = Bool(OUTPUT)
}
if (p(UseCompressed)) {
io.rvc := io.in(1,0) =/= UInt(3)
io.out := new RVCDecoder(io.in).decode
} else {
io.rvc := Bool(false)
io.out := new RVCDecoder(io.in).passthrough
}
}
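// A minimal usage sketch (an assumption, not part of this commit): a fetch
// unit would expand each instruction parcel roughly like
//   val exp = Module(new RVCExpander)
//   exp.io.in := fetch_data(31, 0)
//   val inst = exp.io.out.bits // 32-bit instruction, expanded if exp.io.rvc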

View File

@ -0,0 +1,134 @@
// See LICENSE for license details.
package rocket
import Chisel._
import uncore.tilelink._
import uncore.agents._
import uncore.devices._
import Util._
import cde.{Parameters, Field}
case object BuildRoCC extends Field[Seq[RoccParameters]]
case object NCachedTileLinkPorts extends Field[Int]
case object NUncachedTileLinkPorts extends Field[Int]
case class RoccParameters(
opcodes: OpcodeSet,
generator: Parameters => RoCC,
nMemChannels: Int = 0,
nPTWPorts : Int = 0,
useFPU: Boolean = false)
abstract class Tile(clockSignal: Clock = null, resetSignal: Bool = null)
(implicit p: Parameters) extends Module(Option(clockSignal), Option(resetSignal)) {
val nCachedTileLinkPorts = p(NCachedTileLinkPorts)
val nUncachedTileLinkPorts = p(NUncachedTileLinkPorts)
val dcacheParams = p.alterPartial({ case CacheName => "L1D" })
class TileIO extends Bundle {
val cached = Vec(nCachedTileLinkPorts, new ClientTileLinkIO)
val uncached = Vec(nUncachedTileLinkPorts, new ClientUncachedTileLinkIO)
val prci = new PRCITileIO().flip
}
val io = new TileIO
}
class RocketTile(clockSignal: Clock = null, resetSignal: Bool = null)
(implicit p: Parameters) extends Tile(clockSignal, resetSignal)(p) {
val buildRocc = p(BuildRoCC)
val usingRocc = !buildRocc.isEmpty
val nRocc = buildRocc.size
val nFPUPorts = buildRocc.filter(_.useFPU).size
val core = Module(new Rocket)
val icache = Module(new Frontend()(p.alterPartial({ case CacheName => "L1I" })))
val dcache = HellaCache(p(DCacheKey))(dcacheParams)
val ptwPorts = collection.mutable.ArrayBuffer(icache.io.ptw, dcache.ptw)
val dcPorts = collection.mutable.ArrayBuffer(core.io.dmem)
val uncachedArbPorts = collection.mutable.ArrayBuffer(icache.io.mem)
val uncachedPorts = collection.mutable.ArrayBuffer[ClientUncachedTileLinkIO]()
val cachedPorts = collection.mutable.ArrayBuffer(dcache.mem)
core.io.prci <> io.prci
icache.io.cpu <> core.io.imem
val fpuOpt = p(FPUKey).map(cfg => Module(new FPU(cfg)))
fpuOpt.foreach(fpu => core.io.fpu <> fpu.io)
if (usingRocc) {
val respArb = Module(new RRArbiter(new RoCCResponse, nRocc))
core.io.rocc.resp <> respArb.io.out
val roccOpcodes = buildRocc.map(_.opcodes)
val cmdRouter = Module(new RoccCommandRouter(roccOpcodes))
cmdRouter.io.in <> core.io.rocc.cmd
val roccs = buildRocc.zipWithIndex.map { case (accelParams, i) =>
val rocc = accelParams.generator(p.alterPartial({
case RoccNMemChannels => accelParams.nMemChannels
case RoccNPTWPorts => accelParams.nPTWPorts
}))
val dcIF = Module(new SimpleHellaCacheIF()(dcacheParams))
rocc.io.cmd <> cmdRouter.io.out(i)
rocc.io.exception := core.io.rocc.exception
dcIF.io.requestor <> rocc.io.mem
dcPorts += dcIF.io.cache
uncachedArbPorts += rocc.io.autl
rocc
}
if (nFPUPorts > 0) {
fpuOpt.foreach { fpu =>
val fpArb = Module(new InOrderArbiter(new FPInput, new FPResult, nFPUPorts))
val fp_roccs = roccs.zip(buildRocc)
.filter { case (_, params) => params.useFPU }
.map { case (rocc, _) => rocc.io }
fpArb.io.in_req <> fp_roccs.map(_.fpu_req)
fp_roccs.zip(fpArb.io.in_resp).foreach {
case (rocc, fpu_resp) => rocc.fpu_resp <> fpu_resp
}
fpu.io.cp_req <> fpArb.io.out_req
fpArb.io.out_resp <> fpu.io.cp_resp
}
}
core.io.rocc.busy := cmdRouter.io.busy || roccs.map(_.io.busy).reduce(_ || _)
core.io.rocc.interrupt := roccs.map(_.io.interrupt).reduce(_ || _)
respArb.io.in <> roccs.map(rocc => Queue(rocc.io.resp))
ptwPorts ++= roccs.flatMap(_.io.ptw)
uncachedPorts ++= roccs.flatMap(_.io.utl)
}
val uncachedArb = Module(new ClientUncachedTileLinkIOArbiter(uncachedArbPorts.size))
uncachedArb.io.in <> uncachedArbPorts
uncachedArb.io.out +=: uncachedPorts
// Connect the caches and RoCC to the outer memory system
io.uncached <> uncachedPorts
io.cached <> cachedPorts
// TODO remove nCached/nUncachedTileLinkPorts parameters and these assertions
require(uncachedPorts.size == nUncachedTileLinkPorts)
require(cachedPorts.size == nCachedTileLinkPorts)
if (p(UseVM)) {
val ptw = Module(new PTW(ptwPorts.size)(dcacheParams))
ptw.io.requestor <> ptwPorts
ptw.io.mem +=: dcPorts
core.io.ptw <> ptw.io.dpath
}
require(dcPorts.size == core.dcacheArbPorts)
val dcArb = Module(new HellaCacheArbiter(dcPorts.size)(dcacheParams))
dcArb.io.requestor <> dcPorts
dcache.cpu <> dcArb.io.mem
if (nFPUPorts == 0) {
fpuOpt.foreach { fpu =>
fpu.io.cp_req.valid := Bool(false)
fpu.io.cp_resp.ready := Bool(false)
}
}
}

View File

@ -0,0 +1,193 @@
// See LICENSE for license details.
package rocket
import Chisel._
import Util._
import junctions._
import scala.math._
import cde.{Parameters, Field}
import uncore.agents.PseudoLRU
import uncore.coherence._
import uncore.util._
case object PgLevels extends Field[Int]
case object ASIdBits extends Field[Int]
case object NTLBEntries extends Field[Int]
trait HasTLBParameters extends HasCoreParameters {
val entries = p(NTLBEntries)
val camAddrBits = log2Ceil(entries)
val camTagBits = asIdBits + vpnBits
}
class TLBReq(implicit p: Parameters) extends CoreBundle()(p) {
val vpn = UInt(width = vpnBitsExtended)
val passthrough = Bool()
val instruction = Bool()
val store = Bool()
}
class TLBResp(implicit p: Parameters) extends CoreBundle()(p) {
// lookup responses
val miss = Bool(OUTPUT)
val ppn = UInt(OUTPUT, ppnBits)
val xcpt_ld = Bool(OUTPUT)
val xcpt_st = Bool(OUTPUT)
val xcpt_if = Bool(OUTPUT)
val cacheable = Bool(OUTPUT)
}
class TLB(implicit val p: Parameters) extends Module with HasTLBParameters {
val io = new Bundle {
val req = Decoupled(new TLBReq).flip
val resp = new TLBResp
val ptw = new TLBPTWIO
}
val valid = Reg(init = UInt(0, entries))
val ppns = Reg(Vec(entries, UInt(width = ppnBits)))
val tags = Reg(Vec(entries, UInt(width = asIdBits + vpnBits)))
val s_ready :: s_request :: s_wait :: s_wait_invalidate :: Nil = Enum(UInt(), 4)
val state = Reg(init=s_ready)
val r_refill_tag = Reg(UInt(width = asIdBits + vpnBits))
val r_refill_waddr = Reg(UInt(width = log2Ceil(entries)))
val r_req = Reg(new TLBReq)
val do_mprv = io.ptw.status.mprv && !io.req.bits.instruction
val priv = Mux(do_mprv, io.ptw.status.mpp, io.ptw.status.prv)
val priv_s = priv === PRV.S
val priv_uses_vm = priv <= PRV.S && !io.ptw.status.debug
// share a single physical memory attribute checker (unshare if critical path)
val passthrough_ppn = io.req.bits.vpn(ppnBits-1, 0)
val refill_ppn = io.ptw.resp.bits.pte.ppn(ppnBits-1, 0)
val do_refill = Bool(usingVM) && io.ptw.resp.valid
val mpu_ppn = Mux(do_refill, refill_ppn, passthrough_ppn)
val prot = addrMap.getProt(mpu_ppn << pgIdxBits)
val cacheable = addrMap.isCacheable(mpu_ppn << pgIdxBits)
def pgaligned(r: MemRegion) = {
val pgsize = 1 << pgIdxBits
(r.start % pgsize) == 0 && (r.size % pgsize) == 0
}
require(addrMap.flatten.forall(e => pgaligned(e.region)),
"MemoryMap regions must be page-aligned")
val lookup_tag = Cat(io.ptw.ptbr.asid, io.req.bits.vpn(vpnBits-1,0))
val vm_enabled = Bool(usingVM) && io.ptw.status.vm(3) && priv_uses_vm && !io.req.bits.passthrough
val hitsVec = (0 until entries).map(i => valid(i) && vm_enabled && tags(i) === lookup_tag) :+ !vm_enabled
val hits = hitsVec.asUInt
// permission bit arrays
val pte_array = Reg(new PTE)
val u_array = Reg(UInt(width = entries)) // user permission
val sw_array = Reg(UInt(width = entries)) // write permission
val sx_array = Reg(UInt(width = entries)) // execute permission
val sr_array = Reg(UInt(width = entries)) // read permission
val xr_array = Reg(UInt(width = entries)) // read permission to executable page
val cash_array = Reg(UInt(width = entries)) // cacheable
val dirty_array = Reg(UInt(width = entries)) // PTE dirty bit
when (do_refill) {
val pte = io.ptw.resp.bits.pte
ppns(r_refill_waddr) := pte.ppn
tags(r_refill_waddr) := r_refill_tag
val mask = UIntToOH(r_refill_waddr)
valid := valid | mask
u_array := Mux(pte.u, u_array | mask, u_array & ~mask)
sw_array := Mux(pte.sw() && prot.w, sw_array | mask, sw_array & ~mask)
sx_array := Mux(pte.sx() && prot.x, sx_array | mask, sx_array & ~mask)
sr_array := Mux(pte.sr() && prot.r, sr_array | mask, sr_array & ~mask)
xr_array := Mux(pte.sx() && prot.r, xr_array | mask, xr_array & ~mask)
cash_array := Mux(cacheable, cash_array | mask, cash_array & ~mask)
dirty_array := Mux(pte.d, dirty_array | mask, dirty_array & ~mask)
}
val plru = new PseudoLRU(entries)
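// refill prefers a free (invalid) entry; only when all entries are valid
// does it fall back to the pseudo-LRU victim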
val repl_waddr = Mux(!valid.andR, PriorityEncoder(~valid), plru.replace)
val priv_ok = Mux(priv_s, ~Mux(io.ptw.status.pum, u_array, UInt(0)), u_array)
val w_array = Cat(prot.w, priv_ok & sw_array)
val x_array = Cat(prot.x, priv_ok & sx_array)
val r_array = Cat(prot.r, priv_ok & (sr_array | Mux(io.ptw.status.mxr, xr_array, UInt(0))))
val c_array = Cat(cacheable, cash_array)
val bad_va =
if (vpnBits == vpnBitsExtended) Bool(false)
else io.req.bits.vpn(vpnBits) =/= io.req.bits.vpn(vpnBits-1)
// it's only a store hit if the dirty bit is set
val tlb_hits = hits(entries-1, 0) & (dirty_array | ~Mux(io.req.bits.store, w_array, UInt(0)))
val tlb_hit = tlb_hits.orR
val tlb_miss = vm_enabled && !bad_va && !tlb_hit
when (io.req.valid && !tlb_miss) {
plru.access(OHToUInt(hits(entries-1, 0)))
}
io.req.ready := state === s_ready
io.resp.xcpt_ld := bad_va || (~r_array & hits).orR
io.resp.xcpt_st := bad_va || (~w_array & hits).orR
io.resp.xcpt_if := bad_va || (~x_array & hits).orR
io.resp.cacheable := (c_array & hits).orR
io.resp.miss := do_refill || tlb_miss
io.resp.ppn := Mux1H(hitsVec, ppns :+ passthrough_ppn)
io.ptw.req.valid := state === s_request
io.ptw.req.bits := io.ptw.status
io.ptw.req.bits.addr := r_refill_tag
io.ptw.req.bits.store := r_req.store
io.ptw.req.bits.fetch := r_req.instruction
if (usingVM) {
when (io.req.fire() && tlb_miss) {
state := s_request
r_refill_tag := lookup_tag
r_refill_waddr := repl_waddr
r_req := io.req.bits
}
when (state === s_request) {
when (io.ptw.invalidate) {
state := s_ready
}
when (io.ptw.req.ready) {
state := s_wait
when (io.ptw.invalidate) { state := s_wait_invalidate }
}
}
when (state === s_wait && io.ptw.invalidate) {
state := s_wait_invalidate
}
when (io.ptw.resp.valid) {
state := s_ready
}
when (io.ptw.invalidate) {
valid := 0
}
}
}
class DecoupledTLB(implicit p: Parameters) extends Module {
val io = new Bundle {
val req = Decoupled(new TLBReq).flip
val resp = Decoupled(new TLBResp)
val ptw = new TLBPTWIO
}
val reqq = Queue(io.req)
val tlb = Module(new TLB)
val resp_helper = DecoupledHelper(
reqq.valid, tlb.io.req.ready, io.resp.ready)
val tlb_miss = tlb.io.resp.miss
tlb.io.req.valid := resp_helper.fire(tlb.io.req.ready)
tlb.io.req.bits := reqq.bits
reqq.ready := resp_helper.fire(reqq.valid, !tlb_miss)
io.resp.valid := resp_helper.fire(io.resp.ready, !tlb_miss)
io.resp.bits := tlb.io.resp
io.ptw <> tlb.io.ptw
}
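// Illustrative note on the DecoupledHelper pattern above: fire(x, extra*)
// ANDs every registered signal except x, plus any extras, so e.g.
// tlb.io.req.valid is reqq.valid && io.resp.ready, and the !tlb_miss extra
// term holds off both the dequeue and the response until the refill returns.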

View File

@ -0,0 +1,159 @@
// See LICENSE for license details.
package rocket
import Chisel._
import uncore.util._
import scala.math._
import cde.{Parameters, Field}
object Util {
implicit def uintToBitPat(x: UInt): BitPat = BitPat(x)
implicit def intToUInt(x: Int): UInt = UInt(x)
implicit def bigIntToUInt(x: BigInt): UInt = UInt(x)
implicit def booleanToBool(x: Boolean): Bits = Bool(x)
implicit def intSeqToUIntSeq(x: Seq[Int]): Seq[UInt] = x.map(UInt(_))
implicit def wcToUInt(c: WideCounter): UInt = c.value
implicit class UIntToAugmentedUInt(val x: UInt) extends AnyVal {
def sextTo(n: Int): UInt =
if (x.getWidth == n) x
else Cat(Fill(n - x.getWidth, x(x.getWidth-1)), x)
def extract(hi: Int, lo: Int): UInt = {
if (hi == lo-1) UInt(0)
else x(hi, lo)
}
}
implicit def booleanToIntConv(x: Boolean) = new AnyRef {
def toInt: Int = if (x) 1 else 0
}
}
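// Illustrative examples (not in the original file): with these in scope,
// UInt("b1010").sextTo(8) yields "b11111010" (MSB replicated), while
// x.extract(3, 4) yields UInt(0), guarding against an empty hi < lo slice.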
import Util._
object Str
{
def apply(s: String): UInt = {
var i = BigInt(0)
require(s.forall(validChar _))
for (c <- s)
i = (i << 8) | c
UInt(i, s.length*8)
}
def apply(x: Char): UInt = {
require(validChar(x))
UInt(x.toInt, 8)
}
def apply(x: UInt): UInt = apply(x, 10)
def apply(x: UInt, radix: Int): UInt = {
val rad = UInt(radix)
val w = x.getWidth
require(w > 0)
var q = x
var s = digit(q % rad)
for (i <- 1 until ceil(log(2)/log(radix)*w).toInt) {
q = q / rad
s = Cat(Mux(Bool(radix == 10) && q === UInt(0), Str(' '), digit(q % rad)), s)
}
s
}
def apply(x: SInt): UInt = apply(x, 10)
def apply(x: SInt, radix: Int): UInt = {
val neg = x < SInt(0)
val abs = x.abs
if (radix != 10) {
Cat(Mux(neg, Str('-'), Str(' ')), Str(abs, radix))
} else {
val rad = UInt(radix)
val w = abs.getWidth
require(w > 0)
var q = abs
var s = digit(q % rad)
var needSign = neg
for (i <- 1 until ceil(log(2)/log(radix)*w).toInt) {
q = q / rad
val placeSpace = q === UInt(0)
val space = Mux(needSign, Str('-'), Str(' '))
needSign = needSign && !placeSpace
s = Cat(Mux(placeSpace, space, digit(q % rad)), s)
}
Cat(Mux(needSign, Str('-'), Str(' ')), s)
}
}
private def digit(d: UInt): UInt = Mux(d < UInt(10), Str('0')+d, Str(('a'-10).toChar)+d)(7,0)
private def validChar(x: Char) = x == (x & 0xFF)
}
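// Illustrative: Str("ok") packs the ASCII bytes into UInt(0x6F6B, 16), and
// Str(UInt(42)) renders the decimal digits "42", left-padded with spaces to
// the operand's worst-case width, which is handy inside printf.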
object Split
{
// is there a better way to do this?
def apply(x: Bits, n0: Int) = {
val w = checkWidth(x, n0)
(x(w-1,n0), x(n0-1,0))
}
def apply(x: Bits, n1: Int, n0: Int) = {
val w = checkWidth(x, n1, n0)
(x(w-1,n1), x(n1-1,n0), x(n0-1,0))
}
def apply(x: Bits, n2: Int, n1: Int, n0: Int) = {
val w = checkWidth(x, n2, n1, n0)
(x(w-1,n2), x(n2-1,n1), x(n1-1,n0), x(n0-1,0))
}
private def checkWidth(x: Bits, n: Int*) = {
val w = x.getWidth
def decreasing(x: Seq[Int]): Boolean =
if (x.tail.isEmpty) true
else x.head >= x.tail.head && decreasing(x.tail)
require(decreasing(w :: n.toList))
w
}
}
// a counter that clock gates most of its MSBs using the LSB carry-out
case class WideCounter(width: Int, inc: UInt = UInt(1))
{
private val isWide = width > 2*inc.getWidth
private val smallWidth = if (isWide) inc.getWidth max log2Up(width) else width
private val small = Reg(init=UInt(0, smallWidth))
private val nextSmall = small +& inc
small := nextSmall
private val large = if (isWide) {
val r = Reg(init=UInt(0, width - smallWidth))
when (nextSmall(smallWidth)) { r := r + UInt(1) }
r
} else null
val value = if (isWide) Cat(large, small) else small
def := (x: UInt) = {
small := x
if (isWide) large := x >> smallWidth
}
}
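// Illustrative: WideCounter(64) splits into a 6-bit low half (smallWidth =
// log2Up(64)) that increments every cycle and a 58-bit high half that only
// ticks on the low half's carry-out, keeping most MSB flops gated.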
object Random
{
def apply(mod: Int, random: UInt): UInt = {
if (isPow2(mod)) random(log2Up(mod)-1,0)
else PriorityEncoder(partition(apply(1 << log2Up(mod*8), random), mod))
}
def apply(mod: Int): UInt = apply(mod, randomizer)
def oneHot(mod: Int, random: UInt): UInt = {
if (isPow2(mod)) UIntToOH(random(log2Up(mod)-1,0))
else PriorityEncoderOH(partition(apply(1 << log2Up(mod*8), random), mod)).asUInt
}
def oneHot(mod: Int): UInt = oneHot(mod, randomizer)
private def randomizer = LFSR16()
private def round(x: Double): Int =
if (x.toInt.toDouble == x) x.toInt else (x.toInt + 1) & -2
private def partition(value: UInt, slices: Int) =
Seq.tabulate(slices)(i => value < round((i << value.getWidth).toDouble / slices))
}

View File

@ -0,0 +1,117 @@
package uncore
import Chisel._
import cde.{Config, Parameters, ParameterDump, Knob, Dump}
import junctions.PAddrBits
import uncore.tilelink._
import uncore.agents._
import uncore.coherence._
object UncoreBuilder extends App {
val topModuleName = args(0)
val configClassName = args(1)
val config = try {
Class.forName(s"uncore.$configClassName").newInstance.asInstanceOf[Config]
} catch {
case e: java.lang.ClassNotFoundException =>
throwException("Unable to find configClassName \"" + configClassName +
"\", did you misspell it?", e)
}
val world = config.toInstance
val paramsFromConfig: Parameters = Parameters.root(world)
val gen = () =>
Class.forName(s"uncore.$topModuleName")
.getConstructor(classOf[cde.Parameters])
.newInstance(paramsFromConfig)
.asInstanceOf[Module]
chiselMain.run(args.drop(2), gen)
val pdFile = new java.io.FileWriter(s"${Driver.targetDir}/$topModuleName.prm")
pdFile.write(ParameterDump.getDump)
pdFile.close
}
class DefaultL2Config extends Config (
topDefinitions = { (pname,site,here) =>
pname match {
case PAddrBits => 32
case CacheId => 0
case CacheName => "L2Bank"
case TLId => "L1toL2"
case InnerTLId => "L1toL2"
case OuterTLId => "L2toMC"
case "N_CACHED" => Dump("N_CACHED",here[Int]("CACHED_CLIENTS_PER_PORT"))
case "N_UNCACHED" => Dump("N_UNCACHED",here[Int]("MAX_CLIENTS_PER_PORT") - here[Int]("N_CACHED"))
case "MAX_CLIENT_XACTS" => 4
case "MAX_CLIENTS_PER_PORT" => Knob("NTILES")
case "CACHED_CLIENTS_PER_PORT" => Knob("N_CACHED_TILES")
case TLKey("L1toL2") =>
TileLinkParameters(
coherencePolicy = new MESICoherence(site(L2DirectoryRepresentation)),
nManagers = 1,
nCachingClients = here[Int]("N_CACHED"),
nCachelessClients = here[Int]("N_UNCACHED"),
maxClientXacts = here[Int]("MAX_CLIENT_XACTS"),
maxClientsPerPort = here[Int]("MAX_CLIENTS_PER_PORT"),
maxManagerXacts = site(NAcquireTransactors) + 2,
dataBits = site(CacheBlockBytes)*8,
dataBeats = 2)
case TLKey("L2toMC") =>
TileLinkParameters(
coherencePolicy = new MEICoherence(new NullRepresentation(1)),
nManagers = 1,
nCachingClients = 1,
nCachelessClients = 0,
maxClientXacts = 1,
maxClientsPerPort = site(NAcquireTransactors) + 2,
maxManagerXacts = 1,
dataBits = site(CacheBlockBytes)*8,
dataBeats = 2)
case CacheBlockBytes => 64
case CacheBlockOffsetBits => log2Up(here(CacheBlockBytes))
case "L2_SETS" => Knob("L2_SETS")
case NSets => Dump("L2_SETS",here[Int]("L2_SETS"))
case NWays => Knob("L2_WAYS")
case RowBits => site(TLKey(site(TLId))).dataBitsPerBeat
case CacheIdBits => Dump("CACHE_ID_BITS",1)
case L2StoreDataQueueDepth => 1
case NAcquireTransactors => Dump("N_ACQUIRE_TRANSACTORS",2)
case NSecondaryMisses => 4
case L2DirectoryRepresentation => new FullRepresentation(here[Int]("N_CACHED"))
case L2Replacer => () => new SeqRandom(site(NWays))
case ECCCode => None
case AmoAluOperandBits => 64
case SplitMetadata => false
// case XLen => 128
}},
knobValues = {
case "L2_WAYS" => 1
case "L2_SETS" => 1024
case "NTILES" => 2
case "N_CACHED_TILES" => 2
case "L2_CAPACITY_IN_KB" => 256
}
)
class WithPLRU extends Config(
(pname, site, here) => pname match {
case L2Replacer => () => new SeqPLRU(site(NSets), site(NWays))
})
class PLRUL2Config extends Config(new WithPLRU ++ new DefaultL2Config)
class With1L2Ways extends Config(knobValues = { case "L2_WAYS" => 1 })
class With2L2Ways extends Config(knobValues = { case "L2_WAYS" => 2 })
class With4L2Ways extends Config(knobValues = { case "L2_WAYS" => 4 })
class With1Cached extends Config(knobValues = { case "N_CACHED_TILES" => 1 })
class With2Cached extends Config(knobValues = { case "N_CACHED_TILES" => 2 })
class W1Cached1WaysConfig extends Config(new With1L2Ways ++ new With1Cached ++ new DefaultL2Config)
class W1Cached2WaysConfig extends Config(new With2L2Ways ++ new With1Cached ++ new DefaultL2Config)
class W2Cached1WaysConfig extends Config(new With1L2Ways ++ new With2Cached ++ new DefaultL2Config)
class W2Cached2WaysConfig extends Config(new With2L2Ways ++ new With2Cached ++ new DefaultL2Config)
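// Further combinations compose the same way; for example (a sketch, not part
// of this commit):
// class PLRU4WaysConfig extends Config(new WithPLRU ++ new With4L2Ways ++ new DefaultL2Config)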

View File

@ -0,0 +1,39 @@
// See LICENSE for license details.
package uncore
package constants
import Chisel._
object MemoryOpConstants extends MemoryOpConstants
trait MemoryOpConstants {
val NUM_XA_OPS = 9
val M_SZ = 5
val M_X = BitPat("b?????")
val M_XRD = UInt("b00000") // int load
val M_XWR = UInt("b00001") // int store
val M_PFR = UInt("b00010") // prefetch with intent to read
val M_PFW = UInt("b00011") // prefetch with intent to write
val M_XA_SWAP = UInt("b00100")
val M_FLUSH_ALL = UInt("b00101") // flush all lines
val M_XLR = UInt("b00110")
val M_XSC = UInt("b00111")
val M_XA_ADD = UInt("b01000")
val M_XA_XOR = UInt("b01001")
val M_XA_OR = UInt("b01010")
val M_XA_AND = UInt("b01011")
val M_XA_MIN = UInt("b01100")
val M_XA_MAX = UInt("b01101")
val M_XA_MINU = UInt("b01110")
val M_XA_MAXU = UInt("b01111")
val M_FLUSH = UInt("b10000") // write back dirty data and cede R/W permissions
val M_PRODUCE = UInt("b10001") // write back dirty data and cede W permissions
val M_CLEAN = UInt("b10011") // write back dirty data and retain R/W permissions
def isAMO(cmd: UInt) = cmd(3) || cmd === M_XA_SWAP
def isPrefetch(cmd: UInt) = cmd === M_PFR || cmd === M_PFW
def isRead(cmd: UInt) = cmd === M_XRD || cmd === M_XLR || cmd === M_XSC || isAMO(cmd)
def isWrite(cmd: UInt) = cmd === M_XWR || cmd === M_XSC || isAMO(cmd)
def isWriteIntent(cmd: UInt) = isWrite(cmd) || cmd === M_PFW || cmd === M_XLR
}
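// Encoding sanity checks (illustrative): M_XA_ADD = "b01000" has bit 3 set,
// so isAMO is true; M_XLR = "b00110" satisfies both isRead and isWriteIntent,
// since a load-reserved must acquire the line with write permission.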

View File

@ -0,0 +1,4 @@
// See LICENSE for license details.
package uncore
package object constants extends uncore.constants.MemoryOpConstants

View File

@ -0,0 +1,162 @@
// See LICENSE for license details.
package uncore.agents
import Chisel._
import cde.{Parameters, Field}
import junctions._
import uncore.tilelink._
import uncore.converters._
import uncore.coherence._
import uncore.util._
case object NReleaseTransactors extends Field[Int]
case object NProbeTransactors extends Field[Int]
case object NAcquireTransactors extends Field[Int]
trait HasCoherenceAgentParameters {
implicit val p: Parameters
val nReleaseTransactors = 1
val nAcquireTransactors = p(NAcquireTransactors)
val nTransactors = nReleaseTransactors + nAcquireTransactors
val blockAddrBits = p(PAddrBits) - p(CacheBlockOffsetBits)
val outerTLId = p(OuterTLId)
val outerTLParams = p(TLKey(outerTLId))
val outerDataBeats = outerTLParams.dataBeats
val outerDataBits = outerTLParams.dataBitsPerBeat
val outerBeatAddrBits = log2Up(outerDataBeats)
val outerByteAddrBits = log2Up(outerDataBits/8)
val outerWriteMaskBits = outerTLParams.writeMaskBits
val innerTLId = p(InnerTLId)
val innerTLParams = p(TLKey(innerTLId))
val innerDataBeats = innerTLParams.dataBeats
val innerDataBits = innerTLParams.dataBitsPerBeat
val innerWriteMaskBits = innerTLParams.writeMaskBits
val innerBeatAddrBits = log2Up(innerDataBeats)
val innerByteAddrBits = log2Up(innerDataBits/8)
val innerNCachingClients = innerTLParams.nCachingClients
val maxManagerXacts = innerTLParams.maxManagerXacts
require(outerDataBeats == innerDataBeats) //TODO: fix all xact_data Vecs to remove this requirement
}
abstract class CoherenceAgentModule(implicit val p: Parameters) extends Module
with HasCoherenceAgentParameters
abstract class CoherenceAgentBundle(implicit val p: Parameters) extends junctions.ParameterizedBundle()(p)
with HasCoherenceAgentParameters
trait HasCoherenceAgentWiringHelpers {
def doOutputArbitration[T <: TileLinkChannel](
out: DecoupledIO[T],
ins: Seq[DecoupledIO[T]]) {
def lock(o: T) = o.hasMultibeatData()
val arb = Module(new LockingRRArbiter(out.bits, ins.size, out.bits.tlDataBeats, Some(lock _)))
out <> arb.io.out
arb.io.in <> ins
}
def doInputRouting[T <: Bundle with HasManagerTransactionId](
in: DecoupledIO[T],
outs: Seq[DecoupledIO[T]]) {
val idx = in.bits.manager_xact_id
outs.map(_.bits := in.bits)
outs.zipWithIndex.map { case (o,i) => o.valid := in.valid && idx === UInt(i) }
in.ready := outs.map(_.ready).apply(idx)
}
/** Broadcasts valid messages on this channel to all trackers,
* but includes logic to allocate a new tracker in the case where
* no previously allocated tracker matches the new req's addr.
*
* When a match is reported, if ready is high the new transaction
* is merged; when ready is low the transaction is being blocked.
* When no match is reported, any high idles are presumed to be
* from trackers that are available for allocation, and one is
* assigned via alloc based on priority; if no idles are high then
* all trackers are busy with other transactions. If idle is high
* but ready is low, the tracker will be allocated but does not
* have sufficient buffering for the data.
*/
def doInputRoutingWithAllocation[T <: TileLinkChannel with HasTileLinkData](
in: DecoupledIO[T],
outs: Seq[DecoupledIO[T]],
allocs: Seq[TrackerAllocation],
dataOverrides: Option[Seq[UInt]] = None,
allocOverride: Option[Bool] = None,
matchOverride: Option[Bool] = None) {
val ready_bits = outs.map(_.ready).asUInt
val can_alloc_bits = allocs.map(_.can).asUInt
val should_alloc_bits = PriorityEncoderOH(can_alloc_bits)
val match_bits = allocs.map(_.matches).asUInt
val no_matches = !match_bits.orR
val alloc_ok = allocOverride.getOrElse(Bool(true))
val match_ok = matchOverride.getOrElse(Bool(true))
in.ready := (Mux(no_matches, can_alloc_bits, match_bits) & ready_bits).orR && alloc_ok && match_ok
outs.zip(allocs).zipWithIndex.foreach { case((out, alloc), i) =>
out.valid := in.valid && match_ok && alloc_ok
out.bits := in.bits
dataOverrides foreach { d => out.bits.data := d(i) }
alloc.should := should_alloc_bits(i) && no_matches && alloc_ok
}
}
}
trait HasInnerTLIO extends HasCoherenceAgentParameters {
val inner = new ManagerTileLinkIO()(p.alterPartial({case TLId => p(InnerTLId)}))
val incoherent = Vec(inner.tlNCachingClients, Bool()).asInput
def iacq(dummy: Int = 0) = inner.acquire.bits
def iprb(dummy: Int = 0) = inner.probe.bits
def irel(dummy: Int = 0) = inner.release.bits
def ignt(dummy: Int = 0) = inner.grant.bits
def ifin(dummy: Int = 0) = inner.finish.bits
}
trait HasUncachedOuterTLIO extends HasCoherenceAgentParameters {
val outer = new ClientUncachedTileLinkIO()(p.alterPartial({case TLId => p(OuterTLId)}))
def oacq(dummy: Int = 0) = outer.acquire.bits
def ognt(dummy: Int = 0) = outer.grant.bits
}
trait HasCachedOuterTLIO extends HasCoherenceAgentParameters {
val outer = new ClientTileLinkIO()(p.alterPartial({case TLId => p(OuterTLId)}))
def oacq(dummy: Int = 0) = outer.acquire.bits
def oprb(dummy: Int = 0) = outer.probe.bits
def orel(dummy: Int = 0) = outer.release.bits
def ognt(dummy: Int = 0) = outer.grant.bits
}
class ManagerTLIO(implicit p: Parameters) extends CoherenceAgentBundle()(p)
with HasInnerTLIO
with HasUncachedOuterTLIO
abstract class CoherenceAgent(implicit p: Parameters) extends CoherenceAgentModule()(p) {
def innerTL: ManagerTileLinkIO
def outerTL: ClientTileLinkIO
def incoherent: Vec[Bool]
}
abstract class ManagerCoherenceAgent(implicit p: Parameters) extends CoherenceAgent()(p)
with HasCoherenceAgentWiringHelpers {
val io = new ManagerTLIO
def innerTL = io.inner
def outerTL = TileLinkIOWrapper(io.outer)(p.alterPartial({case TLId => p(OuterTLId)}))
def incoherent = io.incoherent
}
class HierarchicalTLIO(implicit p: Parameters) extends CoherenceAgentBundle()(p)
with HasInnerTLIO
with HasCachedOuterTLIO
abstract class HierarchicalCoherenceAgent(implicit p: Parameters) extends CoherenceAgent()(p)
with HasCoherenceAgentWiringHelpers {
val io = new HierarchicalTLIO
def innerTL = io.inner
def outerTL = io.outer
def incoherent = io.incoherent
// TODO: Remove this function (and all its calls) when we support probing the L2
def disconnectOuterProbeAndFinish() {
io.outer.probe.ready := Bool(false)
io.outer.finish.valid := Bool(false)
assert(!io.outer.probe.valid, "L2 agent got illegal probe")
}
}

View File

@ -0,0 +1,204 @@
// See LICENSE for license details.
package uncore.agents
import Chisel._
import uncore.coherence._
import uncore.tilelink._
import uncore.constants._
import uncore.util._
import cde.Parameters
class L2BroadcastHub(implicit p: Parameters) extends HierarchicalCoherenceAgent()(p) {
// Create TSHRs for outstanding transactions
val irelTrackerList =
(0 until nReleaseTransactors).map(id =>
Module(new BufferedBroadcastVoluntaryReleaseTracker(id)))
val iacqTrackerList =
(nReleaseTransactors until nTransactors).map(id =>
Module(new BufferedBroadcastAcquireTracker(id)))
val trackerList = irelTrackerList ++ iacqTrackerList
// Propagate incoherence flags
trackerList.map(_.io.incoherent) foreach { _ := io.incoherent }
// Create an arbiter for the one memory port
val outerList = trackerList.map(_.io.outer)
val outer_arb = Module(new ClientTileLinkIOArbiter(outerList.size)
(p.alterPartial({ case TLId => p(OuterTLId) })))
outer_arb.io.in <> outerList
io.outer <> outer_arb.io.out
// Handle acquire transaction initiation
val irel_vs_iacq_conflict =
io.inner.acquire.valid &&
io.inner.release.valid &&
io.irel().conflicts(io.iacq())
doInputRoutingWithAllocation(
in = io.inner.acquire,
outs = trackerList.map(_.io.inner.acquire),
allocs = trackerList.map(_.io.alloc.iacq),
allocOverride = Some(!irel_vs_iacq_conflict))
// Handle releases, which might be voluntary and might have data
doInputRoutingWithAllocation(
in = io.inner.release,
outs = trackerList.map(_.io.inner.release),
allocs = trackerList.map(_.io.alloc.irel))
// Wire probe requests and grant reply to clients, finish acks from clients
doOutputArbitration(io.inner.probe, trackerList.map(_.io.inner.probe))
doOutputArbitration(io.inner.grant, trackerList.map(_.io.inner.grant))
doInputRouting(io.inner.finish, trackerList.map(_.io.inner.finish))
disconnectOuterProbeAndFinish()
}
class BroadcastXactTracker(implicit p: Parameters) extends XactTracker()(p) {
val io = new HierarchicalXactTrackerIO
pinAllReadyValidLow(io)
}
trait BroadcastsToAllClients extends HasCoherenceAgentParameters {
val coh = HierarchicalMetadata.onReset
val inner_coh = coh.inner
val outer_coh = coh.outer
def full_representation = ~UInt(0, width = innerNCachingClients)
}
abstract class BroadcastVoluntaryReleaseTracker(trackerId: Int)(implicit p: Parameters)
extends VoluntaryReleaseTracker(trackerId)(p)
with EmitsVoluntaryReleases
with BroadcastsToAllClients {
val io = new HierarchicalXactTrackerIO
pinAllReadyValidLow(io)
// Checks for illegal behavior
assert(!(state === s_idle && io.inner.release.fire() && io.alloc.irel.should && !io.irel().isVoluntary()),
"VoluntaryReleaseTracker accepted Release that wasn't voluntary!")
}
abstract class BroadcastAcquireTracker(trackerId: Int)(implicit p: Parameters)
extends AcquireTracker(trackerId)(p)
with EmitsVoluntaryReleases
with BroadcastsToAllClients {
val io = new HierarchicalXactTrackerIO
pinAllReadyValidLow(io)
val alwaysWriteFullBeat = false
val nSecondaryMisses = 1
def iacq_can_merge = Bool(false)
// Checks for illegal behavior
// TODO: this could be allowed, but is a useful check against allocation gone wild
assert(!(state === s_idle && io.inner.acquire.fire() && io.alloc.iacq.should &&
io.iacq().hasMultibeatData() && !io.iacq().first()),
"AcquireTracker initialized with a tail data beat.")
assert(!(state =/= s_idle && pending_ignt && xact_iacq.isPrefetch()),
"Broadcast Hub does not support Prefetches.")
assert(!(state =/= s_idle && pending_ignt && xact_iacq.isAtomic()),
"Broadcast Hub does not support PutAtomics.")
}
class BufferedBroadcastVoluntaryReleaseTracker(trackerId: Int)(implicit p: Parameters)
extends BroadcastVoluntaryReleaseTracker(trackerId)(p)
with HasDataBuffer {
// Tell the parent if any incoming messages conflict with the ongoing transaction
routeInParent(irelCanAlloc = Bool(true))
// Start transaction by accepting inner release
innerRelease(block_vol_ignt = pending_orel || vol_ognt_counter.pending)
// A release beat can be accepted if we are idle, if it's a mergeable transaction, or if it's a tail beat
io.inner.release.ready := state === s_idle || irel_can_merge || irel_same_xact
when(io.inner.release.fire()) { data_buffer(io.irel().addr_beat) := io.irel().data }
// Dispatch outer release
outerRelease(
coh = outer_coh.onHit(M_XWR),
data = data_buffer(vol_ognt_counter.up.idx),
add_pending_send_bit = irel_is_allocating)
quiesce() {}
}
class BufferedBroadcastAcquireTracker(trackerId: Int)(implicit p: Parameters)
extends BroadcastAcquireTracker(trackerId)(p)
with HasByteWriteMaskBuffer {
// Setup IOs used for routing in the parent
routeInParent(iacqCanAlloc = Bool(true))
// First, take care of accepting new acquires or secondary misses
// Handling of primary and secondary misses' data and write mask merging
innerAcquire(
can_alloc = Bool(false),
next = s_inner_probe)
io.inner.acquire.ready := state === s_idle || iacq_can_merge || iacq_same_xact_multibeat
// Track which clients still need to be probed and make the Probe message
// If a writeback occurs, we can forward its data via the buffer,
// and skip having to go outwards
val skip_outer_acquire = pending_ignt_data.andR
innerProbe(
inner_coh.makeProbe(curr_probe_dst, xact_iacq, xact_addr_block),
Mux(!skip_outer_acquire, s_outer_acquire, s_busy))
// Handle incoming releases from clients, which may reduce sharer counts
// and/or write back dirty data, and may be unexpected voluntary releases
def irel_can_merge = io.irel().conflicts(xact_addr_block) &&
io.irel().isVoluntary() &&
!state.isOneOf(s_idle, s_meta_write) &&
!all_pending_done &&
!io.outer.grant.fire() &&
!io.inner.grant.fire() &&
!vol_ignt_counter.pending &&
!blockInnerRelease()
innerRelease(block_vol_ignt = vol_ognt_counter.pending)
//TODO: accept vol irels when state === s_idle, operate like the VolRelTracker
io.inner.release.ready := irel_can_merge || irel_same_xact
mergeDataInner(io.inner.release)
// If there was a writeback, forward it outwards
outerRelease(
coh = outer_coh.onHit(M_XWR),
data = data_buffer(vol_ognt_counter.up.idx))
// Send outer request for miss
outerAcquire(
caching = !xact_iacq.isBuiltInType(),
coh = outer_coh,
data = data_buffer(ognt_counter.up.idx),
wmask = wmask_buffer(ognt_counter.up.idx),
next = s_busy)
// Handle the response from outer memory
mergeDataOuter(io.outer.grant)
// Acknowledge or respond with data
innerGrant(
data = data_buffer(ignt_data_idx),
external_pending = pending_orel || ognt_counter.pending || vol_ognt_counter.pending)
when(iacq_is_allocating) {
initializeProbes()
}
initDataInner(io.inner.acquire, iacq_is_allocating || iacq_is_merging)
// Wait for everything to quiesce
quiesce() { clearWmaskBuffer() }
}

View File

@ -0,0 +1,162 @@
// See LICENSE for license details.
package uncore.agents
import Chisel._
import uncore.coherence._
import uncore.tilelink._
import uncore.constants._
import cde.Parameters
class BufferlessBroadcastHub(implicit p: Parameters) extends HierarchicalCoherenceAgent()(p) {
// Create TSHRs for outstanding transactions
val irelTrackerList =
(0 until nReleaseTransactors).map(id =>
Module(new BufferlessBroadcastVoluntaryReleaseTracker(id)))
val iacqTrackerList =
(nReleaseTransactors until nTransactors).map(id =>
Module(new BufferlessBroadcastAcquireTracker(id)))
val trackerList = irelTrackerList ++ iacqTrackerList
// Propagate incoherence flags
trackerList.map(_.io.incoherent) foreach { _ := io.incoherent }
// Create an arbiter for the one memory port
val outerList = trackerList.map(_.io.outer)
val outer_arb = Module(new ClientTileLinkIOArbiter(outerList.size)
(p.alterPartial({ case TLId => p(OuterTLId) })))
outer_arb.io.in <> outerList
io.outer <> outer_arb.io.out
val iacq = Queue(io.inner.acquire, 1, pipe=true)
val irel = Queue(io.inner.release, 1, pipe=true)
// Handle acquire transaction initiation
val irel_vs_iacq_conflict =
iacq.valid &&
irel.valid &&
irel.bits.conflicts(iacq.bits)
doInputRoutingWithAllocation(
in = iacq,
outs = trackerList.map(_.io.inner.acquire),
allocs = trackerList.map(_.io.alloc.iacq),
allocOverride = Some(!irel_vs_iacq_conflict))
io.outer.acquire.bits.data := iacq.bits.data
when (io.oacq().hasData()) {
io.outer.acquire.bits.addr_beat := iacq.bits.addr_beat
}
// Handle releases, which might be voluntary and might have data
doInputRoutingWithAllocation(
in = irel,
outs = trackerList.map(_.io.inner.release),
allocs = trackerList.map(_.io.alloc.irel))
io.outer.release.bits.data := irel.bits.data
when (io.orel().hasData()) {
io.outer.release.bits.addr_beat := irel.bits.addr_beat
}
// Wire probe requests and grant reply to clients, finish acks from clients
doOutputArbitration(io.inner.probe, trackerList.map(_.io.inner.probe))
doOutputArbitration(io.inner.grant, trackerList.map(_.io.inner.grant))
io.inner.grant.bits.data := io.outer.grant.bits.data
io.inner.grant.bits.addr_beat := io.outer.grant.bits.addr_beat
doInputRouting(io.inner.finish, trackerList.map(_.io.inner.finish))
disconnectOuterProbeAndFinish()
}
class BufferlessBroadcastVoluntaryReleaseTracker(trackerId: Int)(implicit p: Parameters)
extends BroadcastVoluntaryReleaseTracker(trackerId)(p) {
// Tell the parent if any incoming messages conflict with the ongoing transaction
routeInParent(irelCanAlloc = Bool(true))
// Start transaction by accepting inner release
innerRelease(block_vol_ignt = pending_orel || vol_ognt_counter.pending)
// A release beat can be accepted if we are idle, if it's a mergeable transaction, or if it's a tail beat,
// and if the outer release path is clear
io.inner.release.ready := Mux(io.irel().hasData(),
(state =/= s_idle) && (irel_can_merge || irel_same_xact) && io.outer.release.ready,
(state === s_idle) || irel_can_merge || irel_same_xact)
// Dispatch outer release
outerRelease(coh = outer_coh.onHit(M_XWR), buffering = Bool(false))
quiesce() {}
}
class BufferlessBroadcastAcquireTracker(trackerId: Int)(implicit p: Parameters)
extends BroadcastAcquireTracker(trackerId)(p) {
// Setup IOs used for routing in the parent
routeInParent(iacqCanAlloc = Bool(true))
// First, take care of accepting new acquires or secondary misses
// Handling of primary and secondary misses' data and write mask merging
innerAcquire(
can_alloc = Bool(false),
next = s_inner_probe)
// We are never going to merge anything in the bufferless hub
// Therefore, we only need to concern ourselves with the allocated
// transaction and (in case of PutBlock) subsequent tail beats
val iacq_can_forward = iacq_same_xact && !vol_ognt_counter.pending
io.inner.acquire.ready := Mux(io.iacq().hasData(),
state === s_outer_acquire && iacq_can_forward && io.outer.acquire.ready,
state === s_idle && io.alloc.iacq.should)
// Track which clients still need to be probed and make the Probe message
innerProbe(
inner_coh.makeProbe(curr_probe_dst, xact_iacq, xact_addr_block),
s_outer_acquire)
// Handle incoming releases from clients, which may reduce sharer counts
// and/or write back dirty data, and may be unexpected voluntary releases
def irel_can_merge = io.irel().conflicts(xact_addr_block) &&
io.irel().isVoluntary() &&
!vol_ignt_counter.pending &&
!(io.irel().hasData() && ognt_counter.pending) &&
(state =/= s_idle)
innerRelease(block_vol_ignt = vol_ognt_counter.pending)
val irel_could_accept = irel_can_merge || irel_same_xact
io.inner.release.ready := irel_could_accept &&
(!io.irel().hasData() || io.outer.release.ready)
// If there was a writeback, forward it outwards
outerRelease(
coh = outer_coh.onHit(M_XWR),
buffering = Bool(false),
block_orel = !irel_could_accept)
// Send outer request for miss
outerAcquire(
caching = !xact_iacq.isBuiltInType(),
block_outer_acquire = vol_ognt_counter.pending,
buffering = Bool(false),
coh = outer_coh,
next = s_busy)
// Handle the response from outer memory
when (ognt_counter.pending && io.ognt().hasData()) {
io.outer.grant.ready := io.inner.grant.ready // bypass data
}
// Acknowledge or respond with data
innerGrant(
external_pending = pending_orel || vol_ognt_counter.pending,
buffering = Bool(false))
when(iacq_is_allocating) { initializeProbes() }
// Wait for everything to quiesce
quiesce() {}
}
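// A plain-Scala sketch of the flow-through handshake the bufferless trackers
// implement above (illustrative only, not part of the hardware): with no data
// buffer, an inner beat carrying data may only be accepted in the same cycle
// the outer port can take it, so the inner ready is ANDed with the outer ready.
object FlowThroughHandshakeSketch extends App {
  def innerReady(trackerWilling: Boolean, hasData: Boolean, outerReady: Boolean): Boolean =
    trackerWilling && (!hasData || outerReady)
  assert(innerReady(trackerWilling = true, hasData = false, outerReady = false)) // no data: accept freely
  assert(!innerReady(trackerWilling = true, hasData = true, outerReady = false)) // data but outer stalled: stall
  assert(innerReady(trackerWilling = true, hasData = true, outerReady = true))   // data and outer ready: forward
}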

File diff suppressed because it is too large

View File

@ -0,0 +1,146 @@
// See LICENSE for license details.
package uncore.agents
import Chisel._
import uncore.util._
abstract class Decoding
{
def uncorrected: UInt
def corrected: UInt
def correctable: Bool
def uncorrectable: Bool
def error = correctable || uncorrectable
}
abstract class Code
{
def width(w0: Int): Int
def encode(x: UInt): UInt
def decode(x: UInt): Decoding
}
class IdentityCode extends Code
{
def width(w0: Int) = w0
def encode(x: UInt) = x
def decode(y: UInt) = new Decoding {
def uncorrected = y
def corrected = y
def correctable = Bool(false)
def uncorrectable = Bool(false)
}
}
class ParityCode extends Code
{
def width(w0: Int) = w0+1
def encode(x: UInt) = Cat(x.xorR, x)
def decode(y: UInt) = new Decoding {
def uncorrected = y(y.getWidth-2,0)
def corrected = uncorrected
def correctable = Bool(false)
def uncorrectable = y.xorR
}
}
class SECCode extends Code
{
def width(k: Int) = {
val m = log2Floor(k) + 1
k + m + (if((1 << m) < m+k+1) 1 else 0)
}
def encode(x: UInt) = {
val k = x.getWidth
require(k > 0)
val n = width(k)
val y = for (i <- 1 to n) yield {
if (isPow2(i)) {
val r = for (j <- 1 to n; if j != i && (j & i) != 0)
yield x(mapping(j))
r reduce (_^_)
} else
x(mapping(i))
}
y.asUInt
}
def decode(y: UInt) = new Decoding {
val n = y.getWidth
require(n > 0 && !isPow2(n))
val p2 = for (i <- 0 until log2Up(n)) yield 1 << i
val syndrome = (p2 map { i =>
val r = for (j <- 1 to n; if (j & i) != 0)
yield y(j-1)
r reduce (_^_)
}).asUInt
private def swizzle(z: UInt) = (1 to n).filter(i => !isPow2(i)).map(i => z(i-1)).asUInt
def uncorrected = swizzle(y)
def corrected = swizzle(((y << 1) ^ UIntToOH(syndrome)) >> 1)
def correctable = syndrome.orR
def uncorrectable = Bool(false)
}
private def mapping(i: Int) = i-1-log2Up(i)
}
class SECDEDCode extends Code
{
private val sec = new SECCode
private val par = new ParityCode
def width(k: Int) = sec.width(k)+1
def encode(x: UInt) = par.encode(sec.encode(x))
def decode(x: UInt) = new Decoding {
val secdec = sec.decode(x(x.getWidth-2,0))
val pardec = par.decode(x)
def uncorrected = secdec.uncorrected
def corrected = secdec.corrected
def correctable = pardec.uncorrectable
def uncorrectable = !pardec.uncorrectable && secdec.correctable
}
}
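// A plain-Scala sketch of the width arithmetic implemented by SECCode and
// SECDEDCode above (illustrative only): m = floor(log2(k)) + 1 check bits,
// plus one more if 2^m cannot cover all m + k + 1 syndromes, plus one overall
// parity bit for SECDED.
object CodeWidthSketch extends App {
  def log2Floor(x: Int): Int = 31 - Integer.numberOfLeadingZeros(x)
  def secWidth(k: Int): Int = {
    val m = log2Floor(k) + 1
    k + m + (if ((1 << m) < m + k + 1) 1 else 0)
  }
  def secdedWidth(k: Int): Int = secWidth(k) + 1
  assert(secWidth(4) == 7)      // the classic Hamming(7,4) code
  assert(secdedWidth(4) == 8)   // extended Hamming(8,4)
  assert(secdedWidth(64) == 72) // the usual 64-bit-data, 72-bit-word ECC geometry
}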
object ErrGen
{
// generate a 1-bit error with approximate probability 2^-f
def apply(width: Int, f: Int): UInt = {
require(width > 0 && f >= 0 && log2Up(width) + f <= 16)
UIntToOH(LFSR16()(log2Up(width)+f-1,0))(width-1,0)
}
def apply(x: UInt, f: Int): UInt = x ^ apply(x.getWidth, f)
}
class SECDEDTest extends Module
{
val code = new SECDEDCode
val k = 4
val n = code.width(k)
val io = new Bundle {
val original = Bits(OUTPUT, k)
val encoded = Bits(OUTPUT, n)
val injected = Bits(OUTPUT, n)
val uncorrected = Bits(OUTPUT, k)
val corrected = Bits(OUTPUT, k)
val correctable = Bool(OUTPUT)
val uncorrectable = Bool(OUTPUT)
}
val c = Counter(Bool(true), 1 << k)
val numErrors = Counter(c._2, 3)._1
val e = code.encode(c._1)
val i = e ^ Mux(numErrors < UInt(1), UInt(0), ErrGen(n, 1)) ^ Mux(numErrors < UInt(2), UInt(0), ErrGen(n, 1))
val d = code.decode(i)
io.original := c._1
io.encoded := e
io.injected := i
io.uncorrected := d.uncorrected
io.corrected := d.corrected
io.correctable := d.correctable
io.uncorrectable := d.uncorrectable
}

View File

@ -0,0 +1,73 @@
package uncore.agents
import Chisel._
import uncore.tilelink._
import cde.Parameters
class MMIOTileLinkManagerData(implicit p: Parameters)
extends TLBundle()(p)
with HasClientId
with HasClientTransactionId
class MMIOTileLinkManager(implicit p: Parameters)
extends CoherenceAgentModule()(p) {
val io = new ManagerTLIO
// MMIO requests should never need probe or release
io.inner.probe.valid := Bool(false)
io.inner.release.ready := Bool(false)
val multibeat_fire = io.outer.acquire.fire() && io.oacq().hasMultibeatData()
val multibeat_start = multibeat_fire && io.oacq().addr_beat === UInt(0)
val multibeat_end = multibeat_fire && io.oacq().addr_beat === UInt(outerDataBeats - 1)
// Acquire and Grant are basically pass-through,
// except client_id and client_xact_id need to be converted.
// Associate the inner client_id and client_xact_id
// with the outer client_xact_id.
val xact_pending = Reg(init = UInt(0, maxManagerXacts))
val xact_id_sel = PriorityEncoder(~xact_pending)
val xact_id_reg = RegEnable(xact_id_sel, multibeat_start)
val xact_multibeat = Reg(init = Bool(false))
val outer_xact_id = Mux(xact_multibeat, xact_id_reg, xact_id_sel)
val xact_free = !xact_pending.andR
val xact_buffer = Reg(Vec(maxManagerXacts, new MMIOTileLinkManagerData))
io.inner.acquire.ready := io.outer.acquire.ready && xact_free
io.outer.acquire.valid := io.inner.acquire.valid && xact_free
io.outer.acquire.bits := io.inner.acquire.bits
io.outer.acquire.bits.client_xact_id := outer_xact_id
def isLastBeat[T <: TileLinkChannel with HasTileLinkBeatId](in: T): Bool =
!in.hasMultibeatData() || in.addr_beat === UInt(outerDataBeats - 1)
def addPendingBitOnAcq[T <: AcquireMetadata](in: DecoupledIO[T]): UInt =
Mux(in.fire() && isLastBeat(in.bits), UIntToOH(in.bits.client_xact_id), UInt(0))
def clearPendingBitOnGnt[T <: GrantMetadata](in: DecoupledIO[T]): UInt =
~Mux(in.fire() && isLastBeat(in.bits) && !in.bits.requiresAck(),
UIntToOH(in.bits.manager_xact_id), UInt(0))
def clearPendingBitOnFin(in: DecoupledIO[Finish]): UInt =
~Mux(in.fire(), UIntToOH(in.bits.manager_xact_id), UInt(0))
xact_pending := (xact_pending | addPendingBitOnAcq(io.outer.acquire)) &
clearPendingBitOnFin(io.inner.finish) &
clearPendingBitOnGnt(io.inner.grant)
when (io.outer.acquire.fire() && isLastBeat(io.outer.acquire.bits)) {
xact_buffer(outer_xact_id) := io.iacq()
}
when (multibeat_start) { xact_multibeat := Bool(true) }
when (multibeat_end) { xact_multibeat := Bool(false) }
val gnt_xact = xact_buffer(io.ognt().client_xact_id)
io.outer.grant.ready := io.inner.grant.ready
io.inner.grant.valid := io.outer.grant.valid
io.inner.grant.bits := io.outer.grant.bits
io.inner.grant.bits.client_id := gnt_xact.client_id
io.inner.grant.bits.client_xact_id := gnt_xact.client_xact_id
io.inner.grant.bits.manager_xact_id := io.ognt().client_xact_id
io.inner.finish.ready := Bool(true)
}
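// A plain-Scala model of the xact_pending bookkeeping above (illustrative
// only): outstanding transactions live in a bitmap, a free slot is picked by
// priority-encoding the complemented mask, and the bit is cleared when the
// matching Grant or Finish retires.
object XactBitmapSketch extends App {
  var pending = 0                                   // models xact_pending
  def priorityEncoder(x: Int): Int = Integer.numberOfTrailingZeros(x)
  def allocate(): Int = {
    val id = priorityEncoder(~pending)              // models xact_id_sel
    pending |= (1 << id)                            // models addPendingBitOnAcq
    id
  }
  def retire(id: Int): Unit = pending &= ~(1 << id) // models clearPendingBitOnGnt/Fin
  val a = allocate(); val b = allocate()
  assert(a == 0 && b == 1)
  retire(a)
  assert(allocate() == 0)                           // a freed slot is reused first
}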

View File

@ -0,0 +1,69 @@
// See LICENSE for license details.
package uncore.agents
import Chisel._
import uncore.coherence._
import uncore.tilelink._
import uncore.constants._
import uncore.devices._
import cde.{Parameters, Field, Config}
/** The ManagerToClientStateless Bridge does not maintain any state for the messages
* which pass through it. It simply passes the messages back and forth without any
* tracking or translation.
*
* This can reduce area and timing in very constrained situations:
* - The Manager and Client implement the same coherence protocol
* - There are no probe or finish messages.
* - The outer transaction ID is large enough to handle all possible inner
* transaction IDs, such that no remapping state must be maintained.
*
* This bridge DOES NOT keep the uncached channel coherent with the cached
* channel. Uncached requests to blocks cached by the L1 will not probe the L1.
* As a result, uncached reads to cached blocks will get stale data until
* the L1 performs a voluntary writeback, and uncached writes to cached blocks
* will get lost, as the voluntary writeback from the L1 will overwrite the
* changes. If your tile relies on probing the L1 data cache in order to
* share data between the instruction cache and data cache (e.g. you are using
* a non-blocking L1 D$) or if the tile has uncached channels capable of
* writes (e.g. Hwacha and other RoCC accelerators), DO NOT USE THIS BRIDGE.
*/
class ManagerToClientStatelessBridge(implicit p: Parameters) extends HierarchicalCoherenceAgent()(p) {
val icid = io.inner.tlClientIdBits
val ixid = io.inner.tlClientXactIdBits
val oxid = io.outer.tlClientXactIdBits
val innerCoh = io.inner.tlCoh.getClass
val outerCoh = io.outer.tlCoh.getClass
// Stateless Bridge is only usable in certain constrained situations.
// Sanity check its usage here.
require(io.inner.tlNCachingClients <= 1)
require(icid + ixid <= oxid)
require(innerCoh eq outerCoh,
s"Coherence policies do not match: inner is ${innerCoh.getSimpleName}, outer is ${outerCoh.getSimpleName}")
io.outer.acquire.valid := io.inner.acquire.valid
io.inner.acquire.ready := io.outer.acquire.ready
io.outer.acquire.bits := io.inner.acquire.bits
io.outer.acquire.bits.client_xact_id := Cat(io.inner.acquire.bits.client_id, io.inner.acquire.bits.client_xact_id)
io.outer.release.valid := io.inner.release.valid
io.inner.release.ready := io.outer.release.ready
io.outer.release.bits := io.inner.release.bits
io.outer.release.bits.client_xact_id := Cat(io.inner.release.bits.client_id, io.inner.release.bits.client_xact_id)
io.inner.grant.valid := io.outer.grant.valid
io.outer.grant.ready := io.inner.grant.ready
io.inner.grant.bits := io.outer.grant.bits
io.inner.grant.bits.client_xact_id := io.outer.grant.bits.client_xact_id(ixid-1, 0)
io.inner.grant.bits.client_id := io.outer.grant.bits.client_xact_id(icid+ixid-1, ixid)
io.inner.probe.valid := Bool(false)
io.inner.finish.ready := Bool(true)
disconnectOuterProbeAndFinish()
}
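// A plain-Scala sketch of the client_xact_id packing performed by the bridge
// above (the widths here are arbitrary examples; the require in the bridge
// demands icid + ixid <= oxid): the inner client_id and client_xact_id are
// concatenated outbound and sliced apart again on the returning Grant.
object StatelessBridgeIdSketch extends App {
  val icid = 2; val ixid = 3
  def pack(clientId: Int, xactId: Int): Int = (clientId << ixid) | xactId
  def unpackXact(outer: Int): Int = outer & ((1 << ixid) - 1)
  def unpackClient(outer: Int): Int = (outer >> ixid) & ((1 << icid) - 1)
  val outerId = pack(clientId = 2, xactId = 5)
  assert(unpackClient(outerId) == 2 && unpackXact(outerId) == 5) // round-trips losslessly
}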

View File

@ -0,0 +1,119 @@
// See LICENSE for license details.
package uncore.agents
import Chisel._
import uncore.tilelink._
import cde.{Parameters, Field}
case object L2StoreDataQueueDepth extends Field[Int]
trait HasStoreDataQueueParameters extends HasCoherenceAgentParameters {
val sdqDepth = p(L2StoreDataQueueDepth)*innerDataBeats
val dqIdxBits = math.max(log2Up(nReleaseTransactors) + 1, log2Up(sdqDepth))
val nDataQueueLocations = 3 //Stores, VoluntaryWBs, Releases
}
class DataQueueLocation(implicit p: Parameters) extends CoherenceAgentBundle()(p)
with HasStoreDataQueueParameters {
val idx = UInt(width = dqIdxBits)
val loc = UInt(width = log2Ceil(nDataQueueLocations))
}
object DataQueueLocation {
def apply(idx: UInt, loc: UInt)(implicit p: Parameters) = {
val d = Wire(new DataQueueLocation)
d.idx := idx
d.loc := loc
d
}
}
trait HasStoreDataQueue extends HasStoreDataQueueParameters {
val io: HierarchicalTLIO
val trackerIOsList: Seq[HierarchicalXactTrackerIO]
val internalDataBits = new DataQueueLocation().getWidth
val inStoreQueue :: inVolWBQueue :: inClientReleaseQueue :: Nil = Enum(UInt(), nDataQueueLocations)
val usingStoreDataQueue = p.alterPartial({
case TLKey(`innerTLId`) => innerTLParams.copy(overrideDataBitsPerBeat = Some(internalDataBits))
case TLKey(`outerTLId`) => outerTLParams.copy(overrideDataBitsPerBeat = Some(internalDataBits))
})
// Queue to store impending Put data
lazy val sdq = Reg(Vec(sdqDepth, io.iacq().data))
lazy val sdq_val = Reg(init=Bits(0, sdqDepth))
lazy val sdq_alloc_id = PriorityEncoder(~sdq_val)
lazy val sdq_rdy = !sdq_val.andR
lazy val sdq_enq = trackerIOsList.map( t =>
(t.alloc.iacq.should || t.alloc.iacq.matches) &&
t.inner.acquire.fire() &&
t.iacq().hasData()
).reduce(_||_)
lazy val sdqLoc = List.fill(nTransactors) {
DataQueueLocation(sdq_alloc_id, inStoreQueue).asUInt
}
/*
doInputRoutingWithAllocation(
in = io.inner.acquire,
outs = trackerList.map(_.io.inner.acquire),
allocs = trackerList.map(_.io.alloc._iacq),
dataOverride = Some(sdqLoc),
allocOverride = Some(sdq_rdy && !irel_vs_iacq_conflict))
*/
// Queue to store impending Voluntary Release data
lazy val voluntary = io.irel().isVoluntary()
lazy val vwbdq_enq = io.inner.release.fire() && voluntary && io.irel().hasData()
lazy val (rel_data_cnt, rel_data_done) = Counter(vwbdq_enq, innerDataBeats) //TODO Zero width
lazy val vwbdq = Reg(Vec(innerDataBeats, io.irel().data)) //TODO Assumes nReleaseTransactors == 1
lazy val vwbqLoc = (0 until nTransactors).map(i =>
(DataQueueLocation(rel_data_cnt,
(if(i < nReleaseTransactors) inVolWBQueue
else inClientReleaseQueue)).asUInt))
/*
doInputRoutingWithAllocation(
io.inner.release,
trackerList.map(_.io.inner.release),
trackerList.map(_.io.matches.irel),
trackerList.map(_.io.alloc.irel),
Some(vwbqLoc))
*/
val outer_arb: ClientTileLinkIOArbiter
lazy val outer_data_ptr = new DataQueueLocation().fromBits(outer_arb.io.out.acquire.bits.data)
/*
val outer_arb = Module(new ClientTileLinkIOArbiter(trackerList.size)
(usingStoreDataQueue.alterPartial({ case TLId => p(OuterTLId) })))
outer_arb.io.in <> trackerList
*/
// Get the pending data out of the store data queue
lazy val is_in_sdq = outer_data_ptr.loc === inStoreQueue
lazy val free_sdq = io.outer.acquire.fire() &&
io.outer.acquire.bits.hasData() &&
outer_data_ptr.loc === inStoreQueue
/*
io.outer <> outer_arb.io.out
io.outer.acquire.bits.data := MuxLookup(outer_data_ptr.loc, io.irel().data, Array(
inStoreQueue -> sdq(outer_data_ptr.idx),
inVolWBQueue -> vwbdq(outer_data_ptr.idx)))
*/
// Enqueue SDQ data
def sdqEnqueue() {
when (sdq_enq) { sdq(sdq_alloc_id) := io.iacq().data }
when(vwbdq_enq) { vwbdq(rel_data_cnt) := io.irel().data }
}
// Update SDQ valid bits
def sdqUpdate() {
when (io.outer.acquire.valid || sdq_enq) {
sdq_val := sdq_val & ~(UIntToOH(outer_data_ptr.idx) & Fill(sdqDepth, free_sdq)) |
PriorityEncoderOH(~sdq_val(sdqDepth-1,0)) & Fill(sdqDepth, sdq_enq)
}
}
}
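// A plain-Scala sketch of the DataQueueLocation encoding used above
// (illustrative only; dqIdxBits is assumed to be 8 here, and the bit placement
// is arbitrary for the sketch): the data field of an outgoing message is
// overloaded to carry a queue index plus a tag naming which of the three
// queues actually holds the beat.
object DataQueueLocSketch extends App {
  val dqIdxBits = 8
  val (inStoreQueue, inVolWBQueue, inClientReleaseQueue) = (0, 1, 2)
  def pack(idx: Int, loc: Int): Int = (loc << dqIdxBits) | idx
  def loc(x: Int): Int = x >> dqIdxBits
  def idx(x: Int): Int = x & ((1 << dqIdxBits) - 1)
  val ptr = pack(idx = 5, loc = inVolWBQueue)
  assert(loc(ptr) == inVolWBQueue && idx(ptr) == 5)
}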

View File

@ -0,0 +1,654 @@
// See LICENSE for license details.
package uncore.agents
import Chisel._
import uncore.coherence._
import uncore.tilelink._
import uncore.util._
import junctions._
import cde.{Field, Parameters}
import scala.math.max
case object EnableL2Logging extends Field[Boolean]
class TrackerAllocation extends Bundle {
val matches = Bool(OUTPUT)
val can = Bool(OUTPUT)
val should = Bool(INPUT)
}
class TrackerAllocationIO(implicit val p: Parameters)
extends ParameterizedBundle()(p)
with HasCacheBlockAddress {
val iacq = new TrackerAllocation
val irel = new TrackerAllocation
val oprb = new TrackerAllocation
val idle = Bool(OUTPUT)
override val addr_block = UInt(OUTPUT, tlBlockAddrBits)
}
trait HasTrackerAllocationIO extends Bundle {
implicit val p: Parameters
val alloc = new TrackerAllocationIO
}
class ManagerXactTrackerIO(implicit p: Parameters) extends ManagerTLIO()(p)
with HasTrackerAllocationIO
class HierarchicalXactTrackerIO(implicit p: Parameters) extends HierarchicalTLIO()(p)
with HasTrackerAllocationIO
abstract class XactTracker(implicit p: Parameters) extends CoherenceAgentModule()(p)
with HasXactTrackerStates
with HasPendingBitHelpers {
override val s_idle :: s_meta_read :: s_meta_resp :: s_wb_req :: s_wb_resp :: s_inner_probe :: s_outer_acquire :: s_busy :: s_meta_write :: Nil = Enum(UInt(), 9)
val state = Reg(init=s_idle)
def quiesce(next: UInt = s_idle)(restore: => Unit) {
all_pending_done := !scoreboard.foldLeft(Bool(false))(_||_)
when(state === s_busy && all_pending_done) {
state := next
restore
}
}
def pinAllReadyValidLow[T <: Data](b: Bundle) {
b.elements.foreach {
_._2 match {
case d: DecoupledIO[_] =>
if(d.ready.dir == OUTPUT) d.ready := Bool(false)
else if(d.valid.dir == OUTPUT) d.valid := Bool(false)
case v: ValidIO[_] => if(v.valid.dir == OUTPUT) v.valid := Bool(false)
case b: Bundle => pinAllReadyValidLow(b)
case _ =>
}
}
}
}
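// A plain-Scala model of the scoreboard/quiesce idiom defined above
// (illustrative only): each trait appends its outstanding-work conditions to
// the scoreboard, and the tracker may leave s_busy only once every entry has
// drained.
object ScoreboardSketch extends App {
  val scoreboard = scala.collection.mutable.ListBuffer.empty[Boolean]
  scoreboard += true    // e.g. pending_ignt still outstanding
  scoreboard += false   // e.g. ognt_counter.pending already drained
  def allPendingDone: Boolean = !scoreboard.foldLeft(false)(_ || _)
  assert(!allPendingDone)
  scoreboard(0) = false // the last piece of work completes
  assert(allPendingDone)
}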
trait HasXactTrackerStates {
def state: UInt
def s_idle: UInt = UInt(0)
def s_meta_read: UInt
def s_meta_resp: UInt
def s_wb_req: UInt
def s_wb_resp: UInt
def s_inner_probe: UInt
def s_outer_acquire: UInt
def s_busy: UInt
def s_meta_write: UInt
}
trait HasPendingBitHelpers extends HasDataBeatCounters {
val scoreboard = scala.collection.mutable.ListBuffer.empty[Bool]
val all_pending_done = Wire(Bool())
def addPendingBitWhenBeat[T <: HasBeat](inc: Bool, in: T): UInt =
Fill(in.tlDataBeats, inc) & UIntToOH(in.addr_beat)
def dropPendingBitWhenBeat[T <: HasBeat](dec: Bool, in: T): UInt =
~Fill(in.tlDataBeats, dec) | ~UIntToOH(in.addr_beat)
def addPendingBitWhenId[T <: HasClientId](inc: Bool, in: T): UInt =
Fill(in.tlNCachingClients, inc) & UIntToOH(in.client_id)
def dropPendingBitWhenId[T <: HasClientId](dec: Bool, in: T): UInt =
~Fill(in.tlNCachingClients, dec) | ~UIntToOH(in.client_id)
def addPendingBitWhenBeatHasData[T <: HasBeat](in: DecoupledIO[T], inc: Bool = Bool(true)): UInt =
addPendingBitWhenBeat(in.fire() && in.bits.hasData() && inc, in.bits)
def addPendingBitWhenBeatHasDataAndAllocs(
in: DecoupledIO[AcquireFromSrc],
alloc_override: Bool = Bool(false)): UInt =
addPendingBitWhenBeatHasData(in, in.bits.allocate() || alloc_override)
def addPendingBitWhenBeatNeedsRead(in: DecoupledIO[AcquireFromSrc],
always: Bool = Bool(true), unless: Bool = Bool(false)): UInt = {
val a = in.bits
val needs_read = !unless && (a.isGet() || a.isAtomic() || a.hasPartialWritemask()) || always
addPendingBitWhenBeat(in.fire() && needs_read, a)
}
def addPendingBitWhenBeatHasPartialWritemask(in: DecoupledIO[AcquireFromSrc]): UInt =
addPendingBitWhenBeat(in.fire() && in.bits.hasPartialWritemask(), in.bits)
def addPendingBitsFromAcquire(a: SecondaryMissInfo): UInt =
Mux(a.hasMultibeatData(), Fill(a.tlDataBeats, UInt(1, 1)), UIntToOH(a.addr_beat))
def dropPendingBitWhenBeatHasData[T <: HasBeat](in: DecoupledIO[T]): UInt =
dropPendingBitWhenBeat(in.fire() && in.bits.hasData(), in.bits)
def dropPendingBitAtDest[T <: HasId](in: DecoupledIO[T]): UInt =
dropPendingBitWhenId(in.fire(), in.bits)
def dropPendingBitAtDestWhenVoluntary[T <: HasId with MightBeVoluntary](in: DecoupledIO[T]): UInt =
dropPendingBitWhenId(in.fire() && in.bits.isVoluntary(), in.bits)
def addPendingBitAtSrc[T <: HasId](in: DecoupledIO[T]): UInt =
addPendingBitWhenId(in.fire(), in.bits)
def addPendingBitAtSrcWhenVoluntary[T <: HasId with MightBeVoluntary](in: DecoupledIO[T]): UInt =
addPendingBitWhenId(in.fire() && in.bits.isVoluntary(), in.bits)
def addOtherBits(en: Bool, nBits: Int): UInt =
Mux(en, Cat(Fill(nBits - 1, UInt(1, 1)), UInt(0, 1)), UInt(0, nBits))
def addPendingBitsOnFirstBeat(in: DecoupledIO[Acquire]): UInt =
addOtherBits(in.fire() &&
in.bits.hasMultibeatData() &&
in.bits.addr_beat === UInt(0),
in.bits.tlDataBeats)
def dropPendingBitsOnFirstBeat(in: DecoupledIO[Acquire]): UInt =
~addPendingBitsOnFirstBeat(in)
}
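// A plain-Scala model of the add/drop pending-bit idiom above (illustrative
// only): each helper yields a full-width mask, and a register update ORs the
// adds and ANDs the drops in a single assignment, exactly as the trackers do.
object PendingBitSketch extends App {
  var pending = 0
  def addWhenBeat(fire: Boolean, beat: Int): Int  = if (fire) 1 << beat else 0
  def dropWhenBeat(fire: Boolean, beat: Int): Int = if (fire) ~(1 << beat) else -1
  // A data beat arrives on beat 2 while beat 0 is retired in the same cycle:
  pending = (pending | addWhenBeat(fire = true, beat = 2)) &
            dropWhenBeat(fire = true, beat = 0)
  assert(pending == (1 << 2))
}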
trait HasDataBuffer extends HasCoherenceAgentParameters {
val data_buffer = Reg(init=Vec.fill(innerDataBeats)(UInt(0, width = innerDataBits)))
type TLDataBundle = TLBundle with HasTileLinkData with HasTileLinkBeatId
def initDataInner[T <: Acquire](in: DecoupledIO[T], alloc: Bool) {
when(in.fire() && in.bits.hasData() && alloc) {
data_buffer(in.bits.addr_beat) := in.bits.data
}
}
// TODO: provide func for accessing when innerDataBeats =/= outerDataBeats or internalDataBeats
def mergeData(dataBits: Int)(beat: UInt, incoming: UInt) {
data_buffer(beat) := incoming
}
def mergeDataInner[T <: TLDataBundle](in: DecoupledIO[T]) {
when(in.fire() && in.bits.hasData()) {
mergeData(innerDataBits)(in.bits.addr_beat, in.bits.data)
}
}
def mergeDataOuter[T <: TLDataBundle](in: DecoupledIO[T]) {
when(in.fire() && in.bits.hasData()) {
mergeData(outerDataBits)(in.bits.addr_beat, in.bits.data)
}
}
}
trait HasByteWriteMaskBuffer extends HasDataBuffer {
val wmask_buffer = Reg(init=Vec.fill(innerDataBeats)(UInt(0, width = innerWriteMaskBits)))
val data_valid = Vec(wmask_buffer.map(wmask => wmask.andR))
override def initDataInner[T <: Acquire](in: DecoupledIO[T], alloc: Bool) {
when(in.fire() && in.bits.hasData() && alloc) {
val beat = in.bits.addr_beat
val full = FillInterleaved(8, in.bits.wmask())
data_buffer(beat) := (~full & data_buffer(beat)) | (full & in.bits.data)
wmask_buffer(beat) := in.bits.wmask() | wmask_buffer(beat) // assumes wmask_buffer is zeroed
}
}
override def mergeData(dataBits: Int)(beat: UInt, incoming: UInt) {
val old_data = incoming // Refilled, written back, or de-cached data
val new_data = data_buffer(beat) // Newly Put data is already in the buffer
val wmask = FillInterleaved(8, wmask_buffer(beat))
data_buffer(beat) := (~wmask & old_data) | (wmask & new_data)
wmask_buffer(beat) := ~UInt(0, innerWriteMaskBits)
}
def clearWmaskBuffer() {
wmask_buffer.foreach { w => w := UInt(0) }
}
}
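// A plain-Scala sketch of the byte-granular merge performed by mergeData above
// (illustrative only, 4-byte beats): buffered Put bytes win wherever the write
// mask is set, and refill data fills in the remaining bytes.
object WmaskMergeSketch extends App {
  def fillInterleaved8(wmask: Int, nBytes: Int): Long =
    (0 until nBytes).map(i => if ((wmask & (1 << i)) != 0) 0xFFL << (8 * i) else 0L).fold(0L)(_ | _)
  val old_data = 0x11223344L // refilled, written back, or de-cached data
  val new_data = 0xAABBCCDDL // Put data already sitting in the buffer
  val wmask    = 0x5         // bytes 0 and 2 were written by the Put
  val full     = fillInterleaved8(wmask, nBytes = 4)
  val merged   = (~full & old_data) | (full & new_data)
  assert(merged == 0x11BB33DDL)
}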
trait HasBlockAddressBuffer extends HasCoherenceAgentParameters {
val xact_addr_block = Reg(init = UInt(0, width = blockAddrBits))
}
trait HasAcquireMetadataBuffer extends HasBlockAddressBuffer {
val xact_allocate = Reg{ Bool() }
val xact_amo_shift_bytes = Reg{ UInt() }
val xact_op_code = Reg{ UInt() }
val xact_addr_byte = Reg{ UInt() }
val xact_op_size = Reg{ UInt() }
val xact_addr_beat = Wire(UInt())
val xact_iacq = Wire(new SecondaryMissInfo)
}
trait HasVoluntaryReleaseMetadataBuffer extends HasBlockAddressBuffer
with HasPendingBitHelpers
with HasXactTrackerStates {
def io: HierarchicalXactTrackerIO
val xact_vol_ir_r_type = Reg{ UInt() }
val xact_vol_ir_src = Reg{ UInt() }
val xact_vol_ir_client_xact_id = Reg{ UInt() }
def xact_vol_irel = Release(
src = xact_vol_ir_src,
voluntary = Bool(true),
r_type = xact_vol_ir_r_type,
client_xact_id = xact_vol_ir_client_xact_id,
addr_block = xact_addr_block)
(p.alterPartial({ case TLId => p(InnerTLId) }))
}
trait AcceptsVoluntaryReleases extends HasVoluntaryReleaseMetadataBuffer {
def inner_coh: ManagerMetadata
val pending_irel_data = Reg(init=Bits(0, width = innerDataBeats))
val vol_ignt_counter = Wire(new TwoWayBeatCounterStatus)
def irel_can_merge: Bool
def irel_same_xact: Bool
def irel_is_allocating: Bool = state === s_idle && io.alloc.irel.should && io.inner.release.valid
def irel_is_merging: Bool = (irel_can_merge || irel_same_xact) && io.inner.release.valid
def innerRelease(block_vol_ignt: Bool = Bool(false), next: UInt = s_busy) {
connectTwoWayBeatCounters(
status = vol_ignt_counter,
up = io.inner.release,
down = io.inner.grant,
trackUp = (r: Release) => {
Mux(state === s_idle, io.alloc.irel.should, io.alloc.irel.matches) && r.isVoluntary() && r.requiresAck()
},
trackDown = (g: Grant) => (state =/= s_idle) && g.isVoluntary())
when(irel_is_allocating) {
xact_addr_block := io.irel().addr_block
// Set all of them to pending in the beginning as a precaution
// If it turns out we don't need some or all of the beats, they will
// be overridden below
pending_irel_data := ~UInt(0, innerDataBeats)
state := next
}
val irel_fire = (irel_is_allocating || irel_is_merging) && io.inner.release.ready
when (irel_fire) {
when (io.irel().first()) {
when (io.irel().isVoluntary()) {
xact_vol_ir_r_type := io.irel().r_type
xact_vol_ir_src := io.irel().client_id
xact_vol_ir_client_xact_id := io.irel().client_xact_id
}
// If this release has data, set all the pending bits except the first.
// Otherwise, clear all the pending bits
pending_irel_data := Mux(io.irel().hasMultibeatData(),
dropPendingBitWhenBeatHasData(io.inner.release),
UInt(0))
} .otherwise {
pending_irel_data := (pending_irel_data & dropPendingBitWhenBeatHasData(io.inner.release))
}
if (p(EnableL2Logging)) {
when (io.irel().hasData()) {
printf("[release] addr_block=%x addr_beat=%d data=%x\n",
io.irel().addr_block, io.irel().addr_beat, io.irel().data)
}
}
}
io.inner.grant.valid := state.isOneOf(s_wb_req, s_wb_resp, s_inner_probe, s_busy) &&
vol_ignt_counter.pending &&
!(pending_irel_data.orR || block_vol_ignt)
io.inner.grant.bits := inner_coh.makeGrant(xact_vol_irel)
scoreboard += (pending_irel_data.orR, vol_ignt_counter.pending)
}
}
trait EmitsVoluntaryReleases extends HasVoluntaryReleaseMetadataBuffer {
val pending_orel_send = Reg(init=Bool(false))
val pending_orel_data = Reg(init=Bits(0, width = innerDataBeats))
val vol_ognt_counter = Wire(new TwoWayBeatCounterStatus)
val pending_orel = pending_orel_send || pending_orel_data.orR || vol_ognt_counter.pending
val sending_orel = Reg(init = Bool(false))
// Block acceptance of inner releases if we have already started sending
// outer releases, but have not yet sent out the beat corresponding to the
// inner release. This function must be included in io.inner.release.ready
// if it is possible to start accepting a new inner release as the previous
// outer release is still being sent. DO NOT include this in the
// io.inner.release.ready if the releases are not buffered
// (i.e. io.inner.release and io.outer.release are combinationally linked).
def blockInnerRelease(rel: ReleaseMetadata = io.irel()): Bool = {
val waiting_to_send = sending_orel && pending_orel_data(rel.addr_beat)
val sending_now = io.outer.release.fire() && rel.addr_beat === io.orel().addr_beat
rel.hasData() && (waiting_to_send || sending_now)
}
def outerRelease(
coh: ClientMetadata,
buffering: Bool = Bool(true),
data: UInt = io.irel().data,
add_pending_data_bits: UInt = UInt(0),
add_pending_send_bit: Bool = Bool(false),
block_orel: Bool = Bool(false)) {
when (state =/= s_idle || io.alloc.irel.should) {
pending_orel_data := (pending_orel_data |
addPendingBitWhenBeatHasData(io.inner.release) |
add_pending_data_bits) &
dropPendingBitWhenBeatHasData(io.outer.release)
}
when (add_pending_send_bit) { pending_orel_send := Bool(true) }
when (io.outer.release.fire()) {
when (io.outer.release.bits.first()) { sending_orel := Bool(true) }
when (io.outer.release.bits.last()) { sending_orel := Bool(false) }
pending_orel_send := Bool(false)
}
connectTwoWayBeatCounters(
status = vol_ognt_counter,
up = io.outer.release,
down = io.outer.grant,
trackUp = (r: Release) => r.isVoluntary() && r.requiresAck(),
trackDown = (g: Grant) => g.isVoluntary())
io.outer.release.valid := !block_orel && Mux(buffering,
(state === s_busy) && Mux(io.orel().hasData(),
pending_orel_data(vol_ognt_counter.up.idx),
pending_orel_send),
// only writebacks need to be forwarded to the outer interface
state =/= s_idle && io.alloc.irel.matches &&
io.irel().hasData() && io.inner.release.valid)
io.outer.release.bits := coh.makeVoluntaryWriteback(
client_xact_id = UInt(0), // TODO was tracker id, but not needed?
addr_block = xact_addr_block,
addr_beat = vol_ognt_counter.up.idx,
data = data)
when (vol_ognt_counter.pending) { io.outer.grant.ready := Bool(true) }
scoreboard += (pending_orel, vol_ognt_counter.pending)
}
}
trait EmitsInnerProbes extends HasBlockAddressBuffer
with HasXactTrackerStates
with HasPendingBitHelpers {
def io: HierarchicalXactTrackerIO
val needs_probes = (innerNCachingClients > 0)
val pending_iprbs = Reg(UInt(width = max(innerNCachingClients, 1)))
val curr_probe_dst = PriorityEncoder(pending_iprbs)
def full_representation: UInt
def initializeProbes() {
if (needs_probes)
pending_iprbs := full_representation & ~io.incoherent.asUInt
else
pending_iprbs := UInt(0)
}
def irel_same_xact = io.irel().conflicts(xact_addr_block) &&
!io.irel().isVoluntary() &&
state === s_inner_probe
def innerProbe(prb: Probe, next: UInt) {
if (needs_probes) {
val irel_counter = Wire(new TwoWayBeatCounterStatus)
pending_iprbs := pending_iprbs & dropPendingBitAtDest(io.inner.probe)
io.inner.probe.valid := state === s_inner_probe && pending_iprbs.orR
io.inner.probe.bits := prb
connectTwoWayBeatCounters(
status = irel_counter,
up = io.inner.probe,
down = io.inner.release,
max = innerNCachingClients,
trackDown = (r: Release) => (state =/= s_idle) && !r.isVoluntary())
when(state === s_inner_probe && !(pending_iprbs.orR || irel_counter.pending)) {
state := next
}
} else {
when (state === s_inner_probe) { state := next }
}
// N.B. no pending bits are added to the scoreboard because they are all handled in s_inner_probe
}
}
trait RoutesInParent extends HasBlockAddressBuffer
with HasXactTrackerStates {
def io: HierarchicalXactTrackerIO
type AddrComparison = HasCacheBlockAddress => Bool
def exactAddrMatch(a: HasCacheBlockAddress): Bool = a.conflicts(xact_addr_block)
def routeInParent(iacqMatches: AddrComparison = exactAddrMatch,
irelMatches: AddrComparison = exactAddrMatch,
oprbMatches: AddrComparison = exactAddrMatch,
iacqCanAlloc: Bool = Bool(false),
irelCanAlloc: Bool = Bool(false),
oprbCanAlloc: Bool = Bool(false)) {
io.alloc.iacq.matches := (state =/= s_idle) && iacqMatches(io.iacq())
io.alloc.irel.matches := (state =/= s_idle) && irelMatches(io.irel())
io.alloc.oprb.matches := (state =/= s_idle) && oprbMatches(io.oprb())
io.alloc.iacq.can := state === s_idle && iacqCanAlloc
io.alloc.irel.can := state === s_idle && irelCanAlloc
io.alloc.oprb.can := state === s_idle && oprbCanAlloc
io.alloc.addr_block := xact_addr_block
io.alloc.idle := state === s_idle
}
}
trait AcceptsInnerAcquires extends HasAcquireMetadataBuffer
with AcceptsVoluntaryReleases
with HasXactTrackerStates
with HasPendingBitHelpers {
def io: HierarchicalXactTrackerIO
def nSecondaryMisses: Int
def alwaysWriteFullBeat: Boolean
def inner_coh: ManagerMetadata
def trackerId: Int
// Secondary miss queue holds transaction metadata used to make grants
lazy val ignt_q = Module(new Queue(
new SecondaryMissInfo()(p.alterPartial({ case TLId => p(InnerTLId) })),
1 + nSecondaryMisses))
val pending_ignt = Wire(Bool())
val ignt_data_idx = Wire(UInt())
val ignt_data_done = Wire(Bool())
val ifin_counter = Wire(new TwoWayBeatCounterStatus)
val pending_put_data = Reg(init=Bits(0, width = innerDataBeats))
val pending_ignt_data = Reg(init=Bits(0, width = innerDataBeats))
def iacq_same_xact: Bool =
(xact_iacq.client_xact_id === io.iacq().client_xact_id) &&
(xact_iacq.client_id === io.iacq().client_id) &&
pending_ignt
def iacq_same_xact_multibeat = iacq_same_xact && io.iacq().hasMultibeatData()
def iacq_can_merge: Bool
def iacq_is_allocating: Bool = state === s_idle && io.alloc.iacq.should && io.inner.acquire.valid
def iacq_is_merging: Bool = (iacq_can_merge || iacq_same_xact) && io.inner.acquire.valid
def innerAcquire(can_alloc: Bool, next: UInt) {
val iacq_matches_head = iacq_same_xact && xact_iacq.addr_beat === io.iacq().addr_beat
// Enqueue some metadata information that we'll use to make coherence updates with later
ignt_q.io.enq.valid := iacq_is_allocating ||
(!iacq_matches_head && pending_ignt &&
io.inner.acquire.fire() && io.iacq().first())
ignt_q.io.enq.bits := io.iacq()
// Use the outputs of the queue to make further messages
xact_iacq := Mux(ignt_q.io.deq.valid, ignt_q.io.deq.bits, ignt_q.io.enq.bits)
xact_addr_beat := xact_iacq.addr_beat
pending_ignt := ignt_q.io.count > UInt(0)
// Track whether any beats are missing from a PutBlock
when (state =/= s_idle || io.alloc.iacq.should) {
pending_put_data := (pending_put_data &
dropPendingBitWhenBeatHasData(io.inner.acquire)) |
addPendingBitsOnFirstBeat(io.inner.acquire)
}
// Initialize transaction metadata for accepted Acquire
when(iacq_is_allocating) {
xact_addr_block := io.iacq().addr_block
xact_allocate := io.iacq().allocate() && can_alloc
xact_amo_shift_bytes := io.iacq().amo_shift_bytes()
xact_op_code := io.iacq().op_code()
xact_addr_byte := io.iacq().addr_byte()
xact_op_size := io.iacq().op_size()
// Make sure to collect all data from a PutBlock
pending_put_data := Mux(
io.iacq().isBuiltInType(Acquire.putBlockType),
dropPendingBitWhenBeatHasData(io.inner.acquire),
UInt(0))
pending_ignt_data := UInt(0)
state := next
}
scoreboard += (pending_put_data.orR)
}
def innerGrant(
data: UInt = io.ognt().data,
external_pending: Bool = Bool(false),
buffering: Bool = Bool(true),
add_pending_bits: UInt = UInt(0)) {
// Track the number of outstanding inner.finishes
connectTwoWayBeatCounters(
status = ifin_counter,
up = io.inner.grant,
down = io.inner.finish,
max = nSecondaryMisses,
trackUp = (g: Grant) => g.requiresAck())
// Track which beats are ready for response
when(!iacq_is_allocating) {
pending_ignt_data := pending_ignt_data |
addPendingBitWhenBeatHasData(io.inner.release) |
addPendingBitWhenBeatHasData(io.outer.grant) |
add_pending_bits
}
if (p(EnableL2Logging)) {
when (io.inner.grant.fire() && io.ignt().hasData()) {
printf("[get] addr_block=%x addr_beat=%d data=%x\n",
xact_addr_block, io.ignt().addr_beat, io.ignt().data)
}
}
// Have we finished receiving the complete inner acquire transaction?
val iacq_finished = !(state === s_idle ||
state === s_meta_read ||
pending_put_data.orR)
val ignt_from_iacq = inner_coh.makeGrant(
sec = ignt_q.io.deq.bits,
manager_xact_id = UInt(trackerId),
data = data)
// Make the Grant message using the data stored in the secondary miss queue
val (cnt, done) = connectOutgoingDataBeatCounter(io.inner.grant, ignt_q.io.deq.bits.addr_beat)
ignt_data_idx := cnt
ignt_data_done := done
ignt_q.io.deq.ready := Bool(false)
when(!vol_ignt_counter.pending) {
ignt_q.io.deq.ready := ignt_data_done
io.inner.grant.bits := ignt_from_iacq
io.inner.grant.bits.addr_beat := ignt_data_idx // override based on outgoing counter
when (state === s_busy && pending_ignt) {
io.inner.grant.valid := !external_pending &&
Mux(io.ignt().hasData(),
Mux(buffering,
pending_ignt_data(ignt_data_idx),
io.outer.grant.valid),
iacq_finished)
}
}
// We must wait for as many Finishes as we sent Grants
io.inner.finish.ready := state === s_busy
scoreboard += (pending_ignt, ifin_counter.pending)
}
}
trait EmitsOuterAcquires extends AcceptsInnerAcquires {
val ognt_counter = Wire(new TwoWayBeatCounterStatus)
// Handle misses or coherence permission upgrades by initiating a new transaction in the outer memory:
//
// If we're allocating in this cache, we can use the current metadata
// to make an appropriate custom Acquire, otherwise we copy over the
// built-in Acquire from the inner TL to the outer TL
def outerAcquire(
caching: Bool,
coh: ClientMetadata,
block_outer_acquire: Bool = Bool(false),
buffering: Bool = Bool(true),
data: UInt = io.iacq().data,
wmask: UInt = io.iacq().wmask(),
next: UInt = s_busy) {
// Tracks outstanding Acquires, waiting for their matching Grant.
connectTwoWayBeatCounters(
status = ognt_counter,
up = io.outer.acquire,
down = io.outer.grant,
beat = xact_addr_beat,
trackDown = (g: Grant) => !g.isVoluntary())
io.outer.acquire.valid :=
state === s_outer_acquire && !block_outer_acquire &&
(xact_allocate ||
Mux(buffering,
!pending_put_data(ognt_counter.up.idx),
// If not buffering, we should only send an outer acquire if
// the ignt_q is not empty (pending_ignt) and the enqueued
// transaction does not have data or we are receiving the
// inner acquire and it is the same transaction as the one enqueued.
pending_ignt && (!xact_iacq.hasData() ||
(io.inner.acquire.valid && iacq_same_xact))))
io.outer.acquire.bits :=
Mux(caching,
coh.makeAcquire(
op_code = xact_op_code,
client_xact_id = UInt(0),
addr_block = xact_addr_block),
BuiltInAcquireBuilder(
a_type = xact_iacq.a_type,
client_xact_id = UInt(0),
addr_block = xact_addr_block,
addr_beat = ognt_counter.up.idx,
data = data,
addr_byte = xact_addr_byte,
operand_size = xact_op_size,
opcode = xact_op_code,
wmask = wmask,
alloc = Bool(false))
(p.alterPartial({ case TLId => p(OuterTLId)})))
when(state === s_outer_acquire && ognt_counter.up.done) { state := next }
when (ognt_counter.pending) { io.outer.grant.ready := Bool(true) }
scoreboard += ognt_counter.pending
}
}
abstract class VoluntaryReleaseTracker(val trackerId: Int)(implicit p: Parameters) extends XactTracker()(p)
with AcceptsVoluntaryReleases
with RoutesInParent {
def irel_can_merge = Bool(false)
def irel_same_xact = io.irel().conflicts(xact_addr_block) &&
io.irel().isVoluntary() &&
pending_irel_data.orR
}
abstract class AcquireTracker(val trackerId: Int)(implicit p: Parameters) extends XactTracker()(p)
with AcceptsInnerAcquires
with EmitsOuterAcquires
with EmitsInnerProbes
with RoutesInParent {
}

View File

@ -0,0 +1,43 @@
// See LICENSE for license details.
package uncore.coherence
import Chisel._
// This class encapsulates transformations on different directory information
// storage formats
abstract class DirectoryRepresentation(val width: Int) {
def pop(prev: UInt, id: UInt): UInt
def push(prev: UInt, id: UInt): UInt
def flush: UInt
def none(s: UInt): Bool
def one(s: UInt): Bool
def count(s: UInt): UInt
def next(s: UInt): UInt
def full(s: UInt): UInt
}
abstract trait HasDirectoryRepresentation {
val dir: DirectoryRepresentation
}
class NullRepresentation(nClients: Int) extends DirectoryRepresentation(1) {
def pop(prev: UInt, id: UInt) = UInt(0)
def push(prev: UInt, id: UInt) = UInt(0)
def flush = UInt(0)
def none(s: UInt) = Bool(false)
def one(s: UInt) = Bool(false)
def count(s: UInt) = UInt(nClients)
def next(s: UInt) = UInt(0)
def full(s: UInt) = SInt(-1, width = nClients).asUInt
}
class FullRepresentation(nClients: Int) extends DirectoryRepresentation(nClients) {
def pop(prev: UInt, id: UInt) = prev & ~UIntToOH(id)
def push(prev: UInt, id: UInt) = prev | UIntToOH(id)
def flush = UInt(0, width = width)
def none(s: UInt) = s === UInt(0)
def one(s: UInt) = PopCount(s) === UInt(1)
def count(s: UInt) = PopCount(s)
def next(s: UInt) = PriorityEncoder(s)
def full(s: UInt) = s
}
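// A plain-Scala model of FullRepresentation's sharer bookkeeping above
// (illustrative only): the directory is an N-hot bitvector, pushed on Grant,
// popped on Release, with count and next derived by popcount and priority
// encoding.
object DirectorySketch extends App {
  var sharers = 0 // models flush
  def push(id: Int): Unit = sharers |= (1 << id)
  def pop(id: Int): Unit  = sharers &= ~(1 << id)
  def count: Int = Integer.bitCount(sharers)
  def next: Int  = Integer.numberOfTrailingZeros(sharers) // models PriorityEncoder
  push(1); push(3)
  assert(count == 2 && next == 1)
  pop(1)
  assert(count == 1 && next == 3)
}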

View File

@ -0,0 +1,344 @@
// See LICENSE for license details.
package uncore.coherence
import Chisel._
import uncore.tilelink._
import uncore.constants._
import cde.{Parameters, Field}
/** Identifies the TLId of the inner network in a hierarchical cache controller */
case object InnerTLId extends Field[String]
/** Identifies the TLId of the outer network in a hierarchical cache controller */
case object OuterTLId extends Field[String]
/** Base class to represent coherence information in clients and managers */
abstract class CoherenceMetadata(implicit p: Parameters) extends TLBundle()(p) {
val co = tlCoh
}
/** Stores the client-side coherence information,
* such as permissions on the data and whether the data is dirty.
* Its API can be used to make TileLink messages in response to
* memory operations or [[uncore.Probe]] messages.
*/
class ClientMetadata(implicit p: Parameters) extends CoherenceMetadata()(p) {
/** Actual state information stored in this bundle */
val state = UInt(width = co.clientStateWidth)
/** Metadata equality */
def ===(rhs: ClientMetadata): Bool = this.state === rhs.state
def =/=(rhs: ClientMetadata): Bool = !this.===(rhs)
/** Is the block's data present in this cache */
def isValid(dummy: Int = 0): Bool = co.isValid(this)
/** Does this cache have permissions on this block sufficient to perform op */
def isHit(op_code: UInt): Bool = co.isHit(op_code, this)
/** Does this cache lack permissions on this block sufficient to perform op */
def isMiss(op_code: UInt): Bool = !co.isHit(op_code, this)
/** Does a secondary miss on the block require another Acquire message */
def requiresAcquireOnSecondaryMiss(first_op: UInt, second_op: UInt): Bool =
co.requiresAcquireOnSecondaryMiss(first_op, second_op, this)
/** Does op require a Release to be made to outer memory */
def requiresReleaseOnCacheControl(op_code: UInt): Bool =
co.requiresReleaseOnCacheControl(op_code: UInt, this)
/** Does an eviction require a Release to be made to outer memory */
def requiresVoluntaryWriteback(dummy: Int = 0): Bool =
co.requiresReleaseOnCacheControl(M_FLUSH, this)
/** Constructs an Acquire message based on this metadata and a memory operation
*
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
* @param op_code a memory operation from [[uncore.constants.MemoryOpConstants]]
*/
def makeAcquire(
op_code: UInt,
client_xact_id: UInt,
addr_block: UInt): Acquire = {
Acquire(
is_builtin_type = Bool(false),
a_type = co.getAcquireType(op_code, this),
client_xact_id = client_xact_id,
addr_block = addr_block,
union = Cat(op_code, Bool(true)))(p)
}
/** Constructs a Release message based on this metadata on cache control op
*
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
* @param addr_beat sub-block address (which beat)
* @param data data being written back
*/
def makeVoluntaryRelease(
op_code: UInt,
client_xact_id: UInt,
addr_block: UInt,
addr_beat: UInt = UInt(0),
data: UInt = UInt(0)): Release =
Release(
voluntary = Bool(true),
r_type = co.getReleaseType(op_code, this),
client_xact_id = client_xact_id,
addr_block = addr_block,
addr_beat = addr_beat,
data = data)(p)
/** Constructs a Release message based on this metadata on an eviction
*
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
* @param addr_beat sub-block address (which beat)
* @param data data being written back
*/
def makeVoluntaryWriteback(
client_xact_id: UInt,
addr_block: UInt,
addr_beat: UInt = UInt(0),
data: UInt = UInt(0)): Release =
makeVoluntaryRelease(
op_code = M_FLUSH,
client_xact_id = client_xact_id,
addr_block = addr_block,
addr_beat = addr_beat,
data = data)
/** Constructs a Release message based on this metadata and a [[uncore.Probe]]
*
* @param prb the incoming [[uncore.Probe]]
* @param addr_beat sub-block address (which beat)
* @param data data being released
*/
def makeRelease(
prb: Probe,
addr_beat: UInt = UInt(0),
data: UInt = UInt(0)): Release =
Release(
voluntary = Bool(false),
r_type = co.getReleaseType(prb, this),
client_xact_id = UInt(0),
addr_block = prb.addr_block,
addr_beat = addr_beat,
data = data)(p)
/** New metadata after receiving a [[uncore.Grant]]
*
* @param incoming the incoming [[uncore.Grant]]
* @param pending the mem op that triggered this transaction
*/
def onGrant(incoming: Grant, pending: UInt): ClientMetadata =
co.clientMetadataOnGrant(incoming, pending, this)
/** New metadata after receiving a [[uncore.Probe]]
*
* @param incoming the incoming [[uncore.Probe]]
*/
def onProbe(incoming: Probe): ClientMetadata =
co.clientMetadataOnProbe(incoming, this)
/** New metadata after a op_code hits this block
*
* @param op_code a memory operation from [[uncore.constants.MemoryOpConstants]]
*/
def onHit(op_code: UInt): ClientMetadata =
co.clientMetadataOnHit(op_code, this)
/** New metadata after op_code releases permissions on this block
*
* @param op_code a memory operation from [[uncore.constants.MemoryOpConstants]]
*/
def onCacheControl(op_code: UInt): ClientMetadata =
co.clientMetadataOnCacheControl(op_code, this)
}
/** Factories for ClientMetadata, including on reset */
object ClientMetadata {
def apply(state: UInt)(implicit p: Parameters) = {
val meta = Wire(new ClientMetadata)
meta.state := state
meta
}
def onReset(implicit p: Parameters) = ClientMetadata(UInt(0))(p) // TODO: assumes clientInvalid === 0
}
/** Stores manager-side information about the status
* of a cache block, including whether it has any known sharers.
*
* Its API can be used to create [[uncore.Probe]] and [[uncore.Grant]] messages.
*/
class ManagerMetadata(implicit p: Parameters) extends CoherenceMetadata()(p) {
// Currently no coherence policies assume manager-side state information
// val state = UInt(width = co.masterStateWidth) TODO: Fix 0-width wires in Chisel
/** The directory information for this block */
val sharers = UInt(width = co.dir.width)
/** Metadata equality */
def ===(rhs: ManagerMetadata): Bool = //this.state === rhs.state && TODO: Fix 0-width wires in Chisel
this.sharers === rhs.sharers
def =/=(rhs: ManagerMetadata): Bool = !this.===(rhs)
/** Converts the directory info into an N-hot sharer bitvector (i.e. full representation) */
def full(dummy: Int = 0): UInt = co.dir.full(this.sharers)
/** Does this [[uncore.Acquire]] require [[uncore.Probe Probes]] to be sent */
def requiresProbes(acq: HasAcquireType): Bool = co.requiresProbes(acq, this)
/** Does this memory op require [[uncore.Probe Probes]] to be sent */
def requiresProbes(op_code: UInt): Bool = co.requiresProbes(op_code, this)
/** Does an eviction require [[uncore.Probe Probes]] to be sent */
def requiresProbesOnVoluntaryWriteback(dummy: Int = 0): Bool =
co.requiresProbes(M_FLUSH, this)
/** Construct an appropriate [[uncore.ProbeToDst]] for a given [[uncore.Acquire]]
*
* @param dst Destination client id for this Probe
* @param acq Acquire message triggering this Probe
* @param addr_block address of the cache block being probed
*/
def makeProbe(dst: UInt, acq: HasAcquireType, addr_block: UInt): ProbeToDst =
Probe(dst, co.getProbeType(acq, this), addr_block)(p)
/** Construct an appropriate [[uncore.ProbeToDst]] for a given [[uncore.Acquire]]
*
* @param dst Destination client id for this Probe
* @param acq Acquire message triggering this Probe
*/
def makeProbe(dst: UInt, acq: AcquireMetadata): ProbeToDst =
Probe(dst, co.getProbeType(acq, this), acq.addr_block)(p)
/** Construct an appropriate [[uncore.ProbeToDst]] for a given mem op
*
* @param dst Destination client id for this Probe
* @param op_code memory operation triggering this Probe
* @param addr_block address of the cache block being probed
*/
def makeProbe(dst: UInt, op_code: UInt, addr_block: UInt): ProbeToDst =
Probe(dst, co.getProbeType(op_code, this), addr_block)(p)
/** Construct an appropriate [[uncore.ProbeToDst]] for an eviction
*
* @param dst Destination client id for this Probe
* @param addr_block address of the cache block being probed prior to eviction
*/
def makeProbeForVoluntaryWriteback(dst: UInt, addr_block: UInt): ProbeToDst =
makeProbe(dst, M_FLUSH, addr_block)
/** Construct an appropriate [[uncore.GrantToDst]] to acknowledge an [[uncore.Release]]
*
* @param rel Release message being acknowledged by this Grant
*/
def makeGrant(rel: ReleaseMetadata with HasClientId): GrantToDst =
Grant(
dst = rel.client_id,
is_builtin_type = Bool(true),
g_type = Grant.voluntaryAckType,
client_xact_id = rel.client_xact_id,
manager_xact_id = UInt(0))(p)
/** Construct an appropriate [[uncore.GrantToDst]] to respond to an [[uncore.Acquire]]
*
* May contain single or multiple beats of data, or just be a permissions upgrade.
*
* @param acq Acquire message being responded to by this Grant
* @param manager_xact_id manager's transaction id
* @param addr_beat beat id of the data
* @param data data being refilled to the original requestor
*/
def makeGrant(
acq: AcquireMetadata with HasClientId,
manager_xact_id: UInt,
addr_beat: UInt = UInt(0),
data: UInt = UInt(0)): GrantToDst =
Grant(
dst = acq.client_id,
is_builtin_type = acq.isBuiltInType(),
g_type = co.getGrantType(acq, this),
client_xact_id = acq.client_xact_id,
manager_xact_id = manager_xact_id,
addr_beat = addr_beat,
data = data)(p)
/** Construct an [[uncore.GrantToDst]] to respond to an [[uncore.Acquire]] with some overrides
*
* Used to respond to secondary misses merged into this transaction.
* May contain single or multiple beats of data.
*
* @param sec Secondary miss info
* @param manager_xact_id manager's transaction id
* @param data data being refilled to the original requestor
*/
def makeGrant(
sec: SecondaryMissInfo,
manager_xact_id: UInt,
data: UInt): GrantToDst = {
Grant(
dst = sec.client_id,
is_builtin_type = sec.isBuiltInType(),
g_type = co.getGrantType(sec, this),
client_xact_id = sec.client_xact_id,
manager_xact_id = manager_xact_id,
addr_beat = sec.addr_beat,
data = data)(p)
}
/** New metadata after receiving a [[uncore.ReleaseFromSrc]]
*
* @param incoming the incoming [[uncore.ReleaseFromSrc]]
*/
def onRelease(incoming: ReleaseMetadata with HasClientId): ManagerMetadata =
co.managerMetadataOnRelease(incoming, incoming.client_id, this)
/** New metadata after sending a [[uncore.GrantToDst]]
*
* @param outgoing the outgoing [[uncore.GrantToDst]]
*/
def onGrant(outgoing: GrantMetadata with HasClientId): ManagerMetadata =
co.managerMetadataOnGrant(outgoing, outgoing.client_id, this)
}
/** Factories for ManagerMetadata, including on reset */
object ManagerMetadata {
def apply(sharers: UInt, state: UInt = UInt(width = 0))(implicit p: Parameters) = {
val meta = Wire(new ManagerMetadata)
//meta.state := state TODO: Fix 0-width wires in Chisel
meta.sharers := sharers
meta
}
def apply(implicit p: Parameters) = {
val meta = Wire(new ManagerMetadata)
//meta.state := UInt(width = 0) TODO: Fix 0-width wires in Chisel
meta.sharers := meta.co.dir.flush
meta
}
def onReset(implicit p: Parameters) = ManagerMetadata(p)
}
/** HierarchicalMetadata is used in a cache in a multi-level memory hierarchy
* that is a manager with respect to some inner caches and a client with
* respect to some outer cache.
*
* This class makes use of two different sets of TileLink parameters, which are
* applied by contextually mapping [[uncore.TLId]] to one of
* [[uncore.InnerTLId]] or [[uncore.OuterTLId]].
*/
class HierarchicalMetadata(implicit p: Parameters) extends CoherenceMetadata()(p) {
val inner: ManagerMetadata = new ManagerMetadata()(p.alterPartial({case TLId => p(InnerTLId)}))
val outer: ClientMetadata = new ClientMetadata()(p.alterPartial({case TLId => p(OuterTLId)}))
def ===(rhs: HierarchicalMetadata): Bool =
this.inner === rhs.inner && this.outer === rhs.outer
def =/=(rhs: HierarchicalMetadata): Bool = !this.===(rhs)
}
/** Factories for HierarchicalMetadata, including on reset */
object HierarchicalMetadata {
def apply(inner: ManagerMetadata, outer: ClientMetadata)
(implicit p: Parameters): HierarchicalMetadata = {
val m = Wire(new HierarchicalMetadata)
m.inner := inner
m.outer := outer
m
}
def onReset(implicit p: Parameters): HierarchicalMetadata =
apply(ManagerMetadata.onReset, ClientMetadata.onReset)
}
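// A plain-Scala caricature of the ClientMetadata life cycle above
// (illustrative only; the states are invented, and the real transitions come
// from the CoherencePolicy via clientMetadataOnGrant/onProbe): a miss produces
// an Acquire, the Grant upgrades permissions, and an invalidating Probe later
// revokes them.
object ClientMetaSketch extends App {
  sealed trait St
  case object Invalid extends St
  case object Clean   extends St
  case object Dirty   extends St
  def onGrant(writeIntent: Boolean): St = if (writeIntent) Dirty else Clean
  def onProbe(s: St): St = Invalid   // assume an invalidating probe for the sketch
  var meta: St = Invalid
  meta = onGrant(writeIntent = true) // write miss refilled with write permission
  assert(meta == Dirty)
  meta = onProbe(meta)               // another client wants the block
  assert(meta == Invalid)
}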

View File

@ -0,0 +1,696 @@
// See LICENSE for license details.
package uncore.coherence
import Chisel._
import uncore.tilelink._
import uncore.constants._
import uncore.util._
/** The entire CoherencePolicy API consists of the following three traits:
* HasCustomTileLinkMessageTypes, used to define custom messages
* HasClientSideCoherencePolicy, for client coherence agents
* HasManagerSideCoherencePolicy, for manager coherence agents
*/
abstract class CoherencePolicy(val dir: DirectoryRepresentation)
extends HasCustomTileLinkMessageTypes
with HasClientSideCoherencePolicy
with HasManagerSideCoherencePolicy
/** This API defines the custom, coherence-policy-defined message types,
* as opposed to the built-in ones found in tilelink.scala.
* Policies must enumerate the custom messages to be sent over each
* channel, as well as which of them have associated data.
*/
trait HasCustomTileLinkMessageTypes {
val nAcquireTypes: Int
def acquireTypeWidth = log2Up(nAcquireTypes)
val nProbeTypes: Int
def probeTypeWidth = log2Up(nProbeTypes)
val nReleaseTypes: Int
def releaseTypeWidth = log2Up(nReleaseTypes)
val nGrantTypes: Int
def grantTypeWidth = log2Up(nGrantTypes)
val acquireTypesWithData = Nil // Only built-in Acquire types have data for now
def releaseTypesWithData: Seq[UInt]
def grantTypesWithData: Seq[UInt]
}
/** This API contains all functions required for client coherence agents.
* Policies must enumerate the number of client states and define their
* permissions with respect to memory operations. Policies must fill in functions
* to control which messages are sent and how metadata is updated in response
* to coherence events. These functions are generally called from within the
* ClientMetadata class in metadata.scala
*/
trait HasClientSideCoherencePolicy {
// Client coherence states and their permissions
val nClientStates: Int
def clientStateWidth = log2Ceil(nClientStates)
def clientStatesWithReadPermission: Seq[UInt]
def clientStatesWithWritePermission: Seq[UInt]
def clientStatesWithDirtyData: Seq[UInt]
// Transaction initiation logic
def isValid(meta: ClientMetadata): Bool
def isHit(cmd: UInt, meta: ClientMetadata): Bool = {
Mux(isWriteIntent(cmd),
meta.state isOneOf clientStatesWithWritePermission,
meta.state isOneOf clientStatesWithReadPermission)
}
//TODO: Assumes all states with write permissions also have read permissions
def requiresAcquireOnSecondaryMiss(
first_cmd: UInt,
second_cmd: UInt,
meta: ClientMetadata): Bool = {
isWriteIntent(second_cmd) && !isWriteIntent(first_cmd)
}
//TODO: Assumes all cache ctrl ops write back dirty data, and
// doesn't issue transaction when e.g. downgrading Exclusive to Shared:
def requiresReleaseOnCacheControl(cmd: UInt, meta: ClientMetadata): Bool =
meta.state isOneOf clientStatesWithDirtyData
// Determine which custom message type to use
def getAcquireType(cmd: UInt, meta: ClientMetadata): UInt
def getReleaseType(cmd: UInt, meta: ClientMetadata): UInt
def getReleaseType(p: HasProbeType, meta: ClientMetadata): UInt
// Mutate ClientMetadata based on messages or cmds
def clientMetadataOnHit(cmd: UInt, meta: ClientMetadata): ClientMetadata
def clientMetadataOnCacheControl(cmd: UInt, meta: ClientMetadata): ClientMetadata
def clientMetadataOnGrant(incoming: HasGrantType, cmd: UInt, meta: ClientMetadata): ClientMetadata
def clientMetadataOnProbe(incoming: HasProbeType, meta: ClientMetadata): ClientMetadata
}
/** This API contains all functions required for manager coherence agents.
* Policies must enumerate the number of manager states. Policies must fill
* in functions to control which Probe and Grant messages are sent and how
* metadata should be updated in response to coherence events. These functions
* are generally called from within the ManagerMetadata class in metadata.scala
*/
trait HasManagerSideCoherencePolicy extends HasDirectoryRepresentation {
val nManagerStates: Int
def masterStateWidth = log2Ceil(nManagerStates)
// Transaction probing logic
def requiresProbes(acq: HasAcquireType, meta: ManagerMetadata): Bool
def requiresProbes(cmd: UInt, meta: ManagerMetadata): Bool
// Determine which custom message type to use in response
def getProbeType(cmd: UInt, meta: ManagerMetadata): UInt
def getProbeType(acq: HasAcquireType, meta: ManagerMetadata): UInt
def getGrantType(acq: HasAcquireType, meta: ManagerMetadata): UInt
def getExclusiveGrantType(): UInt
// Mutate ManagerMetadata based on messages or cmds
def managerMetadataOnRelease(incoming: HasReleaseType, src: UInt, meta: ManagerMetadata): ManagerMetadata
def managerMetadataOnGrant(outgoing: HasGrantType, dst: UInt, meta: ManagerMetadata) =
ManagerMetadata(sharers=Mux(outgoing.isBuiltInType(), // Assumes all built-ins are uncached
meta.sharers,
dir.push(meta.sharers, dst)))(meta.p)
//state = meta.state) TODO: Fix 0-width wires in Chisel
}
/** The following concrete implementations of CoherencePolicy each provide the
* functionality of one particular protocol.
*/
/** A simple protocol with only two Client states.
* Data is always assumed to be dirty.
* Only a single client may ever have a copy of a block at a time.
*/
class MICoherence(dir: DirectoryRepresentation) extends CoherencePolicy(dir) {
// Message types
val nAcquireTypes = 1
val nProbeTypes = 2
val nReleaseTypes = 4
val nGrantTypes = 1
val acquireExclusive :: Nil = Enum(UInt(), nAcquireTypes)
val probeInvalidate :: probeCopy :: Nil = Enum(UInt(), nProbeTypes)
val releaseInvalidateData :: releaseCopyData :: releaseInvalidateAck :: releaseCopyAck :: Nil = Enum(UInt(), nReleaseTypes)
val grantExclusive :: Nil = Enum(UInt(), nGrantTypes)
def releaseTypesWithData = Seq(releaseInvalidateData, releaseCopyData)
def grantTypesWithData = Seq(grantExclusive)
// Client states and functions
val nClientStates = 2
val clientInvalid :: clientValid :: Nil = Enum(UInt(), nClientStates)
def clientStatesWithReadPermission = Seq(clientValid)
def clientStatesWithWritePermission = Seq(clientValid)
def clientStatesWithDirtyData = Seq(clientValid)
def isValid (meta: ClientMetadata): Bool = meta.state =/= clientInvalid
def getAcquireType(cmd: UInt, meta: ClientMetadata): UInt = acquireExclusive
def getReleaseType(cmd: UInt, meta: ClientMetadata): UInt = {
val dirty = meta.state isOneOf clientStatesWithDirtyData
MuxLookup(cmd, releaseCopyAck, Array(
M_FLUSH -> Mux(dirty, releaseInvalidateData, releaseInvalidateAck),
M_PRODUCE -> Mux(dirty, releaseCopyData, releaseCopyAck),
M_CLEAN -> Mux(dirty, releaseCopyData, releaseCopyAck)))
}
def getReleaseType(incoming: HasProbeType, meta: ClientMetadata): UInt =
MuxLookup(incoming.p_type, releaseInvalidateAck, Array(
probeInvalidate -> getReleaseType(M_FLUSH, meta),
probeCopy -> getReleaseType(M_FLUSH, meta)))
def clientMetadataOnHit(cmd: UInt, meta: ClientMetadata) = meta
def clientMetadataOnCacheControl(cmd: UInt, meta: ClientMetadata) =
ClientMetadata(Mux(cmd === M_FLUSH, clientInvalid, meta.state))(meta.p)
def clientMetadataOnGrant(incoming: HasGrantType, cmd: UInt, meta: ClientMetadata) =
ClientMetadata(Mux(incoming.isBuiltInType(), clientInvalid, clientValid))(meta.p)
def clientMetadataOnProbe(incoming: HasProbeType, meta: ClientMetadata) =
ClientMetadata(Mux(incoming.p_type === probeInvalidate,
clientInvalid, meta.state))(meta.p)
// Manager states and functions:
val nManagerStates = 0 // We don't actually need any states for this protocol
def requiresProbes(a: HasAcquireType, meta: ManagerMetadata) = !dir.none(meta.sharers)
def requiresProbes(cmd: UInt, meta: ManagerMetadata) = !dir.none(meta.sharers)
def getProbeType(cmd: UInt, meta: ManagerMetadata): UInt =
MuxLookup(cmd, probeCopy, Array(
M_FLUSH -> probeInvalidate))
def getProbeType(a: HasAcquireType, meta: ManagerMetadata): UInt =
Mux(a.isBuiltInType(),
MuxLookup(a.a_type, probeCopy, Array(
Acquire.getBlockType -> probeCopy,
Acquire.putBlockType -> probeInvalidate,
Acquire.getType -> probeCopy,
Acquire.putType -> probeInvalidate,
Acquire.getPrefetchType -> probeCopy,
Acquire.putPrefetchType -> probeInvalidate,
Acquire.putAtomicType -> probeInvalidate)),
probeInvalidate)
def getGrantType(a: HasAcquireType, meta: ManagerMetadata): UInt =
Mux(a.isBuiltInType(), Acquire.getBuiltInGrantType(a.a_type), grantExclusive)
def getExclusiveGrantType(): UInt = grantExclusive
def managerMetadataOnRelease(incoming: HasReleaseType, src: UInt, meta: ManagerMetadata) = {
val popped = ManagerMetadata(sharers=dir.pop(meta.sharers, src))(meta.p)
MuxCase(meta, Array(
incoming.is(releaseInvalidateData) -> popped,
incoming.is(releaseInvalidateAck) -> popped))
}
}
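/** Construction sketch (illustrative): a policy is parameterized by a
* DirectoryRepresentation sized for the clients it must track, e.g.
* {{{
* val policy = new MICoherence(new NullRepresentation(nClients))
* }}}
* where nClients is a hypothetical client count and NullRepresentation (from
* uncore.agents) is the track-nothing directory used with broadcast hubs.
*/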
/** A simple protocol with only three Client states.
* Data is marked as dirty when written.
* Only a single client may ever have a copy of a block at a time.
*/
class MEICoherence(dir: DirectoryRepresentation) extends CoherencePolicy(dir) {
// Message types
val nAcquireTypes = 2
val nProbeTypes = 3
val nReleaseTypes = 6
val nGrantTypes = 1
val acquireExclusiveClean :: acquireExclusiveDirty :: Nil = Enum(UInt(), nAcquireTypes)
val probeInvalidate :: probeDowngrade :: probeCopy :: Nil = Enum(UInt(), nProbeTypes)
val releaseInvalidateData :: releaseDowngradeData :: releaseCopyData :: releaseInvalidateAck :: releaseDowngradeAck :: releaseCopyAck :: Nil = Enum(UInt(), nReleaseTypes)
val grantExclusive :: Nil = Enum(UInt(), nGrantTypes)
def releaseTypesWithData = Seq(releaseInvalidateData, releaseDowngradeData, releaseCopyData)
def grantTypesWithData = Seq(grantExclusive)
// Client states and functions
val nClientStates = 3
val clientInvalid :: clientExclusiveClean :: clientExclusiveDirty :: Nil = Enum(UInt(), nClientStates)
def clientStatesWithReadPermission = Seq(clientExclusiveClean, clientExclusiveDirty)
def clientStatesWithWritePermission = Seq(clientExclusiveClean, clientExclusiveDirty)
def clientStatesWithDirtyData = Seq(clientExclusiveDirty)
def isValid (meta: ClientMetadata) = meta.state =/= clientInvalid
def getAcquireType(cmd: UInt, meta: ClientMetadata): UInt =
Mux(isWriteIntent(cmd), acquireExclusiveDirty, acquireExclusiveClean)
def getReleaseType(cmd: UInt, meta: ClientMetadata): UInt = {
val dirty = meta.state isOneOf clientStatesWithDirtyData
MuxLookup(cmd, releaseCopyAck, Array(
M_FLUSH -> Mux(dirty, releaseInvalidateData, releaseInvalidateAck),
M_PRODUCE -> Mux(dirty, releaseDowngradeData, releaseDowngradeAck),
M_CLEAN -> Mux(dirty, releaseCopyData, releaseCopyAck)))
}
def getReleaseType(incoming: HasProbeType, meta: ClientMetadata): UInt =
MuxLookup(incoming.p_type, releaseInvalidateAck, Array(
probeInvalidate -> getReleaseType(M_FLUSH, meta),
probeDowngrade -> getReleaseType(M_FLUSH, meta),
probeCopy -> getReleaseType(M_FLUSH, meta)))
def clientMetadataOnHit(cmd: UInt, meta: ClientMetadata) =
ClientMetadata(Mux(isWrite(cmd), clientExclusiveDirty, meta.state))(meta.p)
def clientMetadataOnCacheControl(cmd: UInt, meta: ClientMetadata) =
ClientMetadata(
MuxLookup(cmd, meta.state, Array(
M_FLUSH -> clientInvalid,
M_CLEAN -> Mux(meta.state === clientExclusiveDirty, clientExclusiveClean, meta.state))))(meta.p)
def clientMetadataOnGrant(incoming: HasGrantType, cmd: UInt, meta: ClientMetadata) =
ClientMetadata(
Mux(incoming.isBuiltInType(), clientInvalid,
Mux(isWrite(cmd), clientExclusiveDirty, clientExclusiveClean)))(meta.p)
def clientMetadataOnProbe(incoming: HasProbeType, meta: ClientMetadata) =
ClientMetadata(
MuxLookup(incoming.p_type, meta.state, Array(
probeInvalidate -> clientInvalid,
probeDowngrade -> clientInvalid,
probeCopy -> clientInvalid)))(meta.p)
// Manager states and functions:
val nManagerStates = 0 // We don't actually need any states for this protocol
def requiresProbes(a: HasAcquireType, meta: ManagerMetadata) = !dir.none(meta.sharers)
def requiresProbes(cmd: UInt, meta: ManagerMetadata) = !dir.none(meta.sharers)
def getProbeType(cmd: UInt, meta: ManagerMetadata): UInt =
MuxLookup(cmd, probeCopy, Array(
M_FLUSH -> probeInvalidate,
M_PRODUCE -> probeDowngrade))
def getProbeType(a: HasAcquireType, meta: ManagerMetadata): UInt =
Mux(a.isBuiltInType(),
MuxLookup(a.a_type, probeCopy, Array(
Acquire.getBlockType -> probeCopy,
Acquire.putBlockType -> probeInvalidate,
Acquire.getType -> probeCopy,
Acquire.putType -> probeInvalidate,
Acquire.getPrefetchType -> probeCopy,
Acquire.putPrefetchType -> probeInvalidate,
Acquire.putAtomicType -> probeInvalidate)),
probeInvalidate)
def getGrantType(a: HasAcquireType, meta: ManagerMetadata): UInt =
Mux(a.isBuiltInType(), Acquire.getBuiltInGrantType(a.a_type), grantExclusive)
def getExclusiveGrantType(): UInt = grantExclusive
def managerMetadataOnRelease(incoming: HasReleaseType, src: UInt, meta: ManagerMetadata) = {
val popped = ManagerMetadata(sharers=dir.pop(meta.sharers, src))(meta.p)
MuxCase(meta, Array(
incoming.is(releaseInvalidateData) -> popped,
incoming.is(releaseInvalidateAck) -> popped))
}
}
/** A protocol with only three Client states.
* Data is always assumed to be dirty.
* Multiple clients may share read permissions on a block at the same time.
*/
class MSICoherence(dir: DirectoryRepresentation) extends CoherencePolicy(dir) {
// Message types
val nAcquireTypes = 2
val nProbeTypes = 3
val nReleaseTypes = 6
val nGrantTypes = 3
val acquireShared :: acquireExclusive :: Nil = Enum(UInt(), nAcquireTypes)
val probeInvalidate :: probeDowngrade :: probeCopy :: Nil = Enum(UInt(), nProbeTypes)
val releaseInvalidateData :: releaseDowngradeData :: releaseCopyData :: releaseInvalidateAck :: releaseDowngradeAck :: releaseCopyAck :: Nil = Enum(UInt(), nReleaseTypes)
val grantShared :: grantExclusive :: grantExclusiveAck :: Nil = Enum(UInt(), nGrantTypes)
def releaseTypesWithData = Seq(releaseInvalidateData, releaseDowngradeData, releaseCopyData)
def grantTypesWithData = Seq(grantShared, grantExclusive)
// Client states and functions
val nClientStates = 3
val clientInvalid :: clientShared :: clientExclusiveDirty :: Nil = Enum(UInt(), nClientStates)
def clientStatesWithReadPermission = Seq(clientShared, clientExclusiveDirty)
def clientStatesWithWritePermission = Seq(clientExclusiveDirty)
def clientStatesWithDirtyData = Seq(clientExclusiveDirty)
def isValid(meta: ClientMetadata): Bool = meta.state =/= clientInvalid
def getAcquireType(cmd: UInt, meta: ClientMetadata): UInt =
Mux(isWriteIntent(cmd), acquireExclusive, acquireShared)
def getReleaseType(cmd: UInt, meta: ClientMetadata): UInt = {
val dirty = meta.state isOneOf clientStatesWithDirtyData
MuxLookup(cmd, releaseCopyAck, Array(
M_FLUSH -> Mux(dirty, releaseInvalidateData, releaseInvalidateAck),
M_PRODUCE -> Mux(dirty, releaseDowngradeData, releaseDowngradeAck),
M_CLEAN -> Mux(dirty, releaseCopyData, releaseCopyAck)))
}
def getReleaseType(incoming: HasProbeType, meta: ClientMetadata): UInt =
MuxLookup(incoming.p_type, releaseInvalidateAck, Array(
probeInvalidate -> getReleaseType(M_FLUSH, meta),
probeDowngrade -> getReleaseType(M_PRODUCE, meta),
probeCopy -> getReleaseType(M_PRODUCE, meta)))
def clientMetadataOnHit(cmd: UInt, meta: ClientMetadata) =
ClientMetadata(Mux(isWrite(cmd), clientExclusiveDirty, meta.state))(meta.p)
def clientMetadataOnCacheControl(cmd: UInt, meta: ClientMetadata) =
ClientMetadata(
MuxLookup(cmd, meta.state, Array(
M_FLUSH -> clientInvalid,
M_PRODUCE -> Mux(meta.state isOneOf clientStatesWithWritePermission,
clientShared, meta.state))))(meta.p)
def clientMetadataOnGrant(incoming: HasGrantType, cmd: UInt, meta: ClientMetadata) =
ClientMetadata(
Mux(incoming.isBuiltInType(), clientInvalid,
MuxLookup(incoming.g_type, clientInvalid, Array(
grantShared -> clientShared,
grantExclusive -> clientExclusiveDirty,
grantExclusiveAck -> clientExclusiveDirty))))(meta.p)
def clientMetadataOnProbe(incoming: HasProbeType, meta: ClientMetadata) =
ClientMetadata(
MuxLookup(incoming.p_type, meta.state, Array(
probeInvalidate -> clientInvalid,
probeDowngrade -> clientShared,
probeCopy -> clientShared)))(meta.p)
// Manager states and functions:
val nManagerStates = 0 // TODO: We could add a Shared state to avoid probing
// only a single sharer (also would need
// notification msg to track clean drops)
// Also could avoid probes on outer WBs.
def requiresProbes(a: HasAcquireType, meta: ManagerMetadata) =
Mux(dir.none(meta.sharers), Bool(false),
Mux(dir.one(meta.sharers), Bool(true), //TODO: for now we assume it's Exclusive
Mux(a.isBuiltInType(), a.hasData(), a.a_type =/= acquireShared)))
def requiresProbes(cmd: UInt, meta: ManagerMetadata) = !dir.none(meta.sharers)
def getProbeType(cmd: UInt, meta: ManagerMetadata): UInt =
MuxLookup(cmd, probeCopy, Array(
M_FLUSH -> probeInvalidate,
M_PRODUCE -> probeDowngrade))
def getProbeType(a: HasAcquireType, meta: ManagerMetadata): UInt =
Mux(a.isBuiltInType(),
MuxLookup(a.a_type, probeCopy, Array(
Acquire.getBlockType -> probeCopy,
Acquire.putBlockType -> probeInvalidate,
Acquire.getType -> probeCopy,
Acquire.putType -> probeInvalidate,
Acquire.getPrefetchType -> probeCopy,
Acquire.putPrefetchType -> probeInvalidate,
Acquire.putAtomicType -> probeInvalidate)),
MuxLookup(a.a_type, probeCopy, Array(
acquireShared -> probeDowngrade,
acquireExclusive -> probeInvalidate)))
def getGrantType(a: HasAcquireType, meta: ManagerMetadata): UInt =
Mux(a.isBuiltInType(), Acquire.getBuiltInGrantType(a.a_type),
Mux(a.a_type === acquireShared,
Mux(!dir.none(meta.sharers), grantShared, grantExclusive),
grantExclusive))
def getExclusiveGrantType(): UInt = grantExclusive
def managerMetadataOnRelease(incoming: HasReleaseType, src: UInt, meta: ManagerMetadata) = {
val popped = ManagerMetadata(sharers=dir.pop(meta.sharers, src))(meta.p)
MuxCase(meta, Array(
incoming.is(releaseInvalidateData) -> popped,
incoming.is(releaseInvalidateAck) -> popped))
}
}
/** A protocol with four Client states.
* Data is marked as dirty when written.
* Multiple clients may share read permissions on a block at the same time.
*/
class MESICoherence(dir: DirectoryRepresentation) extends CoherencePolicy(dir) {
// Message types
val nAcquireTypes = 2
val nProbeTypes = 3
val nReleaseTypes = 6
val nGrantTypes = 3
val acquireShared :: acquireExclusive :: Nil = Enum(UInt(), nAcquireTypes)
val probeInvalidate :: probeDowngrade :: probeCopy :: Nil = Enum(UInt(), nProbeTypes)
val releaseInvalidateData :: releaseDowngradeData :: releaseCopyData :: releaseInvalidateAck :: releaseDowngradeAck :: releaseCopyAck :: Nil = Enum(UInt(), nReleaseTypes)
val grantShared :: grantExclusive :: grantExclusiveAck :: Nil = Enum(UInt(), nGrantTypes)
def releaseTypesWithData = Seq(releaseInvalidateData, releaseDowngradeData, releaseCopyData)
def grantTypesWithData = Seq(grantShared, grantExclusive)
// Client states and functions
val nClientStates = 4
val clientInvalid :: clientShared :: clientExclusiveClean :: clientExclusiveDirty :: Nil = Enum(UInt(), nClientStates)
def clientStatesWithReadPermission = Seq(clientShared, clientExclusiveClean, clientExclusiveDirty)
def clientStatesWithWritePermission = Seq(clientExclusiveClean, clientExclusiveDirty)
def clientStatesWithDirtyData = Seq(clientExclusiveDirty)
def isValid(meta: ClientMetadata): Bool = meta.state =/= clientInvalid
def getAcquireType(cmd: UInt, meta: ClientMetadata): UInt =
Mux(isWriteIntent(cmd), acquireExclusive, acquireShared)
def getReleaseType(cmd: UInt, meta: ClientMetadata): UInt = {
val dirty = meta.state isOneOf clientStatesWithDirtyData
MuxLookup(cmd, releaseCopyAck, Array(
M_FLUSH -> Mux(dirty, releaseInvalidateData, releaseInvalidateAck),
M_PRODUCE -> Mux(dirty, releaseDowngradeData, releaseDowngradeAck),
M_CLEAN -> Mux(dirty, releaseCopyData, releaseCopyAck)))
}
def getReleaseType(incoming: HasProbeType, meta: ClientMetadata): UInt =
MuxLookup(incoming.p_type, releaseInvalidateAck, Array(
probeInvalidate -> getReleaseType(M_FLUSH, meta),
probeDowngrade -> getReleaseType(M_PRODUCE, meta),
probeCopy -> getReleaseType(M_PRODUCE, meta)))
def clientMetadataOnHit(cmd: UInt, meta: ClientMetadata) =
ClientMetadata(Mux(isWrite(cmd), clientExclusiveDirty, meta.state))(meta.p)
def clientMetadataOnCacheControl(cmd: UInt, meta: ClientMetadata) =
ClientMetadata(
MuxLookup(cmd, meta.state, Array(
M_FLUSH -> clientInvalid,
M_PRODUCE -> Mux(meta.state isOneOf clientStatesWithWritePermission,
clientShared, meta.state),
M_CLEAN -> Mux(meta.state === clientExclusiveDirty,
clientExclusiveClean, meta.state))))(meta.p)
def clientMetadataOnGrant(incoming: HasGrantType, cmd: UInt, meta: ClientMetadata) =
ClientMetadata(
Mux(incoming.isBuiltInType(), clientInvalid,
MuxLookup(incoming.g_type, clientInvalid, Array(
grantShared -> clientShared,
grantExclusive -> Mux(isWrite(cmd), clientExclusiveDirty, clientExclusiveClean),
grantExclusiveAck -> clientExclusiveDirty))))(meta.p)
def clientMetadataOnProbe(incoming: HasProbeType, meta: ClientMetadata) =
ClientMetadata(
MuxLookup(incoming.p_type, meta.state, Array(
probeInvalidate -> clientInvalid,
probeDowngrade -> clientShared,
probeCopy -> clientShared)))(meta.p)
// Manager states and functions:
val nManagerStates = 0 // TODO: We could add a Shared state to avoid probing
// only a single sharer (also would need
// notification msg to track clean drops)
// Also could avoid probes on outer WBs.
def requiresProbes(a: HasAcquireType, meta: ManagerMetadata) =
Mux(dir.none(meta.sharers), Bool(false),
Mux(dir.one(meta.sharers), Bool(true), //TODO: for now we assume it's Exclusive
Mux(a.isBuiltInType(), a.hasData(), a.a_type =/= acquireShared)))
def requiresProbes(cmd: UInt, meta: ManagerMetadata) = !dir.none(meta.sharers)
def getProbeType(cmd: UInt, meta: ManagerMetadata): UInt =
MuxLookup(cmd, probeCopy, Array(
M_FLUSH -> probeInvalidate,
M_PRODUCE -> probeDowngrade))
def getProbeType(a: HasAcquireType, meta: ManagerMetadata): UInt =
Mux(a.isBuiltInType(),
MuxLookup(a.a_type, probeCopy, Array(
Acquire.getBlockType -> probeCopy,
Acquire.putBlockType -> probeInvalidate,
Acquire.getType -> probeCopy,
Acquire.putType -> probeInvalidate,
Acquire.getPrefetchType -> probeCopy,
Acquire.putPrefetchType -> probeInvalidate,
Acquire.putAtomicType -> probeInvalidate)),
MuxLookup(a.a_type, probeCopy, Array(
acquireShared -> probeDowngrade,
acquireExclusive -> probeInvalidate)))
def getGrantType(a: HasAcquireType, meta: ManagerMetadata): UInt =
Mux(a.isBuiltInType(), Acquire.getBuiltInGrantType(a.a_type),
Mux(a.a_type === acquireShared,
Mux(!dir.none(meta.sharers), grantShared, grantExclusive),
grantExclusive))
def getExclusiveGrantType(): UInt = grantExclusive
def managerMetadataOnRelease(incoming: HasReleaseType, src: UInt, meta: ManagerMetadata) = {
val popped = ManagerMetadata(sharers=dir.pop(meta.sharers, src))(meta.p)
MuxCase(meta, Array(
incoming.is(releaseInvalidateData) -> popped,
incoming.is(releaseInvalidateAck) -> popped))
}
}
class MigratoryCoherence(dir: DirectoryRepresentation) extends CoherencePolicy(dir) {
// Message types
val nAcquireTypes = 3
val nProbeTypes = 4
val nReleaseTypes = 10
val nGrantTypes = 4
val acquireShared :: acquireExclusive :: acquireInvalidateOthers :: Nil = Enum(UInt(), nAcquireTypes)
val probeInvalidate :: probeDowngrade :: probeCopy :: probeInvalidateOthers :: Nil = Enum(UInt(), nProbeTypes)
val releaseInvalidateData :: releaseDowngradeData :: releaseCopyData :: releaseInvalidateAck :: releaseDowngradeAck :: releaseCopyAck :: releaseDowngradeDataMigratory :: releaseDowngradeAckHasCopy :: releaseInvalidateDataMigratory :: releaseInvalidateAckMigratory :: Nil = Enum(UInt(), nReleaseTypes)
val grantShared :: grantExclusive :: grantExclusiveAck :: grantReadMigratory :: Nil = Enum(UInt(), nGrantTypes)
def releaseTypesWithData = Seq(releaseInvalidateData, releaseDowngradeData, releaseCopyData, releaseInvalidateDataMigratory, releaseDowngradeDataMigratory)
def grantTypesWithData = Seq(grantShared, grantExclusive, grantReadMigratory)
// Client states and functions
val nClientStates = 7
val clientInvalid :: clientShared :: clientExclusiveClean :: clientExclusiveDirty :: clientSharedByTwo :: clientMigratoryClean :: clientMigratoryDirty :: Nil = Enum(UInt(), nClientStates)
def clientStatesWithReadPermission = Seq(clientShared, clientExclusiveClean, clientExclusiveDirty, clientSharedByTwo, clientMigratoryClean, clientMigratoryDirty)
def clientStatesWithWritePermission = Seq(clientExclusiveClean, clientExclusiveDirty, clientMigratoryClean, clientMigratoryDirty)
def clientStatesWithDirtyData = Seq(clientExclusiveDirty, clientMigratoryDirty)
def isValid (meta: ClientMetadata): Bool = meta.state =/= clientInvalid
def getAcquireType(cmd: UInt, meta: ClientMetadata): UInt =
Mux(isWriteIntent(cmd),
Mux(meta.state === clientInvalid, acquireExclusive, acquireInvalidateOthers),
acquireShared)
def getReleaseType(cmd: UInt, meta: ClientMetadata): UInt = {
val dirty = meta.state isOneOf clientStatesWithDirtyData
MuxLookup(cmd, releaseCopyAck, Array(
M_FLUSH -> Mux(dirty, releaseInvalidateData, releaseInvalidateAck),
M_PRODUCE -> Mux(dirty, releaseDowngradeData, releaseDowngradeAck),
M_CLEAN -> Mux(dirty, releaseCopyData, releaseCopyAck)))
}
def getReleaseType(incoming: HasProbeType, meta: ClientMetadata): UInt = {
val dirty = meta.state isOneOf clientStatesWithDirtyData
val with_data = MuxLookup(incoming.p_type, releaseInvalidateData, Array(
probeInvalidate -> Mux(meta.state isOneOf (clientExclusiveDirty, clientMigratoryDirty),
releaseInvalidateDataMigratory, releaseInvalidateData),
probeDowngrade -> Mux(meta.state === clientMigratoryDirty,
releaseDowngradeDataMigratory, releaseDowngradeData),
probeCopy -> releaseCopyData))
val without_data = MuxLookup(incoming.p_type, releaseInvalidateAck, Array(
probeInvalidate -> Mux(clientExclusiveClean === meta.state,
releaseInvalidateAckMigratory, releaseInvalidateAck),
probeInvalidateOthers -> Mux(clientSharedByTwo === meta.state,
releaseInvalidateAckMigratory, releaseInvalidateAck),
probeDowngrade -> Mux(meta.state =/= clientInvalid,
releaseDowngradeAckHasCopy, releaseDowngradeAck),
probeCopy -> Mux(meta.state =/= clientInvalid,
releaseDowngradeAckHasCopy, releaseDowngradeAck)))
Mux(dirty, with_data, without_data)
}
def clientMetadataOnHit(cmd: UInt, meta: ClientMetadata) =
ClientMetadata(
Mux(isWrite(cmd), MuxLookup(meta.state, clientExclusiveDirty, Array(
clientExclusiveClean -> clientExclusiveDirty,
clientMigratoryClean -> clientMigratoryDirty)),
meta.state))(meta.p)
def clientMetadataOnCacheControl(cmd: UInt, meta: ClientMetadata) =
ClientMetadata(
MuxLookup(cmd, meta.state, Array(
M_FLUSH -> clientInvalid,
M_PRODUCE -> Mux(meta.state isOneOf clientStatesWithWritePermission,
clientShared, meta.state),
M_CLEAN -> MuxLookup(meta.state, meta.state, Array(
clientExclusiveDirty -> clientExclusiveClean,
clientMigratoryDirty -> clientMigratoryClean)))))(meta.p)
def clientMetadataOnGrant(incoming: HasGrantType, cmd: UInt, meta: ClientMetadata) =
ClientMetadata(
Mux(incoming.isBuiltInType(), clientInvalid,
MuxLookup(incoming.g_type, clientInvalid, Array(
grantShared -> clientShared,
grantExclusive -> Mux(isWrite(cmd), clientExclusiveDirty, clientExclusiveClean),
grantExclusiveAck -> clientExclusiveDirty,
grantReadMigratory -> Mux(isWrite(cmd),
clientMigratoryDirty, clientMigratoryClean)))))(meta.p)
def clientMetadataOnProbe(incoming: HasProbeType, meta: ClientMetadata) = {
val downgradeState = MuxLookup(meta.state, clientShared, Array(
clientExclusiveClean -> clientSharedByTwo,
clientExclusiveDirty -> clientSharedByTwo,
clientSharedByTwo -> clientShared,
clientMigratoryClean -> clientSharedByTwo,
clientMigratoryDirty -> clientInvalid))
ClientMetadata(
MuxLookup(incoming.p_type, meta.state, Array(
probeInvalidate -> clientInvalid,
probeInvalidateOthers -> clientInvalid,
probeDowngrade -> downgradeState,
probeCopy -> downgradeState)))(meta.p)
}
// Manager states and functions:
val nManagerStates = 0 // TODO: we could add some states to reduce the number of message types
def requiresProbes(a: HasAcquireType, meta: ManagerMetadata) =
Mux(dir.none(meta.sharers), Bool(false),
Mux(dir.one(meta.sharers), Bool(true), //TODO: for now we assume it's Exclusive
Mux(a.isBuiltInType(), a.hasData(), a.a_type =/= acquireShared)))
def requiresProbes(cmd: UInt, meta: ManagerMetadata) = !dir.none(meta.sharers)
def getProbeType(cmd: UInt, meta: ManagerMetadata): UInt =
MuxLookup(cmd, probeCopy, Array(
M_FLUSH -> probeInvalidate,
M_PRODUCE -> probeDowngrade))
def getProbeType(a: HasAcquireType, meta: ManagerMetadata): UInt =
Mux(a.isBuiltInType(),
MuxLookup(a.a_type, probeCopy, Array(
Acquire.getBlockType -> probeCopy,
Acquire.putBlockType -> probeInvalidate,
Acquire.getType -> probeCopy,
Acquire.putType -> probeInvalidate,
Acquire.getPrefetchType -> probeCopy,
Acquire.putPrefetchType -> probeInvalidate,
Acquire.putAtomicType -> probeInvalidate)),
MuxLookup(a.a_type, probeCopy, Array(
acquireShared -> probeDowngrade,
acquireExclusive -> probeInvalidate,
acquireInvalidateOthers -> probeInvalidateOthers)))
def getGrantType(a: HasAcquireType, meta: ManagerMetadata): UInt =
Mux(a.isBuiltInType(), Acquire.getBuiltInGrantType(a.a_type),
MuxLookup(a.a_type, grantShared, Array(
acquireShared -> Mux(!dir.none(meta.sharers), grantShared, grantExclusive),
acquireExclusive -> grantExclusive,
acquireInvalidateOthers -> grantExclusiveAck))) //TODO: add this to MESI for broadcast?
def getExclusiveGrantType(): UInt = grantExclusive
def managerMetadataOnRelease(incoming: HasReleaseType, src: UInt, meta: ManagerMetadata) = {
val popped = ManagerMetadata(sharers=dir.pop(meta.sharers, src))(meta.p)
MuxCase(meta, Array(
incoming.is(releaseInvalidateData) -> popped,
incoming.is(releaseInvalidateAck) -> popped,
incoming.is(releaseInvalidateDataMigratory) -> popped,
incoming.is(releaseInvalidateAckMigratory) -> popped))
}
}
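// Recap (illustrative): the concrete policies above scale in sharing support:
// MICoherence (2 client states, single owner, data always dirty),
// MEICoherence (3 states, clean/dirty tracking), MSICoherence (3 states,
// shared readers), MESICoherence (4 states, shared readers plus a clean
// exclusive state), and MigratoryCoherence (7 states, passing exclusive
// ownership along for migratory data).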

View File

@ -0,0 +1,424 @@
package uncore.converters
import Chisel._
import junctions._
import uncore.tilelink._
import uncore.util._
import uncore.constants._
import cde.{Parameters, Field}
import HastiConstants._
/* We need to translate TileLink requests into operations we can actually execute on AHB.
* The general plan of attack is:
* get => one AHB=>TL read
* put => [multiple AHB write fragments=>nil], one AHB write=>TL
* getBlock => AHB burst reads =>TL
* putBlock => AHB burst writes=>TL
* getPrefetch => noop=>TL
* putPrefetch => noop=>TL
* putAtomic => one AHB=>TL read, one idle, one AHB atom_write=>nil, one idle
*
* This requires that we support a pipeline of optional AHB requests with optional TL responses
*/
class AHBRequestIO(implicit p: Parameters) extends HastiMasterIO
with HasGrantType
with HasClientTransactionId
with HasTileLinkBeatId {
val executeAHB = Bool()
val respondTL = Bool()
val latchAtom = Bool()
val firstBurst = Bool()
val finalBurst = Bool()
val cmd = Bits(width = M_SZ) // atomic op
}
// AHB stage1: translate TileLink Acquires into AHBRequests
class AHBTileLinkIn(supportAtomics: Boolean = false)(implicit val p: Parameters) extends Module
with HasHastiParameters
with HasTileLinkParameters
with HasAddrMapParameters {
val io = new Bundle {
val acquire = new DecoupledIO(new Acquire).flip // NOTE: acquire must be either a Queue or a Pipe
val request = new DecoupledIO(new AHBRequestIO)
}
// Match the AHB burst with a TileLink {Put,Get}Block
val burstSize = tlDataBeats match {
case 1 => HBURST_SINGLE
// case 2 not supported by AHB
case 4 => HBURST_WRAP4
case 8 => HBURST_WRAP8
case 16 => HBURST_WRAP16
case _ => throw new java.lang.AssertionError("TileLink beats unsupported by AHB")
}
// Bursts start at 0 and wrap around back to 0
val finalBurst = UInt(tlDataBeats-1, width = log2Up(tlDataBeats))
val firstBurst = UInt(0, width = log2Up(tlDataBeats))
val next_wmask = Wire(UInt(width = tlDataBytes)) // calculated below
// State variables for processing more complicated TileLink Acquires
val s_atom_r :: s_atom_idle1 :: s_atom_w :: s_atom_idle2 :: Nil = Enum(UInt(), 4)
val atom_state = Reg(init = s_atom_r) // never changes if !supportAtomics
val done_wmask = Reg(init = UInt(0, width = tlDataBytes))
val burst = Reg(init = firstBurst)
// Grab some view of the TileLink acquire
val acq_wmask = io.acquire.bits.wmask()
val isReadBurst = io.acquire.bits.is(Acquire.getBlockType)
val isWriteBurst = io.acquire.bits.is(Acquire.putBlockType)
val isBurst = isWriteBurst || isReadBurst
val isAtomic = io.acquire.bits.is(Acquire.putAtomicType) && Bool(supportAtomics)
val isPut = io.acquire.bits.is(Acquire.putType)
// Final states?
val last_wmask = next_wmask === acq_wmask
val last_atom = atom_state === s_atom_idle2
val last_burst = burst === finalBurst
// Block the incoming request until we've fully consumed it
// NOTE: the outgoing grant.valid may happen while acquire.ready is still false;
// for this reason it is essential to have a Queue or a Pipe in front of acquire
io.acquire.ready := io.request.ready && MuxLookup(io.acquire.bits.a_type, Bool(true), Array(
Acquire.getType -> Bool(true),
Acquire.getBlockType -> last_burst, // hold it until the last burst beat has issued
Acquire.putType -> last_wmask, // only accept the put if we can fully consume its wmask
Acquire.putBlockType -> Bool(true),
Acquire.putAtomicType -> last_atom, // atomic operation stages complete
Acquire.getPrefetchType -> Bool(true),
Acquire.putPrefetchType -> Bool(true)))
// Advance the fragment state
when (io.request.ready && io.acquire.valid && isPut) {
when (last_wmask) { // if this was the last fragment, restart FSM
done_wmask := UInt(0)
} .otherwise {
done_wmask := next_wmask
}
}
// Advance the burst state
// We assume here that TileLink gives us all putBlock beats with nothing between them
when (io.request.ready && io.acquire.valid && isBurst) {
when (last_burst) {
burst := UInt(0)
} .otherwise {
burst := burst + UInt(1)
}
}
// Advance the atomic state machine
when (io.request.ready && io.acquire.valid && isAtomic) {
switch (atom_state) {
is (s_atom_r) { atom_state := s_atom_idle1 }
is (s_atom_idle1) { atom_state := s_atom_w } // idle1 => AMOALU runs on a different cycle than the AHB slave read
is (s_atom_w) { atom_state := s_atom_idle2 }
is (s_atom_idle2) { atom_state := s_atom_r } // idle2 state is required by AHB after hmastlock is lowered
}
}
// Returns (range=0, range=-1, aligned_wmask, size)
def mask_helper(in_0 : Bool, range : UInt): (Bool, Bool, UInt, UInt) = {
val len = range.getWidth
if (len == 1) {
(range === UInt(0), range === UInt(1), in_0.asUInt() & range, UInt(0))
} else {
val mid = len / 2
val lo = range(mid-1, 0)
val hi = range(len-1, mid)
val (lo_0, lo_1, lo_m, lo_s) = mask_helper(in_0, lo)
val (hi_0, hi_1, hi_m, hi_s) = mask_helper(in_0 && lo_0, hi)
val out_0 = lo_0 && hi_0
val out_1 = lo_1 && hi_1
val out_m = Cat(hi_m, lo_m) | Fill(len, (in_0 && out_1).asUInt())
val out_s = Mux(out_1, UInt(log2Up(len)), Mux(lo_0, hi_s, lo_s))
(out_0, out_1, out_m, out_s)
}
}
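// Worked example (illustrative): mask_helper peels off the lowest naturally
// aligned power-of-two run of the pending write mask, so a sparse Put is
// replayed as a sequence of legal AHB transfers. With 4-byte beats:
// pending_wmask = b1100 -> exec_wmask = b1100, put_size = 1 (one halfword at byte 2)
// pending_wmask = b0110 -> exec_wmask = b0010, put_size = 0 (byte at offset 1),
// then exec_wmask = b0100 on the next fragment (byte at offset 2)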
val pending_wmask = acq_wmask & ~done_wmask
val put_addr = PriorityEncoder(pending_wmask)
val (wmask_0, _, exec_wmask, put_size) = mask_helper(Bool(true), pending_wmask)
next_wmask := done_wmask | exec_wmask
// Calculate the address, with consideration to put fragments and bursts
val addr_block = io.acquire.bits.addr_block
val addr_beatin= io.acquire.bits.addr_beat
val addr_burst = Mux(isReadBurst, addr_beatin + burst, addr_beatin)
val addr_byte = Mux(isPut, put_addr, io.acquire.bits.addr_byte())
val addr_beat = Mux(isWriteBurst, UInt(0), addr_burst)
val ahbAddr = Cat(addr_block, addr_burst, addr_byte)
val ahbSize = Mux(isPut, put_size, Mux(isBurst, UInt(log2Ceil(tlDataBytes)), io.acquire.bits.op_size()))
val ahbBurst = MuxLookup(io.acquire.bits.a_type, HBURST_SINGLE, Array(
Acquire.getType -> HBURST_SINGLE,
Acquire.getBlockType -> burstSize,
Acquire.putType -> HBURST_SINGLE,
Acquire.putBlockType -> burstSize,
Acquire.putAtomicType -> HBURST_SINGLE,
Acquire.getPrefetchType -> HBURST_SINGLE,
Acquire.putPrefetchType -> HBURST_SINGLE))
val ahbWrite = MuxLookup(io.acquire.bits.a_type, Bool(false), Array(
Acquire.getType -> Bool(false),
Acquire.getBlockType -> Bool(false),
Acquire.putType -> Bool(true),
Acquire.putBlockType -> Bool(true),
Acquire.putAtomicType -> MuxLookup(atom_state, Bool(false), Array(
s_atom_r -> Bool(false),
s_atom_idle1 -> Bool(false), // don't care
s_atom_w -> Bool(true),
s_atom_idle2 -> Bool(true))), // don't care
Acquire.getPrefetchType -> Bool(false), // don't care
Acquire.putPrefetchType -> Bool(true))) // don't care
val ahbExecute = MuxLookup(io.acquire.bits.a_type, Bool(false), Array(
Acquire.getType -> Bool(true),
Acquire.getBlockType -> Bool(true),
Acquire.putType -> !wmask_0, // handle the case of a Put with no bytes!
Acquire.putBlockType -> Bool(true),
Acquire.putAtomicType -> MuxLookup(atom_state, Bool(false), Array(
s_atom_r -> Bool(true),
s_atom_idle1 -> Bool(false),
s_atom_w -> Bool(true),
s_atom_idle2 -> Bool(false))),
Acquire.getPrefetchType -> Bool(false),
Acquire.putPrefetchType -> Bool(false)))
val respondTL = MuxLookup(io.acquire.bits.a_type, Bool(false), Array(
Acquire.getType -> Bool(true),
Acquire.getBlockType -> Bool(true),
Acquire.putType -> last_wmask,
Acquire.putBlockType -> last_burst,
Acquire.putAtomicType -> MuxLookup(atom_state, Bool(false), Array(
s_atom_r -> Bool(true), // they want the old data
s_atom_idle1 -> Bool(false),
s_atom_w -> Bool(false),
s_atom_idle2 -> Bool(false))),
Acquire.getPrefetchType -> Bool(true),
Acquire.putPrefetchType -> Bool(true)))
io.request.valid := io.acquire.valid
io.request.bits.htrans := HTRANS_IDLE // unused/ignored
io.request.bits.haddr := ahbAddr
io.request.bits.hmastlock := isAtomic && atom_state =/= s_atom_idle2
io.request.bits.hwrite := ahbWrite
io.request.bits.hburst := ahbBurst
io.request.bits.hsize := ahbSize
io.request.bits.hprot := HPROT_DATA | HPROT_PRIVILEGED
io.request.bits.hwdata := io.acquire.bits.data
io.request.bits.executeAHB := ahbExecute
io.request.bits.respondTL := respondTL
io.request.bits.latchAtom := isAtomic && atom_state === s_atom_r
io.request.bits.firstBurst := burst === firstBurst
io.request.bits.finalBurst := burst === finalBurst || !isBurst
io.request.bits.cmd := io.acquire.bits.op_code()
io.request.bits.is_builtin_type := Bool(true)
io.request.bits.g_type := io.acquire.bits.getBuiltInGrantType()
io.request.bits.client_xact_id := io.acquire.bits.client_xact_id
io.request.bits.addr_beat := addr_beat
val debugBurst = Reg(UInt())
when (io.request.valid) {
debugBurst := addr_burst - burst
}
// We only support built-in TileLink requests
assert(!io.acquire.valid || io.acquire.bits.is_builtin_type, "AHB bridge only supports builtin TileLink types")
// Ensure alignment of address to size
assert(!io.acquire.valid || (ahbAddr & ((UInt(1) << ahbSize) - UInt(1))) === UInt(0), "TileLink operation misaligned")
// If this is a putBlock, make sure it moves properly
assert(!io.acquire.valid || !isBurst || burst === firstBurst || debugBurst === addr_burst - burst, "TileLink putBlock beats not sequential")
// We better not get an incomplete TileLink acquire
assert(!io.acquire.valid || isBurst || burst === firstBurst, "TileLink never completed a putBlock")
// If we disabled atomic support, we better not see a request
assert(!io.acquire.bits.is(Acquire.putAtomicType) || Bool(supportAtomics))
}
// AHB stage2: execute AHBRequests
class AHBBusMaster(supportAtomics: Boolean = false)(implicit val p: Parameters) extends Module
with HasHastiParameters
with HasTileLinkParameters
with HasAddrMapParameters {
val io = new Bundle {
val request = new DecoupledIO(new AHBRequestIO).flip
val grant = new DecoupledIO(new Grant)
val ahb = new HastiMasterIO()
}
// All AHB outputs are registered (they might be IOs)
val midBurst = Reg(init = Bool(false))
val htrans = Reg(init = HTRANS_IDLE)
val haddr = Reg(UInt())
val hmastlock = Reg(init = Bool(false))
val hwrite = Reg(Bool())
val hburst = Reg(UInt())
val hsize = Reg(init = UInt(0, width = SZ_HSIZE))
val hprot = Reg(UInt())
val hwdata0 = Reg(Bits())
val hwdata1 = Reg(Bits())
val hrdata = Reg(Bits())
io.ahb.htrans := htrans
io.ahb.haddr := haddr
io.ahb.hmastlock := hmastlock
io.ahb.hwrite := hwrite
io.ahb.hburst := hburst
io.ahb.hsize := hsize
io.ahb.hprot := hprot
io.ahb.hwdata := hwdata1 // one cycle after the address phase
// TileLink response data needed in data phase
val respondTL0 = Reg(init = Bool(false))
val respondTL1 = Reg(init = Bool(false))
val latchAtom0 = Reg(init = Bool(false))
val latchAtom1 = Reg(init = Bool(false))
val executeAHB0 = Reg(init = Bool(false))
val executeAHB1 = Reg(init = Bool(false))
val bubble = Reg(init = Bool(true)) // nothing useful in address phase
val cmd = Reg(Bits())
val g_type0 = Reg(UInt())
val g_type1 = Reg(UInt())
val client_xact_id0 = Reg(Bits())
val client_xact_id1 = Reg(Bits())
val addr_beat0 = Reg(UInt())
val addr_beat1 = Reg(UInt())
val grant1 = Reg(new Grant)
// It is allowed to progress from Idle/Busy during a wait state
val addrReady = io.ahb.hready || bubble || (!executeAHB1 && !executeAHB0)
val dataReady = io.ahb.hready || !executeAHB1
// Only accept a new AHBRequest if we have enough buffer space in the pad
// to accommodate a persistent drop in TileLink's grant.ready
io.request.ready := addrReady && io.grant.ready
// htrans must be updated even if no request is valid
when (addrReady) {
when (io.request.fire() && io.request.bits.executeAHB) {
midBurst := !io.request.bits.finalBurst
when (io.request.bits.firstBurst) {
htrans := HTRANS_NONSEQ
} .otherwise {
htrans := HTRANS_SEQ
}
} .otherwise {
when (midBurst) {
htrans := HTRANS_BUSY
} .otherwise {
htrans := HTRANS_IDLE
}
}
}
// Address phase; clear respondTL when we have nothing to do
when (addrReady) {
when (io.request.fire()) {
respondTL0 := io.request.bits.respondTL
latchAtom0 := io.request.bits.latchAtom
executeAHB0 := io.request.bits.executeAHB
bubble := Bool(false)
} .otherwise {
respondTL0 := Bool(false)
latchAtom0 := Bool(false)
executeAHB0 := Bool(false)
bubble := Bool(true) // an atom-injected Idle is not a bubble!
}
}
// Transfer bulk address phase
when (io.request.fire()) {
haddr := io.request.bits.haddr
hmastlock := io.request.bits.hmastlock
hwrite := io.request.bits.hwrite
hburst := io.request.bits.hburst
hsize := io.request.bits.hsize
hprot := io.request.bits.hprot
hwdata0 := io.request.bits.hwdata
cmd := io.request.bits.cmd
g_type0 := io.request.bits.g_type
client_xact_id0 := io.request.bits.client_xact_id
addr_beat0 := io.request.bits.addr_beat
}
// Execute Atomic ops; unused and optimized away if !supportAtomics
val amo_p = p.alterPartial({
case CacheBlockOffsetBits => hastiAddrBits
})
val alu = Module(new AMOALU(hastiDataBits, rhsIsAligned = true)(amo_p))
alu.io.addr := haddr
alu.io.cmd := cmd
alu.io.typ := hsize
alu.io.rhs := hwdata0
alu.io.lhs := hrdata
// Transfer bulk data phase
when (dataReady) {
when (addrReady) {
respondTL1 := respondTL0
latchAtom1 := latchAtom0
executeAHB1 := executeAHB0
} .otherwise {
respondTL1 := Bool(false)
latchAtom1 := Bool(false)
executeAHB1 := Bool(false)
}
hwdata1 := Mux(Bool(supportAtomics), alu.io.out, hwdata0)
g_type1 := g_type0
client_xact_id1 := client_xact_id0
addr_beat1 := addr_beat0
}
// Latch the read result for an atomic operation
when (dataReady && latchAtom1) {
hrdata := io.ahb.hrdata
}
// Only issue TL grant when the slave has provided data
io.grant.valid := dataReady && respondTL1
io.grant.bits := Grant(
is_builtin_type = Bool(true),
g_type = g_type1,
client_xact_id = client_xact_id1,
manager_xact_id = UInt(0),
addr_beat = addr_beat1,
data = io.ahb.hrdata)
// We cannot support errors from AHB to TileLink
assert(!io.ahb.hresp, "AHB hresp error detected and cannot be reported via TileLink")
}
class AHBBridge(supportAtomics: Boolean = true)(implicit val p: Parameters) extends Module
with HasHastiParameters
with HasTileLinkParameters
with HasAddrMapParameters {
val io = new Bundle {
val tl = new ClientUncachedTileLinkIO().flip
val ahb = new HastiMasterIO()
}
// Hasti and TileLink widths must agree at this point in the topology
require (tlDataBits == hastiDataBits)
require (p(PAddrBits) == hastiAddrBits)
// AHB does not permit bursts to cross a 1KB boundary
require (tlDataBits * tlDataBeats <= 1024*8)
// tlDataBytes must be a power of 2
require (1 << log2Ceil(tlDataBytes) == tlDataBytes)
// Create the sub-blocks
val fsm = Module(new AHBTileLinkIn(supportAtomics))
val bus = Module(new AHBBusMaster(supportAtomics))
val pad = Module(new Queue(new Grant, 4))
fsm.io.acquire <> Queue(io.tl.acquire, 2) // Pipe is also acceptable
bus.io.request <> fsm.io.request
io.ahb <> bus.io.ahb
io.tl.grant <> pad.io.deq
// The pad is needed to absorb AHB progress while !grant.ready
// We are only 'ready' if the pad has at least 3 cycles of space
bus.io.grant.ready := pad.io.count <= UInt(1)
pad.io.enq.bits := bus.io.grant.bits
pad.io.enq.valid := bus.io.grant.valid
}
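/** Instantiation sketch (illustrative): bridging an uncached TileLink client
* port to an AHB master. The io.mem and io.ahb_out port names of the
* enclosing module are hypothetical; AHBBridge and its io come from this file.
* {{{
* val bridge = Module(new AHBBridge(supportAtomics = true))
* bridge.io.tl <> io.mem
* io.ahb_out <> bridge.io.ahb
* }}}
*/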

View File

@ -0,0 +1,383 @@
package uncore.converters
import Chisel._
import junctions._
import uncore.tilelink._
import uncore.constants._
import cde.Parameters
import scala.math.min
class IdMapper(val inIdBits: Int, val outIdBits: Int,
val forceMapping: Boolean = false)
(implicit val p: Parameters) extends Module {
val io = new Bundle {
val req = new Bundle {
val valid = Bool(INPUT)
val ready = Bool(OUTPUT)
val in_id = UInt(INPUT, inIdBits)
val out_id = UInt(OUTPUT, outIdBits)
}
val resp = new Bundle {
val valid = Bool(INPUT)
val matches = Bool(OUTPUT)
val out_id = UInt(INPUT, outIdBits)
val in_id = UInt(OUTPUT, inIdBits)
}
}
val maxInXacts = 1 << inIdBits
if (inIdBits <= outIdBits && !forceMapping) {
io.req.ready := Bool(true)
io.req.out_id := io.req.in_id
io.resp.matches := Bool(true)
io.resp.in_id := io.resp.out_id
} else {
val nInXacts = 1 << inIdBits
// No point in allowing more out xacts than in xacts
val nOutXacts = min(1 << outIdBits, nInXacts)
val out_id_free = Reg(init = Vec.fill(nOutXacts){Bool(true)})
val in_id_free = Reg(init = Vec.fill(nInXacts){Bool(true)})
val next_out_id = PriorityEncoder(out_id_free)
val id_mapping = Reg(Vec(nOutXacts, UInt(0, inIdBits)))
val req_fire = io.req.valid && io.req.ready
when (req_fire) {
out_id_free(io.req.out_id) := Bool(false)
in_id_free(io.req.in_id) := Bool(false)
id_mapping(io.req.out_id) := io.req.in_id
}
when (io.resp.valid) {
out_id_free(io.resp.out_id) := Bool(true)
in_id_free(io.resp.in_id) := Bool(true)
}
io.req.ready := out_id_free.reduce(_ || _) && in_id_free(io.req.in_id)
io.req.out_id := next_out_id
io.resp.in_id := id_mapping(io.resp.out_id)
io.resp.matches := !out_id_free(io.resp.out_id)
}
}
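/** Usage sketch (illustrative): shrinking a 4-bit TileLink id space onto a
* 2-bit bus id space. A request claims a free outbound id and the matching
* response releases it, recovering the original id. The driver signals below
* (acq_fire, acq_xact_id, bus_resp_fire, bus_resp_id) are hypothetical.
* {{{
* val mapper = Module(new IdMapper(inIdBits = 4, outIdBits = 2))
* mapper.io.req.valid := acq_fire
* mapper.io.req.in_id := acq_xact_id
* val bus_id = mapper.io.req.out_id   // valid only when io.req.ready
* mapper.io.resp.valid := bus_resp_fire
* mapper.io.resp.out_id := bus_resp_id
* val orig_id = mapper.io.resp.in_id  // matches when io.resp.matches
* }}}
*/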
class NastiIOTileLinkIOConverterInfo(implicit p: Parameters) extends TLBundle()(p) {
val addr_beat = UInt(width = tlBeatAddrBits)
val subblock = Bool()
}
class NastiIOTileLinkIOConverter(implicit p: Parameters) extends TLModule()(p)
with HasNastiParameters {
val io = new Bundle {
val tl = new ClientUncachedTileLinkIO().flip
val nasti = new NastiIO
}
val dataBits = tlDataBits*tlDataBeats
require(tlDataBits == nastiXDataBits, "Data sizes between LLC and MC don't agree") // TODO: remove this restriction
require(tlDataBeats < (1 << nastiXLenBits), "Can't have that many beats")
val has_data = io.tl.acquire.bits.hasData()
val is_subblock = io.tl.acquire.bits.isSubBlockType()
val is_multibeat = io.tl.acquire.bits.hasMultibeatData()
val (tl_cnt_out, tl_wrap_out) = Counter(
io.tl.acquire.fire() && is_multibeat, tlDataBeats)
val get_valid = io.tl.acquire.valid && !has_data
val put_valid = io.tl.acquire.valid && has_data
// Reorder queue saves extra information needed to send correct
// grant back to TL client
val roqIdBits = min(tlClientXactIdBits, nastiXIdBits)
val roq = Module(new ReorderQueue(
new NastiIOTileLinkIOConverterInfo, roqIdBits))
val get_id_mapper = Module(new IdMapper(tlClientXactIdBits, nastiXIdBits))
val put_id_mapper = Module(new IdMapper(tlClientXactIdBits, nastiXIdBits))
val get_id_ready = get_id_mapper.io.req.ready
val put_id_mask = is_subblock || io.tl.acquire.bits.addr_beat === UInt(0)
val put_id_ready = put_id_mapper.io.req.ready || !put_id_mask
// For Get/GetBlock, make sure Reorder queue can accept new entry
val get_helper = DecoupledHelper(
get_valid,
roq.io.enq.ready,
io.nasti.ar.ready,
get_id_ready)
val w_inflight = Reg(init = Bool(false))
val w_id_reg = Reg(init = UInt(0, nastiXIdBits))
val w_id = Mux(w_inflight, w_id_reg, put_id_mapper.io.req.out_id)
// For Put/PutBlock, make sure aw and w channel are both ready before
// we send the first beat
val aw_ready = w_inflight || io.nasti.aw.ready
val put_helper = DecoupledHelper(
put_valid,
aw_ready,
io.nasti.w.ready,
put_id_ready)
val (nasti_cnt_out, nasti_wrap_out) = Counter(
io.nasti.r.fire() && !roq.io.deq.data.subblock, tlDataBeats)
roq.io.enq.valid := get_helper.fire(roq.io.enq.ready)
roq.io.enq.bits.tag := io.nasti.ar.bits.id
roq.io.enq.bits.data.addr_beat := io.tl.acquire.bits.addr_beat
roq.io.enq.bits.data.subblock := is_subblock
roq.io.deq.valid := io.nasti.r.fire() && (nasti_wrap_out || roq.io.deq.data.subblock)
roq.io.deq.tag := io.nasti.r.bits.id
get_id_mapper.io.req.valid := get_helper.fire(get_id_ready)
get_id_mapper.io.req.in_id := io.tl.acquire.bits.client_xact_id
get_id_mapper.io.resp.valid := io.nasti.r.fire() && io.nasti.r.bits.last
get_id_mapper.io.resp.out_id := io.nasti.r.bits.id
put_id_mapper.io.req.valid := put_helper.fire(put_id_ready, put_id_mask)
put_id_mapper.io.req.in_id := io.tl.acquire.bits.client_xact_id
put_id_mapper.io.resp.valid := io.nasti.b.fire()
put_id_mapper.io.resp.out_id := io.nasti.b.bits.id
// Decompose outgoing TL Acquires into Nasti address and data channels
io.nasti.ar.valid := get_helper.fire(io.nasti.ar.ready)
io.nasti.ar.bits := NastiReadAddressChannel(
id = get_id_mapper.io.req.out_id,
addr = io.tl.acquire.bits.full_addr(),
size = Mux(is_subblock,
io.tl.acquire.bits.op_size(),
UInt(log2Ceil(tlDataBytes))),
len = Mux(is_subblock, UInt(0), UInt(tlDataBeats - 1)))
def mask_helper(all_inside_0: Seq[Bool], defsize: Int): (Seq[Bool], UInt, UInt) = {
val len = all_inside_0.size
if (len == 1) {
(Seq(Bool(true)), UInt(0), UInt(defsize))
} else {
val sub_inside_0 = Seq.tabulate (len/2) { i => all_inside_0(2*i) && all_inside_0(2*i+1) }
val (sub_outside_0, sub_offset, sub_size) = mask_helper(sub_inside_0, defsize+1)
val all_outside_0 = Seq.tabulate (len) { i => sub_outside_0(i/2) && all_inside_0(i^1) }
val odd_outside_0 = Seq.tabulate (len/2) { i => all_outside_0(2*i+1) }
val odd_outside = odd_outside_0.reduce (_ || _)
val all_outside = all_outside_0.reduce (_ || _)
val offset = Cat(sub_offset, odd_outside)
val size = Mux(all_outside, UInt(defsize), sub_size)
(all_outside_0, offset, size)
}
}
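// Worked example (illustrative): for an 8-byte beat with wmask = 0x0c (bytes
// 2 and 3 written), all_inside_0 marks bytes 0, 1, and 4..7 as untouched and
// the recursion yields put_offset = 2, put_size = 1: a single 2-byte NASTI
// write at byte offset 2.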
val all_inside_0 = (~io.tl.acquire.bits.wmask()).toBools
val (_, put_offset, put_size) = mask_helper(all_inside_0, 0)
io.nasti.aw.valid := put_helper.fire(aw_ready, !w_inflight)
io.nasti.aw.bits := NastiWriteAddressChannel(
id = put_id_mapper.io.req.out_id,
addr = io.tl.acquire.bits.full_addr() |
Mux(is_multibeat, UInt(0), put_offset),
size = Mux(is_multibeat, UInt(log2Ceil(tlDataBytes)), put_size),
len = Mux(is_multibeat, UInt(tlDataBeats - 1), UInt(0)))
io.nasti.w.valid := put_helper.fire(io.nasti.w.ready)
io.nasti.w.bits := NastiWriteDataChannel(
id = w_id,
data = io.tl.acquire.bits.data,
strb = Some(io.tl.acquire.bits.wmask()),
last = Mux(w_inflight,
tl_cnt_out === UInt(tlDataBeats - 1), !is_multibeat))
io.tl.acquire.ready := Mux(has_data,
put_helper.fire(put_valid),
get_helper.fire(get_valid))
when (!w_inflight && io.tl.acquire.fire() && is_multibeat) {
w_inflight := Bool(true)
w_id_reg := w_id
}
when (w_inflight) {
when (tl_wrap_out) { w_inflight := Bool(false) }
}
// Aggregate incoming NASTI responses into TL Grants
val (tl_cnt_in, tl_wrap_in) = Counter(
io.tl.grant.fire() && io.tl.grant.bits.hasMultibeatData(), tlDataBeats)
val gnt_arb = Module(new LockingArbiter(new GrantToDst, 2,
tlDataBeats, Some((gnt: GrantToDst) => gnt.hasMultibeatData())))
io.tl.grant <> gnt_arb.io.out
gnt_arb.io.in(0).valid := io.nasti.r.valid
io.nasti.r.ready := gnt_arb.io.in(0).ready
gnt_arb.io.in(0).bits := Grant(
is_builtin_type = Bool(true),
g_type = Mux(roq.io.deq.data.subblock,
Grant.getDataBeatType, Grant.getDataBlockType),
client_xact_id = get_id_mapper.io.resp.in_id,
manager_xact_id = UInt(0),
addr_beat = Mux(roq.io.deq.data.subblock, roq.io.deq.data.addr_beat, tl_cnt_in),
data = io.nasti.r.bits.data)
assert(!roq.io.deq.valid || roq.io.deq.matches,
"TL -> NASTI converter ReorderQueue: NASTI tag error")
assert(!gnt_arb.io.in(0).valid || get_id_mapper.io.resp.matches,
"TL -> NASTI ID Mapper: NASTI tag error")
gnt_arb.io.in(1).valid := io.nasti.b.valid
io.nasti.b.ready := gnt_arb.io.in(1).ready
gnt_arb.io.in(1).bits := Grant(
is_builtin_type = Bool(true),
g_type = Grant.putAckType,
client_xact_id = put_id_mapper.io.resp.in_id,
manager_xact_id = UInt(0),
addr_beat = UInt(0),
data = Bits(0))
assert(!gnt_arb.io.in(1).valid || put_id_mapper.io.resp.matches, "NASTI tag error")
assert(!io.nasti.r.valid || io.nasti.r.bits.resp === UInt(0), "NASTI read error")
assert(!io.nasti.b.valid || io.nasti.b.bits.resp === UInt(0), "NASTI write error")
}
class TileLinkIONastiIOConverter(implicit p: Parameters) extends TLModule()(p)
with HasNastiParameters {
val io = new Bundle {
val nasti = (new NastiIO).flip
val tl = new ClientUncachedTileLinkIO
}
val (s_idle :: s_put :: Nil) = Enum(Bits(), 2)
val state = Reg(init = s_idle)
private val blockOffset = tlByteAddrBits + tlBeatAddrBits
val aw_req = Reg(new NastiWriteAddressChannel)
val w_tl_id = Reg(io.tl.acquire.bits.client_xact_id)
def is_singlebeat(chan: NastiAddressChannel): Bool =
chan.len === UInt(0)
def is_multibeat(chan: NastiAddressChannel): Bool =
chan.len === UInt(tlDataBeats - 1) && chan.size === UInt(log2Up(tlDataBytes))
def nasti_addr_block(chan: NastiAddressChannel): UInt =
chan.addr(nastiXAddrBits - 1, blockOffset)
def nasti_addr_beat(chan: NastiAddressChannel): UInt =
chan.addr(blockOffset - 1, tlByteAddrBits)
def nasti_addr_byte(chan: NastiAddressChannel): UInt =
chan.addr(tlByteAddrBits - 1, 0)
def size_mask(size: UInt): UInt =
(UInt(1) << (UInt(1) << size)) - UInt(1)
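// Worked example (illustrative): size_mask(UInt(2)) = (1 << 4) - 1 = 0x0f,
// i.e. a 4-byte access covers the low four strobe bits before the byte-offset
// shift applied in nasti_wmask below.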
def nasti_wmask(aw: NastiWriteAddressChannel, w: NastiWriteDataChannel): UInt = {
val addr_byte = nasti_addr_byte(aw)
w.strb & (size_mask(aw.size) << addr_byte)
}
def tl_last(gnt: GrantMetadata): Bool =
!gnt.hasMultibeatData() || gnt.addr_beat === UInt(tlDataBeats - 1)
def tl_b_grant(gnt: GrantMetadata): Bool =
gnt.g_type === Grant.putAckType
assert(!io.nasti.ar.valid ||
is_singlebeat(io.nasti.ar.bits) || is_multibeat(io.nasti.ar.bits),
"NASTI read transaction cannot convert to TileLInk")
assert(!io.nasti.aw.valid ||
is_singlebeat(io.nasti.aw.bits) || is_multibeat(io.nasti.aw.bits),
"NASTI write transaction cannot convert to TileLInk")
val put_count = Reg(init = UInt(0, tlBeatAddrBits))
val get_id_mapper = Module(new IdMapper(nastiXIdBits, tlClientXactIdBits, true))
val put_id_mapper = Module(new IdMapper(nastiXIdBits, tlClientXactIdBits, true))
when (io.nasti.aw.fire()) {
aw_req := io.nasti.aw.bits
w_tl_id := put_id_mapper.io.req.out_id
state := s_put
}
when (io.nasti.w.fire()) {
put_count := put_count + UInt(1)
when (io.nasti.w.bits.last) {
put_count := UInt(0)
state := s_idle
}
}
val get_acquire = Mux(is_multibeat(io.nasti.ar.bits),
GetBlock(
client_xact_id = get_id_mapper.io.req.out_id,
addr_block = nasti_addr_block(io.nasti.ar.bits)),
Get(
client_xact_id = get_id_mapper.io.req.out_id,
addr_block = nasti_addr_block(io.nasti.ar.bits),
addr_beat = nasti_addr_beat(io.nasti.ar.bits),
addr_byte = nasti_addr_byte(io.nasti.ar.bits),
operand_size = io.nasti.ar.bits.size,
alloc = Bool(false)))
val put_acquire = Mux(is_multibeat(aw_req),
PutBlock(
client_xact_id = w_tl_id,
addr_block = nasti_addr_block(aw_req),
addr_beat = put_count,
data = io.nasti.w.bits.data,
wmask = Some(io.nasti.w.bits.strb)),
Put(
client_xact_id = w_tl_id,
addr_block = nasti_addr_block(aw_req),
addr_beat = nasti_addr_beat(aw_req),
data = io.nasti.w.bits.data,
wmask = Some(nasti_wmask(aw_req, io.nasti.w.bits))))
val get_helper = DecoupledHelper(
io.nasti.ar.valid,
get_id_mapper.io.req.ready,
io.tl.acquire.ready)
get_id_mapper.io.req.valid := get_helper.fire(
get_id_mapper.io.req.ready, state === s_idle)
get_id_mapper.io.req.in_id := io.nasti.ar.bits.id
get_id_mapper.io.resp.out_id := io.tl.grant.bits.client_xact_id
get_id_mapper.io.resp.valid := io.nasti.r.fire() && io.nasti.r.bits.last
val aw_ok = (state === s_idle && !io.nasti.ar.valid)
put_id_mapper.io.req.valid := aw_ok && io.nasti.aw.valid
put_id_mapper.io.req.in_id := io.nasti.aw.bits.id
put_id_mapper.io.resp.out_id := io.tl.grant.bits.client_xact_id
put_id_mapper.io.resp.valid := io.nasti.b.fire()
io.tl.acquire.bits := Mux(state === s_put, put_acquire, get_acquire)
io.tl.acquire.valid := get_helper.fire(io.tl.acquire.ready, state === s_idle) ||
(state === s_put && io.nasti.w.valid)
io.nasti.ar.ready := get_helper.fire(io.nasti.ar.valid, state === s_idle)
io.nasti.aw.ready := aw_ok && put_id_mapper.io.req.ready
io.nasti.w.ready := (state === s_put && io.tl.acquire.ready)
val nXacts = tlMaxClientXacts * tlMaxClientsPerPort
io.nasti.b.valid := io.tl.grant.valid && tl_b_grant(io.tl.grant.bits)
io.nasti.b.bits := NastiWriteResponseChannel(
id = put_id_mapper.io.resp.in_id)
assert(!io.nasti.b.valid || put_id_mapper.io.resp.matches,
"Put ID does not match")
io.nasti.r.valid := io.tl.grant.valid && !tl_b_grant(io.tl.grant.bits)
io.nasti.r.bits := NastiReadDataChannel(
id = get_id_mapper.io.resp.in_id,
data = io.tl.grant.bits.data,
last = tl_last(io.tl.grant.bits))
assert(!io.nasti.r.valid || get_id_mapper.io.resp.matches,
"Get ID does not match")
io.tl.grant.ready := Mux(tl_b_grant(io.tl.grant.bits),
io.nasti.b.ready, io.nasti.r.ready)
}

View File

@ -0,0 +1,32 @@
// See LICENSE for details
package uncore.converters
import Chisel._
import junctions._
import uncore.tilelink._
import cde.Parameters
/** Convert TileLink protocol to Smi protocol */
class SmiIOTileLinkIOConverter(val dataWidth: Int, val addrWidth: Int)
(implicit p: Parameters) extends Module {
val io = new Bundle {
val tl = (new ClientUncachedTileLinkIO).flip
val smi = new SmiIO(dataWidth, addrWidth)
}
def decoupledNastiConnect(outer: NastiIO, inner: NastiIO) {
outer.ar <> Queue(inner.ar)
outer.aw <> Queue(inner.aw)
outer.w <> Queue(inner.w)
inner.r <> Queue(outer.r)
inner.b <> Queue(outer.b)
}
val tl2nasti = Module(new NastiIOTileLinkIOConverter())
val nasti2smi = Module(new SmiIONastiIOConverter(dataWidth, addrWidth))
tl2nasti.io.tl <> io.tl
decoupledNastiConnect(nasti2smi.io.nasti, tl2nasti.io.nasti)
io.smi <> nasti2smi.io.smi
}
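// A minimal usage sketch (illustrative; `tlPort` and `dev` are hypothetical):
//
//   val conv = Module(new SmiIOTileLinkIOConverter(dataWidth = 32, addrWidth = 12))
//   conv.io.tl <> tlPort                 // ClientUncachedTileLinkIO master
//   dev.io.smi <> conv.io.smi            // 32-bit SMI register slave
//
// The Queues inserted by decoupledNastiConnect cut the combinational paths
// between the two back-to-back converters.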

View File

@ -0,0 +1,681 @@
package uncore.converters
import Chisel._
import junctions._
import uncore.tilelink._
import uncore.util._
import uncore.constants._
import cde.Parameters
/** Utilities for safely wrapping a *UncachedTileLink by pinning probe.ready and release.valid low */
object TileLinkIOWrapper {
def apply(tl: ClientUncachedTileLinkIO)(implicit p: Parameters): ClientTileLinkIO = {
val conv = Module(new ClientTileLinkIOWrapper)
conv.io.in <> tl
conv.io.out
}
def apply(tl: UncachedTileLinkIO)(implicit p: Parameters): TileLinkIO = {
val conv = Module(new TileLinkIOWrapper)
conv.io.in <> tl
conv.io.out
}
def apply(tl: ClientTileLinkIO): ClientTileLinkIO = tl
def apply(tl: TileLinkIO): TileLinkIO = tl
}
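// A minimal usage sketch (illustrative; `uncached` is a hypothetical port):
//
//   val cached: ClientTileLinkIO = TileLinkIOWrapper(uncached)
//
// The overloads taking already-cached ports are identity functions, so callers
// can wrap unconditionally; probe.ready is tied high and release.valid low,
// which is safe only for clients that never hold cached copies.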
class TileLinkIOWrapper(implicit p: Parameters) extends TLModule()(p) {
val io = new Bundle {
val in = new UncachedTileLinkIO().flip
val out = new TileLinkIO
}
io.out.acquire <> io.in.acquire
io.in.grant <> io.out.grant
io.out.finish <> io.in.finish
io.out.probe.ready := Bool(true)
io.out.release.valid := Bool(false)
}
class ClientTileLinkIOWrapper(implicit p: Parameters) extends TLModule()(p) {
val io = new Bundle {
val in = new ClientUncachedTileLinkIO().flip
val out = new ClientTileLinkIO
}
io.out.acquire <> io.in.acquire
io.in.grant <> io.out.grant
io.out.probe.ready := Bool(true)
io.out.release.valid := Bool(false)
}
class ClientTileLinkIOUnwrapper(implicit p: Parameters) extends TLModule()(p) {
val io = new Bundle {
val in = new ClientTileLinkIO().flip
val out = new ClientUncachedTileLinkIO
}
val acqArb = Module(new LockingRRArbiter(new Acquire, 2, tlDataBeats,
Some((acq: Acquire) => acq.hasMultibeatData())))
val acqRoq = Module(new ReorderQueue(Bool(), tlClientXactIdBits))
val relRoq = Module(new ReorderQueue(Bool(), tlClientXactIdBits))
val iacq = io.in.acquire.bits
val irel = io.in.release.bits
val ognt = io.out.grant.bits
val acq_roq_enq = iacq.first()
val rel_roq_enq = irel.first()
val acq_roq_ready = !acq_roq_enq || acqRoq.io.enq.ready
val rel_roq_ready = !rel_roq_enq || relRoq.io.enq.ready
val acq_helper = DecoupledHelper(
io.in.acquire.valid,
acq_roq_ready,
acqArb.io.in(0).ready)
val rel_helper = DecoupledHelper(
io.in.release.valid,
rel_roq_ready,
acqArb.io.in(1).ready)
acqRoq.io.enq.valid := acq_helper.fire(acq_roq_ready, acq_roq_enq)
acqRoq.io.enq.bits.data := iacq.isBuiltInType()
acqRoq.io.enq.bits.tag := iacq.client_xact_id
acqArb.io.in(0).valid := acq_helper.fire(acqArb.io.in(0).ready)
acqArb.io.in(0).bits := Acquire(
is_builtin_type = Bool(true),
a_type = Mux(iacq.isBuiltInType(),
iacq.a_type, Acquire.getBlockType),
client_xact_id = iacq.client_xact_id,
addr_block = iacq.addr_block,
addr_beat = iacq.addr_beat,
data = iacq.data,
union = iacq.union)
io.in.acquire.ready := acq_helper.fire(io.in.acquire.valid)
relRoq.io.enq.valid := rel_helper.fire(rel_roq_ready, rel_roq_enq)
relRoq.io.enq.bits.data := irel.isVoluntary()
relRoq.io.enq.bits.tag := irel.client_xact_id
acqArb.io.in(1).valid := rel_helper.fire(acqArb.io.in(1).ready)
acqArb.io.in(1).bits := PutBlock(
client_xact_id = irel.client_xact_id,
addr_block = irel.addr_block,
addr_beat = irel.addr_beat,
data = irel.data)
io.in.release.ready := rel_helper.fire(io.in.release.valid)
io.out.acquire <> acqArb.io.out
val grant_deq_roq = io.out.grant.fire() && ognt.last()
acqRoq.io.deq.valid := acqRoq.io.deq.matches && grant_deq_roq
acqRoq.io.deq.tag := ognt.client_xact_id
relRoq.io.deq.valid := !acqRoq.io.deq.matches && grant_deq_roq
relRoq.io.deq.tag := ognt.client_xact_id
assert(!grant_deq_roq || acqRoq.io.deq.matches || relRoq.io.deq.matches,
"TileLink Unwrapper: client_xact_id mismatch")
val gnt_builtin = acqRoq.io.deq.data
val gnt_voluntary = relRoq.io.deq.data
val acq_grant = Grant(
is_builtin_type = gnt_builtin,
g_type = Mux(gnt_builtin, ognt.g_type, tlCoh.getExclusiveGrantType),
client_xact_id = ognt.client_xact_id,
manager_xact_id = ognt.manager_xact_id,
addr_beat = ognt.addr_beat,
data = ognt.data)
assert(!io.in.release.valid || io.in.release.bits.isVoluntary(), "Unwrapper can only process voluntary releases.")
val rel_grant = Grant(
is_builtin_type = Bool(true),
g_type = Grant.voluntaryAckType, // We should only ever be working with voluntary releases
client_xact_id = ognt.client_xact_id,
manager_xact_id = ognt.manager_xact_id,
addr_beat = ognt.addr_beat,
data = ognt.data)
io.in.grant.valid := io.out.grant.valid
io.in.grant.bits := Mux(acqRoq.io.deq.matches, acq_grant, rel_grant)
io.out.grant.ready := io.in.grant.ready
io.in.probe.valid := Bool(false)
}
object TileLinkWidthAdapter {
def apply(in: ClientUncachedTileLinkIO, outerId: String)(implicit p: Parameters) = {
val outerDataBits = p(TLKey(outerId)).dataBitsPerBeat
if (outerDataBits > in.tlDataBits) {
val widener = Module(new TileLinkIOWidener(in.p(TLId), outerId))
widener.io.in <> in
widener.io.out
} else if (outerDataBits < in.tlDataBits) {
val narrower = Module(new TileLinkIONarrower(in.p(TLId), outerId))
narrower.io.in <> in
narrower.io.out
} else { in }
}
def apply(out: ClientUncachedTileLinkIO, in: ClientUncachedTileLinkIO)(implicit p: Parameters): Unit = {
require(out.tlDataBits * out.tlDataBeats == in.tlDataBits * in.tlDataBeats)
out <> apply(in, out.p(TLId))
}
}
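// Illustrative elaboration sketch (hypothetical TLKey entries): if
// p(TLKey("L1toL2")).dataBitsPerBeat is 64 and p(TLKey("L2toMC")).dataBitsPerBeat
// is 128, then
//
//   val wide = TileLinkWidthAdapter(l1Port, "L2toMC")
//
// instantiates a TileLinkIOWidener; swapped widths would elaborate a
// TileLinkIONarrower, and equal widths return the input port unchanged.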
class TileLinkIOWidener(innerTLId: String, outerTLId: String)
(implicit p: Parameters) extends TLModule()(p) {
val paddrBits = p(PAddrBits)
val innerParams = p(TLKey(innerTLId))
val outerParams = p(TLKey(outerTLId))
val innerDataBeats = innerParams.dataBeats
val innerDataBits = innerParams.dataBitsPerBeat
val innerWriteMaskBits = innerParams.writeMaskBits
val innerByteAddrBits = log2Up(innerWriteMaskBits)
val innerMaxXacts = innerParams.maxClientXacts * innerParams.maxClientsPerPort
val innerXactIdBits = log2Up(innerMaxXacts)
val outerDataBeats = outerParams.dataBeats
val outerDataBits = outerParams.dataBitsPerBeat
val outerWriteMaskBits = outerParams.writeMaskBits
val outerByteAddrBits = log2Up(outerWriteMaskBits)
val outerBeatAddrBits = log2Up(outerDataBeats)
val outerBlockOffset = outerBeatAddrBits + outerByteAddrBits
val outerMaxClients = outerParams.maxClientsPerPort
val outerClientIdBits = log2Up(outerParams.maxClientXacts * outerMaxClients)
val outerManagerIdBits = log2Up(outerParams.maxManagerXacts)
val outerBlockAddrBits = paddrBits - outerBlockOffset
require(outerDataBeats <= innerDataBeats)
require(outerDataBits >= innerDataBits)
require(outerDataBits % innerDataBits == 0)
require(outerDataBits * outerDataBeats == innerDataBits * innerDataBeats)
val factor = innerDataBeats / outerDataBeats
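// Worked example (hypothetical parameters): innerDataBits = 64, innerDataBeats = 8,
// outerDataBits = 128, outerDataBeats = 4 satisfies all four requires and gives
// factor = 2, i.e. each outer beat is assembled from two consecutive inner beats.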
val io = new Bundle {
val in = new ClientUncachedTileLinkIO()(p.alterPartial({case TLId => innerTLId})).flip
val out = new ClientUncachedTileLinkIO()(p.alterPartial({case TLId => outerTLId}))
}
val iacq = io.in.acquire.bits
val oacq = io.out.acquire.bits
val ognt = io.out.grant.bits
val ignt = io.in.grant.bits
val shrink = iacq.a_type === Acquire.putBlockType
val stretch = ognt.g_type === Grant.getDataBlockType
val smallget = iacq.a_type === Acquire.getType
val smallput = iacq.a_type === Acquire.putType
val smallgnt = ognt.g_type === Grant.getDataBeatType
val sending_put = Reg(init = Bool(false))
val collecting = Reg(init = Bool(false))
val put_block = Reg(UInt(width = outerBlockAddrBits))
val put_id = Reg(UInt(width = outerClientIdBits))
val put_data = Reg(Vec(factor, UInt(width = innerDataBits)))
val put_wmask = Reg(Vec(factor, UInt(width = innerWriteMaskBits)))
val put_allocate = Reg(Bool())
val (put_beat, put_done) = Counter(io.out.acquire.fire() && oacq.hasMultibeatData(), outerDataBeats)
val (recv_idx, recv_done) = Counter(io.in.acquire.fire() && iacq.hasMultibeatData(), factor)
val in_addr = iacq.full_addr()
val out_addr_block = in_addr(paddrBits - 1, outerBlockOffset)
val out_addr_beat = in_addr(outerBlockOffset - 1, outerByteAddrBits)
val out_addr_byte = in_addr(outerByteAddrBits - 1, 0)
val switch_addr = in_addr(outerByteAddrBits - 1, innerByteAddrBits)
val smallget_switch = Reg(Vec(innerMaxXacts, switch_addr))
def align_data(addr: UInt, data: UInt): UInt =
data << Cat(addr, UInt(0, log2Up(innerDataBits)))
def align_wmask(addr: UInt, wmask: UInt): UInt =
wmask << Cat(addr, UInt(0, log2Up(innerWriteMaskBits)))
val outerConfig = p.alterPartial({ case TLId => outerTLId })
val get_acquire = Get(
client_xact_id = iacq.client_xact_id,
addr_block = out_addr_block,
addr_beat = out_addr_beat,
addr_byte = out_addr_byte,
operand_size = iacq.op_size(),
alloc = iacq.allocate())(outerConfig)
val get_block_acquire = GetBlock(
client_xact_id = iacq.client_xact_id,
addr_block = out_addr_block,
alloc = iacq.allocate())(outerConfig)
val put_acquire = Put(
client_xact_id = iacq.client_xact_id,
addr_block = out_addr_block,
addr_beat = out_addr_beat,
data = align_data(switch_addr, iacq.data),
wmask = Some(align_wmask(switch_addr, iacq.wmask())),
alloc = iacq.allocate())(outerConfig)
val put_block_acquire = PutBlock(
client_xact_id = put_id,
addr_block = put_block,
addr_beat = put_beat,
data = put_data.asUInt,
wmask = Some(put_wmask.asUInt))(outerConfig)
io.out.acquire.valid := sending_put || (!shrink && io.in.acquire.valid)
io.out.acquire.bits := MuxCase(get_block_acquire, Seq(
sending_put -> put_block_acquire,
smallget -> get_acquire,
smallput -> put_acquire))
io.in.acquire.ready := !sending_put && (shrink || io.out.acquire.ready)
when (io.in.acquire.fire() && shrink) {
when (!collecting) {
put_block := out_addr_block
put_id := iacq.client_xact_id
put_allocate := iacq.allocate()
collecting := Bool(true)
}
put_data(recv_idx) := iacq.data
put_wmask(recv_idx) := iacq.wmask()
}
when (io.in.acquire.fire() && smallget) {
smallget_switch(iacq.client_xact_id) := switch_addr
}
when (recv_done) { sending_put := Bool(true) }
when (sending_put && io.out.acquire.ready) { sending_put := Bool(false) }
when (put_done) { collecting := Bool(false) }
val returning_data = Reg(init = Bool(false))
val (send_idx, send_done) = Counter(
io.in.grant.ready && returning_data, factor)
val gnt_beat = Reg(UInt(width = outerBeatAddrBits))
val gnt_client_id = Reg(UInt(width = outerClientIdBits))
val gnt_manager_id = Reg(UInt(width = outerManagerIdBits))
val gnt_data = Reg(UInt(width = outerDataBits))
when (io.out.grant.fire() && stretch) {
gnt_data := ognt.data
gnt_client_id := ognt.client_xact_id
gnt_manager_id := ognt.manager_xact_id
gnt_beat := ognt.addr_beat
returning_data := Bool(true)
}
when (send_done) { returning_data := Bool(false) }
def select_data(data: UInt, sel: UInt): UInt =
data >> (sel << log2Up(innerDataBits))
val gnt_switch = smallget_switch(ognt.client_xact_id)
val innerConfig = p.alterPartial({ case TLId => innerTLId })
val get_block_grant = Grant(
is_builtin_type = Bool(true),
g_type = Grant.getDataBlockType,
client_xact_id = gnt_client_id,
manager_xact_id = gnt_manager_id,
addr_beat = Cat(gnt_beat, send_idx),
data = select_data(gnt_data, send_idx))(innerConfig)
val get_grant = Grant(
is_builtin_type = Bool(true),
g_type = Grant.getDataBeatType,
client_xact_id = ognt.client_xact_id,
manager_xact_id = ognt.manager_xact_id,
addr_beat = Cat(ognt.addr_beat, gnt_switch),
data = select_data(ognt.data, gnt_switch))(innerConfig)
val default_grant = Grant(
is_builtin_type = Bool(true),
g_type = ognt.g_type,
client_xact_id = ognt.client_xact_id,
manager_xact_id = ognt.manager_xact_id,
addr_beat = ognt.addr_beat,
data = ognt.data)(innerConfig)
io.in.grant.valid := returning_data || (!stretch && io.out.grant.valid)
io.in.grant.bits := MuxCase(default_grant, Seq(
returning_data -> get_block_grant,
smallgnt -> get_grant))
io.out.grant.ready := !returning_data && (stretch || io.in.grant.ready)
}
class TileLinkIONarrower(innerTLId: String, outerTLId: String)
(implicit p: Parameters) extends TLModule()(p) {
val innerParams = p(TLKey(innerTLId))
val outerParams = p(TLKey(outerTLId))
val innerDataBeats = innerParams.dataBeats
val innerDataBits = innerParams.dataBitsPerBeat
val innerWriteMaskBits = innerParams.writeMaskBits
val innerByteAddrBits = log2Up(innerWriteMaskBits)
val outerDataBeats = outerParams.dataBeats
val outerDataBits = outerParams.dataBitsPerBeat
val outerWriteMaskBits = outerParams.writeMaskBits
val outerByteAddrBits = log2Up(outerWriteMaskBits)
val outerBeatAddrBits = log2Up(outerDataBeats)
val outerBlockOffset = outerBeatAddrBits + outerByteAddrBits
val outerMaxClients = outerParams.maxClientsPerPort
val outerIdBits = log2Up(outerParams.maxClientXacts * outerMaxClients)
require(outerDataBeats > innerDataBeats)
require(outerDataBeats % innerDataBeats == 0)
require(outerDataBits < innerDataBits)
require(outerDataBits * outerDataBeats == innerDataBits * innerDataBeats)
val factor = outerDataBeats / innerDataBeats
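// Worked example (hypothetical parameters): innerDataBits = 128, innerDataBeats = 4,
// outerDataBits = 64, outerDataBeats = 8 gives factor = 2: each inner PutBlock beat
// is buffered and shifted out as two outer beats of outerDataBits each.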
val io = new Bundle {
val in = new ClientUncachedTileLinkIO()(p.alterPartial({case TLId => innerTLId})).flip
val out = new ClientUncachedTileLinkIO()(p.alterPartial({case TLId => outerTLId}))
}
val iacq = io.in.acquire.bits
val ognt = io.out.grant.bits
val stretch = iacq.a_type === Acquire.putBlockType
val shrink = iacq.a_type === Acquire.getBlockType
val smallput = iacq.a_type === Acquire.putType
val smallget = iacq.a_type === Acquire.getType
val acq_data_buffer = Reg(UInt(width = innerDataBits))
val acq_wmask_buffer = Reg(UInt(width = innerWriteMaskBits))
val acq_client_id = Reg(iacq.client_xact_id)
val acq_addr_block = Reg(iacq.addr_block)
val acq_addr_beat = Reg(iacq.addr_beat)
val oacq_ctr = Counter(factor)
val outer_beat_addr = iacq.full_addr()(outerBlockOffset - 1, outerByteAddrBits)
val outer_byte_addr = iacq.full_addr()(outerByteAddrBits - 1, 0)
val mask_chunks = Vec.tabulate(factor) { i =>
val lsb = i * outerWriteMaskBits
val msb = (i + 1) * outerWriteMaskBits - 1
iacq.wmask()(msb, lsb)
}
val data_chunks = Vec.tabulate(factor) { i =>
val lsb = i * outerDataBits
val msb = (i + 1) * outerDataBits - 1
iacq.data(msb, lsb)
}
val beat_sel = Cat(mask_chunks.map(mask => mask.orR).reverse)
val smallput_data = Mux1H(beat_sel, data_chunks)
val smallput_wmask = Mux1H(beat_sel, mask_chunks)
val smallput_beat = Cat(iacq.addr_beat, PriorityEncoder(beat_sel))
assert(!io.in.acquire.valid || !smallput || PopCount(beat_sel) <= UInt(1),
"Can't perform Put wider than outer width")
val read_size_ok = iacq.op_size() <= UInt(log2Ceil(outerDataBits / 8))
assert(!io.in.acquire.valid || !smallget || read_size_ok,
"Can't perform Get wider than outer width")
val outerConfig = p.alterPartial({ case TLId => outerTLId })
val innerConfig = p.alterPartial({ case TLId => innerTLId })
val get_block_acquire = GetBlock(
client_xact_id = iacq.client_xact_id,
addr_block = iacq.addr_block,
alloc = iacq.allocate())(outerConfig)
val put_block_acquire = PutBlock(
client_xact_id = acq_client_id,
addr_block = acq_addr_block,
addr_beat = if (factor > 1)
Cat(acq_addr_beat, oacq_ctr.value)
else acq_addr_beat,
data = acq_data_buffer(outerDataBits - 1, 0),
wmask = Some(acq_wmask_buffer(outerWriteMaskBits - 1, 0)))(outerConfig)
val get_acquire = Get(
client_xact_id = iacq.client_xact_id,
addr_block = iacq.addr_block,
addr_beat = outer_beat_addr,
addr_byte = outer_byte_addr,
operand_size = iacq.op_size(),
alloc = iacq.allocate())(outerConfig)
val put_acquire = Put(
client_xact_id = iacq.client_xact_id,
addr_block = iacq.addr_block,
addr_beat = smallput_beat,
data = smallput_data,
wmask = Some(smallput_wmask))(outerConfig)
val sending_put = Reg(init = Bool(false))
val pass_valid = io.in.acquire.valid && !stretch
io.out.acquire.bits := MuxCase(Wire(io.out.acquire.bits, init=iacq), Seq(
(sending_put, put_block_acquire),
(shrink, get_block_acquire),
(smallput, put_acquire),
(smallget, get_acquire)))
io.out.acquire.valid := sending_put || pass_valid
io.in.acquire.ready := !sending_put && (stretch || io.out.acquire.ready)
when (io.in.acquire.fire() && stretch) {
acq_data_buffer := iacq.data
acq_wmask_buffer := iacq.wmask()
acq_client_id := iacq.client_xact_id
acq_addr_block := iacq.addr_block
acq_addr_beat := iacq.addr_beat
sending_put := Bool(true)
}
when (sending_put && io.out.acquire.ready) {
acq_data_buffer := acq_data_buffer >> outerDataBits
acq_wmask_buffer := acq_wmask_buffer >> outerWriteMaskBits
when (oacq_ctr.inc()) { sending_put := Bool(false) }
}
val ognt_block = ognt.hasMultibeatData()
val gnt_data_buffer = Reg(Vec(factor, UInt(width = outerDataBits)))
val gnt_client_id = Reg(ognt.client_xact_id)
val gnt_manager_id = Reg(ognt.manager_xact_id)
val ignt_ctr = Counter(innerDataBeats)
val ognt_ctr = Counter(factor)
val sending_get = Reg(init = Bool(false))
val get_block_grant = Grant(
is_builtin_type = Bool(true),
g_type = Grant.getDataBlockType,
client_xact_id = gnt_client_id,
manager_xact_id = gnt_manager_id,
addr_beat = ignt_ctr.value,
data = gnt_data_buffer.asUInt)(innerConfig)
val smallget_grant = ognt.g_type === Grant.getDataBeatType
val get_grant = Grant(
is_builtin_type = Bool(true),
g_type = Grant.getDataBeatType,
client_xact_id = ognt.client_xact_id,
manager_xact_id = ognt.manager_xact_id,
addr_beat = ognt.addr_beat >> UInt(log2Up(factor)),
data = Fill(factor, ognt.data))(innerConfig)
io.in.grant.valid := sending_get || (io.out.grant.valid && !ognt_block)
io.out.grant.ready := !sending_get && (ognt_block || io.in.grant.ready)
io.in.grant.bits := MuxCase(Wire(io.in.grant.bits, init=ognt), Seq(
sending_get -> get_block_grant,
smallget_grant -> get_grant))
when (io.out.grant.valid && ognt_block && !sending_get) {
gnt_data_buffer(ognt_ctr.value) := ognt.data
when (ognt_ctr.inc()) {
gnt_client_id := ognt.client_xact_id
gnt_manager_id := ognt.manager_xact_id
sending_get := Bool(true)
}
}
when (io.in.grant.ready && sending_get) {
ignt_ctr.inc()
sending_get := Bool(false)
}
}
class TileLinkFragmenterSource(implicit p: Parameters) extends TLModule()(p) {
val io = new Bundle {
val in = Decoupled(new Acquire).flip
val out = Decoupled(new Acquire)
val que = Decoupled(UInt(width = tlBeatAddrBits))
}
// Pipeline stage with acquire data; needed to ensure in.bits stays fixed when !in.ready
val acq_valid = RegInit(Bool(false))
val acq_bits = Reg(new Acquire)
// The last beat of the generated acquire to send
val acq_last_beat = Reg(UInt(width = tlBeatAddrBits))
val acq_last = acq_bits.addr_beat === acq_last_beat
// 'in' has the first beat?
val in_multi_put = io.in.bits.isBuiltInType(Acquire.putBlockType)
val in_multi_get = io.in.bits.isBuiltInType(Acquire.getBlockType)
val in_first_beat = !in_multi_put || io.in.bits.addr_beat === UInt(0)
// Move stuff from acq to out whenever out is ready
io.out.valid := acq_valid
// When can acq accept a request?
val acq_ready = !acq_valid || (acq_last && io.out.ready)
// Move the first beat from in to acq only when both acq and que are ready
io.in.ready := (!in_first_beat || io.que.ready) && acq_ready
io.que.valid := (in_first_beat && io.in.valid) && acq_ready
// in.fire moves data from in to acq and (optionally) que
// out.fire moves data from acq to out
// Desired flow control results:
assert (!io.que.fire() || io.in.fire()) // 1. que.fire => in.fire
assert (!(io.in.fire() && in_first_beat) || io.que.fire()) // 2. in.fire && in_first_beat => que.fire
assert (!io.out.fire() || acq_valid) // 3. out.fire => acq_valid
assert (!io.in.fire() || (!acq_valid || (io.out.fire() && acq_last))) // 4. in.fire => !acq_valid || (out.fire && acq_last)
// Proofs:
// 1. que.fire => que.ready && in.valid && acq_ready => in.ready && in.valid
// 2. in.fire && in_first_beat => in.valid && acq_ready && [(!in_first_beat || que.ready) && in_first_beat] =>
// in.valid && acq_ready && que.ready && in_first_beat => que.valid && que.ready
// 3. out.fire => out.valid => acq_valid
// 4. in.fire => acq_ready => !acq_valid || (acq_last && out.ready) =>
// !acq_valid || (acq_valid && acq_last && out.ready) => !acq_valid || (acq_last && out.fire)
val multi_size = SInt(-1, width = tlBeatAddrBits).asUInt // TL2: use in.bits.size()/beatBits-1
val in_sizeMinus1 = Mux(in_multi_get || in_multi_put, multi_size, UInt(0))
val in_insertSizeMinus1 = Mux(in_multi_get, multi_size, UInt(0))
when (io.in.fire()) {
// Theorem 4 makes this safe; we overwrite garbage, or replace the final acq
acq_valid := Bool(true)
acq_bits := io.in.bits
acq_last_beat := io.in.bits.addr_beat + in_insertSizeMinus1
// Replace this with size truncation in TL2:
acq_bits.a_type := Mux(in_multi_put, Acquire.putType, Mux(in_multi_get, Acquire.getType, io.in.bits.a_type))
} .elsewhen (io.out.fire()) {
acq_valid := !acq_last // false => !in.valid || (!que.ready && in_first_beat)
acq_bits.addr_beat := acq_bits.addr_beat + UInt(1)
// acq_last && out.fire => acq_last && out.ready && acq_valid => acq_ready
// Suppose in.valid, then !in.fire => !in.ready => !(!in_first_beat || que.ready) => !que.ready && in_first_beat
}
// Safe by theorem 3
io.out.bits := acq_bits
// Safe by theorem 1
io.que.bits := in_sizeMinus1
}
class TileLinkFragmenterSink(implicit p: Parameters) extends TLModule()(p) {
val io = new Bundle {
val in = Decoupled(new Grant).flip
val out = Decoupled(new Grant)
val que = Decoupled(UInt(width = tlBeatAddrBits)).flip
}
val count_valid = RegInit(Bool(false))
val multi_op = Reg(Bool())
val count_bits = Reg(UInt(width = tlBeatAddrBits))
val last = count_bits === UInt(0)
val in_put = io.in.bits.isBuiltInType(Grant.putAckType)
val in_get = io.in.bits.isBuiltInType(Grant.getDataBeatType)
val deliver = last || in_get
// Accept the input, discarding the non-final put grant
io.in.ready := count_valid && (io.out.ready || !deliver)
// Output the grant whenever we want delivery
io.out.valid := count_valid && io.in.valid && deliver
// Take a new number whenever we deliver the last beat
io.que.ready := !count_valid || (io.in.valid && io.out.ready && last)
// Desired flow control results:
assert (!io.out.fire() || (count_valid && io.in.fire())) // 1. out.fire => in.fire && count_valid
assert (!(io.in.fire() && deliver) || io.out.fire()) // 2. in.fire && deliver => out.fire
assert (!(io.out.fire() && last) || io.que.ready) // 3. out.fire && last => que.ready
assert (!io.que.fire() || (!count_valid || io.out.fire())) // 4. que.fire => !count_valid || (out.fire && last)
// Proofs:
// 1. out.fire => out.ready && (count_valid && in.valid && deliver) => (count_valid && out.ready) && in.valid => in.fire
// 2. in.fire && deliver => in.valid && count_valid && [(out.ready || !deliver) && deliver] =>
// in.valid && count_valid && deliver && out.ready => out.fire
// 3. out.fire && last => out.valid && out.ready && last => in.valid && out.ready && last => que.ready
// 4. que.fire => que.valid && (!count_valid || (in.valid && out.ready && last))
// => !count_valid || (count_valid && in.valid && out.ready && [last => deliver])
// => !count_valid || (out.valid && out.ready && last)
when (io.que.fire()) {
// Theorem 4 makes this safe; we overwrite garbage or last output
count_valid := Bool(true)
count_bits := io.que.bits
multi_op := io.que.bits =/= UInt(0)
} .elsewhen (io.in.fire()) {
count_valid := !last // false => !que.valid
count_bits := count_bits - UInt(1)
// Proof: in.fire && [last => deliver] =2=> out.fire && last =3=> que.ready
// !que.fire && que.ready => !que.valid
}
// Safe by Theorem 1
io.out.bits := io.in.bits
io.out.bits.g_type := Mux(multi_op, Mux(in_get, Grant.getDataBlockType, Grant.putAckType), io.in.bits.g_type)
}
class TileLinkFragmenter(depth: Int = 1)(implicit p: Parameters) extends TLModule()(p) {
val io = new Bundle {
val in = new ClientUncachedTileLinkIO().flip
val out = new ClientUncachedTileLinkIO
}
// TL2:
// supportsAcquire = false
// modify all outward managers to supportsMultibeat = true
// assert: all managers must behave FIFO (not inspect duplicated id field)
val source = Module(new TileLinkFragmenterSource)
val sink = Module(new TileLinkFragmenterSink)
sink.io.que <> Queue(source.io.que, depth)
source.io.in <> io.in.acquire
io.out.acquire <> source.io.out
sink.io.in <> io.out.grant
io.in.grant <> sink.io.out
}
object TileLinkFragmenter {
// Pass the source/client to fragment
def apply(source: ClientUncachedTileLinkIO, depth: Int = 1)(implicit p: Parameters): ClientUncachedTileLinkIO = {
val fragmenter = Module(new TileLinkFragmenter(depth))
fragmenter.io.in <> source
fragmenter.io.out
}
}
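// A minimal usage sketch (illustrative; `masterPort` is a hypothetical
// ClientUncachedTileLinkIO): single-beat slaves such as BRAMSlave below assume
// a fragmenter in front of them, so wrap their port with
//
//   bram.io <> TileLinkFragmenter(masterPort)
//
// which splits each GetBlock/PutBlock into tlDataBeats single-beat requests and
// reassembles the grants on the way back.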

View File

@ -0,0 +1,161 @@
package uncore.devices
import Chisel._
import cde.{Parameters, Field}
import junctions._
import uncore.tilelink._
import uncore.util._
import HastiConstants._
class BRAMSlave(depth: Int)(implicit val p: Parameters) extends Module
with HasTileLinkParameters {
val io = new ClientUncachedTileLinkIO().flip
// For TL2:
// supportsAcquire = false
// supportsMultibeat = false
// supportsHint = false
// supportsAtomic = false
// Timing-wise, we assume the input is coming out of registers
// since you probably needed a TileLinkFragmenter in front of us
// Thus, only one pipeline stage: the grant result
val g_valid = RegInit(Bool(false))
val g_bits = Reg(new Grant)
// Just pass the pipeline straight through
io.grant.valid := g_valid
io.grant.bits := g_bits
io.acquire.ready := !g_valid || io.grant.ready
val acq_get = io.acquire.bits.isBuiltInType(Acquire.getType)
val acq_put = io.acquire.bits.isBuiltInType(Acquire.putType)
val acq_addr = Cat(io.acquire.bits.addr_block, io.acquire.bits.addr_beat)
val bram = Mem(depth, Bits(width = tlDataBits))
val ren = acq_get && io.acquire.fire()
val wen = acq_put && io.acquire.fire()
when (io.grant.fire()) {
g_valid := Bool(false)
}
when (io.acquire.fire()) {
g_valid := Bool(true)
g_bits := Grant(
is_builtin_type = Bool(true),
g_type = io.acquire.bits.getBuiltInGrantType(),
client_xact_id = io.acquire.bits.client_xact_id,
manager_xact_id = UInt(0),
addr_beat = io.acquire.bits.addr_beat,
data = UInt(0))
}
when (wen) {
bram.write(acq_addr, io.acquire.bits.data)
assert(io.acquire.bits.wmask().andR, "BRAMSlave: partial write masks not supported")
}
io.grant.bits.data := RegEnable(bram.read(acq_addr), ren)
}
class HastiRAM(depth: Int)(implicit p: Parameters) extends HastiModule()(p) {
val io = new HastiSlaveIO
val wdata = Vec.tabulate(hastiDataBytes)(i => io.hwdata(8*(i+1)-1,8*i))
val waddr = Reg(UInt(width = hastiAddrBits))
val wvalid = Reg(init = Bool(false))
val wsize = Reg(UInt(width = SZ_HSIZE))
val ram = SeqMem(depth, Vec(hastiDataBytes, Bits(width = 8)))
val max_size = log2Ceil(hastiDataBytes)
val wmask_lut = MuxLookup(wsize, SInt(-1, hastiDataBytes).asUInt,
(0 until max_size).map(sz => (UInt(sz) -> UInt((1 << (1 << sz)) - 1))))
val wmask = (wmask_lut << waddr(max_size - 1, 0))(hastiDataBytes - 1, 0)
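// Worked example (illustrative): with hastiDataBytes = 4 (a 32-bit bus),
// wsize = 1 selects the halfword entry of wmask_lut (0b0011); a write with
// waddr(1,0) = 2 then shifts it to wmask = 0b1100, enabling the upper two
// byte lanes of the SeqMem row.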
val is_trans = io.hsel && io.htrans.isOneOf(HTRANS_NONSEQ, HTRANS_SEQ)
val raddr = io.haddr >> UInt(max_size)
val ren = is_trans && !io.hwrite
val bypass = Reg(init = Bool(false))
when (is_trans && io.hwrite) {
waddr := io.haddr
wsize := io.hsize
wvalid := Bool(true)
} .otherwise { wvalid := Bool(false) }
when (ren) { bypass := wvalid && (waddr >> UInt(max_size)) === raddr }
when (wvalid) {
ram.write(waddr >> UInt(max_size), wdata, wmask.toBools)
}
val rdata = ram.read(raddr, ren)
io.hrdata := Cat(rdata.zip(wmask.toBools).zip(wdata).map {
case ((rbyte, wsel), wbyte) => Mux(wsel && bypass, wbyte, rbyte)
}.reverse)
io.hready := Bool(true)
io.hresp := HRESP_OKAY
}
/**
* This RAM is not meant to be particularly performant.
* It just supports the entire range of uncached TileLink operations in the
* simplest way possible.
*/
class TileLinkTestRAM(depth: Int)(implicit val p: Parameters) extends Module
with HasTileLinkParameters {
val io = new ClientUncachedTileLinkIO().flip
val ram = Mem(depth, UInt(width = tlDataBits))
val responding = Reg(init = Bool(false))
val acq = io.acquire.bits
val r_acq = Reg(io.acquire.bits)
val acq_addr = Cat(acq.addr_block, acq.addr_beat)
val r_acq_addr = Cat(r_acq.addr_block, r_acq.addr_beat)
when (io.acquire.fire() && io.acquire.bits.last()) {
r_acq := io.acquire.bits
responding := Bool(true)
}
when (io.grant.fire()) {
val is_getblk = r_acq.isBuiltInType(Acquire.getBlockType)
val last_beat = r_acq.addr_beat === UInt(tlDataBeats - 1)
when (is_getblk && !last_beat) {
r_acq.addr_beat := r_acq.addr_beat + UInt(1)
} .otherwise { responding := Bool(false) }
}
val old_data = ram(acq_addr)
val new_data = acq.data
val r_old_data = RegEnable(old_data, io.acquire.fire())
io.acquire.ready := !responding
io.grant.valid := responding
io.grant.bits := Grant(
is_builtin_type = Bool(true),
g_type = r_acq.getBuiltInGrantType(),
client_xact_id = r_acq.client_xact_id,
manager_xact_id = UInt(0),
addr_beat = r_acq.addr_beat,
data = Mux(r_acq.isAtomic(), r_old_data, ram(r_acq_addr)))
val amo_shift_bits = acq.amo_shift_bytes() << UInt(3)
val amoalu = Module(new AMOALU(amoAluOperandBits, rhsIsAligned = true))
amoalu.io.addr := Cat(acq.addr_block, acq.addr_beat, acq.addr_byte())
amoalu.io.cmd := acq.op_code()
amoalu.io.typ := acq.op_size()
amoalu.io.lhs := old_data >> amo_shift_bits
amoalu.io.rhs := new_data >> amo_shift_bits
val result = Mux(acq.isAtomic(), amoalu.io.out << amo_shift_bits, new_data)
val wmask = FillInterleaved(8, acq.wmask())
when (io.acquire.fire() && acq.hasData()) {
ram(acq_addr) := (old_data & ~wmask) | (result & wmask)
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,187 @@
// See LICENSE for license details.
package uncore.devices
import Chisel._
import Chisel.ImplicitConversions._
import junctions._
import uncore.tilelink._
import cde.Parameters
class GatewayPLICIO extends Bundle {
val valid = Bool(OUTPUT)
val ready = Bool(INPUT)
val complete = Bool(INPUT)
}
class LevelGateway extends Module {
val io = new Bundle {
val interrupt = Bool(INPUT)
val plic = new GatewayPLICIO
}
val inFlight = Reg(init=Bool(false))
when (io.interrupt && io.plic.ready) { inFlight := true }
when (io.plic.complete) { inFlight := false }
io.plic.valid := io.interrupt && !inFlight
}
case class PLICConfig(nHartsIn: Int, supervisor: Boolean, nDevices: Int, nPriorities: Int) {
def contextsPerHart = if (supervisor) 2 else 1
def nHarts = contextsPerHart * nHartsIn
def context(i: Int, mode: Char) = mode match {
case 'M' => i * contextsPerHart
case 'S' => require(supervisor); i * contextsPerHart + 1
}
def claimAddr(i: Int, mode: Char) = hartBase + hartOffset(context(i, mode)) + claimOffset
def threshAddr(i: Int, mode: Char) = hartBase + hartOffset(context(i, mode))
def enableAddr(i: Int, mode: Char) = enableBase + enableOffset(context(i, mode))
def size = hartBase + hartOffset(maxHarts)
def maxDevices = 1023
def maxHarts = 15872
def pendingBase = 0x1000
def enableBase = 0x2000
def hartBase = 0x200000
require(hartBase >= enableBase + enableOffset(maxHarts))
def enableOffset(i: Int) = i * ((maxDevices+7)/8)
def hartOffset(i: Int) = i * 0x1000
def claimOffset = 4
def priorityBytes = 4
require(nDevices > 0 && nDevices <= maxDevices)
require(nHarts > 0 && nHarts <= maxHarts)
require(nPriorities >= 0 && nPriorities <= nDevices)
}
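// Worked example (hypothetical config): PLICConfig(nHartsIn = 2, supervisor = true,
// nDevices = 4, nPriorities = 7) gives contextsPerHart = 2 and nHarts = 4.
// Context indices interleave M/S per hart: context(1, 'M') = 2 and context(1, 'S') = 3,
// so enableAddr(1, 'M') = 0x2000 + 2 * 128 = 0x2100 and
// claimAddr(1, 'S') = 0x200000 + 3 * 0x1000 + 4 = 0x203004.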
/** Platform-Level Interrupt Controller */
class PLIC(val cfg: PLICConfig)(implicit val p: Parameters) extends Module
with HasTileLinkParameters
with HasAddrMapParameters {
val io = new Bundle {
val devices = Vec(cfg.nDevices, new GatewayPLICIO).flip
val harts = Vec(cfg.nHarts, Bool()).asOutput
val tl = new ClientUncachedTileLinkIO().flip
}
val priority =
if (cfg.nPriorities > 0) Reg(Vec(cfg.nDevices+1, UInt(width=log2Up(cfg.nPriorities+1))))
else Wire(init=Vec.fill(cfg.nDevices+1)(UInt(1)))
val threshold =
if (cfg.nPriorities > 0) Reg(Vec(cfg.nHarts, UInt(width = log2Up(cfg.nPriorities+1))))
else Wire(init=Vec.fill(cfg.nHarts)(UInt(0)))
val pending = Reg(init=Vec.fill(cfg.nDevices+1){Bool(false)})
val enables = Reg(Vec(cfg.nHarts, Vec(cfg.nDevices+1, Bool())))
for ((p, g) <- pending.tail zip io.devices) {
g.ready := !p
g.complete := false
when (g.valid) { p := true }
}
def findMax(x: Seq[UInt]): (UInt, UInt) = {
if (x.length > 1) {
val half = 1 << (log2Ceil(x.length) - 1)
val lMax = findMax(x take half)
val rMax = findMax(x drop half)
val useLeft = lMax._1 >= rMax._1
(Mux(useLeft, lMax._1, rMax._1), Mux(useLeft, lMax._2, UInt(half) + rMax._2))
} else (x.head, UInt(0))
}
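// Illustrative evaluation (hypothetical inputs): findMax over the values
// Seq(3, 7, 5, 7) builds a log-depth tournament and yields (value, index) = (7, 1);
// ties resolve to the left because useLeft tests lMax._1 >= rMax._1.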
val maxDevs = Wire(Vec(cfg.nHarts, UInt(width = log2Up(pending.size))))
for (hart <- 0 until cfg.nHarts) {
val effectivePriority =
for (((p, en), pri) <- (pending zip enables(hart) zip priority).tail)
yield Cat(p && en, pri)
val (maxPri, maxDev) = findMax((UInt(1) << priority(0).getWidth) +: effectivePriority)
maxDevs(hart) := Reg(next = maxDev)
io.harts(hart) := Reg(next = maxPri) > Cat(UInt(1), threshold(hart))
}
val acq = Queue(io.tl.acquire, 1)
val read = acq.fire() && acq.bits.isBuiltInType(Acquire.getType)
val write = acq.fire() && acq.bits.isBuiltInType(Acquire.putType)
assert(!acq.fire() || read || write, "unsupported PLIC operation")
val addr = acq.bits.full_addr()(log2Up(cfg.size)-1,0)
val claimant =
if (cfg.nHarts == 1) UInt(0)
else (addr - cfg.hartBase)(log2Up(cfg.hartOffset(cfg.nHarts))-1,log2Up(cfg.hartOffset(1)))
val hart = Wire(init = claimant)
val myMaxDev = maxDevs(claimant)
val myEnables = enables(hart)
val rdata = Wire(init = UInt(0, tlDataBits))
val masked_wdata = (acq.bits.data & acq.bits.full_wmask()) | (rdata & ~acq.bits.full_wmask())
when (addr >= cfg.hartBase) {
val word =
if (tlDataBytes > cfg.claimOffset) UInt(0)
else addr(log2Up(cfg.claimOffset),log2Up(tlDataBytes))
rdata := Cat(myMaxDev, UInt(0, 8*cfg.priorityBytes-threshold(0).getWidth), threshold(claimant)) >> (word * tlDataBits)
when (read && addr(log2Ceil(cfg.claimOffset))) {
pending(myMaxDev) := false
}
when (write) {
when (if (tlDataBytes > cfg.claimOffset) acq.bits.wmask()(cfg.claimOffset) else addr(log2Ceil(cfg.claimOffset))) {
val dev = (acq.bits.data >> ((8 * cfg.claimOffset) % tlDataBits))(log2Up(pending.size)-1,0)
when (myEnables(dev)) { io.devices(dev-1).complete := true }
}.otherwise {
if (cfg.nPriorities > 0) threshold(claimant) := acq.bits.data
}
}
}.elsewhen (addr >= cfg.enableBase) {
val enableHart =
if (cfg.nHarts > 1) (addr - cfg.enableBase)(log2Up(cfg.enableOffset(cfg.nHarts))-1,log2Up(cfg.enableOffset(1)))
else UInt(0)
hart := enableHart
val word =
if (tlDataBits >= cfg.nHarts) UInt(0)
else addr(log2Up((cfg.nHarts+7)/8)-1,log2Up(tlDataBytes))
for (i <- 0 until cfg.nHarts by tlDataBits) {
when (word === i/tlDataBits) {
rdata := Cat(myEnables.slice(i, i + tlDataBits).reverse)
for (j <- 0 until (tlDataBits min (myEnables.size - i))) {
when (write) { enables(enableHart)(i+j) := masked_wdata(j) }
}
}
}
}.elsewhen (addr >= cfg.pendingBase) {
val word =
if (tlDataBytes >= pending.size) UInt(0)
else addr(log2Up(pending.size)-1,log2Up(tlDataBytes))
rdata := pending.asUInt >> (word * tlDataBits)
}.otherwise {
val regsPerBeat = tlDataBytes >> log2Up(cfg.priorityBytes)
val word =
if (regsPerBeat >= priority.size) UInt(0)
else addr(log2Up(priority.size*cfg.priorityBytes)-1,log2Up(tlDataBytes))
for (i <- 0 until priority.size by regsPerBeat) {
when (word === i/regsPerBeat) {
rdata := Cat(priority.slice(i, i + regsPerBeat).map(p => Cat(UInt(0, 8*cfg.priorityBytes-p.getWidth), p)).reverse)
for (j <- 0 until (regsPerBeat min (priority.size - i))) {
if (cfg.nPriorities > 0) when (write) { priority(i+j) := masked_wdata >> (j * 8 * cfg.priorityBytes) }
}
}
}
}
priority(0) := 0
pending(0) := false
for (e <- enables)
e(0) := false
io.tl.grant.valid := acq.valid
acq.ready := io.tl.grant.ready
io.tl.grant.bits := Grant(
is_builtin_type = Bool(true),
g_type = acq.bits.getBuiltInGrantType(),
client_xact_id = acq.bits.client_xact_id,
manager_xact_id = UInt(0),
addr_beat = UInt(0),
data = rdata)
}

View File

@ -0,0 +1,127 @@
// See LICENSE for license details.
package uncore.devices
import Chisel._
import Chisel.ImplicitConversions._
import junctions._
import junctions.NastiConstants._
import uncore.tilelink._
import cde.{Parameters, Field}
/** Number of tiles */
case object NTiles extends Field[Int]
class PRCIInterrupts extends Bundle {
val meip = Bool()
val seip = Bool()
val debug = Bool()
}
class PRCITileIO(implicit p: Parameters) extends Bundle {
val reset = Bool(OUTPUT)
val id = UInt(OUTPUT, log2Up(p(NTiles)))
val interrupts = new PRCIInterrupts {
val mtip = Bool()
val msip = Bool()
}.asOutput
override def cloneType: this.type = new PRCITileIO().asInstanceOf[this.type]
}
object PRCI {
def msip(hart: Int) = hart * msipBytes
def timecmp(hart: Int) = 0x4000 + hart * timecmpBytes
def time = 0xbff8
def msipBytes = 4
def timecmpBytes = 8
def size = 0xc000
}
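// Worked register-map example: msip(1) = 0x4, timecmp(2) = 0x4000 + 2 * 8 = 0x4010,
// and time sits at 0xbff8, all within size = 0xc000.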
/** Power, Reset, Clock, Interrupt */
class PRCI(implicit val p: Parameters) extends Module
with HasTileLinkParameters
with HasAddrMapParameters {
val io = new Bundle {
val interrupts = Vec(p(NTiles), new PRCIInterrupts).asInput
val tl = new ClientUncachedTileLinkIO().flip
val tiles = Vec(p(NTiles), new PRCITileIO)
val rtcTick = Bool(INPUT)
}
val timeWidth = 64
val timecmp = Reg(Vec(p(NTiles), UInt(width = timeWidth)))
val time = Reg(init=UInt(0, timeWidth))
when (io.rtcTick) { time := time + UInt(1) }
val ipi = Reg(init=Vec.fill(p(NTiles))(UInt(0, 32)))
val acq = Queue(io.tl.acquire, 1)
val addr = acq.bits.full_addr()(log2Ceil(PRCI.size)-1,0)
val read = acq.bits.isBuiltInType(Acquire.getType)
val rdata = Wire(init=UInt(0))
io.tl.grant.valid := acq.valid
acq.ready := io.tl.grant.ready
io.tl.grant.bits := Grant(
is_builtin_type = Bool(true),
g_type = acq.bits.getBuiltInGrantType(),
client_xact_id = acq.bits.client_xact_id,
manager_xact_id = UInt(0),
addr_beat = UInt(0),
data = rdata)
when (addr(log2Floor(PRCI.time))) {
require(log2Floor(PRCI.timecmp(p(NTiles)-1)) < log2Floor(PRCI.time))
rdata := load(Vec(time + UInt(0)), acq.bits)
}.elsewhen (addr >= PRCI.timecmp(0)) {
rdata := store(timecmp, acq.bits)
}.otherwise {
rdata := store(ipi, acq.bits) & Fill(tlDataBits/32, UInt(1, 32))
}
for ((tile, i) <- io.tiles zipWithIndex) {
tile.interrupts := io.interrupts(i)
tile.interrupts.msip := ipi(i)(0)
tile.interrupts.mtip := time >= timecmp(i)
tile.id := UInt(i)
}
// TODO generalize these to help other TL slaves
def load(v: Vec[UInt], acq: Acquire): UInt = {
val w = v.head.getWidth
val a = acq.full_addr()
require(isPow2(w) && w >= 8)
if (w > tlDataBits) {
(v(a(log2Up(w/8*v.size)-1,log2Up(w/8))) >> a(log2Up(w/8)-1,log2Up(tlDataBytes)))(tlDataBits-1,0)
} else {
val row = for (i <- 0 until v.size by tlDataBits/w)
yield Cat(v.slice(i, i + tlDataBits/w).reverse)
if (row.size == 1) row.head
else Vec(row)(a(log2Up(w/8*v.size)-1,log2Up(tlDataBytes)))
}
}
def store(v: Vec[UInt], acq: Acquire): UInt = {
val w = v.head.getWidth
require(isPow2(w) && w >= 8)
val a = acq.full_addr()
val rdata = load(v, acq)
val wdata = (acq.data & acq.full_wmask()) | (rdata & ~acq.full_wmask())
if (w <= tlDataBits) {
val word =
if (tlDataBits/w >= v.size) UInt(0)
else a(log2Up(w/8*v.size)-1,log2Up(tlDataBytes))
for (i <- 0 until v.size) {
when (acq.isBuiltInType(Acquire.putType) && word === i/(tlDataBits/w)) {
val base = i % (tlDataBits/w)
v(i) := wdata >> (w * (i % (tlDataBits/w)))
}
}
} else {
val i = a(log2Up(w/8*v.size)-1,log2Up(w/8))
val mask = FillInterleaved(tlDataBits, UIntToOH(a(log2Up(w/8)-1,log2Up(tlDataBytes))))
v(i) := (wdata & mask) | (v(i) & ~mask)
}
rdata
}
}

View File

@ -0,0 +1,66 @@
package uncore.devices
import Chisel._
import junctions._
import uncore.tilelink._
import uncore.util._
import cde.{Parameters, Field}
class ROMSlave(contents: Seq[Byte])(implicit val p: Parameters) extends Module
with HasTileLinkParameters
with HasAddrMapParameters {
val io = new ClientUncachedTileLinkIO().flip
val acq = Queue(io.acquire, 1)
val single_beat = acq.bits.isBuiltInType(Acquire.getType)
val multi_beat = acq.bits.isBuiltInType(Acquire.getBlockType)
assert(!acq.valid || single_beat || multi_beat, "unsupported ROMSlave operation")
val addr_beat = Reg(UInt())
when (io.grant.fire()) { addr_beat := addr_beat + UInt(1) }
when (io.acquire.fire()) { addr_beat := io.acquire.bits.addr_beat }
val byteWidth = tlDataBits / 8
val rows = (contents.size + byteWidth - 1)/byteWidth
val rom = Vec.tabulate(rows) { i =>
val slice = contents.slice(i*byteWidth, (i+1)*byteWidth)
UInt(slice.foldRight(BigInt(0)) { case (x,y) => (y << 8) + (x.toInt & 0xFF) }, byteWidth*8)
}
val raddr = Cat(acq.bits.addr_block, addr_beat)
val rdata = rom(if (rows == 1) UInt(0) else raddr(log2Up(rom.size)-1,0))
val last = !multi_beat || addr_beat === UInt(tlDataBeats-1)
io.grant.valid := acq.valid
acq.ready := io.grant.ready && last
io.grant.bits := Grant(
is_builtin_type = Bool(true),
g_type = acq.bits.getBuiltInGrantType(),
client_xact_id = acq.bits.client_xact_id,
manager_xact_id = UInt(0),
addr_beat = addr_beat,
data = rdata)
}
class NastiROM(contents: Seq[Byte])(implicit p: Parameters) extends Module {
val io = new NastiIO().flip
val ar = Queue(io.ar, 1)
// This assumes ROMs are in read-only parts of the address map.
// Reuse b_queue code from NastiErrorSlave if this assumption is bad.
when (ar.valid) { assert(ar.bits.len === UInt(0), "Can't burst-read from NastiROM") }
assert(!(io.aw.valid || io.w.valid), "Can't write to NastiROM")
io.aw.ready := Bool(false)
io.w.ready := Bool(false)
io.b.valid := Bool(false)
val byteWidth = io.r.bits.nastiXDataBits / 8
val rows = (contents.size + byteWidth - 1)/byteWidth
val rom = Vec.tabulate(rows) { i =>
val slice = contents.slice(i*byteWidth, (i+1)*byteWidth)
UInt(slice.foldRight(BigInt(0)) { case (x,y) => (y << 8) + (x.toInt & 0xFF) }, byteWidth*8)
}
val rdata = rom(if (rows == 1) UInt(0) else ar.bits.addr(log2Up(contents.size)-1,log2Up(byteWidth)))
io.r <> ar
io.r.bits := NastiReadDataChannel(ar.bits.id, rdata)
}

View File

@ -0,0 +1,196 @@
package uncore.tilelink
import Chisel._
import junctions._
import cde.{Parameters, Field}
/** Utility functions for constructing TileLinkIO arbiters */
trait TileLinkArbiterLike extends HasTileLinkParameters {
// Some shorthand type variables
type ManagerSourcedWithId = ManagerToClientChannel with HasClientTransactionId
type ClientSourcedWithId = ClientToManagerChannel with HasClientTransactionId
type ClientSourcedWithIdAndData = ClientToManagerChannel with HasClientTransactionId with HasTileLinkData
val arbN: Int // The number of ports on the client side
// These abstract funcs are filled in depending on whether the arbiter mucks with the
// outgoing client ids to track sourcing and then needs to revert them on the way back
def clientSourcedClientXactId(in: ClientSourcedWithId, id: Int): Bits
def managerSourcedClientXactId(in: ManagerSourcedWithId): Bits
def arbIdx(in: ManagerSourcedWithId): UInt
// The following functions are all wiring helpers for each of the different types of TileLink channels
def hookupClientSource[M <: ClientSourcedWithIdAndData](
clts: Seq[DecoupledIO[LogicalNetworkIO[M]]],
mngr: DecoupledIO[LogicalNetworkIO[M]]) {
def hasData(m: LogicalNetworkIO[M]) = m.payload.hasMultibeatData()
val arb = Module(new LockingRRArbiter(mngr.bits, arbN, tlDataBeats, Some(hasData _)))
clts.zipWithIndex.zip(arb.io.in).map{ case ((req, id), arb) => {
arb.valid := req.valid
arb.bits := req.bits
arb.bits.payload.client_xact_id := clientSourcedClientXactId(req.bits.payload, id)
req.ready := arb.ready
}}
mngr <> arb.io.out
}
def hookupClientSourceHeaderless[M <: ClientSourcedWithIdAndData](
clts: Seq[DecoupledIO[M]],
mngr: DecoupledIO[M]) {
def hasData(m: M) = m.hasMultibeatData()
val arb = Module(new LockingRRArbiter(mngr.bits, arbN, tlDataBeats, Some(hasData _)))
clts.zipWithIndex.zip(arb.io.in).map{ case ((req, id), arb) => {
arb.valid := req.valid
arb.bits := req.bits
arb.bits.client_xact_id := clientSourcedClientXactId(req.bits, id)
req.ready := arb.ready
}}
mngr <> arb.io.out
}
def hookupManagerSourceWithHeader[M <: ManagerToClientChannel](
clts: Seq[DecoupledIO[LogicalNetworkIO[M]]],
mngr: DecoupledIO[LogicalNetworkIO[M]]) {
mngr.ready := Bool(false)
for (i <- 0 until arbN) {
clts(i).valid := Bool(false)
when (mngr.bits.header.dst === UInt(i)) {
clts(i).valid := mngr.valid
mngr.ready := clts(i).ready
}
clts(i).bits := mngr.bits
}
}
def hookupManagerSourceWithId[M <: ManagerSourcedWithId](
clts: Seq[DecoupledIO[LogicalNetworkIO[M]]],
mngr: DecoupledIO[LogicalNetworkIO[M]]) {
mngr.ready := Bool(false)
for (i <- 0 until arbN) {
clts(i).valid := Bool(false)
when (arbIdx(mngr.bits.payload) === UInt(i)) {
clts(i).valid := mngr.valid
mngr.ready := clts(i).ready
}
clts(i).bits := mngr.bits
clts(i).bits.payload.client_xact_id := managerSourcedClientXactId(mngr.bits.payload)
}
}
def hookupManagerSourceHeaderlessWithId[M <: ManagerSourcedWithId](
clts: Seq[DecoupledIO[M]],
mngr: DecoupledIO[M]) {
mngr.ready := Bool(false)
for (i <- 0 until arbN) {
clts(i).valid := Bool(false)
when (arbIdx(mngr.bits) === UInt(i)) {
clts(i).valid := mngr.valid
mngr.ready := clts(i).ready
}
clts(i).bits := mngr.bits
clts(i).bits.client_xact_id := managerSourcedClientXactId(mngr.bits)
}
}
def hookupManagerSourceBroadcast[M <: Data](clts: Seq[DecoupledIO[M]], mngr: DecoupledIO[M]) {
clts.map{ _.valid := mngr.valid }
clts.map{ _.bits := mngr.bits }
mngr.ready := clts.map(_.ready).reduce(_&&_)
}
def hookupFinish[M <: LogicalNetworkIO[Finish]]( clts: Seq[DecoupledIO[M]], mngr: DecoupledIO[M]) {
val arb = Module(new RRArbiter(mngr.bits, arbN))
arb.io.in <> clts
mngr <> arb.io.out
}
}
/** Abstract base case for any Arbiters that have UncachedTileLinkIOs */
abstract class UncachedTileLinkIOArbiter(val arbN: Int)(implicit val p: Parameters) extends Module
with TileLinkArbiterLike {
val io = new Bundle {
val in = Vec(arbN, new UncachedTileLinkIO).flip
val out = new UncachedTileLinkIO
}
hookupClientSource(io.in.map(_.acquire), io.out.acquire)
hookupFinish(io.in.map(_.finish), io.out.finish)
hookupManagerSourceWithId(io.in.map(_.grant), io.out.grant)
}
/** Abstract base case for any Arbiters that have cached TileLinkIOs */
abstract class TileLinkIOArbiter(val arbN: Int)(implicit val p: Parameters) extends Module
with TileLinkArbiterLike {
val io = new Bundle {
val in = Vec(arbN, new TileLinkIO).flip
val out = new TileLinkIO
}
hookupClientSource(io.in.map(_.acquire), io.out.acquire)
hookupClientSource(io.in.map(_.release), io.out.release)
hookupFinish(io.in.map(_.finish), io.out.finish)
hookupManagerSourceBroadcast(io.in.map(_.probe), io.out.probe)
hookupManagerSourceWithId(io.in.map(_.grant), io.out.grant)
}
/** Appends the port index of the arbiter to the client_xact_id */
trait AppendsArbiterId extends TileLinkArbiterLike {
def clientSourcedClientXactId(in: ClientSourcedWithId, id: Int) =
Cat(in.client_xact_id, UInt(id, log2Up(arbN)))
def managerSourcedClientXactId(in: ManagerSourcedWithId) = {
/* This shouldn't be necessary, but Chisel3 doesn't emit correct Verilog
* when right shifting by too many bits. See
* https://github.com/ucb-bar/firrtl/issues/69 */
if (in.client_xact_id.getWidth > log2Up(arbN))
in.client_xact_id >> log2Up(arbN)
else
UInt(0)
}
def arbIdx(in: ManagerSourcedWithId) = in.client_xact_id(log2Up(arbN)-1,0)
}
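// Worked example (hypothetical widths): with arbN = 4, two id bits are appended.
// Client port 2 issuing client_xact_id = 5 sends Cat(5, 2) = 0b10110 = 22 upstream;
// on the grant path arbIdx recovers port 2 from the low bits and
// managerSourcedClientXactId restores 5 by shifting them off.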
/** Uses the client_xact_id as is (assumes it has been set to port index) */
trait PassesId extends TileLinkArbiterLike {
def clientSourcedClientXactId(in: ClientSourcedWithId, id: Int) = in.client_xact_id
def managerSourcedClientXactId(in: ManagerSourcedWithId) = in.client_xact_id
def arbIdx(in: ManagerSourcedWithId) = in.client_xact_id
}
/** Overwrites some default client_xact_id with the port idx */
trait UsesNewId extends TileLinkArbiterLike {
def clientSourcedClientXactId(in: ClientSourcedWithId, id: Int) = UInt(id, log2Up(arbN))
def managerSourcedClientXactId(in: ManagerSourcedWithId) = UInt(0)
def arbIdx(in: ManagerSourcedWithId) = in.client_xact_id
}
// Now we can mix in the various id-generation traits to make concrete arbiter classes
class UncachedTileLinkIOArbiterThatAppendsArbiterId(val n: Int)(implicit p: Parameters) extends UncachedTileLinkIOArbiter(n)(p) with AppendsArbiterId
class UncachedTileLinkIOArbiterThatPassesId(val n: Int)(implicit p: Parameters) extends UncachedTileLinkIOArbiter(n)(p) with PassesId
class UncachedTileLinkIOArbiterThatUsesNewId(val n: Int)(implicit p: Parameters) extends UncachedTileLinkIOArbiter(n)(p) with UsesNewId
class TileLinkIOArbiterThatAppendsArbiterId(val n: Int)(implicit p: Parameters) extends TileLinkIOArbiter(n)(p) with AppendsArbiterId
class TileLinkIOArbiterThatPassesId(val n: Int)(implicit p: Parameters) extends TileLinkIOArbiter(n)(p) with PassesId
class TileLinkIOArbiterThatUsesNewId(val n: Int)(implicit p: Parameters) extends TileLinkIOArbiter(n)(p) with UsesNewId
/** Concrete uncached client-side arbiter that appends the arbiter's port id to client_xact_id */
class ClientUncachedTileLinkIOArbiter(val arbN: Int)(implicit val p: Parameters) extends Module with TileLinkArbiterLike with AppendsArbiterId {
val io = new Bundle {
val in = Vec(arbN, new ClientUncachedTileLinkIO).flip
val out = new ClientUncachedTileLinkIO
}
if (arbN > 1) {
hookupClientSourceHeaderless(io.in.map(_.acquire), io.out.acquire)
hookupManagerSourceHeaderlessWithId(io.in.map(_.grant), io.out.grant)
} else { io.out <> io.in.head }
}
/** Concrete client-side arbiter that appends the arbiter's port id to client_xact_id */
class ClientTileLinkIOArbiter(val arbN: Int)(implicit val p: Parameters) extends Module with TileLinkArbiterLike with AppendsArbiterId {
val io = new Bundle {
val in = Vec(arbN, new ClientTileLinkIO).flip
val out = new ClientTileLinkIO
}
if (arbN > 1) {
hookupClientSourceHeaderless(io.in.map(_.acquire), io.out.acquire)
hookupClientSourceHeaderless(io.in.map(_.release), io.out.release)
hookupManagerSourceBroadcast(io.in.map(_.probe), io.out.probe)
hookupManagerSourceHeaderlessWithId(io.in.map(_.grant), io.out.grant)
} else { io.out <> io.in.head }
}

View File

@ -0,0 +1,971 @@
// See LICENSE for license details.
package uncore.tilelink
import Chisel._
import junctions._
import uncore.coherence.CoherencePolicy
import uncore.util._
import scala.math.max
import uncore.constants._
import cde.{Parameters, Field}
case object CacheBlockOffsetBits extends Field[Int]
case object AmoAluOperandBits extends Field[Int]
case object TLId extends Field[String]
case class TLKey(id: String) extends Field[TileLinkParameters]
/** Parameters exposed to the top-level design, set based on
* external requirements or design space exploration
*
* Coherency policy used to define custom message types
* Number of manager agents
* Number of client agents that cache data and use custom [[uncore.Acquire]] types
* Number of client agents that do not cache data and use built-in [[uncore.Acquire]] types
* Maximum number of unique outstanding transactions per client
* Maximum number of clients multiplexed onto a single port
* Maximum number of unique outstanding transactions per manager
* Width of cache block addresses
* Total amount of data per cache block
* Number of data beats per cache block
**/
case class TileLinkParameters(
coherencePolicy: CoherencePolicy,
nManagers: Int,
nCachingClients: Int,
nCachelessClients: Int,
maxClientXacts: Int,
maxClientsPerPort: Int,
maxManagerXacts: Int,
dataBits: Int,
dataBeats: Int = 4,
overrideDataBitsPerBeat: Option[Int] = None
) {
val nClients = nCachingClients + nCachelessClients
val writeMaskBits: Int = ((dataBits / dataBeats) - 1) / 8 + 1
val dataBitsPerBeat: Int = overrideDataBitsPerBeat.getOrElse(dataBits / dataBeats)
}
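// Worked example (hypothetical parameters): dataBits = 512 with the default
// dataBeats = 4 yields dataBitsPerBeat = 128 and writeMaskBits = 16, i.e. one
// write-mask bit per byte of a beat; overrideDataBitsPerBeat replaces only the
// beat width, not the mask computation.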
/** Utility trait for building Modules and Bundles that use TileLink parameters */
trait HasTileLinkParameters {
implicit val p: Parameters
val tlExternal = p(TLKey(p(TLId)))
val tlCoh = tlExternal.coherencePolicy
val tlNManagers = tlExternal.nManagers
val tlNCachingClients = tlExternal.nCachingClients
val tlNCachelessClients = tlExternal.nCachelessClients
val tlNClients = tlExternal.nClients
val tlClientIdBits = log2Up(tlNClients)
val tlManagerIdBits = log2Up(tlNManagers)
val tlMaxClientXacts = tlExternal.maxClientXacts
val tlMaxClientsPerPort = tlExternal.maxClientsPerPort
val tlMaxManagerXacts = tlExternal.maxManagerXacts
val tlClientXactIdBits = log2Up(tlMaxClientXacts*tlMaxClientsPerPort)
val tlManagerXactIdBits = log2Up(tlMaxManagerXacts)
val tlBlockAddrBits = p(PAddrBits) - p(CacheBlockOffsetBits)
val tlDataBeats = tlExternal.dataBeats
val tlDataBits = tlExternal.dataBitsPerBeat
val tlDataBytes = tlDataBits/8
val tlWriteMaskBits = tlExternal.writeMaskBits
val tlBeatAddrBits = log2Up(tlDataBeats)
val tlByteAddrBits = log2Up(tlWriteMaskBits)
val tlMemoryOpcodeBits = M_SZ
val tlMemoryOperandSizeBits = log2Ceil(log2Ceil(tlWriteMaskBits) + 1)
val tlAcquireTypeBits = max(log2Up(Acquire.nBuiltInTypes),
tlCoh.acquireTypeWidth)
val tlAcquireUnionBits = max(tlWriteMaskBits,
(tlByteAddrBits +
tlMemoryOperandSizeBits +
tlMemoryOpcodeBits)) + 1
val tlGrantTypeBits = max(log2Up(Grant.nBuiltInTypes),
tlCoh.grantTypeWidth) + 1
/** Whether the underlying physical network preserves point-to-point ordering of messages */
val tlNetworkPreservesPointToPointOrdering = false
val tlNetworkDoesNotInterleaveBeats = true
val amoAluOperandBits = p(AmoAluOperandBits)
val amoAluOperandBytes = amoAluOperandBits/8
}
abstract class TLModule(implicit val p: Parameters) extends Module
with HasTileLinkParameters
abstract class TLBundle(implicit val p: Parameters) extends junctions.ParameterizedBundle()(p)
with HasTileLinkParameters
/** Base trait for all TileLink channels */
abstract class TileLinkChannel(implicit p: Parameters) extends TLBundle()(p) {
def hasData(dummy: Int = 0): Bool
def hasMultibeatData(dummy: Int = 0): Bool
}
/** Directionality of message channel. Used to hook up logical network ports to physical network ports */
abstract class ClientToManagerChannel(implicit p: Parameters) extends TileLinkChannel()(p)
/** Directionality of message channel. Used to hook up logical network ports to physical network ports */
abstract class ManagerToClientChannel(implicit p: Parameters) extends TileLinkChannel()(p)
/** Directionality of message channel. Used to hook up logical network ports to physical network ports */
abstract class ClientToClientChannel(implicit p: Parameters) extends TileLinkChannel()(p) // Unused for now
/** Common signals that are used in multiple channels.
* These traits are useful for type parameterizing bundle wiring functions.
*/
/** Address of a cache block. */
trait HasCacheBlockAddress extends HasTileLinkParameters {
val addr_block = UInt(width = tlBlockAddrBits)
def conflicts(that: HasCacheBlockAddress) = this.addr_block === that.addr_block
def conflicts(addr: UInt) = this.addr_block === addr
}
/** Sub-block address or beat id of multi-beat data */
trait HasTileLinkBeatId extends HasTileLinkParameters {
val addr_beat = UInt(width = tlBeatAddrBits)
}
/* Client-side transaction id. Usually Miss Status Handling Register File index */
trait HasClientTransactionId extends HasTileLinkParameters {
val client_xact_id = Bits(width = tlClientXactIdBits)
}
/** Manager-side transaction id. Usually Transaction Status Handling Register File index. */
trait HasManagerTransactionId extends HasTileLinkParameters {
val manager_xact_id = Bits(width = tlManagerXactIdBits)
}
/** A single beat of cache block data */
trait HasTileLinkData extends HasTileLinkBeatId {
val data = UInt(width = tlDataBits)
def hasData(dummy: Int = 0): Bool
def hasMultibeatData(dummy: Int = 0): Bool
def first(dummy: Int = 0): Bool = !hasMultibeatData() || addr_beat === UInt(0)
def last(dummy: Int = 0): Bool = !hasMultibeatData() || addr_beat === UInt(tlDataBeats-1)
}
/** An entire cache block of data */
trait HasTileLinkBlock extends HasTileLinkParameters {
val data_buffer = Vec(tlDataBeats, UInt(width = tlDataBits))
val wmask_buffer = Vec(tlDataBeats, UInt(width = tlWriteMaskBits))
}
/** The id of a client source or destination. Used in managers. */
trait HasClientId extends HasTileLinkParameters {
val client_id = UInt(width = tlClientIdBits)
}
trait HasManagerId extends HasTileLinkParameters {
val manager_id = UInt(width = tlManagerIdBits)
}
trait HasAcquireUnion extends HasTileLinkParameters {
val union = Bits(width = tlAcquireUnionBits)
// Utility funcs for accessing subblock union:
def isBuiltInType(t: UInt): Bool
val opCodeOff = 1
val opSizeOff = tlMemoryOpcodeBits + opCodeOff
val addrByteOff = tlMemoryOperandSizeBits + opSizeOff
val addrByteMSB = tlByteAddrBits + addrByteOff
/** Hint whether to allocate the block in any intervening caches */
def allocate(dummy: Int = 0) = union(0)
/** Op code for [[uncore.PutAtomic]] operations */
def op_code(dummy: Int = 0) = Mux(
isBuiltInType(Acquire.putType) || isBuiltInType(Acquire.putBlockType),
M_XWR, union(opSizeOff-1, opCodeOff))
/** Operand size for [[uncore.PutAtomic]] */
def op_size(dummy: Int = 0) = union(addrByteOff-1, opSizeOff)
/** Byte address for [[uncore.PutAtomic]] operand */
def addr_byte(dummy: Int = 0) = union(addrByteMSB-1, addrByteOff)
def amo_offset(dummy: Int = 0) =
if (tlByteAddrBits > log2Up(amoAluOperandBytes)) addr_byte()(tlByteAddrBits-1, log2Up(amoAluOperandBytes))
else UInt(0)
/** Bit offset of [[uncore.PutAtomic]] operand */
def amo_shift_bytes(dummy: Int = 0) = UInt(amoAluOperandBytes)*amo_offset()
/** Write mask for [[uncore.Put]], [[uncore.PutBlock]], [[uncore.PutAtomic]] */
def wmask(dummy: Int = 0): UInt = {
val is_amo = isBuiltInType(Acquire.putAtomicType)
val amo_mask = if (tlByteAddrBits > log2Up(amoAluOperandBytes))
FillInterleaved(amoAluOperandBytes, UIntToOH(amo_offset()))
else Acquire.fullWriteMask
val is_put = isBuiltInType(Acquire.putBlockType) || isBuiltInType(Acquire.putType)
val put_mask = union(tlWriteMaskBits, 1)
Mux(is_amo, amo_mask, Mux(is_put, put_mask, UInt(0)))
}
/** Full, beat-sized writemask */
def full_wmask(dummy: Int = 0) = FillInterleaved(8, wmask())
  /** Does this message have a partial writemask? */
def hasPartialWritemask(dummy: Int = 0): Bool = wmask() =/= Acquire.fullWriteMask
}
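/* Worked example (illustrative, not part of this commit): on a link with
 * 64-bit beats and an 8-byte writemask, so tlByteAddrBits = 3,
 * tlMemoryOperandSizeBits = 2 and (assuming) tlMemoryOpcodeBits = 5, the
 * union field of a PutAtomic packs, from the LSB up:
 *   union(0)     = alloc
 *   union(5, 1)  = op_code   (opCodeOff = 1,  opSizeOff = 6)
 *   union(7, 6)  = op_size   (addrByteOff = 8)
 *   union(10, 8) = addr_byte (addrByteMSB = 11)
 */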
trait HasAcquireType extends HasTileLinkParameters {
val is_builtin_type = Bool()
val a_type = UInt(width = tlAcquireTypeBits)
/** Message type equality */
def is(t: UInt) = a_type === t //TODO: make this more opaque; def ===?
/** Is this message a built-in or custom type */
def isBuiltInType(dummy: Int = 0): Bool = is_builtin_type
/** Is this message a particular built-in type */
def isBuiltInType(t: UInt): Bool = is_builtin_type && a_type === t
/** Does this message refer to subblock operands using info in the Acquire.union subbundle */
def isSubBlockType(dummy: Int = 0): Bool = isBuiltInType() && a_type.isOneOf(Acquire.typesOnSubBlocks)
/** Is this message a built-in prefetch message */
def isPrefetch(dummy: Int = 0): Bool = isBuiltInType() &&
(is(Acquire.getPrefetchType) || is(Acquire.putPrefetchType))
/** Is this message a built-in atomic message */
def isAtomic(dummy: Int = 0): Bool = isBuiltInType() && is(Acquire.putAtomicType)
/** Is this message a built-in read message */
def isGet(dummy: Int = 0): Bool = isBuiltInType() && (is(Acquire.getType) || is(Acquire.getBlockType))
/** Does this message contain data? Assumes that no custom message types have data. */
def hasData(dummy: Int = 0): Bool = isBuiltInType() && a_type.isOneOf(Acquire.typesWithData)
/** Does this message contain multiple beats of data? Assumes that no custom message types have data. */
def hasMultibeatData(dummy: Int = 0): Bool = Bool(tlDataBeats > 1) && isBuiltInType() &&
a_type.isOneOf(Acquire.typesWithMultibeatData)
/** Mapping between each built-in Acquire type and a built-in Grant type. */
def getBuiltInGrantType(dummy: Int = 0): UInt = Acquire.getBuiltInGrantType(this.a_type)
}
trait HasProbeType extends HasTileLinkParameters {
val p_type = UInt(width = tlCoh.probeTypeWidth)
def is(t: UInt) = p_type === t
def hasData(dummy: Int = 0) = Bool(false)
def hasMultibeatData(dummy: Int = 0) = Bool(false)
}
trait MightBeVoluntary {
def isVoluntary(dummy: Int = 0): Bool
}
trait HasReleaseType extends HasTileLinkParameters with MightBeVoluntary {
val voluntary = Bool()
val r_type = UInt(width = tlCoh.releaseTypeWidth)
def is(t: UInt) = r_type === t
def hasData(dummy: Int = 0) = r_type.isOneOf(tlCoh.releaseTypesWithData)
def hasMultibeatData(dummy: Int = 0) = Bool(tlDataBeats > 1) &&
r_type.isOneOf(tlCoh.releaseTypesWithData)
def isVoluntary(dummy: Int = 0) = voluntary
def requiresAck(dummy: Int = 0) = !Bool(tlNetworkPreservesPointToPointOrdering)
}
trait HasGrantType extends HasTileLinkParameters with MightBeVoluntary {
val is_builtin_type = Bool()
val g_type = UInt(width = tlGrantTypeBits)
// Helper funcs
def isBuiltInType(dummy: Int = 0): Bool = is_builtin_type
def isBuiltInType(t: UInt): Bool = is_builtin_type && g_type === t
def is(t: UInt):Bool = g_type === t
def hasData(dummy: Int = 0): Bool = Mux(isBuiltInType(),
g_type.isOneOf(Grant.typesWithData),
g_type.isOneOf(tlCoh.grantTypesWithData))
def hasMultibeatData(dummy: Int = 0): Bool =
Bool(tlDataBeats > 1) && Mux(isBuiltInType(),
g_type.isOneOf(Grant.typesWithMultibeatData),
g_type.isOneOf(tlCoh.grantTypesWithData))
def isVoluntary(dummy: Int = 0): Bool = isBuiltInType() && (g_type === Grant.voluntaryAckType)
def requiresAck(dummy: Int = 0): Bool = !Bool(tlNetworkPreservesPointToPointOrdering) && !isVoluntary()
}
/** TileLink channel bundle definitions */
/** The Acquire channel is used to initiate coherence protocol transactions in
* order to gain access to a cache block's data with certain permissions
* enabled. Messages sent over this channel may be custom types defined by
 * a [[uncore.CoherencePolicy]] for cached data accesses or may be built-in types
* used for uncached data accesses. Acquires may contain data for Put or
* PutAtomic built-in types. After sending an Acquire, clients must
* wait for a manager to send them a [[uncore.Grant]] message in response.
*/
class AcquireMetadata(implicit p: Parameters) extends ClientToManagerChannel
with HasCacheBlockAddress
with HasClientTransactionId
with HasTileLinkBeatId
with HasAcquireType
with HasAcquireUnion {
/** Complete physical address for block, beat or operand */
def full_addr(dummy: Int = 0) =
Cat(this.addr_block, this.addr_beat,
Mux(isBuiltInType() && this.a_type.isOneOf(Acquire.typesWithAddrByte),
this.addr_byte(), UInt(0, tlByteAddrBits)))
}
/** [[uncore.AcquireMetadata]] with an extra field containing the data beat */
class Acquire(implicit p: Parameters) extends AcquireMetadata
with HasTileLinkData
/** [[uncore.AcquireMetadata]] with an extra field containing the entire cache block */
class BufferedAcquire(implicit p: Parameters) extends AcquireMetadata
with HasTileLinkBlock
/** [[uncore.Acquire]] with an extra field stating its source id */
class AcquireFromSrc(implicit p: Parameters) extends Acquire
with HasClientId
/** [[uncore.BufferedAcquire]] with an extra field stating its source id */
class BufferedAcquireFromSrc(implicit p: Parameters) extends BufferedAcquire
with HasClientId
/** Used to track metadata for transactions where multiple secondary misses have been merged
* and handled by a single transaction tracker.
*/
class SecondaryMissInfo(implicit p: Parameters) extends TLBundle
with HasClientTransactionId
with HasTileLinkBeatId
with HasClientId
with HasAcquireType
/** Contains definitions of the built-in Acquire types and a factory
* for [[uncore.Acquire]]
*
* In general you should avoid using this factory directly and use
* [[uncore.ClientMetadata.makeAcquire]] for custom cached Acquires and
* [[uncore.Get]], [[uncore.Put]], etc. for built-in uncached Acquires.
*
* @param is_builtin_type built-in or custom type message?
* @param a_type built-in type enum or custom type enum
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
* @param addr_beat sub-block address (which beat)
* @param data data being put outwards
* @param union additional fields used for uncached types
*/
object Acquire {
val nBuiltInTypes = 7
//TODO: Use Enum
def getType = UInt("b000") // Get a single beat of data
def getBlockType = UInt("b001") // Get a whole block of data
def putType = UInt("b010") // Put a single beat of data
def putBlockType = UInt("b011") // Put a whole block of data
def putAtomicType = UInt("b100") // Perform an atomic memory op
def getPrefetchType = UInt("b101") // Prefetch a whole block of data
def putPrefetchType = UInt("b110") // Prefetch a whole block of data, with intent to write
def typesWithData = Vec(putType, putBlockType, putAtomicType)
def typesWithMultibeatData = Vec(putBlockType)
def typesOnSubBlocks = Vec(putType, getType, putAtomicType)
def typesWithAddrByte = Vec(getType, putAtomicType)
/** Mapping between each built-in Acquire type and a built-in Grant type. */
def getBuiltInGrantType(a_type: UInt): UInt = {
MuxLookup(a_type, Grant.putAckType, Array(
Acquire.getType -> Grant.getDataBeatType,
Acquire.getBlockType -> Grant.getDataBlockType,
Acquire.putType -> Grant.putAckType,
Acquire.putBlockType -> Grant.putAckType,
Acquire.putAtomicType -> Grant.getDataBeatType,
Acquire.getPrefetchType -> Grant.prefetchAckType,
Acquire.putPrefetchType -> Grant.prefetchAckType))
}
def makeUnion(
a_type: UInt,
addr_byte: UInt,
operand_size: UInt,
opcode: UInt,
wmask: UInt,
alloc: Bool)
(implicit p: Parameters): UInt = {
val tlExternal = p(TLKey(p(TLId)))
val tlWriteMaskBits = tlExternal.writeMaskBits
val tlByteAddrBits = log2Up(tlWriteMaskBits)
val tlMemoryOperandSizeBits = log2Ceil(log2Ceil(tlWriteMaskBits) + 1)
// These had better be the right size when we cat them together!
val my_addr_byte = (UInt(0, tlByteAddrBits) | addr_byte)(tlByteAddrBits-1, 0)
val my_operand_size = (UInt(0, tlMemoryOperandSizeBits) | operand_size)(tlMemoryOperandSizeBits-1, 0)
val my_opcode = (UInt(0, M_SZ) | opcode)(M_SZ-1, 0)
val my_wmask = (UInt(0, tlWriteMaskBits) | wmask)(tlWriteMaskBits-1, 0)
MuxLookup(a_type, UInt(0), Array(
Acquire.getType -> Cat(my_addr_byte, my_operand_size, my_opcode, alloc),
Acquire.getBlockType -> Cat(my_operand_size, my_opcode, alloc),
Acquire.putType -> Cat(my_wmask, alloc),
Acquire.putBlockType -> Cat(my_wmask, alloc),
Acquire.putAtomicType -> Cat(my_addr_byte, my_operand_size, my_opcode, alloc),
Acquire.getPrefetchType -> Cat(M_XRD, alloc),
Acquire.putPrefetchType -> Cat(M_XWR, alloc)))
}
def fullWriteMask(implicit p: Parameters) = SInt(-1, width = p(TLKey(p(TLId))).writeMaskBits).asUInt
def fullOperandSize(implicit p: Parameters) = {
val dataBits = p(TLKey(p(TLId))).dataBitsPerBeat
UInt(log2Ceil(dataBits / 8))
}
// Most generic constructor
def apply(
is_builtin_type: Bool,
a_type: Bits,
client_xact_id: UInt,
addr_block: UInt,
addr_beat: UInt = UInt(0),
data: UInt = UInt(0),
union: UInt = UInt(0))
(implicit p: Parameters): Acquire = {
val acq = Wire(new Acquire)
acq.is_builtin_type := is_builtin_type
acq.a_type := a_type
acq.client_xact_id := client_xact_id
acq.addr_block := addr_block
acq.addr_beat := addr_beat
acq.data := data
acq.union := union
acq
}
// Copy constructor
def apply(a: Acquire): Acquire = {
val acq = Wire(new Acquire()(a.p))
acq := a
acq
}
}
object BuiltInAcquireBuilder {
def apply(
a_type: UInt,
client_xact_id: UInt,
addr_block: UInt,
addr_beat: UInt = UInt(0),
data: UInt = UInt(0),
addr_byte: UInt = UInt(0),
operand_size: UInt = UInt(0),
opcode: UInt = UInt(0),
wmask: UInt = UInt(0),
alloc: Bool = Bool(true))
(implicit p: Parameters): Acquire = {
Acquire(
is_builtin_type = Bool(true),
a_type = a_type,
client_xact_id = client_xact_id,
addr_block = addr_block,
addr_beat = addr_beat,
data = data,
union = Acquire.makeUnion(a_type, addr_byte, operand_size, opcode, wmask, alloc))
}
}
/** Get a single beat of data from the outer memory hierarchy
*
 * The client can hint whether the block containing this beat should be
* allocated in the intervening levels of the hierarchy.
*
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
* @param addr_beat sub-block address (which beat)
* @param addr_byte sub-block address (which byte)
* @param operand_size {byte, half, word, double} from [[uncore.MemoryOpConstants]]
* @param alloc hint whether the block should be allocated in intervening caches
*/
object Get {
def apply(
client_xact_id: UInt,
addr_block: UInt,
addr_beat: UInt,
alloc: Bool = Bool(true))
(implicit p: Parameters): Acquire = {
BuiltInAcquireBuilder(
a_type = Acquire.getType,
client_xact_id = client_xact_id,
addr_block = addr_block,
addr_beat = addr_beat,
operand_size = Acquire.fullOperandSize,
opcode = M_XRD,
alloc = alloc)
}
def apply(
client_xact_id: UInt,
addr_block: UInt,
addr_beat: UInt,
addr_byte: UInt,
operand_size: UInt,
alloc: Bool)
(implicit p: Parameters): Acquire = {
BuiltInAcquireBuilder(
a_type = Acquire.getType,
client_xact_id = client_xact_id,
addr_block = addr_block,
addr_beat = addr_beat,
addr_byte = addr_byte,
operand_size = operand_size,
opcode = M_XRD,
alloc = alloc)
}
}
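/* Example use (illustrative sketch, not part of this commit): a client
 * issuing a full-width read of beat 0 of some block, assuming an implicit
 * Parameters with a valid TLId is in scope and that `want_read` and
 * `read_block` are client-defined signals:
 *
 *   io.mem.acquire.valid := want_read
 *   io.mem.acquire.bits := Get(
 *     client_xact_id = UInt(0),
 *     addr_block = read_block,
 *     addr_beat = UInt(0))
 */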
/** Get a whole cache block of data from the outer memory hierarchy
*
* The client can hint whether the block should be allocated in the
* intervening levels of the hierarchy.
*
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
* @param alloc hint whether the block should be allocated in intervening caches
*/
object GetBlock {
def apply(
client_xact_id: UInt = UInt(0),
addr_block: UInt,
alloc: Bool = Bool(true))
(implicit p: Parameters): Acquire = {
BuiltInAcquireBuilder(
a_type = Acquire.getBlockType,
client_xact_id = client_xact_id,
addr_block = addr_block,
operand_size = Acquire.fullOperandSize,
opcode = M_XRD,
alloc = alloc)
}
}
/** Prefetch a cache block into the next-outermost level of the memory hierarchy
* with read permissions.
*
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
*/
object GetPrefetch {
def apply(
client_xact_id: UInt,
addr_block: UInt)
(implicit p: Parameters): Acquire = {
BuiltInAcquireBuilder(
a_type = Acquire.getPrefetchType,
client_xact_id = client_xact_id,
addr_block = addr_block)
}
}
/** Put a single beat of data into the outer memory hierarchy
*
* The block will be allocated in the next-outermost level of the hierarchy.
*
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
* @param addr_beat sub-block address (which beat)
 * @param data data to be written
* @param wmask per-byte write mask for this beat
* @param alloc hint whether the block should be allocated in intervening caches
*/
object Put {
def apply(
client_xact_id: UInt,
addr_block: UInt,
addr_beat: UInt,
data: UInt,
    wmask: Option[UInt] = None,
alloc: Bool = Bool(true))
(implicit p: Parameters): Acquire = {
BuiltInAcquireBuilder(
a_type = Acquire.putType,
addr_block = addr_block,
addr_beat = addr_beat,
client_xact_id = client_xact_id,
data = data,
wmask = wmask.getOrElse(Acquire.fullWriteMask),
alloc = alloc)
}
}
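/* Example use (illustrative sketch, not part of this commit): writing only
 * the low four bytes of a beat by passing a partial writemask (assumes an
 * 8-byte writemask and client-defined `wdata` and `write_block`):
 *
 *   io.mem.acquire.bits := Put(
 *     client_xact_id = UInt(0),
 *     addr_block = write_block,
 *     addr_beat = UInt(0),
 *     data = wdata,
 *     wmask = Some(UInt("h0f")))
 */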
/** Put a whole cache block of data into the outer memory hierarchy
*
* If the write mask is not full, the block will be allocated in the
* next-outermost level of the hierarchy. If the write mask is full, the
* client can hint whether the block should be allocated or not.
*
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
* @param addr_beat sub-block address (which beat of several)
 * @param data data to be written
* @param wmask per-byte write mask for this beat
* @param alloc hint whether the block should be allocated in intervening caches
*/
object PutBlock {
def apply(
client_xact_id: UInt,
addr_block: UInt,
addr_beat: UInt,
data: UInt,
wmask: Option[UInt] = None,
alloc: Bool = Bool(true))
(implicit p: Parameters): Acquire = {
BuiltInAcquireBuilder(
a_type = Acquire.putBlockType,
client_xact_id = client_xact_id,
addr_block = addr_block,
addr_beat = addr_beat,
data = data,
wmask = wmask.getOrElse(Acquire.fullWriteMask),
alloc = alloc)
}
}
/** Prefetch a cache block into the next-outermost level of the memory hierarchy
* with write permissions.
*
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
*/
object PutPrefetch {
def apply(
client_xact_id: UInt,
addr_block: UInt)
(implicit p: Parameters): Acquire = {
BuiltInAcquireBuilder(
a_type = Acquire.putPrefetchType,
client_xact_id = client_xact_id,
addr_block = addr_block)
}
}
/** Perform an atomic memory operation in the next-outermost level of the memory hierarchy
*
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
* @param addr_beat sub-block address (within which beat)
* @param addr_byte sub-block address (which byte)
* @param atomic_opcode {swap, add, xor, and, min, max, minu, maxu} from [[uncore.MemoryOpConstants]]
* @param operand_size {byte, half, word, double} from [[uncore.MemoryOpConstants]]
* @param data source operand data
*/
object PutAtomic {
def apply(
client_xact_id: UInt,
addr_block: UInt,
addr_beat: UInt,
addr_byte: UInt,
atomic_opcode: UInt,
operand_size: UInt,
data: UInt)
(implicit p: Parameters): Acquire = {
BuiltInAcquireBuilder(
a_type = Acquire.putAtomicType,
client_xact_id = client_xact_id,
addr_block = addr_block,
addr_beat = addr_beat,
data = data,
addr_byte = addr_byte,
operand_size = operand_size,
opcode = atomic_opcode)
}
}
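/* Example use (illustrative sketch, not part of this commit): a word-sized
 * atomic add to byte offset 4 of beat 0, using the opcode and operand-size
 * encodings from [[uncore.MemoryOpConstants]]; `amo_block` and `amo_operand`
 * are assumed client-defined signals:
 *
 *   io.mem.acquire.bits := PutAtomic(
 *     client_xact_id = UInt(0),
 *     addr_block = amo_block,
 *     addr_beat = UInt(0),
 *     addr_byte = UInt(4),
 *     atomic_opcode = M_XA_ADD,
 *     operand_size = MT_W,
 *     data = amo_operand)
 */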
/** The Probe channel is used to force clients to release data or cede permissions
* on a cache block. Clients respond to Probes with [[uncore.Release]] messages.
* The available types of Probes are customized by a particular
* [[uncore.CoherencePolicy]].
*/
class Probe(implicit p: Parameters) extends ManagerToClientChannel
with HasCacheBlockAddress
with HasProbeType
/** [[uncore.Probe]] with an extra field stating its destination id */
class ProbeToDst(implicit p: Parameters) extends Probe()(p) with HasClientId
/** Contains factories for [[uncore.Probe]] and [[uncore.ProbeToDst]]
*
* In general you should avoid using these factories directly and use
* [[uncore.ManagerMetadata.makeProbe(UInt,Acquire)* makeProbe]] instead.
*
* @param dst id of client to which probe should be sent
* @param p_type custom probe type
* @param addr_block address of the cache block
*/
object Probe {
def apply(p_type: UInt, addr_block: UInt)(implicit p: Parameters): Probe = {
val prb = Wire(new Probe)
prb.p_type := p_type
prb.addr_block := addr_block
prb
}
def apply(dst: UInt, p_type: UInt, addr_block: UInt)(implicit p: Parameters): ProbeToDst = {
val prb = Wire(new ProbeToDst)
prb.client_id := dst
prb.p_type := p_type
prb.addr_block := addr_block
prb
}
}
/** The Release channel is used to release data or permission back to the manager
* in response to [[uncore.Probe]] messages. It can also be used to voluntarily
* write back data, for example in the event that dirty data must be evicted on
* a cache miss. The available types of Release messages are always customized by
* a particular [[uncore.CoherencePolicy]]. Releases may contain data or may be
* simple acknowledgements. Voluntary Releases are acknowledged with [[uncore.Grant Grants]].
*/
class ReleaseMetadata(implicit p: Parameters) extends ClientToManagerChannel
with HasTileLinkBeatId
with HasCacheBlockAddress
with HasClientTransactionId
with HasReleaseType {
def full_addr(dummy: Int = 0) = Cat(this.addr_block, this.addr_beat, UInt(0, width = tlByteAddrBits))
}
/** [[uncore.ReleaseMetadata]] with an extra field containing the data beat */
class Release(implicit p: Parameters) extends ReleaseMetadata
with HasTileLinkData
/** [[uncore.ReleaseMetadata]] with an extra field containing the entire cache block */
class BufferedRelease(implicit p: Parameters) extends ReleaseMetadata
with HasTileLinkBlock
/** [[uncore.Release]] with an extra field stating its source id */
class ReleaseFromSrc(implicit p: Parameters) extends Release
with HasClientId
/** [[uncore.BufferedRelease]] with an extra field stating its source id */
class BufferedReleaseFromSrc(implicit p: Parameters) extends BufferedRelease
with HasClientId
/** Contains a [[uncore.Release]] factory
*
* In general you should avoid using this factory directly and use
* [[uncore.ClientMetadata.makeRelease]] instead.
*
* @param voluntary is this a voluntary writeback
* @param r_type type enum defined by coherence protocol
* @param client_xact_id client's transaction id
* @param addr_block address of the cache block
* @param addr_beat beat id of the data
* @param data data being written back
*/
object Release {
def apply(
voluntary: Bool,
r_type: UInt,
client_xact_id: UInt,
addr_block: UInt,
addr_beat: UInt,
data: UInt)
(implicit p: Parameters): Release = {
val rel = Wire(new Release)
rel.r_type := r_type
rel.client_xact_id := client_xact_id
rel.addr_block := addr_block
rel.addr_beat := addr_beat
rel.data := data
rel.voluntary := voluntary
rel
}
def apply(
src: UInt,
voluntary: Bool,
r_type: UInt,
client_xact_id: UInt,
addr_block: UInt,
addr_beat: UInt = UInt(0),
data: UInt = UInt(0))
(implicit p: Parameters): ReleaseFromSrc = {
val rel = Wire(new ReleaseFromSrc)
rel.client_id := src
rel.voluntary := voluntary
rel.r_type := r_type
rel.client_xact_id := client_xact_id
rel.addr_block := addr_block
rel.addr_beat := addr_beat
rel.data := data
rel
}
}
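/* Example use (illustrative sketch, not part of this commit): a voluntary
 * writeback of one dirty beat, where `rel_type` comes from the coherence
 * policy (in practice prefer [[uncore.ClientMetadata.makeRelease]]) and
 * `victim_block`, `beat_cnt` and `dirty_data` are client-defined:
 *
 *   io.release.bits := Release(
 *     voluntary = Bool(true),
 *     r_type = rel_type,
 *     client_xact_id = UInt(0),
 *     addr_block = victim_block,
 *     addr_beat = beat_cnt,
 *     data = dirty_data)
 */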
/** The Grant channel is used to refill data or grant permissions requested of the
* manager agent via an [[uncore.Acquire]] message. It is also used to acknowledge
* the receipt of voluntary writeback from clients in the form of [[uncore.Release]]
* messages. There are built-in Grant messages used for Gets and Puts, and
* coherence policies may also define custom Grant types. Grants may contain data
* or may be simple acknowledgements. Grants are responded to with [[uncore.Finish]].
*/
class GrantMetadata(implicit p: Parameters) extends ManagerToClientChannel
with HasTileLinkBeatId
with HasClientTransactionId
with HasManagerTransactionId
with HasGrantType {
def makeFinish(dummy: Int = 0): Finish = {
val f = Wire(new Finish)
f.manager_xact_id := this.manager_xact_id
f
}
}
/** [[uncore.GrantMetadata]] with an extra field containing a single beat of data */
class Grant(implicit p: Parameters) extends GrantMetadata
with HasTileLinkData
/** [[uncore.Grant]] with an extra field stating its destination */
class GrantToDst(implicit p: Parameters) extends Grant
with HasClientId
/** [[uncore.Grant]] with an extra field stating its destination */
class GrantFromSrc(implicit p: Parameters) extends Grant
with HasManagerId {
override def makeFinish(dummy: Int = 0): FinishToDst = {
val f = Wire(new FinishToDst)
f.manager_xact_id := this.manager_xact_id
f.manager_id := this.manager_id
f
}
}
/** [[uncore.GrantMetadata]] with an extra field containing an entire cache block */
class BufferedGrant(implicit p: Parameters) extends GrantMetadata
with HasTileLinkBlock
/** [[uncore.BufferedGrant]] with an extra field stating its destination */
class BufferedGrantToDst(implicit p: Parameters) extends BufferedGrant
with HasClientId
/** Contains definitions of the built-in grant types and factories
* for [[uncore.Grant]] and [[uncore.GrantToDst]]
*
* In general you should avoid using these factories directly and use
* [[uncore.ManagerMetadata.makeGrant(uncore.AcquireFromSrc* makeGrant]] instead.
*
* @param dst id of client to which grant should be sent
* @param is_builtin_type built-in or custom type message?
* @param g_type built-in type enum or custom type enum
* @param client_xact_id client's transaction id
* @param manager_xact_id manager's transaction id
* @param addr_beat beat id of the data
* @param data data being refilled to the original requestor
*/
object Grant {
val nBuiltInTypes = 5
def voluntaryAckType = UInt("b000") // For acking Releases
def prefetchAckType = UInt("b001") // For acking any kind of Prefetch
  def putAckType = UInt("b011") // For acking any kind of non-prefetch Put
def getDataBeatType = UInt("b100") // Supplying a single beat of Get
def getDataBlockType = UInt("b101") // Supplying all beats of a GetBlock
def typesWithData = Vec(getDataBlockType, getDataBeatType)
  def typesWithMultibeatData = Vec(getDataBlockType)
def apply(
is_builtin_type: Bool,
g_type: UInt,
client_xact_id: UInt,
manager_xact_id: UInt,
addr_beat: UInt,
data: UInt)
(implicit p: Parameters): Grant = {
val gnt = Wire(new Grant)
gnt.is_builtin_type := is_builtin_type
gnt.g_type := g_type
gnt.client_xact_id := client_xact_id
gnt.manager_xact_id := manager_xact_id
gnt.addr_beat := addr_beat
gnt.data := data
gnt
}
def apply(
dst: UInt,
is_builtin_type: Bool,
g_type: UInt,
client_xact_id: UInt,
manager_xact_id: UInt,
addr_beat: UInt = UInt(0),
data: UInt = UInt(0))
(implicit p: Parameters): GrantToDst = {
val gnt = Wire(new GrantToDst)
gnt.client_id := dst
gnt.is_builtin_type := is_builtin_type
gnt.g_type := g_type
gnt.client_xact_id := client_xact_id
gnt.manager_xact_id := manager_xact_id
gnt.addr_beat := addr_beat
gnt.data := data
gnt
}
}
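/* Example use (illustrative sketch, not part of this commit): a manager
 * acknowledging a built-in Put from client `src`, echoing the client
 * transaction id from a tracked Acquire `xact`; `trackerId` is assumed:
 *
 *   io.grant.bits := Grant(
 *     dst = src,
 *     is_builtin_type = Bool(true),
 *     g_type = Grant.putAckType,
 *     client_xact_id = xact.client_xact_id,
 *     manager_xact_id = UInt(trackerId))
 */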
/** The Finish channel is used to provide a global ordering of transactions
* in networks that do not guarantee point-to-point ordering of messages.
 * A Finish message is sent as acknowledgement of receipt of a [[uncore.Grant]].
* When a Finish message is received, a manager knows it is safe to begin
* processing other transactions that touch the same cache block.
*/
class Finish(implicit p: Parameters) extends ClientToManagerChannel()(p)
with HasManagerTransactionId {
def hasData(dummy: Int = 0) = Bool(false)
def hasMultibeatData(dummy: Int = 0) = Bool(false)
}
/** [[uncore.Finish]] with an extra field stating its destination */
class FinishToDst(implicit p: Parameters) extends Finish
with HasManagerId
/** Complete IO definition for incoherent TileLink, including networking headers */
class UncachedTileLinkIO(implicit p: Parameters) extends TLBundle()(p) {
val acquire = new DecoupledIO(new LogicalNetworkIO(new Acquire))
val grant = new DecoupledIO(new LogicalNetworkIO(new Grant)).flip
val finish = new DecoupledIO(new LogicalNetworkIO(new Finish))
}
/** Complete IO definition for coherent TileLink, including networking headers */
class TileLinkIO(implicit p: Parameters) extends UncachedTileLinkIO()(p) {
val probe = new DecoupledIO(new LogicalNetworkIO(new Probe)).flip
val release = new DecoupledIO(new LogicalNetworkIO(new Release))
}
/** This version of UncachedTileLinkIO does not contain network headers.
* It is intended for use within client agents.
*
* Headers are provided in the top-level that instantiates the clients and network,
* probably using a [[uncore.ClientTileLinkNetworkPort]] module.
* By eliding the header subbundles within the clients we can enable
* hierarchical P-and-R while minimizing unconnected port errors in GDS.
*
* Secondly, this version of the interface elides [[uncore.Finish]] messages, with the
* assumption that a [[uncore.FinishUnit]] has been coupled to the TileLinkIO port
* to deal with acking received [[uncore.Grant Grants]].
*/
class ClientUncachedTileLinkIO(implicit p: Parameters) extends TLBundle()(p) {
val acquire = new DecoupledIO(new Acquire)
val grant = new DecoupledIO(new Grant).flip
}
/** This version of TileLinkIO does not contain network headers.
* It is intended for use within client agents.
*/
class ClientTileLinkIO(implicit p: Parameters) extends TLBundle()(p) {
val acquire = new DecoupledIO(new Acquire)
val probe = new DecoupledIO(new Probe).flip
val release = new DecoupledIO(new Release)
val grant = new DecoupledIO(new GrantFromSrc).flip
val finish = new DecoupledIO(new FinishToDst)
}
/** This version of TileLinkIO does not contain network headers, but
* every channel does include an extra client_id subbundle.
* It is intended for use within Management agents.
*
* Managers need to track where [[uncore.Acquire]] and [[uncore.Release]] messages
* originated so that they can send a [[uncore.Grant]] to the right place.
 * Similarly they must be able to issue Probes to particular clients.
* However, we'd still prefer to have [[uncore.ManagerTileLinkNetworkPort]] fill in
* the header.src to enable hierarchical p-and-r of the managers. Additionally,
* coherent clients might be mapped to random network port ids, and we'll leave it to the
* [[uncore.ManagerTileLinkNetworkPort]] to apply the correct mapping. Managers do need to
 * see Finish messages so they know when to allow new transactions on a cache
* block to proceed.
*/
class ManagerTileLinkIO(implicit p: Parameters) extends TLBundle()(p) {
val acquire = new DecoupledIO(new AcquireFromSrc).flip
val grant = new DecoupledIO(new GrantToDst)
val finish = new DecoupledIO(new Finish).flip
val probe = new DecoupledIO(new ProbeToDst)
val release = new DecoupledIO(new ReleaseFromSrc).flip
}


@ -0,0 +1,386 @@
package uncore.tilelink
import Chisel._
import junctions._
import scala.collection.mutable.ArraySeq
import uncore.util._
import cde.{Parameters, Field}
/** PortedTileLinkNetworks combine a TileLink protocol with a particular physical
* network implementation.
*
* Specifically, they provide mappings between ClientTileLinkIO/
* ManagerTileLinkIO channels and LogicalNetwork ports (i.e. generic
* TileLinkIO with networking headers). Channels coming into the network have
* appropriate networking headers appended and outgoing channels have their
* headers stripped.
*
* @constructor base class constructor for Ported TileLink NoC
* @param addrToManagerId a mapping from a physical address to the network
* id of a coherence manager
* @param sharerToClientId a mapping from the id of a particular coherent
 * client (as determined by e.g. the directory) to the network id
* of that client
* @param clientDepths the depths of the queue that should be used to buffer
* each channel on the client side of the network
* @param managerDepths the depths of the queue that should be used to buffer
* each channel on the manager side of the network
*/
abstract class PortedTileLinkNetwork(
addrToManagerId: UInt => UInt,
sharerToClientId: UInt => UInt,
clientDepths: TileLinkDepths,
managerDepths: TileLinkDepths)
(implicit p: Parameters) extends TLModule()(p) {
val nClients = tlNClients
val nManagers = tlNManagers
val io = new Bundle {
val clients_cached = Vec(tlNCachingClients, new ClientTileLinkIO).flip
val clients_uncached = Vec(tlNCachelessClients, new ClientUncachedTileLinkIO).flip
val managers = Vec(nManagers, new ManagerTileLinkIO).flip
}
val clients = (io.clients_cached ++ io.clients_uncached).zipWithIndex.map {
case (io, idx) => {
val qs = Module(new TileLinkEnqueuer(clientDepths))
io match {
case c: ClientTileLinkIO => {
val port = Module(new ClientTileLinkNetworkPort(idx, addrToManagerId))
port.io.client <> c
qs.io.client <> port.io.network
qs.io.manager
}
case u: ClientUncachedTileLinkIO => {
val port = Module(new ClientUncachedTileLinkNetworkPort(idx, addrToManagerId))
port.io.client <> u
qs.io.client <> port.io.network
qs.io.manager
}
}
}
}
val managers = io.managers.zipWithIndex.map {
case (m, i) => {
val port = Module(new ManagerTileLinkNetworkPort(i, sharerToClientId))
val qs = Module(new TileLinkEnqueuer(managerDepths))
port.io.manager <> m
port.io.network <> qs.io.manager
qs.io.client
}
}
}
/** A simple arbiter for each channel that also deals with header-based routing.
* Assumes a single manager agent. */
class PortedTileLinkArbiter(
sharerToClientId: UInt => UInt = (u: UInt) => u,
clientDepths: TileLinkDepths = TileLinkDepths(0,0,0,0,0),
managerDepths: TileLinkDepths = TileLinkDepths(0,0,0,0,0))
(implicit p: Parameters)
extends PortedTileLinkNetwork(u => UInt(0), sharerToClientId, clientDepths, managerDepths)(p)
with TileLinkArbiterLike
with PassesId {
val arbN = nClients
require(nManagers == 1)
if(arbN > 1) {
hookupClientSource(clients.map(_.acquire), managers.head.acquire)
hookupClientSource(clients.map(_.release), managers.head.release)
hookupFinish(clients.map(_.finish), managers.head.finish)
hookupManagerSourceWithHeader(clients.map(_.probe), managers.head.probe)
hookupManagerSourceWithHeader(clients.map(_.grant), managers.head.grant)
} else {
managers.head <> clients.head
}
}
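/* Example use (illustrative sketch, not part of this commit): hooking two
 * caching clients and one uncached client up to a single manager, assuming
 * the parameters in scope set tlNCachingClients = 2, tlNCachelessClients = 1
 * and tlNManagers = 1, and that the tile/dma/l2 modules are defined:
 *
 *   val arb = Module(new PortedTileLinkArbiter())
 *   arb.io.clients_cached(0) <> tile0.io.cached
 *   arb.io.clients_cached(1) <> tile1.io.cached
 *   arb.io.clients_uncached(0) <> dma.io.mem
 *   l2.io.inner <> arb.io.managers.head
 */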
/** Provides a separate physical crossbar for each channel. Assumes multiple manager
* agents. Managers are assigned to higher physical network port ids than
* clients, and translations between logical network id and physical crossbar
* port id are done automatically.
*/
class PortedTileLinkCrossbar(
addrToManagerId: UInt => UInt = u => UInt(0),
sharerToClientId: UInt => UInt = u => u,
clientDepths: TileLinkDepths = TileLinkDepths(0,0,0,0,0),
managerDepths: TileLinkDepths = TileLinkDepths(0,0,0,0,0))
(implicit p: Parameters)
extends PortedTileLinkNetwork(addrToManagerId, sharerToClientId, clientDepths, managerDepths)(p) {
val n = p(LNEndpoints)
val phyHdrWidth = log2Up(n)
val count = tlDataBeats
// Actually instantiate the particular networks required for TileLink
val acqNet = Module(new BasicBus(CrossbarConfig(n, new Acquire, count, Some((a: PhysicalNetworkIO[Acquire]) => a.payload.hasMultibeatData()))))
val relNet = Module(new BasicBus(CrossbarConfig(n, new Release, count, Some((r: PhysicalNetworkIO[Release]) => r.payload.hasMultibeatData()))))
val prbNet = Module(new BasicBus(CrossbarConfig(n, new Probe)))
val gntNet = Module(new BasicBus(CrossbarConfig(n, new Grant, count, Some((g: PhysicalNetworkIO[Grant]) => g.payload.hasMultibeatData()))))
val ackNet = Module(new BasicBus(CrossbarConfig(n, new Finish)))
// Aliases for the various network IO bundle types
type PNIO[T <: Data] = DecoupledIO[PhysicalNetworkIO[T]]
type LNIO[T <: Data] = DecoupledIO[LogicalNetworkIO[T]]
type FromCrossbar[T <: Data] = PNIO[T] => LNIO[T]
type ToCrossbar[T <: Data] = LNIO[T] => PNIO[T]
// Shims for converting between logical network IOs and physical network IOs
def crossbarToManagerShim[T <: Data](in: PNIO[T]): LNIO[T] = {
val out = DefaultFromPhysicalShim(in)
out.bits.header.src := in.bits.header.src - UInt(nManagers)
out
}
def crossbarToClientShim[T <: Data](in: PNIO[T]): LNIO[T] = {
val out = DefaultFromPhysicalShim(in)
out.bits.header.dst := in.bits.header.dst - UInt(nManagers)
out
}
def managerToCrossbarShim[T <: Data](in: LNIO[T]): PNIO[T] = {
val out = DefaultToPhysicalShim(n, in)
out.bits.header.dst := in.bits.header.dst + UInt(nManagers, phyHdrWidth)
out
}
def clientToCrossbarShim[T <: Data](in: LNIO[T]): PNIO[T] = {
val out = DefaultToPhysicalShim(n, in)
out.bits.header.src := in.bits.header.src + UInt(nManagers, phyHdrWidth)
out
}
// Make an individual connection between virtual and physical ports using
// a particular shim. Also pin the unused Decoupled control signal low.
def doDecoupledInputHookup[T <: Data](phys_in: PNIO[T], phys_out: PNIO[T], log_io: LNIO[T], shim: ToCrossbar[T]) = {
val s = shim(log_io)
phys_in.valid := s.valid
phys_in.bits := s.bits
s.ready := phys_in.ready
phys_out.ready := Bool(false)
}
def doDecoupledOutputHookup[T <: Data](phys_in: PNIO[T], phys_out: PNIO[T], log_io: LNIO[T], shim: FromCrossbar[T]) = {
val s = shim(phys_out)
log_io.valid := s.valid
log_io.bits := s.bits
s.ready := log_io.ready
phys_in.valid := Bool(false)
}
//Hookup all instances of a particular subbundle of TileLink
def doDecoupledHookups[T <: Data](physIO: BasicCrossbarIO[T], getLogIO: TileLinkIO => LNIO[T]) = {
physIO.in.head.bits.payload match {
case c: ClientToManagerChannel => {
managers.zipWithIndex.map { case (i, id) =>
doDecoupledOutputHookup(physIO.in(id), physIO.out(id), getLogIO(i), crossbarToManagerShim[T])
}
clients.zipWithIndex.map { case (i, id) =>
doDecoupledInputHookup(physIO.in(id+nManagers), physIO.out(id+nManagers), getLogIO(i), clientToCrossbarShim[T])
}
}
case m: ManagerToClientChannel => {
managers.zipWithIndex.map { case (i, id) =>
doDecoupledInputHookup(physIO.in(id), physIO.out(id), getLogIO(i), managerToCrossbarShim[T])
}
clients.zipWithIndex.map { case (i, id) =>
doDecoupledOutputHookup(physIO.in(id+nManagers), physIO.out(id+nManagers), getLogIO(i), crossbarToClientShim[T])
}
}
}
}
doDecoupledHookups(acqNet.io, (tl: TileLinkIO) => tl.acquire)
doDecoupledHookups(relNet.io, (tl: TileLinkIO) => tl.release)
doDecoupledHookups(prbNet.io, (tl: TileLinkIO) => tl.probe)
doDecoupledHookups(gntNet.io, (tl: TileLinkIO) => tl.grant)
doDecoupledHookups(ackNet.io, (tl: TileLinkIO) => tl.finish)
}
class ClientUncachedTileLinkIORouter(
nOuter: Int, routeSel: UInt => UInt)(implicit p: Parameters)
extends TLModule {
val io = new Bundle {
val in = (new ClientUncachedTileLinkIO).flip
val out = Vec(nOuter, new ClientUncachedTileLinkIO)
}
val acq_route = routeSel(io.in.acquire.bits.full_addr())
io.in.acquire.ready := Bool(false)
io.out.zipWithIndex.foreach { case (out, i) =>
out.acquire.valid := io.in.acquire.valid && acq_route(i)
out.acquire.bits := io.in.acquire.bits
when (acq_route(i)) { io.in.acquire.ready := out.acquire.ready }
}
val gnt_arb = Module(new LockingRRArbiter(
new Grant, nOuter, tlDataBeats, Some((gnt: Grant) => gnt.hasMultibeatData())))
gnt_arb.io.in <> io.out.map(_.grant)
io.in.grant <> gnt_arb.io.out
assert(!io.in.acquire.valid || acq_route.orR, "No valid route")
}
class TileLinkInterconnectIO(val nInner: Int, val nOuter: Int)
(implicit p: Parameters) extends Bundle {
val in = Vec(nInner, new ClientUncachedTileLinkIO).flip
val out = Vec(nOuter, new ClientUncachedTileLinkIO)
}
class ClientUncachedTileLinkIOCrossbar(
nInner: Int, nOuter: Int, routeSel: UInt => UInt)
(implicit p: Parameters) extends TLModule {
val io = new TileLinkInterconnectIO(nInner, nOuter)
if (nInner == 1) {
val router = Module(new ClientUncachedTileLinkIORouter(nOuter, routeSel))
router.io.in <> io.in.head
io.out <> router.io.out
} else {
val routers = List.fill(nInner) {
Module(new ClientUncachedTileLinkIORouter(nOuter, routeSel)) }
val arbiters = List.fill(nOuter) {
Module(new ClientUncachedTileLinkIOArbiter(nInner)) }
for (i <- 0 until nInner) {
routers(i).io.in <> io.in(i)
}
for (i <- 0 until nOuter) {
arbiters(i).io.in <> routers.map(r => r.io.out(i))
io.out(i) <> arbiters(i).io.out
}
}
}
abstract class TileLinkInterconnect(implicit p: Parameters) extends TLModule()(p) {
val nInner: Int
val nOuter: Int
lazy val io = new TileLinkInterconnectIO(nInner, nOuter)
}
class TileLinkRecursiveInterconnect(val nInner: Int, addrMap: AddrMap)
(implicit p: Parameters) extends TileLinkInterconnect()(p) {
def port(name: String) = io.out(addrMap.port(name))
val nOuter = addrMap.numSlaves
val routeSel = (addr: UInt) =>
Cat(addrMap.entries.map(e => addrMap(e.name).containsAddress(addr)).reverse)
val xbar = Module(new ClientUncachedTileLinkIOCrossbar(nInner, addrMap.length, routeSel))
xbar.io.in <> io.in
io.out <> addrMap.entries.zip(xbar.io.out).flatMap {
case (entry, xbarOut) => {
entry.region match {
case submap: AddrMap if submap.isEmpty =>
xbarOut.acquire.ready := Bool(false)
xbarOut.grant.valid := Bool(false)
None
case submap: AddrMap if !submap.collapse =>
val ic = Module(new TileLinkRecursiveInterconnect(1, submap))
ic.io.in.head <> xbarOut
ic.io.out
case _ =>
Some(xbarOut)
}
}
}
}
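/* Example use (illustrative sketch, not part of this commit): given an
 * `addrMap: AddrMap` from junctions with entries named "mem" and "io",
 * route a single client and pick off the named slave ports:
 *
 *   val ic = Module(new TileLinkRecursiveInterconnect(1, addrMap))
 *   ic.io.in.head <> client.io.mem
 *   uart.io.tl <> ic.port("io")
 *   mem.io.tl <> ic.port("mem")
 */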
class TileLinkMemoryInterconnect(
nBanksPerChannel: Int, nChannels: Int)
(implicit p: Parameters) extends TileLinkInterconnect()(p) {
val nBanks = nBanksPerChannel * nChannels
val nInner = nBanks
val nOuter = nChannels
def connectChannel(outer: ClientUncachedTileLinkIO, inner: ClientUncachedTileLinkIO) {
outer <> inner
outer.acquire.bits.addr_block := inner.acquire.bits.addr_block >> UInt(log2Ceil(nChannels))
}
for (i <- 0 until nChannels) {
/* Bank assignments to channels are strided so that consecutive banks
* map to different channels. That way, consecutive cache lines also
* map to different channels */
val banks = (i until nBanks by nChannels).map(j => io.in(j))
val channelArb = Module(new ClientUncachedTileLinkIOArbiter(nBanksPerChannel))
channelArb.io.in <> banks
connectChannel(io.out(i), channelArb.io.out)
}
}
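/* Worked example (illustrative, not part of this commit): with nChannels = 2
 * and nBanksPerChannel = 2, channel 0 arbitrates banks {0, 2} and channel 1
 * arbitrates banks {1, 3}, so consecutive cache blocks alternate between
 * channels; connectChannel then shifts addr_block right by log2Ceil(2) = 1
 * bit so each channel sees a dense, halved block address space.
 */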
/** Allows users to switch between various memory configurations. Note that
* this is a dangerous operation: not only does switching the select input to
* this module violate TileLink, it also causes the memory of the machine to
* become garbled. It's expected that select only changes at boot time, as
* part of the memory controller configuration. */
class TileLinkMemorySelectorIO(val nBanks: Int, val maxMemChannels: Int, nConfigs: Int)
(implicit p: Parameters)
extends TileLinkInterconnectIO(nBanks, maxMemChannels) {
val select = UInt(INPUT, width = log2Up(nConfigs))
override def cloneType =
new TileLinkMemorySelectorIO(nBanks, maxMemChannels, nConfigs).asInstanceOf[this.type]
}
class TileLinkMemorySelector(nBanks: Int, maxMemChannels: Int, configs: Seq[Int])
(implicit p: Parameters)
extends TileLinkInterconnect()(p) {
val nInner = nBanks
val nOuter = maxMemChannels
val nConfigs = configs.size
override lazy val io = new TileLinkMemorySelectorIO(nBanks, maxMemChannels, nConfigs)
def muxOnSelect[T <: Data](up: DecoupledIO[T], dn: DecoupledIO[T], active: Bool): Unit = {
when (active) { dn.bits := up.bits }
when (active) { up.ready := dn.ready }
when (active) { dn.valid := up.valid }
}
def muxOnSelect(up: ClientUncachedTileLinkIO, dn: ClientUncachedTileLinkIO, active: Bool): Unit = {
muxOnSelect(up.acquire, dn.acquire, active)
muxOnSelect(dn.grant, up.grant, active)
}
def muxOnSelect(up: Vec[ClientUncachedTileLinkIO], dn: Vec[ClientUncachedTileLinkIO], active: Bool) : Unit = {
for (i <- 0 until up.size)
muxOnSelect(up(i), dn(i), active)
}
  /* Disconnects a vector of TileLink ports by setting them to invalid.
   * Chisel does not allow unconnected inputs, so we must also tie the
   * bits fields to 0. */
def disconnectOuter(outer: Vec[ClientUncachedTileLinkIO]) = {
outer.foreach{ m =>
m.acquire.valid := Bool(false)
m.acquire.bits := m.acquire.bits.fromBits(UInt(0))
m.grant.ready := Bool(false)
}
}
def disconnectInner(inner: Vec[ClientUncachedTileLinkIO]) = {
inner.foreach { m =>
m.grant.valid := Bool(false)
m.grant.bits := m.grant.bits.fromBits(UInt(0))
m.acquire.ready := Bool(false)
}
}
/* Provides default wires on all our outputs. */
disconnectOuter(io.out)
disconnectInner(io.in)
/* Constructs interconnects for each of the layouts suggested by the
* configuration and switches between them based on the select input. */
configs.zipWithIndex.foreach{ case (nChannels, select) =>
val nBanksPerChannel = nBanks / nChannels
val ic = Module(new TileLinkMemoryInterconnect(nBanksPerChannel, nChannels))
disconnectInner(ic.io.out)
disconnectOuter(ic.io.in)
muxOnSelect(io.in, ic.io.in, io.select === UInt(select))
muxOnSelect(ic.io.out, io.out, io.select === UInt(select))
}
}
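/* Example use (illustrative sketch, not part of this commit): four banks
 * that can be presented as either one or two memory channels, selected at
 * boot; `bankIOs`, `channelIOs` and `configReg` are assumed to be defined:
 *
 *   val sel = Module(new TileLinkMemorySelector(4, 2, Seq(1, 2)))
 *   sel.io.in <> bankIOs
 *   sel.io.select := configReg
 *   for ((chan, out) <- channelIOs.zip(sel.io.out)) { chan <> out }
 */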


@ -0,0 +1,308 @@
// See LICENSE for license details.
package uncore.tilelink
import Chisel._
import uncore.util._
import cde.{Parameters, Field}
case object LNEndpoints extends Field[Int]
case object LNHeaderBits extends Field[Int]
class PhysicalHeader(n: Int) extends Bundle {
val src = UInt(width = log2Up(n))
val dst = UInt(width = log2Up(n))
}
class PhysicalNetworkIO[T <: Data](n: Int, dType: T) extends Bundle {
val header = new PhysicalHeader(n)
val payload = dType.cloneType
override def cloneType = new PhysicalNetworkIO(n,dType).asInstanceOf[this.type]
}
class BasicCrossbarIO[T <: Data](n: Int, dType: T) extends Bundle {
val in = Vec(n, Decoupled(new PhysicalNetworkIO(n,dType))).flip
val out = Vec(n, Decoupled(new PhysicalNetworkIO(n,dType)))
}
abstract class PhysicalNetwork extends Module
case class CrossbarConfig[T <: Data](n: Int, dType: T, count: Int = 1, needsLock: Option[PhysicalNetworkIO[T] => Bool] = None)
abstract class AbstractCrossbar[T <: Data](conf: CrossbarConfig[T]) extends PhysicalNetwork {
val io = new BasicCrossbarIO(conf.n, conf.dType)
}
class BasicBus[T <: Data](conf: CrossbarConfig[T]) extends AbstractCrossbar(conf) {
val arb = Module(new LockingRRArbiter(io.in(0).bits, conf.n, conf.count, conf.needsLock))
arb.io.in <> io.in
arb.io.out.ready := io.out(arb.io.out.bits.header.dst).ready
for ((out, i) <- io.out zipWithIndex) {
out.valid := arb.io.out.valid && arb.io.out.bits.header.dst === UInt(i)
out.bits := arb.io.out.bits
}
}
class BasicCrossbar[T <: Data](conf: CrossbarConfig[T]) extends AbstractCrossbar(conf) {
io.in.foreach { _.ready := Bool(false) }
io.out.zipWithIndex.map{ case (out, i) => {
val rrarb = Module(new LockingRRArbiter(io.in(0).bits, conf.n, conf.count, conf.needsLock))
(rrarb.io.in, io.in).zipped.map{ case (arb, in) => {
val destined = in.bits.header.dst === UInt(i)
arb.valid := in.valid && destined
arb.bits := in.bits
when (arb.ready && destined) { in.ready := Bool(true) }
}}
out <> rrarb.io.out
}}
}
abstract class LogicalNetwork extends Module
class LogicalHeader(implicit p: Parameters) extends junctions.ParameterizedBundle()(p) {
val src = UInt(width = p(LNHeaderBits))
val dst = UInt(width = p(LNHeaderBits))
}
class LogicalNetworkIO[T <: Data](dType: T)(implicit p: Parameters) extends Bundle {
val header = new LogicalHeader
val payload = dType.cloneType
override def cloneType = new LogicalNetworkIO(dType)(p).asInstanceOf[this.type]
}
object DecoupledLogicalNetworkIOWrapper {
def apply[T <: Data](
in: DecoupledIO[T],
src: UInt = UInt(0),
dst: UInt = UInt(0))
(implicit p: Parameters): DecoupledIO[LogicalNetworkIO[T]] = {
val out = Wire(Decoupled(new LogicalNetworkIO(in.bits)))
out.valid := in.valid
out.bits.payload := in.bits
out.bits.header.dst := dst
out.bits.header.src := src
in.ready := out.ready
out
}
}
object DecoupledLogicalNetworkIOUnwrapper {
def apply[T <: Data](in: DecoupledIO[LogicalNetworkIO[T]])
(implicit p: Parameters): DecoupledIO[T] = {
val out = Wire(Decoupled(in.bits.payload))
out.valid := in.valid
out.bits := in.bits.payload
in.ready := out.ready
out
}
}
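/* Example use (illustrative sketch, not part of this commit): attaching a
 * header to an outbound headerless Grant stream and stripping the header
 * from an inbound one; `mgr`, `client`, `out_net`, `in_net`, `managerId`
 * and `grant_dst` are assumed to be defined by the surrounding module:
 *
 *   out_net.grant <> DecoupledLogicalNetworkIOWrapper(
 *     mgr.io.grant, src = UInt(managerId), dst = grant_dst)
 *   client.io.grant <> DecoupledLogicalNetworkIOUnwrapper(in_net.grant)
 */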
object DefaultFromPhysicalShim {
def apply[T <: Data](in: DecoupledIO[PhysicalNetworkIO[T]])
(implicit p: Parameters): DecoupledIO[LogicalNetworkIO[T]] = {
val out = Wire(Decoupled(new LogicalNetworkIO(in.bits.payload)))
out.bits.header := in.bits.header
out.bits.payload := in.bits.payload
out.valid := in.valid
in.ready := out.ready
out
}
}
object DefaultToPhysicalShim {
def apply[T <: Data](n: Int, in: DecoupledIO[LogicalNetworkIO[T]])
(implicit p: Parameters): DecoupledIO[PhysicalNetworkIO[T]] = {
val out = Wire(Decoupled(new PhysicalNetworkIO(n, in.bits.payload)))
out.bits.header := in.bits.header
out.bits.payload := in.bits.payload
out.valid := in.valid
in.ready := out.ready
out
}
}
/** A helper module that automatically issues [[uncore.Finish]] messages in response
 * to the [[uncore.Grant]] messages that it receives from a manager and forwards to a client
*/
class FinishUnit(srcId: Int = 0, outstanding: Int = 2)(implicit p: Parameters) extends TLModule()(p)
with HasDataBeatCounters {
val io = new Bundle {
val grant = Decoupled(new LogicalNetworkIO(new Grant)).flip
val refill = Decoupled(new Grant)
val finish = Decoupled(new LogicalNetworkIO(new Finish))
val ready = Bool(OUTPUT)
}
val g = io.grant.bits.payload
if(tlNetworkPreservesPointToPointOrdering) {
io.finish.valid := Bool(false)
io.refill.valid := io.grant.valid
io.refill.bits := g
io.grant.ready := io.refill.ready
io.ready := Bool(true)
} else {
// We only want to send Finishes after we have collected all beats of
// a multibeat Grant. But Grants from multiple managers or transactions may
// get interleaved, so we could need a counter for each.
val done = if(tlNetworkDoesNotInterleaveBeats) {
connectIncomingDataBeatCounterWithHeader(io.grant)
} else {
val entries = 1 << tlClientXactIdBits
def getId(g: LogicalNetworkIO[Grant]) = g.payload.client_xact_id
assert(getId(io.grant.bits) <= UInt(entries), "Not enough grant beat counters, only " + entries + " entries.")
connectIncomingDataBeatCountersWithHeader(io.grant, entries, getId).reduce(_||_)
}
val q = Module(new FinishQueue(outstanding))
q.io.enq.valid := io.grant.fire() && g.requiresAck() && (!g.hasMultibeatData() || done)
q.io.enq.bits := g.makeFinish()
q.io.enq.bits.manager_id := io.grant.bits.header.src
io.finish.bits.header.src := UInt(srcId)
io.finish.bits.header.dst := q.io.deq.bits.manager_id
io.finish.bits.payload := q.io.deq.bits
io.finish.valid := q.io.deq.valid
q.io.deq.ready := io.finish.ready
io.refill.valid := (q.io.enq.ready || !g.requiresAck()) && io.grant.valid
io.refill.bits := g
io.grant.ready := (q.io.enq.ready || !g.requiresAck()) && io.refill.ready
io.ready := q.io.enq.ready
}
}
class FinishQueue(entries: Int)(implicit p: Parameters) extends Queue(new FinishToDst()(p), entries)
/** A port to convert [[uncore.ClientTileLinkIO]].flip into [[uncore.TileLinkIO]]
*
* Creates network headers for [[uncore.Acquire]] and [[uncore.Release]] messages,
* calculating header.dst and filling in header.src.
* Strips headers from [[uncore.Probe Probes]].
 * Passes [[uncore.GrantFromSrc]] and accepts [[uncore.FinishToDst]] in response,
* setting up the headers for each.
*
* @param clientId network port id of this agent
* @param addrConvert how a physical address maps to a destination manager port id
*/
class ClientTileLinkNetworkPort(clientId: Int, addrConvert: UInt => UInt)
(implicit p: Parameters) extends TLModule()(p) {
val io = new Bundle {
val client = new ClientTileLinkIO().flip
val network = new TileLinkIO
}
val acq_with_header = ClientTileLinkHeaderCreator(io.client.acquire, clientId, addrConvert)
val rel_with_header = ClientTileLinkHeaderCreator(io.client.release, clientId, addrConvert)
val fin_with_header = ClientTileLinkHeaderCreator(io.client.finish, clientId)
val prb_without_header = DecoupledLogicalNetworkIOUnwrapper(io.network.probe)
val gnt_without_header = DecoupledLogicalNetworkIOUnwrapper(io.network.grant)
io.network.acquire <> acq_with_header
io.network.release <> rel_with_header
io.network.finish <> fin_with_header
io.client.probe <> prb_without_header
io.client.grant.bits.manager_id := io.network.grant.bits.header.src
io.client.grant <> gnt_without_header
}
/** A port to convert [[uncore.ClientUncachedTileLinkIO]].flip into [[uncore.TileLinkIO]]
*
* Creates network headers for [[uncore.Acquire]] and [[uncore.Release]] messages,
* calculating header.dst and filling in header.src.
* Responds to [[uncore.Grant]] by automatically issuing [[uncore.Finish]] to the granting managers.
*
* @param clientId network port id of this agent
* @param addrConvert how a physical address maps to a destination manager port id
*/
class ClientUncachedTileLinkNetworkPort(clientId: Int, addrConvert: UInt => UInt)
(implicit p: Parameters) extends TLModule()(p) {
val io = new Bundle {
val client = new ClientUncachedTileLinkIO().flip
val network = new TileLinkIO
}
val finisher = Module(new FinishUnit(clientId))
finisher.io.grant <> io.network.grant
io.network.finish <> finisher.io.finish
val acq_with_header = ClientTileLinkHeaderCreator(io.client.acquire, clientId, addrConvert)
val gnt_without_header = finisher.io.refill
io.network.acquire.bits := acq_with_header.bits
io.network.acquire.valid := acq_with_header.valid && finisher.io.ready
acq_with_header.ready := io.network.acquire.ready && finisher.io.ready
io.client.grant <> gnt_without_header
io.network.probe.ready := Bool(false)
io.network.release.valid := Bool(false)
}
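/* Example use (illustrative sketch, not part of this commit): placing an
 * uncached client at network port id 2 in a single-manager system, so all
 * addresses route to manager 0; `dma` and the network-side TileLinkIO
 * `netPort` are assumed to be defined:
 *
 *   val port = Module(new ClientUncachedTileLinkNetworkPort(2,
 *     addrConvert = (addr: UInt) => UInt(0)))
 *   port.io.client <> dma.io.mem
 *   netPort <> port.io.network
 */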
object ClientTileLinkHeaderCreator {
def apply[T <: ClientToManagerChannel with HasManagerId](
in: DecoupledIO[T],
clientId: Int)
(implicit p: Parameters): DecoupledIO[LogicalNetworkIO[T]] = {
val out = Wire(new DecoupledIO(new LogicalNetworkIO(in.bits)))
out.bits.payload := in.bits
out.bits.header.src := UInt(clientId)
out.bits.header.dst := in.bits.manager_id
out.valid := in.valid
in.ready := out.ready
out
}
def apply[T <: ClientToManagerChannel with HasCacheBlockAddress](
in: DecoupledIO[T],
clientId: Int,
addrConvert: UInt => UInt)
(implicit p: Parameters): DecoupledIO[LogicalNetworkIO[T]] = {
val out = Wire(new DecoupledIO(new LogicalNetworkIO(in.bits)))
out.bits.payload := in.bits
out.bits.header.src := UInt(clientId)
out.bits.header.dst := addrConvert(in.bits.addr_block)
out.valid := in.valid
in.ready := out.ready
out
}
}
/** A port to convert [[uncore.ManagerTileLinkIO]].flip into [[uncore.TileLinkIO]].flip
*
 * Creates network headers for [[uncore.Probe]] and [[uncore.Grant]] messages,
* calculating header.dst and filling in header.src.
* Strips headers from [[uncore.Acquire]], [[uncore.Release]] and [[uncore.Finish]],
* but supplies client_id instead.
*
* @param managerId the network port id of this agent
* @param idConvert how a sharer id maps to a destination client port id
*/
class ManagerTileLinkNetworkPort(managerId: Int, idConvert: UInt => UInt)
(implicit p: Parameters) extends TLModule()(p) {
val io = new Bundle {
val manager = new ManagerTileLinkIO().flip
val network = new TileLinkIO().flip
}
io.network.grant <> ManagerTileLinkHeaderCreator(io.manager.grant, managerId, (u: UInt) => u)
io.network.probe <> ManagerTileLinkHeaderCreator(io.manager.probe, managerId, idConvert)
io.manager.acquire <> DecoupledLogicalNetworkIOUnwrapper(io.network.acquire)
io.manager.acquire.bits.client_id := io.network.acquire.bits.header.src
io.manager.release <> DecoupledLogicalNetworkIOUnwrapper(io.network.release)
io.manager.release.bits.client_id := io.network.release.bits.header.src
io.manager.finish <> DecoupledLogicalNetworkIOUnwrapper(io.network.finish)
}
object ManagerTileLinkHeaderCreator {
def apply[T <: ManagerToClientChannel with HasClientId](
in: DecoupledIO[T],
managerId: Int,
idConvert: UInt => UInt)
(implicit p: Parameters): DecoupledIO[LogicalNetworkIO[T]] = {
val out = Wire(new DecoupledIO(new LogicalNetworkIO(in.bits)))
out.bits.payload := in.bits
out.bits.header.src := UInt(managerId)
out.bits.header.dst := idConvert(in.bits.client_id)
out.valid := in.valid
in.ready := out.ready
out
}
}


@ -0,0 +1,422 @@
package uncore.unittests
import Chisel._
import junctions._
import uncore.tilelink._
import uncore.constants._
import uncore.util._
import cde.Parameters
abstract class Driver(implicit p: Parameters) extends TLModule()(p) {
val io = new Bundle {
val mem = new ClientUncachedTileLinkIO
val start = Bool(INPUT)
val finished = Bool(OUTPUT)
}
}
/**
* Tests that single-beat Gets of decreasing size return subsets of the
* data returned by larger Gets
*/
class GetMultiWidthDriver(implicit p: Parameters) extends Driver()(p) {
val s_start :: s_send :: s_recv :: s_done :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_start)
val w = 64
val initialSize = UInt(log2Ceil(w/8))
val size = Reg(UInt(width = log2Ceil(log2Ceil(w/8)+1)))
val ref = Reg(UInt(width = w))
val bytemask = (UInt(1) << (UInt(1) << size)) - UInt(1)
val bitmask = FillInterleaved(8, bytemask)
io.mem.acquire.valid := (state === s_send)
io.mem.acquire.bits := Get(
client_xact_id = UInt(0),
addr_block = UInt(0),
addr_beat = UInt(0),
addr_byte = UInt(0),
operand_size = size,
alloc = Bool(false))
io.mem.grant.ready := (state === s_recv)
when (state === s_start && io.start) {
size := initialSize
state := s_send
}
when (io.mem.acquire.fire()) { state := s_recv }
when (io.mem.grant.fire()) {
when (size === initialSize) { ref := io.mem.grant.bits.data }
size := size - UInt(1)
state := Mux(size === UInt(0), s_done, s_send)
}
io.finished := state === s_done
assert(!io.mem.grant.valid || size === initialSize ||
(io.mem.grant.bits.data & bitmask) === (ref & bitmask),
"GetMultiWidth: smaller get does not match larger get")
}
/**
* Tests that single-beat Gets across a range of memory return
* the expected data.
* @param expected The values of the data expected to be read.
* Each element is the data for one beat.
*/
class GetSweepDriver(expected: Seq[BigInt])
(implicit p: Parameters) extends Driver()(p) {
val s_start :: s_send :: s_recv :: s_done :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_start)
val nReqs = expected.size
val (req_cnt, req_done) = Counter(io.mem.grant.fire(), nReqs)
when (state === s_start && io.start) { state := s_send }
when (io.mem.acquire.fire()) { state := s_recv }
when (io.mem.grant.fire()) { state := s_send }
when (req_done) { state := s_done }
val (addr_block, addr_beat) = if (nReqs > tlDataBeats) {
(req_cnt(log2Up(nReqs) - 1, tlBeatAddrBits),
req_cnt(tlBeatAddrBits - 1, 0))
} else {
(UInt(0), req_cnt)
}
val exp_data = Vec(expected.map(e => UInt(e, tlDataBits)))
io.mem.acquire.valid := (state === s_send)
io.mem.acquire.bits := Get(
client_xact_id = UInt(0),
addr_block = addr_block,
addr_beat = addr_beat)
io.mem.grant.ready := (state === s_recv)
io.finished := state === s_done
assert(!io.mem.grant.valid || io.mem.grant.bits.data === exp_data(req_cnt),
"GetSweep: data does not match expected")
}
/**
* Tests that multi-beat GetBlocks across a range of memory return
* the expected data.
* @param expected The values of the data expected to be read.
* Each element is the data for one beat.
*/
class GetBlockSweepDriver(expected: Seq[BigInt])
(implicit p: Parameters) extends Driver()(p) {
val s_start :: s_send :: s_recv :: s_done :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_start)
val nReqs = ((expected.size - 1) / tlDataBeats + 1) * tlDataBeats
val (req_cnt, req_done) = Counter(io.mem.grant.fire(), nReqs)
val (addr_beat, beats_done) = Counter(io.mem.grant.fire(), tlDataBeats)
val tlBlockOffset = tlByteAddrBits + tlBeatAddrBits
val addr_block =
if (nReqs > tlDataBeats) req_cnt(log2Up(nReqs) - 1, tlBlockOffset)
else UInt(0)
io.mem.acquire.valid := (state === s_send)
io.mem.acquire.bits := GetBlock(
client_xact_id = UInt(0),
addr_block = addr_block)
io.mem.grant.ready := (state === s_recv)
io.finished := state === s_done
when (state === s_start && io.start) { state := s_send }
when (io.mem.acquire.fire()) { state := s_recv }
when (beats_done) { state := s_send }
when (req_done) { state := s_done }
val exp_data = Vec(expected.map(e => UInt(e, tlDataBits)))
assert(!io.mem.grant.valid || req_cnt >= UInt(expected.size) ||
io.mem.grant.bits.data === exp_data(req_cnt),
"GetBlockSweep: data does not match expected")
}
/**
* Tests that single-beat Puts across a range of memory persists correctly.
* @param n the number of beats to put
*/
class PutSweepDriver(val n: Int)(implicit p: Parameters) extends Driver()(p) {
val (s_idle :: s_put_req :: s_put_resp ::
s_get_req :: s_get_resp :: s_done :: Nil) = Enum(Bits(), 6)
val state = Reg(init = s_idle)
val (put_cnt, put_done) = Counter(state === s_put_resp && io.mem.grant.valid, n)
val (get_cnt, get_done) = Counter(state === s_get_resp && io.mem.grant.valid, n)
val (put_block, put_beat) = if (n > tlDataBeats) {
(put_cnt(log2Up(n) - 1, tlBeatAddrBits),
put_cnt(tlBeatAddrBits - 1, 0))
} else {
(UInt(0), put_cnt)
}
val (get_block, get_beat) = if (n > tlDataBeats) {
(get_cnt(log2Up(n) - 1, tlBeatAddrBits),
get_cnt(tlBeatAddrBits - 1, 0))
} else {
(UInt(0), get_cnt)
}
val dataRep = (tlDataBits - 1) / log2Up(n) + 1
val put_data = Fill(dataRep, put_cnt)(tlDataBits - 1, 0)
val get_data = Fill(dataRep, get_cnt)(tlDataBits - 1, 0)
io.mem.acquire.valid := state.isOneOf(s_put_req, s_get_req)
io.mem.acquire.bits := Mux(state === s_put_req,
Put(
client_xact_id = UInt(0),
addr_block = put_block,
addr_beat = put_beat,
data = put_data),
Get(
client_xact_id = UInt(0),
addr_block = get_block,
addr_beat = get_beat))
io.mem.grant.ready := state.isOneOf(s_put_resp, s_get_resp)
when (state === s_idle && io.start) { state := s_put_req }
when (state === s_put_req && io.mem.acquire.ready) { state := s_put_resp }
when (state === s_put_resp && io.mem.grant.valid) {
state := Mux(put_done, s_get_req, s_put_req)
}
when (state === s_get_req && io.mem.acquire.ready) { state := s_get_resp }
when (state === s_get_resp && io.mem.grant.valid) {
state := Mux(get_done, s_done, s_get_req)
}
io.finished := (state === s_done)
assert(!io.mem.grant.valid || !io.mem.grant.bits.hasData() ||
io.mem.grant.bits.data === get_data,
"PutSweepDriver: data does not match")
}
/**
* Tests that write-masked single-beat puts work correctly by putting
* data with steadily smaller write-masks to the same beat.
* @param minBytes the smallest number of bytes that can be in the writemask
*/
class PutMaskDriver(minBytes: Int = 1)(implicit p: Parameters) extends Driver()(p) {
val (s_idle :: s_put_req :: s_put_resp ::
s_get_req :: s_get_resp :: s_done :: Nil) = Enum(Bits(), 6)
val state = Reg(init = s_idle)
val nbytes = Reg(UInt(width = log2Up(tlWriteMaskBits) + 1))
val wmask = (UInt(1) << nbytes) - UInt(1)
val wdata = Fill(tlDataBits / 8, Wire(UInt(width = 8), init = nbytes))
  // Write sizes step from the full TL data width in bytes down to minBytes,
  // halving each time
val expected = (log2Ceil(tlDataBits / 8) to log2Ceil(minBytes) by -1)
.map(1 << _).foldLeft(UInt(0, tlDataBits)) {
// Change the lower nbytes of the value
(value, nbytes) => {
val mask = UInt((BigInt(1) << (nbytes * 8)) - BigInt(1), tlDataBits)
val wval = Fill(tlDataBits / 8, UInt(nbytes, 8))
(value & ~mask) | (wval & mask)
}
}
when (state === s_idle && io.start) {
state := s_put_req
nbytes := UInt(8)
}
when (state === s_put_req && io.mem.acquire.ready) {
state := s_put_resp
}
when (state === s_put_resp && io.mem.grant.valid) {
nbytes := nbytes >> UInt(1)
state := Mux(nbytes === UInt(minBytes), s_get_req, s_put_req)
}
when (state === s_get_req && io.mem.acquire.ready) {
state := s_get_resp
}
when (state === s_get_resp && io.mem.grant.valid) {
state := s_done
}
io.finished := (state === s_done)
io.mem.acquire.valid := state.isOneOf(s_put_req, s_get_req)
io.mem.acquire.bits := Mux(state === s_put_req,
Put(
client_xact_id = UInt(0),
addr_block = UInt(0),
addr_beat = UInt(0),
data = wdata,
wmask = Some(wmask)),
Get(
client_xact_id = UInt(0),
addr_block = UInt(0),
addr_beat = UInt(0)))
io.mem.grant.ready := state.isOneOf(s_put_resp, s_get_resp)
assert(!io.mem.grant.valid || state =/= s_get_resp ||
io.mem.grant.bits.data === expected,
"PutMask: data does not match expected")
}
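// Worked example (a sketch, assuming tlDataBits = 64 and minBytes = 1):
// nbytes walks 8 -> 4 -> 2 -> 1, so the write masks used are 0xff, 0x0f,
// 0x03, 0x01, and the final Get should observe 0x0808080804040201.
/**
 * Tests that multi-beat PutBlocks across a range of memory persist correctly
 * by writing n blocks and then reading them all back.
 */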
class PutBlockSweepDriver(val n: Int)(implicit p: Parameters)
extends Driver()(p) {
val (s_idle :: s_put_req :: s_put_resp ::
s_get_req :: s_get_resp :: s_done :: Nil) = Enum(Bits(), 6)
val state = Reg(init = s_idle)
val (put_beat, put_beat_done) = Counter(
state === s_put_req && io.mem.acquire.ready, tlDataBeats)
val (put_cnt, put_done) = Counter(
state === s_put_resp && io.mem.grant.valid, n)
val (get_beat, get_beat_done) = Counter(
state === s_get_resp && io.mem.grant.valid, tlDataBeats)
val (get_cnt, get_done) = Counter(get_beat_done, n)
val dataRep = (tlDataBits - 1) / (log2Up(n) + tlBeatAddrBits) + 1
val put_data = Fill(dataRep, Cat(put_cnt, put_beat))(tlDataBits - 1, 0)
val get_data = Fill(dataRep, Cat(get_cnt, get_beat))(tlDataBits - 1, 0)
when (state === s_idle && io.start) { state := s_put_req }
when (put_beat_done) { state := s_put_resp }
when (state === s_put_resp && io.mem.grant.valid) {
state := Mux(put_done, s_get_req, s_put_req)
}
when (state === s_get_req && io.mem.acquire.ready) { state := s_get_resp }
when (get_beat_done) { state := Mux(get_done, s_done, s_get_req) }
val put_acquire = PutBlock(
client_xact_id = UInt(0),
addr_block = put_cnt,
addr_beat = put_beat,
data = put_data)
val get_acquire = GetBlock(
client_xact_id = UInt(0),
addr_block = get_cnt)
io.finished := (state === s_done)
io.mem.acquire.valid := state.isOneOf(s_put_req, s_get_req)
io.mem.acquire.bits := Mux(state === s_put_req, put_acquire, get_acquire)
io.mem.grant.ready := state.isOneOf(s_put_resp, s_get_resp)
assert(!io.mem.grant.valid || state =/= s_get_resp ||
io.mem.grant.bits.data === get_data,
"PutBlockSweep: data does not match expected")
}
class PutAtomicDriver(implicit p: Parameters) extends Driver()(p) {
val s_idle :: s_put :: s_atomic :: s_get :: s_done :: Nil = Enum(Bits(), 5)
val state = Reg(init = s_idle)
val sending = Reg(init = Bool(false))
val put_acquire = Put(
client_xact_id = UInt(0),
addr_block = UInt(0),
addr_beat = UInt(0),
// Put 15 in bytes 7:4
data = UInt(15L << 32),
wmask = Some(UInt(0xf0)))
val amo_acquire = PutAtomic(
client_xact_id = UInt(0),
addr_block = UInt(0),
addr_beat = UInt(0),
addr_byte = UInt(4),
atomic_opcode = M_XA_ADD,
operand_size = UInt(log2Ceil(32 / 8)),
data = UInt(3L << 32))
val get_acquire = Get(
client_xact_id = UInt(0),
addr_block = UInt(0),
addr_beat = UInt(0))
io.finished := (state === s_done)
io.mem.acquire.valid := sending
io.mem.acquire.bits := MuxLookup(state, get_acquire, Seq(
s_put -> put_acquire,
s_atomic -> amo_acquire,
s_get -> get_acquire))
io.mem.grant.ready := !sending
when (io.mem.acquire.fire()) { sending := Bool(false) }
when (state === s_idle && io.start) {
state := s_put
sending := Bool(true)
}
when (io.mem.grant.fire()) {
when (state === s_put) { sending := Bool(true); state := s_atomic }
when (state === s_atomic) { sending := Bool(true); state := s_get }
when (state === s_get) { state := s_done }
}
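  // The Put wrote 15 into bytes 7:4 and the PutAtomic added 3, so the Get
  // should observe 18 in bits 63:32.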
  assert(!io.mem.grant.valid || state =/= s_get ||
    io.mem.grant.bits.data(63, 32) === UInt(18),
    "PutAtomic: data does not match expected")
}
class PrefetchDriver(implicit p: Parameters) extends Driver()(p) {
val s_idle :: s_put_pf :: s_get_pf :: s_done :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_idle)
val sending = Reg(init = Bool(false))
when (state === s_idle) {
sending := Bool(true)
state := s_put_pf
}
when (io.mem.acquire.fire()) { sending := Bool(false) }
when (io.mem.grant.fire()) {
when (state === s_put_pf) { sending := Bool(true); state := s_get_pf }
when (state === s_get_pf) { state := s_done }
}
io.finished := (state === s_done)
io.mem.acquire.valid := sending
io.mem.acquire.bits := Mux(state === s_put_pf,
PutPrefetch(
client_xact_id = UInt(0),
addr_block = UInt(0)),
GetPrefetch(
client_xact_id = UInt(0),
addr_block = UInt(0)))
io.mem.grant.ready := !sending
}
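/**
 * Runs a sequence of drivers one after another, muxing the shared memory
 * port to whichever driver is currently active and finishing once the
 * last driver finishes.
 */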
class DriverSet(driverGen: Parameters => Seq[Driver])(implicit p: Parameters)
extends Driver()(p) {
val s_start :: s_run :: s_done :: Nil = Enum(Bits(), 3)
val state = Reg(init = s_start)
val drivers = driverGen(p)
val idx = Reg(init = UInt(0, log2Up(drivers.size)))
val finished = Wire(init = Bool(false))
when (state === s_start && io.start) { state := s_run }
when (state === s_run && finished) {
when (idx === UInt(drivers.size - 1)) { state := s_done }
idx := idx + UInt(1)
}
io.finished := state === s_done
io.mem.acquire.valid := Bool(false)
io.mem.grant.ready := Bool(false)
drivers.zipWithIndex.foreach { case (driv, i) =>
val me = idx === UInt(i)
driv.io.start := me && state === s_run
driv.io.mem.acquire.ready := io.mem.acquire.ready && me
driv.io.mem.grant.valid := io.mem.grant.valid && me
driv.io.mem.grant.bits := io.mem.grant.bits
when (me) {
io.mem.acquire.valid := driv.io.mem.acquire.valid
io.mem.acquire.bits := driv.io.mem.acquire.bits
io.mem.grant.ready := driv.io.mem.grant.ready
finished := driv.io.finished
}
}
}

@ -0,0 +1,85 @@
package uncore.unittests
import Chisel._
import junctions._
import junctions.unittests._
import uncore.devices._
import uncore.tilelink._
import uncore.converters._
import cde.Parameters
class SmiConverterTest(implicit val p: Parameters) extends UnitTest
with HasTileLinkParameters {
val outermostParams = p.alterPartial({ case TLId => "Outermost" })
val smiWidth = 32
val smiDepth = 64
val tlDepth = (smiWidth * smiDepth) / tlDataBits
val smimem = Module(new SmiMem(smiWidth, smiDepth))
val conv = Module(new SmiIOTileLinkIOConverter(
smiWidth, log2Up(smiDepth))(outermostParams))
val driver = Module(new DriverSet(
(driverParams: Parameters) => {
implicit val p = driverParams
Seq(
Module(new PutSweepDriver(tlDepth)),
Module(new PutMaskDriver(smiWidth / 8)),
Module(new PutBlockSweepDriver(tlDepth / tlDataBeats)),
Module(new GetMultiWidthDriver))
})(outermostParams))
conv.io.tl <> driver.io.mem
smimem.io <> conv.io.smi
driver.io.start := io.start
io.finished := driver.io.finished
}
class ROMSlaveTest(implicit p: Parameters) extends UnitTest {
implicit val testName = "ROMSlaveTest"
val romdata = Seq(
BigInt("01234567deadbeef", 16),
BigInt("ab32fee8d00dfeed", 16))
val rombytes = romdata.map(_.toByteArray.reverse).flatten
val rom = Module(new ROMSlave(rombytes))
val driver = Module(new DriverSet(
(driverParams: Parameters) => {
implicit val p = driverParams
Seq(
Module(new GetMultiWidthDriver),
Module(new GetSweepDriver(romdata)),
Module(new GetBlockSweepDriver(romdata)))
}))
rom.io <> driver.io.mem
driver.io.start := io.start
io.finished := driver.io.finished
}
class TileLinkRAMTest(implicit val p: Parameters)
extends UnitTest with HasTileLinkParameters {
val depth = 2 * tlDataBeats
val ram = Module(new TileLinkTestRAM(depth))
val driver = Module(new DriverSet(
(driverParams: Parameters) => {
implicit val p = driverParams
Seq(
Module(new PutSweepDriver(depth)),
Module(new PutMaskDriver),
Module(new PutAtomicDriver),
Module(new PutBlockSweepDriver(depth / tlDataBeats)),
Module(new PrefetchDriver),
Module(new GetMultiWidthDriver))
}))
ram.io <> driver.io.mem
driver.io.start := io.start
io.finished := driver.io.finished
}
object UncoreUnitTests {
def apply(implicit p: Parameters): Seq[UnitTest] =
Seq(
Module(new SmiConverterTest),
Module(new ROMSlaveTest),
Module(new TileLinkRAMTest))
}

@ -0,0 +1,105 @@
// See LICENSE for license details.
package uncore.util
import Chisel._
import uncore.tilelink._
import cde.Parameters
import uncore.constants._
class StoreGen(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) {
val size = typ(log2Up(log2Up(maxSize)+1)-1,0)
def misaligned =
(addr & ((UInt(1) << size) - UInt(1))(log2Up(maxSize)-1,0)).orR
def mask = {
var res = UInt(1)
for (i <- 0 until log2Up(maxSize)) {
val upper = Mux(addr(i), res, UInt(0)) | Mux(size >= UInt(i+1), UInt((BigInt(1) << (1 << i))-1), UInt(0))
val lower = Mux(addr(i), UInt(0), res)
res = Cat(upper, lower)
}
res
}
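  // e.g. with maxSize = 8, a 4-byte store (size = 2) at byte address 4
  // yields mask = 0xf0, selecting the upper word of the 64-bit beat.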
protected def genData(i: Int): UInt =
if (i >= log2Up(maxSize)) dat
else Mux(size === UInt(i), Fill(1 << (log2Up(maxSize)-i), dat((8 << i)-1,0)), genData(i+1))
def data = genData(0)
def wordData = genData(2)
}
class StoreGenAligned(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) extends StoreGen(typ, addr, dat, maxSize) {
override def genData(i: Int) = dat
}
class LoadGen(typ: UInt, signed: Bool, addr: UInt, dat: UInt, zero: Bool, maxSize: Int) {
private val size = new StoreGen(typ, addr, dat, maxSize).size
private def genData(logMinSize: Int): UInt = {
var res = dat
for (i <- log2Up(maxSize)-1 to logMinSize by -1) {
val pos = 8 << i
val shifted = Mux(addr(i), res(2*pos-1,pos), res(pos-1,0))
val doZero = Bool(i == 0) && zero
val zeroed = Mux(doZero, UInt(0), shifted)
res = Cat(Mux(size === UInt(i) || doZero, Fill(8*maxSize-pos, signed && zeroed(pos-1)), res(8*maxSize-1,pos)), zeroed)
}
res
}
def wordData = genData(2)
def data = genData(0)
}
class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parameters) extends Module {
require(operandBits == 32 || operandBits == 64)
val io = new Bundle {
val addr = Bits(INPUT, log2Ceil(operandBits/8))
val cmd = Bits(INPUT, M_SZ)
val typ = Bits(INPUT, log2Ceil(log2Ceil(operandBits/8) + 1))
val lhs = Bits(INPUT, operandBits)
val rhs = Bits(INPUT, operandBits)
val out = Bits(OUTPUT, operandBits)
}
val storegen =
if(rhsIsAligned) new StoreGenAligned(io.typ, io.addr, io.rhs, operandBits/8)
else new StoreGen(io.typ, io.addr, io.rhs, operandBits/8)
val rhs = storegen.wordData
val sgned = io.cmd === M_XA_MIN || io.cmd === M_XA_MAX
val max = io.cmd === M_XA_MAX || io.cmd === M_XA_MAXU
val min = io.cmd === M_XA_MIN || io.cmd === M_XA_MINU
val adder_out =
if (operandBits == 32) io.lhs + rhs
else {
val mask = ~UInt(0,64) ^ (io.addr(2) << 31)
(io.lhs & mask) + (rhs & mask)
}
val less =
if (operandBits == 32) Mux(io.lhs(31) === rhs(31), io.lhs < rhs, Mux(sgned, io.lhs(31), io.rhs(31)))
else {
val word = !io.typ(0)
val cmp_lhs = Mux(word && !io.addr(2), io.lhs(31), io.lhs(63))
val cmp_rhs = Mux(word && !io.addr(2), rhs(31), rhs(63))
val lt_lo = io.lhs(31,0) < rhs(31,0)
val lt_hi = io.lhs(63,32) < rhs(63,32)
val eq_hi = io.lhs(63,32) === rhs(63,32)
val lt = Mux(word, Mux(io.addr(2), lt_hi, lt_lo), lt_hi || eq_hi && lt_lo)
Mux(cmp_lhs === cmp_rhs, lt, Mux(sgned, cmp_lhs, cmp_rhs))
}
val out = Mux(io.cmd === M_XA_ADD, adder_out,
Mux(io.cmd === M_XA_AND, io.lhs & rhs,
Mux(io.cmd === M_XA_OR, io.lhs | rhs,
Mux(io.cmd === M_XA_XOR, io.lhs ^ rhs,
Mux(Mux(less, min, max), io.lhs,
storegen.data)))))
val wmask = FillInterleaved(8, storegen.mask)
io.out := wmask & out | ~wmask & io.lhs
}
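// Example (a sketch; the values are illustrative): a 32-bit atomic add on
// the lower word of a 64-bit operand.
//   val amoalu = Module(new AMOALU(operandBits = 64))
//   amoalu.io.addr := UInt(0)   // word 0 of the doubleword
//   amoalu.io.cmd  := M_XA_ADD
//   amoalu.io.typ  := UInt(2)   // log2(4 bytes)
//   amoalu.io.lhs  := UInt(15)
//   amoalu.io.rhs  := UInt(3)
//   // io.out(31, 0) reads back as 18; the upper word stays io.lhs(63, 32)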

@ -0,0 +1,134 @@
package uncore.util
import Chisel._
import uncore.tilelink._
import cde.Parameters
// Produces 0-width value when counting to 1
class ZCounter(val n: Int) {
val value = Reg(init=UInt(0, log2Ceil(n)))
def inc(): Bool = {
if (n == 1) Bool(true)
else {
val wrap = value === UInt(n-1)
value := Mux(Bool(!isPow2(n)) && wrap, UInt(0), value + UInt(1))
wrap
}
}
}
object ZCounter {
def apply(n: Int) = new ZCounter(n)
def apply(cond: Bool, n: Int): (UInt, Bool) = {
val c = new ZCounter(n)
var wrap: Bool = null
when (cond) { wrap = c.inc() }
(c.value, cond && wrap)
}
}
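// Example (sketch): counts grant beats; when tlDataBeats == 1 the value is
// zero-width and no counting logic is generated.
//   val (beat, beats_done) = ZCounter(io.mem.grant.fire(), tlDataBeats)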
object TwoWayCounter {
def apply(up: Bool, down: Bool, max: Int): UInt = {
val cnt = Reg(init = UInt(0, log2Up(max+1)))
when (up && !down) { cnt := cnt + UInt(1) }
when (down && !up) { cnt := cnt - UInt(1) }
cnt
}
}
class BeatCounterStatus extends Bundle {
val idx = UInt()
val done = Bool()
}
class TwoWayBeatCounterStatus extends Bundle {
val pending = Bool()
val up = new BeatCounterStatus()
val down = new BeatCounterStatus()
}
/** Utility trait containing wiring functions to keep track of how many data beats have
 * been sent or received over a particular [[uncore.TileLinkChannel]] or pair of channels.
*
* Won't count message types that don't have data.
* Used in [[uncore.XactTracker]] and [[uncore.FinishUnit]].
*/
trait HasDataBeatCounters {
type HasBeat = TileLinkChannel with HasTileLinkBeatId
type HasId = TileLinkChannel with HasClientId
/** Returns the current count on this channel and when a message is done
* @param inc increment the counter (usually .valid or .fire())
* @param data the actual channel data
* @param beat count to return for single-beat messages
*/
def connectDataBeatCounter[S <: TileLinkChannel](inc: Bool, data: S, beat: UInt) = {
val multi = data.hasMultibeatData()
val (multi_cnt, multi_done) = Counter(inc && multi, data.tlDataBeats)
val cnt = Mux(multi, multi_cnt, beat)
val done = Mux(multi, multi_done, inc)
(cnt, done)
}
/** Counter for beats on outgoing [[chisel.DecoupledIO]] */
def connectOutgoingDataBeatCounter[T <: TileLinkChannel](
out: DecoupledIO[T],
beat: UInt = UInt(0)): (UInt, Bool) =
connectDataBeatCounter(out.fire(), out.bits, beat)
/** Returns done but not cnt. Use the addr_beat subbundle instead of cnt for beats on
* incoming channels in case of network reordering.
*/
def connectIncomingDataBeatCounter[T <: TileLinkChannel](in: DecoupledIO[T]): Bool =
connectDataBeatCounter(in.fire(), in.bits, UInt(0))._2
  /** Counter for beats on incoming DecoupledIO[LogicalNetworkIO[T]]s; returns only done */
def connectIncomingDataBeatCounterWithHeader[T <: TileLinkChannel](in: DecoupledIO[LogicalNetworkIO[T]]): Bool =
connectDataBeatCounter(in.fire(), in.bits.payload, UInt(0))._2
/** If the network might interleave beats from different messages, we need a Vec of counters,
* one for every outstanding message id that might be interleaved.
*
* @param getId mapping from Message to counter id
*/
def connectIncomingDataBeatCountersWithHeader[T <: TileLinkChannel with HasClientTransactionId](
in: DecoupledIO[LogicalNetworkIO[T]],
entries: Int,
getId: LogicalNetworkIO[T] => UInt): Vec[Bool] = {
Vec((0 until entries).map { i =>
connectDataBeatCounter(in.fire() && getId(in.bits) === UInt(i), in.bits.payload, UInt(0))._2
})
}
  /** Provides counters on two channels, as well as a meta-counter that tracks how many
* messages have been sent over the up channel but not yet responded to over the down channel
*
* @param status bundle of status of the counters
* @param up outgoing channel
* @param down incoming channel
* @param max max number of outstanding ups with no down
* @param beat overrides cnts on single-beat messages
   * @param trackUp whether a given up message should be counted
   * @param trackDown whether a given down message should be counted
   * Rather than returning a tuple, the results (whether there are outstanding
   * messages, plus each direction's beat index and done signal) are written
   * into the status bundle.
*/
def connectTwoWayBeatCounters[T <: TileLinkChannel, S <: TileLinkChannel](
status: TwoWayBeatCounterStatus,
up: DecoupledIO[T],
down: DecoupledIO[S],
max: Int = 1,
beat: UInt = UInt(0),
trackUp: T => Bool = (t: T) => Bool(true),
trackDown: S => Bool = (s: S) => Bool(true)) {
val (up_idx, up_done) = connectDataBeatCounter(up.fire() && trackUp(up.bits), up.bits, beat)
val (dn_idx, dn_done) = connectDataBeatCounter(down.fire() && trackDown(down.bits), down.bits, beat)
val cnt = TwoWayCounter(up_done, dn_done, max)
status.pending := cnt > UInt(0)
status.up.idx := up_idx
status.up.done := up_done
status.down.idx := dn_idx
status.down.done := dn_done
}
}
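// Example (a sketch; io.acquire, io.grant, and nAcquires are illustrative)
// for a module mixing in HasDataBeatCounters:
//   val status = Wire(new TwoWayBeatCounterStatus)
//   connectTwoWayBeatCounters(status, io.acquire, io.grant, max = nAcquires)
//   // status.pending stays high while acquires await their grants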

@ -0,0 +1,56 @@
package uncore.util
import Chisel._
import uncore.tilelink._
import cde.Parameters
/** Struct for describing per-channel queue depths */
case class TileLinkDepths(acq: Int, prb: Int, rel: Int, gnt: Int, fin: Int)
/** Optionally enqueues each [[uncore.TileLinkChannel]] individually */
class TileLinkEnqueuer(depths: TileLinkDepths)(implicit p: Parameters) extends Module {
val io = new Bundle {
val client = new TileLinkIO().flip
val manager = new TileLinkIO
}
io.manager.acquire <> (if(depths.acq > 0) Queue(io.client.acquire, depths.acq) else io.client.acquire)
io.client.probe <> (if(depths.prb > 0) Queue(io.manager.probe, depths.prb) else io.manager.probe)
io.manager.release <> (if(depths.rel > 0) Queue(io.client.release, depths.rel) else io.client.release)
io.client.grant <> (if(depths.gnt > 0) Queue(io.manager.grant, depths.gnt) else io.manager.grant)
io.manager.finish <> (if(depths.fin > 0) Queue(io.client.finish, depths.fin) else io.client.finish)
}
object TileLinkEnqueuer {
def apply(in: TileLinkIO, depths: TileLinkDepths)(implicit p: Parameters): TileLinkIO = {
val t = Module(new TileLinkEnqueuer(depths))
t.io.client <> in
t.io.manager
}
def apply(in: TileLinkIO, depth: Int)(implicit p: Parameters): TileLinkIO = {
apply(in, TileLinkDepths(depth, depth, depth, depth, depth))
}
}
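// Example (sketch): buffer every channel of a client port by one entry.
//   val buffered = TileLinkEnqueuer(client.io.mem, 1)   // `client` is illustrative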
class ClientTileLinkEnqueuer(depths: TileLinkDepths)(implicit p: Parameters) extends Module {
val io = new Bundle {
val inner = new ClientTileLinkIO().flip
val outer = new ClientTileLinkIO
}
io.outer.acquire <> (if(depths.acq > 0) Queue(io.inner.acquire, depths.acq) else io.inner.acquire)
io.inner.probe <> (if(depths.prb > 0) Queue(io.outer.probe, depths.prb) else io.outer.probe)
io.outer.release <> (if(depths.rel > 0) Queue(io.inner.release, depths.rel) else io.inner.release)
io.inner.grant <> (if(depths.gnt > 0) Queue(io.outer.grant, depths.gnt) else io.outer.grant)
io.outer.finish <> (if(depths.fin > 0) Queue(io.inner.finish, depths.fin) else io.inner.finish)
}
object ClientTileLinkEnqueuer {
def apply(in: ClientTileLinkIO, depths: TileLinkDepths)(implicit p: Parameters): ClientTileLinkIO = {
val t = Module(new ClientTileLinkEnqueuer(depths))
t.io.inner <> in
t.io.outer
}
def apply(in: ClientTileLinkIO, depth: Int)(implicit p: Parameters): ClientTileLinkIO = {
apply(in, TileLinkDepths(depth, depth, depth, depth, depth))
}
}

@ -0,0 +1,25 @@
package uncore
import Chisel._
package object util {
implicit class UIntIsOneOf(val x: UInt) extends AnyVal {
def isOneOf(s: Seq[UInt]): Bool = s.map(x === _).reduce(_||_)
def isOneOf(u1: UInt, u2: UInt*): Bool = isOneOf(u1 +: u2.toSeq)
}
implicit class SeqToAugmentedSeq[T <: Data](val x: Seq[T]) extends AnyVal {
def apply(idx: UInt): T = {
if (x.size == 1) {
x.head
} else {
val half = 1 << (log2Ceil(x.size) - 1)
val newIdx = idx & UInt(half - 1)
Mux(idx >= UInt(half), x.drop(half)(newIdx), x.take(half)(newIdx))
}
}
def asUInt(): UInt = Cat(x.map(_.asUInt).reverse)
}
}
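// Examples (sketches; `state`, the enum values, and `sel` are illustrative):
//   state.isOneOf(s_put_req, s_get_req)   // Bool: state equals either value
//   val beats = Seq.fill(4)(Wire(UInt(width = 8)))
//   val picked = beats(sel)               // sel: UInt; muxes into the Scala Seq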

@ -0,0 +1,69 @@
// See LICENSE for license details.
package uncore.util
import Chisel._
import uncore.tilelink._
class FlowThroughSerializer[T <: Bundle with HasTileLinkData](gen: T, n: Int) extends Module {
val io = new Bundle {
val in = Decoupled(gen).flip
val out = Decoupled(gen)
val cnt = UInt(OUTPUT, log2Up(n))
val done = Bool(OUTPUT)
}
val narrowWidth = io.in.bits.data.getWidth / n
require(io.in.bits.data.getWidth % narrowWidth == 0)
if(n == 1) {
io.out <> io.in
io.cnt := UInt(0)
io.done := Bool(true)
} else {
val cnt = Reg(init=UInt(0, width = log2Up(n)))
val wrap = cnt === UInt(n-1)
val rbits = Reg{io.in.bits}
val active = Reg(init=Bool(false))
val shifter = Wire(Vec(n, Bits(width = narrowWidth)))
(0 until n).foreach {
i => shifter(i) := rbits.data((i+1)*narrowWidth-1,i*narrowWidth)
}
io.done := Bool(false)
io.cnt := cnt
io.in.ready := !active
io.out.valid := active || io.in.valid
io.out.bits := io.in.bits
when(!active && io.in.valid) {
when(io.in.bits.hasData()) {
cnt := Mux(io.out.ready, UInt(1), UInt(0))
rbits := io.in.bits
active := Bool(true)
}
io.done := !io.in.bits.hasData()
}
when(active) {
io.out.bits := rbits
io.out.bits.data := shifter(cnt)
when(io.out.ready) {
cnt := cnt + UInt(1)
when(wrap) {
cnt := UInt(0)
io.done := Bool(true)
active := Bool(false)
}
}
}
}
}
object FlowThroughSerializer {
def apply[T <: Bundle with HasTileLinkData](in: DecoupledIO[T], n: Int): DecoupledIO[T] = {
val fs = Module(new FlowThroughSerializer(in.bits, n))
fs.io.in.valid := in.valid
fs.io.in.bits := in.bits
in.ready := fs.io.in.ready
fs.io.out
}
}
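// Example (sketch): split each wide data beat into n narrow beats, e.g. when
// crossing to a narrower interconnect.
//   val narrowed = FlowThroughSerializer(wide.acquire, n)   // `wide` is illustrative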