Heterogeneous Tiles (#550)
Fundamental new features:

* Added tile package: This package is intended to hold components reusable across different types of tile. It will be the future location of TL2-RoCC accelerators and of new diplomatic versions of intra-tile interfaces.
* Adopted the [ModuleName]Params convention: The code base was very inconsistent about what to name the case classes that provide parameters to modules. We settled on calling them [ModuleName]Params to distinguish them from config.Parameters and config.Config. So far this has been applied mostly to case classes defined within rocket and tile.
* Defined RocketTileParams: A nested case class containing case classes for all the components of a tile (L1 caches and core). This allows all such parameters to vary per tile.
* Defined RocketCoreParams: All the parameters that can be varied per core.
* Defined L1CacheParams: A trait defining the parameters common to L1 caches, made concrete in different derived case classes.
* Defined RocketTilesKey: A sequence of RocketTileParams, one for every tile to be created.
* Provided HeterogeneousDualCoreConfig: An example of making a heterogeneous chip with two cores, one big and one little (see the sketch after this list).
* Changes to legacy code: ReplacementPolicy moved to package util. L1Metadata moved to package tile. The legacy L2 cache agent was removed because it can no longer share the metadata array implementation with the L1. Legacy GroundTests are on life support.

Additional changes that got rolled in along the way:

* rocket: Fix the critical path through the BTB when the I$ index bits exceed pgIdxBits.
* coreplex: Tiles are now connected via :=*.
* groundtest: Updated to use TileParams.
* tilelink: Cache cork requirements are relaxed to allow more cacheless masters.
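To make the per-tile parameterization concrete, here is a minimal sketch of what a two-tile big/little configuration can look like. It is illustrative only: the Config constructor style and the exact RocketTileParams field names and defaults are assumptions inferred from this commit's description and diff, not a copy of the HeterogeneousDualCoreConfig it adds.

```scala
// Hypothetical sketch only -- constructor style, field names, and defaults are assumed,
// not taken verbatim from this commit.
class ExampleBigLittleConfig extends Config((site, here, up) => {
  // RocketTilesKey holds one RocketTileParams per tile, so every tile can carry
  // its own core and L1 cache parameters.
  case RocketTilesKey => Seq(
    // "big" tile: 4-way set-associative L1 caches and a 40-entry BTB
    RocketTileParams(
      btb    = Some(BTBParams(nEntries = 40)),
      icache = Some(ICacheParams(nSets = 64, nWays = 4)),
      dcache = Some(DCacheParams(nSets = 64, nWays = 4))),
    // "little" tile: direct-mapped L1 caches and no BTB
    RocketTileParams(
      btb    = None,
      icache = Some(ICacheParams(nSets = 64, nWays = 1)),
      dcache = Some(DCacheParams(nSets = 64, nWays = 1))))
})
```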
@@ -5,6 +5,7 @@ package rocket

import Chisel._
import config._
import tile._
import Instructions._

object ALU
@@ -4,23 +4,22 @@
package rocket

import Chisel._
import config._
import util._
import Chisel.ImplicitConversions._
import uncore.util.PseudoLRU
import config._
import tile.HasCoreParameters
import util._

case object BtbKey extends Field[BtbParameters]

case class BtbParameters(
case class BTBParams(
nEntries: Int = 40,
nRAS: Int = 2,
updatesOutOfOrder: Boolean = false)

abstract trait HasBtbParameters extends HasCoreParameters {
val matchBits = pgIdxBits
val entries = p(BtbKey).nEntries
val nRAS = p(BtbKey).nRAS
val updatesOutOfOrder = p(BtbKey).updatesOutOfOrder
trait HasBtbParameters extends HasCoreParameters {
val btbParams = tileParams.btb.getOrElse(BTBParams(nEntries = 0))
val matchBits = pgIdxBits max log2Ceil(p(coreplex.CacheBlockBytes) * tileParams.icache.get.nSets)
val entries = btbParams.nEntries
val nRAS = btbParams.nRAS
val updatesOutOfOrder = btbParams.updatesOutOfOrder
val nPages = ((1 max(log2Up(entries)))+1)/2*2 // control logic assumes 2 divides pages
val opaqueBits = log2Up(entries)
val nBHT = 1 << log2Up(entries*2)
@@ -3,9 +3,10 @@
package rocket

import Chisel._
import util._
import Chisel.ImplicitConversions._
import config._
import tile._
import util._

class BPControl(implicit p: Parameters) extends CoreBundle()(p) {
val ttype = UInt(width = 4)
@@ -6,6 +6,7 @@ package rocket
import Chisel._
import Instructions._
import config._
import tile._
import uncore.devices._
import util._
import Chisel.ImplicitConversions._
@@ -60,14 +61,6 @@ class DCSR extends Bundle {
val prv = UInt(width = PRV.SZ)
}

class TileInterrupts(implicit p: Parameters) extends CoreBundle()(p) {
val debug = Bool()
val mtip = Bool()
val msip = Bool()
val meip = Bool()
val seip = usingVM.option(Bool())
}

class MIP extends Bundle {
val rocc = Bool()
val meip = Bool()
@@ -127,7 +120,8 @@ object CSR
val nCtr = firstHPM + nHPM
}

class CSRFileIO(implicit p: Parameters) extends CoreBundle {
class CSRFileIO(implicit p: Parameters) extends CoreBundle
with HasRocketCoreParameters {
val interrupts = new TileInterrupts().asInput
val hartid = UInt(INPUT, xLen)
val rw = new Bundle {
@@ -163,7 +157,7 @@ class CSRFileIO(implicit p: Parameters) extends CoreBundle {
}

class CSRFile(implicit p: Parameters) extends CoreModule()(p)
{
with HasRocketCoreParameters {
val io = new CSRFileIO

val reset_mstatus = Wire(init=new MStatus().fromBits(0))
@@ -227,7 +221,7 @@ class CSRFile(implicit p: Parameters) extends CoreModule()(p)
val reg_mbadaddr = Reg(UInt(width = vaddrBitsExtended))
val reg_mscratch = Reg(Bits(width = xLen))
val mtvecWidth = paddrBits min xLen
val reg_mtvec = p(MtvecInit) match {
val reg_mtvec = mtvecInit match {
case Some(addr) => Reg(init=UInt(addr, mtvecWidth))
case None => Reg(UInt(width = mtvecWidth))
}
@@ -547,7 +541,7 @@ class CSRFile(implicit p: Parameters) extends CoreModule()(p)
when (decoded_addr(CSRs.mie)) { reg_mie := wdata & supported_interrupts }
when (decoded_addr(CSRs.mepc)) { reg_mepc := formEPC(wdata) }
when (decoded_addr(CSRs.mscratch)) { reg_mscratch := wdata }
if (p(MtvecWritable))
if (mtvecWritable)
when (decoded_addr(CSRs.mtvec)) { reg_mtvec := wdata >> 2 << 2 }
when (decoded_addr(CSRs.mcause)) { reg_mcause := wdata & UInt((BigInt(1) << (xLen-1)) + 31) /* only implement 5 LSBs and MSB */ }
when (decoded_addr(CSRs.mbadaddr)) { reg_mbadaddr := wdata(vaddrBitsExtended-1,0) }
@@ -1,93 +0,0 @@
// See LICENSE.SiFive for license details.

package rocket

import Chisel._
import config._
import uncore.tilelink2.TLEdgeOut
import uncore.util.{CacheName, CacheBlockBytes}
import util._

case object BuildCore extends Field[(RocketConfig, Parameters) => CoreModule with HasCoreIO]
case object SharedMemoryTLEdge extends Field[TLEdgeOut]

trait HasCoreParameters {
implicit val p: Parameters
val xLen = p(XLen)
val fLen = xLen // TODO relax this

val usingVM = p(UseVM)
val usingUser = p(UseUser) || usingVM
val usingDebug = p(UseDebug)
val usingMulDiv = p(MulDivKey).nonEmpty
val usingFPU = p(FPUKey).nonEmpty
val usingAtomics = p(UseAtomics)
val usingCompressed = p(UseCompressed)
val usingRoCC = !p(BuildRoCC).isEmpty
val fastLoadWord = p(FastLoadWord)
val fastLoadByte = p(FastLoadByte)
val fastJAL = p(FastJAL)
val nBreakpoints = p(NBreakpoints)
val nPerfCounters = p(NPerfCounters)
val nPerfEvents = p(NPerfEvents)
val usingDataScratchpad = p(DataScratchpadSize) > 0

val retireWidth = p(RetireWidth)
val fetchWidth = p(FetchWidth)
val coreInstBits = p(CoreInstBits)
val coreInstBytes = coreInstBits/8
val coreDataBits = xLen
val coreDataBytes = coreDataBits/8

val dcacheArbPorts = 1 + usingVM.toInt + usingDataScratchpad.toInt + p(BuildRoCC).size
val coreDCacheReqTagBits = 6
val dcacheReqTagBits = coreDCacheReqTagBits + log2Ceil(dcacheArbPorts)

def pgIdxBits = 12
def pgLevelBits = 10 - log2Ceil(xLen / 32)
def vaddrBits = pgIdxBits + pgLevels * pgLevelBits
val paddrBits = p(PAddrBits)
def ppnBits = paddrBits - pgIdxBits
def vpnBits = vaddrBits - pgIdxBits
val pgLevels = p(PgLevels)
val asIdBits = p(ASIdBits)
val vpnBitsExtended = vpnBits + (vaddrBits < xLen).toInt
val vaddrBitsExtended = vpnBitsExtended + pgIdxBits
val coreMaxAddrBits = paddrBits max vaddrBitsExtended
val nCustomMrwCsrs = p(NCustomMRWCSRs)

// fetchWidth doubled, but coreInstBytes halved, for RVC
val decodeWidth = fetchWidth / (if (usingCompressed) 2 else 1)

// Print out log of committed instructions and their writeback values.
// Requires post-processing due to out-of-order writebacks.
val enableCommitLog = false

val maxPAddrBits = xLen match {
case 32 => 34
case 64 => 50
}

require(paddrBits <= maxPAddrBits)
require(!fastLoadByte || fastLoadWord)
}

abstract class CoreModule(implicit val p: Parameters) extends Module
with HasCoreParameters

abstract class CoreBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
with HasCoreParameters

trait HasCoreIO {
implicit val p: Parameters
val io = new Bundle {
val interrupts = new TileInterrupts().asInput
val hartid = UInt(INPUT, p(XLen))
val imem = new FrontendIO()(p.alterPartial({case CacheName => CacheName("L1I") }))
val dmem = new HellaCacheIO()(p.alterPartial({ case CacheName => CacheName("L1D") }))
val ptw = new DatapathPTWIO().flip
val fpu = new FPUCoreIO().flip
val rocc = new RoCCCoreIO().flip
}
}
@@ -4,13 +4,13 @@ package rocket

import Chisel._
import Chisel.ImplicitConversions._
import config._
import diplomacy._
import uncore.constants._
import uncore.tilelink2._
import uncore.util._
import util._
import TLMessages._
import config._

class DCacheDataReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) {
val addr = Bits(width = untagBits)
@@ -38,22 +38,19 @@ class DCacheDataArray(implicit p: Parameters) extends L1HellaCacheModule()(p) {
}
}

class DCache(cfg: DCacheConfig, val scratch: () => Option[AddressSet] = () => None)(implicit p: Parameters) extends HellaCache(cfg)(p) {
class DCache(val scratch: () => Option[AddressSet] = () => None)(implicit p: Parameters) extends HellaCache()(p) {
override lazy val module = new DCacheModule(this)
}

class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {

val maxUncachedInFlight = cfg.nMMIOs

require(rowBits == encRowBits) // no ECC

val grantackq = Module(new Queue(tl_out.e.bits,1)) // TODO don't need this in scratchpad mode

// tags
val replacer = p(Replacer)()
val replacer = cacheParams.replacement
def onReset = L1Metadata(UInt(0), ClientMetadata.onReset)
val metaReadArb = Module(new Arbiter(new MetaReadReq, 3))
val metaReadArb = Module(new Arbiter(new L1MetaReadReq, 3))
val metaWriteArb = Module(new Arbiter(new L1MetaWriteReq, 3))

// data
@@ -104,7 +101,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
when (!metaReadArb.io.in(2).ready) { io.cpu.req.ready := false }

// address translation
val tlb = Module(new TLB)
val tlb = Module(new TLB(nTLBEntries))
io.ptw <> tlb.io.ptw
tlb.io.req.valid := s1_valid_masked && s1_readwrite
tlb.io.req.bits.passthrough := s1_req.phys
@@ -126,7 +123,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
val hitState = Mux(inScratchpad, ClientMetadata.maximum, ClientMetadata.onReset)
(inScratchpad, hitState, L1Metadata(UInt(0), ClientMetadata.onReset))
} else {
val meta = Module(new MetadataArray(onReset _))
val meta = Module(new L1MetadataArray(onReset _))
meta.io.read <> metaReadArb.io.out
meta.io.write <> metaWriteArb.io.out
val s1_meta = meta.io.resp
@ -1,753 +0,0 @@
|
||||
// See LICENSE.Berkeley for license details.
|
||||
// See LICENSE.SiFive for license details.
|
||||
|
||||
package rocket
|
||||
|
||||
import Chisel._
|
||||
import Instructions._
|
||||
import util._
|
||||
import Chisel.ImplicitConversions._
|
||||
import FPConstants._
|
||||
import uncore.constants.MemoryOpConstants._
|
||||
import config._
|
||||
|
||||
case class FPUConfig(
|
||||
divSqrt: Boolean = true,
|
||||
sfmaLatency: Int = 3,
|
||||
dfmaLatency: Int = 4
|
||||
)
|
||||
|
||||
object FPConstants
|
||||
{
|
||||
def FCMD_ADD = BitPat("b0??00")
|
||||
def FCMD_SUB = BitPat("b0??01")
|
||||
def FCMD_MUL = BitPat("b0??10")
|
||||
def FCMD_MADD = BitPat("b1??00")
|
||||
def FCMD_MSUB = BitPat("b1??01")
|
||||
def FCMD_NMSUB = BitPat("b1??10")
|
||||
def FCMD_NMADD = BitPat("b1??11")
|
||||
def FCMD_DIV = BitPat("b?0011")
|
||||
def FCMD_SQRT = BitPat("b?1011")
|
||||
def FCMD_SGNJ = BitPat("b??1?0")
|
||||
def FCMD_MINMAX = BitPat("b?01?1")
|
||||
def FCMD_CVT_FF = BitPat("b??0??")
|
||||
def FCMD_CVT_IF = BitPat("b?10??")
|
||||
def FCMD_CMP = BitPat("b?01??")
|
||||
def FCMD_MV_XF = BitPat("b?11??")
|
||||
def FCMD_CVT_FI = BitPat("b??0??")
|
||||
def FCMD_MV_FX = BitPat("b??1??")
|
||||
def FCMD_X = BitPat("b?????")
|
||||
val FCMD_WIDTH = 5
|
||||
|
||||
val RM_SZ = 3
|
||||
val FLAGS_SZ = 5
|
||||
}
|
||||
|
||||
trait HasFPUCtrlSigs {
|
||||
val cmd = Bits(width = FCMD_WIDTH)
|
||||
val ldst = Bool()
|
||||
val wen = Bool()
|
||||
val ren1 = Bool()
|
||||
val ren2 = Bool()
|
||||
val ren3 = Bool()
|
||||
val swap12 = Bool()
|
||||
val swap23 = Bool()
|
||||
val single = Bool()
|
||||
val fromint = Bool()
|
||||
val toint = Bool()
|
||||
val fastpipe = Bool()
|
||||
val fma = Bool()
|
||||
val div = Bool()
|
||||
val sqrt = Bool()
|
||||
val round = Bool()
|
||||
val wflags = Bool()
|
||||
}
|
||||
|
||||
class FPUCtrlSigs extends Bundle with HasFPUCtrlSigs
|
||||
|
||||
class FPUDecoder(implicit p: Parameters) extends FPUModule()(p) {
|
||||
val io = new Bundle {
|
||||
val inst = Bits(INPUT, 32)
|
||||
val sigs = new FPUCtrlSigs().asOutput
|
||||
}
|
||||
|
||||
val default = List(FCMD_X, X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X)
|
||||
val f =
|
||||
Array(FLW -> List(FCMD_X, Y,Y,N,N,N,X,X,Y,N,N,N,N,N,N,N,N),
|
||||
FSW -> List(FCMD_MV_XF, Y,N,N,Y,N,Y,X,Y,N,Y,N,N,N,N,N,N),
|
||||
FMV_S_X -> List(FCMD_MV_FX, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,N),
|
||||
FCVT_S_W -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y),
|
||||
FCVT_S_WU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y),
|
||||
FCVT_S_L -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y),
|
||||
FCVT_S_LU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y),
|
||||
FMV_X_S -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,N),
|
||||
FCLASS_S -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,N),
|
||||
FCVT_W_S -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y),
|
||||
FCVT_WU_S-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y),
|
||||
FCVT_L_S -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y),
|
||||
FCVT_LU_S-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y),
|
||||
FEQ_S -> List(FCMD_CMP, N,N,Y,Y,N,N,N,Y,N,Y,N,N,N,N,N,Y),
|
||||
FLT_S -> List(FCMD_CMP, N,N,Y,Y,N,N,N,Y,N,Y,N,N,N,N,N,Y),
|
||||
FLE_S -> List(FCMD_CMP, N,N,Y,Y,N,N,N,Y,N,Y,N,N,N,N,N,Y),
|
||||
FSGNJ_S -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,N),
|
||||
FSGNJN_S -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,N),
|
||||
FSGNJX_S -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,N),
|
||||
FMIN_S -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,Y),
|
||||
FMAX_S -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,Y),
|
||||
FADD_S -> List(FCMD_ADD, N,Y,Y,Y,N,N,Y,Y,N,N,N,Y,N,N,Y,Y),
|
||||
FSUB_S -> List(FCMD_SUB, N,Y,Y,Y,N,N,Y,Y,N,N,N,Y,N,N,Y,Y),
|
||||
FMUL_S -> List(FCMD_MUL, N,Y,Y,Y,N,N,N,Y,N,N,N,Y,N,N,Y,Y),
|
||||
FMADD_S -> List(FCMD_MADD, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y),
|
||||
FMSUB_S -> List(FCMD_MSUB, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y),
|
||||
FNMADD_S -> List(FCMD_NMADD, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y),
|
||||
FNMSUB_S -> List(FCMD_NMSUB, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y),
|
||||
FDIV_S -> List(FCMD_DIV, N,Y,Y,Y,N,N,N,Y,N,N,N,N,Y,N,Y,Y),
|
||||
FSQRT_S -> List(FCMD_SQRT, N,Y,Y,N,N,Y,X,Y,N,N,N,N,N,Y,Y,Y))
|
||||
val d =
|
||||
Array(FLD -> List(FCMD_X, Y,Y,N,N,N,X,X,N,N,N,N,N,N,N,N,N),
|
||||
FSD -> List(FCMD_MV_XF, Y,N,N,Y,N,Y,X,N,N,Y,N,N,N,N,N,N),
|
||||
FMV_D_X -> List(FCMD_MV_FX, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,N),
|
||||
FCVT_D_W -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y),
|
||||
FCVT_D_WU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y),
|
||||
FCVT_D_L -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y),
|
||||
FCVT_D_LU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y),
|
||||
FMV_X_D -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,N),
|
||||
FCLASS_D -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,N),
|
||||
FCVT_W_D -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y),
|
||||
FCVT_WU_D-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y),
|
||||
FCVT_L_D -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y),
|
||||
FCVT_LU_D-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y),
|
||||
FCVT_S_D -> List(FCMD_CVT_FF, N,Y,Y,N,N,N,X,Y,N,N,Y,N,N,N,Y,Y),
|
||||
FCVT_D_S -> List(FCMD_CVT_FF, N,Y,Y,N,N,N,X,N,N,N,Y,N,N,N,Y,Y),
|
||||
FEQ_D -> List(FCMD_CMP, N,N,Y,Y,N,N,N,N,N,Y,N,N,N,N,N,Y),
|
||||
FLT_D -> List(FCMD_CMP, N,N,Y,Y,N,N,N,N,N,Y,N,N,N,N,N,Y),
|
||||
FLE_D -> List(FCMD_CMP, N,N,Y,Y,N,N,N,N,N,Y,N,N,N,N,N,Y),
|
||||
FSGNJ_D -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,N),
|
||||
FSGNJN_D -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,N),
|
||||
FSGNJX_D -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,N),
|
||||
FMIN_D -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,Y),
|
||||
FMAX_D -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,Y),
|
||||
FADD_D -> List(FCMD_ADD, N,Y,Y,Y,N,N,Y,N,N,N,N,Y,N,N,Y,Y),
|
||||
FSUB_D -> List(FCMD_SUB, N,Y,Y,Y,N,N,Y,N,N,N,N,Y,N,N,Y,Y),
|
||||
FMUL_D -> List(FCMD_MUL, N,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,Y,Y),
|
||||
FMADD_D -> List(FCMD_MADD, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y),
|
||||
FMSUB_D -> List(FCMD_MSUB, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y),
|
||||
FNMADD_D -> List(FCMD_NMADD, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y),
|
||||
FNMSUB_D -> List(FCMD_NMSUB, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y),
|
||||
FDIV_D -> List(FCMD_DIV, N,Y,Y,Y,N,N,N,N,N,N,N,N,Y,N,Y,Y),
|
||||
FSQRT_D -> List(FCMD_SQRT, N,Y,Y,N,N,Y,X,N,N,N,N,N,N,Y,Y,Y))
|
||||
|
||||
val insns = fLen match {
|
||||
case 32 => f
|
||||
case 64 => f ++ d
|
||||
}
|
||||
val decoder = DecodeLogic(io.inst, default, insns)
|
||||
val s = io.sigs
|
||||
val sigs = Seq(s.cmd, s.ldst, s.wen, s.ren1, s.ren2, s.ren3, s.swap12,
|
||||
s.swap23, s.single, s.fromint, s.toint, s.fastpipe, s.fma,
|
||||
s.div, s.sqrt, s.round, s.wflags)
|
||||
sigs zip decoder map {case(s,d) => s := d}
|
||||
}
|
||||
|
||||
class FPUCoreIO(implicit p: Parameters) extends CoreBundle()(p) {
|
||||
val inst = Bits(INPUT, 32)
|
||||
val fromint_data = Bits(INPUT, xLen)
|
||||
|
||||
val fcsr_rm = Bits(INPUT, FPConstants.RM_SZ)
|
||||
val fcsr_flags = Valid(Bits(width = FPConstants.FLAGS_SZ))
|
||||
|
||||
val store_data = Bits(OUTPUT, fLen)
|
||||
val toint_data = Bits(OUTPUT, xLen)
|
||||
|
||||
val dmem_resp_val = Bool(INPUT)
|
||||
val dmem_resp_type = Bits(INPUT, 3)
|
||||
val dmem_resp_tag = UInt(INPUT, 5)
|
||||
val dmem_resp_data = Bits(INPUT, fLen)
|
||||
|
||||
val valid = Bool(INPUT)
|
||||
val fcsr_rdy = Bool(OUTPUT)
|
||||
val nack_mem = Bool(OUTPUT)
|
||||
val illegal_rm = Bool(OUTPUT)
|
||||
val killx = Bool(INPUT)
|
||||
val killm = Bool(INPUT)
|
||||
val dec = new FPUCtrlSigs().asOutput
|
||||
val sboard_set = Bool(OUTPUT)
|
||||
val sboard_clr = Bool(OUTPUT)
|
||||
val sboard_clra = UInt(OUTPUT, 5)
|
||||
}
|
||||
|
||||
class FPUIO(implicit p: Parameters) extends FPUCoreIO ()(p) {
|
||||
val cp_req = Decoupled(new FPInput()).flip //cp doesn't pay attn to kill sigs
|
||||
val cp_resp = Decoupled(new FPResult())
|
||||
}
|
||||
|
||||
class FPResult(implicit p: Parameters) extends CoreBundle()(p) {
|
||||
val data = Bits(width = fLen+1)
|
||||
val exc = Bits(width = 5)
|
||||
}
|
||||
|
||||
class FPInput(implicit p: Parameters) extends CoreBundle()(p) with HasFPUCtrlSigs {
|
||||
val rm = Bits(width = 3)
|
||||
val typ = Bits(width = 2)
|
||||
val in1 = Bits(width = fLen+1)
|
||||
val in2 = Bits(width = fLen+1)
|
||||
val in3 = Bits(width = fLen+1)
|
||||
|
||||
override def cloneType = new FPInput().asInstanceOf[this.type]
|
||||
}
|
||||
|
||||
object ClassifyRecFN {
|
||||
def apply(expWidth: Int, sigWidth: Int, in: UInt) = {
|
||||
val sign = in(sigWidth + expWidth)
|
||||
val exp = in(sigWidth + expWidth - 1, sigWidth - 1)
|
||||
val sig = in(sigWidth - 2, 0)
|
||||
|
||||
val code = exp(expWidth,expWidth-2)
|
||||
val codeHi = code(2, 1)
|
||||
val isSpecial = codeHi === UInt(3)
|
||||
|
||||
val isHighSubnormalIn = exp(expWidth-2, 0) < UInt(2)
|
||||
val isSubnormal = code === UInt(1) || codeHi === UInt(1) && isHighSubnormalIn
|
||||
val isNormal = codeHi === UInt(1) && !isHighSubnormalIn || codeHi === UInt(2)
|
||||
val isZero = code === UInt(0)
|
||||
val isInf = isSpecial && !exp(expWidth-2)
|
||||
val isNaN = code.andR
|
||||
val isSNaN = isNaN && !sig(sigWidth-2)
|
||||
val isQNaN = isNaN && sig(sigWidth-2)
|
||||
|
||||
Cat(isQNaN, isSNaN, isInf && !sign, isNormal && !sign,
|
||||
isSubnormal && !sign, isZero && !sign, isZero && sign,
|
||||
isSubnormal && sign, isNormal && sign, isInf && sign)
|
||||
}
|
||||
}
|
||||
|
||||
object IsNaNRecFN {
|
||||
def apply(expWidth: Int, sigWidth: Int, in: UInt) =
|
||||
in(sigWidth + expWidth - 1, sigWidth + expWidth - 3).andR
|
||||
}
|
||||
|
||||
object IsSNaNRecFN {
|
||||
def apply(expWidth: Int, sigWidth: Int, in: UInt) =
|
||||
IsNaNRecFN(expWidth, sigWidth, in) && !in(sigWidth - 2)
|
||||
}
|
||||
|
||||
/** Format conversion without rounding or NaN handling */
|
||||
object RecFNToRecFN_noncompliant {
|
||||
def apply(in: UInt, inExpWidth: Int, inSigWidth: Int, outExpWidth: Int, outSigWidth: Int) = {
|
||||
val sign = in(inSigWidth + inExpWidth)
|
||||
val fractIn = in(inSigWidth - 2, 0)
|
||||
val expIn = in(inSigWidth + inExpWidth - 1, inSigWidth - 1)
|
||||
val fractOut = fractIn << outSigWidth >> inSigWidth
|
||||
val expOut = {
|
||||
val expCode = expIn(inExpWidth, inExpWidth - 2)
|
||||
val commonCase = (expIn + (1 << outExpWidth)) - (1 << inExpWidth)
|
||||
Mux(expCode === 0 || expCode >= 6, Cat(expCode, commonCase(outExpWidth - 3, 0)),
|
||||
commonCase(outExpWidth, 0))
|
||||
}
|
||||
Cat(sign, expOut, fractOut)
|
||||
}
|
||||
}
|
||||
|
||||
object CanonicalNaN {
|
||||
def apply(expWidth: Int, sigWidth: Int): UInt =
|
||||
UInt((BigInt(7) << (expWidth + sigWidth - 3)) + (BigInt(1) << (sigWidth - 2)), expWidth + sigWidth + 1)
|
||||
}
|
||||
|
||||
trait HasFPUParameters {
|
||||
val fLen: Int
|
||||
val (sExpWidth, sSigWidth) = (8, 24)
|
||||
val (dExpWidth, dSigWidth) = (11, 53)
|
||||
val floatWidths = fLen match {
|
||||
case 32 => List((sExpWidth, sSigWidth))
|
||||
case 64 => List((sExpWidth, sSigWidth), (dExpWidth, dSigWidth))
|
||||
}
|
||||
val maxExpWidth = floatWidths.map(_._1).max
|
||||
val maxSigWidth = floatWidths.map(_._2).max
|
||||
}
|
||||
|
||||
abstract class FPUModule(implicit p: Parameters) extends CoreModule()(p) with HasFPUParameters
|
||||
|
||||
class FPToInt(implicit p: Parameters) extends FPUModule()(p) {
|
||||
class Output extends Bundle {
|
||||
val lt = Bool()
|
||||
val store = Bits(width = fLen)
|
||||
val toint = Bits(width = xLen)
|
||||
val exc = Bits(width = 5)
|
||||
override def cloneType = new Output().asInstanceOf[this.type]
|
||||
}
|
||||
val io = new Bundle {
|
||||
val in = Valid(new FPInput).flip
|
||||
val as_double = new FPInput().asOutput
|
||||
val out = Valid(new Output)
|
||||
}
|
||||
|
||||
val in = Reg(new FPInput)
|
||||
val valid = Reg(next=io.in.valid)
|
||||
|
||||
def upconvert(x: UInt) = RecFNToRecFN_noncompliant(x, sExpWidth, sSigWidth, maxExpWidth, maxSigWidth)
|
||||
|
||||
when (io.in.valid) {
|
||||
in := io.in.bits
|
||||
if (fLen > 32) when (io.in.bits.single && !io.in.bits.ldst && io.in.bits.cmd =/= FCMD_MV_XF) {
|
||||
in.in1 := upconvert(io.in.bits.in1)
|
||||
in.in2 := upconvert(io.in.bits.in2)
|
||||
}
|
||||
}
|
||||
|
||||
val unrec_s = hardfloat.fNFromRecFN(sExpWidth, sSigWidth, in.in1).sextTo(xLen)
|
||||
val unrec_mem = fLen match {
|
||||
case 32 => unrec_s
|
||||
case 64 =>
|
||||
val unrec_d = hardfloat.fNFromRecFN(dExpWidth, dSigWidth, in.in1).sextTo(xLen)
|
||||
Mux(in.single, unrec_s, unrec_d)
|
||||
}
|
||||
val unrec_int = xLen match {
|
||||
case 32 => unrec_s
|
||||
case fLen => unrec_mem
|
||||
}
|
||||
|
||||
val classify_s = ClassifyRecFN(sExpWidth, sSigWidth, in.in1)
|
||||
val classify_out = fLen match {
|
||||
case 32 => classify_s
|
||||
case 64 =>
|
||||
val classify_d = ClassifyRecFN(dExpWidth, dSigWidth, in.in1)
|
||||
Mux(in.single, classify_s, classify_d)
|
||||
}
|
||||
|
||||
val dcmp = Module(new hardfloat.CompareRecFN(maxExpWidth, maxSigWidth))
|
||||
dcmp.io.a := in.in1
|
||||
dcmp.io.b := in.in2
|
||||
dcmp.io.signaling := !in.rm(1)
|
||||
|
||||
io.out.bits.toint := Mux(in.rm(0), classify_out, unrec_int)
|
||||
io.out.bits.store := unrec_mem
|
||||
io.out.bits.exc := Bits(0)
|
||||
|
||||
when (in.cmd === FCMD_CMP) {
|
||||
io.out.bits.toint := (~in.rm & Cat(dcmp.io.lt, dcmp.io.eq)).orR
|
||||
io.out.bits.exc := dcmp.io.exceptionFlags
|
||||
}
|
||||
when (in.cmd === FCMD_CVT_IF) {
|
||||
val minXLen = 32
|
||||
val n = log2Ceil(xLen/minXLen) + 1
|
||||
for (i <- 0 until n) {
|
||||
val conv = Module(new hardfloat.RecFNToIN(maxExpWidth, maxSigWidth, minXLen << i))
|
||||
conv.io.in := in.in1
|
||||
conv.io.roundingMode := in.rm
|
||||
conv.io.signedOut := ~in.typ(0)
|
||||
when (in.typ.extract(log2Ceil(n), 1) === i) {
|
||||
io.out.bits.toint := conv.io.out.sextTo(xLen)
|
||||
io.out.bits.exc := Cat(conv.io.intExceptionFlags(2, 1).orR, UInt(0, 3), conv.io.intExceptionFlags(0))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
io.out.valid := valid
|
||||
io.out.bits.lt := dcmp.io.lt
|
||||
io.as_double := in
|
||||
}
|
||||
|
||||
class IntToFP(val latency: Int)(implicit p: Parameters) extends FPUModule()(p) {
|
||||
val io = new Bundle {
|
||||
val in = Valid(new FPInput).flip
|
||||
val out = Valid(new FPResult)
|
||||
}
|
||||
|
||||
val in = Pipe(io.in)
|
||||
|
||||
val mux = Wire(new FPResult)
|
||||
mux.exc := Bits(0)
|
||||
mux.data := hardfloat.recFNFromFN(sExpWidth, sSigWidth, in.bits.in1)
|
||||
if (fLen > 32) when (!in.bits.single) {
|
||||
mux.data := hardfloat.recFNFromFN(dExpWidth, dSigWidth, in.bits.in1)
|
||||
}
|
||||
|
||||
val intValue = {
|
||||
val minXLen = 32
|
||||
val n = log2Ceil(xLen/minXLen) + 1
|
||||
val res = Wire(init = in.bits.in1.asSInt)
|
||||
for (i <- 0 until n-1) {
|
||||
val smallInt = in.bits.in1((minXLen << i) - 1, 0)
|
||||
when (in.bits.typ.extract(log2Ceil(n), 1) === i) {
|
||||
res := Mux(in.bits.typ(0), smallInt.zext, smallInt.asSInt)
|
||||
}
|
||||
}
|
||||
res.asUInt
|
||||
}
|
||||
|
||||
when (in.bits.cmd === FCMD_CVT_FI) {
|
||||
val l2s = Module(new hardfloat.INToRecFN(xLen, sExpWidth, sSigWidth))
|
||||
l2s.io.signedIn := ~in.bits.typ(0)
|
||||
l2s.io.in := intValue
|
||||
l2s.io.roundingMode := in.bits.rm
|
||||
mux.data := Cat(UInt((BigInt(1) << (fLen - 32)) - 1), l2s.io.out)
|
||||
mux.exc := l2s.io.exceptionFlags
|
||||
|
||||
fLen match {
|
||||
case 32 =>
|
||||
case 64 =>
|
||||
val l2d = Module(new hardfloat.INToRecFN(xLen, dExpWidth, dSigWidth))
|
||||
l2d.io.signedIn := ~in.bits.typ(0)
|
||||
l2d.io.in := intValue
|
||||
l2d.io.roundingMode := in.bits.rm
|
||||
when (!in.bits.single) {
|
||||
mux.data := Cat(UInt((BigInt(1) << (fLen - 64)) - 1), l2d.io.out)
|
||||
mux.exc := l2d.io.exceptionFlags
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
io.out <> Pipe(in.valid, mux, latency-1)
|
||||
}
|
||||
|
||||
class FPToFP(val latency: Int)(implicit p: Parameters) extends FPUModule()(p) {
|
||||
val io = new Bundle {
|
||||
val in = Valid(new FPInput).flip
|
||||
val out = Valid(new FPResult)
|
||||
val lt = Bool(INPUT) // from FPToInt
|
||||
}
|
||||
|
||||
val in = Pipe(io.in)
|
||||
|
||||
val signNum = Mux(in.bits.rm(1), in.bits.in1 ^ in.bits.in2, Mux(in.bits.rm(0), ~in.bits.in2, in.bits.in2))
|
||||
val fsgnj_s = Cat(signNum(32), in.bits.in1(31, 0))
|
||||
val fsgnj = fLen match {
|
||||
case 32 => fsgnj_s
|
||||
case 64 => Mux(in.bits.single, Cat(in.bits.in1 >> 33, fsgnj_s),
|
||||
Cat(signNum(64), in.bits.in1(63, 0)))
|
||||
}
|
||||
val mux = Wire(new FPResult)
|
||||
mux.exc := UInt(0)
|
||||
mux.data := fsgnj
|
||||
|
||||
when (in.bits.cmd === FCMD_MINMAX) {
|
||||
def doMinMax(expWidth: Int, sigWidth: Int) = {
|
||||
val isnan1 = IsNaNRecFN(expWidth, sigWidth, in.bits.in1)
|
||||
val isnan2 = IsNaNRecFN(expWidth, sigWidth, in.bits.in2)
|
||||
val issnan1 = IsSNaNRecFN(expWidth, sigWidth, in.bits.in1)
|
||||
val issnan2 = IsSNaNRecFN(expWidth, sigWidth, in.bits.in2)
|
||||
val invalid = issnan1 || issnan2
|
||||
val isNaNOut = invalid || (isnan1 && isnan2)
|
||||
val cNaN = floatWidths.filter(_._1 >= expWidth).map(x => CanonicalNaN(x._1, x._2)).reduce(_+_)
|
||||
(isnan2 || in.bits.rm(0) =/= io.lt && !isnan1, invalid, isNaNOut, cNaN)
|
||||
}
|
||||
val (isLHS, isInvalid, isNaNOut, cNaN) = fLen match {
|
||||
case 32 => doMinMax(sExpWidth, sSigWidth)
|
||||
case 64 => MuxT(in.bits.single, doMinMax(sExpWidth, sSigWidth), doMinMax(dExpWidth, dSigWidth))
|
||||
}
|
||||
mux.exc := isInvalid << 4
|
||||
mux.data := Mux(isNaNOut, cNaN, Mux(isLHS, in.bits.in1, in.bits.in2))
|
||||
}
|
||||
|
||||
fLen match {
|
||||
case 32 =>
|
||||
case 64 =>
|
||||
when (in.bits.cmd === FCMD_CVT_FF) {
|
||||
when (in.bits.single) {
|
||||
val d2s = Module(new hardfloat.RecFNToRecFN(dExpWidth, dSigWidth, sExpWidth, sSigWidth))
|
||||
d2s.io.in := in.bits.in1
|
||||
d2s.io.roundingMode := in.bits.rm
|
||||
mux.data := Cat(UInt((BigInt(1) << (fLen - 32)) - 1), d2s.io.out)
|
||||
mux.exc := d2s.io.exceptionFlags
|
||||
}.otherwise {
|
||||
val s2d = Module(new hardfloat.RecFNToRecFN(sExpWidth, sSigWidth, dExpWidth, dSigWidth))
|
||||
s2d.io.in := in.bits.in1
|
||||
s2d.io.roundingMode := in.bits.rm
|
||||
mux.data := s2d.io.out
|
||||
mux.exc := s2d.io.exceptionFlags
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
io.out <> Pipe(in.valid, mux, latency-1)
|
||||
}
|
||||
|
||||
class FPUFMAPipe(val latency: Int, expWidth: Int, sigWidth: Int)(implicit p: Parameters) extends FPUModule()(p) {
|
||||
val io = new Bundle {
|
||||
val in = Valid(new FPInput).flip
|
||||
val out = Valid(new FPResult)
|
||||
}
|
||||
|
||||
val width = sigWidth + expWidth
|
||||
val one = UInt(1) << (width-1)
|
||||
val zero = (io.in.bits.in1(width) ^ io.in.bits.in2(width)) << width
|
||||
|
||||
val valid = Reg(next=io.in.valid)
|
||||
val in = Reg(new FPInput)
|
||||
when (io.in.valid) {
|
||||
in := io.in.bits
|
||||
val cmd_fma = io.in.bits.ren3
|
||||
val cmd_addsub = io.in.bits.swap23
|
||||
in.cmd := Cat(io.in.bits.cmd(1) & (cmd_fma || cmd_addsub), io.in.bits.cmd(0))
|
||||
when (cmd_addsub) { in.in2 := one }
|
||||
unless (cmd_fma || cmd_addsub) { in.in3 := zero }
|
||||
}
|
||||
|
||||
val fma = Module(new hardfloat.MulAddRecFN(expWidth, sigWidth))
|
||||
fma.io.op := in.cmd
|
||||
fma.io.roundingMode := in.rm
|
||||
fma.io.a := in.in1
|
||||
fma.io.b := in.in2
|
||||
fma.io.c := in.in3
|
||||
|
||||
val res = Wire(new FPResult)
|
||||
res.data := Cat(UInt((BigInt(1) << (fLen - (expWidth + sigWidth))) - 1), fma.io.out)
|
||||
res.exc := fma.io.exceptionFlags
|
||||
io.out := Pipe(valid, res, latency-1)
|
||||
}
|
||||
|
||||
class FPU(cfg: FPUConfig)(implicit p: Parameters) extends FPUModule()(p) {
|
||||
val io = new FPUIO
|
||||
|
||||
val ex_reg_valid = Reg(next=io.valid, init=Bool(false))
|
||||
val req_valid = ex_reg_valid || io.cp_req.valid
|
||||
val ex_reg_inst = RegEnable(io.inst, io.valid)
|
||||
val ex_cp_valid = io.cp_req.fire()
|
||||
val mem_reg_valid = Reg(next=ex_reg_valid && !io.killx || ex_cp_valid, init=Bool(false))
|
||||
val mem_reg_inst = RegEnable(ex_reg_inst, ex_reg_valid)
|
||||
val mem_cp_valid = Reg(next=ex_cp_valid, init=Bool(false))
|
||||
val killm = (io.killm || io.nack_mem) && !mem_cp_valid
|
||||
val wb_reg_valid = Reg(next=mem_reg_valid && (!killm || mem_cp_valid), init=Bool(false))
|
||||
val wb_cp_valid = Reg(next=mem_cp_valid, init=Bool(false))
|
||||
|
||||
val fp_decoder = Module(new FPUDecoder)
|
||||
fp_decoder.io.inst := io.inst
|
||||
|
||||
val cp_ctrl = Wire(new FPUCtrlSigs)
|
||||
cp_ctrl <> io.cp_req.bits
|
||||
io.cp_resp.valid := Bool(false)
|
||||
io.cp_resp.bits.data := UInt(0)
|
||||
|
||||
val id_ctrl = fp_decoder.io.sigs
|
||||
val ex_ctrl = Mux(ex_cp_valid, cp_ctrl, RegEnable(id_ctrl, io.valid))
|
||||
val mem_ctrl = RegEnable(ex_ctrl, req_valid)
|
||||
val wb_ctrl = RegEnable(mem_ctrl, mem_reg_valid)
|
||||
|
||||
// load response
|
||||
val load_wb = Reg(next=io.dmem_resp_val)
|
||||
val load_wb_single = RegEnable(!io.dmem_resp_type(0), io.dmem_resp_val)
|
||||
val load_wb_data = RegEnable(io.dmem_resp_data, io.dmem_resp_val)
|
||||
val load_wb_tag = RegEnable(io.dmem_resp_tag, io.dmem_resp_val)
|
||||
val rec_s = hardfloat.recFNFromFN(sExpWidth, sSigWidth, load_wb_data)
|
||||
val load_wb_data_recoded = fLen match {
|
||||
case 32 => rec_s
|
||||
case 64 =>
|
||||
val rec_d = hardfloat.recFNFromFN(dExpWidth, dSigWidth, load_wb_data)
|
||||
Mux(load_wb_single, Cat(UInt((BigInt(1) << (fLen - 32)) - 1), rec_s), rec_d)
|
||||
}
|
||||
|
||||
// regfile
|
||||
val regfile = Mem(32, Bits(width = fLen+1))
|
||||
when (load_wb) {
|
||||
regfile(load_wb_tag) := load_wb_data_recoded
|
||||
if (enableCommitLog)
|
||||
printf("f%d p%d 0x%x\n", load_wb_tag, load_wb_tag + 32, Mux(load_wb_single, load_wb_data(31,0), load_wb_data))
|
||||
}
|
||||
|
||||
val ex_ra1::ex_ra2::ex_ra3::Nil = List.fill(3)(Reg(UInt()))
|
||||
when (io.valid) {
|
||||
when (id_ctrl.ren1) {
|
||||
when (!id_ctrl.swap12) { ex_ra1 := io.inst(19,15) }
|
||||
when (id_ctrl.swap12) { ex_ra2 := io.inst(19,15) }
|
||||
}
|
||||
when (id_ctrl.ren2) {
|
||||
when (id_ctrl.swap12) { ex_ra1 := io.inst(24,20) }
|
||||
when (id_ctrl.swap23) { ex_ra3 := io.inst(24,20) }
|
||||
when (!id_ctrl.swap12 && !id_ctrl.swap23) { ex_ra2 := io.inst(24,20) }
|
||||
}
|
||||
when (id_ctrl.ren3) { ex_ra3 := io.inst(31,27) }
|
||||
}
|
||||
val ex_rm = Mux(ex_reg_inst(14,12) === Bits(7), io.fcsr_rm, ex_reg_inst(14,12))
|
||||
|
||||
val req = Wire(new FPInput)
|
||||
req := ex_ctrl
|
||||
req.rm := ex_rm
|
||||
req.in1 := regfile(ex_ra1)
|
||||
req.in2 := regfile(ex_ra2)
|
||||
req.in3 := regfile(ex_ra3)
|
||||
req.typ := ex_reg_inst(21,20)
|
||||
when (ex_cp_valid) {
|
||||
req := io.cp_req.bits
|
||||
when (io.cp_req.bits.swap23) {
|
||||
req.in2 := io.cp_req.bits.in3
|
||||
req.in3 := io.cp_req.bits.in2
|
||||
}
|
||||
}
|
||||
|
||||
val sfma = Module(new FPUFMAPipe(cfg.sfmaLatency, sExpWidth, sSigWidth))
|
||||
sfma.io.in.valid := req_valid && ex_ctrl.fma && ex_ctrl.single
|
||||
sfma.io.in.bits := req
|
||||
|
||||
val fpiu = Module(new FPToInt)
|
||||
fpiu.io.in.valid := req_valid && (ex_ctrl.toint || ex_ctrl.div || ex_ctrl.sqrt || ex_ctrl.cmd === FCMD_MINMAX)
|
||||
fpiu.io.in.bits := req
|
||||
io.store_data := fpiu.io.out.bits.store
|
||||
io.toint_data := fpiu.io.out.bits.toint
|
||||
when(fpiu.io.out.valid && mem_cp_valid && mem_ctrl.toint){
|
||||
io.cp_resp.bits.data := fpiu.io.out.bits.toint
|
||||
io.cp_resp.valid := Bool(true)
|
||||
}
|
||||
|
||||
val ifpu = Module(new IntToFP(2))
|
||||
ifpu.io.in.valid := req_valid && ex_ctrl.fromint
|
||||
ifpu.io.in.bits := req
|
||||
ifpu.io.in.bits.in1 := Mux(ex_cp_valid, io.cp_req.bits.in1, io.fromint_data)
|
||||
|
||||
val fpmu = Module(new FPToFP(2))
|
||||
fpmu.io.in.valid := req_valid && ex_ctrl.fastpipe
|
||||
fpmu.io.in.bits := req
|
||||
fpmu.io.lt := fpiu.io.out.bits.lt
|
||||
|
||||
val divSqrt_wen = Reg(next=Bool(false))
|
||||
val divSqrt_inReady = Wire(init=Bool(false))
|
||||
val divSqrt_waddr = Reg(UInt(width = 5))
|
||||
val divSqrt_wdata = Wire(UInt(width = fLen+1))
|
||||
val divSqrt_flags = Wire(UInt(width = 5))
|
||||
val divSqrt_in_flight = Reg(init=Bool(false))
|
||||
val divSqrt_killed = Reg(Bool())
|
||||
|
||||
// writeback arbitration
|
||||
case class Pipe(p: Module, lat: Int, cond: (FPUCtrlSigs) => Bool, res: FPResult)
|
||||
val pipes = List(
|
||||
Pipe(fpmu, fpmu.latency, (c: FPUCtrlSigs) => c.fastpipe, fpmu.io.out.bits),
|
||||
Pipe(ifpu, ifpu.latency, (c: FPUCtrlSigs) => c.fromint, ifpu.io.out.bits),
|
||||
Pipe(sfma, sfma.latency, (c: FPUCtrlSigs) => c.fma && c.single, sfma.io.out.bits)) ++
|
||||
(fLen > 32).option({
|
||||
val dfma = Module(new FPUFMAPipe(cfg.dfmaLatency, dExpWidth, dSigWidth))
|
||||
dfma.io.in.valid := req_valid && ex_ctrl.fma && !ex_ctrl.single
|
||||
dfma.io.in.bits := req
|
||||
Pipe(dfma, dfma.latency, (c: FPUCtrlSigs) => c.fma && !c.single, dfma.io.out.bits)
|
||||
})
|
||||
def latencyMask(c: FPUCtrlSigs, offset: Int) = {
|
||||
require(pipes.forall(_.lat >= offset))
|
||||
pipes.map(p => Mux(p.cond(c), UInt(1 << p.lat-offset), UInt(0))).reduce(_|_)
|
||||
}
|
||||
def pipeid(c: FPUCtrlSigs) = pipes.zipWithIndex.map(p => Mux(p._1.cond(c), UInt(p._2), UInt(0))).reduce(_|_)
|
||||
val maxLatency = pipes.map(_.lat).max
|
||||
val memLatencyMask = latencyMask(mem_ctrl, 2)
|
||||
|
||||
class WBInfo extends Bundle {
|
||||
val rd = UInt(width = 5)
|
||||
val single = Bool()
|
||||
val cp = Bool()
|
||||
val pipeid = UInt(width = log2Ceil(pipes.size))
|
||||
override def cloneType: this.type = new WBInfo().asInstanceOf[this.type]
|
||||
}
|
||||
|
||||
val wen = Reg(init=Bits(0, maxLatency-1))
|
||||
val wbInfo = Reg(Vec(maxLatency-1, new WBInfo))
|
||||
val mem_wen = mem_reg_valid && (mem_ctrl.fma || mem_ctrl.fastpipe || mem_ctrl.fromint)
|
||||
val write_port_busy = RegEnable(mem_wen && (memLatencyMask & latencyMask(ex_ctrl, 1)).orR || (wen & latencyMask(ex_ctrl, 0)).orR, req_valid)
|
||||
|
||||
for (i <- 0 until maxLatency-2) {
|
||||
when (wen(i+1)) { wbInfo(i) := wbInfo(i+1) }
|
||||
}
|
||||
wen := wen >> 1
|
||||
when (mem_wen) {
|
||||
when (!killm) {
|
||||
wen := wen >> 1 | memLatencyMask
|
||||
}
|
||||
for (i <- 0 until maxLatency-1) {
|
||||
when (!write_port_busy && memLatencyMask(i)) {
|
||||
wbInfo(i).cp := mem_cp_valid
|
||||
wbInfo(i).single := mem_ctrl.single
|
||||
wbInfo(i).pipeid := pipeid(mem_ctrl)
|
||||
wbInfo(i).rd := mem_reg_inst(11,7)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
val waddr = Mux(divSqrt_wen, divSqrt_waddr, wbInfo(0).rd)
|
||||
val wdata = Mux(divSqrt_wen, divSqrt_wdata, (pipes.map(_.res.data): Seq[UInt])(wbInfo(0).pipeid))
|
||||
val wexc = (pipes.map(_.res.exc): Seq[UInt])(wbInfo(0).pipeid)
|
||||
when ((!wbInfo(0).cp && wen(0)) || divSqrt_wen) {
|
||||
regfile(waddr) := wdata
|
||||
if (enableCommitLog) {
|
||||
val wdata_unrec_s = hardfloat.fNFromRecFN(sExpWidth, sSigWidth, wdata)
|
||||
val unrec = fLen match {
|
||||
case 32 => wdata_unrec_s
|
||||
case 64 =>
|
||||
val wdata_unrec_d = hardfloat.fNFromRecFN(dExpWidth, dSigWidth, wdata)
|
||||
Mux(wbInfo(0).single, wdata_unrec_s, wdata_unrec_d)
|
||||
}
|
||||
printf("f%d p%d 0x%x\n", waddr, waddr + 32, unrec)
|
||||
}
|
||||
}
|
||||
when (wbInfo(0).cp && wen(0)) {
|
||||
io.cp_resp.bits.data := wdata
|
||||
io.cp_resp.valid := Bool(true)
|
||||
}
|
||||
io.cp_req.ready := !ex_reg_valid
|
||||
|
||||
val wb_toint_valid = wb_reg_valid && wb_ctrl.toint
|
||||
val wb_toint_exc = RegEnable(fpiu.io.out.bits.exc, mem_ctrl.toint)
|
||||
io.fcsr_flags.valid := wb_toint_valid || divSqrt_wen || wen(0)
|
||||
io.fcsr_flags.bits :=
|
||||
Mux(wb_toint_valid, wb_toint_exc, UInt(0)) |
|
||||
Mux(divSqrt_wen, divSqrt_flags, UInt(0)) |
|
||||
Mux(wen(0), wexc, UInt(0))
|
||||
|
||||
val units_busy = mem_reg_valid && (mem_ctrl.div || mem_ctrl.sqrt) && (!divSqrt_inReady || wen.orR)
|
||||
io.fcsr_rdy := !(ex_reg_valid && ex_ctrl.wflags || mem_reg_valid && mem_ctrl.wflags || wb_reg_valid && wb_ctrl.toint || wen.orR || divSqrt_in_flight)
|
||||
io.nack_mem := units_busy || write_port_busy || divSqrt_in_flight
|
||||
io.dec <> fp_decoder.io.sigs
|
||||
def useScoreboard(f: ((Pipe, Int)) => Bool) = pipes.zipWithIndex.filter(_._1.lat > 3).map(x => f(x)).fold(Bool(false))(_||_)
|
||||
io.sboard_set := wb_reg_valid && !wb_cp_valid && Reg(next=useScoreboard(_._1.cond(mem_ctrl)) || mem_ctrl.div || mem_ctrl.sqrt)
|
||||
io.sboard_clr := !wb_cp_valid && (divSqrt_wen || (wen(0) && useScoreboard(x => wbInfo(0).pipeid === UInt(x._2))))
|
||||
io.sboard_clra := waddr
|
||||
// we don't currently support round-max-magnitude (rm=4)
|
||||
io.illegal_rm := ex_rm(2) && ex_ctrl.round
|
||||
|
||||
divSqrt_wdata := 0
|
||||
divSqrt_flags := 0
|
||||
if (cfg.divSqrt) {
|
||||
require(fLen == 64)
|
||||
val divSqrt_single = Reg(Bool())
|
||||
val divSqrt_rm = Reg(Bits())
|
||||
val divSqrt_flags_double = Reg(Bits())
|
||||
val divSqrt_wdata_double = Reg(Bits())
|
||||
|
||||
val divSqrt = Module(new hardfloat.DivSqrtRecF64)
|
||||
divSqrt_inReady := Mux(divSqrt.io.sqrtOp, divSqrt.io.inReady_sqrt, divSqrt.io.inReady_div)
|
||||
val divSqrt_outValid = divSqrt.io.outValid_div || divSqrt.io.outValid_sqrt
|
||||
divSqrt.io.inValid := mem_reg_valid && (mem_ctrl.div || mem_ctrl.sqrt) && !divSqrt_in_flight
|
||||
divSqrt.io.sqrtOp := mem_ctrl.sqrt
|
||||
divSqrt.io.a := fpiu.io.as_double.in1
|
||||
divSqrt.io.b := fpiu.io.as_double.in2
|
||||
divSqrt.io.roundingMode := fpiu.io.as_double.rm
|
||||
|
||||
when (divSqrt.io.inValid && divSqrt_inReady) {
|
||||
divSqrt_in_flight := true
|
||||
divSqrt_killed := killm
|
||||
divSqrt_single := mem_ctrl.single
|
||||
divSqrt_waddr := mem_reg_inst(11,7)
|
||||
divSqrt_rm := divSqrt.io.roundingMode
|
||||
}
|
||||
|
||||
when (divSqrt_outValid) {
|
||||
divSqrt_wen := !divSqrt_killed
|
||||
divSqrt_wdata_double := divSqrt.io.out
|
||||
divSqrt_in_flight := false
|
||||
divSqrt_flags_double := divSqrt.io.exceptionFlags
|
||||
}
|
||||
|
||||
val divSqrt_toSingle = Module(new hardfloat.RecFNToRecFN(11, 53, 8, 24))
|
||||
divSqrt_toSingle.io.in := divSqrt_wdata_double
|
||||
divSqrt_toSingle.io.roundingMode := divSqrt_rm
|
||||
divSqrt_wdata := Mux(divSqrt_single, divSqrt_toSingle.io.out, divSqrt_wdata_double)
|
||||
divSqrt_flags := divSqrt_flags_double | Mux(divSqrt_single, divSqrt_toSingle.io.exceptionFlags, Bits(0))
|
||||
} else {
|
||||
when (ex_ctrl.div || ex_ctrl.sqrt) { io.illegal_rm := true }
|
||||
}
|
||||
}
|
||||
|
||||
/** Mix-ins for constructing tiles that may have an FPU external to the core pipeline */
|
||||
trait CanHaveSharedFPU {
|
||||
implicit val p: Parameters
|
||||
}
|
||||
|
||||
trait CanHaveSharedFPUModule {
|
||||
val outer: CanHaveSharedFPU
|
||||
val fpuOpt = outer.p(FPUKey).map(cfg => Module(new FPU(cfg)(outer.p)))
|
||||
// TODO fpArb could go here instead of inside LegacyRoccComplex
|
||||
}
|
@@ -4,13 +4,13 @@
package rocket

import Chisel._
import Chisel.ImplicitConversions._
import config._
import coreplex._
import diplomacy._
import uncore.tilelink2._
import uncore.util.CacheName
import tile._
import util._
import Chisel.ImplicitConversions._

class FrontendReq(implicit p: Parameters) extends CoreBundle()(p) {
val pc = UInt(width = vaddrBitsExtended)
@@ -54,12 +54,12 @@ class FrontendBundle(outer: Frontend) extends CoreBundle()(outer.p) {

class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
with HasCoreParameters
with HasL1CacheParameters {
with HasL1ICacheParameters {
val io = new FrontendBundle(outer)
implicit val edge = outer.node.edgesOut(0)
val icache = outer.icache.module

val tlb = Module(new TLB)
val tlb = Module(new TLB(nTLBEntries))

val s1_pc_ = Reg(UInt(width=vaddrBitsExtended))
val s1_pc = ~(~s1_pc_ | (coreInstBytes-1)) // discard PC LSBS (this propagates down the pipeline)
@@ -106,7 +106,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
s2_valid := Bool(false)
}

if (p(BtbKey).nEntries > 0) {
if (usingBTB) {
val btb = Module(new BTB)
btb.io.req.valid := false
btb.io.req.bits.addr := s1_pc_
@@ -153,21 +153,18 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
}

/** Mix-ins for constructing tiles that have an ICache-based pipeline frontend */
trait HasICacheFrontend extends CanHavePTW with TileNetwork {
trait HasICacheFrontend extends CanHavePTW with HasTileLinkMasterPort {
val module: HasICacheFrontendModule
val frontend = LazyModule(new Frontend()(p.alterPartial({
case CacheName => CacheName("L1I")
})))
l1backend.node := frontend.node
val frontend = LazyModule(new Frontend)
masterNode := frontend.node
nPTWPorts += 1
}

trait HasICacheFrontendBundle extends TileNetworkBundle {
trait HasICacheFrontendBundle extends HasTileLinkMasterPortBundle {
val outer: HasICacheFrontend
}

trait HasICacheFrontendModule extends CanHavePTWModule with TileNetworkModule {
trait HasICacheFrontendModule extends CanHavePTWModule with HasTileLinkMasterPortModule {
val outer: HasICacheFrontend
//val io: HasICacheFrontendBundle
ptwPorts += outer.frontend.module.io.ptw
}
@ -7,43 +7,58 @@ import Chisel._
|
||||
import config.{Parameters, Field}
|
||||
import coreplex._
|
||||
import diplomacy._
|
||||
import tile._
|
||||
import uncore.constants._
|
||||
import uncore.tilelink2._
|
||||
import uncore.util._
|
||||
import util.ParameterizedBundle
|
||||
import uncore.util.Code
|
||||
import util.{ParameterizedBundle, RandomReplacement}
|
||||
import scala.collection.mutable.ListBuffer
|
||||
|
||||
case class DCacheConfig(
|
||||
nMSHRs: Int = 1,
|
||||
nSDQ: Int = 17,
|
||||
nRPQ: Int = 16,
|
||||
nMMIOs: Int = 1)
|
||||
case class DCacheParams(
|
||||
nSets: Int = 64,
|
||||
nWays: Int = 4,
|
||||
rowBits: Int = 64,
|
||||
nTLBEntries: Int = 8,
|
||||
splitMetadata: Boolean = false,
|
||||
ecc: Option[Code] = None,
|
||||
nMSHRs: Int = 1,
|
||||
nSDQ: Int = 17,
|
||||
nRPQ: Int = 16,
|
||||
nMMIOs: Int = 1) extends L1CacheParams {
|
||||
def replacement = new RandomReplacement(nWays)
|
||||
}
|
||||
|
||||
case object DCacheKey extends Field[DCacheConfig]
|
||||
trait HasL1HellaCacheParameters extends HasL1CacheParameters
|
||||
with HasCoreParameters {
|
||||
val cacheParams = tileParams.dcache.get
|
||||
val cfg = cacheParams
|
||||
|
||||
trait HasL1HellaCacheParameters extends HasL1CacheParameters {
|
||||
val wordBits = xLen // really, xLen max
|
||||
val wordBytes = wordBits/8
|
||||
val wordOffBits = log2Up(wordBytes)
|
||||
val beatBytes = cacheBlockBytes / cacheDataBeats
|
||||
val beatWords = beatBytes / wordBytes
|
||||
val beatOffBits = log2Up(beatBytes)
|
||||
val idxMSB = untagBits-1
|
||||
val idxLSB = blockOffBits
|
||||
val offsetmsb = idxLSB-1
|
||||
val offsetlsb = wordOffBits
|
||||
val rowWords = rowBits/wordBits
|
||||
val doNarrowRead = coreDataBits * nWays % rowBits == 0
|
||||
val encDataBits = code.width(coreDataBits)
|
||||
val encRowBits = encDataBits*rowWords
|
||||
val nIOMSHRs = 1
|
||||
val lrscCycles = 32 // ISA requires 16-insn LRSC sequences to succeed
|
||||
def wordBits = xLen // really, xLen max
|
||||
def wordBytes = wordBits/8
|
||||
def wordOffBits = log2Up(wordBytes)
|
||||
def beatBytes = cacheBlockBytes / cacheDataBeats
|
||||
def beatWords = beatBytes / wordBytes
|
||||
def beatOffBits = log2Up(beatBytes)
|
||||
def idxMSB = untagBits-1
|
||||
def idxLSB = blockOffBits
|
||||
def offsetmsb = idxLSB-1
|
||||
def offsetlsb = wordOffBits
|
||||
def rowWords = rowBits/wordBits
|
||||
def doNarrowRead = coreDataBits * nWays % rowBits == 0
|
||||
def encDataBits = code.width(coreDataBits)
|
||||
def encRowBits = encDataBits*rowWords
|
||||
def lrscCycles = 32 // ISA requires 16-insn LRSC sequences to succeed
|
||||
def nIOMSHRs = cacheParams.nMMIOs
|
||||
def maxUncachedInFlight = cacheParams.nMMIOs
|
||||
def dataScratchpadSize = tileParams.dataScratchpadBytes
|
||||
|
||||
require(isPow2(nSets))
|
||||
require(rowBits >= coreDataBits)
|
||||
require(rowBits == cacheDataBits) // TODO should rowBits even be seperably specifiable?
|
||||
require(xLen <= cacheDataBits) // would need offset addr for puts if data width < xlen
|
||||
require(!usingVM || untagBits <= pgIdxBits)
|
||||
require(isPow2(nSets), s"nSets($nSets) must be pow2")
|
||||
require(rowBits >= coreDataBits, s"rowBits($rowBits) < coreDataBits($coreDataBits)")
|
||||
// TODO should rowBits even be seperably specifiable?
|
||||
require(rowBits == cacheDataBits, s"rowBits($rowBits) != cacheDataBits($cacheDataBits)")
|
||||
// would need offset addr for puts if data width < xlen
|
||||
require(xLen <= cacheDataBits, s"xLen($xLen) > cacheDataBits($cacheDataBits)")
|
||||
require(!usingVM || untagBits <= pgIdxBits, s"untagBits($untagBits) > pgIdxBits($pgIdxBits)")
|
||||
}
|
||||
|
||||
abstract class L1HellaCacheModule(implicit val p: Parameters) extends Module
|
||||
@ -52,25 +67,7 @@ abstract class L1HellaCacheModule(implicit val p: Parameters) extends Module
|
||||
abstract class L1HellaCacheBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
|
||||
with HasL1HellaCacheParameters
|
||||
|
||||
class L1Metadata(implicit p: Parameters) extends Metadata()(p) with HasL1HellaCacheParameters {
|
||||
val coh = new ClientMetadata
|
||||
}
|
||||
object L1Metadata {
|
||||
def apply(tag: Bits, coh: ClientMetadata)(implicit p: Parameters) = {
|
||||
val meta = Wire(new L1Metadata)
|
||||
meta.tag := tag
|
||||
meta.coh := coh
|
||||
meta
|
||||
}
|
||||
}
|
||||
|
||||
class L1MetaReadReq(implicit p: Parameters) extends MetaReadReq {
|
||||
val tag = Bits(width = tagBits)
|
||||
override def cloneType = new L1MetaReadReq()(p).asInstanceOf[this.type] //TODO remove
|
||||
}
|
||||
|
||||
class L1MetaWriteReq(implicit p: Parameters) extends
|
||||
MetaWriteReq[L1Metadata](new L1Metadata)
|
||||
/** Bundle definitions for HellaCache interfaces */
|
||||
|
||||
trait HasCoreMemOp extends HasCoreParameters {
|
||||
val addr = UInt(width = coreMaxAddrBits)
|
||||
@ -83,6 +80,10 @@ trait HasCoreData extends HasCoreParameters {
|
||||
val data = Bits(width = coreDataBits)
|
||||
}
|
||||
|
||||
class HellaCacheReqInternal(implicit p: Parameters) extends CoreBundle()(p) with HasCoreMemOp {
|
||||
val phys = Bool()
|
||||
}
|
||||
|
||||
class HellaCacheReq(implicit p: Parameters) extends HellaCacheReqInternal()(p) with HasCoreData
|
||||
|
||||
class HellaCacheResp(implicit p: Parameters) extends CoreBundle()(p)
|
||||
@ -104,7 +105,6 @@ class HellaCacheExceptions extends Bundle {
|
||||
val pf = new AlignmentExceptions
|
||||
}
|
||||
|
||||
|
||||
// interface between D$ and processor/DTLB
|
||||
class HellaCacheIO(implicit p: Parameters) extends CoreBundle()(p) {
|
||||
val req = Decoupled(new HellaCacheReq)
|
||||
@ -119,10 +119,13 @@ class HellaCacheIO(implicit p: Parameters) extends CoreBundle()(p) {
|
||||
val ordered = Bool(INPUT)
|
||||
}
|
||||
|
||||
abstract class HellaCache(val cfg: DCacheConfig)(implicit p: Parameters) extends LazyModule {
|
||||
/** Base classes for Diplomatic TL2 HellaCaches */
|
||||
|
||||
abstract class HellaCache(implicit p: Parameters) extends LazyModule {
|
||||
private val cfg = p(TileKey).dcache.get
|
||||
val node = TLClientNode(TLClientParameters(
|
||||
sourceId = IdRange(0, cfg.nMSHRs + cfg.nMMIOs),
|
||||
supportsProbe = TransferSizes(p(CacheBlockBytes))))
|
||||
supportsProbe = TransferSizes(1, p(CacheBlockBytes))))
|
||||
val module: HellaCacheModule
|
||||
}
|
||||
|
||||
@ -135,38 +138,101 @@ class HellaCacheBundle(outer: HellaCache) extends Bundle {
|
||||
|
||||
class HellaCacheModule(outer: HellaCache) extends LazyModuleImp(outer)
|
||||
with HasL1HellaCacheParameters {
|
||||
implicit val cfg = outer.cfg
|
||||
implicit val edge = outer.node.edgesOut(0)
|
||||
val io = new HellaCacheBundle(outer)
|
||||
val tl_out = io.mem(0)
|
||||
}
|
||||
|
||||
object HellaCache {
|
||||
def apply(cfg: DCacheConfig, scratch: () => Option[AddressSet] = () => None)(implicit p: Parameters) = {
|
||||
if (cfg.nMSHRs == 0) LazyModule(new DCache(cfg, scratch))
|
||||
else LazyModule(new NonBlockingDCache(cfg))
|
||||
def apply(blocking: Boolean, scratch: () => Option[AddressSet] = () => None)(implicit p: Parameters) = {
|
||||
if (blocking) LazyModule(new DCache(scratch))
|
||||
else LazyModule(new NonBlockingDCache)
|
||||
}
|
||||
}
|
||||
|
||||
/** Mix-ins for constructing tiles that have a HellaCache */
|
||||
trait HasHellaCache extends TileNetwork {
|
||||
|
||||
trait HasHellaCache extends HasTileLinkMasterPort {
|
||||
val module: HasHellaCacheModule
|
||||
implicit val p: Parameters
|
||||
def findScratchpadFromICache: Option[AddressSet]
|
||||
var nDCachePorts = 0
|
||||
val dcacheParams = p.alterPartial({ case CacheName => CacheName("L1D") })
|
||||
val dcache = HellaCache(p(DCacheKey), findScratchpadFromICache _)(dcacheParams)
|
||||
l1backend.node := dcache.node
|
||||
val dcache = HellaCache(tileParams.dcache.get.nMSHRs == 0, findScratchpadFromICache _)
|
||||
masterNode := dcache.node
|
||||
}
|
||||
|
||||
trait HasHellaCacheBundle extends TileNetworkBundle {
|
||||
trait HasHellaCacheBundle extends HasTileLinkMasterPortBundle {
|
||||
val outer: HasHellaCache
|
||||
}
|
||||
|
||||
trait HasHellaCacheModule extends TileNetworkModule {
|
||||
trait HasHellaCacheModule extends HasTileLinkMasterPortModule {
|
||||
val outer: HasHellaCache
|
||||
//val io: HasHellaCacheBundle
|
||||
val dcachePorts = ListBuffer[HellaCacheIO]()
|
||||
val dcacheArb = Module(new HellaCacheArbiter(outer.nDCachePorts)(outer.dcacheParams))
|
||||
val dcacheArb = Module(new HellaCacheArbiter(outer.nDCachePorts)(outer.p))
|
||||
outer.dcache.module.io.cpu <> dcacheArb.io.mem
|
||||
}
|
||||
|
||||
/** Metadata array used for all HellaCaches */
|
||||
|
||||
class L1Metadata(implicit p: Parameters) extends L1HellaCacheBundle()(p) {
|
||||
val coh = new ClientMetadata
|
||||
val tag = UInt(width = tagBits)
|
||||
}
|
||||
|
||||
object L1Metadata {
|
||||
def apply(tag: Bits, coh: ClientMetadata)(implicit p: Parameters) = {
|
||||
val meta = Wire(new L1Metadata)
|
||||
meta.tag := tag
|
||||
meta.coh := coh
|
||||
meta
|
||||
}
|
||||
}
|
||||
|
||||
class L1MetaReadReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) {
|
||||
val idx = UInt(width = idxBits)
|
||||
val way_en = UInt(width = nWays)
|
||||
val tag = UInt(width = tagBits)
|
||||
}
|
||||
|
||||
class L1MetaWriteReq(implicit p: Parameters) extends L1MetaReadReq()(p) {
|
||||
val data = new L1Metadata
|
||||
}
|
||||
|
||||
class L1MetadataArray[T <: L1Metadata](onReset: () => T)(implicit p: Parameters) extends L1HellaCacheModule()(p) {
|
||||
val rstVal = onReset()
|
||||
val io = new Bundle {
|
||||
val read = Decoupled(new L1MetaReadReq).flip
|
||||
val write = Decoupled(new L1MetaWriteReq).flip
|
||||
val resp = Vec(nWays, rstVal.cloneType).asOutput
|
||||
}
|
||||
val rst_cnt = Reg(init=UInt(0, log2Up(nSets+1)))
|
||||
val rst = rst_cnt < UInt(nSets)
|
||||
val waddr = Mux(rst, rst_cnt, io.write.bits.idx)
|
||||
val wdata = Mux(rst, rstVal, io.write.bits.data).asUInt
|
||||
val wmask = Mux(rst || Bool(nWays == 1), SInt(-1), io.write.bits.way_en.asSInt).toBools
|
||||
val rmask = Mux(rst || Bool(nWays == 1), SInt(-1), io.read.bits.way_en.asSInt).toBools
|
||||
when (rst) { rst_cnt := rst_cnt+UInt(1) }
|
||||
|
||||
val metabits = rstVal.getWidth
|
||||
|
||||
if (hasSplitMetadata) {
|
||||
val tag_arrs = List.fill(nWays){ SeqMem(nSets, UInt(width = metabits)) }
|
||||
val tag_readout = Wire(Vec(nWays,rstVal.cloneType))
|
||||
(0 until nWays).foreach { (i) =>
|
||||
when (rst || (io.write.valid && wmask(i))) {
|
||||
tag_arrs(i).write(waddr, wdata)
|
||||
}
|
||||
io.resp(i) := rstVal.fromBits(tag_arrs(i).read(io.read.bits.idx, io.read.valid && rmask(i)))
|
||||
}
|
||||
} else {
|
||||
val tag_arr = SeqMem(nSets, Vec(nWays, UInt(width = metabits)))
|
||||
when (rst || io.write.valid) {
|
||||
tag_arr.write(waddr, Vec.fill(nWays)(wdata), wmask)
|
||||
}
|
||||
io.resp := tag_arr.read(io.read.bits.idx, io.read.valid).map(rstVal.fromBits(_))
|
||||
}
|
||||
|
||||
io.read.ready := !rst && !io.write.valid // so really this could be a 6T RAM
|
||||
io.write.ready := !rst
|
||||
}
|
||||
|
@@ -3,9 +3,10 @@
package rocket

import Chisel._
import util._
import Chisel.ImplicitConversions._
import config._
import tile._
import util._

class Instruction(implicit val p: Parameters) extends ParameterizedBundle with HasCoreParameters {
val pf0 = Bool() // page fault on first half of instruction
@ -6,25 +6,32 @@ package rocket
|
||||
import Chisel._
|
||||
import config._
|
||||
import diplomacy._
|
||||
import uncore.agents._
|
||||
import tile._
|
||||
import uncore.tilelink2._
|
||||
import uncore.util._
|
||||
import uncore.util.Code
|
||||
import util._
|
||||
import Chisel.ImplicitConversions._
|
||||
|
||||
trait HasL1CacheParameters extends HasCacheParameters with HasCoreParameters {
|
||||
val cacheBlockBytes = p(CacheBlockBytes)
|
||||
val lgCacheBlockBytes = log2Up(cacheBlockBytes)
|
||||
val cacheDataBits = p(SharedMemoryTLEdge).bundle.dataBits
|
||||
val cacheDataBeats = (cacheBlockBytes * 8) / cacheDataBits
|
||||
val refillCycles = cacheDataBeats
|
||||
case class ICacheParams(
|
||||
nSets: Int = 64,
|
||||
nWays: Int = 4,
|
||||
rowBits: Int = 128,
|
||||
nTLBEntries: Int = 8,
|
||||
cacheIdBits: Int = 0,
|
||||
splitMetadata: Boolean = false,
|
||||
ecc: Option[Code] = None) extends L1CacheParams {
|
||||
def replacement = new RandomReplacement(nWays)
|
||||
}
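// Illustration only, not part of this commit: with the defaults above, a smaller
// per-tile instruction cache for a hypothetical "little" tile could be written as
//   val littleICache = ICacheParams(nSets = 32, nWays = 2, nTLBEntries = 4)
// leaving rowBits, cacheIdBits, splitMetadata and ecc at their defaults.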
class ICacheReq(implicit p: Parameters) extends CoreBundle()(p) with HasL1CacheParameters {
trait HasL1ICacheParameters extends HasL1CacheParameters with HasCoreParameters {
val cacheParams = tileParams.icache.get
}

class ICacheReq(implicit p: Parameters) extends CoreBundle()(p) with HasL1ICacheParameters {
val addr = UInt(width = vaddrBits)
}

class ICacheResp(implicit p: Parameters) extends CoreBundle()(p) with HasL1CacheParameters {
class ICacheResp(implicit p: Parameters) extends CoreBundle()(p) with HasL1ICacheParameters {
val data = Bits(width = coreInstBits)
val datablock = Bits(width = rowBits)
}
@ -46,8 +53,7 @@ class ICacheBundle(outer: ICache) extends CoreBundle()(outer.p) {
}

class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
with HasCoreParameters
with HasL1CacheParameters {
with HasL1ICacheParameters {
val io = new ICacheBundle(outer)
val edge = outer.node.edgesOut(0)
val tl_out = io.mem(0)

@ -4,12 +4,13 @@
package rocket

import Chisel._
import Chisel.ImplicitConversions._
import Instructions._
import uncore.constants.MemoryOpConstants._
import ALU._
import config._
import tile.HasCoreParameters
import util._
import Chisel.ImplicitConversions._

abstract trait DecodeConstants extends HasCoreParameters
{
@ -29,13 +29,13 @@ class MultiplierIO(dataBits: Int, tagBits: Int) extends Bundle {
val resp = Decoupled(new MultiplierResp(dataBits, tagBits))
}

case class MulDivConfig(
case class MulDivParams(
mulUnroll: Int = 1,
mulEarlyOut: Boolean = false,
divEarlyOut: Boolean = false
)
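// Illustration only, not part of this commit: MulDivParams lets each core tune its
// multiplier/divider independently. A big core might unroll the multiplier and
// enable early-out, while a little core keeps the iterative default:
//   val bigMulDiv    = MulDivParams(mulUnroll = 8, mulEarlyOut = true, divEarlyOut = true)
//   val littleMulDiv = MulDivParams()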
class MulDiv(cfg: MulDivConfig, width: Int, nXpr: Int = 32) extends Module {
class MulDiv(cfg: MulDivParams, width: Int, nXpr: Int = 32) extends Module {
val io = new MultiplierIO(width, log2Up(nXpr))
val w = io.req.bits.in1.getWidth
val mulw = (w + cfg.mulUnroll - 1) / cfg.mulUnroll * cfg.mulUnroll
@ -5,13 +5,12 @@ package rocket

import Chisel._
import Chisel.ImplicitConversions._
import config._
import diplomacy._
import uncore.constants._
import uncore.tilelink._
import uncore.tilelink2._
import uncore.util._
import util._
import config._

trait HasMissInfo extends HasL1HellaCacheParameters {
val tag_match = Bool()
@ -19,11 +18,6 @@ trait HasMissInfo extends HasL1HellaCacheParameters {
val way_en = Bits(width = nWays)
}

class HellaCacheReqInternal(implicit p: Parameters) extends CoreBundle()(p)
with HasCoreMemOp {
val phys = Bool()
}

class L1DataReadReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) {
val way_en = Bits(width = nWays)
val addr = Bits(width = untagBits)
@ -38,18 +32,14 @@ class L1RefillReq(implicit p: Parameters) extends L1DataReadReq()(p)

class Replay(implicit p: Parameters) extends HellaCacheReqInternal()(p) with HasCoreData

class ReplayInternal(cfg: DCacheConfig)(implicit p: Parameters) extends HellaCacheReqInternal()(p) {
class ReplayInternal(implicit p: Parameters) extends HellaCacheReqInternal()(p)
with HasL1HellaCacheParameters {
val sdq_id = UInt(width = log2Up(cfg.nSDQ))

override def cloneType = new ReplayInternal(cfg)(p).asInstanceOf[this.type]
}

class MSHRReq(implicit p: Parameters) extends Replay()(p) with HasMissInfo

class MSHRReqInternal(cfg: DCacheConfig)(implicit p: Parameters)
extends ReplayInternal(cfg)(p) with HasMissInfo {
override def cloneType = new MSHRReqInternal(cfg)(p).asInstanceOf[this.type]
}
class MSHRReqInternal(implicit p: Parameters) extends ReplayInternal()(p) with HasMissInfo

class WritebackReq(params: TLBundleParameters)(implicit p: Parameters) extends L1HellaCacheBundle()(p) {
val tag = Bits(width = tagBits)
@ -145,13 +135,13 @@ class IOMSHR(id: Int)(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCa
}
}

class MSHR(id: Int)(implicit edge: TLEdgeOut, cfg: DCacheConfig, p: Parameters) extends L1HellaCacheModule()(p) {
class MSHR(id: Int)(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCacheModule()(p) {
val io = new Bundle {
val req_pri_val = Bool(INPUT)
val req_pri_rdy = Bool(OUTPUT)
val req_sec_val = Bool(INPUT)
val req_sec_rdy = Bool(OUTPUT)
val req_bits = new MSHRReqInternal(cfg).asInput
val req_bits = new MSHRReqInternal().asInput

val idx_match = Bool(OUTPUT)
val tag = Bits(OUTPUT, tagBits)
@ -163,7 +153,7 @@ class MSHR(id: Int)(implicit edge: TLEdgeOut, cfg: DCacheConfig, p: Parameters)
val refill = new L1RefillReq().asOutput // Data is bypassed
val meta_read = Decoupled(new L1MetaReadReq)
val meta_write = Decoupled(new L1MetaWriteReq)
val replay = Decoupled(new ReplayInternal(cfg))
val replay = Decoupled(new ReplayInternal)
val wb_req = Decoupled(new WritebackReq(edge.bundle))
val probe_rdy = Bool(OUTPUT)
}
@ -171,7 +161,7 @@ class MSHR(id: Int)(implicit edge: TLEdgeOut, cfg: DCacheConfig, p: Parameters)
val s_invalid :: s_wb_req :: s_wb_resp :: s_meta_clear :: s_refill_req :: s_refill_resp :: s_meta_write_req :: s_meta_write_resp :: s_drain_rpq :: Nil = Enum(UInt(), 9)
val state = Reg(init=s_invalid)

val req = Reg(new MSHRReqInternal(cfg))
val req = Reg(new MSHRReqInternal)
val req_idx = req.addr(untagBits-1,blockOffBits)
val req_tag = req.addr >> untagBits
val req_block_addr = (req.addr >> blockOffBits) << blockOffBits
@ -195,7 +185,7 @@ class MSHR(id: Int)(implicit edge: TLEdgeOut, cfg: DCacheConfig, p: Parameters)
(state.isOneOf(s_refill_req, s_refill_resp) &&
!cmd_requires_second_acquire && !refill_done))

val rpq = Module(new Queue(new ReplayInternal(cfg), cfg.nRPQ))
val rpq = Module(new Queue(new ReplayInternal, cfg.nRPQ))
rpq.io.enq.valid := (io.req_pri_val && io.req_pri_rdy || io.req_sec_val && sec_rdy) && !isPrefetch(io.req_bits.cmd)
rpq.io.enq.bits := io.req_bits
rpq.io.deq.ready := (io.replay.ready && state === s_drain_rpq) || state === s_invalid
@ -310,7 +300,7 @@ class MSHR(id: Int)(implicit edge: TLEdgeOut, cfg: DCacheConfig, p: Parameters)
}
}

class MSHRFile(implicit edge: TLEdgeOut, cfg: DCacheConfig, p: Parameters) extends L1HellaCacheModule()(p) {
class MSHRFile(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCacheModule()(p) {
val io = new Bundle {
val req = Decoupled(new MSHRReq).flip
val resp = Decoupled(new HellaCacheResp)
@ -350,7 +340,7 @@ class MSHRFile(implicit edge: TLEdgeOut, cfg: DCacheConfig, p: Parameters) exten
val meta_read_arb = Module(new Arbiter(new L1MetaReadReq, cfg.nMSHRs))
val meta_write_arb = Module(new Arbiter(new L1MetaWriteReq, cfg.nMSHRs))
val wb_req_arb = Module(new Arbiter(new WritebackReq(edge.bundle), cfg.nMSHRs))
val replay_arb = Module(new Arbiter(new ReplayInternal(cfg), cfg.nMSHRs))
val replay_arb = Module(new Arbiter(new ReplayInternal, cfg.nMSHRs))
val alloc_arb = Module(new Arbiter(Bool(), cfg.nMSHRs))

var idx_match = Bool(false)
@ -671,14 +661,14 @@ class DataArray(implicit p: Parameters) extends L1HellaCacheModule()(p) {
io.write.ready := Bool(true)
}

class NonBlockingDCache(cfg: DCacheConfig)(implicit p: Parameters) extends HellaCache(cfg)(p) {
class NonBlockingDCache(implicit p: Parameters) extends HellaCache()(p) {
override lazy val module = new NonBlockingDCacheModule(this)
}

class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule(outer) {

require(isPow2(nWays)) // TODO: relax this
require(p(DataScratchpadSize) == 0)
require(dataScratchpadSize == 0)

val wb = Module(new WritebackUnit)
val prober = Module(new ProbeUnit)
@ -706,7 +696,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule
val s1_write = isWrite(s1_req.cmd)
val s1_readwrite = s1_read || s1_write || isPrefetch(s1_req.cmd)

val dtlb = Module(new TLB)
val dtlb = Module(new TLB(nTLBEntries))
io.ptw <> dtlb.io.ptw
dtlb.io.req.valid := s1_valid_masked && s1_readwrite
dtlb.io.req.bits.passthrough := s1_req.phys
@ -754,8 +744,8 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule

// tags
def onReset = L1Metadata(UInt(0), ClientMetadata.onReset)
val meta = Module(new MetadataArray(onReset _))
val metaReadArb = Module(new Arbiter(new MetaReadReq, 5))
val meta = Module(new L1MetadataArray(onReset _))
val metaReadArb = Module(new Arbiter(new L1MetaReadReq, 5))
val metaWriteArb = Module(new Arbiter(new L1MetaWriteReq, 2))
meta.io.read <> metaReadArb.io.out
meta.io.write <> metaWriteArb.io.out
@ -852,7 +842,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule
writeArb.io.in(0).bits.way_en := s3_way

// replacement policy
val replacer = p(Replacer)()
val replacer = cacheParams.replacement
val s1_replaced_way_en = UIntToOH(replacer.way)
val s2_replaced_way_en = UIntToOH(RegEnable(replacer.way, s1_clk_en))
val s2_repl_meta = Mux1H(s2_replaced_way_en, wayMap((w: Int) => RegEnable(meta.io.resp(w), s1_clk_en && s1_replaced_way_en(w))).toSeq)

@ -4,11 +4,12 @@
package rocket

import Chisel._
import config._
import uncore.constants._
import uncore.util.PseudoLRU
import util._
import Chisel.ImplicitConversions._
import config._
import tile._
import uncore.constants._
import util._

import scala.collection.mutable.ListBuffer

class PTWReq(implicit p: Parameters) extends CoreBundle()(p) {
@ -224,11 +225,12 @@ trait CanHavePTW extends HasHellaCache {
implicit val p: Parameters
val module: CanHavePTWModule
var nPTWPorts = 1
nDCachePorts += usingPTW.toInt
}

trait CanHavePTWModule extends HasHellaCacheModule {
val outer: CanHavePTW
val ptwPorts = ListBuffer(outer.dcache.module.io.ptw)
val ptwOpt = if (outer.p(UseVM)) { Some(Module(new PTW(outer.nPTWPorts)(outer.p))) } else None
val ptwOpt = if (outer.usingPTW) { Some(Module(new PTW(outer.nPTWPorts)(outer.p))) } else None
ptwOpt foreach { ptw => dcachePorts += ptw.io.mem }
}

@ -4,8 +4,9 @@ package rocket

import Chisel._
import Chisel.ImplicitConversions._
import util._
import config._
import tile._
import util._

class ExpandedInstruction extends Bundle {
val bits = UInt(width = 32)
@ -150,14 +151,14 @@ class RVCDecoder(x: UInt)(implicit p: Parameters) {
}
}

class RVCExpander(implicit p: Parameters) extends Module {
class RVCExpander(implicit val p: Parameters) extends Module with HasCoreParameters {
val io = new Bundle {
val in = UInt(INPUT, 32)
val out = new ExpandedInstruction
val rvc = Bool(OUTPUT)
}

if (p(UseCompressed)) {
if (usingCompressed) {
io.rvc := io.in(1,0) =/= UInt(3)
io.out := new RVCDecoder(io.in).decode
} else {
@ -1,437 +0,0 @@
// See LICENSE.Berkeley for license details.
// See LICENSE.SiFive for license details.

package rocket

import Chisel._
import Chisel.ImplicitConversions._
import config._
import coreplex._
import diplomacy._
import uncore.constants._
import uncore.agents._
import uncore.coherence._
import uncore.devices._
import uncore.tilelink._
import uncore.tilelink2._
import uncore.util._
import util._

case object RoccMaxTaggedMemXacts extends Field[Int]
case object RoccNMemChannels extends Field[Int]
case object RoccNPTWPorts extends Field[Int]
case object BuildRoCC extends Field[Seq[RoccParameters]]

trait CanHaveLegacyRoccs extends CanHaveSharedFPU with CanHavePTW with TileNetwork {
val module: CanHaveLegacyRoccsModule
val legacyRocc = if (p(BuildRoCC).isEmpty) None
else Some(LazyModule(new LegacyRoccComplex()(p.alter { (site, here, up) => {
case CacheBlockOffsetBits => log2Up(site(CacheBlockBytes))
case AmoAluOperandBits => site(XLen)
case RoccNMemChannels => site(BuildRoCC).map(_.nMemChannels).foldLeft(0)(_ + _)
case RoccNPTWPorts => site(BuildRoCC).map(_.nPTWPorts).foldLeft(0)(_ + _)
case TLId => "L1toL2"
case TLKey("L1toL2") =>
TileLinkParameters(
coherencePolicy = new MESICoherence(new NullRepresentation(site(NTiles))),
nManagers = site(BankedL2Config).nBanks + 1 /* MMIO */,
nCachingClients = 1,
nCachelessClients = 1,
maxClientXacts = List(
site(DCacheKey).nMSHRs + 1 /* IOMSHR */,
if (site(BuildRoCC).isEmpty) 1 else site(RoccMaxTaggedMemXacts)).max,
maxClientsPerPort = if (site(BuildRoCC).isEmpty) 1 else 2,
maxManagerXacts = 8,
dataBeats = (8 * site(CacheBlockBytes)) / site(XLen),
dataBits = site(CacheBlockBytes)*8)
}})))

// TODO for now, all legacy rocc mem ports mapped to one external node
legacyRocc foreach { lr =>
lr.masterNodes.foreach { l1backend.node := _ }
nPTWPorts += lr.nPTWPorts
nDCachePorts += lr.nRocc
}
}

trait CanHaveLegacyRoccsModule extends CanHaveSharedFPUModule with CanHavePTWModule with TileNetworkModule {
val outer: CanHaveLegacyRoccs

fpuOpt foreach { fpu =>
outer.legacyRocc.orElse {
fpu.io.cp_req.valid := Bool(false)
fpu.io.cp_resp.ready := Bool(false)
None
} foreach { lr =>
fpu.io.cp_req <> lr.module.io.fpu.cp_req
fpu.io.cp_resp <> lr.module.io.fpu.cp_resp
}
}

outer.legacyRocc foreach { lr =>
ptwPorts ++= lr.module.io.ptw
dcachePorts ++= lr.module.io.dcache
}
}

class LegacyRoccComplex(implicit p: Parameters) extends LazyModule {
val buildRocc = p(BuildRoCC)
val usingRocc = !buildRocc.isEmpty
val nRocc = buildRocc.size
val nFPUPorts = buildRocc.filter(_.useFPU).size
val nMemChannels = buildRocc.map(_.nMemChannels).sum + nRocc
val nPTWPorts = buildRocc.map(_.nPTWPorts).sum
val roccOpcodes = buildRocc.map(_.opcodes)

val legacies = List.fill(nMemChannels) { LazyModule(new TLLegacy()(p.alterPartial({ case PAddrBits => 32 }))) }
val masterNodes = legacies.map(_ => TLOutputNode())
legacies.zip(masterNodes).foreach { case(l,m) => m := TLHintHandler()(l.node) }

lazy val module = new LazyModuleImp(this) with HasCoreParameters {
val io = new Bundle {
val tl = masterNodes.map(_.bundleOut)
val dcache = Vec(nRocc, new HellaCacheIO)
val fpu = new Bundle {
val cp_req = Decoupled(new FPInput())
val cp_resp = Decoupled(new FPResult()).flip
}
val ptw = Vec(nPTWPorts, new TLBPTWIO)
val core = new Bundle {
val cmd = Decoupled(new RoCCCommand).flip
val resp = Decoupled(new RoCCResponse)
val busy = Bool(OUTPUT)
val interrupt = Bool(OUTPUT)
val exception = Bool(INPUT)
}
}

val respArb = Module(new RRArbiter(new RoCCResponse, nRocc))
io.core.resp <> respArb.io.out

val cmdRouter = Module(new RoccCommandRouter(roccOpcodes))
cmdRouter.io.in <> io.core.cmd

val roccs = buildRocc.zipWithIndex.map { case (accelParams, i) =>
val rocc = accelParams.generator(p.alterPartial({
case RoccNMemChannels => accelParams.nMemChannels
case RoccNPTWPorts => accelParams.nPTWPorts
}))
val dcIF = Module(new SimpleHellaCacheIF()(p.alterPartial({ case CacheName => CacheName("L1D") })))
rocc.io.cmd <> cmdRouter.io.out(i)
rocc.io.exception := io.core.exception
dcIF.io.requestor <> rocc.io.mem
io.dcache(i) := dcIF.io.cache
legacies(i).module.io.legacy <> rocc.io.autl
respArb.io.in(i) <> Queue(rocc.io.resp)
rocc
}

(nRocc until legacies.size) zip roccs.map(_.io.utl) foreach { case(i, utl) =>
legacies(i).module.io.legacy <> utl
}
io.core.busy := cmdRouter.io.busy || roccs.map(_.io.busy).reduce(_ || _)
io.core.interrupt := roccs.map(_.io.interrupt).reduce(_ || _)

if (usingFPU && nFPUPorts > 0) {
val fpArb = Module(new InOrderArbiter(new FPInput, new FPResult, nFPUPorts))
val fp_rocc_ios = roccs.zip(buildRocc)
.filter { case (_, params) => params.useFPU }
.map { case (rocc, _) => rocc.io }
fpArb.io.in_req <> fp_rocc_ios.map(_.fpu_req)
fp_rocc_ios.zip(fpArb.io.in_resp).foreach {
case (rocc, arb) => rocc.fpu_resp <> arb
}
io.fpu.cp_req <> fpArb.io.out_req
fpArb.io.out_resp <> io.fpu.cp_resp
} else {
io.fpu.cp_req.valid := Bool(false)
io.fpu.cp_resp.ready := Bool(false)
}
}
}

case class RoccParameters(
opcodes: OpcodeSet,
generator: Parameters => RoCC,
nMemChannels: Int = 0,
nPTWPorts : Int = 0,
useFPU: Boolean = false)

class RoCCInstruction extends Bundle
{
val funct = Bits(width = 7)
val rs2 = Bits(width = 5)
val rs1 = Bits(width = 5)
val xd = Bool()
val xs1 = Bool()
val xs2 = Bool()
val rd = Bits(width = 5)
val opcode = Bits(width = 7)
}

class RoCCCommand(implicit p: Parameters) extends CoreBundle()(p) {
val inst = new RoCCInstruction
val rs1 = Bits(width = xLen)
val rs2 = Bits(width = xLen)
val status = new MStatus
}

class RoCCResponse(implicit p: Parameters) extends CoreBundle()(p) {
val rd = Bits(width = 5)
val data = Bits(width = xLen)
}

class RoCCCoreIO(implicit p: Parameters) extends CoreBundle()(p) {
val cmd = Decoupled(new RoCCCommand).flip
val resp = Decoupled(new RoCCResponse)
val mem = new HellaCacheIO
val busy = Bool(OUTPUT)
val interrupt = Bool(OUTPUT)
val exception = Bool(INPUT)

override def cloneType = new RoCCCoreIO()(p).asInstanceOf[this.type]
}

class RoCCIO(implicit p: Parameters) extends RoCCCoreIO()(p) {
// These should be handled differently, eventually
val autl = new ClientUncachedTileLinkIO
val utl = Vec(p(RoccNMemChannels), new ClientUncachedTileLinkIO)
val ptw = Vec(p(RoccNPTWPorts), new TLBPTWIO)
val fpu_req = Decoupled(new FPInput)
val fpu_resp = Decoupled(new FPResult).flip

override def cloneType = new RoCCIO()(p).asInstanceOf[this.type]
}

abstract class RoCC(implicit p: Parameters) extends CoreModule()(p) {
val io = new RoCCIO
io.mem.req.bits.phys := Bool(true) // don't perform address translation
io.mem.invalidate_lr := Bool(false) // don't mess with LR/SC
}

class AccumulatorExample(n: Int = 4)(implicit p: Parameters) extends RoCC()(p) {
val regfile = Mem(n, UInt(width = xLen))
val busy = Reg(init = Vec.fill(n){Bool(false)})

val cmd = Queue(io.cmd)
val funct = cmd.bits.inst.funct
val addr = cmd.bits.rs2(log2Up(n)-1,0)
val doWrite = funct === UInt(0)
val doRead = funct === UInt(1)
val doLoad = funct === UInt(2)
val doAccum = funct === UInt(3)
val memRespTag = io.mem.resp.bits.tag(log2Up(n)-1,0)

// datapath
val addend = cmd.bits.rs1
val accum = regfile(addr)
val wdata = Mux(doWrite, addend, accum + addend)

when (cmd.fire() && (doWrite || doAccum)) {
regfile(addr) := wdata
}

when (io.mem.resp.valid) {
regfile(memRespTag) := io.mem.resp.bits.data
busy(memRespTag) := Bool(false)
}

// control
when (io.mem.req.fire()) {
busy(addr) := Bool(true)
}

val doResp = cmd.bits.inst.xd
val stallReg = busy(addr)
val stallLoad = doLoad && !io.mem.req.ready
val stallResp = doResp && !io.resp.ready

cmd.ready := !stallReg && !stallLoad && !stallResp
// command resolved if no stalls AND not issuing a load that will need a request

// PROC RESPONSE INTERFACE
io.resp.valid := cmd.valid && doResp && !stallReg && !stallLoad
// valid response if valid command, need a response, and no stalls
io.resp.bits.rd := cmd.bits.inst.rd
// Must respond with the appropriate tag or undefined behavior
io.resp.bits.data := accum
// Semantics is to always send out prior accumulator register value

io.busy := cmd.valid || busy.reduce(_||_)
// Be busy when have pending memory requests or committed possibility of pending requests
io.interrupt := Bool(false)
// Set this true to trigger an interrupt on the processor (please refer to supervisor documentation)

// MEMORY REQUEST INTERFACE
io.mem.req.valid := cmd.valid && doLoad && !stallReg && !stallResp
io.mem.req.bits.addr := addend
io.mem.req.bits.tag := addr
io.mem.req.bits.cmd := M_XRD // perform a load (M_XWR for stores)
io.mem.req.bits.typ := MT_D // D = 8 bytes, W = 4, H = 2, B = 1
io.mem.req.bits.data := Bits(0) // we're not performing any stores...

io.autl.acquire.valid := false
io.autl.grant.ready := false
}
class TranslatorExample(implicit p: Parameters) extends RoCC()(p) {
val req_addr = Reg(UInt(width = coreMaxAddrBits))
val req_rd = Reg(io.resp.bits.rd)
val req_offset = req_addr(pgIdxBits - 1, 0)
val req_vpn = req_addr(coreMaxAddrBits - 1, pgIdxBits)
val pte = Reg(new PTE)

val s_idle :: s_ptw_req :: s_ptw_resp :: s_resp :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_idle)

io.cmd.ready := (state === s_idle)

when (io.cmd.fire()) {
req_rd := io.cmd.bits.inst.rd
req_addr := io.cmd.bits.rs1
state := s_ptw_req
}

private val ptw = io.ptw(0)

when (ptw.req.fire()) { state := s_ptw_resp }

when (state === s_ptw_resp && ptw.resp.valid) {
pte := ptw.resp.bits.pte
state := s_resp
}

when (io.resp.fire()) { state := s_idle }

ptw.req.valid := (state === s_ptw_req)
ptw.req.bits.addr := req_vpn
ptw.req.bits.store := Bool(false)
ptw.req.bits.fetch := Bool(false)

io.resp.valid := (state === s_resp)
io.resp.bits.rd := req_rd
io.resp.bits.data := Mux(pte.leaf(), Cat(pte.ppn, req_offset), SInt(-1, xLen).asUInt)

io.busy := (state =/= s_idle)
io.interrupt := Bool(false)
io.mem.req.valid := Bool(false)
io.autl.acquire.valid := Bool(false)
io.autl.grant.ready := Bool(false)
}

class CharacterCountExample(implicit p: Parameters) extends RoCC()(p)
with HasTileLinkParameters {

private val blockOffset = tlBeatAddrBits + tlByteAddrBits

val needle = Reg(UInt(width = 8))
val addr = Reg(UInt(width = coreMaxAddrBits))
val count = Reg(UInt(width = xLen))
val resp_rd = Reg(io.resp.bits.rd)

val addr_block = addr(coreMaxAddrBits - 1, blockOffset)
val offset = addr(blockOffset - 1, 0)
val next_addr = (addr_block + UInt(1)) << UInt(blockOffset)

val s_idle :: s_acq :: s_gnt :: s_check :: s_resp :: Nil = Enum(Bits(), 5)
val state = Reg(init = s_idle)

val gnt = io.autl.grant.bits
val recv_data = Reg(UInt(width = tlDataBits))
val recv_beat = Reg(UInt(width = tlBeatAddrBits))

val data_bytes = Vec.tabulate(tlDataBytes) { i => recv_data(8 * (i + 1) - 1, 8 * i) }
val zero_match = data_bytes.map(_ === UInt(0))
val needle_match = data_bytes.map(_ === needle)
val first_zero = PriorityEncoder(zero_match)

val chars_found = PopCount(needle_match.zipWithIndex.map {
case (matches, i) =>
val idx = Cat(recv_beat, UInt(i, tlByteAddrBits))
matches && idx >= offset && UInt(i) <= first_zero
})
val zero_found = zero_match.reduce(_ || _)
val finished = Reg(Bool())

io.cmd.ready := (state === s_idle)
io.resp.valid := (state === s_resp)
io.resp.bits.rd := resp_rd
io.resp.bits.data := count
io.autl.acquire.valid := (state === s_acq)
io.autl.acquire.bits := GetBlock(addr_block = addr_block)
io.autl.grant.ready := (state === s_gnt)

when (io.cmd.fire()) {
addr := io.cmd.bits.rs1
needle := io.cmd.bits.rs2
resp_rd := io.cmd.bits.inst.rd
count := UInt(0)
finished := Bool(false)
state := s_acq
}

when (io.autl.acquire.fire()) { state := s_gnt }

when (io.autl.grant.fire()) {
recv_beat := gnt.addr_beat
recv_data := gnt.data
state := s_check
}

when (state === s_check) {
when (!finished) {
count := count + chars_found
}
when (zero_found) { finished := Bool(true) }
when (recv_beat === UInt(tlDataBeats - 1)) {
addr := next_addr
state := Mux(zero_found || finished, s_resp, s_acq)
} .otherwise {
state := s_gnt
}
}

when (io.resp.fire()) { state := s_idle }

io.busy := (state =/= s_idle)
io.interrupt := Bool(false)
io.mem.req.valid := Bool(false)
}

class OpcodeSet(val opcodes: Seq[UInt]) {
def |(set: OpcodeSet) =
new OpcodeSet(this.opcodes ++ set.opcodes)

def matches(oc: UInt) = opcodes.map(_ === oc).reduce(_ || _)
}

object OpcodeSet {
def custom0 = new OpcodeSet(Seq(Bits("b0001011")))
def custom1 = new OpcodeSet(Seq(Bits("b0101011")))
def custom2 = new OpcodeSet(Seq(Bits("b1011011")))
def custom3 = new OpcodeSet(Seq(Bits("b1111011")))
def all = custom0 | custom1 | custom2 | custom3
}
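// Illustration only (this legacy RoCC file is removed by the commit): opcode sets can
// be unioned to claim several custom opcode spaces for one accelerator, e.g.
//   val myOpcodes = OpcodeSet.custom0 | OpcodeSet.custom1
// The RoccCommandRouter below asserts that at most one accelerator matches a command.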
class RoccCommandRouter(opcodes: Seq[OpcodeSet])(implicit p: Parameters)
extends CoreModule()(p) {
val io = new Bundle {
val in = Decoupled(new RoCCCommand).flip
val out = Vec(opcodes.size, Decoupled(new RoCCCommand))
val busy = Bool(OUTPUT)
}

val cmd = Queue(io.in)
val cmdReadys = io.out.zip(opcodes).map { case (out, opcode) =>
val me = opcode.matches(cmd.bits.inst.opcode)
out.valid := cmd.valid && me
out.bits := cmd.bits
out.ready && me
}
cmd.ready := cmdReadys.reduce(_ || _)
io.busy := cmd.valid

assert(PopCount(cmdReadys) <= UInt(1),
"Custom opcode matched for more than one accelerator")
}
@ -5,76 +5,57 @@ package rocket

import Chisel._
import config._
import tile._
import uncore.constants._
import util._
import Chisel.ImplicitConversions._

case class RocketConfig(xLen: Int)
// TODO replace some of below fields with above Config
case object XLen extends Field[Int]
case object FetchWidth extends Field[Int]
case object RetireWidth extends Field[Int]
case object FPUKey extends Field[Option[FPUConfig]]
case object MulDivKey extends Field[Option[MulDivConfig]]
case object UseVM extends Field[Boolean]
case object UseUser extends Field[Boolean]
case object UseDebug extends Field[Boolean]
case object UseAtomics extends Field[Boolean]
case object UseCompressed extends Field[Boolean]
case object FastLoadWord extends Field[Boolean]
case object FastLoadByte extends Field[Boolean]
case object FastJAL extends Field[Boolean]
case object CoreInstBits extends Field[Int]
case object NCustomMRWCSRs extends Field[Int]
case object MtvecWritable extends Field[Boolean]
case object MtvecInit extends Field[Option[BigInt]]
case object NBreakpoints extends Field[Int]
case object NPerfCounters extends Field[Int]
case object NPerfEvents extends Field[Int]
case object DataScratchpadSize extends Field[Int]

class RegFile(n: Int, w: Int, zero: Boolean = false) {
private val rf = Mem(n, UInt(width = w))
private def access(addr: UInt) = rf(~addr(log2Up(n)-1,0))
private val reads = collection.mutable.ArrayBuffer[(UInt,UInt)]()
private var canRead = true
def read(addr: UInt) = {
require(canRead)
reads += addr -> Wire(UInt())
reads.last._2 := Mux(Bool(zero) && addr === UInt(0), UInt(0), access(addr))
reads.last._2
}
def write(addr: UInt, data: UInt) = {
canRead = false
when (addr =/= UInt(0)) {
access(addr) := data
for ((raddr, rdata) <- reads)
when (addr === raddr) { rdata := data }
}
}
case class RocketCoreParams(
useVM: Boolean = true,
useUser: Boolean = false,
useDebug: Boolean = true,
useAtomics: Boolean = true,
useCompressed: Boolean = true,
nBreakpoints: Int = 1,
nPerfCounters: Int = 0,
nPerfEvents: Int = 0,
nCustomMRWCSRs: Int = 0,
mtvecInit: Option[BigInt] = Some(BigInt(0)),
mtvecWritable: Boolean = true,
fastLoadWord: Boolean = true,
fastLoadByte: Boolean = false,
fastJAL: Boolean = false,
mulDiv: Option[MulDivParams] = Some(MulDivParams()),
fpu: Option[FPUParams] = Some(FPUParams())
) extends CoreParams {
val fetchWidth: Int = if (useCompressed) 2 else 1
// fetchWidth doubled, but coreInstBytes halved, for RVC:
val decodeWidth: Int = fetchWidth / (if (useCompressed) 2 else 1)
val retireWidth: Int = 1
val instBits: Int = if (useCompressed) 16 else 32
}
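// Illustration only, not part of this commit: a stripped-down core description built
// from the case class above (field values are hypothetical):
//   val tinyCore = RocketCoreParams(useVM = false, useCompressed = false,
//                                   fpu = None, mulDiv = Some(MulDivParams()))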
object ImmGen {
def apply(sel: UInt, inst: UInt) = {
val sign = Mux(sel === IMM_Z, SInt(0), inst(31).asSInt)
val b30_20 = Mux(sel === IMM_U, inst(30,20).asSInt, sign)
val b19_12 = Mux(sel =/= IMM_U && sel =/= IMM_UJ, sign, inst(19,12).asSInt)
val b11 = Mux(sel === IMM_U || sel === IMM_Z, SInt(0),
Mux(sel === IMM_UJ, inst(20).asSInt,
Mux(sel === IMM_SB, inst(7).asSInt, sign)))
val b10_5 = Mux(sel === IMM_U || sel === IMM_Z, Bits(0), inst(30,25))
val b4_1 = Mux(sel === IMM_U, Bits(0),
Mux(sel === IMM_S || sel === IMM_SB, inst(11,8),
Mux(sel === IMM_Z, inst(19,16), inst(24,21))))
val b0 = Mux(sel === IMM_S, inst(7),
Mux(sel === IMM_I, inst(20),
Mux(sel === IMM_Z, inst(15), Bits(0))))
trait HasRocketCoreParameters extends HasCoreParameters {
val rocketParams: RocketCoreParams = tileParams.core.asInstanceOf[RocketCoreParams]

Cat(sign, b30_20, b19_12, b11, b10_5, b4_1, b0).asSInt
}
val fastLoadWord = rocketParams.fastLoadWord
val fastLoadByte = rocketParams.fastLoadByte
val fastJAL = rocketParams.fastJAL
val nBreakpoints = rocketParams.nBreakpoints
val nPerfCounters = rocketParams.nPerfCounters
val nPerfEvents = rocketParams.nPerfEvents
val nCustomMrwCsrs = rocketParams.nCustomMRWCSRs
val mtvecInit = rocketParams.mtvecInit
val mtvecWritable = rocketParams.mtvecWritable

val mulDivParams = rocketParams.mulDiv.getOrElse(MulDivParams()) // TODO ask andrew about this

require(!fastLoadByte || fastLoadWord)
}

class Rocket(val c: RocketConfig)(implicit p: Parameters) extends CoreModule()(p) with HasCoreIO {
class Rocket(implicit p: Parameters) extends CoreModule()(p)
with HasRocketCoreParameters
with HasCoreIO {

val decode_table = {
(if (usingMulDiv) new MDecode +: (xLen > 32).option(new M64Decode).toSeq else Nil) ++:
@ -243,7 +224,7 @@ class Rocket(val c: RocketConfig)(implicit p: Parameters) extends CoreModule()(p
alu.io.in1 := ex_op1.asUInt

// multiplier and divider
val div = Module(new MulDiv(p(MulDivKey).getOrElse(MulDivConfig()), width = xLen))
val div = Module(new MulDiv(mulDivParams, width = xLen))
div.io.req.valid := ex_reg_valid && ex_ctrl.div
div.io.req.bits.dw := ex_ctrl.alu_dw
div.io.req.bits.fn := ex_ctrl.alu_fn
@ -324,9 +305,7 @@ class Rocket(val c: RocketConfig)(implicit p: Parameters) extends CoreModule()(p
val mem_int_wdata = Mux(!mem_reg_xcpt && (mem_ctrl.jalr ^ mem_npc_misaligned), mem_br_target, mem_reg_wdata.asSInt).asUInt
val mem_cfi = mem_ctrl.branch || mem_ctrl.jalr || mem_ctrl.jal
val mem_cfi_taken = (mem_ctrl.branch && mem_br_taken) || mem_ctrl.jalr || (Bool(!fastJAL) && mem_ctrl.jal)
val mem_misprediction =
if (p(BtbKey).nEntries == 0) mem_cfi_taken
else mem_wrong_npc
val mem_misprediction = if (usingBTB) mem_wrong_npc else mem_cfi_taken
take_pc_mem := mem_reg_valid && (mem_misprediction || mem_reg_flush_pipe)

mem_reg_valid := !ctrl_killx
@ -652,3 +631,44 @@ class Rocket(val c: RocketConfig)(implicit p: Parameters) extends CoreModule()(p
}
}
}

class RegFile(n: Int, w: Int, zero: Boolean = false) {
private val rf = Mem(n, UInt(width = w))
private def access(addr: UInt) = rf(~addr(log2Up(n)-1,0))
private val reads = collection.mutable.ArrayBuffer[(UInt,UInt)]()
private var canRead = true
def read(addr: UInt) = {
require(canRead)
reads += addr -> Wire(UInt())
reads.last._2 := Mux(Bool(zero) && addr === UInt(0), UInt(0), access(addr))
reads.last._2
}
def write(addr: UInt, data: UInt) = {
canRead = false
when (addr =/= UInt(0)) {
access(addr) := data
for ((raddr, rdata) <- reads)
when (addr === raddr) { rdata := data }
}
}
}

object ImmGen {
def apply(sel: UInt, inst: UInt) = {
val sign = Mux(sel === IMM_Z, SInt(0), inst(31).asSInt)
val b30_20 = Mux(sel === IMM_U, inst(30,20).asSInt, sign)
val b19_12 = Mux(sel =/= IMM_U && sel =/= IMM_UJ, sign, inst(19,12).asSInt)
val b11 = Mux(sel === IMM_U || sel === IMM_Z, SInt(0),
Mux(sel === IMM_UJ, inst(20).asSInt,
Mux(sel === IMM_SB, inst(7).asSInt, sign)))
val b10_5 = Mux(sel === IMM_U || sel === IMM_Z, Bits(0), inst(30,25))
val b4_1 = Mux(sel === IMM_U, Bits(0),
Mux(sel === IMM_S || sel === IMM_SB, inst(11,8),
Mux(sel === IMM_Z, inst(19,16), inst(24,21))))
val b0 = Mux(sel === IMM_S, inst(7),
Mux(sel === IMM_I, inst(20),
Mux(sel === IMM_Z, inst(15), Bits(0))))

Cat(sign, b30_20, b19_12, b11, b10_5, b4_1, b0).asSInt
}
}
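// Worked example (standard RISC-V I-type encoding, for reference): when sel === IMM_I,
// the expression above reduces to the sign-extended inst(31,20): bits 10:5 come from
// inst(30,25), bits 4:1 from inst(24,21), bit 0 from inst(20), and inst(31) fills every
// higher bit.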
@ -4,22 +4,24 @@ package rocket

import Chisel._
import Chisel.ImplicitConversions._
import junctions._
import diplomacy._
import config._
import coreplex.CacheBlockBytes
import diplomacy._
import tile._
import uncore.constants._
import uncore.tilelink2._
import uncore.util._
import util._

class ScratchpadSlavePort(implicit p: Parameters) extends LazyModule {
val coreDataBytes = p(XLen)/8
class ScratchpadSlavePort(sizeBytes: Int)(implicit p: Parameters) extends LazyModule
with HasCoreParameters {
val node = TLManagerNode(Seq(TLManagerPortParameters(
Seq(TLManagerParameters(
address = List(AddressSet(0x80000000L, BigInt(p(DataScratchpadSize)-1))),
address = List(AddressSet(0x80000000L, BigInt(sizeBytes-1))),
regionType = RegionType.UNCACHED,
executable = true,
supportsArithmetic = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none,
supportsLogical = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none,
supportsArithmetic = if (usingAtomics) TransferSizes(1, coreDataBytes) else TransferSizes.none,
supportsLogical = if (usingAtomics) TransferSizes(1, coreDataBytes) else TransferSizes.none,
supportsPutPartial = TransferSizes(1, coreDataBytes),
supportsPutFull = TransferSizes(1, coreDataBytes),
supportsGet = TransferSizes(1, coreDataBytes),
@ -106,10 +108,11 @@ class ScratchpadSlavePort(implicit p: Parameters) extends LazyModule {
trait CanHaveScratchpad extends HasHellaCache with HasICacheFrontend {
val module: CanHaveScratchpadModule

val slaveNode = if (p(DataScratchpadSize) == 0) None else Some(TLInputNode())
val scratch = if (p(DataScratchpadSize) == 0) None else Some(LazyModule(new ScratchpadSlavePort()(dcacheParams)))
val sizeBytes = tileParams.dataScratchpadBytes
val slaveNode = TLInputNode()
val scratch = if (sizeBytes > 0) Some(LazyModule(new ScratchpadSlavePort(sizeBytes))) else None

(slaveNode zip scratch) foreach { case (node, lm) => lm.node := TLFragmenter(p(XLen)/8, p(CacheBlockBytes))(node) }
scratch foreach { lm => lm.node := TLFragmenter(p(XLen)/8, p(CacheBlockBytes))(slaveNode) }

def findScratchpadFromICache: Option[AddressSet] = scratch.map { s =>
val finalNode = frontend.node.edgesOut(0).manager.managers.find(_.nodePath.last == s.node)
@ -118,12 +121,12 @@ trait CanHaveScratchpad extends HasHellaCache with HasICacheFrontend {
finalNode.get.address(0)
}

nDCachePorts += 1 // core TODO dcachePorts += () => module.io.dmem ??
nDCachePorts += (sizeBytes > 0).toInt
}

trait CanHaveScratchpadBundle extends HasHellaCacheBundle with HasICacheFrontendBundle {
val outer: CanHaveScratchpad
val slave = outer.slaveNode.map(_.bundleIn)
val slave = outer.slaveNode.bundleIn
}

trait CanHaveScratchpadModule extends HasHellaCacheModule with HasICacheFrontendModule {

@ -4,24 +4,18 @@
package rocket

import Chisel._
import util._
import Chisel.ImplicitConversions._
import scala.math._
import config._
import diplomacy._
import uncore.util._
import coreplex.CacheBlockBytes
import tile.{XLen, CoreModule, CoreBundle}
import uncore.tilelink2._
import util._

case object PAddrBits extends Field[Int]
case object PgLevels extends Field[Int]
case object ASIdBits extends Field[Int]

trait HasTLBParameters extends HasL1CacheParameters {
val entries = p(p(CacheName)).nTLBEntries
val camAddrBits = log2Ceil(entries)
val camTagBits = asIdBits + vpnBits
}

class TLBReq(implicit p: Parameters) extends CoreBundle()(p) {
val vpn = UInt(width = vpnBitsExtended)
val passthrough = Bool()
@ -39,12 +33,15 @@ class TLBResp(implicit p: Parameters) extends CoreBundle()(p) {
val cacheable = Bool(OUTPUT)
}

class TLB(implicit edge: TLEdgeOut, val p: Parameters) extends Module with HasTLBParameters {
class TLB(entries: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()(p) {
val io = new Bundle {
val req = Decoupled(new TLBReq).flip
val resp = new TLBResp
val ptw = new TLBPTWIO
}
val cacheBlockBytes = p(CacheBlockBytes)
val camAddrBits = log2Ceil(entries)
val camTagBits = asIdBits + vpnBits

val valid = Reg(init = UInt(0, entries))
val ppns = Reg(Vec(entries, UInt(width = ppnBits)))
@ -182,7 +179,7 @@ class TLB(implicit edge: TLEdgeOut, val p: Parameters) extends Module with HasTL
}
}

class DecoupledTLB(implicit edge: TLEdgeOut, p: Parameters) extends Module {
class DecoupledTLB(entries: Int)(implicit edge: TLEdgeOut, p: Parameters) extends Module {
val io = new Bundle {
val req = Decoupled(new TLBReq).flip
val resp = Decoupled(new TLBResp)
@ -191,7 +188,7 @@ class DecoupledTLB(implicit edge: TLEdgeOut, p: Parameters) extends Module {

val req = Reg(new TLBReq)
val resp = Reg(new TLBResp)
val tlb = Module(new TLB)
val tlb = Module(new TLB(entries))

val s_idle :: s_tlb_req :: s_tlb_resp :: s_done :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_idle)

@ -7,12 +7,23 @@ import Chisel._
import config._
import coreplex._
import diplomacy._
import uncore.converters._
import tile._
import uncore.devices._
import uncore.tilelink2._
import util._

class RocketTile(val c: RocketConfig)(implicit p: Parameters) extends BaseTile()(p)
case class RocketTileParams(
core: RocketCoreParams = RocketCoreParams(),
icache: Option[ICacheParams] = Some(ICacheParams()),
dcache: Option[DCacheParams] = Some(DCacheParams()),
rocc: Seq[RoCCParams] = Nil,
btb: Option[BTBParams] = Some(BTBParams()),
dataScratchpadBytes: Int = 0) extends TileParams {
require(icache.isDefined)
require(dcache.isDefined)
}
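// Illustration only, not part of this commit: two per-tile descriptions for a
// heterogeneous design (field values are hypothetical; DCacheParams and ICacheParams
// are defined elsewhere in this commit):
//   val big    = RocketTileParams()
//   val little = RocketTileParams(
//     core   = RocketCoreParams(fpu = None, mulDiv = Some(MulDivParams(mulUnroll = 1))),
//     icache = Some(ICacheParams(nSets = 32, nWays = 2)),
//     dcache = Some(DCacheParams(nSets = 32, nWays = 2)))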
class RocketTile(val rocketParams: RocketTileParams)(implicit p: Parameters) extends BaseTile(rocketParams)(p)
with CanHaveLegacyRoccs // implies CanHaveSharedFPU with CanHavePTW with HasHellaCache
with CanHaveScratchpad { // implies CanHavePTW with HasHellaCache with HasICacheFrontend

@ -28,7 +39,7 @@ class RocketTileModule(outer: RocketTile) extends BaseTileModule(outer, () => ne
with CanHaveLegacyRoccsModule
with CanHaveScratchpadModule {

val core = Module(p(BuildCore)(outer.c, outer.p))
val core = Module(p(BuildCore)(outer.p))
core.io.interrupts := io.interrupts
core.io.hartid := io.hartid
outer.frontend.module.io.cpu <> core.io.imem
@ -44,25 +55,34 @@ class RocketTileModule(outer: RocketTile) extends BaseTileModule(outer, () => ne
core.io.rocc.interrupt := lr.module.io.core.interrupt
}

// TODO eliminate this redundancy
val h = dcachePorts.size
val c = core.dcacheArbPorts
val o = outer.nDCachePorts
require(h == c, s"port list size was $h, core expected $c")
require(h == o, s"port list size was $h, outer counted $o")
// TODO figure out how to move the below into their respective mix-ins
require(dcachePorts.size == core.dcacheArbPorts)
dcacheArb.io.requestor <> dcachePorts
ptwOpt foreach { ptw => ptw.io.requestor <> ptwPorts }
}

class AsyncRocketTile(c: RocketConfig)(implicit p: Parameters) extends LazyModule {
val rocket = LazyModule(new RocketTile(c))
class AsyncRocketTile(rtp: RocketTileParams)(implicit p: Parameters) extends LazyModule {
val rocket = LazyModule(new RocketTile(rtp))

val masterNodes = rocket.masterNodes.map(_ => TLAsyncOutputNode())
val slaveNode = rocket.slaveNode.map(_ => TLAsyncInputNode())
val masterNode = TLAsyncOutputNode()
val source = LazyModule(new TLAsyncCrossingSource)
source.node :=* rocket.masterNode
masterNode :=* source.node

(rocket.masterNodes zip masterNodes) foreach { case (r,n) => n := TLAsyncCrossingSource()(r) }
(rocket.slaveNode zip slaveNode) foreach { case (r,n) => r := TLAsyncCrossingSink()(n) }
val slaveNode = TLAsyncInputNode()
val sink = LazyModule(new TLAsyncCrossingSink)
rocket.slaveNode :*= sink.node
sink.node :*= slaveNode

lazy val module = new LazyModuleImp(this) {
val io = new Bundle {
val master = masterNodes.head.bundleOut // TODO fix after Chisel #366
val slave = slaveNode.map(_.bundleIn)
val master = masterNode.bundleOut
val slave = slaveNode.bundleIn
val hartid = UInt(INPUT, p(XLen))
val interrupts = new TileInterrupts()(p).asInput
val resetVector = UInt(INPUT, p(XLen))
@ -74,19 +94,23 @@ class AsyncRocketTile(c: RocketConfig)(implicit p: Parameters) extends LazyModul
}
}

class RationalRocketTile(c: RocketConfig)(implicit p: Parameters) extends LazyModule {
val rocket = LazyModule(new RocketTile(c))
class RationalRocketTile(rtp: RocketTileParams)(implicit p: Parameters) extends LazyModule {
val rocket = LazyModule(new RocketTile(rtp))

val masterNodes = rocket.masterNodes.map(_ => TLRationalOutputNode())
val slaveNode = rocket.slaveNode.map(_ => TLRationalInputNode())
val masterNode = TLRationalOutputNode()
val source = LazyModule(new TLRationalCrossingSource)
source.node :=* rocket.masterNode
masterNode :=* source.node

(rocket.masterNodes zip masterNodes) foreach { case (r,n) => n := TLRationalCrossingSource()(r) }
(rocket.slaveNode zip slaveNode) foreach { case (r,n) => r := TLRationalCrossingSink()(n) }
val slaveNode = TLRationalInputNode()
val sink = LazyModule(new TLRationalCrossingSink)
rocket.slaveNode :*= sink.node
sink.node :*= slaveNode

lazy val module = new LazyModuleImp(this) {
val io = new Bundle {
val master = masterNodes.head.bundleOut // TODO fix after Chisel #366
val slave = slaveNode.map(_.bundleIn)
val master = masterNode.bundleOut
val slave = slaveNode.bundleIn
val hartid = UInt(INPUT, p(XLen))
val interrupts = new TileInterrupts()(p).asInput
val resetVector = UInt(INPUT, p(XLen))