[tl2] convert NBDcache to TL2 (WIP; compiles but untested)
parent 5f1cc19d71
commit 8b908465e0
src/main/scala/rocket/HellaCache.scala (new file, 167 lines)
@@ -0,0 +1,167 @@
// See LICENSE for license details.

package rocket

import Chisel._
import cde.{Parameters, Field}
import diplomacy._
import uncore.tilelink2._
import uncore.agents._
import uncore.constants._
import uncore.tilelink.{TLKey, TLId}
import util.ParameterizedBundle

case class DCacheConfig(
  nMSHRs: Int = 1,
  nSDQ: Int = 17,
  nRPQ: Int = 16,
  nMMIOs: Int = 1)

case object DCacheKey extends Field[DCacheConfig]

trait HasL1HellaCacheParameters extends HasCacheParameters with HasCoreParameters {
  val outerDataBeats = p(TLKey(p(TLId))).dataBeats
  val outerDataBits = p(TLKey(p(TLId))).dataBitsPerBeat

  val refillCyclesPerBeat = outerDataBits/rowBits
  require(refillCyclesPerBeat == 1)

  val refillCycles = refillCyclesPerBeat*outerDataBeats

  val cacheBlockBytes = p(CacheBlockBytes)
  val lgCacheBlockBytes = log2Up(cacheBlockBytes)

  val wordBits = xLen // really, xLen max
  val wordBytes = wordBits/8
  val wordOffBits = log2Up(wordBytes)
  val beatBytes = cacheBlockBytes / outerDataBeats
  val beatWords = beatBytes / wordBytes
  val beatOffBits = log2Up(beatBytes)
  val idxMSB = untagBits-1
  val idxLSB = blockOffBits
  val offsetmsb = idxLSB-1
  val offsetlsb = wordOffBits
  val rowWords = rowBits/wordBits
  val doNarrowRead = coreDataBits * nWays % rowBits == 0
  val encDataBits = code.width(coreDataBits)
  val encRowBits = encDataBits*rowWords
  val nIOMSHRs = 1
  val lrscCycles = 32 // ISA requires 16-insn LRSC sequences to succeed

  require(isPow2(nSets))
  require(rowBits >= coreDataBits)
  require(rowBits <= outerDataBits)
  require(xLen <= outerDataBits) // would need offset addr for puts if data width < xlen
  require(!usingVM || untagBits <= pgIdxBits)
}

abstract class L1HellaCacheModule(implicit val p: Parameters) extends Module
  with HasL1HellaCacheParameters

abstract class L1HellaCacheBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
  with HasL1HellaCacheParameters

class L1Metadata(implicit p: Parameters) extends Metadata()(p) with HasL1HellaCacheParameters {
  val coh = new ClientMetadata
}
object L1Metadata {
  def apply(tag: Bits, coh: ClientMetadata)(implicit p: Parameters) = {
    val meta = Wire(new L1Metadata)
    meta.tag := tag
    meta.coh := coh
    meta
  }
}

class L1MetaReadReq(implicit p: Parameters) extends MetaReadReq {
  val tag = Bits(width = tagBits)
  override def cloneType = new L1MetaReadReq()(p).asInstanceOf[this.type] //TODO remove
}

class L1MetaWriteReq(implicit p: Parameters) extends
  MetaWriteReq[L1Metadata](new L1Metadata)

trait HasCoreMemOp extends HasCoreParameters {
  val addr = UInt(width = coreMaxAddrBits)
  val tag = Bits(width = dcacheReqTagBits)
  val cmd = Bits(width = M_SZ)
  val typ = Bits(width = MT_SZ)
}

trait HasCoreData extends HasCoreParameters {
  val data = Bits(width = coreDataBits)
}

class HellaCacheReq(implicit p: Parameters) extends HellaCacheReqInternal()(p) with HasCoreData

class HellaCacheResp(implicit p: Parameters) extends CoreBundle()(p)
    with HasCoreMemOp
    with HasCoreData {
  val replay = Bool()
  val has_data = Bool()
  val data_word_bypass = Bits(width = coreDataBits)
  val store_data = Bits(width = coreDataBits)
}

class AlignmentExceptions extends Bundle {
  val ld = Bool()
  val st = Bool()
}

class HellaCacheExceptions extends Bundle {
  val ma = new AlignmentExceptions
  val pf = new AlignmentExceptions
}

// interface between D$ and processor/DTLB
class HellaCacheIO(implicit p: Parameters) extends CoreBundle()(p) {
  val req = Decoupled(new HellaCacheReq)
  val s1_kill = Bool(OUTPUT) // kill previous cycle's req
  val s1_data = Bits(OUTPUT, coreDataBits) // data for previous cycle's req
  val s2_nack = Bool(INPUT) // req from two cycles ago is rejected

  val resp = Valid(new HellaCacheResp).flip
  val replay_next = Bool(INPUT)
  val xcpt = (new HellaCacheExceptions).asInput
  val invalidate_lr = Bool(OUTPUT)
  val ordered = Bool(INPUT)
}

abstract class HellaCache(val cfg: DCacheConfig)(implicit val p: Parameters) extends LazyModule {
  val node = TLClientNode(TLClientParameters(
    sourceId = IdRange(0, cfg.nMSHRs + cfg.nMMIOs),
    supportsProbe = TransferSizes(p(CacheBlockBytes))))
  val module: HellaCacheModule
}

class HellaCacheBundle(outer: HellaCache)(implicit p: Parameters) extends Bundle {
  val cpu = (new HellaCacheIO).flip
  val ptw = new TLBPTWIO()
  val mem = outer.node.bundleOut
}

class HellaCacheModule(outer: HellaCache)(implicit val p: Parameters) extends LazyModuleImp(outer)
    with HasL1HellaCacheParameters {
  implicit val cfg = outer.cfg
  val io = new HellaCacheBundle(outer)
  val edge = outer.node.edgesOut(0)
  val tl_out = io.mem(0)

  /* TODO
  edge.manager.managers.foreach { m =>
    if (m.supportsGet) {
      require (m.supportsGet.contains(TransferSizes(1, tlDataBytes)))
      ....etc
    }
  }
  */
}

object HellaCache {
  def apply(cfg: DCacheConfig)(implicit p: Parameters) = {
    if (cfg.nMSHRs == 0) LazyModule(new DCache(cfg))
    else LazyModule(new NonBlockingDCache(cfg))
  }
}
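A minimal usage sketch (not part of this commit) of the factory above: a tile binds DCacheKey in its Parameters and lets HellaCache.apply pick the blocking or non-blocking implementation. `ExampleTile` and the bus-connection comment are illustrative assumptions, not code from this change.

import Chisel._
import cde.Parameters
import diplomacy._

class ExampleTile(implicit p: Parameters) extends LazyModule {
  val dcacheCfg = p(DCacheKey)        // e.g. DCacheConfig(nMSHRs = 0) selects the blocking DCache
  val dcache = HellaCache(dcacheCfg)  // nMSHRs > 0 builds a NonBlockingDCache instead
  // In a real tile the cache's TileLink2 client node would be attached to a bus node,
  // e.g. someBusNode := dcache.node (someBusNode is assumed, not defined here).
  lazy val module = new LazyModuleImp(this) { }
}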
src/main/scala/rocket/ScratchpadSlavePort.scala (new file, 107 lines)
@@ -0,0 +1,107 @@
// See LICENSE for license details.

package rocket

import Chisel._
import Chisel.ImplicitConversions._
import cde.Parameters
import junctions._
import diplomacy._
import uncore.constants._
import uncore.tilelink2._
import uncore.util._

class ScratchpadSlavePort(implicit val p: Parameters) extends LazyModule with HasCoreParameters {
  val node = TLManagerNode(TLManagerPortParameters(
    Seq(TLManagerParameters(
      address = List(AddressSet(0x80000000L, BigInt(p(DataScratchpadSize)-1))),
      regionType = RegionType.UNCACHED,
      executable = true,
      supportsArithmetic = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none,
      supportsLogical = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none,
      supportsPutPartial = TransferSizes(1, coreDataBytes),
      supportsPutFull = TransferSizes(1, coreDataBytes),
      supportsGet = TransferSizes(1, coreDataBytes),
      fifoId = Some(0))), // requests handled in FIFO order
    beatBytes = coreDataBytes,
    minLatency = 1))

  // Make sure this ends up with the same name as before
  override def name = "dmem0"

  lazy val module = new LazyModuleImp(this) {
    val io = new Bundle {
      val tl_in = node.bundleIn
      val dmem = new HellaCacheIO
    }

    val tl_in = io.tl_in(0)
    val edge = node.edgesIn(0)

    require(usingDataScratchpad)

    val s_ready :: s_wait :: s_replay :: s_grant :: Nil = Enum(UInt(), 4)
    val state = Reg(init = s_ready)
    when (io.dmem.resp.valid) { state := s_grant }
    when (tl_in.d.fire()) { state := s_ready }
    when (io.dmem.s2_nack) { state := s_replay }
    when (io.dmem.req.fire()) { state := s_wait }

    val acq = Reg(tl_in.a.bits)
    when (io.dmem.resp.valid) { acq.data := io.dmem.resp.bits.data }
    when (tl_in.a.fire()) { acq := tl_in.a.bits }

    val isWrite = acq.opcode === TLMessages.PutFullData || acq.opcode === TLMessages.PutPartialData
    val isRead = !edge.hasData(acq)

    def formCacheReq(acq: TLBundleA) = {
      val req = Wire(new HellaCacheReq)
      req.cmd := MuxLookup(acq.opcode, Wire(M_XRD), Array(
        TLMessages.PutFullData -> M_XWR,
        TLMessages.PutPartialData -> M_XWR,
        TLMessages.ArithmeticData -> MuxLookup(acq.param, Wire(M_XRD), Array(
          TLAtomics.MIN -> M_XA_MIN,
          TLAtomics.MAX -> M_XA_MAX,
          TLAtomics.MINU -> M_XA_MINU,
          TLAtomics.MAXU -> M_XA_MAXU,
          TLAtomics.ADD -> M_XA_ADD)),
        TLMessages.LogicalData -> MuxLookup(acq.param, Wire(M_XRD), Array(
          TLAtomics.XOR -> M_XA_XOR,
          TLAtomics.OR -> M_XA_OR,
          TLAtomics.AND -> M_XA_AND,
          TLAtomics.SWAP -> M_XA_SWAP)),
        TLMessages.Get -> M_XRD))
      // treat all loads as full words, so bytes appear in correct lane
      req.typ := Mux(isRead, log2Ceil(coreDataBytes), acq.size)
      req.addr := Mux(isRead, ~(~acq.address | (coreDataBytes-1)), acq.address)
      req.tag := UInt(0)
      req
    }

    val ready = state === s_ready || tl_in.d.fire()
    io.dmem.req.valid := (tl_in.a.valid && ready) || state === s_replay
    tl_in.a.ready := io.dmem.req.ready && ready
    io.dmem.req.bits := formCacheReq(Mux(state === s_replay, acq, tl_in.a.bits))
    // the TL data is already in the correct byte lane, but the D$
    // expects right-justified store data, so that it can steer the bytes.
    io.dmem.s1_data := new LoadGen(acq.size, Bool(false), acq.address(log2Ceil(coreDataBytes)-1,0), acq.data, Bool(false), coreDataBytes).data
    io.dmem.s1_kill := false
    io.dmem.invalidate_lr := false

    // place AMO data in correct word lane
    val minAMOBytes = 4
    val grantData = Mux(io.dmem.resp.valid, io.dmem.resp.bits.data, acq.data)
    val alignedGrantData = Mux(acq.size <= log2Ceil(minAMOBytes), Fill(coreDataBytes/minAMOBytes, grantData(8*minAMOBytes-1, 0)), grantData)

    tl_in.d.valid := io.dmem.resp.valid || state === s_grant
    tl_in.d.bits := Mux(isWrite,
      edge.AccessAck(acq, UInt(0)),
      edge.AccessAck(acq, UInt(0), UInt(0)))
    tl_in.d.bits.data := alignedGrantData

    // Tie off unused channels
    tl_in.b.valid := Bool(false)
    tl_in.c.ready := Bool(true)
    tl_in.e.ready := Bool(true)
  }
}
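In formCacheReq above, reads are widened to a full data word and the address is rounded down with `~(~addr | (coreDataBytes-1))`, the usual Chisel idiom for clearing the low bits without widening the mask. A plain-Scala illustration of that arithmetic, assuming coreDataBytes = 8:

object WordAlignExample extends App {
  val coreDataBytes = 8                               // assumed value for illustration (64-bit words)
  val addr = 0x80001239L                              // a byte address somewhere inside a word
  val aligned = ~(~addr | (coreDataBytes - 1))        // clears the low log2(8) = 3 bits
  println(f"addr=0x$addr%x -> aligned=0x$aligned%x")  // prints addr=0x80001239 -> aligned=0x80001238
  assert(aligned == (addr & ~(coreDataBytes - 1).toLong))
}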
src/main/scala/rocket/SimpleHellaCacheIF.scala (new file, 136 lines)
@@ -0,0 +1,136 @@
// See LICENSE for license details.

package rocket

import Chisel._
import Chisel.ImplicitConversions._
import cde.Parameters
import util._

/**
 * This module buffers requests made by the SimpleHellaCacheIF in case they
 * are nacked. Nacked requests must be replayed in order, and no other requests
 * must be allowed to go through until the replayed requests are successfully
 * completed.
 */
class SimpleHellaCacheIFReplayQueue(depth: Int)
    (implicit val p: Parameters) extends Module
    with HasL1HellaCacheParameters {
  val io = new Bundle {
    val req = Decoupled(new HellaCacheReq).flip
    val nack = Valid(Bits(width = coreDCacheReqTagBits)).flip
    val resp = Valid(new HellaCacheResp).flip
    val replay = Decoupled(new HellaCacheReq)
  }

  // Registers to store the sent request.
  // When a request is sent the first time,
  // it is stored in one of the reqs registers
  // and the corresponding inflight bit is set.
  // The reqs register will be deallocated once the request is
  // successfully completed.
  val inflight = Reg(init = UInt(0, depth))
  val reqs = Reg(Vec(depth, new HellaCacheReq))

  // The nack queue stores the index of nacked requests (in the reqs vector)
  // in the order that they were nacked. A request is enqueued onto nackq
  // when it is newly nacked (i.e. not a nack for a previous replay).
  // The head of the nack queue will be replayed until it is
  // successfully completed, at which time the request is dequeued.
  // No new requests will be made or other replays attempted until the head
  // of the nackq is successfully completed.
  val nackq = Module(new Queue(UInt(width = log2Up(depth)), depth))
  val replaying = Reg(init = Bool(false))

  val next_inflight_onehot = PriorityEncoderOH(~inflight)
  val next_inflight = OHToUInt(next_inflight_onehot)

  val next_replay = nackq.io.deq.bits
  val next_replay_onehot = UIntToOH(next_replay)
  val next_replay_req = reqs(next_replay)

  // Keep sending the head of the nack queue until it succeeds
  io.replay.valid := nackq.io.deq.valid && !replaying
  io.replay.bits := next_replay_req
  // Don't allow new requests if there are replays waiting
  // or something being nacked.
  io.req.ready := !inflight.andR && !nackq.io.deq.valid && !io.nack.valid

  // Match on the tags to determine the index of nacks or responses
  val nack_onehot = Cat(reqs.map(_.tag === io.nack.bits).reverse) & inflight
  val resp_onehot = Cat(reqs.map(_.tag === io.resp.bits.tag).reverse) & inflight

  val replay_complete = io.resp.valid && replaying && io.resp.bits.tag === next_replay_req.tag
  val nack_head = io.nack.valid && nackq.io.deq.valid && io.nack.bits === next_replay_req.tag

  // Enqueue to the nack queue if there is a nack that is not in response to
  // the previous replay
  nackq.io.enq.valid := io.nack.valid && !nack_head
  nackq.io.enq.bits := OHToUInt(nack_onehot)
  assert(!nackq.io.enq.valid || nackq.io.enq.ready,
    "SimpleHellaCacheIF: ReplayQueue nack queue overflow")

  // Dequeue from the nack queue if the last replay was successfully completed
  nackq.io.deq.ready := replay_complete
  assert(!nackq.io.deq.ready || nackq.io.deq.valid,
    "SimpleHellaCacheIF: ReplayQueue nack queue underflow")

  // Set inflight bit when a request is made.
  // Clear it when it is successfully completed.
  inflight := (inflight | Mux(io.req.fire(), next_inflight_onehot, UInt(0))) &
    ~Mux(io.resp.valid, resp_onehot, UInt(0))

  when (io.req.fire()) {
    reqs(next_inflight) := io.req.bits
  }

  // Only one replay outstanding at a time
  when (io.replay.fire()) { replaying := Bool(true) }
  when (nack_head || replay_complete) { replaying := Bool(false) }
}

// exposes a sane decoupled request interface
class SimpleHellaCacheIF(implicit p: Parameters) extends Module
{
  val io = new Bundle {
    val requestor = new HellaCacheIO().flip
    val cache = new HellaCacheIO
  }

  val replayq = Module(new SimpleHellaCacheIFReplayQueue(2))
  val req_arb = Module(new Arbiter(new HellaCacheReq, 2))

  val req_helper = DecoupledHelper(
    req_arb.io.in(1).ready,
    replayq.io.req.ready,
    io.requestor.req.valid)

  req_arb.io.in(0) <> replayq.io.replay
  req_arb.io.in(1).valid := req_helper.fire(req_arb.io.in(1).ready)
  req_arb.io.in(1).bits := io.requestor.req.bits
  io.requestor.req.ready := req_helper.fire(io.requestor.req.valid)
  replayq.io.req.valid := req_helper.fire(replayq.io.req.ready)
  replayq.io.req.bits := io.requestor.req.bits

  val s0_req_fire = io.cache.req.fire()
  val s1_req_fire = Reg(next = s0_req_fire)
  val s2_req_fire = Reg(next = s1_req_fire)
  val s1_req_tag = Reg(next = io.cache.req.bits.tag)
  val s2_req_tag = Reg(next = s1_req_tag)
  val s2_kill = Reg(next = io.cache.s1_kill)

  io.cache.invalidate_lr := io.requestor.invalidate_lr
  io.cache.req <> req_arb.io.out
  io.cache.s1_kill := io.cache.s2_nack
  io.cache.s1_data := RegEnable(req_arb.io.out.bits.data, s0_req_fire)

  replayq.io.nack.valid := (io.cache.s2_nack || s2_kill) && s2_req_fire
  replayq.io.nack.bits := s2_req_tag
  replayq.io.resp := io.cache.resp
  io.requestor.resp := io.cache.resp

  assert(!Reg(next = io.cache.req.fire()) ||
    !(io.cache.xcpt.ma.ld || io.cache.xcpt.ma.st ||
      io.cache.xcpt.pf.ld || io.cache.xcpt.pf.st),
    "SimpleHellaCacheIF exception")
}
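The replay queue above keeps its bookkeeping in a one-hot `inflight` bitmap: PriorityEncoderOH over the complement picks a free slot, and the bit is cleared when the matching response returns. A stand-alone sketch of just that allocation idiom (the module and its port names are illustrative, not part of this commit):

import Chisel._

class InflightTracker(depth: Int) extends Module {
  val io = new Bundle {
    val alloc = Bool(INPUT)                                  // a new request is accepted this cycle
    val freeSlot = UInt(OUTPUT, log2Up(depth))               // slot chosen for that request
    val complete = Valid(UInt(width = log2Up(depth))).flip   // slot whose request just finished
    val full = Bool(OUTPUT)
  }
  val inflight = Reg(init = UInt(0, depth))
  val allocOH = PriorityEncoderOH(~inflight)                 // lowest free slot, one-hot
  io.freeSlot := OHToUInt(allocOH)
  io.full := inflight.andR
  // set the bit on allocation, clear it on completion, mirroring the update in the queue above
  inflight := (inflight | Mux(io.alloc, allocOH, UInt(0))) &
              ~Mux(io.complete.valid, UIntToOH(io.complete.bits, depth), UInt(0))
}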
@@ -3,15 +3,15 @@
package rocket

import Chisel._
import junctions._
import Chisel.ImplicitConversions._
import cde.Parameters
import diplomacy._
import uncore.tilelink2._
import uncore.constants._
import uncore.agents._
import uncore.constants._
import uncore.tilelink2._
import uncore.util._
import util._
import TLMessages._
import Chisel.ImplicitConversions._
import config._

class DCacheDataReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) {
@@ -40,46 +40,17 @@ class DCacheDataArray(implicit p: Parameters) extends L1HellaCacheModule()(p) {
  }
}

class DCache(maxUncachedInFlight: Int = 2)(implicit val p: Parameters) extends LazyModule with HasL1HellaCacheParameters {
class DCache(cfg: DCacheConfig)(implicit p: Parameters) extends HellaCache(cfg)(p) {
  override lazy val module = new DCacheModule(this)
}

  val node = TLClientNode(TLClientParameters(
    sourceId = IdRange(0, maxUncachedInFlight),
    supportsProbe = TransferSizes(cacheBlockBytes)))
class DCacheModule(outer: DCache)(implicit p: Parameters) extends HellaCacheModule(outer)(p) {

  lazy val module = new LazyModuleImp(this) {
    val io = new Bundle {
      val cpu = (new HellaCacheIO).flip
      val ptw = new TLBPTWIO()
      val mem = node.bundleOut
    }
  val maxUncachedInFlight = cfg.nMMIOs

    val edge = node.edgesOut(0)
    val tl_out = io.mem(0)

    /* TODO
    edge.manager.managers.foreach { m =>
      // If a slave supports read at all, it must support all TL Legacy requires
      if (m.supportsGet) {
        require (m.supportsGet.contains(TransferSizes(1, tlDataBytes)))
        require (m.supportsGet.contains(TransferSizes(tlDataBeats * tlDataBytes)))
      }
      // Likewise, any put support must mean full put support
      if (m.supportsPutPartial) {
        require (m.supportsPutPartial.contains(TransferSizes(1, tlDataBytes)))
        require (m.supportsPutPartial.contains(TransferSizes(tlDataBeats * tlDataBytes)))
      }
      // Any atomic support => must support 32-bit size
      if (m.supportsArithmetic) { require (m.supportsArithmetic.contains(TransferSizes(4))) }
      if (m.supportsLogical) { require (m.supportsLogical .contains(TransferSizes(4))) }
      // We straight-up require Acquire support; this is a cache, after all
      require (edge.manager.anySupportsAcquire)
    }
    */
  require(rowBits == encRowBits) // no ECC
  require(refillCyclesPerBeat == 1)
  require(rowBits >= coreDataBits)

  val grantackq = Module(new Queue(tl_out.e.bits,1))
  val grantackq = Module(new Queue(tl_out.e.bits,1)) // TODO don't need this in scratchpad mode

  // tags
  val replacer = p(Replacer)()
@@ -530,100 +501,4 @@ class DCache(maxUncachedInFlight: Int = 2)(implicit val p: Parameters) extends LazyModule with HasL1HellaCacheParameters {
      flushing := false
    }
  }
}
}

class ScratchpadSlavePort(implicit val p: Parameters) extends LazyModule with HasCoreParameters {
  val node = TLManagerNode(TLManagerPortParameters(
    Seq(TLManagerParameters(
      address = List(AddressSet(0x80000000L, BigInt(p(DataScratchpadSize)-1))),
      regionType = RegionType.UNCACHED,
      executable = true,
      supportsArithmetic = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none,
      supportsLogical = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none,
      supportsPutPartial = TransferSizes(1, coreDataBytes),
      supportsPutFull = TransferSizes(1, coreDataBytes),
      supportsGet = TransferSizes(1, coreDataBytes),
      fifoId = Some(0))), // requests handled in FIFO order
    beatBytes = coreDataBytes,
    minLatency = 1))

  // Make sure this ends up with the same name as before
  override def name = "dmem0"

  lazy val module = new LazyModuleImp(this) {
    val io = new Bundle {
      val tl_in = node.bundleIn
      val dmem = new HellaCacheIO
    }

    val tl_in = io.tl_in(0)
    val edge = node.edgesIn(0)

    require(usingDataScratchpad)

    val s_ready :: s_wait :: s_replay :: s_grant :: Nil = Enum(UInt(), 4)
    val state = Reg(init = s_ready)
    when (io.dmem.resp.valid) { state := s_grant }
    when (tl_in.d.fire()) { state := s_ready }
    when (io.dmem.s2_nack) { state := s_replay }
    when (io.dmem.req.fire()) { state := s_wait }

    val acq = Reg(tl_in.a.bits)
    when (io.dmem.resp.valid) { acq.data := io.dmem.resp.bits.data }
    when (tl_in.a.fire()) { acq := tl_in.a.bits }

    val isWrite = acq.opcode === TLMessages.PutFullData || acq.opcode === TLMessages.PutPartialData
    val isRead = !edge.hasData(acq)

    def formCacheReq(acq: TLBundleA) = {
      val req = Wire(new HellaCacheReq)
      req.cmd := MuxLookup(acq.opcode, Wire(M_XRD), Array(
        TLMessages.PutFullData -> M_XWR,
        TLMessages.PutPartialData -> M_XWR,
        TLMessages.ArithmeticData -> MuxLookup(acq.param, Wire(M_XRD), Array(
          TLAtomics.MIN -> M_XA_MIN,
          TLAtomics.MAX -> M_XA_MAX,
          TLAtomics.MINU -> M_XA_MINU,
          TLAtomics.MAXU -> M_XA_MAXU,
          TLAtomics.ADD -> M_XA_ADD)),
        TLMessages.LogicalData -> MuxLookup(acq.param, Wire(M_XRD), Array(
          TLAtomics.XOR -> M_XA_XOR,
          TLAtomics.OR -> M_XA_OR,
          TLAtomics.AND -> M_XA_AND,
          TLAtomics.SWAP -> M_XA_SWAP)),
        TLMessages.Get -> M_XRD))
      // treat all loads as full words, so bytes appear in correct lane
      req.typ := Mux(isRead, log2Ceil(coreDataBytes), acq.size)
      req.addr := Mux(isRead, ~(~acq.address | (coreDataBytes-1)), acq.address)
      req.tag := UInt(0)
      req
    }

    val ready = state === s_ready || tl_in.d.fire()
    io.dmem.req.valid := (tl_in.a.valid && ready) || state === s_replay
    tl_in.a.ready := io.dmem.req.ready && ready
    io.dmem.req.bits := formCacheReq(Mux(state === s_replay, acq, tl_in.a.bits))
    // the TL data is already in the correct byte lane, but the D$
    // expects right-justified store data, so that it can steer the bytes.
    io.dmem.s1_data := new LoadGen(acq.size, Bool(false), acq.address(log2Ceil(coreDataBytes)-1,0), acq.data, Bool(false), coreDataBytes).data
    io.dmem.s1_kill := false
    io.dmem.invalidate_lr := false

    // place AMO data in correct word lane
    val minAMOBytes = 4
    val grantData = Mux(io.dmem.resp.valid, io.dmem.resp.bits.data, acq.data)
    val alignedGrantData = Mux(acq.size <= log2Ceil(minAMOBytes), Fill(coreDataBytes/minAMOBytes, grantData(8*minAMOBytes-1, 0)), grantData)

    tl_in.d.valid := io.dmem.resp.valid || state === s_grant
    tl_in.d.bits := Mux(isWrite,
      edge.AccessAck(acq, UInt(0)),
      edge.AccessAck(acq, UInt(0), UInt(0)))
    tl_in.d.bits.data := alignedGrantData

    // Tie off unused channels
    tl_in.b.valid := Bool(false)
    tl_in.c.ready := Bool(true)
    tl_in.e.ready := Bool(true)
  }
}
File diff suppressed because it is too large
@@ -13,6 +13,14 @@ object TLArbiter
  val lowestIndexFirst: Policy = (valids, granted) =>
    valids.scanLeft(Bool(true))(_ && !_).init

  def lowestFromSeq[T <: TLChannel](edge: TLEdge, sink: DecoupledIO[T], sources: Seq[DecoupledIO[T]]) {
    apply(lowestIndexFirst)(sink, sources.map(s => (edge.numBeats1(s.bits), s)):_*)
  }

  def lowest[T <: TLChannel](edge: TLEdge, sink: DecoupledIO[T], sources: DecoupledIO[T]*) {
    apply(lowestIndexFirst)(sink, sources.toList.map(s => (edge.numBeats1(s.bits), s)):_*)
  }

  def apply[T <: Data](policy: Policy)(sink: DecoupledIO[T], sources: (UInt, DecoupledIO[T])*) {
    if (sources.isEmpty) {
      sink.valid := Bool(false)
@@ -136,11 +136,9 @@ class TLBroadcast(lineBytes: Int, numTrackers: Int = 4, bufferless: Boolean = false)
    putfull.bits := edgeOut.Put(Cat(put_what, in.c.bits.source), in.c.bits.address, in.c.bits.size, in.c.bits.data)._2

    // Combine ReleaseAck or the modified D
    TLArbiter(TLArbiter.lowestIndexFirst)(in.d, (UInt(0), releaseack), (edgeOut.numBeats1(d_normal.bits), d_normal))
    TLArbiter.lowest(edgeOut, in.d, releaseack, d_normal)
    // Combine the PutFull with the trackers
    TLArbiter(TLArbiter.lowestIndexFirst)(out.a,
      ((edgeOut.numBeats1(putfull.bits), putfull) +:
       trackers.map { t => (edgeOut.numBeats1(t.out_a.bits), t.out_a) }):_*)
    TLArbiter.lowestFromSeq(edgeOut, out.a, putfull +: trackers.map(_.out_a))

    // The Probe FSM walks all caches and probes them
    val probe_todo = RegInit(UInt(0, width = max(1, caches.size)))
@@ -191,27 +191,32 @@ class TLEdge(

  def first(bits: TLChannel, fire: Bool): Bool = firstlastHelper(bits, fire)._1
  def first(x: DecoupledIO[TLChannel]): Bool = first(x.bits, x.fire())
  def first(x: ValidIO[TLChannel]): Bool = first(x.bits, x.valid)

  def last(bits: TLChannel, fire: Bool): Bool = firstlastHelper(bits, fire)._2
  def last(x: DecoupledIO[TLChannel]): Bool = last(x.bits, x.fire())
  def last(x: ValidIO[TLChannel]): Bool = last(x.bits, x.valid)

  def firstlast(bits: TLChannel, fire: Bool): (Bool, Bool, Bool) = {
    val r = firstlastHelper(bits, fire)
    (r._1, r._2, r._3)
  }
  def firstlast(x: DecoupledIO[TLChannel]): (Bool, Bool, Bool) = firstlast(x.bits, x.fire())
  def firstlast(x: ValidIO[TLChannel]): (Bool, Bool, Bool) = firstlast(x.bits, x.valid)

  def count(bits: TLChannel, fire: Bool): (Bool, Bool, Bool, UInt) = {
    val r = firstlastHelper(bits, fire)
    (r._1, r._2, r._3, r._4)
  }
  def count(x: DecoupledIO[TLChannel]): (Bool, Bool, Bool, UInt) = count(x.bits, x.fire())
  def count(x: ValidIO[TLChannel]): (Bool, Bool, Bool, UInt) = count(x.bits, x.valid)

  def addr_inc(bits: TLChannel, fire: Bool): (Bool, Bool, Bool, UInt) = {
    val r = firstlastHelper(bits, fire)
    (r._1, r._2, r._3, r._4 << log2Ceil(manager.beatBytes))
  }
  def addr_inc(x: DecoupledIO[TLChannel]): (Bool, Bool, Bool, UInt) = addr_inc(x.bits, x.fire())
  def addr_inc(x: ValidIO[TLChannel]): (Bool, Bool, Bool, UInt) = addr_inc(x.bits, x.valid)
}

class TLEdgeOut(
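The new TLEdge overloads above accept a whole channel instead of a (bits, fire) pair. A hedged usage sketch; `beatInfo` is an illustrative helper, and `edge` and `a` are assumed to come from a surrounding TileLink2 module rather than from this commit:

import Chisel._
import uncore.tilelink2._

object TLEdgeUsageSketch {
  def beatInfo(edge: TLEdge, a: DecoupledIO[TLBundleA]) = {
    val a_first = edge.first(a)                  // same as edge.first(a.bits, a.fire())
    val a_last  = edge.last(a)                   // same as edge.last(a.bits, a.fire())
    val (_, _, a_done, a_count) = edge.count(a)  // beat counter for the in-flight transaction
    (a_first, a_last, a_done, a_count)
  }
}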