diff --git a/src/main/scala/rocket/HellaCache.scala b/src/main/scala/rocket/HellaCache.scala new file mode 100644 index 00000000..0fed1211 --- /dev/null +++ b/src/main/scala/rocket/HellaCache.scala @@ -0,0 +1,167 @@ +// See LICENSE for license details. + +package rocket + +import Chisel._ +import cde.{Parameters, Field} +import diplomacy._ +import uncore.tilelink2._ +import uncore.agents._ +import uncore.constants._ +import uncore.tilelink.{TLKey, TLId} +import util.ParameterizedBundle + +case class DCacheConfig( + nMSHRs: Int = 1, + nSDQ: Int = 17, + nRPQ: Int = 16, + nMMIOs: Int = 1) + +case object DCacheKey extends Field[DCacheConfig] + +trait HasL1HellaCacheParameters extends HasCacheParameters with HasCoreParameters { + val outerDataBeats = p(TLKey(p(TLId))).dataBeats + val outerDataBits = p(TLKey(p(TLId))).dataBitsPerBeat + + val refillCyclesPerBeat = outerDataBits/rowBits + require(refillCyclesPerBeat == 1) + + val refillCycles = refillCyclesPerBeat*outerDataBeats + + val cacheBlockBytes = p(CacheBlockBytes) + val lgCacheBlockBytes = log2Up(cacheBlockBytes) + + val wordBits = xLen // really, xLen max + val wordBytes = wordBits/8 + val wordOffBits = log2Up(wordBytes) + val beatBytes = cacheBlockBytes / outerDataBeats + val beatWords = beatBytes / wordBytes + val beatOffBits = log2Up(beatBytes) + val idxMSB = untagBits-1 + val idxLSB = blockOffBits + val offsetmsb = idxLSB-1 + val offsetlsb = wordOffBits + val rowWords = rowBits/wordBits + val doNarrowRead = coreDataBits * nWays % rowBits == 0 + val encDataBits = code.width(coreDataBits) + val encRowBits = encDataBits*rowWords + val nIOMSHRs = 1 + val lrscCycles = 32 // ISA requires 16-insn LRSC sequences to succeed + + require(isPow2(nSets)) + require(rowBits >= coreDataBits) + require(rowBits <= outerDataBits) + require(xLen <= outerDataBits) // would need offset addr for puts if data width < xlen + require(!usingVM || untagBits <= pgIdxBits) +} + +abstract class L1HellaCacheModule(implicit val p: Parameters) extends Module + with HasL1HellaCacheParameters + +abstract class L1HellaCacheBundle(implicit val p: Parameters) extends ParameterizedBundle()(p) + with HasL1HellaCacheParameters + +class L1Metadata(implicit p: Parameters) extends Metadata()(p) with HasL1HellaCacheParameters { + val coh = new ClientMetadata +} +object L1Metadata { + def apply(tag: Bits, coh: ClientMetadata)(implicit p: Parameters) = { + val meta = Wire(new L1Metadata) + meta.tag := tag + meta.coh := coh + meta + } +} + +class L1MetaReadReq(implicit p: Parameters) extends MetaReadReq { + val tag = Bits(width = tagBits) + override def cloneType = new L1MetaReadReq()(p).asInstanceOf[this.type] //TODO remove +} + +class L1MetaWriteReq(implicit p: Parameters) extends + MetaWriteReq[L1Metadata](new L1Metadata) + +trait HasCoreMemOp extends HasCoreParameters { + val addr = UInt(width = coreMaxAddrBits) + val tag = Bits(width = dcacheReqTagBits) + val cmd = Bits(width = M_SZ) + val typ = Bits(width = MT_SZ) +} + +trait HasCoreData extends HasCoreParameters { + val data = Bits(width = coreDataBits) +} + +class HellaCacheReq(implicit p: Parameters) extends HellaCacheReqInternal()(p) with HasCoreData + +class HellaCacheResp(implicit p: Parameters) extends CoreBundle()(p) + with HasCoreMemOp + with HasCoreData { + val replay = Bool() + val has_data = Bool() + val data_word_bypass = Bits(width = coreDataBits) + val store_data = Bits(width = coreDataBits) +} + +class AlignmentExceptions extends Bundle { + val ld = Bool() + val st = Bool() +} + +class HellaCacheExceptions 
extends Bundle { + val ma = new AlignmentExceptions + val pf = new AlignmentExceptions +} + + +// interface between D$ and processor/DTLB +class HellaCacheIO(implicit p: Parameters) extends CoreBundle()(p) { + val req = Decoupled(new HellaCacheReq) + val s1_kill = Bool(OUTPUT) // kill previous cycle's req + val s1_data = Bits(OUTPUT, coreDataBits) // data for previous cycle's req + val s2_nack = Bool(INPUT) // req from two cycles ago is rejected + + val resp = Valid(new HellaCacheResp).flip + val replay_next = Bool(INPUT) + val xcpt = (new HellaCacheExceptions).asInput + val invalidate_lr = Bool(OUTPUT) + val ordered = Bool(INPUT) +} + +abstract class HellaCache(val cfg: DCacheConfig)(implicit val p: Parameters) extends LazyModule { + val node = TLClientNode(TLClientParameters( + sourceId = IdRange(0, cfg.nMSHRs + cfg.nMMIOs), + supportsProbe = TransferSizes(p(CacheBlockBytes)))) + val module: HellaCacheModule +} + +class HellaCacheBundle(outer: HellaCache)(implicit p: Parameters) extends Bundle { + val cpu = (new HellaCacheIO).flip + val ptw = new TLBPTWIO() + val mem = outer.node.bundleOut +} + +class HellaCacheModule(outer: HellaCache)(implicit val p: Parameters) extends LazyModuleImp(outer) + with HasL1HellaCacheParameters { + implicit val cfg = outer.cfg + val io = new HellaCacheBundle(outer) + val edge = outer.node.edgesOut(0) + val tl_out = io.mem(0) + + /* TODO + edge.manager.managers.foreach { m => + if (m.supportsGet) { + require (m.supportsGet.contains(TransferSizes(1, tlDataBytes))) + ....etc + } + } + */ + +} + +object HellaCache { + def apply(cfg: DCacheConfig)(implicit p: Parameters) = { + if (cfg.nMSHRs == 0) LazyModule(new DCache(cfg)) + else LazyModule(new NonBlockingDCache(cfg)) + } +} diff --git a/src/main/scala/rocket/ScratchpadSlavePort.scala b/src/main/scala/rocket/ScratchpadSlavePort.scala new file mode 100644 index 00000000..41550633 --- /dev/null +++ b/src/main/scala/rocket/ScratchpadSlavePort.scala @@ -0,0 +1,107 @@ +// See LICENSE for license details. 
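// Illustrative usage of the HellaCache factory defined just above (a sketch, not part of the
// original patch; assumes a Parameters instance `q` with DCacheKey bound):
//
//   val dcache = HellaCache(q(DCacheKey))(q)
//   // cfg.nMSHRs == 0 selects the blocking DCache;
//   // otherwise the NonBlockingDCache is instantiated.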
+ +package rocket + +import Chisel._ +import Chisel.ImplicitConversions._ +import cde.Parameters +import junctions._ +import diplomacy._ +import uncore.constants._ +import uncore.tilelink2._ +import uncore.util._ + +class ScratchpadSlavePort(implicit val p: Parameters) extends LazyModule with HasCoreParameters { + val node = TLManagerNode(TLManagerPortParameters( + Seq(TLManagerParameters( + address = List(AddressSet(0x80000000L, BigInt(p(DataScratchpadSize)-1))), + regionType = RegionType.UNCACHED, + executable = true, + supportsArithmetic = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none, + supportsLogical = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none, + supportsPutPartial = TransferSizes(1, coreDataBytes), + supportsPutFull = TransferSizes(1, coreDataBytes), + supportsGet = TransferSizes(1, coreDataBytes), + fifoId = Some(0))), // requests handled in FIFO order + beatBytes = coreDataBytes, + minLatency = 1)) + + // Make sure this ends up with the same name as before + override def name = "dmem0" + + lazy val module = new LazyModuleImp(this) { + val io = new Bundle { + val tl_in = node.bundleIn + val dmem = new HellaCacheIO + } + + val tl_in = io.tl_in(0) + val edge = node.edgesIn(0) + + require(usingDataScratchpad) + + val s_ready :: s_wait :: s_replay :: s_grant :: Nil = Enum(UInt(), 4) + val state = Reg(init = s_ready) + when (io.dmem.resp.valid) { state := s_grant } + when (tl_in.d.fire()) { state := s_ready } + when (io.dmem.s2_nack) { state := s_replay } + when (io.dmem.req.fire()) { state := s_wait } + + val acq = Reg(tl_in.a.bits) + when (io.dmem.resp.valid) { acq.data := io.dmem.resp.bits.data } + when (tl_in.a.fire()) { acq := tl_in.a.bits } + + val isWrite = acq.opcode === TLMessages.PutFullData || acq.opcode === TLMessages.PutPartialData + val isRead = !edge.hasData(acq) + + def formCacheReq(acq: TLBundleA) = { + val req = Wire(new HellaCacheReq) + req.cmd := MuxLookup(acq.opcode, Wire(M_XRD), Array( + TLMessages.PutFullData -> M_XWR, + TLMessages.PutPartialData -> M_XWR, + TLMessages.ArithmeticData -> MuxLookup(acq.param, Wire(M_XRD), Array( + TLAtomics.MIN -> M_XA_MIN, + TLAtomics.MAX -> M_XA_MAX, + TLAtomics.MINU -> M_XA_MINU, + TLAtomics.MAXU -> M_XA_MAXU, + TLAtomics.ADD -> M_XA_ADD)), + TLMessages.LogicalData -> MuxLookup(acq.param, Wire(M_XRD), Array( + TLAtomics.XOR -> M_XA_XOR, + TLAtomics.OR -> M_XA_OR, + TLAtomics.AND -> M_XA_AND, + TLAtomics.SWAP -> M_XA_SWAP)), + TLMessages.Get -> M_XRD)) + // treat all loads as full words, so bytes appear in correct lane + req.typ := Mux(isRead, log2Ceil(coreDataBytes), acq.size) + req.addr := Mux(isRead, ~(~acq.address | (coreDataBytes-1)), acq.address) + req.tag := UInt(0) + req + } + + val ready = state === s_ready || tl_in.d.fire() + io.dmem.req.valid := (tl_in.a.valid && ready) || state === s_replay + tl_in.a.ready := io.dmem.req.ready && ready + io.dmem.req.bits := formCacheReq(Mux(state === s_replay, acq, tl_in.a.bits)) + // the TL data is already in the correct byte lane, but the D$ + // expects right-justified store data, so that it can steer the bytes. 
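// Worked example of the justification described above (illustrative only, not part of the
// original patch; assumes coreDataBytes = 8): a 2-byte PutFullData to byte offset 4 within the
// beat arrives with its payload in TL byte lanes 4-5, i.e. acq.data(47,32). The D$ wants that
// value right-justified in bits (15,0) and re-steers it into the correct lanes itself using the
// store mask, so the LoadGen below effectively computes acq.data >> (8 * byteOffsetInBeat).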
+ io.dmem.s1_data := new LoadGen(acq.size, Bool(false), acq.address(log2Ceil(coreDataBytes)-1,0), acq.data, Bool(false), coreDataBytes).data + io.dmem.s1_kill := false + io.dmem.invalidate_lr := false + + // place AMO data in correct word lane + val minAMOBytes = 4 + val grantData = Mux(io.dmem.resp.valid, io.dmem.resp.bits.data, acq.data) + val alignedGrantData = Mux(acq.size <= log2Ceil(minAMOBytes), Fill(coreDataBytes/minAMOBytes, grantData(8*minAMOBytes-1, 0)), grantData) + + tl_in.d.valid := io.dmem.resp.valid || state === s_grant + tl_in.d.bits := Mux(isWrite, + edge.AccessAck(acq, UInt(0)), + edge.AccessAck(acq, UInt(0), UInt(0))) + tl_in.d.bits.data := alignedGrantData + + // Tie off unused channels + tl_in.b.valid := Bool(false) + tl_in.c.ready := Bool(true) + tl_in.e.ready := Bool(true) + } +} diff --git a/src/main/scala/rocket/SimpleHellaCacheIF.scala b/src/main/scala/rocket/SimpleHellaCacheIF.scala new file mode 100644 index 00000000..73f3518f --- /dev/null +++ b/src/main/scala/rocket/SimpleHellaCacheIF.scala @@ -0,0 +1,136 @@ +// See LICENSE for license details. + +package rocket + +import Chisel._ +import Chisel.ImplicitConversions._ +import cde.Parameters +import util._ + +/** + * This module buffers requests made by the SimpleHellaCacheIF in case they + * are nacked. Nacked requests must be replayed in order, and no other requests + * must be allowed to go through until the replayed requests are successfully + * completed. + */ +class SimpleHellaCacheIFReplayQueue(depth: Int) + (implicit val p: Parameters) extends Module + with HasL1HellaCacheParameters { + val io = new Bundle { + val req = Decoupled(new HellaCacheReq).flip + val nack = Valid(Bits(width = coreDCacheReqTagBits)).flip + val resp = Valid(new HellaCacheResp).flip + val replay = Decoupled(new HellaCacheReq) + } + + // Registers to store the sent request + // When a request is sent the first time, + // it is stored in one of the reqs registers + // and the corresponding inflight bit is set. + // The reqs register will be deallocated once the request is + // successfully completed. + val inflight = Reg(init = UInt(0, depth)) + val reqs = Reg(Vec(depth, new HellaCacheReq)) + + // The nack queue stores the index of nacked requests (in the reqs vector) + // in the order that they were nacked. A request is enqueued onto nackq + // when it is newly nacked (i.e. not a nack for a previous replay). + // The head of the nack queue will be replayed until it is + // successfully completed, at which time the request is dequeued. + // No new requests will be made or other replays attempted until the head + // of the nackq is successfully completed. + val nackq = Module(new Queue(UInt(width = log2Up(depth)), depth)) + val replaying = Reg(init = Bool(false)) + + val next_inflight_onehot = PriorityEncoderOH(~inflight) + val next_inflight = OHToUInt(next_inflight_onehot) + + val next_replay = nackq.io.deq.bits + val next_replay_onehot = UIntToOH(next_replay) + val next_replay_req = reqs(next_replay) + + // Keep sending the head of the nack queue until it succeeds + io.replay.valid := nackq.io.deq.valid && !replaying + io.replay.bits := next_replay_req + // Don't allow new requests if there is are replays waiting + // or something being nacked. 
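// Illustrative timeline of the nack/replay protocol described above (a sketch, not part of the
// original patch; tag values are made up):
//   cycle N  : a request with tag 3 is nacked       -> its reqs() index is pushed onto nackq
//   cycle N+1: io.replay re-issues reqs(head), replaying := true, new requests are blocked
//   ...      : if that replay is nacked again       -> replaying := false, the head stays put
//   finally  : io.resp arrives with tag 3           -> replay_complete, nackq dequeues and the
//              ready expression below re-opens the request port.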
+ io.req.ready := !inflight.andR && !nackq.io.deq.valid && !io.nack.valid + + // Match on the tags to determine the index of nacks or responses + val nack_onehot = Cat(reqs.map(_.tag === io.nack.bits).reverse) & inflight + val resp_onehot = Cat(reqs.map(_.tag === io.resp.bits.tag).reverse) & inflight + + val replay_complete = io.resp.valid && replaying && io.resp.bits.tag === next_replay_req.tag + val nack_head = io.nack.valid && nackq.io.deq.valid && io.nack.bits === next_replay_req.tag + + // Enqueue to the nack queue if there is a nack that is not in response to + // the previous replay + nackq.io.enq.valid := io.nack.valid && !nack_head + nackq.io.enq.bits := OHToUInt(nack_onehot) + assert(!nackq.io.enq.valid || nackq.io.enq.ready, + "SimpleHellaCacheIF: ReplayQueue nack queue overflow") + + // Dequeue from the nack queue if the last replay was successfully completed + nackq.io.deq.ready := replay_complete + assert(!nackq.io.deq.ready || nackq.io.deq.valid, + "SimpleHellaCacheIF: ReplayQueue nack queue underflow") + + // Set inflight bit when a request is made + // Clear it when it is successfully completed + inflight := (inflight | Mux(io.req.fire(), next_inflight_onehot, UInt(0))) & + ~Mux(io.resp.valid, resp_onehot, UInt(0)) + + when (io.req.fire()) { + reqs(next_inflight) := io.req.bits + } + + // Only one replay outstanding at a time + when (io.replay.fire()) { replaying := Bool(true) } + when (nack_head || replay_complete) { replaying := Bool(false) } +} + +// exposes a sane decoupled request interface +class SimpleHellaCacheIF(implicit p: Parameters) extends Module +{ + val io = new Bundle { + val requestor = new HellaCacheIO().flip + val cache = new HellaCacheIO + } + + val replayq = Module(new SimpleHellaCacheIFReplayQueue(2)) + val req_arb = Module(new Arbiter(new HellaCacheReq, 2)) + + val req_helper = DecoupledHelper( + req_arb.io.in(1).ready, + replayq.io.req.ready, + io.requestor.req.valid) + + req_arb.io.in(0) <> replayq.io.replay + req_arb.io.in(1).valid := req_helper.fire(req_arb.io.in(1).ready) + req_arb.io.in(1).bits := io.requestor.req.bits + io.requestor.req.ready := req_helper.fire(io.requestor.req.valid) + replayq.io.req.valid := req_helper.fire(replayq.io.req.ready) + replayq.io.req.bits := io.requestor.req.bits + + val s0_req_fire = io.cache.req.fire() + val s1_req_fire = Reg(next = s0_req_fire) + val s2_req_fire = Reg(next = s1_req_fire) + val s1_req_tag = Reg(next = io.cache.req.bits.tag) + val s2_req_tag = Reg(next = s1_req_tag) + val s2_kill = Reg(next = io.cache.s1_kill) + + io.cache.invalidate_lr := io.requestor.invalidate_lr + io.cache.req <> req_arb.io.out + io.cache.s1_kill := io.cache.s2_nack + io.cache.s1_data := RegEnable(req_arb.io.out.bits.data, s0_req_fire) + + replayq.io.nack.valid := (io.cache.s2_nack || s2_kill) && s2_req_fire + replayq.io.nack.bits := s2_req_tag + replayq.io.resp := io.cache.resp + io.requestor.resp := io.cache.resp + + assert(!Reg(next = io.cache.req.fire()) || + !(io.cache.xcpt.ma.ld || io.cache.xcpt.ma.st || + io.cache.xcpt.pf.ld || io.cache.xcpt.pf.st), + "SimpleHellaCacheIF exception") +} diff --git a/src/main/scala/rocket/dcache.scala b/src/main/scala/rocket/dcache.scala index 14991eda..cd11c7f0 100644 --- a/src/main/scala/rocket/dcache.scala +++ b/src/main/scala/rocket/dcache.scala @@ -3,15 +3,15 @@ package rocket import Chisel._ -import junctions._ +import Chisel.ImplicitConversions._ +import cde.Parameters import diplomacy._ -import uncore.tilelink2._ -import uncore.constants._ import uncore.agents._ +import 
uncore.constants._ +import uncore.tilelink2._ import uncore.util._ import util._ import TLMessages._ -import Chisel.ImplicitConversions._ import config._ class DCacheDataReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) { @@ -40,590 +40,465 @@ class DCacheDataArray(implicit p: Parameters) extends L1HellaCacheModule()(p) { } } -class DCache(maxUncachedInFlight: Int = 2)(implicit val p: Parameters) extends LazyModule with HasL1HellaCacheParameters { +class DCache(cfg: DCacheConfig)(implicit p: Parameters) extends HellaCache(cfg)(p) { + override lazy val module = new DCacheModule(this) +} - val node = TLClientNode(TLClientParameters( - sourceId = IdRange(0, maxUncachedInFlight), - supportsProbe = TransferSizes(cacheBlockBytes))) +class DCacheModule(outer: DCache)(implicit p: Parameters) extends HellaCacheModule(outer)(p) { - lazy val module = new LazyModuleImp(this) { - val io = new Bundle { - val cpu = (new HellaCacheIO).flip - val ptw = new TLBPTWIO() - val mem = node.bundleOut - } + val maxUncachedInFlight = cfg.nMMIOs - val edge = node.edgesOut(0) - val tl_out = io.mem(0) + require(rowBits == encRowBits) // no ECC - /* TODO - edge.manager.managers.foreach { m => - // If a slave supports read at all, it must support all TL Legacy requires - if (m.supportsGet) { - require (m.supportsGet.contains(TransferSizes(1, tlDataBytes))) - require (m.supportsGet.contains(TransferSizes(tlDataBeats * tlDataBytes))) - } - // Likewise, any put support must mean full put support - if (m.supportsPutPartial) { - require (m.supportsPutPartial.contains(TransferSizes(1, tlDataBytes))) - require (m.supportsPutPartial.contains(TransferSizes(tlDataBeats * tlDataBytes))) - } - // Any atomic support => must support 32-bit size - if (m.supportsArithmetic) { require (m.supportsArithmetic.contains(TransferSizes(4))) } - if (m.supportsLogical) { require (m.supportsLogical .contains(TransferSizes(4))) } - // We straight-up require Acquire support, this is a cache afterall? 
- require (edge.manager.anySupportsAcquire) - } - */ - require(rowBits == encRowBits) // no ECC - require(refillCyclesPerBeat == 1) - require(rowBits >= coreDataBits) + val grantackq = Module(new Queue(tl_out.e.bits,1)) // TODO don't need this in scratchpad mode - val grantackq = Module(new Queue(tl_out.e.bits,1)) + // tags + val replacer = p(Replacer)() + def onReset = L1Metadata(UInt(0), ClientMetadata.onReset) + val metaReadArb = Module(new Arbiter(new MetaReadReq, 3)) + val metaWriteArb = Module(new Arbiter(new L1MetaWriteReq, 3)) - // tags - val replacer = p(Replacer)() - def onReset = L1Metadata(UInt(0), ClientMetadata.onReset) - val metaReadArb = Module(new Arbiter(new MetaReadReq, 3)) - val metaWriteArb = Module(new Arbiter(new L1MetaWriteReq, 3)) + // data + val data = Module(new DCacheDataArray) + val dataArb = Module(new Arbiter(new DCacheDataReq, 4)) + data.io.req <> dataArb.io.out + dataArb.io.out.ready := true - // data - val data = Module(new DCacheDataArray) - val dataArb = Module(new Arbiter(new DCacheDataReq, 4)) - data.io.req <> dataArb.io.out - dataArb.io.out.ready := true + val s1_valid = Reg(next=io.cpu.req.fire(), init=Bool(false)) + val s1_probe = Reg(next=tl_out.b.fire(), init=Bool(false)) + val probe_bits = RegEnable(tl_out.b.bits, tl_out.b.fire()) // TODO has data now :( + val s1_nack = Wire(init=Bool(false)) + val s1_valid_masked = s1_valid && !io.cpu.s1_kill && !io.cpu.xcpt.asUInt.orR + val s1_valid_not_nacked = s1_valid_masked && !s1_nack + val s1_req = Reg(io.cpu.req.bits) + when (metaReadArb.io.out.valid) { + s1_req := io.cpu.req.bits + s1_req.addr := Cat(io.cpu.req.bits.addr >> untagBits, metaReadArb.io.out.bits.idx, io.cpu.req.bits.addr(blockOffBits-1,0)) + } + val s1_read = isRead(s1_req.cmd) + val s1_write = isWrite(s1_req.cmd) + val s1_readwrite = s1_read || s1_write + val s1_flush_valid = Reg(Bool()) - val s1_valid = Reg(next=io.cpu.req.fire(), init=Bool(false)) - val s1_probe = Reg(next=tl_out.b.fire(), init=Bool(false)) - val probe_bits = RegEnable(tl_out.b.bits, tl_out.b.fire()) // TODO has data now :( - val s1_nack = Wire(init=Bool(false)) - val s1_valid_masked = s1_valid && !io.cpu.s1_kill && !io.cpu.xcpt.asUInt.orR - val s1_valid_not_nacked = s1_valid_masked && !s1_nack - val s1_req = Reg(io.cpu.req.bits) - when (metaReadArb.io.out.valid) { - s1_req := io.cpu.req.bits - s1_req.addr := Cat(io.cpu.req.bits.addr >> untagBits, metaReadArb.io.out.bits.idx, io.cpu.req.bits.addr(blockOffBits-1,0)) - } - val s1_read = isRead(s1_req.cmd) - val s1_write = isWrite(s1_req.cmd) - val s1_readwrite = s1_read || s1_write - val s1_flush_valid = Reg(Bool()) + val s_ready :: s_voluntary_writeback :: s_probe_rep_dirty :: s_probe_rep_clean :: s_probe_rep_miss :: s_voluntary_write_meta :: s_probe_write_meta :: Nil = Enum(UInt(), 7) + val cached_grant_wait = Reg(init=Bool(false)) + val release_ack_wait = Reg(init=Bool(false)) + val release_state = Reg(init=s_ready) + val pstore1_valid = Wire(Bool()) + val pstore2_valid = Reg(Bool()) + val inWriteback = release_state.isOneOf(s_voluntary_writeback, s_probe_rep_dirty) + val releaseWay = Wire(UInt()) + io.cpu.req.ready := (release_state === s_ready) && !cached_grant_wait && !s1_nack - val s_ready :: s_voluntary_writeback :: s_probe_rep_dirty :: s_probe_rep_clean :: s_probe_rep_miss :: s_voluntary_write_meta :: s_probe_write_meta :: Nil = Enum(UInt(), 7) - val cached_grant_wait = Reg(init=Bool(false)) - val release_ack_wait = Reg(init=Bool(false)) - val release_state = Reg(init=s_ready) - val pstore1_valid = Wire(Bool()) - 
val pstore2_valid = Reg(Bool()) - val inWriteback = release_state.isOneOf(s_voluntary_writeback, s_probe_rep_dirty) - val releaseWay = Wire(UInt()) - io.cpu.req.ready := (release_state === s_ready) && !cached_grant_wait && !s1_nack + // I/O MSHRs + val uncachedInFlight = Reg(init=Vec.fill(maxUncachedInFlight)(Bool(false))) + val uncachedReqs = Reg(Vec(maxUncachedInFlight, new HellaCacheReq)) - // I/O MSHRs - val uncachedInFlight = Reg(init=Vec.fill(maxUncachedInFlight)(Bool(false))) - val uncachedReqs = Reg(Vec(maxUncachedInFlight, new HellaCacheReq)) + // hit initiation path + dataArb.io.in(3).valid := io.cpu.req.valid && isRead(io.cpu.req.bits.cmd) + dataArb.io.in(3).bits.write := false + dataArb.io.in(3).bits.addr := io.cpu.req.bits.addr + dataArb.io.in(3).bits.way_en := ~UInt(0, nWays) + when (!dataArb.io.in(3).ready && isRead(io.cpu.req.bits.cmd)) { io.cpu.req.ready := false } + metaReadArb.io.in(2).valid := io.cpu.req.valid + metaReadArb.io.in(2).bits.idx := io.cpu.req.bits.addr(idxMSB, idxLSB) + metaReadArb.io.in(2).bits.way_en := ~UInt(0, nWays) + when (!metaReadArb.io.in(2).ready) { io.cpu.req.ready := false } - // hit initiation path - dataArb.io.in(3).valid := io.cpu.req.valid && isRead(io.cpu.req.bits.cmd) - dataArb.io.in(3).bits.write := false - dataArb.io.in(3).bits.addr := io.cpu.req.bits.addr - dataArb.io.in(3).bits.way_en := ~UInt(0, nWays) - when (!dataArb.io.in(3).ready && isRead(io.cpu.req.bits.cmd)) { io.cpu.req.ready := false } - metaReadArb.io.in(2).valid := io.cpu.req.valid - metaReadArb.io.in(2).bits.idx := io.cpu.req.bits.addr(idxMSB, idxLSB) - metaReadArb.io.in(2).bits.way_en := ~UInt(0, nWays) - when (!metaReadArb.io.in(2).ready) { io.cpu.req.ready := false } + // address translation + val tlb = Module(new TLB) + io.ptw <> tlb.io.ptw + tlb.io.req.valid := s1_valid_masked && s1_readwrite + tlb.io.req.bits.passthrough := s1_req.phys + tlb.io.req.bits.vpn := s1_req.addr >> pgIdxBits + tlb.io.req.bits.instruction := false + tlb.io.req.bits.store := s1_write + when (!tlb.io.req.ready && !io.cpu.req.bits.phys) { io.cpu.req.ready := false } + when (s1_valid && s1_readwrite && tlb.io.resp.miss) { s1_nack := true } - // address translation - val tlb = Module(new TLB) - io.ptw <> tlb.io.ptw - tlb.io.req.valid := s1_valid_masked && s1_readwrite - tlb.io.req.bits.passthrough := s1_req.phys - tlb.io.req.bits.vpn := s1_req.addr >> pgIdxBits - tlb.io.req.bits.instruction := false - tlb.io.req.bits.store := s1_write - when (!tlb.io.req.ready && !io.cpu.req.bits.phys) { io.cpu.req.ready := false } - when (s1_valid && s1_readwrite && tlb.io.resp.miss) { s1_nack := true } - - val s1_paddr = Cat(tlb.io.resp.ppn, s1_req.addr(pgIdxBits-1,0)) - val s1_tag = Mux(s1_probe, probe_bits.address, s1_paddr)(paddrBits-1, untagBits) - val s1_victim_way = Wire(init = replacer.way) - val (s1_hit_way, s1_hit_state, s1_victim_meta) = - if (usingDataScratchpad) { - require(nWays == 1) - metaWriteArb.io.out.ready := true - metaReadArb.io.out.ready := !metaWriteArb.io.out.valid - val inScratchpad = addrMap(s"TL2:dmem${p(TileId)}").containsAddress(s1_paddr) - val hitState = Mux(inScratchpad, ClientMetadata.maximum, ClientMetadata.onReset) - (inScratchpad, hitState, L1Metadata(UInt(0), ClientMetadata.onReset)) - } else { - val meta = Module(new MetadataArray(onReset _)) - meta.io.read <> metaReadArb.io.out - meta.io.write <> metaWriteArb.io.out - val s1_meta = meta.io.resp - val s1_meta_hit_way = s1_meta.map(r => r.coh.isValid() && r.tag === s1_tag).asUInt - val s1_meta_hit_state = 
ClientMetadata.onReset.fromBits( - s1_meta.map(r => Mux(r.tag === s1_tag, r.coh.asUInt, UInt(0))) - .reduce (_|_)) - (s1_meta_hit_way, s1_meta_hit_state, s1_meta(s1_victim_way)) - } - val s1_data_way = Mux(inWriteback, releaseWay, s1_hit_way) - val s1_data = Mux1H(s1_data_way, data.io.resp) // retime into s2 if critical - - val s2_valid = Reg(next=s1_valid_masked, init=Bool(false)) - val s2_probe = Reg(next=s1_probe, init=Bool(false)) - val releaseInFlight = s1_probe || s2_probe || release_state =/= s_ready - val s2_valid_masked = s2_valid && Reg(next = !s1_nack) - val s2_req = Reg(io.cpu.req.bits) - val s2_req_block_addr = (s2_req.addr >> idxLSB) << idxLSB - val s2_uncached = Reg(Bool()) - when (s1_valid_not_nacked || s1_flush_valid) { - s2_req := s1_req - s2_req.addr := s1_paddr - s2_uncached := !tlb.io.resp.cacheable || Bool(usingDataScratchpad) - } - val s2_read = isRead(s2_req.cmd) - val s2_write = isWrite(s2_req.cmd) - val s2_readwrite = s2_read || s2_write - val s2_flush_valid = RegNext(s1_flush_valid) - val s2_data = RegEnable(s1_data, s1_valid || inWriteback) - val s2_probe_way = RegEnable(s1_hit_way, s1_probe) - val s2_probe_state = RegEnable(s1_hit_state, s1_probe) - val s2_hit_way = RegEnable(s1_hit_way, s1_valid_not_nacked) - val s2_hit_state = RegEnable(s1_hit_state, s1_valid_not_nacked) - val s2_hit_valid = s2_hit_state.isValid() - val (s2_hit, s2_grow_param, s2_new_hit_state) = s2_hit_state.onAccess(s2_req.cmd) - val s2_valid_hit = s2_valid_masked && s2_readwrite && s2_hit - val s2_valid_miss = s2_valid_masked && s2_readwrite && !s2_hit && !(pstore1_valid || pstore2_valid) && !release_ack_wait - val s2_valid_cached_miss = s2_valid_miss && !s2_uncached - val s2_victimize = s2_valid_cached_miss || s2_flush_valid - val s2_valid_uncached = s2_valid_miss && s2_uncached - val s2_victim_way = Mux(s2_hit_valid && !s2_flush_valid, s2_hit_way, UIntToOH(RegEnable(s1_victim_way, s1_valid_not_nacked || s1_flush_valid))) - val s2_victim_tag = RegEnable(s1_victim_meta.tag, s1_valid_not_nacked || s1_flush_valid) - val s2_victim_state = Mux(s2_hit_valid && !s2_flush_valid, s2_hit_state, RegEnable(s1_victim_meta.coh, s1_valid_not_nacked || s1_flush_valid)) - val s2_victim_valid = s2_victim_state.isValid() - val (s2_prb_ack_data, s2_report_param, probeNewCoh)= s2_probe_state.onProbe(probe_bits.param) - val (s2_victim_dirty, s2_shrink_param, voluntaryNewCoh) = s2_victim_state.onCacheControl(M_FLUSH) - val s2_update_meta = s2_hit_state =/= s2_new_hit_state - io.cpu.s2_nack := s2_valid && !s2_valid_hit && !(s2_valid_uncached && tl_out.a.ready && !uncachedInFlight.asUInt.andR) - when (s2_valid && (!s2_valid_hit || s2_update_meta)) { s1_nack := true } - - // exceptions - val s1_storegen = new StoreGen(s1_req.typ, s1_req.addr, UInt(0), wordBytes) - io.cpu.xcpt.ma.ld := s1_read && s1_storegen.misaligned - io.cpu.xcpt.ma.st := s1_write && s1_storegen.misaligned - io.cpu.xcpt.pf.ld := s1_read && tlb.io.resp.xcpt_ld - io.cpu.xcpt.pf.st := s1_write && tlb.io.resp.xcpt_st - - // load reservations - val s2_lr = Bool(usingAtomics) && s2_req.cmd === M_XLR - val s2_sc = Bool(usingAtomics) && s2_req.cmd === M_XSC - val lrscCount = Reg(init=UInt(0)) - val lrscValid = lrscCount > 0 - val lrscAddr = Reg(UInt()) - val s2_sc_fail = s2_sc && !(lrscValid && lrscAddr === (s2_req.addr >> blockOffBits)) - when (s2_valid_hit && s2_lr) { - lrscCount := lrscCycles - 1 - lrscAddr := s2_req.addr >> blockOffBits - } - when (lrscValid) { lrscCount := lrscCount - 1 } - when ((s2_valid_masked && lrscValid) || 
io.cpu.invalidate_lr) { lrscCount := 0 } - - // pending store buffer - val pstore1_cmd = RegEnable(s1_req.cmd, s1_valid_not_nacked && s1_write) - val pstore1_typ = RegEnable(s1_req.typ, s1_valid_not_nacked && s1_write) - val pstore1_addr = RegEnable(s1_paddr, s1_valid_not_nacked && s1_write) - val pstore1_data = RegEnable(io.cpu.s1_data, s1_valid_not_nacked && s1_write) - val pstore1_way = RegEnable(s1_hit_way, s1_valid_not_nacked && s1_write) - val pstore1_storegen = new StoreGen(pstore1_typ, pstore1_addr, pstore1_data, wordBytes) - val pstore1_storegen_data = Wire(init = pstore1_storegen.data) - val pstore1_amo = Bool(usingAtomics) && isRead(pstore1_cmd) - val pstore_drain_structural = pstore1_valid && pstore2_valid && ((s1_valid && s1_write) || pstore1_amo) - val pstore_drain_opportunistic = !(io.cpu.req.valid && isRead(io.cpu.req.bits.cmd)) - val pstore_drain_on_miss = releaseInFlight || io.cpu.s2_nack - val pstore_drain = - Bool(usingAtomics) && pstore_drain_structural || - (((pstore1_valid && !pstore1_amo) || pstore2_valid) && (pstore_drain_opportunistic || pstore_drain_on_miss)) - pstore1_valid := { - val s2_store_valid = s2_valid_hit && s2_write && !s2_sc_fail - val pstore1_held = Reg(Bool()) - assert(!s2_store_valid || !pstore1_held) - pstore1_held := (s2_store_valid || pstore1_held) && pstore2_valid && !pstore_drain - s2_store_valid || pstore1_held - } - val advance_pstore1 = pstore1_valid && (pstore2_valid === pstore_drain) - pstore2_valid := pstore2_valid && !pstore_drain || advance_pstore1 - val pstore2_addr = RegEnable(pstore1_addr, advance_pstore1) - val pstore2_way = RegEnable(pstore1_way, advance_pstore1) - val pstore2_storegen_data = RegEnable(pstore1_storegen_data, advance_pstore1) - val pstore2_storegen_mask = RegEnable(pstore1_storegen.mask, advance_pstore1) - dataArb.io.in(0).valid := pstore_drain - dataArb.io.in(0).bits.write := true - dataArb.io.in(0).bits.addr := Mux(pstore2_valid, pstore2_addr, pstore1_addr) - dataArb.io.in(0).bits.way_en := Mux(pstore2_valid, pstore2_way, pstore1_way) - dataArb.io.in(0).bits.wdata := Fill(rowWords, Mux(pstore2_valid, pstore2_storegen_data, pstore1_storegen_data)) - val pstore_mask_shift = Mux(pstore2_valid, pstore2_addr, pstore1_addr).extract(rowOffBits-1,offsetlsb) << wordOffBits - dataArb.io.in(0).bits.wmask := Mux(pstore2_valid, pstore2_storegen_mask, pstore1_storegen.mask) << pstore_mask_shift - - // store->load RAW hazard detection - val s1_idx = s1_req.addr(idxMSB, wordOffBits) - val s1_raw_hazard = s1_read && - ((pstore1_valid && pstore1_addr(idxMSB, wordOffBits) === s1_idx && (pstore1_storegen.mask & s1_storegen.mask).orR) || - (pstore2_valid && pstore2_addr(idxMSB, wordOffBits) === s1_idx && (pstore2_storegen_mask & s1_storegen.mask).orR)) - when (s1_valid && s1_raw_hazard) { s1_nack := true } - - metaWriteArb.io.in(0).valid := (s2_valid_hit && s2_update_meta) || (s2_victimize && !s2_victim_dirty) - metaWriteArb.io.in(0).bits.way_en := s2_victim_way - metaWriteArb.io.in(0).bits.idx := s2_req.addr(idxMSB, idxLSB) - metaWriteArb.io.in(0).bits.data.coh := Mux(s2_valid_hit, s2_new_hit_state, ClientMetadata.onReset) - metaWriteArb.io.in(0).bits.data.tag := s2_req.addr(paddrBits-1, untagBits) - - // Prepare a TileLink request message that initiates a transaction - val a_source = PriorityEncoder(~uncachedInFlight.asUInt) - val acquire_address = s2_req_block_addr - val access_address = s2_req.addr - val a_size = s2_req.typ - val a_data = Fill(beatWords, pstore1_storegen.data) - val acquire = edge.Acquire(a_source, 
acquire_address, lgCacheBlockBytes, s2_grow_param)._2 // Cacheability checked by tlb - val get = edge.Get(a_source, access_address, a_size)._2 - val put = edge.Put(a_source, access_address, a_size, a_data)._2 - val atomics = if (edge.manager.anySupportLogical) { - MuxLookup(s2_req.cmd, Wire(new TLBundleA(edge.bundle)), Array( - M_XA_SWAP -> edge.Logical(a_source, access_address, a_size, a_data, TLAtomics.SWAP)._2, - M_XA_XOR -> edge.Logical(a_source, access_address, a_size, a_data, TLAtomics.XOR) ._2, - M_XA_OR -> edge.Logical(a_source, access_address, a_size, a_data, TLAtomics.OR) ._2, - M_XA_AND -> edge.Logical(a_source, access_address, a_size, a_data, TLAtomics.AND) ._2, - M_XA_ADD -> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.ADD)._2, - M_XA_MIN -> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.MIN)._2, - M_XA_MAX -> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.MAX)._2, - M_XA_MINU -> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.MINU)._2, - M_XA_MAXU -> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.MAXU)._2)) + val s1_paddr = Cat(tlb.io.resp.ppn, s1_req.addr(pgIdxBits-1,0)) + val s1_tag = Mux(s1_probe, probe_bits.address, s1_paddr)(paddrBits-1, untagBits) + val s1_victim_way = Wire(init = replacer.way) + val (s1_hit_way, s1_hit_state, s1_victim_meta) = + if (usingDataScratchpad) { + require(nWays == 1) + metaWriteArb.io.out.ready := true + metaReadArb.io.out.ready := !metaWriteArb.io.out.valid + val inScratchpad = addrMap(s"TL2:dmem${p(TileId)}").containsAddress(s1_paddr) + val hitState = Mux(inScratchpad, ClientMetadata.maximum, ClientMetadata.onReset) + (inScratchpad, hitState, L1Metadata(UInt(0), ClientMetadata.onReset)) } else { - // If no managers support atomics, assert fail if processor asks for them - assert (!(tl_out.a.valid && pstore1_amo && s2_write && s2_uncached)) - Wire(new TLBundleA(edge.bundle)) + val meta = Module(new MetadataArray(onReset _)) + meta.io.read <> metaReadArb.io.out + meta.io.write <> metaWriteArb.io.out + val s1_meta = meta.io.resp + val s1_meta_hit_way = s1_meta.map(r => r.coh.isValid() && r.tag === s1_tag).asUInt + val s1_meta_hit_state = ClientMetadata.onReset.fromBits( + s1_meta.map(r => Mux(r.tag === s1_tag, r.coh.asUInt, UInt(0))) + .reduce (_|_)) + (s1_meta_hit_way, s1_meta_hit_state, s1_meta(s1_victim_way)) } + val s1_data_way = Mux(inWriteback, releaseWay, s1_hit_way) + val s1_data = Mux1H(s1_data_way, data.io.resp) // retime into s2 if critical - tl_out.a.valid := grantackq.io.enq.ready && ((s2_valid_cached_miss && !s2_victim_dirty) || - (s2_valid_uncached && !uncachedInFlight.asUInt.andR)) - tl_out.a.bits := Mux(!s2_uncached, acquire, Mux(!s2_write, get, Mux(!pstore1_amo, put, atomics))) + val s2_valid = Reg(next=s1_valid_masked, init=Bool(false)) + val s2_probe = Reg(next=s1_probe, init=Bool(false)) + val releaseInFlight = s1_probe || s2_probe || release_state =/= s_ready + val s2_valid_masked = s2_valid && Reg(next = !s1_nack) + val s2_req = Reg(io.cpu.req.bits) + val s2_req_block_addr = (s2_req.addr >> idxLSB) << idxLSB + val s2_uncached = Reg(Bool()) + when (s1_valid_not_nacked || s1_flush_valid) { + s2_req := s1_req + s2_req.addr := s1_paddr + s2_uncached := !tlb.io.resp.cacheable || Bool(usingDataScratchpad) + } + val s2_read = isRead(s2_req.cmd) + val s2_write = isWrite(s2_req.cmd) + val s2_readwrite = s2_read || s2_write + val s2_flush_valid = RegNext(s1_flush_valid) + val s2_data = RegEnable(s1_data, s1_valid || 
inWriteback) + val s2_probe_way = RegEnable(s1_hit_way, s1_probe) + val s2_probe_state = RegEnable(s1_hit_state, s1_probe) + val s2_hit_way = RegEnable(s1_hit_way, s1_valid_not_nacked) + val s2_hit_state = RegEnable(s1_hit_state, s1_valid_not_nacked) + val s2_hit_valid = s2_hit_state.isValid() + val (s2_hit, s2_grow_param, s2_new_hit_state) = s2_hit_state.onAccess(s2_req.cmd) + val s2_valid_hit = s2_valid_masked && s2_readwrite && s2_hit + val s2_valid_miss = s2_valid_masked && s2_readwrite && !s2_hit && !(pstore1_valid || pstore2_valid) && !release_ack_wait + val s2_valid_cached_miss = s2_valid_miss && !s2_uncached + val s2_victimize = s2_valid_cached_miss || s2_flush_valid + val s2_valid_uncached = s2_valid_miss && s2_uncached + val s2_victim_way = Mux(s2_hit_valid && !s2_flush_valid, s2_hit_way, UIntToOH(RegEnable(s1_victim_way, s1_valid_not_nacked || s1_flush_valid))) + val s2_victim_tag = RegEnable(s1_victim_meta.tag, s1_valid_not_nacked || s1_flush_valid) + val s2_victim_state = Mux(s2_hit_valid && !s2_flush_valid, s2_hit_state, RegEnable(s1_victim_meta.coh, s1_valid_not_nacked || s1_flush_valid)) + val s2_victim_valid = s2_victim_state.isValid() + val (s2_prb_ack_data, s2_report_param, probeNewCoh)= s2_probe_state.onProbe(probe_bits.param) + val (s2_victim_dirty, s2_shrink_param, voluntaryNewCoh) = s2_victim_state.onCacheControl(M_FLUSH) + val s2_update_meta = s2_hit_state =/= s2_new_hit_state + io.cpu.s2_nack := s2_valid && !s2_valid_hit && !(s2_valid_uncached && tl_out.a.ready && !uncachedInFlight.asUInt.andR) + when (s2_valid && (!s2_valid_hit || s2_update_meta)) { s1_nack := true } - // Set pending bits for outstanding TileLink transaction - when (tl_out.a.fire()) { - when (s2_uncached) { - uncachedInFlight(a_source) := true - uncachedReqs(a_source) := s2_req - }.otherwise { - cached_grant_wait := true - } + // exceptions + val s1_storegen = new StoreGen(s1_req.typ, s1_req.addr, UInt(0), wordBytes) + io.cpu.xcpt.ma.ld := s1_read && s1_storegen.misaligned + io.cpu.xcpt.ma.st := s1_write && s1_storegen.misaligned + io.cpu.xcpt.pf.ld := s1_read && tlb.io.resp.xcpt_ld + io.cpu.xcpt.pf.st := s1_write && tlb.io.resp.xcpt_st + + // load reservations + val s2_lr = Bool(usingAtomics) && s2_req.cmd === M_XLR + val s2_sc = Bool(usingAtomics) && s2_req.cmd === M_XSC + val lrscCount = Reg(init=UInt(0)) + val lrscValid = lrscCount > 0 + val lrscAddr = Reg(UInt()) + val s2_sc_fail = s2_sc && !(lrscValid && lrscAddr === (s2_req.addr >> blockOffBits)) + when (s2_valid_hit && s2_lr) { + lrscCount := lrscCycles - 1 + lrscAddr := s2_req.addr >> blockOffBits + } + when (lrscValid) { lrscCount := lrscCount - 1 } + when ((s2_valid_masked && lrscValid) || io.cpu.invalidate_lr) { lrscCount := 0 } + + // pending store buffer + val pstore1_cmd = RegEnable(s1_req.cmd, s1_valid_not_nacked && s1_write) + val pstore1_typ = RegEnable(s1_req.typ, s1_valid_not_nacked && s1_write) + val pstore1_addr = RegEnable(s1_paddr, s1_valid_not_nacked && s1_write) + val pstore1_data = RegEnable(io.cpu.s1_data, s1_valid_not_nacked && s1_write) + val pstore1_way = RegEnable(s1_hit_way, s1_valid_not_nacked && s1_write) + val pstore1_storegen = new StoreGen(pstore1_typ, pstore1_addr, pstore1_data, wordBytes) + val pstore1_storegen_data = Wire(init = pstore1_storegen.data) + val pstore1_amo = Bool(usingAtomics) && isRead(pstore1_cmd) + val pstore_drain_structural = pstore1_valid && pstore2_valid && ((s1_valid && s1_write) || pstore1_amo) + val pstore_drain_opportunistic = !(io.cpu.req.valid && isRead(io.cpu.req.bits.cmd)) + val 
pstore_drain_on_miss = releaseInFlight || io.cpu.s2_nack + val pstore_drain = + Bool(usingAtomics) && pstore_drain_structural || + (((pstore1_valid && !pstore1_amo) || pstore2_valid) && (pstore_drain_opportunistic || pstore_drain_on_miss)) + pstore1_valid := { + val s2_store_valid = s2_valid_hit && s2_write && !s2_sc_fail + val pstore1_held = Reg(Bool()) + assert(!s2_store_valid || !pstore1_held) + pstore1_held := (s2_store_valid || pstore1_held) && pstore2_valid && !pstore_drain + s2_store_valid || pstore1_held + } + val advance_pstore1 = pstore1_valid && (pstore2_valid === pstore_drain) + pstore2_valid := pstore2_valid && !pstore_drain || advance_pstore1 + val pstore2_addr = RegEnable(pstore1_addr, advance_pstore1) + val pstore2_way = RegEnable(pstore1_way, advance_pstore1) + val pstore2_storegen_data = RegEnable(pstore1_storegen_data, advance_pstore1) + val pstore2_storegen_mask = RegEnable(pstore1_storegen.mask, advance_pstore1) + dataArb.io.in(0).valid := pstore_drain + dataArb.io.in(0).bits.write := true + dataArb.io.in(0).bits.addr := Mux(pstore2_valid, pstore2_addr, pstore1_addr) + dataArb.io.in(0).bits.way_en := Mux(pstore2_valid, pstore2_way, pstore1_way) + dataArb.io.in(0).bits.wdata := Fill(rowWords, Mux(pstore2_valid, pstore2_storegen_data, pstore1_storegen_data)) + val pstore_mask_shift = Mux(pstore2_valid, pstore2_addr, pstore1_addr).extract(rowOffBits-1,offsetlsb) << wordOffBits + dataArb.io.in(0).bits.wmask := Mux(pstore2_valid, pstore2_storegen_mask, pstore1_storegen.mask) << pstore_mask_shift + + // store->load RAW hazard detection + val s1_idx = s1_req.addr(idxMSB, wordOffBits) + val s1_raw_hazard = s1_read && + ((pstore1_valid && pstore1_addr(idxMSB, wordOffBits) === s1_idx && (pstore1_storegen.mask & s1_storegen.mask).orR) || + (pstore2_valid && pstore2_addr(idxMSB, wordOffBits) === s1_idx && (pstore2_storegen_mask & s1_storegen.mask).orR)) + when (s1_valid && s1_raw_hazard) { s1_nack := true } + + metaWriteArb.io.in(0).valid := (s2_valid_hit && s2_update_meta) || (s2_victimize && !s2_victim_dirty) + metaWriteArb.io.in(0).bits.way_en := s2_victim_way + metaWriteArb.io.in(0).bits.idx := s2_req.addr(idxMSB, idxLSB) + metaWriteArb.io.in(0).bits.data.coh := Mux(s2_valid_hit, s2_new_hit_state, ClientMetadata.onReset) + metaWriteArb.io.in(0).bits.data.tag := s2_req.addr(paddrBits-1, untagBits) + + // Prepare a TileLink request message that initiates a transaction + val a_source = PriorityEncoder(~uncachedInFlight.asUInt) + val acquire_address = s2_req_block_addr + val access_address = s2_req.addr + val a_size = s2_req.typ + val a_data = Fill(beatWords, pstore1_storegen.data) + val acquire = edge.Acquire(a_source, acquire_address, lgCacheBlockBytes, s2_grow_param)._2 // Cacheability checked by tlb + val get = edge.Get(a_source, access_address, a_size)._2 + val put = edge.Put(a_source, access_address, a_size, a_data)._2 + val atomics = if (edge.manager.anySupportLogical) { + MuxLookup(s2_req.cmd, Wire(new TLBundleA(edge.bundle)), Array( + M_XA_SWAP -> edge.Logical(a_source, access_address, a_size, a_data, TLAtomics.SWAP)._2, + M_XA_XOR -> edge.Logical(a_source, access_address, a_size, a_data, TLAtomics.XOR) ._2, + M_XA_OR -> edge.Logical(a_source, access_address, a_size, a_data, TLAtomics.OR) ._2, + M_XA_AND -> edge.Logical(a_source, access_address, a_size, a_data, TLAtomics.AND) ._2, + M_XA_ADD -> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.ADD)._2, + M_XA_MIN -> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.MIN)._2, + M_XA_MAX 
-> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.MAX)._2, + M_XA_MINU -> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.MINU)._2, + M_XA_MAXU -> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.MAXU)._2)) + } else { + // If no managers support atomics, assert fail if processor asks for them + assert (!(tl_out.a.valid && pstore1_amo && s2_write && s2_uncached)) + Wire(new TLBundleA(edge.bundle)) + } + + tl_out.a.valid := grantackq.io.enq.ready && ((s2_valid_cached_miss && !s2_victim_dirty) || + (s2_valid_uncached && !uncachedInFlight.asUInt.andR)) + tl_out.a.bits := Mux(!s2_uncached, acquire, Mux(!s2_write, get, Mux(!pstore1_amo, put, atomics))) + + // Set pending bits for outstanding TileLink transaction + when (tl_out.a.fire()) { + when (s2_uncached) { + uncachedInFlight(a_source) := true + uncachedReqs(a_source) := s2_req + }.otherwise { + cached_grant_wait := true } + } - // grant - val (d_first, d_last, d_done, d_address_inc) = edge.addr_inc(tl_out.d) - val grantIsCached = tl_out.d.bits.opcode.isOneOf(Grant, GrantData) - val grantIsUncached = tl_out.d.bits.opcode.isOneOf(AccessAck, AccessAckData, HintAck) - val grantIsVoluntary = tl_out.d.bits.opcode === ReleaseAck // Clears a different pending bit - val grantIsRefill = tl_out.d.bits.opcode === GrantData // Writes the data array - tl_out.d.ready := true - when (tl_out.d.fire()) { - when (grantIsCached) { - assert(cached_grant_wait, "A GrantData was unexpected by the dcache.") - when(d_last) { cached_grant_wait := false } - } .elsewhen (grantIsUncached) { - val id = tl_out.d.bits.source - val req = uncachedReqs(id) - assert(uncachedInFlight(id), "An AccessAck was unexpected by the dcache.") // TODO must handle Ack coming back on same cycle! - when(d_last) { uncachedInFlight(id) := false } - s2_data := tl_out.d.bits.data - s2_req.cmd := req.cmd - s2_req.typ := req.typ - s2_req.tag := req.tag - s2_req.addr := Cat(s1_paddr >> wordOffBits /* don't-care */, req.addr(wordOffBits-1, 0)) - } .elsewhen (grantIsVoluntary) { - assert(release_ack_wait, "A ReleaseAck was unexpected by the dcache.") // TODO should handle Ack coming back on same cycle! - release_ack_wait := false - } + // grant + val (d_first, d_last, d_done, d_address_inc) = edge.addr_inc(tl_out.d) + val grantIsCached = tl_out.d.bits.opcode.isOneOf(Grant, GrantData) + val grantIsUncached = tl_out.d.bits.opcode.isOneOf(AccessAck, AccessAckData, HintAck) + val grantIsVoluntary = tl_out.d.bits.opcode === ReleaseAck // Clears a different pending bit + val grantIsRefill = tl_out.d.bits.opcode === GrantData // Writes the data array + tl_out.d.ready := true + when (tl_out.d.fire()) { + when (grantIsCached) { + assert(cached_grant_wait, "A GrantData was unexpected by the dcache.") + when(d_last) { cached_grant_wait := false } + } .elsewhen (grantIsUncached) { + val id = tl_out.d.bits.source + val req = uncachedReqs(id) + assert(uncachedInFlight(id), "An AccessAck was unexpected by the dcache.") // TODO must handle Ack coming back on same cycle! + when(d_last) { uncachedInFlight(id) := false } + s2_data := tl_out.d.bits.data + s2_req.cmd := req.cmd + s2_req.typ := req.typ + s2_req.tag := req.tag + s2_req.addr := Cat(s1_paddr >> wordOffBits /* don't-care */, req.addr(wordOffBits-1, 0)) + } .elsewhen (grantIsVoluntary) { + assert(release_ack_wait, "A ReleaseAck was unexpected by the dcache.") // TODO should handle Ack coming back on same cycle! 
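// Descriptive note (editorial, not part of the original patch): each D-channel response clears
// a different piece of pending state -- Grant/GrantData clear cached_grant_wait on the last
// beat, AccessAck/AccessAckData/HintAck clear the matching uncachedInFlight bit, and the
// ReleaseAck handled here clears release_ack_wait set by a voluntary writeback.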
+ release_ack_wait := false } + } - // data refill - val doRefillBeat = grantIsRefill && tl_out.d.valid - dataArb.io.in(1).valid := doRefillBeat - assert(dataArb.io.in(1).ready || !doRefillBeat) - dataArb.io.in(1).bits.write := true - dataArb.io.in(1).bits.addr := s2_req_block_addr | d_address_inc - dataArb.io.in(1).bits.way_en := s2_victim_way - dataArb.io.in(1).bits.wdata := tl_out.d.bits.data - dataArb.io.in(1).bits.wmask := ~UInt(0, rowBytes) - // tag updates on refill - metaWriteArb.io.in(1).valid := grantIsCached && d_done - assert(!metaWriteArb.io.in(1).valid || metaWriteArb.io.in(1).ready) - metaWriteArb.io.in(1).bits.way_en := s2_victim_way - metaWriteArb.io.in(1).bits.idx := s2_req.addr(idxMSB, idxLSB) - metaWriteArb.io.in(1).bits.data.coh := s2_hit_state.onGrant(s2_req.cmd, tl_out.d.bits.param) - metaWriteArb.io.in(1).bits.data.tag := s2_req.addr(paddrBits-1, untagBits) - // don't accept uncached grants if there's a structural hazard on s2_data... - val blockUncachedGrant = Reg(Bool()) - blockUncachedGrant := dataArb.io.out.valid - when (grantIsUncached) { - tl_out.d.ready := !(blockUncachedGrant || s1_valid) - // ...but insert bubble to guarantee grant's eventual forward progress - when (tl_out.d.valid && !tl_out.d.ready) { - io.cpu.req.ready := false - dataArb.io.in(1).valid := true - dataArb.io.in(1).bits.write := false - blockUncachedGrant := !dataArb.io.in(1).ready - } + // data refill + val doRefillBeat = grantIsRefill && tl_out.d.valid + dataArb.io.in(1).valid := doRefillBeat + assert(dataArb.io.in(1).ready || !doRefillBeat) + dataArb.io.in(1).bits.write := true + dataArb.io.in(1).bits.addr := s2_req_block_addr | d_address_inc + dataArb.io.in(1).bits.way_en := s2_victim_way + dataArb.io.in(1).bits.wdata := tl_out.d.bits.data + dataArb.io.in(1).bits.wmask := ~UInt(0, rowBytes) + // tag updates on refill + metaWriteArb.io.in(1).valid := grantIsCached && d_done + assert(!metaWriteArb.io.in(1).valid || metaWriteArb.io.in(1).ready) + metaWriteArb.io.in(1).bits.way_en := s2_victim_way + metaWriteArb.io.in(1).bits.idx := s2_req.addr(idxMSB, idxLSB) + metaWriteArb.io.in(1).bits.data.coh := s2_hit_state.onGrant(s2_req.cmd, tl_out.d.bits.param) + metaWriteArb.io.in(1).bits.data.tag := s2_req.addr(paddrBits-1, untagBits) + // don't accept uncached grants if there's a structural hazard on s2_data... 
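// Descriptive note on the hazard handling below (editorial, not part of the original patch):
// uncached grant data is captured into s2_data, which the hit pipeline also fills from the data
// array, so tl_out.d.ready is held low whenever the data array was busy last cycle or a request
// sits in s1. To keep a busy pipeline from starving the grant indefinitely, the CPU request port
// is stalled and a dummy (non-writing) slot is claimed through dataArb.io.in(1), guaranteeing a
// bubble in which the grant can be accepted.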
+ val blockUncachedGrant = Reg(Bool()) + blockUncachedGrant := dataArb.io.out.valid + when (grantIsUncached) { + tl_out.d.ready := !(blockUncachedGrant || s1_valid) + // ...but insert bubble to guarantee grant's eventual forward progress + when (tl_out.d.valid && !tl_out.d.ready) { + io.cpu.req.ready := false + dataArb.io.in(1).valid := true + dataArb.io.in(1).bits.write := false + blockUncachedGrant := !dataArb.io.in(1).ready } + } - // Finish TileLink transaction by issuing a GrantAck - grantackq.io.enq.valid := d_done && edge.hasFollowUp(tl_out.d.bits) - grantackq.io.enq.bits := edge.GrantAck(tl_out.d.bits) - tl_out.e <> grantackq.io.deq - assert(!grantackq.io.enq.valid || grantackq.io.enq.ready, "Too many Grants received by dcache.") - when (d_done) { replacer.miss } + // Finish TileLink transaction by issuing a GrantAck + grantackq.io.enq.valid := d_done && edge.hasFollowUp(tl_out.d.bits) + grantackq.io.enq.bits := edge.GrantAck(tl_out.d.bits) + tl_out.e <> grantackq.io.deq + assert(!grantackq.io.enq.valid || grantackq.io.enq.ready, "Too many Grants received by dcache.") + when (d_done) { replacer.miss } - // Handle an incoming TileLink Probe message - val block_probe = releaseInFlight || lrscValid || (s2_valid_hit && s2_lr) - metaReadArb.io.in(1).valid := tl_out.b.valid && !block_probe - tl_out.b.ready := metaReadArb.io.in(1).ready && !block_probe && !s1_valid && (!s2_valid || s2_valid_hit) - metaReadArb.io.in(1).bits.idx := tl_out.b.bits.address(idxMSB, idxLSB) - metaReadArb.io.in(1).bits.way_en := ~UInt(0, nWays) + // Handle an incoming TileLink Probe message + val block_probe = releaseInFlight || lrscValid || (s2_valid_hit && s2_lr) + metaReadArb.io.in(1).valid := tl_out.b.valid && !block_probe + tl_out.b.ready := metaReadArb.io.in(1).ready && !block_probe && !s1_valid && (!s2_valid || s2_valid_hit) + metaReadArb.io.in(1).bits.idx := tl_out.b.bits.address(idxMSB, idxLSB) + metaReadArb.io.in(1).bits.way_en := ~UInt(0, nWays) - // release - val (_, c_last, releaseDone, c_count) = edge.count(tl_out.c) - val releaseRejected = tl_out.c.valid && !tl_out.c.ready - val s1_release_data_valid = Reg(next = dataArb.io.in(2).fire()) - val s2_release_data_valid = Reg(next = s1_release_data_valid && !releaseRejected) - val releaseDataBeat = Cat(UInt(0), c_count) + Mux(releaseRejected, UInt(0), s1_release_data_valid + Cat(UInt(0), s2_release_data_valid)) + // release + val (_, c_last, releaseDone, c_count) = edge.count(tl_out.c) + val releaseRejected = tl_out.c.valid && !tl_out.c.ready + val s1_release_data_valid = Reg(next = dataArb.io.in(2).fire()) + val s2_release_data_valid = Reg(next = s1_release_data_valid && !releaseRejected) + val releaseDataBeat = Cat(UInt(0), c_count) + Mux(releaseRejected, UInt(0), s1_release_data_valid + Cat(UInt(0), s2_release_data_valid)) - val nackResponseMessage = edge.ProbeAck( - b = probe_bits, - reportPermissions = TLPermissions.NtoN) + val nackResponseMessage = edge.ProbeAck( + b = probe_bits, + reportPermissions = TLPermissions.NtoN) - val voluntaryReleaseMessage = edge.Release( - fromSource = UInt(maxUncachedInFlight - 1), - toAddress = probe_bits.address, - lgSize = lgCacheBlockBytes, - shrinkPermissions = s2_shrink_param, - data = s2_data)._2 + val voluntaryReleaseMessage = edge.Release( + fromSource = UInt(maxUncachedInFlight - 1), + toAddress = probe_bits.address, + lgSize = lgCacheBlockBytes, + shrinkPermissions = s2_shrink_param, + data = s2_data)._2 - val probeResponseMessage = Mux(!s2_prb_ack_data, - edge.ProbeAck( - b = probe_bits, - 
reportPermissions = s2_report_param), - edge.ProbeAck( - b = probe_bits, - reportPermissions = s2_report_param, - data = s2_data)) + val probeResponseMessage = Mux(!s2_prb_ack_data, + edge.ProbeAck( + b = probe_bits, + reportPermissions = s2_report_param), + edge.ProbeAck( + b = probe_bits, + reportPermissions = s2_report_param, + data = s2_data)) - tl_out.c.valid := s2_release_data_valid - tl_out.c.bits := nackResponseMessage - val newCoh = Wire(init = probeNewCoh) - releaseWay := s2_probe_way + tl_out.c.valid := s2_release_data_valid + tl_out.c.bits := nackResponseMessage + val newCoh = Wire(init = probeNewCoh) + releaseWay := s2_probe_way - when (s2_victimize && s2_victim_dirty) { - assert(!(s2_valid && s2_hit_valid)) - release_state := s_voluntary_writeback - probe_bits.address := Cat(s2_victim_tag, s2_req.addr(idxMSB, idxLSB)) << idxLSB - } - when (s2_probe) { - when (s2_prb_ack_data) { release_state := s_probe_rep_dirty } - .elsewhen (s2_probe_state.isValid()) { release_state := s_probe_rep_clean } - .otherwise { - tl_out.c.valid := true - release_state := s_probe_rep_miss - } - } - when (releaseDone) { release_state := s_ready } - when (release_state.isOneOf(s_probe_rep_miss, s_probe_rep_clean)) { + when (s2_victimize && s2_victim_dirty) { + assert(!(s2_valid && s2_hit_valid)) + release_state := s_voluntary_writeback + probe_bits.address := Cat(s2_victim_tag, s2_req.addr(idxMSB, idxLSB)) << idxLSB + } + when (s2_probe) { + when (s2_prb_ack_data) { release_state := s_probe_rep_dirty } + .elsewhen (s2_probe_state.isValid()) { release_state := s_probe_rep_clean } + .otherwise { tl_out.c.valid := true + release_state := s_probe_rep_miss } - when (release_state.isOneOf(s_probe_rep_clean, s_probe_rep_dirty)) { - tl_out.c.bits := probeResponseMessage - when (releaseDone) { release_state := s_probe_write_meta } + } + when (releaseDone) { release_state := s_ready } + when (release_state.isOneOf(s_probe_rep_miss, s_probe_rep_clean)) { + tl_out.c.valid := true + } + when (release_state.isOneOf(s_probe_rep_clean, s_probe_rep_dirty)) { + tl_out.c.bits := probeResponseMessage + when (releaseDone) { release_state := s_probe_write_meta } + } + when (release_state.isOneOf(s_voluntary_writeback, s_voluntary_write_meta)) { + tl_out.c.bits := voluntaryReleaseMessage + newCoh := voluntaryNewCoh + releaseWay := s2_victim_way + when (releaseDone) { + release_state := s_voluntary_write_meta + release_ack_wait := true } - when (release_state.isOneOf(s_voluntary_writeback, s_voluntary_write_meta)) { - tl_out.c.bits := voluntaryReleaseMessage - newCoh := voluntaryNewCoh - releaseWay := s2_victim_way - when (releaseDone) { - release_state := s_voluntary_write_meta - release_ack_wait := true + } + when (s2_probe && !tl_out.c.fire()) { s1_nack := true } + tl_out.c.bits.address := probe_bits.address + tl_out.c.bits.data := s2_data + + dataArb.io.in(2).valid := inWriteback && releaseDataBeat < refillCycles + dataArb.io.in(2).bits.write := false + dataArb.io.in(2).bits.addr := tl_out.c.bits.address | (releaseDataBeat(log2Up(refillCycles)-1,0) << rowOffBits) + dataArb.io.in(2).bits.way_en := ~UInt(0, nWays) + + metaWriteArb.io.in(2).valid := release_state.isOneOf(s_voluntary_write_meta, s_probe_write_meta) + metaWriteArb.io.in(2).bits.way_en := releaseWay + metaWriteArb.io.in(2).bits.idx := tl_out.c.bits.address(idxMSB, idxLSB) + metaWriteArb.io.in(2).bits.data.coh := newCoh + metaWriteArb.io.in(2).bits.data.tag := tl_out.c.bits.address(paddrBits-1, untagBits) + when (metaWriteArb.io.in(2).fire()) { release_state := 
s_ready } + + // cached response + io.cpu.resp.valid := s2_valid_hit + io.cpu.resp.bits <> s2_req + io.cpu.resp.bits.has_data := s2_read + io.cpu.resp.bits.replay := false + io.cpu.ordered := !(s1_valid || s2_valid || cached_grant_wait || uncachedInFlight.asUInt.orR) + + // uncached response + io.cpu.replay_next := tl_out.d.fire() && grantIsUncached + val doUncachedResp = Reg(next = io.cpu.replay_next) + when (doUncachedResp) { + assert(!s2_valid_hit) + io.cpu.resp.valid := true + io.cpu.resp.bits.replay := true + } + + // load data subword mux/sign extension + val s2_word_idx = s2_req.addr.extract(log2Up(rowBits/8)-1, log2Up(wordBytes)) + val s2_data_word = s2_data >> Cat(s2_word_idx, UInt(0, log2Up(coreDataBits))) + val loadgen = new LoadGen(s2_req.typ, mtSigned(s2_req.typ), s2_req.addr, s2_data_word, s2_sc, wordBytes) + io.cpu.resp.bits.data := loadgen.data | s2_sc_fail + io.cpu.resp.bits.data_word_bypass := loadgen.wordData + io.cpu.resp.bits.store_data := pstore1_data + + // AMOs + if (usingAtomics) { + val amoalu = Module(new AMOALU(xLen)) + amoalu.io.addr := pstore1_addr + amoalu.io.cmd := pstore1_cmd + amoalu.io.typ := pstore1_typ + amoalu.io.lhs := s2_data_word + amoalu.io.rhs := pstore1_data + pstore1_storegen_data := amoalu.io.out + } else { + assert(!(s1_valid_masked && s1_read && s1_write), "unsupported D$ operation") + } + + // flushes + val flushed = Reg(init=Bool(true)) + val flushing = Reg(init=Bool(false)) + val flushCounter = Counter(nSets * nWays) + when (tl_out.a.fire() && !s2_uncached) { flushed := false } + when (s2_valid_masked && s2_req.cmd === M_FLUSH_ALL) { + io.cpu.s2_nack := !flushed + when (!flushed) { + flushing := !release_ack_wait && !uncachedInFlight.asUInt.orR + } + } + s1_flush_valid := metaReadArb.io.in(0).fire() && !s1_flush_valid && !s2_flush_valid && release_state === s_ready && !release_ack_wait + metaReadArb.io.in(0).valid := flushing + metaReadArb.io.in(0).bits.idx := flushCounter.value + metaReadArb.io.in(0).bits.way_en := ~UInt(0, nWays) + when (flushing) { + s1_victim_way := flushCounter.value >> log2Up(nSets) + when (s2_flush_valid) { + when (flushCounter.inc()) { + flushed := true } } - when (s2_probe && !tl_out.c.fire()) { s1_nack := true } - tl_out.c.bits.address := probe_bits.address - tl_out.c.bits.data := s2_data - - dataArb.io.in(2).valid := inWriteback && releaseDataBeat < refillCycles - dataArb.io.in(2).bits.write := false - dataArb.io.in(2).bits.addr := tl_out.c.bits.address | (releaseDataBeat(log2Up(refillCycles)-1,0) << rowOffBits) - dataArb.io.in(2).bits.way_en := ~UInt(0, nWays) - - metaWriteArb.io.in(2).valid := release_state.isOneOf(s_voluntary_write_meta, s_probe_write_meta) - metaWriteArb.io.in(2).bits.way_en := releaseWay - metaWriteArb.io.in(2).bits.idx := tl_out.c.bits.address(idxMSB, idxLSB) - metaWriteArb.io.in(2).bits.data.coh := newCoh - metaWriteArb.io.in(2).bits.data.tag := tl_out.c.bits.address(paddrBits-1, untagBits) - when (metaWriteArb.io.in(2).fire()) { release_state := s_ready } - - // cached response - io.cpu.resp.valid := s2_valid_hit - io.cpu.resp.bits <> s2_req - io.cpu.resp.bits.has_data := s2_read - io.cpu.resp.bits.replay := false - io.cpu.ordered := !(s1_valid || s2_valid || cached_grant_wait || uncachedInFlight.asUInt.orR) - - // uncached response - io.cpu.replay_next := tl_out.d.fire() && grantIsUncached - val doUncachedResp = Reg(next = io.cpu.replay_next) - when (doUncachedResp) { - assert(!s2_valid_hit) - io.cpu.resp.valid := true - io.cpu.resp.bits.replay := true - } - - // load data subword 
mux/sign extension - val s2_word_idx = s2_req.addr.extract(log2Up(rowBits/8)-1, log2Up(wordBytes)) - val s2_data_word = s2_data >> Cat(s2_word_idx, UInt(0, log2Up(coreDataBits))) - val loadgen = new LoadGen(s2_req.typ, mtSigned(s2_req.typ), s2_req.addr, s2_data_word, s2_sc, wordBytes) - io.cpu.resp.bits.data := loadgen.data | s2_sc_fail - io.cpu.resp.bits.data_word_bypass := loadgen.wordData - io.cpu.resp.bits.store_data := pstore1_data - - // AMOs - if (usingAtomics) { - val amoalu = Module(new AMOALU(xLen)) - amoalu.io.addr := pstore1_addr - amoalu.io.cmd := pstore1_cmd - amoalu.io.typ := pstore1_typ - amoalu.io.lhs := s2_data_word - amoalu.io.rhs := pstore1_data - pstore1_storegen_data := amoalu.io.out - } else { - assert(!(s1_valid_masked && s1_read && s1_write), "unsupported D$ operation") - } - - // flushes - val flushed = Reg(init=Bool(true)) - val flushing = Reg(init=Bool(false)) - val flushCounter = Counter(nSets * nWays) - when (tl_out.a.fire() && !s2_uncached) { flushed := false } - when (s2_valid_masked && s2_req.cmd === M_FLUSH_ALL) { - io.cpu.s2_nack := !flushed - when (!flushed) { - flushing := !release_ack_wait && !uncachedInFlight.asUInt.orR - } - } - s1_flush_valid := metaReadArb.io.in(0).fire() && !s1_flush_valid && !s2_flush_valid && release_state === s_ready && !release_ack_wait - metaReadArb.io.in(0).valid := flushing - metaReadArb.io.in(0).bits.idx := flushCounter.value - metaReadArb.io.in(0).bits.way_en := ~UInt(0, nWays) - when (flushing) { - s1_victim_way := flushCounter.value >> log2Up(nSets) - when (s2_flush_valid) { - when (flushCounter.inc()) { - flushed := true - } - } - when (flushed && release_state === s_ready && !release_ack_wait) { - flushing := false - } + when (flushed && release_state === s_ready && !release_ack_wait) { + flushing := false } } } - -class ScratchpadSlavePort(implicit val p: Parameters) extends LazyModule with HasCoreParameters { - val node = TLManagerNode(TLManagerPortParameters( - Seq(TLManagerParameters( - address = List(AddressSet(0x80000000L, BigInt(p(DataScratchpadSize)-1))), - regionType = RegionType.UNCACHED, - executable = true, - supportsArithmetic = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none, - supportsLogical = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none, - supportsPutPartial = TransferSizes(1, coreDataBytes), - supportsPutFull = TransferSizes(1, coreDataBytes), - supportsGet = TransferSizes(1, coreDataBytes), - fifoId = Some(0))), // requests handled in FIFO order - beatBytes = coreDataBytes, - minLatency = 1)) - - // Make sure this ends up with the same name as before - override def name = "dmem0" - - lazy val module = new LazyModuleImp(this) { - val io = new Bundle { - val tl_in = node.bundleIn - val dmem = new HellaCacheIO - } - - val tl_in = io.tl_in(0) - val edge = node.edgesIn(0) - - require(usingDataScratchpad) - - val s_ready :: s_wait :: s_replay :: s_grant :: Nil = Enum(UInt(), 4) - val state = Reg(init = s_ready) - when (io.dmem.resp.valid) { state := s_grant } - when (tl_in.d.fire()) { state := s_ready } - when (io.dmem.s2_nack) { state := s_replay } - when (io.dmem.req.fire()) { state := s_wait } - - val acq = Reg(tl_in.a.bits) - when (io.dmem.resp.valid) { acq.data := io.dmem.resp.bits.data } - when (tl_in.a.fire()) { acq := tl_in.a.bits } - - val isWrite = acq.opcode === TLMessages.PutFullData || acq.opcode === TLMessages.PutPartialData - val isRead = !edge.hasData(acq) - - def formCacheReq(acq: TLBundleA) = { - val req = Wire(new HellaCacheReq) - 
req.cmd := MuxLookup(acq.opcode, Wire(M_XRD), Array( - TLMessages.PutFullData -> M_XWR, - TLMessages.PutPartialData -> M_XWR, - TLMessages.ArithmeticData -> MuxLookup(acq.param, Wire(M_XRD), Array( - TLAtomics.MIN -> M_XA_MIN, - TLAtomics.MAX -> M_XA_MAX, - TLAtomics.MINU -> M_XA_MINU, - TLAtomics.MAXU -> M_XA_MAXU, - TLAtomics.ADD -> M_XA_ADD)), - TLMessages.LogicalData -> MuxLookup(acq.param, Wire(M_XRD), Array( - TLAtomics.XOR -> M_XA_XOR, - TLAtomics.OR -> M_XA_OR, - TLAtomics.AND -> M_XA_AND, - TLAtomics.SWAP -> M_XA_SWAP)), - TLMessages.Get -> M_XRD)) - // treat all loads as full words, so bytes appear in correct lane - req.typ := Mux(isRead, log2Ceil(coreDataBytes), acq.size) - req.addr := Mux(isRead, ~(~acq.address | (coreDataBytes-1)), acq.address) - req.tag := UInt(0) - req - } - - val ready = state === s_ready || tl_in.d.fire() - io.dmem.req.valid := (tl_in.a.valid && ready) || state === s_replay - tl_in.a.ready := io.dmem.req.ready && ready - io.dmem.req.bits := formCacheReq(Mux(state === s_replay, acq, tl_in.a.bits)) - // the TL data is already in the correct byte lane, but the D$ - // expects right-justified store data, so that it can steer the bytes. - io.dmem.s1_data := new LoadGen(acq.size, Bool(false), acq.address(log2Ceil(coreDataBytes)-1,0), acq.data, Bool(false), coreDataBytes).data - io.dmem.s1_kill := false - io.dmem.invalidate_lr := false - - // place AMO data in correct word lane - val minAMOBytes = 4 - val grantData = Mux(io.dmem.resp.valid, io.dmem.resp.bits.data, acq.data) - val alignedGrantData = Mux(acq.size <= log2Ceil(minAMOBytes), Fill(coreDataBytes/minAMOBytes, grantData(8*minAMOBytes-1, 0)), grantData) - - tl_in.d.valid := io.dmem.resp.valid || state === s_grant - tl_in.d.bits := Mux(isWrite, - edge.AccessAck(acq, UInt(0)), - edge.AccessAck(acq, UInt(0), UInt(0))) - tl_in.d.bits.data := alignedGrantData - - // Tie off unused channels - tl_in.b.valid := Bool(false) - tl_in.c.ready := Bool(true) - tl_in.e.ready := Bool(true) - } -} diff --git a/src/main/scala/rocket/nbdcache.scala b/src/main/scala/rocket/nbdcache.scala index 8775a6a0..7495119f 100644 --- a/src/main/scala/rocket/nbdcache.scala +++ b/src/main/scala/rocket/nbdcache.scala @@ -3,71 +3,17 @@ package rocket import Chisel._ -import uncore.tilelink._ -import uncore.tilelink2._ +import Chisel.ImplicitConversions._ +import cde.Parameters +import diplomacy._ import uncore.agents._ import uncore.constants._ +import uncore.tilelink._ +import uncore.tilelink2._ import uncore.util._ -import diplomacy._ import util._ -import Chisel.ImplicitConversions._ import config._ -case class DCacheConfig( - nMSHRs: Int = 1, - nSDQ: Int = 17, - nRPQ: Int = 16) - -case object DCacheKey extends Field[DCacheConfig] - -trait HasL1HellaCacheParameters extends HasCacheParameters with HasCoreParameters { - val outerDataBeats = p(TLKey(p(TLId))).dataBeats - val outerDataBits = p(TLKey(p(TLId))).dataBitsPerBeat - val refillCyclesPerBeat = outerDataBits/rowBits - val refillCycles = refillCyclesPerBeat*outerDataBeats - - val cacheBlockBytes = p(CacheBlockBytes) - val lgCacheBlockBytes = log2Up(cacheBlockBytes) - - val wordBits = xLen // really, xLen max - val wordBytes = wordBits/8 - val wordOffBits = log2Up(wordBytes) - val beatBytes = cacheBlockBytes / outerDataBeats - val beatWords = beatBytes / wordBytes - val beatOffBits = log2Up(beatBytes) - val idxMSB = untagBits-1 - val idxLSB = blockOffBits - val offsetmsb = idxLSB-1 - val offsetlsb = wordOffBits - val rowWords = rowBits/wordBits - val doNarrowRead = coreDataBits * nWays 
% rowBits == 0 - val encDataBits = code.width(coreDataBits) - val encRowBits = encDataBits*rowWords - val nIOMSHRs = 1 - val lrscCycles = 32 // ISA requires 16-insn LRSC sequences to succeed - - require(isPow2(nSets)) - require(rowBits <= outerDataBits) - require(xLen <= outerDataBits) // TODO need offset addr for puts if data width < xlen - require(!usingVM || untagBits <= pgIdxBits) -} - -abstract class L1HellaCacheModule(implicit val p: Parameters) extends Module - with HasL1HellaCacheParameters -abstract class L1HellaCacheBundle(implicit val p: Parameters) extends ParameterizedBundle()(p) - with HasL1HellaCacheParameters - -trait HasCoreMemOp extends HasCoreParameters { - val addr = UInt(width = coreMaxAddrBits) - val tag = Bits(width = dcacheReqTagBits) - val cmd = Bits(width = M_SZ) - val typ = Bits(width = MT_SZ) -} - -trait HasCoreData extends HasCoreParameters { - val data = Bits(width = coreDataBits) -} - trait HasMissInfo extends HasL1HellaCacheParameters { val tag_match = Bool() val old_meta = new L1Metadata @@ -79,41 +25,6 @@ class HellaCacheReqInternal(implicit p: Parameters) extends CoreBundle()(p) val phys = Bool() } -class HellaCacheReq(implicit p: Parameters) extends HellaCacheReqInternal()(p) with HasCoreData - -class HellaCacheResp(implicit p: Parameters) extends CoreBundle()(p) - with HasCoreMemOp - with HasCoreData { - val replay = Bool() - val has_data = Bool() - val data_word_bypass = Bits(width = coreDataBits) - val store_data = Bits(width = coreDataBits) -} - -class AlignmentExceptions extends Bundle { - val ld = Bool() - val st = Bool() -} - -class HellaCacheExceptions extends Bundle { - val ma = new AlignmentExceptions - val pf = new AlignmentExceptions -} - -// interface between D$ and processor/DTLB -class HellaCacheIO(implicit p: Parameters) extends CoreBundle()(p) { - val req = Decoupled(new HellaCacheReq) - val s1_kill = Bool(OUTPUT) // kill previous cycle's req - val s1_data = Bits(OUTPUT, coreDataBits) // data for previous cycle's req - val s2_nack = Bool(INPUT) // req from two cycles ago is rejected - - val resp = Valid(new HellaCacheResp).flip - val replay_next = Bool(INPUT) - val xcpt = (new HellaCacheExceptions).asInput - val invalidate_lr = Bool(OUTPUT) - val ordered = Bool(INPUT) -} - class L1DataReadReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) { val way_en = Bits(width = nWays) val addr = Bits(width = untagBits) @@ -126,27 +37,8 @@ class L1DataWriteReq(implicit p: Parameters) extends L1DataReadReq()(p) { class L1RefillReq(implicit p: Parameters) extends L1DataReadReq()(p) -class L1MetaReadReq(implicit p: Parameters) extends MetaReadReq { - val tag = Bits(width = tagBits) - override def cloneType = new L1MetaReadReq()(p).asInstanceOf[this.type] //TODO remove -} - -class L1MetaWriteReq(implicit p: Parameters) extends - MetaWriteReq[L1Metadata](new L1Metadata) - -object L1Metadata { - def apply(tag: Bits, coh: ClientMetadata)(implicit p: Parameters) = { - val meta = Wire(new L1Metadata) - meta.tag := tag - meta.coh := coh - meta - } -} -class L1Metadata(implicit p: Parameters) extends Metadata()(p) with HasL1HellaCacheParameters { - val coh = new ClientMetadata -} - class Replay(implicit p: Parameters) extends HellaCacheReqInternal()(p) with HasCoreData + class ReplayInternal(cfg: DCacheConfig)(implicit p: Parameters) extends HellaCacheReqInternal()(p) { val sdq_id = UInt(width = log2Up(cfg.nSDQ)) @@ -154,24 +46,27 @@ class ReplayInternal(cfg: DCacheConfig)(implicit p: Parameters) extends HellaCac } class MSHRReq(implicit p: Parameters) 
extends Replay()(p) with HasMissInfo + class MSHRReqInternal(cfg: DCacheConfig)(implicit p: Parameters) extends ReplayInternal(cfg)(p) with HasMissInfo { override def cloneType = new MSHRReqInternal(cfg)(p).asInstanceOf[this.type] } -class ProbeInternal(implicit p: Parameters) extends Probe()(p) with HasClientTransactionId - -class WritebackReq(implicit p: Parameters) extends Release()(p) with HasCacheParameters { +class WritebackReq(params: TLBundleParameters)(implicit p: Parameters) extends L1HellaCacheBundle()(p) { + val tag = Bits(width = tagBits) + val idx = Bits(width = idxBits) + val source = UInt(width = params.sourceBits) + val param = UInt(width = TLPermissions.cWidth) val way_en = Bits(width = nWays) + val voluntary = Bool() } -class IOMSHR(id: Int)(implicit p: Parameters) extends L1HellaCacheModule()(p) { +class IOMSHR(id: Int, edge: TLEdgeOut)(implicit p: Parameters) extends L1HellaCacheModule()(p) { val io = new Bundle { val req = Decoupled(new HellaCacheReq).flip - val acquire = Decoupled(new Acquire) - val grant = Valid(new GrantFromSrc).flip - val finish = Decoupled(new FinishToDst) val resp = Decoupled(new HellaCacheResp) + val mem_access = Decoupled(new TLBundleA(edge.bundle)) + val mem_ack = Valid(new TLBundleD(edge.bundle)).flip val replay_next = Bool(OUTPUT) } @@ -185,57 +80,42 @@ class IOMSHR(id: Int)(implicit p: Parameters) extends L1HellaCacheModule()(p) { val req = Reg(new HellaCacheReq) val req_cmd_sc = req.cmd === M_XSC val grant_word = Reg(UInt(width = wordBits)) - val fq = Module(new FinishQueue(1)) - val s_idle :: s_acquire :: s_grant :: s_resp :: s_finish :: Nil = Enum(Bits(), 5) + val s_idle :: s_mem_access :: s_mem_ack :: s_resp :: Nil = Enum(Bits(), 5) val state = Reg(init = s_idle) io.req.ready := (state === s_idle) - fq.io.enq.valid := io.grant.valid && io.grant.bits.requiresAck() - fq.io.enq.bits := io.grant.bits.makeFinish() - io.finish.valid := fq.io.deq.valid && (state === s_finish) - io.finish.bits := fq.io.deq.bits - fq.io.deq.ready := io.finish.ready && (state === s_finish) - val storegen = new StoreGen(req.typ, req.addr, req.data, wordBytes) val loadgen = new LoadGen(req.typ, mtSigned(req.typ), req.addr, grant_word, req_cmd_sc, wordBytes) + + val a_source = UInt(id) + val a_address = req.addr + val a_size = req.typ + val a_data = Fill(beatWords, storegen.data) - val beat_mask = (storegen.mask << Cat(beatOffset(req.addr), UInt(0, wordOffBits))) - val beat_data = Fill(beatWords, storegen.data) + val get = edge.Get(a_source, a_address, a_size)._2 + val put = edge.Put(a_source, a_address, a_size, a_data)._2 + val atomics = if (edge.manager.anySupportLogical) { + MuxLookup(req.cmd, Wire(new TLBundleA(edge.bundle)), Array( + M_XA_SWAP -> edge.Logical(a_source, a_address, a_size, a_data, TLAtomics.SWAP)._2, + M_XA_XOR -> edge.Logical(a_source, a_address, a_size, a_data, TLAtomics.XOR) ._2, + M_XA_OR -> edge.Logical(a_source, a_address, a_size, a_data, TLAtomics.OR) ._2, + M_XA_AND -> edge.Logical(a_source, a_address, a_size, a_data, TLAtomics.AND) ._2, + M_XA_ADD -> edge.Arithmetic(a_source, a_address, a_size, a_data, TLAtomics.ADD)._2, + M_XA_MIN -> edge.Arithmetic(a_source, a_address, a_size, a_data, TLAtomics.MIN)._2, + M_XA_MAX -> edge.Arithmetic(a_source, a_address, a_size, a_data, TLAtomics.MAX)._2, + M_XA_MINU -> edge.Arithmetic(a_source, a_address, a_size, a_data, TLAtomics.MINU)._2, + M_XA_MAXU -> edge.Arithmetic(a_source, a_address, a_size, a_data, TLAtomics.MAXU)._2)) + } else { + // If no managers support atomics, assert fail if processor 
asks for them + assert (!isAMO(req.cmd)) + Wire(new TLBundleA(edge.bundle)) + } - val addr_block = req.addr(paddrBits - 1, blockOffBits) - val addr_beat = req.addr(blockOffBits - 1, beatOffBits) - val addr_byte = req.addr(beatOffBits - 1, 0) + io.mem_access.valid := (state === s_mem_access) + io.mem_access.bits := Mux(isAMO(req.cmd), atomics, Mux(isRead(req.cmd), get, put)) - val get_acquire = Get( - client_xact_id = UInt(id), - addr_block = addr_block, - addr_beat = addr_beat, - addr_byte = addr_byte, - operand_size = req.typ, - alloc = Bool(false)) - - val put_acquire = Put( - client_xact_id = UInt(id), - addr_block = addr_block, - addr_beat = addr_beat, - data = beat_data, - wmask = Some(beat_mask), - alloc = Bool(false)) - - val putAtomic_acquire = PutAtomic( - client_xact_id = UInt(id), - addr_block = addr_block, - addr_beat = addr_beat, - addr_byte = addr_byte, - atomic_opcode = req.cmd, - operand_size = req.typ, - data = beat_data) - - io.acquire.valid := (state === s_acquire) - io.acquire.bits := Mux(isAMO(req.cmd), putAtomic_acquire, Mux(isRead(req.cmd), get_acquire, put_acquire)) - - io.replay_next := (state === s_grant) || io.resp.valid && !io.resp.ready + io.replay_next := (state === s_mem_ack) || io.resp.valid && !io.resp.ready io.resp.valid := (state === s_resp) io.resp.bits := req io.resp.bits.has_data := isRead(req.cmd) @@ -245,30 +125,26 @@ class IOMSHR(id: Int)(implicit p: Parameters) extends L1HellaCacheModule()(p) { when (io.req.fire()) { req := io.req.bits - state := s_acquire + state := s_mem_access } - when (io.acquire.fire()) { - state := s_grant + when (io.mem_access.fire()) { + state := s_mem_ack } - when (state === s_grant && io.grant.valid) { + when (state === s_mem_ack && io.mem_ack.valid) { state := s_resp when (isRead(req.cmd)) { - grant_word := wordFromBeat(req.addr, io.grant.bits.data) + grant_word := wordFromBeat(req.addr, io.mem_ack.bits.data) } } when (io.resp.fire()) { - state := s_finish - } - - when (io.finish.fire()) { state := s_idle } } -class MSHR(id: Int)(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCacheModule()(p) { +class MSHR(id: Int, edge: TLEdgeOut)(implicit cfg: DCacheConfig, p: Parameters) extends L1HellaCacheModule()(p) { val io = new Bundle { val req_pri_val = Bool(INPUT) val req_pri_rdy = Bool(OUTPUT) @@ -279,14 +155,15 @@ class MSHR(id: Int)(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCa val idx_match = Bool(OUTPUT) val tag = Bits(OUTPUT, tagBits) - val mem_req = Decoupled(new Acquire) + val mem_acquire = Decoupled(new TLBundleA(edge.bundle)) + val mem_grant = Valid(new TLBundleD(edge.bundle)).flip + val mem_finish = Decoupled(new TLBundleE(edge.bundle)) + val refill = new L1RefillReq().asOutput // Data is bypassed val meta_read = Decoupled(new L1MetaReadReq) val meta_write = Decoupled(new L1MetaWriteReq) val replay = Decoupled(new ReplayInternal(cfg)) - val mem_grant = Valid(new GrantFromSrc).flip - val mem_finish = Decoupled(new FinishToDst) - val wb_req = Decoupled(new WritebackReq) + val wb_req = Decoupled(new WritebackReq(edge.bundle)) val probe_rdy = Bool(OUTPUT) } @@ -296,6 +173,8 @@ class MSHR(id: Int)(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCa val new_coh_state = Reg(init=ClientMetadata.onReset) val req = Reg(new MSHRReqInternal(cfg)) val req_idx = req.addr(untagBits-1,blockOffBits) + val req_tag = req.addr >> untagBits + val req_block_addr = (req.addr >> blockOffBits) << blockOffBits val idx_match = req_idx === io.req_bits.addr(untagBits-1,blockOffBits) // We only accept secondary 
misses if we haven't yet sent an Acquire to outer memory // or if the Acquire that was sent will obtain a Grant with sufficient permissions @@ -307,9 +186,7 @@ class MSHR(id: Int)(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCa // to go from clean to dirty. val dirties_coh = Reg(Bool()) val states_before_refill = Seq(s_wb_req, s_wb_resp, s_meta_clear) - val gnt_multi_data = io.mem_grant.bits.hasMultibeatData() - val (refill_cnt, refill_count_done) = Counter(io.mem_grant.valid && gnt_multi_data, refillCycles) - val refill_done = io.mem_grant.valid && (!gnt_multi_data || refill_count_done) + val (_, _, refill_done, refill_address_inc) = edge.addr_inc(io.mem_grant) val sec_rdy = idx_match && (state.isOneOf(states_before_refill) || (state.isOneOf(s_refill_req, s_refill_resp) && @@ -320,10 +197,11 @@ class MSHR(id: Int)(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCa rpq.io.enq.bits := io.req_bits rpq.io.deq.ready := (io.replay.ready && state === s_drain_rpq) || state === s_invalid - val coh_on_grant = req.old_meta.coh.onGrant(UInt(0), UInt(0)) - //incoming = io.mem_grant.bits, - //pending = Mux(dirties_coh, M_XWR, req.cmd)) - val coh_on_hit = coh_on_grant //io.req_bits.old_meta.coh.onHit(io.req_bits.cmd) + val coh_on_grant = Mux(dirties_coh, + ClientMetadata.maximum, + req.old_meta.coh.onGrant(req.cmd, io.mem_grant.bits.param)) + val (is_hit, grow_param, coh_on_hit) = io.req_bits.old_meta.coh.onAccess(io.req_bits.cmd) + val (needs_wb, shrink_param, coh_on_wb) = io.req_bits.old_meta.coh.onCacheControl(M_FLUSH) when (state === s_drain_rpq && !rpq.io.deq.valid) { state := s_invalid @@ -339,7 +217,7 @@ class MSHR(id: Int)(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCa state := s_meta_write_req new_coh_state := coh_on_grant } - when (io.mem_req.fire()) { // s_refill_req + when (io.mem_acquire.fire()) { // s_refill_req state := s_refill_resp } when (state === s_meta_clear && io.meta_write.ready) { @@ -349,7 +227,7 @@ class MSHR(id: Int)(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCa state := s_meta_clear } when (io.wb_req.fire()) { // s_wb_req - state := Mux(io.wb_req.bits.requiresAck(), s_wb_resp, s_meta_clear) + state := s_wb_resp } when (io.req_sec_val && io.req_sec_rdy) { // s_wb_req, s_wb_resp, s_refill_req //If we get a secondary miss that needs more permissions before we've sent @@ -361,34 +239,32 @@ class MSHR(id: Int)(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCa dirties_coh := dirties_coh || isWrite(io.req_bits.cmd) } when (io.req_pri_val && io.req_pri_rdy) { - val coh = io.req_bits.old_meta.coh req := io.req_bits dirties_coh := isWrite(io.req_bits.cmd) when (io.req_bits.tag_match) { - when(Bool(false)) { // TODO coh.isHit(io.req_bits.cmd)) { // set dirty bit + when (is_hit) { // set dirty bit state := s_meta_write_req new_coh_state := coh_on_hit }.otherwise { // upgrade permissions state := s_refill_req } }.otherwise { // writback if necessary and refill - //TODO state := Mux(coh.requiresVoluntaryWriteback(), s_wb_req, s_meta_clear) + state := Mux(needs_wb, s_wb_req, s_meta_clear) } } - val fq = Module(new FinishQueue(1)) - val g = io.mem_grant.bits + val grantackq = Module(new Queue(io.mem_finish.bits, 1)) val can_finish = state.isOneOf(s_invalid, s_refill_req) - fq.io.enq.valid := io.mem_grant.valid && g.requiresAck() && refill_done - fq.io.enq.bits := g.makeFinish() - io.mem_finish.valid := fq.io.deq.valid && can_finish - fq.io.deq.ready := io.mem_finish.ready && can_finish - io.mem_finish.bits := fq.io.deq.bits + 
grantackq.io.enq.valid := refill_done && edge.hasFollowUp(io.mem_grant.bits) + grantackq.io.enq.bits := edge.GrantAck(io.mem_grant.bits) + io.mem_finish.valid := grantackq.io.deq.valid && can_finish + io.mem_finish.bits := grantackq.io.deq.bits + grantackq.io.deq.ready := io.mem_finish.ready && can_finish io.idx_match := (state =/= s_invalid) && idx_match io.refill.way_en := req.way_en - io.refill.addr := ((req_idx << log2Ceil(refillCycles)) | refill_cnt) << rowOffBits - io.tag := req.addr >> untagBits + io.refill.addr := req_block_addr | refill_address_inc + io.tag := req_tag io.req_pri_rdy := state === s_invalid io.req_sec_rdy := sec_rdy && rpq.io.enq.ready @@ -406,16 +282,19 @@ class MSHR(id: Int)(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCa io.meta_write.bits.way_en := req.way_en io.wb_req.valid := state === s_wb_req - //TODO io.wb_req.bits := req.old_meta.coh.makeVoluntaryWriteback( - // client_xact_id = UInt(id), - // addr_block = Cat(req.old_meta.tag, req_idx)) + io.wb_req.bits.source := UInt(id) + io.wb_req.bits.tag := req.old_meta.tag + io.wb_req.bits.idx := req_idx + io.wb_req.bits.param := shrink_param io.wb_req.bits.way_en := req.way_en + io.wb_req.bits.voluntary := Bool(true) - io.mem_req.valid := state === s_refill_req && fq.io.enq.ready - //TODO io.mem_req.bits := req.old_meta.coh.makeAcquire( - // addr_block = Cat(io.tag, req_idx), - // client_xact_id = Bits(id), - // op_code = req.cmd) + io.mem_acquire.valid := state === s_refill_req && grantackq.io.enq.ready + io.mem_acquire.bits := edge.Acquire( + fromSource = UInt(id), + toAddress = Cat(io.tag, req_idx) << blockOffBits, + lgSize = lgCacheBlockBytes, + growPermissions = grow_param)._2 io.meta_read.valid := state === s_drain_rpq io.meta_read.bits.idx := req_idx @@ -432,20 +311,21 @@ class MSHR(id: Int)(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCa } } -class MSHRFile(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCacheModule()(p) { +class MSHRFile(edge: TLEdgeOut)(implicit cfg: DCacheConfig, p: Parameters) extends L1HellaCacheModule()(p) { val io = new Bundle { val req = Decoupled(new MSHRReq).flip val resp = Decoupled(new HellaCacheResp) val secondary_miss = Bool(OUTPUT) - val mem_req = Decoupled(new Acquire) + val mem_acquire = Decoupled(new TLBundleA(edge.bundle)) + val mem_grant = Valid(new TLBundleD(edge.bundle)).flip + val mem_finish = Decoupled(new TLBundleE(edge.bundle)) + val refill = new L1RefillReq().asOutput val meta_read = Decoupled(new L1MetaReadReq) val meta_write = Decoupled(new L1MetaWriteReq) val replay = Decoupled(new Replay) - val mem_grant = Valid(new GrantFromSrc).flip - val mem_finish = Decoupled(new FinishToDst) - val wb_req = Decoupled(new WritebackReq) + val wb_req = Decoupled(new WritebackReq(edge.bundle)) val probe_rdy = Bool(OUTPUT) val fence_rdy = Bool(OUTPUT) @@ -453,7 +333,7 @@ class MSHRFile(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCacheMo } // determine if the request is cacheable or not - val cacheable = addrMap.isCacheable(io.req.bits.addr) + val cacheable = edge.manager.supportsAcquireFast(io.req.bits.addr, lgCacheBlockBytes) val sdq_val = Reg(init=Bits(0, cfg.nSDQ)) val sdq_alloc_id = PriorityEncoder(~sdq_val(cfg.nSDQ-1,0)) @@ -470,13 +350,7 @@ class MSHRFile(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCacheMo val refillMux = Wire(Vec(cfg.nMSHRs, new L1RefillReq)) val meta_read_arb = Module(new Arbiter(new L1MetaReadReq, cfg.nMSHRs)) val meta_write_arb = Module(new Arbiter(new L1MetaWriteReq, cfg.nMSHRs)) - val 
mem_req_arb = Module(new LockingArbiter( - new Acquire, - cfg.nMSHRs + nIOMSHRs, - outerDataBeats, - Some((a: Acquire) => a.hasMultibeatData()))) - val mem_finish_arb = Module(new Arbiter(new FinishToDst, cfg.nMSHRs + nIOMSHRs)) - val wb_req_arb = Module(new Arbiter(new WritebackReq, cfg.nMSHRs)) + val wb_req_arb = Module(new Arbiter(new WritebackReq(edge.bundle), cfg.nMSHRs)) val replay_arb = Module(new Arbiter(new ReplayInternal(cfg), cfg.nMSHRs)) val alloc_arb = Module(new Arbiter(Bool(), cfg.nMSHRs)) @@ -487,12 +361,12 @@ class MSHRFile(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCacheMo io.fence_rdy := true io.probe_rdy := true - for (i <- 0 until cfg.nMSHRs) { - val mshr = Module(new MSHR(i)(cfg)) + val mshrs = (0 until cfg.nMSHRs) map { i => + val mshr = Module(new MSHR(i,edge)(cfg,p)) idxMatch(i) := mshr.io.idx_match tagList(i) := mshr.io.tag - wbTagList(i) := mshr.io.wb_req.bits.addr_block >> idxBits + wbTagList(i) := mshr.io.wb_req.bits.tag alloc_arb.io.in(i).valid := mshr.io.req_pri_rdy mshr.io.req_pri_val := alloc_arb.io.in(i).ready @@ -503,13 +377,10 @@ class MSHRFile(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCacheMo meta_read_arb.io.in(i) <> mshr.io.meta_read meta_write_arb.io.in(i) <> mshr.io.meta_write - mem_req_arb.io.in(i) <> mshr.io.mem_req - mem_finish_arb.io.in(i) <> mshr.io.mem_finish wb_req_arb.io.in(i) <> mshr.io.wb_req replay_arb.io.in(i) <> mshr.io.replay - mshr.io.mem_grant.valid := io.mem_grant.valid && - io.mem_grant.bits.client_xact_id === UInt(i) + mshr.io.mem_grant.valid := io.mem_grant.valid && io.mem_grant.bits.source === UInt(i) mshr.io.mem_grant.bits := io.mem_grant.bits refillMux(i) := mshr.io.refill @@ -519,14 +390,15 @@ class MSHRFile(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCacheMo when (!mshr.io.req_pri_rdy) { io.fence_rdy := false } when (!mshr.io.probe_rdy) { io.probe_rdy := false } + + mshr } + alloc_arb.io.out.ready := io.req.valid && sdq_rdy && cacheable && !idx_match io.meta_read <> meta_read_arb.io.out io.meta_write <> meta_write_arb.io.out - io.mem_req <> mem_req_arb.io.out - io.mem_finish <> mem_finish_arb.io.out io.wb_req <> wb_req_arb.io.out val mmio_alloc_arb = Module(new Arbiter(Bool(), nIOMSHRs)) @@ -535,9 +407,9 @@ class MSHRFile(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCacheMo var mmio_rdy = Bool(false) io.replay_next := Bool(false) - for (i <- 0 until nIOMSHRs) { + val mmios = (0 until nIOMSHRs) map { i => val id = cfg.nMSHRs + i - val mshr = Module(new IOMSHR(id)) + val mshr = Module(new IOMSHR(id, edge)) mmio_alloc_arb.io.in(i).valid := mshr.io.req.ready mshr.io.req.valid := mmio_alloc_arb.io.in(i).ready @@ -545,26 +417,28 @@ class MSHRFile(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCacheMo mmio_rdy = mmio_rdy || mshr.io.req.ready - mem_req_arb.io.in(id) <> mshr.io.acquire - mem_finish_arb.io.in(id) <> mshr.io.finish - - mshr.io.grant.bits := io.mem_grant.bits - mshr.io.grant.valid := io.mem_grant.valid && - io.mem_grant.bits.client_xact_id === UInt(id) + mshr.io.mem_ack.bits := io.mem_grant.bits + mshr.io.mem_ack.valid := io.mem_grant.valid && io.mem_grant.bits.source === UInt(id) resp_arb.io.in(i) <> mshr.io.resp when (!mshr.io.req.ready) { io.fence_rdy := Bool(false) } when (mshr.io.replay_next) { io.replay_next := Bool(true) } + + mshr } mmio_alloc_arb.io.out.ready := io.req.valid && !cacheable + TLArbiter.lowestFromSeq(edge, io.mem_acquire, mshrs.map(_.io.mem_acquire) ++ mmios.map(_.io.mem_access)) + TLArbiter.lowestFromSeq(edge, io.mem_finish, 
mshrs.map(_.io.mem_finish)) + io.resp <> resp_arb.io.out - io.req.ready := Mux(!cacheable, mmio_rdy, - Mux(idx_match, tag_match && sec_rdy, pri_rdy) && sdq_rdy) + io.req.ready := Mux(!cacheable, + mmio_rdy, + sdq_rdy && Mux(idx_match, tag_match && sec_rdy, pri_rdy)) io.secondary_miss := idx_match - io.refill := refillMux(io.mem_grant.bits.client_xact_id) + io.refill := refillMux(io.mem_grant.bits.source) val free_sdq = io.replay.fire() && isWrite(io.replay.bits.cmd) io.replay.bits.data := sdq(RegEnable(replay_arb.io.out.bits.sdq_id, free_sdq)) @@ -576,23 +450,21 @@ class MSHRFile(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCacheMo } } -class WritebackUnit(implicit p: Parameters) extends L1HellaCacheModule()(p) { +class WritebackUnit(edge: TLEdgeOut)(implicit p: Parameters) extends L1HellaCacheModule()(p) { val io = new Bundle { - val req = Decoupled(new WritebackReq).flip + val req = Decoupled(new WritebackReq(edge.bundle)).flip val meta_read = Decoupled(new L1MetaReadReq) val data_req = Decoupled(new L1DataReadReq) val data_resp = Bits(INPUT, encRowBits) - val release = Decoupled(new Release) + val release = Decoupled(new TLBundleC(edge.bundle)) } + val req = Reg(new WritebackReq(edge.bundle)) val active = Reg(init=Bool(false)) val r1_data_req_fired = Reg(init=Bool(false)) val r2_data_req_fired = Reg(init=Bool(false)) val data_req_cnt = Reg(init = UInt(0, width = log2Up(refillCycles+1))) //TODO Zero width - val buf_v = (if(refillCyclesPerBeat > 1) Reg(init=Bits(0, width = refillCyclesPerBeat-1)) else Bits(1)) - val beat_done = buf_v.andR - val (beat_cnt, all_beats_done) = Counter(io.release.fire(), outerDataBeats) - val req = Reg(new WritebackReq) + val (_, last_beat, all_beats_done, beat_count) = edge.count(io.release) io.release.valid := false when (active) { @@ -603,13 +475,11 @@ class WritebackUnit(implicit p: Parameters) extends L1HellaCacheModule()(p) { data_req_cnt := data_req_cnt + 1 } when (r2_data_req_fired) { - io.release.valid := beat_done - when(beat_done) { - when(!io.release.ready) { - r1_data_req_fired := false - r2_data_req_fired := false - data_req_cnt := data_req_cnt - Mux[UInt](Bool(refillCycles > 1) && r1_data_req_fired, 2, 1) - } .otherwise { if(refillCyclesPerBeat > 1) buf_v := 0 } + io.release.valid := true + when(!io.release.ready) { + r1_data_req_fired := false + r2_data_req_fired := false + data_req_cnt := data_req_cnt - Mux[UInt](Bool(refillCycles > 1) && r1_data_req_fired, 2, 1) } when(!r1_data_req_fired) { // We're done if this is the final data request and the Release can be sent @@ -620,49 +490,49 @@ class WritebackUnit(implicit p: Parameters) extends L1HellaCacheModule()(p) { when (io.req.fire()) { active := true data_req_cnt := 0 - if(refillCyclesPerBeat > 1) buf_v := 0 req := io.req.bits } io.req.ready := !active - val req_idx = req.addr_block(idxBits-1, 0) val fire = active && data_req_cnt < UInt(refillCycles) // We reissue the meta read as it sets up the mux ctrl for s2_data_muxed io.meta_read.valid := fire - io.meta_read.bits.idx := req_idx - io.meta_read.bits.tag := req.addr_block >> idxBits + io.meta_read.bits.idx := req.idx + io.meta_read.bits.tag := req.tag io.data_req.valid := fire io.data_req.bits.way_en := req.way_en io.data_req.bits.addr := (if(refillCycles > 1) - Cat(req_idx, data_req_cnt(log2Up(refillCycles)-1,0)) - else req_idx) << rowOffBits + Cat(req.idx, data_req_cnt(log2Up(refillCycles)-1,0)) + else req.idx) << rowOffBits - io.release.bits := req - io.release.bits.addr_beat := beat_cnt - io.release.bits.data := 
(if(refillCyclesPerBeat > 1) { - // If the cache rows are narrower than a TLDataBeat, - // then buffer enough data_resps to make a whole beat - val data_buf = Reg(Bits()) - when(active && r2_data_req_fired && !beat_done) { - data_buf := Cat(io.data_resp, data_buf((refillCyclesPerBeat)*encRowBits-1, encRowBits)) - buf_v := (if(refillCyclesPerBeat > 2) - Cat(UInt(1), buf_v(refillCyclesPerBeat-2,1)) - else UInt(1)) - } - Cat(io.data_resp, data_buf) - } else { io.data_resp }) + val r_address = Cat(req.tag, req.idx) << blockOffBits + val probeResponse = edge.ProbeAck( + fromSource = req.source, + toAddress = r_address, + lgSize = lgCacheBlockBytes, + reportPermissions = req.param, + data = io.data_resp) + + val voluntaryRelease = edge.Release( + fromSource = req.source, + toAddress = r_address, + lgSize = lgCacheBlockBytes, + shrinkPermissions = req.param, + data = io.data_resp)._2 + + io.release.bits := Mux(req.voluntary, voluntaryRelease, probeResponse) } -class ProbeUnit(implicit p: Parameters) extends L1HellaCacheModule()(p) { +class ProbeUnit(edge: TLEdgeOut)(implicit p: Parameters) extends L1HellaCacheModule()(p) { val io = new Bundle { - val req = Decoupled(new ProbeInternal).flip - val rep = Decoupled(new Release) + val req = Decoupled(new TLBundleB(edge.bundle)).flip + val rep = Decoupled(new TLBundleC(edge.bundle)) val meta_read = Decoupled(new L1MetaReadReq) val meta_write = Decoupled(new L1MetaWriteReq) - val wb_req = Decoupled(new WritebackReq) + val wb_req = Decoupled(new WritebackReq(edge.bundle)) val way_en = Bits(INPUT, nWays) val mshr_rdy = Bool(INPUT) val block_state = new ClientMetadata().asInput @@ -672,34 +542,42 @@ class ProbeUnit(implicit p: Parameters) extends L1HellaCacheModule()(p) { s_mshr_resp :: s_release :: s_writeback_req :: s_writeback_resp :: s_meta_write :: Nil) = Enum(UInt(), 9) val state = Reg(init=s_invalid) - val old_coh = Reg(new ClientMetadata) - val way_en = Reg(Bits()) - val req = Reg(new ProbeInternal) - val tag_matches = way_en.orR + val req = Reg(new TLBundleB(edge.bundle)) + val req_idx = req.address(idxMSB, idxLSB) + val req_tag = req.address >> untagBits + + val way_en = Reg(Bits()) + val tag_matches = way_en.orR + val old_coh = Reg(new ClientMetadata) val miss_coh = ClientMetadata.onReset val reply_coh = Mux(tag_matches, old_coh, miss_coh) - //TODO val reply = reply_coh.makeRelease(req) + val (is_dirty, report_param, new_coh) = reply_coh.onProbe(req.param) + io.req.ready := state === s_invalid io.rep.valid := state === s_release - //TODO io.rep.bits := reply + io.rep.bits := edge.ProbeAck(req, report_param) - assert(!io.rep.valid || !io.rep.bits.hasData(), - "ProbeUnit should not send releases with data") + assert(!io.rep.valid || !edge.hasData(io.rep.bits), + "ProbeUnit should not send ProbeAcks with data, WritebackUnit should handle it") io.meta_read.valid := state === s_meta_read - io.meta_read.bits.idx := req.addr_block - io.meta_read.bits.tag := req.addr_block >> idxBits + io.meta_read.bits.idx := req_idx + io.meta_read.bits.tag := req_tag io.meta_write.valid := state === s_meta_write io.meta_write.bits.way_en := way_en - io.meta_write.bits.idx := req.addr_block - io.meta_write.bits.data.tag := req.addr_block >> idxBits - //TODO io.meta_write.bits.data.coh := old_coh.onProbe(req) + io.meta_write.bits.idx := req_idx + io.meta_write.bits.data.tag := req_tag + io.meta_write.bits.data.coh := new_coh io.wb_req.valid := state === s_writeback_req - //TODO io.wb_req.bits := reply + io.wb_req.bits.source := req.source + io.wb_req.bits.idx := req_idx 
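+ // idx and tag (next line) let the WritebackUnit rebuild the victim block address
+ // as Cat(tag, idx) << blockOffBits; param carries report_param from reply_coh.onProbe,
+ // and voluntary = false below makes the WritebackUnit emit the data beats as a
+ // ProbeAck[Data] rather than a voluntary Release.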
+ io.wb_req.bits.tag := req_tag + io.wb_req.bits.param := report_param io.wb_req.bits.way_en := way_en + io.wb_req.bits.voluntary := Bool(false) // state === s_invalid when (io.req.fire()) { @@ -718,16 +596,14 @@ class ProbeUnit(implicit p: Parameters) extends L1HellaCacheModule()(p) { } when (state === s_mshr_req) { - state := s_mshr_resp old_coh := io.block_state way_en := io.way_en // if the read didn't go through, we need to retry - when (!io.mshr_rdy) { state := s_meta_read } + state := Mux(io.mshr_rdy, s_mshr_resp, s_meta_read) } when (state === s_mshr_resp) { - val needs_writeback = tag_matches // TODO && old_coh.requiresVoluntaryWriteback() - state := Mux(needs_writeback, s_writeback_req, s_release) + state := Mux(tag_matches && is_dirty, s_writeback_req, s_release) } when (state === s_release && io.rep.ready) { @@ -796,18 +672,17 @@ class DataArray(implicit p: Parameters) extends L1HellaCacheModule()(p) { io.write.ready := Bool(true) } -class HellaCache(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCacheModule()(p) { - val io = new Bundle { - val cpu = (new HellaCacheIO).flip - val ptw = new TLBPTWIO() - val mem = new ClientTileLinkIO - } - +class NonBlockingDCache(cfg: DCacheConfig)(implicit p: Parameters) extends HellaCache(cfg)(p) { + override lazy val module = new NonBlockingDCacheModule(this) +} + +class NonBlockingDCacheModule(outer: NonBlockingDCache)(implicit p: Parameters) extends HellaCacheModule(outer)(p) { + require(isPow2(nWays)) // TODO: relax this - val wb = Module(new WritebackUnit) - val prober = Module(new ProbeUnit) - val mshrs = Module(new MSHRFile(cfg)) + val wb = Module(new WritebackUnit(edge)) + val prober = Module(new ProbeUnit(edge)) + val mshrs = Module(new MSHRFile(edge)) io.cpu.req.ready := Bool(true) val s1_valid = Reg(next=io.cpu.req.fire(), init=Bool(false)) @@ -990,7 +865,7 @@ class HellaCache(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCache mshrs.io.req.bits.way_en := Mux(s2_tag_match, s2_tag_match_way, s2_replaced_way_en) mshrs.io.req.bits.data := s2_req.data when (mshrs.io.req.fire()) { replacer.miss } - io.mem.acquire <> mshrs.io.mem_req + tl_out.a <> mshrs.io.mem_acquire // replays readArb.io.in(1).valid := mshrs.io.replay.valid @@ -1002,15 +877,9 @@ class HellaCache(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCache metaWriteArb.io.in(0) <> mshrs.io.meta_write // probes and releases - val releaseArb = Module(new LockingArbiter( - new Release, 2, outerDataBeats, - Some((r: Release) => r.hasMultibeatData()))) - io.mem.release <> releaseArb.io.out - - prober.io.req.valid := io.mem.probe.valid && !lrsc_valid - io.mem.probe.ready := prober.io.req.ready && !lrsc_valid - prober.io.req.bits := io.mem.probe.bits - releaseArb.io.in(1) <> prober.io.rep + prober.io.req.valid := tl_out.b.valid && !lrsc_valid + tl_out.b.ready := prober.io.req.ready && !lrsc_valid + prober.io.req.bits := tl_out.b.bits prober.io.way_en := s2_tag_match_way prober.io.block_state := s2_hit_state metaReadArb.io.in(2) <> prober.io.meta_read @@ -1018,32 +887,32 @@ class HellaCache(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCache prober.io.mshr_rdy := mshrs.io.probe_rdy // refills - val narrow_grant = FlowThroughSerializer(io.mem.grant, refillCyclesPerBeat) - mshrs.io.mem_grant.valid := narrow_grant.fire() - mshrs.io.mem_grant.bits := narrow_grant.bits - narrow_grant.ready := writeArb.io.in(1).ready || !narrow_grant.bits.hasData() + val grant_has_data = edge.hasData(tl_out.d.bits) + mshrs.io.mem_grant.valid := tl_out.d.fire() + 
mshrs.io.mem_grant.bits := tl_out.d.bits + tl_out.d.ready := writeArb.io.in(1).ready || !grant_has_data /* The last clause here is necessary in order to prevent the responses for * the IOMSHRs from being written into the data array. It works because the * IOMSHR ids start right the ones for the regular MSHRs. */ - writeArb.io.in(1).valid := narrow_grant.valid && narrow_grant.bits.hasData() && - narrow_grant.bits.client_xact_id < UInt(cfg.nMSHRs) + writeArb.io.in(1).valid := tl_out.d.valid && grant_has_data && + tl_out.d.bits.source < UInt(cfg.nMSHRs) writeArb.io.in(1).bits.addr := mshrs.io.refill.addr writeArb.io.in(1).bits.way_en := mshrs.io.refill.way_en writeArb.io.in(1).bits.wmask := ~UInt(0, rowWords) - writeArb.io.in(1).bits.data := narrow_grant.bits.data(encRowBits-1,0) + writeArb.io.in(1).bits.data := tl_out.d.bits.data(encRowBits-1,0) data.io.read <> readArb.io.out - readArb.io.out.ready := !narrow_grant.valid || narrow_grant.ready // insert bubble if refill gets blocked - io.mem.finish <> mshrs.io.mem_finish + readArb.io.out.ready := !tl_out.d.valid || tl_out.d.ready // insert bubble if refill gets blocked + tl_out.e <> mshrs.io.mem_finish // writebacks - val wbArb = Module(new Arbiter(new WritebackReq, 2)) + val wbArb = Module(new Arbiter(new WritebackReq(edge.bundle), 2)) wbArb.io.in(0) <> prober.io.wb_req wbArb.io.in(1) <> mshrs.io.wb_req wb.io.req <> wbArb.io.out metaReadArb.io.in(3) <> wb.io.meta_read readArb.io.in(2) <> wb.io.data_req wb.io.data_resp := s2_data_corrected - releaseArb.io.in(0) <> wb.io.release + TLArbiter.lowest(edge, tl_out.c, wb.io.release, prober.io.rep) // store->load bypassing val s4_valid = Reg(next=s3_valid, init=Bool(false)) @@ -1115,138 +984,3 @@ class HellaCache(cfg: DCacheConfig)(implicit p: Parameters) extends L1HellaCache io.cpu.ordered := mshrs.io.fence_rdy && !s1_valid && !s2_valid io.cpu.replay_next := (s1_replay && s1_read) || mshrs.io.replay_next } - -/** - * This module buffers requests made by the SimpleHellaCacheIF in case they - * are nacked. Nacked requests must be replayed in order, and no other requests - * must be allowed to go through until the replayed requests are successfully - * completed. - */ -class SimpleHellaCacheIFReplayQueue(depth: Int) - (implicit val p: Parameters) extends Module - with HasL1HellaCacheParameters { - val io = new Bundle { - val req = Decoupled(new HellaCacheReq).flip - val nack = Valid(Bits(width = coreDCacheReqTagBits)).flip - val resp = Valid(new HellaCacheResp).flip - val replay = Decoupled(new HellaCacheReq) - } - - // Registers to store the sent request - // When a request is sent the first time, - // it is stored in one of the reqs registers - // and the corresponding inflight bit is set. - // The reqs register will be deallocated once the request is - // successfully completed. - val inflight = Reg(init = UInt(0, depth)) - val reqs = Reg(Vec(depth, new HellaCacheReq)) - - // The nack queue stores the index of nacked requests (in the reqs vector) - // in the order that they were nacked. A request is enqueued onto nackq - // when it is newly nacked (i.e. not a nack for a previous replay). - // The head of the nack queue will be replayed until it is - // successfully completed, at which time the request is dequeued. - // No new requests will be made or other replays attempted until the head - // of the nackq is successfully completed. 
- val nackq = Module(new Queue(UInt(width = log2Up(depth)), depth)) - val replaying = Reg(init = Bool(false)) - - val next_inflight_onehot = PriorityEncoderOH(~inflight) - val next_inflight = OHToUInt(next_inflight_onehot) - - val next_replay = nackq.io.deq.bits - val next_replay_onehot = UIntToOH(next_replay) - val next_replay_req = reqs(next_replay) - - // Keep sending the head of the nack queue until it succeeds - io.replay.valid := nackq.io.deq.valid && !replaying - io.replay.bits := next_replay_req - // Don't allow new requests if there is are replays waiting - // or something being nacked. - io.req.ready := !inflight.andR && !nackq.io.deq.valid && !io.nack.valid - - // Match on the tags to determine the index of nacks or responses - val nack_onehot = Cat(reqs.map(_.tag === io.nack.bits).reverse) & inflight - val resp_onehot = Cat(reqs.map(_.tag === io.resp.bits.tag).reverse) & inflight - - val replay_complete = io.resp.valid && replaying && io.resp.bits.tag === next_replay_req.tag - val nack_head = io.nack.valid && nackq.io.deq.valid && io.nack.bits === next_replay_req.tag - - // Enqueue to the nack queue if there is a nack that is not in response to - // the previous replay - nackq.io.enq.valid := io.nack.valid && !nack_head - nackq.io.enq.bits := OHToUInt(nack_onehot) - assert(!nackq.io.enq.valid || nackq.io.enq.ready, - "SimpleHellaCacheIF: ReplayQueue nack queue overflow") - - // Dequeue from the nack queue if the last replay was successfully completed - nackq.io.deq.ready := replay_complete - assert(!nackq.io.deq.ready || nackq.io.deq.valid, - "SimpleHellaCacheIF: ReplayQueue nack queue underflow") - - // Set inflight bit when a request is made - // Clear it when it is successfully completed - inflight := (inflight | Mux(io.req.fire(), next_inflight_onehot, UInt(0))) & - ~Mux(io.resp.valid, resp_onehot, UInt(0)) - - when (io.req.fire()) { - reqs(next_inflight) := io.req.bits - } - - // Only one replay outstanding at a time - when (io.replay.fire()) { replaying := Bool(true) } - when (nack_head || replay_complete) { replaying := Bool(false) } -} - -// exposes a sane decoupled request interface -class SimpleHellaCacheIF(implicit p: Parameters) extends Module -{ - val io = new Bundle { - val requestor = new HellaCacheIO().flip - val cache = new HellaCacheIO - } - - val replayq = Module(new SimpleHellaCacheIFReplayQueue(2)) - val req_arb = Module(new Arbiter(new HellaCacheReq, 2)) - - val req_helper = DecoupledHelper( - req_arb.io.in(1).ready, - replayq.io.req.ready, - io.requestor.req.valid) - - req_arb.io.in(0) <> replayq.io.replay - req_arb.io.in(1).valid := req_helper.fire(req_arb.io.in(1).ready) - req_arb.io.in(1).bits := io.requestor.req.bits - io.requestor.req.ready := req_helper.fire(io.requestor.req.valid) - replayq.io.req.valid := req_helper.fire(replayq.io.req.ready) - replayq.io.req.bits := io.requestor.req.bits - - val s0_req_fire = io.cache.req.fire() - val s1_req_fire = Reg(next = s0_req_fire) - val s2_req_fire = Reg(next = s1_req_fire) - val s1_req_tag = Reg(next = io.cache.req.bits.tag) - val s2_req_tag = Reg(next = s1_req_tag) - val s2_kill = Reg(next = io.cache.s1_kill) - - io.cache.invalidate_lr := io.requestor.invalidate_lr - io.cache.req <> req_arb.io.out - io.cache.s1_kill := io.cache.s2_nack - io.cache.s1_data := RegEnable(req_arb.io.out.bits.data, s0_req_fire) - - replayq.io.nack.valid := (io.cache.s2_nack || s2_kill) && s2_req_fire - replayq.io.nack.bits := s2_req_tag - replayq.io.resp := io.cache.resp - io.requestor.resp := io.cache.resp - - 
assert(!Reg(next = io.cache.req.fire()) || - !(io.cache.xcpt.ma.ld || io.cache.xcpt.ma.st || - io.cache.xcpt.pf.ld || io.cache.xcpt.pf.st), - "SimpleHellaCacheIF exception") -} - -object HellaCache { - def apply(cfg: DCacheConfig)(implicit p: Parameters) = LazyModule(new DCache) - // TODO convert non-blocking cache - // if (cfg.nMSHRs == 0) Module(new DCache()).io - // else Module(new HellaCache(cfg)).io -} diff --git a/src/main/scala/uncore/tilelink2/Arbiter.scala b/src/main/scala/uncore/tilelink2/Arbiter.scala index 3ef0ca7e..a42ca60c 100644 --- a/src/main/scala/uncore/tilelink2/Arbiter.scala +++ b/src/main/scala/uncore/tilelink2/Arbiter.scala @@ -13,6 +13,14 @@ object TLArbiter val lowestIndexFirst: Policy = (valids, granted) => valids.scanLeft(Bool(true))(_ && !_).init + def lowestFromSeq[T <: TLChannel](edge: TLEdge, sink: DecoupledIO[T], sources: Seq[DecoupledIO[T]]) { + apply(lowestIndexFirst)(sink, sources.map(s => (edge.numBeats1(s.bits), s)):_*) + } + + def lowest[T <: TLChannel](edge: TLEdge, sink: DecoupledIO[T], sources: DecoupledIO[T]*) { + apply(lowestIndexFirst)(sink, sources.toList.map(s => (edge.numBeats1(s.bits), s)):_*) + } + def apply[T <: Data](policy: Policy)(sink: DecoupledIO[T], sources: (UInt, DecoupledIO[T])*) { if (sources.isEmpty) { sink.valid := Bool(false) diff --git a/src/main/scala/uncore/tilelink2/Broadcast.scala b/src/main/scala/uncore/tilelink2/Broadcast.scala index cbd4a629..8bc40111 100644 --- a/src/main/scala/uncore/tilelink2/Broadcast.scala +++ b/src/main/scala/uncore/tilelink2/Broadcast.scala @@ -136,11 +136,9 @@ class TLBroadcast(lineBytes: Int, numTrackers: Int = 4, bufferless: Boolean = fa putfull.bits := edgeOut.Put(Cat(put_what, in.c.bits.source), in.c.bits.address, in.c.bits.size, in.c.bits.data)._2 // Combine ReleaseAck or the modified D - TLArbiter(TLArbiter.lowestIndexFirst)(in.d, (UInt(0), releaseack), (edgeOut.numBeats1(d_normal.bits), d_normal)) + TLArbiter.lowest(edgeOut, in.d, releaseack, d_normal) // Combine the PutFull with the trackers - TLArbiter(TLArbiter.lowestIndexFirst)(out.a, - ((edgeOut.numBeats1(putfull.bits), putfull) +: - trackers.map { t => (edgeOut.numBeats1(t.out_a.bits), t.out_a) }):_*) + TLArbiter.lowestFromSeq(edgeOut, out.a, putfull +: trackers.map(_.out_a)) // The Probe FSM walks all caches and probes them val probe_todo = RegInit(UInt(0, width = max(1, caches.size))) diff --git a/src/main/scala/uncore/tilelink2/Edges.scala b/src/main/scala/uncore/tilelink2/Edges.scala index f2c6af63..7ce91a8d 100644 --- a/src/main/scala/uncore/tilelink2/Edges.scala +++ b/src/main/scala/uncore/tilelink2/Edges.scala @@ -191,27 +191,32 @@ class TLEdge( def first(bits: TLChannel, fire: Bool): Bool = firstlastHelper(bits, fire)._1 def first(x: DecoupledIO[TLChannel]): Bool = first(x.bits, x.fire()) + def first(x: ValidIO[TLChannel]): Bool = first(x.bits, x.valid) def last(bits: TLChannel, fire: Bool): Bool = firstlastHelper(bits, fire)._2 def last(x: DecoupledIO[TLChannel]): Bool = last(x.bits, x.fire()) + def last(x: ValidIO[TLChannel]): Bool = last(x.bits, x.valid) def firstlast(bits: TLChannel, fire: Bool): (Bool, Bool, Bool) = { val r = firstlastHelper(bits, fire) (r._1, r._2, r._3) } def firstlast(x: DecoupledIO[TLChannel]): (Bool, Bool, Bool) = firstlast(x.bits, x.fire()) + def firstlast(x: ValidIO[TLChannel]): (Bool, Bool, Bool) = firstlast(x.bits, x.valid) def count(bits: TLChannel, fire: Bool): (Bool, Bool, Bool, UInt) = { val r = firstlastHelper(bits, fire) (r._1, r._2, r._3, r._4) } def count(x: DecoupledIO[TLChannel]): 
(Bool, Bool, Bool, UInt) = count(x.bits, x.fire()) + def count(x: ValidIO[TLChannel]): (Bool, Bool, Bool, UInt) = count(x.bits, x.valid) def addr_inc(bits: TLChannel, fire: Bool): (Bool, Bool, Bool, UInt) = { val r = firstlastHelper(bits, fire) (r._1, r._2, r._3, r._4 << log2Ceil(manager.beatBytes)) } def addr_inc(x: DecoupledIO[TLChannel]): (Bool, Bool, Bool, UInt) = addr_inc(x.bits, x.fire()) + def addr_inc(x: ValidIO[TLChannel]): (Bool, Bool, Bool, UInt) = addr_inc(x.bits, x.valid) } class TLEdgeOut(