From 438abc76d2edaf73ee6fb8bb354871e8f9df3326 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Wed, 5 Jul 2017 23:40:52 -0700 Subject: [PATCH 1/4] Handle TL errors in L1 I$ Cache the error bit in the tag array; report precisely on access. --- src/main/scala/rocket/Frontend.scala | 5 ++-- src/main/scala/rocket/ICache.scala | 34 +++++++++++++++++++++------- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/src/main/scala/rocket/Frontend.scala b/src/main/scala/rocket/Frontend.scala index 44f51d32..7a341047 100644 --- a/src/main/scala/rocket/Frontend.scala +++ b/src/main/scala/rocket/Frontend.scala @@ -168,12 +168,13 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) fq.io.enq.bits.pc := s2_pc io.cpu.npc := ~(~Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc) | (coreInstBytes-1)) // discard LSB(s) - fq.io.enq.bits.data := icache.io.resp.bits + fq.io.enq.bits.data := icache.io.resp.bits.data fq.io.enq.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes)) - fq.io.enq.bits.xcpt := s2_tlb_resp fq.io.enq.bits.replay := icache.io.s2_kill && !icache.io.resp.valid && !s2_xcpt fq.io.enq.bits.btb.valid := s2_btb_resp_valid fq.io.enq.bits.btb.bits := s2_btb_resp_bits + fq.io.enq.bits.xcpt := s2_tlb_resp + when (icache.io.resp.valid && icache.io.resp.bits.ae) { fq.io.enq.bits.xcpt.ae.inst := true } io.cpu.resp <> fq.io.deq diff --git a/src/main/scala/rocket/ICache.scala b/src/main/scala/rocket/ICache.scala index dfb240f4..57368a8c 100644 --- a/src/main/scala/rocket/ICache.scala +++ b/src/main/scala/rocket/ICache.scala @@ -58,6 +58,13 @@ class ICache(val icacheParams: ICacheParams, val hartid: Int)(implicit p: Parame } } +class ICacheResp(outer: ICache) extends Bundle { + val data = UInt(width = outer.icacheParams.fetchBytes*8) + val ae = Bool() + + override def cloneType = new ICacheResp(outer).asInstanceOf[this.type] +} + class ICacheBundle(outer: ICache) extends 
CoreBundle()(outer.p) { val hartid = UInt(INPUT, hartIdLen) val req = Decoupled(new ICacheReq).flip @@ -66,7 +73,7 @@ class ICacheBundle(outer: ICache) extends CoreBundle()(outer.p) { val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission - val resp = Valid(UInt(width = outer.icacheParams.fetchBytes*8)) + val resp = Valid(new ICacheResp(outer)) val invalidate = Bool(INPUT) val tl_out = outer.masterNode.bundleOut val tl_in = outer.slaveNode.map(_.bundleIn) @@ -145,15 +152,18 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) v } - val tag_array = SeqMem(nSets, Vec(nWays, Bits(width = tECC.width(tagBits)))) + val tag_array = SeqMem(nSets, Vec(nWays, UInt(width = tECC.width(1 + tagBits)))) val tag_rdata = tag_array.read(s0_vaddr(untagBits-1,blockOffBits), !refill_done && s0_valid) + val accruedRefillError = Reg(Bool()) + val refillError = tl_out.d.bits.error || (refill_cnt > 0 && accruedRefillError) when (refill_done) { - val tag = tECC.encode(refill_tag) - tag_array.write(refill_idx, Vec.fill(nWays)(tag), Vec.tabulate(nWays)(repl_way === _)) + val encTag = tECC.encode(Cat(refillError, refill_tag)) + tag_array.write(refill_idx, Vec.fill(nWays)(encTag), Seq.tabulate(nWays)(repl_way === _)) } val vb_array = Reg(init=Bits(0, nSets*nWays)) when (tl_out.d.fire()) { + accruedRefillError := refillError // clear bit when refill starts so hit-under-miss doesn't fetch bad data vb_array := vb_array.bitSet(Cat(repl_way, refill_idx), refill_done && !invalidated) } @@ -164,6 +174,7 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) } val s1_tag_disparity = Wire(Vec(nWays, Bool())) + val s1_tlError = Wire(Vec(nWays, Bool())) val wordBits = outer.icacheParams.fetchBytes*8 val s1_dout = Wire(Vec(nWays, UInt(width = dECC.width(wordBits)))) @@ -179,8 +190,12 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) lineInScratchpad(scratchpadLine(s1s3_slaveAddr)) && 
scratchpadWay(s1s3_slaveAddr) === i, addrInScratchpad(io.s1_paddr) && scratchpadWay(io.s1_paddr) === i) val s1_vb = vb_array(Cat(UInt(i), s1_idx)) && !s1_slaveValid - s1_tag_disparity(i) := s1_vb && tECC.decode(tag_rdata(i)).error - s1_tag_hit(i) := scratchpadHit || (s1_vb && tECC.decode(tag_rdata(i)).uncorrected === s1_tag) + val encTag = tECC.decode(tag_rdata(i)) + val (tlError, tag) = Split(encTag.uncorrected, tagBits) + val tagMatch = s1_vb && tag === s1_tag + s1_tag_disparity(i) := s1_vb && encTag.error + s1_tlError(i) := tagMatch && tlError.toBool + s1_tag_hit(i) := tagMatch || scratchpadHit } assert(!(s1_valid || s1_slaveValid) || PopCount(s1_tag_hit zip s1_tag_disparity map { case (h, d) => h && !d }) <= 1) @@ -212,7 +227,8 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) require(tECC.isInstanceOf[uncore.util.IdentityCode]) require(dECC.isInstanceOf[uncore.util.IdentityCode]) require(outer.icacheParams.itimAddr.isEmpty) - io.resp.bits := Mux1H(s1_tag_hit, s1_dout) + io.resp.bits.data := Mux1H(s1_tag_hit, s1_dout) + io.resp.bits.ae := s1_tlError.asUInt.orR io.resp.valid := s1_valid && s1_hit case 2 => @@ -221,11 +237,13 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) val s2_way_mux = Mux1H(s2_tag_hit, s2_dout) val s2_tag_disparity = RegEnable(s1_tag_disparity, s1_valid || s1_slaveValid).asUInt.orR + val s2_tlError = RegEnable(s1_tlError.asUInt.orR, s1_valid || s1_slaveValid) val s2_data_decoded = dECC.decode(s2_way_mux) val s2_disparity = s2_tag_disparity || s2_data_decoded.error when (s2_valid && s2_disparity) { invalidate := true } - io.resp.bits := s2_data_decoded.uncorrected + io.resp.bits.data := s2_data_decoded.uncorrected + io.resp.bits.ae := s2_tlError io.resp.valid := s2_valid && s2_hit && !s2_disparity tl_in.map { tl => From 90a7d6a34304bd3ca27206511f1cb727ad803760 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Wed, 5 Jul 2017 23:53:52 -0700 Subject: [PATCH 2/4] Add L2 TLB option --- 
src/main/scala/rocket/DCache.scala | 1 + src/main/scala/rocket/NBDcache.scala | 1 + src/main/scala/rocket/PTW.scala | 70 ++++++++++++++++++++++++++-- src/main/scala/rocket/Rocket.scala | 4 +- src/main/scala/rocket/TLB.scala | 2 + src/main/scala/tile/Core.scala | 1 + 6 files changed, 73 insertions(+), 6 deletions(-) diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index 993014e2..98e89a9f 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -146,6 +146,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { tlb.io.req.bits.sfence.bits.rs1 := s1_req.typ(0) tlb.io.req.bits.sfence.bits.rs2 := s1_req.typ(1) tlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data.data + tlb.io.req.bits.sfence.bits.addr := s1_req.addr tlb.io.req.bits.passthrough := s1_req.phys tlb.io.req.bits.vaddr := s1_req.addr tlb.io.req.bits.instruction := false diff --git a/src/main/scala/rocket/NBDcache.scala b/src/main/scala/rocket/NBDcache.scala index aad94ef4..8d5612d3 100644 --- a/src/main/scala/rocket/NBDcache.scala +++ b/src/main/scala/rocket/NBDcache.scala @@ -708,6 +708,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule dtlb.io.req.bits.sfence.valid := s1_sfence dtlb.io.req.bits.sfence.bits.rs1 := s1_req.typ(0) dtlb.io.req.bits.sfence.bits.rs2 := s1_req.typ(1) + dtlb.io.req.bits.sfence.bits.addr := s1_req.addr dtlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data.data dtlb.io.req.bits.passthrough := s1_req.phys dtlb.io.req.bits.vaddr := s1_req.addr diff --git a/src/main/scala/rocket/PTW.scala b/src/main/scala/rocket/PTW.scala index 37ec12f7..60484eca 100644 --- a/src/main/scala/rocket/PTW.scala +++ b/src/main/scala/rocket/PTW.scala @@ -11,6 +11,7 @@ import coreplex.CacheBlockBytes import uncore.constants._ import uncore.tilelink2._ import util._ +import uncore.util.ParityCode import scala.collection.mutable.ListBuffer @@ -37,7 +38,7 @@ class TLBPTWIO(implicit p: Parameters) 
extends CoreBundle()(p) class DatapathPTWIO(implicit p: Parameters) extends CoreBundle()(p) with HasRocketCoreParameters { val ptbr = new PTBR().asInput - val invalidate = Bool(INPUT) + val sfence = Valid(new SFenceReq).flip val status = new MStatus().asInput val pmp = Vec(nPMPs, new PMP).asInput } @@ -125,17 +126,69 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()( data(r) := pte.ppn } when (hit && state === s_req) { plru.access(OHToUInt(hits)) } - when (io.dpath.invalidate) { valid := 0 } + when (io.dpath.sfence.valid && !io.dpath.sfence.bits.rs1) { valid := 0 } (hit && count < pgLevels-1, Mux1H(hits, data)) } + + val l2_refill = RegNext(false.B) + val (l2_hit, l2_pte) = if (coreParams.nL2TLBEntries == 0) (false.B, Wire(new PTE)) else { + class Entry extends Bundle { + val ppn = UInt(width = ppnBits) + val d = Bool() + val a = Bool() + val u = Bool() + val x = Bool() + val w = Bool() + val r = Bool() + } + + val code = new ParityCode + require(isPow2(coreParams.nL2TLBEntries)) + val idxBits = log2Ceil(coreParams.nL2TLBEntries) + val tagBits = vpnBits - idxBits + val ram = SeqMem(coreParams.nL2TLBEntries, UInt(width = code.width(new Entry().getWidth + tagBits))) + val g = Reg(UInt(width = coreParams.nL2TLBEntries)) + val valid = RegInit(UInt(0, coreParams.nL2TLBEntries)) + val (r_tag, r_idx) = Split(r_req.addr, idxBits) + when (l2_refill) { + val entry = Wire(new Entry) + entry := r_pte + ram.write(r_idx, code.encode(Cat(entry.asUInt, r_tag))) + + val mask = UIntToOH(r_idx) + valid := valid | mask + g := Mux(r_pte.g, g | mask, g & ~mask) + } + when (io.dpath.sfence.valid) { + valid := + Mux(io.dpath.sfence.bits.rs1, valid & ~UIntToOH(io.dpath.sfence.bits.addr(idxBits+pgIdxBits-1, pgIdxBits)), + Mux(io.dpath.sfence.bits.rs2, valid & g, 0.U)) + } + + val s0_valid = !l2_refill && arb.io.out.fire() + val s1_valid = RegNext(s0_valid) + val s2_valid = RegNext(s1_valid && valid(r_idx)) + val s1_rdata = 
ram.read(arb.io.out.bits.addr(idxBits-1, 0), s0_valid) + val s2_rdata = code.decode(RegEnable(s1_rdata, s1_valid)) + when (s2_valid && s2_rdata.error) { valid := 0.U } + + val (s2_entry, s2_tag) = Split(s2_rdata.uncorrected, tagBits) + val s2_hit = s2_valid && !s2_rdata.error && r_tag === s2_tag + val s2_pte = Wire(new PTE) + s2_pte := s2_entry.asTypeOf(new Entry) + s2_pte.g := g(r_idx) + s2_pte.v := true + + (s2_hit, s2_pte) + } - io.mem.req.valid := state === s_req + io.mem.req.valid := state === s_req && !l2_hit io.mem.req.bits.phys := Bool(true) io.mem.req.bits.cmd := M_XRD io.mem.req.bits.typ := log2Ceil(xLen/8) io.mem.req.bits.addr := pte_addr - io.mem.s1_kill := s1_kill + io.mem.s1_kill := s1_kill || l2_hit io.mem.invalidate_lr := Bool(false) val pmaPgLevelHomogeneous = (0 until pgLevels) map { i => @@ -159,7 +212,7 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()( // control state machine switch (state) { is (s_ready) { - when (arb.io.out.valid) { + when (arb.io.out.fire()) { state := s_req } count := UInt(0) @@ -186,6 +239,7 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()( state := s_req count := count + 1 }.otherwise { + l2_refill := pte.v && !invalid_paddr && count === pgLevels-1 resp_ae := pte.v && invalid_paddr state := s_ready resp_valid(r_req_dest) := true @@ -198,6 +252,12 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()( } } } + when (l2_hit) { + state := s_ready + resp_valid(r_req_dest) := true + resp_ae := false + r_pte := l2_pte + } } /** Mix-ins for constructing tiles that might have a PTW */ diff --git a/src/main/scala/rocket/Rocket.scala b/src/main/scala/rocket/Rocket.scala index 498c4a2f..07362570 100644 --- a/src/main/scala/rocket/Rocket.scala +++ b/src/main/scala/rocket/Rocket.scala @@ -25,6 +25,7 @@ case class RocketCoreParams( nPMPs: Int = 8, nPerfCounters: Int = 0, nCustomMRWCSRs: Int = 0, + nL2TLBEntries: Int = 0, mtvecInit: 
Option[BigInt] = Some(BigInt(0)), mtvecWritable: Boolean = true, fastLoadWord: Boolean = true, @@ -588,8 +589,9 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) io.imem.sfence.valid := wb_reg_valid && wb_reg_sfence io.imem.sfence.bits.rs1 := wb_ctrl.mem_type(0) io.imem.sfence.bits.rs2 := wb_ctrl.mem_type(1) + io.imem.sfence.bits.addr := wb_reg_wdata io.imem.sfence.bits.asid := wb_reg_rs2 - io.ptw.invalidate := io.imem.sfence.valid && !io.imem.sfence.bits.rs1 + io.ptw.sfence := io.imem.sfence ibuf.io.inst(0).ready := !ctrl_stalld || csr.io.interrupt diff --git a/src/main/scala/rocket/TLB.scala b/src/main/scala/rocket/TLB.scala index f127488c..197b5f06 100644 --- a/src/main/scala/rocket/TLB.scala +++ b/src/main/scala/rocket/TLB.scala @@ -20,6 +20,7 @@ case object ASIdBits extends Field[Int] class SFenceReq(implicit p: Parameters) extends CoreBundle()(p) { val rs1 = Bool() val rs2 = Bool() + val addr = UInt(width = vaddrBits) val asid = UInt(width = asIdBits max 1) // TODO zero-width } @@ -252,6 +253,7 @@ class TLB(lgMaxSize: Int, nEntries: Int)(implicit edge: TLEdgeOut, p: Parameters } when (sfence) { + assert((io.req.bits.sfence.bits.addr >> pgIdxBits) === vpn(vpnBits-1,0)) valid := Mux(io.req.bits.sfence.bits.rs1, valid & ~hits(totalEntries-1, 0), Mux(io.req.bits.sfence.bits.rs2, valid & entries.map(_.g).asUInt, 0)) } diff --git a/src/main/scala/tile/Core.scala b/src/main/scala/tile/Core.scala index 2befafad..31279ae7 100644 --- a/src/main/scala/tile/Core.scala +++ b/src/main/scala/tile/Core.scala @@ -24,6 +24,7 @@ trait CoreParams { val retireWidth: Int val instBits: Int val nLocalInterrupts: Int + val nL2TLBEntries: Int } trait HasCoreParameters extends HasTileParameters { From be4eceec0d911c485db0fedf8104fd10fe7a11d4 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Thu, 6 Jul 2017 00:26:32 -0700 Subject: [PATCH 3/4] Fix stupid D$ probe bug The probe path previously assigned s1_nack := !releaseDone, which could drive s1_nack low when the release completed; it now only ever sets s1_nack to true, via a local probeNack wire. --- src/main/scala/rocket/DCache.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff
--git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index 98e89a9f..8717774e 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -474,7 +474,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { probe_bits.address := Cat(s2_victim_tag, s2_req.addr(idxMSB, idxLSB)) << idxLSB } when (s2_probe) { - s1_nack := true + val probeNack = Wire(init = true.B) when (s2_meta_error) { release_state := s_probe_retry }.elsewhen (s2_prb_ack_data) { @@ -485,9 +485,10 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { release_state := Mux(releaseDone, s_probe_write_meta, s_probe_rep_clean) }.otherwise { tl_out.c.valid := true - s1_nack := !releaseDone + probeNack := !releaseDone release_state := Mux(releaseDone, s_ready, s_probe_rep_miss) } + when (probeNack) { s1_nack := true } } when (release_state === s_probe_retry) { metaArb.io.in(6).valid := true From b2351c5fbf8ba84f93f792aad973836fa84136f6 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Thu, 6 Jul 2017 11:16:56 -0700 Subject: [PATCH 4/4] Use consistent casing --- src/main/scala/rocket/ICache.scala | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/main/scala/rocket/ICache.scala b/src/main/scala/rocket/ICache.scala index 57368a8c..5cb8c045 100644 --- a/src/main/scala/rocket/ICache.scala +++ b/src/main/scala/rocket/ICache.scala @@ -157,8 +157,8 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) val accruedRefillError = Reg(Bool()) val refillError = tl_out.d.bits.error || (refill_cnt > 0 && accruedRefillError) when (refill_done) { - val encTag = tECC.encode(Cat(refillError, refill_tag)) - tag_array.write(refill_idx, Vec.fill(nWays)(encTag), Seq.tabulate(nWays)(repl_way === _)) + val enc_tag = tECC.encode(Cat(refillError, refill_tag)) + tag_array.write(refill_idx, Vec.fill(nWays)(enc_tag), Seq.tabulate(nWays)(repl_way === _)) } val vb_array = Reg(init=Bits(0, 
nSets*nWays)) @@ -174,7 +174,7 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) } val s1_tag_disparity = Wire(Vec(nWays, Bool())) - val s1_tlError = Wire(Vec(nWays, Bool())) + val s1_tl_error = Wire(Vec(nWays, Bool())) val wordBits = outer.icacheParams.fetchBytes*8 val s1_dout = Wire(Vec(nWays, UInt(width = dECC.width(wordBits)))) @@ -190,11 +190,11 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) lineInScratchpad(scratchpadLine(s1s3_slaveAddr)) && scratchpadWay(s1s3_slaveAddr) === i, addrInScratchpad(io.s1_paddr) && scratchpadWay(io.s1_paddr) === i) val s1_vb = vb_array(Cat(UInt(i), s1_idx)) && !s1_slaveValid - val encTag = tECC.decode(tag_rdata(i)) - val (tlError, tag) = Split(encTag.uncorrected, tagBits) + val enc_tag = tECC.decode(tag_rdata(i)) + val (tl_error, tag) = Split(enc_tag.uncorrected, tagBits) val tagMatch = s1_vb && tag === s1_tag - s1_tag_disparity(i) := s1_vb && encTag.error - s1_tlError(i) := tagMatch && tlError.toBool + s1_tag_disparity(i) := s1_vb && enc_tag.error + s1_tl_error(i) := tagMatch && tl_error.toBool s1_tag_hit(i) := tagMatch || scratchpadHit } assert(!(s1_valid || s1_slaveValid) || PopCount(s1_tag_hit zip s1_tag_disparity map { case (h, d) => h && !d }) <= 1) @@ -228,7 +228,7 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) require(dECC.isInstanceOf[uncore.util.IdentityCode]) require(outer.icacheParams.itimAddr.isEmpty) io.resp.bits.data := Mux1H(s1_tag_hit, s1_dout) - io.resp.bits.ae := s1_tlError.asUInt.orR + io.resp.bits.ae := s1_tl_error.asUInt.orR io.resp.valid := s1_valid && s1_hit case 2 => @@ -237,13 +237,13 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) val s2_way_mux = Mux1H(s2_tag_hit, s2_dout) val s2_tag_disparity = RegEnable(s1_tag_disparity, s1_valid || s1_slaveValid).asUInt.orR - val s2_tlError = RegEnable(s1_tlError.asUInt.orR, s1_valid || s1_slaveValid) + val s2_tl_error = RegEnable(s1_tl_error.asUInt.orR, s1_valid || s1_slaveValid) val 
s2_data_decoded = dECC.decode(s2_way_mux) val s2_disparity = s2_tag_disparity || s2_data_decoded.error when (s2_valid && s2_disparity) { invalidate := true } io.resp.bits.data := s2_data_decoded.uncorrected - io.resp.bits.ae := s2_tlError + io.resp.bits.ae := s2_tl_error io.resp.valid := s2_valid && s2_hit && !s2_disparity tl_in.map { tl =>