From ecc2ee366cfb33bf026dc584ae67446fd568f6cb Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 4 Aug 2017 00:39:10 -0700 Subject: [PATCH 01/12] Shave a few gate delays off IBuf control logic It takes a while for the pipeline to compute the stall signal, so avoid using it until the last logic levels in the clock cycle. --- src/main/scala/rocket/IBuf.scala | 39 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/main/scala/rocket/IBuf.scala b/src/main/scala/rocket/IBuf.scala index 399b394d..f5c77d1f 100644 --- a/src/main/scala/rocket/IBuf.scala +++ b/src/main/scala/rocket/IBuf.scala @@ -43,24 +43,24 @@ class IBuf(implicit p: Parameters) extends CoreModule { val nIC = Mux(io.imem.bits.btb.valid && io.imem.bits.btb.bits.taken, io.imem.bits.btb.bits.bridx +& 1, UInt(fetchWidth)) - pcWordBits val nICReady = nReady - nBufValid val nValid = Mux(io.imem.valid, nIC, UInt(0)) + nBufValid - io.imem.ready := nReady >= nBufValid && (nICReady >= nIC || n >= nIC - nICReady) + io.imem.ready := io.inst(0).ready && nReady >= nBufValid && (nICReady >= nIC || n >= nIC - nICReady) if (n > 0) { - nBufValid := Mux(nReady >= nBufValid, UInt(0), nBufValid - nReady) - if (n > 1) when (nReady > 0 && nReady < nBufValid) { - val shiftedBuf = shiftInsnRight(buf.data(n*coreInstBits-1, coreInstBits), (nReady-1)(log2Ceil(n-1)-1,0)) - buf.data := Cat(buf.data(n*coreInstBits-1, (n-1)*coreInstBits), shiftedBuf((n-1)*coreInstBits-1, 0)) - buf.pc := buf.pc & ~pcWordMask | (buf.pc + (nReady << log2Ceil(coreInstBytes))) & pcWordMask - ibufBTBResp.bridx := ibufBTBResp.bridx - nReady - } - when (io.imem.valid && nReady >= nBufValid && nICReady < nIC && n >= nIC - nICReady) { - val shamt = pcWordBits + nICReady - nBufValid := nIC - nICReady - buf := io.imem.bits - buf.data := shiftInsnRight(io.imem.bits.data, shamt)(n*coreInstBits-1,0) - buf.pc := io.imem.bits.pc & ~pcWordMask | (io.imem.bits.pc + (nICReady << log2Ceil(coreInstBytes))) & pcWordMask - ibufBTBHit := io.imem.bits.btb.valid - when (io.imem.bits.btb.valid) { + when (io.inst(0).ready) { + nBufValid := Mux(nReady >= nBufValid, UInt(0), nBufValid - nReady) + if (n > 1) when (nReady > 0 && nReady < nBufValid) { + val shiftedBuf = shiftInsnRight(buf.data(n*coreInstBits-1, coreInstBits), (nReady-1)(log2Ceil(n-1)-1,0)) + buf.data := Cat(buf.data(n*coreInstBits-1, (n-1)*coreInstBits), shiftedBuf((n-1)*coreInstBits-1, 0)) + buf.pc := buf.pc & ~pcWordMask | (buf.pc + (nReady << log2Ceil(coreInstBytes))) & pcWordMask + ibufBTBResp.bridx := ibufBTBResp.bridx - nReady + } + when (io.imem.valid && nReady >= nBufValid && nICReady < nIC && n >= nIC - nICReady) { + val shamt = pcWordBits + nICReady + nBufValid := nIC - nICReady + buf := io.imem.bits + buf.data := shiftInsnRight(io.imem.bits.data, shamt)(n*coreInstBits-1,0) + buf.pc := io.imem.bits.pc & ~pcWordMask | (io.imem.bits.pc + (nICReady << log2Ceil(coreInstBytes))) & pcWordMask + ibufBTBHit := io.imem.bits.btb.valid ibufBTBResp := io.imem.bits.btb.bits ibufBTBResp.bridx := io.imem.bits.btb.bits.bridx + nICReady } @@ -97,18 +97,19 @@ class IBuf(implicit p: Parameters) extends CoreModule { if (usingCompressed) { val replay = ic_replay(j) || (!exp.io.rvc && (btbHitMask(j) || ic_replay(j+1))) - io.inst(i).valid := valid(j) && (exp.io.rvc || valid(j+1) || xcpt(j+1).asUInt.orR || replay) + val full_insn = exp.io.rvc || valid(j+1) || xcpt(j+1).asUInt.orR || replay + io.inst(i).valid := valid(j) && full_insn io.inst(i).bits.xcpt0 := xcpt(j) io.inst(i).bits.xcpt1 := Mux(exp.io.rvc, 0.U, xcpt(j+1).asUInt).asTypeOf(new FrontendExceptions) io.inst(i).bits.replay := replay io.inst(i).bits.btb_hit := btbHitMask(j) || (!exp.io.rvc && btbHitMask(j+1)) io.inst(i).bits.rvc := exp.io.rvc - when (io.inst(i).fire()) { nReady := Mux(exp.io.rvc, j+1, j+2) } + when (full_insn && (i == 0 || io.inst(i).ready)) { nReady := Mux(exp.io.rvc, j+1, j+2) } expand(i+1, Mux(exp.io.rvc, j+1, j+2), Mux(exp.io.rvc, curInst >> 16, curInst >> 32)) } else { - when (io.inst(i).ready) { nReady := i+1 } + when (i == 0 || io.inst(i).ready) { nReady := i+1 } io.inst(i).valid := valid(i) io.inst(i).bits.xcpt0 := xcpt(i) io.inst(i).bits.xcpt1 := 0.U.asTypeOf(new FrontendExceptions) From 06a831310bc5736dac534ba92f2e810f86251245 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 4 Aug 2017 00:40:18 -0700 Subject: [PATCH 02/12] Shave a gate delay off I$ backpressure path The deleted code was a holdover from Hwacha's vector fences. --- src/main/scala/rocket/RocketCore.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/rocket/RocketCore.scala b/src/main/scala/rocket/RocketCore.scala index 7151a1d5..b8d15bc5 100644 --- a/src/main/scala/rocket/RocketCore.scala +++ b/src/main/scala/rocket/RocketCore.scala @@ -596,7 +596,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) io.imem.sfence.bits.asid := wb_reg_rs2 io.ptw.sfence := io.imem.sfence - ibuf.io.inst(0).ready := !ctrl_stalld || csr.io.interrupt + ibuf.io.inst(0).ready := !ctrl_stalld io.imem.btb_update.valid := (mem_reg_replay && mem_reg_btb_hit) || (mem_reg_valid && !take_pc_wb && mem_wrong_npc && (!mem_cfi || mem_cfi_taken)) io.imem.btb_update.bits.isValid := !mem_reg_replay && mem_cfi From a45997d03f0bcf6d51f02ff7772e8073fec12046 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 4 Aug 2017 00:37:13 -0700 Subject: [PATCH 03/12] Separate I$ parity error from miss signal Handle parity errors with a pipeline flush rather than a faster frontend replay, reducing a critical path. --- src/main/scala/rocket/Frontend.scala | 2 +- src/main/scala/rocket/ICache.scala | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/scala/rocket/Frontend.scala b/src/main/scala/rocket/Frontend.scala index b4f873a9..a5b30900 100644 --- a/src/main/scala/rocket/Frontend.scala +++ b/src/main/scala/rocket/Frontend.scala @@ -149,7 +149,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) fq.io.enq.bits.data := icache.io.resp.bits.data fq.io.enq.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes)) - fq.io.enq.bits.replay := icache.io.s2_kill && !icache.io.resp.valid && !s2_xcpt + fq.io.enq.bits.replay := icache.io.resp.bits.replay || icache.io.s2_kill && !icache.io.resp.valid && !s2_xcpt fq.io.enq.bits.btb.valid := s2_btb_resp_valid fq.io.enq.bits.btb.bits := s2_btb_resp_bits fq.io.enq.bits.xcpt := s2_tlb_resp diff --git a/src/main/scala/rocket/ICache.scala b/src/main/scala/rocket/ICache.scala index d41321aa..330a7133 100644 --- a/src/main/scala/rocket/ICache.scala +++ b/src/main/scala/rocket/ICache.scala @@ -63,6 +63,7 @@ class ICache(val icacheParams: ICacheParams, val hartid: Int)(implicit p: Parame class ICacheResp(outer: ICache) extends Bundle { val data = UInt(width = outer.icacheParams.fetchBytes*8) + val replay = Bool() val ae = Bool() override def cloneType = new ICacheResp(outer).asInstanceOf[this.type] @@ -259,7 +260,8 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) io.resp.bits.data := s2_data_decoded.uncorrected io.resp.bits.ae := s2_tl_error - io.resp.valid := s2_valid && s2_hit && !s2_disparity + io.resp.bits.replay := s2_disparity + io.resp.valid := s2_valid && s2_hit tl_in.map { tl => val respValid = RegInit(false.B) From 4bfbe75d74e9ad39cefa75930c44680a47cdbe35 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 4 Aug 2017 14:59:32 -0700 Subject: [PATCH 04/12] Avoid pipeline replays when fetch queue is full --- src/main/scala/rocket/Frontend.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/scala/rocket/Frontend.scala b/src/main/scala/rocket/Frontend.scala index a5b30900..8e545708 100644 --- a/src/main/scala/rocket/Frontend.scala +++ b/src/main/scala/rocket/Frontend.scala @@ -86,6 +86,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) val fq = withReset(reset || io.cpu.req.valid) { Module(new ShiftQueue(new FrontendResp, 5, flow = true)) } val s0_valid = io.cpu.req.valid || !fq.io.mask(fq.io.mask.getWidth-3) + val s1_valid = RegNext(s0_valid) val s1_pc = Reg(UInt(width=vaddrBitsExtended)) val s1_speculative = Reg(Bool()) val s2_valid = RegInit(false.B) @@ -143,7 +144,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) icache.io.s2_kill := s2_speculative && !s2_tlb_resp.cacheable || s2_xcpt icache.io.s2_prefetch := s2_tlb_resp.prefetchable - fq.io.enq.valid := s2_valid && (icache.io.resp.valid || !s2_tlb_resp.miss && icache.io.s2_kill) + fq.io.enq.valid := RegNext(s1_valid) && s2_valid && (icache.io.resp.valid || !s2_tlb_resp.miss && icache.io.s2_kill) fq.io.enq.bits.pc := s2_pc io.cpu.npc := alignPC(Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc)) From df7f09b9ce3d6eb0a8a86368e4df03e7f4000edb Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 4 Aug 2017 14:59:56 -0700 Subject: [PATCH 05/12] Get I$ ECC check further off critical path --- src/main/scala/rocket/IBuf.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/main/scala/rocket/IBuf.scala b/src/main/scala/rocket/IBuf.scala index f5c77d1f..304cc6e1 100644 --- a/src/main/scala/rocket/IBuf.scala +++ b/src/main/scala/rocket/IBuf.scala @@ -76,10 +76,11 @@ class IBuf(implicit p: Parameters) extends CoreModule { val icMask = (~UInt(0, fetchWidth*coreInstBits) << (nBufValid << log2Ceil(coreInstBits)))(fetchWidth*coreInstBits-1,0) val inst = icData & icMask | buf.data & ~icMask - val valid = (UIntToOH(nValid) - 1)(fetchWidth-1, 0) - val bufMask = UIntToOH(nBufValid) - 1 + val valid = UIntToOH1(nValid, fetchWidth) + val bufMask = UIntToOH1(nBufValid, fetchWidth) val xcpt = (0 until bufMask.getWidth).map(i => Mux(bufMask(i), buf.xcpt, io.imem.bits.xcpt)) - val ic_replay = valid & (Mux(buf.replay, bufMask, UInt(0)) | Mux(io.imem.bits.replay, ~bufMask, UInt(0))) + val buf_replay = Mux(buf.replay, bufMask, UInt(0)) + val ic_replay = buf_replay | Mux(io.imem.bits.replay, valid & ~bufMask, UInt(0)) val ibufBTBHitMask = Mux(ibufBTBHit, UIntToOH(ibufBTBResp.bridx), UInt(0)) assert(!io.imem.valid || !io.imem.bits.btb.valid || io.imem.bits.btb.bits.bridx >= pcWordBits) val icBTBHitMask = Mux(io.imem.bits.btb.valid, UIntToOH(io.imem.bits.btb.bits.bridx +& nBufValid - pcWordBits), UInt(0)) @@ -97,7 +98,7 @@ class IBuf(implicit p: Parameters) extends CoreModule { if (usingCompressed) { val replay = ic_replay(j) || (!exp.io.rvc && (btbHitMask(j) || ic_replay(j+1))) - val full_insn = exp.io.rvc || valid(j+1) || xcpt(j+1).asUInt.orR || replay + val full_insn = exp.io.rvc || valid(j+1) || xcpt(j+1).asUInt.orR || buf_replay(j) io.inst(i).valid := valid(j) && full_insn io.inst(i).bits.xcpt0 := xcpt(j) io.inst(i).bits.xcpt1 := Mux(exp.io.rvc, 0.U, xcpt(j+1).asUInt).asTypeOf(new FrontendExceptions) From 8d9768455561573cacc56d49ce87dfa22e308ba9 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 4 Aug 2017 17:01:31 -0700 Subject: [PATCH 06/12] Fix L2 TLB perfctr It was counting conflict misses but not cold misses. --- src/main/scala/rocket/PTW.scala | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/scala/rocket/PTW.scala b/src/main/scala/rocket/PTW.scala index cec3271b..6d070074 100644 --- a/src/main/scala/rocket/PTW.scala +++ b/src/main/scala/rocket/PTW.scala @@ -176,17 +176,19 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()( val s0_valid = !l2_refill && arb.io.out.fire() val s1_valid = RegNext(s0_valid) - val s2_valid = RegNext(s1_valid && valid(r_idx)) + val s2_valid = RegNext(s1_valid) val s1_rdata = ram.read(arb.io.out.bits.addr(idxBits-1, 0), s0_valid) val s2_rdata = code.decode(RegEnable(s1_rdata, s1_valid)) - when (s2_valid && s2_rdata.error) { valid := 0.U } + val s2_valid_bit = RegEnable(valid(r_idx), s1_valid) + val s2_g = RegEnable(g(r_idx), s1_valid) + when (s2_valid && s2_valid_bit && s2_rdata.error) { valid := 0.U } val s2_entry = s2_rdata.uncorrected.asTypeOf(new Entry) - val s2_hit = s2_valid && !s2_rdata.error && r_tag === s2_entry.tag - io.dpath.perf.l2miss := s2_valid && !(r_tag === s2_entry.tag) + val s2_hit = s2_valid && s2_valid_bit && !s2_rdata.error && r_tag === s2_entry.tag + io.dpath.perf.l2miss := s2_valid && !(s2_valid_bit && r_tag === s2_entry.tag) val s2_pte = Wire(new PTE) s2_pte := s2_entry - s2_pte.g := g(r_idx) + s2_pte.g := s2_g s2_pte.v := true (s2_hit, s2_pte) From 6112adfbb05f2c2fef5e8f52c80911908f67ef1a Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 4 Aug 2017 17:01:51 -0700 Subject: [PATCH 07/12] Get L2 TLB tag/parity check off the D$ arbitration path --- src/main/scala/rocket/PTW.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/scala/rocket/PTW.scala b/src/main/scala/rocket/PTW.scala index 6d070074..768c3167 100644 --- a/src/main/scala/rocket/PTW.scala +++ b/src/main/scala/rocket/PTW.scala @@ -135,7 +135,7 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()( val l2_refill = RegNext(false.B) io.dpath.perf.l2miss := false - val (l2_hit, l2_pte) = if (coreParams.nL2TLBEntries == 0) (false.B, Wire(new PTE)) else { + val (l2_hit, l2_valid, l2_pte) = if (coreParams.nL2TLBEntries == 0) (false.B, false.B, Wire(new PTE)) else { val code = new ParityCode require(isPow2(coreParams.nL2TLBEntries)) val idxBits = log2Ceil(coreParams.nL2TLBEntries) @@ -191,10 +191,10 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()( s2_pte.g := s2_g s2_pte.v := true - (s2_hit, s2_pte) + (s2_hit, s2_valid && s2_valid_bit, s2_pte) } - io.mem.req.valid := state === s_req && !l2_hit + io.mem.req.valid := state === s_req && !l2_valid io.mem.req.bits.phys := Bool(true) io.mem.req.bits.cmd := M_XRD io.mem.req.bits.typ := log2Ceil(xLen/8) @@ -233,7 +233,7 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()( s1_kill := true count := count + 1 r_pte.ppn := pte_cache_data - }.elsewhen (io.mem.req.ready) { + }.elsewhen (io.mem.req.fire()) { state := s_wait1 } } From bc298bf14668ec635cc5cd2ce73330cb3bd313c3 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 4 Aug 2017 22:06:37 -0700 Subject: [PATCH 08/12] Optimize ShiftQueue for late-arriving deq.ready --- src/main/scala/util/ShiftQueue.scala | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/main/scala/util/ShiftQueue.scala b/src/main/scala/util/ShiftQueue.scala index a64d3ee8..5914da89 100644 --- a/src/main/scala/util/ShiftQueue.scala +++ b/src/main/scala/util/ShiftQueue.scala @@ -20,20 +20,20 @@ class ShiftQueue[T <: Data](gen: T, private val valid = RegInit(Vec.fill(entries) { Bool(false) }) private val elts = Reg(Vec(entries, gen)) - private val do_enq = io.enq.fire() - private val do_deq = io.deq.fire() - for (i <- 0 until entries) { + def paddedValid(i: Int) = if (i == -1) true.B else if (i == entries) false.B else valid(i) + val wdata = if (i == entries-1) io.enq.bits else Mux(valid(i+1), elts(i+1), io.enq.bits) - val shiftDown = if (i == entries-1) false.B else io.deq.ready && valid(i+1) - val enqNew = io.enq.fire() && Mux(io.deq.ready, valid(i), !valid(i) && (if (i == 0) true.B else valid(i-1))) - when (shiftDown || enqNew) { elts(i) := wdata } - } + val wen = + Mux(io.deq.ready, + paddedValid(i+1) || io.enq.fire() && valid(i), + io.enq.fire() && paddedValid(i-1) && !valid(i)) + when (wen) { elts(i) := wdata } - val padded = Seq(true.B) ++ valid ++ Seq(false.B) - for (i <- 0 until entries) { - when ( do_enq && !do_deq && padded(i+0)) { valid(i) := true.B } - when (!do_enq && do_deq && !padded(i+2)) { valid(i) := false.B } + valid(i) := + Mux(io.deq.ready, + paddedValid(i+1) || io.enq.fire() && (Bool(i == 0 && !flow) || valid(i)), + io.enq.fire() && paddedValid(i-1) || valid(i)) } io.enq.ready := !valid(entries-1) From b9b4142bb4d42c8e184e40ab820db9458266f58d Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sat, 5 Aug 2017 00:30:36 -0700 Subject: [PATCH 09/12] Get s2_nack off the critical path We were using it to compute the next PC on flush vs. replay (which require PC+4 and PC, respectively). This fix gets rid of the adder altogether by reusing the M-stage PC in the flush case, which by construction holds PC+4. --- src/main/scala/rocket/RocketCore.scala | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/main/scala/rocket/RocketCore.scala b/src/main/scala/rocket/RocketCore.scala index b8d15bc5..d22be6d9 100644 --- a/src/main/scala/rocket/RocketCore.scala +++ b/src/main/scala/rocket/RocketCore.scala @@ -376,14 +376,18 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) val mem_cfi_taken = (mem_ctrl.branch && mem_br_taken) || mem_ctrl.jalr || mem_ctrl.jal val mem_direction_misprediction = (Bool(coreParams.jumpInFrontend) || mem_reg_btb_hit) && mem_ctrl.branch && mem_br_taken =/= mem_reg_btb_resp.taken val mem_misprediction = if (usingBTB) mem_wrong_npc else mem_cfi_taken - take_pc_mem := mem_reg_valid && (mem_misprediction || mem_reg_sfence || (mem_ctrl.jalr && csr.io.status.debug)) + take_pc_mem := mem_reg_valid && (mem_misprediction || mem_reg_sfence) mem_reg_valid := !ctrl_killx mem_reg_replay := !take_pc_mem_wb && replay_ex mem_reg_xcpt := !ctrl_killx && ex_xcpt mem_reg_xcpt_interrupt := !take_pc_mem_wb && ex_reg_xcpt_interrupt - when (ex_pc_valid) { + // on pipeline flushes, cause mem_npc to hold the sequential npc, which + // will drive the W-stage npc mux + when (mem_reg_valid && mem_reg_flush_pipe) { + mem_reg_sfence := false + }.elsewhen (ex_pc_valid) { mem_ctrl := ex_ctrl mem_reg_rvc := ex_reg_rvc mem_reg_load := ex_ctrl.mem && isRead(ex_ctrl.mem_cmd) @@ -398,10 +402,16 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) mem_reg_inst := ex_reg_inst mem_reg_pc := ex_reg_pc mem_reg_wdata := alu.io.out + when (ex_ctrl.rxs2 && (ex_ctrl.mem || ex_ctrl.rocc || ex_sfence)) { val typ = Mux(ex_ctrl.rocc, log2Ceil(xLen/8).U, ex_ctrl.mem_type) mem_reg_rs2 := new StoreGen(typ, 0.U, ex_rs(1), coreDataBytes).data } + when (ex_ctrl.jalr && csr.io.status.debug) { + // flush I$ on D-mode JALR to effect uncached fetch without D$ flush + mem_ctrl.fence_i := true + mem_reg_flush_pipe := true + } } val mem_breakpoint = (mem_reg_load && bpu.io.xcpt_ld) || (mem_reg_store && bpu.io.xcpt_st) @@ -438,9 +448,6 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) wb_reg_cause := mem_cause wb_reg_inst := mem_reg_inst wb_reg_pc := mem_reg_pc - when (mem_ctrl.jalr && csr.io.status.debug) { - wb_ctrl.fence_i := true - } } val (wb_xcpt, wb_cause) = checkExceptions(List( @@ -458,7 +465,6 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) val replay_wb_common = io.dmem.s2_nack || wb_reg_replay val replay_wb_rocc = wb_reg_valid && wb_ctrl.rocc && !io.rocc.cmd.ready val replay_wb = replay_wb_common || replay_wb_rocc - val wb_npc = encodeVirtualAddress(wb_reg_pc, wb_reg_pc + Mux(replay_wb, 0.U, Mux(wb_reg_rvc, 2.U, 4.U))) take_pc_wb := replay_wb || wb_xcpt || csr.io.eret || wb_reg_flush_pipe // writeback arbitration @@ -585,9 +591,9 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) io.imem.req.valid := take_pc io.imem.req.bits.speculative := !take_pc_wb io.imem.req.bits.pc := - Mux(wb_xcpt || csr.io.eret, csr.io.evec, // exception or [m|s]ret - Mux(replay_wb || wb_reg_flush_pipe, wb_npc, // replay or flush - mem_npc)) // branch misprediction + Mux(wb_xcpt || csr.io.eret, csr.io.evec, // exception or [m|s]ret + Mux(replay_wb, wb_reg_pc, // replay + mem_npc)) // flush or branch misprediction io.imem.flush_icache := wb_reg_valid && wb_ctrl.fence_i && !io.dmem.s2_nack io.imem.sfence.valid := wb_reg_valid && wb_reg_sfence io.imem.sfence.bits.rs1 := wb_ctrl.mem_type(0) From 991e16de92e096fb0752e3f58e7e2468a6ca98b0 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sat, 5 Aug 2017 12:57:38 -0700 Subject: [PATCH 10/12] Remove probe address mux from TLB response path --- src/main/scala/rocket/DCache.scala | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index 8400c15a..b929d8a1 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -50,7 +50,7 @@ class DCacheDataArray(implicit p: Parameters) extends L1HellaCacheModule()(p) { class DCacheMetadataReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) { val write = Bool() - val idx = UInt(width = idxBits) + val addr = UInt(width = vaddrBitsExtended) val way_en = UInt(width = nWays) val data = new L1Metadata } @@ -102,7 +102,8 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val s1_req = Reg(io.cpu.req.bits) when (metaArb.io.out.valid && !metaArb.io.out.bits.write) { s1_req := io.cpu.req.bits - s1_req.addr := Cat(io.cpu.req.bits.addr >> untagBits, metaArb.io.out.bits.idx, io.cpu.req.bits.addr(blockOffBits-1,0)) + s1_req.addr := Cat(metaArb.io.out.bits.addr >> blockOffBits, io.cpu.req.bits.addr(blockOffBits-1,0)) + when (!metaArb.io.in(7).ready) { s1_req.phys := true } } val s1_read = isRead(s1_req.cmd) val s1_write = isWrite(s1_req.cmd) @@ -137,7 +138,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val s1_didntRead = RegEnable(s0_needsRead && !dataArb.io.in(3).ready, metaArb.io.out.valid && !metaArb.io.out.bits.write) metaArb.io.in(7).valid := io.cpu.req.valid metaArb.io.in(7).bits.write := false - metaArb.io.in(7).bits.idx := io.cpu.req.bits.addr(idxMSB, idxLSB) + metaArb.io.in(7).bits.addr := io.cpu.req.bits.addr metaArb.io.in(7).bits.way_en := ~UInt(0, nWays) metaArb.io.in(7).bits.data := metaArb.io.in(4).bits.data when (!metaArb.io.in(7).ready) { io.cpu.req.ready := false } @@ -160,7 +161,6 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { when (s1_valid && s1_readwrite && tlb.io.resp.miss) { s1_nack := true } val s1_paddr = tlb.io.resp.paddr - val s1_tag = Mux(s1_probe, probe_bits.address, s1_paddr) >> untagBits val s1_victim_way = Wire(init = replacer.way) val (s1_hit_way, s1_hit_state, s1_meta, s1_victim_meta) = if (usingDataScratchpad) { @@ -171,13 +171,15 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { (inScratchpad, hitState, Seq(tECC.encode(dummyMeta.asUInt)), dummyMeta) } else { val metaReq = metaArb.io.out + val metaIdx = metaReq.bits.addr(idxMSB, idxLSB) when (metaReq.valid && metaReq.bits.write) { val wdata = tECC.encode(metaReq.bits.data.asUInt) val wmask = if (nWays == 1) Seq(true.B) else metaReq.bits.way_en.toBools - tag_array.write(metaReq.bits.idx, Vec.fill(nWays)(wdata), wmask) + tag_array.write(metaIdx, Vec.fill(nWays)(wdata), wmask) } - val s1_meta = tag_array.read(metaReq.bits.idx, metaReq.valid && !metaReq.bits.write) + val s1_meta = tag_array.read(metaIdx, metaReq.valid && !metaReq.bits.write) val s1_meta_uncorrected = s1_meta.map(tECC.decode(_).uncorrected.asTypeOf(new L1Metadata)) + val s1_tag = s1_paddr >> untagBits val s1_meta_hit_way = s1_meta_uncorrected.map(r => r.coh.isValid() && r.tag === s1_tag).asUInt val s1_meta_hit_state = ClientMetadata.onReset.fromBits( s1_meta_uncorrected.map(r => Mux(r.tag === s1_tag, r.coh.asUInt, UInt(0))) @@ -254,14 +256,14 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { metaArb.io.in(1).valid := s2_meta_error && (s2_valid_masked || s2_flush_valid_pre_tag_ecc || s2_probe) metaArb.io.in(1).bits.write := true metaArb.io.in(1).bits.way_en := PriorityEncoderOH(s2_meta_errors) - metaArb.io.in(1).bits.idx := Mux(s2_probe, probe_bits.address, s2_req.addr)(idxMSB, idxLSB) + metaArb.io.in(1).bits.addr := Cat(io.cpu.req.bits.addr >> untagBits, Mux(s2_probe, probe_bits.address, s2_req.addr)(idxMSB, 0)) metaArb.io.in(1).bits.data := PriorityMux(s2_meta_errors, s2_meta_corrected) // tag updates on hit/miss metaArb.io.in(2).valid := (s2_valid_hit && s2_update_meta) || (s2_victimize && !s2_victim_dirty) metaArb.io.in(2).bits.write := true metaArb.io.in(2).bits.way_en := s2_victim_way - metaArb.io.in(2).bits.idx := s2_req.addr(idxMSB, idxLSB) + metaArb.io.in(2).bits.addr := Cat(io.cpu.req.bits.addr >> untagBits, s2_req.addr(idxMSB, 0)) metaArb.io.in(2).bits.data.coh := Mux(s2_valid_hit, s2_new_hit_state, ClientMetadata.onReset) metaArb.io.in(2).bits.data.tag := s2_req.addr >> untagBits @@ -463,7 +465,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { assert(!metaArb.io.in(3).valid || metaArb.io.in(3).ready) metaArb.io.in(3).bits.write := true metaArb.io.in(3).bits.way_en := s2_victim_way - metaArb.io.in(3).bits.idx := s2_req.addr(idxMSB, idxLSB) + metaArb.io.in(3).bits.addr := Cat(io.cpu.req.bits.addr >> untagBits, s2_req.addr(idxMSB, 0)) metaArb.io.in(3).bits.data.coh := s2_hit_state.onGrant(s2_req.cmd, tl_out.d.bits.param) metaArb.io.in(3).bits.data.tag := s2_req.addr >> untagBits // don't accept uncached grants if there's a structural hazard on s2_data... @@ -485,7 +487,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { metaArb.io.in(6).valid := tl_out.b.valid && !block_probe tl_out.b.ready := metaArb.io.in(6).ready && !block_probe && !s1_valid && (!s2_valid || s2_valid_hit) metaArb.io.in(6).bits.write := false - metaArb.io.in(6).bits.idx := tl_out.b.bits.address(idxMSB, idxLSB) + metaArb.io.in(6).bits.addr := Cat(io.cpu.req.bits.addr >> paddrBits, tl_out.b.bits.address) metaArb.io.in(6).bits.way_en := ~UInt(0, nWays) metaArb.io.in(6).bits.data := metaArb.io.in(4).bits.data @@ -529,7 +531,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { } when (release_state === s_probe_retry) { metaArb.io.in(6).valid := true - metaArb.io.in(6).bits.idx := probe_bits.address(idxMSB, idxLSB) + metaArb.io.in(6).bits.addr := Cat(io.cpu.req.bits.addr >> paddrBits, probe_bits.address) when (metaArb.io.in(6).ready) { release_state := s_ready s1_probe := true @@ -572,7 +574,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { metaArb.io.in(4).valid := release_state.isOneOf(s_voluntary_write_meta, s_probe_write_meta) metaArb.io.in(4).bits.write := true metaArb.io.in(4).bits.way_en := releaseWay - metaArb.io.in(4).bits.idx := tl_out.c.bits.address(idxMSB, idxLSB) + metaArb.io.in(4).bits.addr := Cat(io.cpu.req.bits.addr >> untagBits, tl_out.c.bits.address(idxMSB, 0)) metaArb.io.in(4).bits.data.coh := newCoh metaArb.io.in(4).bits.data.tag := tl_out.c.bits.address >> untagBits when (metaArb.io.in(4).fire()) { release_state := s_ready } @@ -648,7 +650,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { s1_flush_valid := metaArb.io.in(5).fire() && !s1_flush_valid && !s2_flush_valid_pre_tag_ecc && release_state === s_ready && !release_ack_wait metaArb.io.in(5).valid := flushing metaArb.io.in(5).bits.write := false - metaArb.io.in(5).bits.idx := flushCounter + metaArb.io.in(5).bits.addr := Cat(io.cpu.req.bits.addr >> untagBits, flushCounter(idxBits-1, 0) << blockOffBits) metaArb.io.in(5).bits.way_en := ~UInt(0, nWays) metaArb.io.in(5).bits.data := metaArb.io.in(4).bits.data when (flushing) { @@ -665,7 +667,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { } } metaArb.io.in(0).valid := resetting - metaArb.io.in(0).bits.idx := flushCounter + metaArb.io.in(0).bits.addr := metaArb.io.in(5).bits.addr metaArb.io.in(0).bits.write := true metaArb.io.in(0).bits.way_en := ~UInt(0, nWays) metaArb.io.in(0).bits.data.coh := ClientMetadata.onReset From 83875e3a0c7c33655bbb5a4ad64de6db740b21c3 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sat, 5 Aug 2017 13:47:21 -0700 Subject: [PATCH 11/12] Only flush D$ on FENCE.I if it won't always be probed on I$ miss --- src/main/scala/rocket/DCache.scala | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index b929d8a1..e83796ea 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -6,7 +6,7 @@ import Chisel._ import Chisel.ImplicitConversions._ import freechips.rocketchip.config.Parameters import freechips.rocketchip.coreplex.{RationalCrossing, RocketCrossing, RocketTilesKey} -import freechips.rocketchip.diplomacy.AddressSet +import freechips.rocketchip.diplomacy.{AddressSet, RegionType} import freechips.rocketchip.tilelink._ import freechips.rocketchip.util._ import TLMessages._ @@ -640,7 +640,6 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val flushCounterNext = flushCounter +& 1 val flushDone = (flushCounterNext >> log2Ceil(nSets)) === nWays val flushCounterWrap = flushCounterNext(log2Ceil(nSets)-1, 0) - when (tl_out_a.fire() && !s2_uncached) { flushed := false } when (s2_valid_masked && s2_req.cmd === M_FLUSH_ALL) { io.cpu.s2_nack := !flushed when (!flushed) { @@ -653,17 +652,21 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { metaArb.io.in(5).bits.addr := Cat(io.cpu.req.bits.addr >> untagBits, flushCounter(idxBits-1, 0) << blockOffBits) metaArb.io.in(5).bits.way_en := ~UInt(0, nWays) metaArb.io.in(5).bits.data := metaArb.io.in(4).bits.data - when (flushing) { - s1_victim_way := flushCounter >> log2Up(nSets) - when (s2_flush_valid) { - flushCounter := flushCounterNext - when (flushDone) { - flushed := true - if (!isPow2(nWays)) flushCounter := flushCounterWrap + // Only flush D$ on FENCE.I if some cached executable regions are untracked. + if (!edge.manager.managers.forall(m => !m.supportsAcquireB || !m.executable || m.regionType >= RegionType.TRACKED)) { + when (tl_out_a.fire() && !s2_uncached) { flushed := false } + when (flushing) { + s1_victim_way := flushCounter >> log2Up(nSets) + when (s2_flush_valid) { + flushCounter := flushCounterNext + when (flushDone) { + flushed := true + if (!isPow2(nWays)) flushCounter := flushCounterWrap + } + } + when (flushed && release_state === s_ready && !release_ack_wait) { + flushing := false } - } - when (flushed && release_state === s_ready && !release_ack_wait) { - flushing := false } } metaArb.io.in(0).valid := resetting From 39b7e930ca87ecfa1981bc3e680b7b31ae032869 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sat, 5 Aug 2017 16:13:47 -0700 Subject: [PATCH 12/12] Disable AMBAUnitTestConfig, as it is blocking unrelated PRs --- regression/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression/Makefile b/regression/Makefile index 6c011f92..7c42bc92 100644 --- a/regression/Makefile +++ b/regression/Makefile @@ -58,7 +58,7 @@ endif ifeq ($(SUITE),UnittestSuite) PROJECT=freechips.rocketchip.unittest -CONFIGS=AMBAUnitTestConfig TLSimpleUnitTestConfig TLWidthUnitTestConfig +CONFIGS=TLSimpleUnitTestConfig TLWidthUnitTestConfig endif ifeq ($(SUITE), JtagDtmSuite)