diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala
index 223fade9..8400c15a 100644
--- a/src/main/scala/rocket/DCache.scala
+++ b/src/main/scala/rocket/DCache.scala
@@ -89,7 +89,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
   val q_depth = if (rational) (2 min maxUncachedInFlight-1) else 0
   val tl_out_a = Wire(tl_out.a)
-  tl_out.a <> (if (q_depth == 0) tl_out_a else Queue(tl_out_a, q_depth, flow = true, pipe = true))
+  tl_out.a <> (if (q_depth == 0) tl_out_a else Queue(tl_out_a, q_depth, flow = true))

   val tl_out_c = Wire(tl_out.c)
   tl_out.c <> (if (cacheParams.acquireBeforeRelease) Queue(tl_out_c, cacheDataBeats, flow = true) else tl_out_c)
@@ -184,8 +184,8 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
         .reduce (_|_))
       (s1_meta_hit_way, s1_meta_hit_state, s1_meta, s1_meta_uncorrected(s1_victim_way))
     }
-  val s1_data_way = Mux(inWriteback, releaseWay, s1_hit_way)
-  val s1_data = Mux1H(s1_data_way, data.io.resp) // retime into s2 if critical
+  val s1_data_way = Wire(init = Mux(inWriteback, releaseWay, s1_hit_way))
+  val s1_all_data_ways = Vec(data.io.resp :+ dummyEncodeData(tl_out.d.bits.data))
   val s1_mask = Mux(s1_req.cmd === M_PWR, io.cpu.s1_data.mask, new StoreGen(s1_req.typ, s1_req.addr, UInt(0), wordBytes).mask)

   val s2_valid = Reg(next=s1_valid_masked && !s1_sfence, init=Bool(false)) && !io.cpu.s2_xcpt.asUInt.orR
@@ -210,7 +210,16 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
   val s2_meta_errors = s2_meta.map(_.error).asUInt
   val s2_meta_error = s2_meta_errors.orR
   val s2_flush_valid = s2_flush_valid_pre_tag_ecc && !s2_meta_error
-  val s2_data = RegEnable(s1_data, s1_valid || inWriteback)
+  val s2_data = {
+    val en = s1_valid || inWriteback || tl_out.d.fire()
+    if (cacheParams.pipelineWayMux && nWays > 1) {
+      val s2_data_way = RegEnable(s1_data_way, en)
+      val s2_all_data_ways = (0 to nWays).map(i => RegEnable(s1_all_data_ways(i), en && s1_data_way(i)))
+      Mux1H(s2_data_way, s2_all_data_ways)
+    } else {
+      RegEnable(Mux1H(s1_data_way, s1_all_data_ways), en)
+    }
+  }
   val s2_probe_way = RegEnable(s1_hit_way, s1_probe)
   val s2_probe_state = RegEnable(s1_hit_state, s1_probe)
   val s2_hit_way = RegEnable(s1_hit_way, s1_valid_not_nacked)
@@ -417,7 +426,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
         }
       }
       when (grantIsUncachedData) {
-        s2_data := dummyEncodeData(tl_out.d.bits.data)
+        s1_data_way := 1.U << nWays
         s2_req.cmd := M_XRD
         s2_req.typ := req.typ
         s2_req.tag := req.tag
diff --git a/src/main/scala/rocket/Frontend.scala b/src/main/scala/rocket/Frontend.scala
index 14d7467e..bbe85fb5 100644
--- a/src/main/scala/rocket/Frontend.scala
+++ b/src/main/scala/rocket/Frontend.scala
@@ -97,7 +97,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
   val s2_speculative = Reg(init=Bool(false))
   val s2_partial_insn_valid = RegInit(false.B)
   val s2_partial_insn = Reg(UInt(width = coreInstBits))
-  val s2_wrong_path = Reg(Bool())
+  val wrong_path = Reg(Bool())

   val s1_base_pc = ~(~s1_pc | (fetchBytes - 1))
   val ntpc = s1_base_pc + fetchBytes.U
@@ -180,11 +180,12 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
         Mux(returnAddrLSBs(log2Ceil(fetchWidth)), ntpc, s1_base_pc | ((returnAddrLSBs << log2Ceil(coreInstBytes)) & (fetchBytes - 1)))
       btb.io.ras_update.bits.cfiType := btb.io.resp.bits.cfiType
       btb.io.ras_update.bits.prediction.valid := true
-    } else when (fq.io.enq.fire()) {
+    } else {
       val s2_btb_hit = s2_btb_resp_valid && s2_btb_resp_bits.taken
       val s2_base_pc = ~(~s2_pc | (fetchBytes-1))
       val taken_idx = Wire(UInt())
       val after_idx = Wire(UInt())
+      val useRAS = Wire(init=false.B)

       def scanInsns(idx: Int, prevValid: Bool, prevBits: UInt, prevTaken: Bool): Bool = {
         val prevRVI = prevValid && prevBits(1,0) === 3
@@ -200,48 +201,56 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
         val rvcBranch = bits === Instructions.C_BEQZ || bits === Instructions.C_BNEZ
         val rvcJAL = Bool(xLen == 32) && bits === Instructions.C_JAL
         val rvcJump = bits === Instructions.C_J || rvcJAL
-        val rvcImm = Mux(bits(14), new RVCDecoder(bits).bImm.asSInt, 0.S) | Mux(bits(14,13) === 1, new RVCDecoder(bits).jImm.asSInt, 0.S)
+        val rvcImm = Mux(bits(14), new RVCDecoder(bits).bImm.asSInt, new RVCDecoder(bits).jImm.asSInt)
         val rvcJR = bits === Instructions.C_MV && bits(6,2) === 0
         val rvcReturn = rvcJR && BitPat("b00?01") === bits(11,7)
         val rvcJALR = bits === Instructions.C_ADD && bits(6,2) === 0
         val rvcCall = rvcJAL || rvcJALR
-        val rviImm = Mux(rviBits(3), ImmGen(IMM_UJ, rviBits), 0.S) | Mux(!rviBits(2), ImmGen(IMM_SB, rviBits), 0.S)
+        val rviImm = Mux(rviBits(3), ImmGen(IMM_UJ, rviBits), ImmGen(IMM_SB, rviBits))
         val taken =
           prevRVI && (rviJump || rviJALR || rviBranch && s2_btb_resp_bits.bht.taken) ||
           valid && (rvcJump || rvcJALR || rvcJR || rvcBranch && s2_btb_resp_bits.bht.taken)
+        val predictReturn = btb.io.ras_head.valid && (prevRVI && rviReturn || valid && rvcReturn)
+        val predictBranch =
+          prevRVI && (rviJump || rviBranch && s2_btb_resp_bits.bht.taken) ||
+          valid && (rvcJump || rvcBranch && s2_btb_resp_bits.bht.taken)

         when (!prevTaken) {
           taken_idx := idx
           after_idx := idx + 1
-          btb.io.ras_update.valid := !s2_wrong_path && (prevRVI && (rviCall || rviReturn) || valid && (rvcCall || rvcReturn))
+          btb.io.ras_update.valid := fq.io.enq.fire() && !wrong_path && (prevRVI && (rviCall || rviReturn) || valid && (rvcCall || rvcReturn))
           btb.io.ras_update.bits.prediction.valid := true
           btb.io.ras_update.bits.cfiType := Mux(Mux(prevRVI, rviReturn, rvcReturn), CFIType.ret, CFIType.call)

           when (!s2_btb_hit) {
-            when (prevRVI && (rviJALR && !(rviReturn && btb.io.ras_head.valid)) ||
-                  valid && (rvcJALR || (rvcJR && !btb.io.ras_head.valid))) {
-              s2_wrong_path := true
+            when (fq.io.enq.fire() && taken && !predictBranch && !predictReturn) {
+              wrong_path := true
             }
-            when (taken) {
+            when (s2_valid && predictReturn) {
+              useRAS := true
+            }
+            when (s2_valid && predictBranch) {
               val pc = s2_base_pc | (idx*coreInstBytes)
               val npc = if (idx == 0) pc.asSInt + Mux(prevRVI, rviImm -& 2.S, rvcImm)
                         else Mux(prevRVI, pc - coreInstBytes, pc).asSInt + Mux(prevRVI, rviImm, rvcImm)
-              predicted_npc := Mux(prevRVI && rviReturn || valid && rvcReturn, btb.io.ras_head.bits, npc.asUInt)
+              predicted_npc := npc.asUInt
             }
             when (prevRVI && rviBranch || valid && rvcBranch) {
-              btb.io.bht_advance.valid := !s2_wrong_path && !s2_btb_resp_valid
+              btb.io.bht_advance.valid := fq.io.enq.fire() && !wrong_path && !s2_btb_resp_valid
               btb.io.bht_advance.bits := s2_btb_resp_bits
             }
           }
         }

         if (idx == fetchWidth-1) {
-          s2_partial_insn_valid := false
-          when (valid && !prevTaken && !rvc) {
-            s2_partial_insn_valid := true
-            s2_partial_insn := bits | 0x3
+          when (fq.io.enq.fire()) {
+            s2_partial_insn_valid := false
+            when (valid && !prevTaken && !rvc) {
+              s2_partial_insn_valid := true
+              s2_partial_insn := bits | 0x3
+            }
           }
           prevTaken || taken
         } else {
@@ -252,20 +261,24 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
       btb.io.ras_update.bits.returnAddr := s2_base_pc + (after_idx << log2Ceil(coreInstBytes))
       val taken = scanInsns(0, s2_partial_insn_valid, s2_partial_insn, false.B)
-      when (s2_btb_hit) {
+      when (useRAS) {
+        predicted_npc := btb.io.ras_head.bits
+      }
+      when (fq.io.enq.fire() && s2_btb_hit) {
         s2_partial_insn_valid := false
-      }.otherwise {
+      }
+      when (!s2_btb_hit) {
         fq.io.enq.bits.btb.bits.bridx := taken_idx
         when (taken) {
           fq.io.enq.bits.btb.valid := true
           fq.io.enq.bits.btb.bits.taken := true
           fq.io.enq.bits.btb.bits.entry := UInt(tileParams.btb.get.nEntries)
-          s2_redirect := true
+          when (fq.io.enq.fire()) { s2_redirect := true }
         }
       }
     }

     when (s2_redirect) { s2_partial_insn_valid := false }
-    when (io.cpu.req.valid) { s2_wrong_path := false }
+    when (io.cpu.req.valid) { wrong_path := false }
   }

   io.cpu.resp <> fq.io.deq
diff --git a/src/main/scala/rocket/HellaCache.scala b/src/main/scala/rocket/HellaCache.scala
index 03224c2f..60cc9086 100644
--- a/src/main/scala/rocket/HellaCache.scala
+++ b/src/main/scala/rocket/HellaCache.scala
@@ -27,6 +27,7 @@ case class DCacheParams(
     nMMIOs: Int = 1,
     blockBytes: Int = 64,
     acquireBeforeRelease: Boolean = false,
+    pipelineWayMux: Boolean = false,
     scratch: Option[BigInt] = None) extends L1CacheParams {

   def dataScratchpadBytes: Int = scratch.map(_ => nSets*blockBytes).getOrElse(0)
diff --git a/src/main/scala/rocket/RocketCore.scala b/src/main/scala/rocket/RocketCore.scala
index b53b4866..7151a1d5 100644
--- a/src/main/scala/rocket/RocketCore.scala
+++ b/src/main/scala/rocket/RocketCore.scala
@@ -528,7 +528,12 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
   val sboard = new Scoreboard(32, true)
   sboard.clear(ll_wen, ll_waddr)
-  val id_sboard_hazard = checkHazards(hazard_targets, rd => sboard.read(rd) && !(ll_wen && ll_waddr === rd))
+  def id_sboard_clear_bypass(r: UInt) = {
+    // ll_waddr arrives late when D$ has ECC, so reshuffle the hazard check
+    if (tileParams.dcache.get.dataECC.isInstanceOf[IdentityCode]) ll_wen && ll_waddr === r
+    else div.io.resp.fire() && div.io.resp.bits.tag === r || dmem_resp_replay && dmem_resp_xpu && dmem_resp_waddr === r
+  }
+  val id_sboard_hazard = checkHazards(hazard_targets, rd => sboard.read(rd) && !id_sboard_clear_bypass(rd))
   sboard.set(wb_set_sboard && wb_wen, wb_waddr)

   // stall for RAW/WAW hazards on CSRs, loads, AMOs, and mul/div in execute stage.