From 1699622730470701cf21a19c98161e2a5f820d2f Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sat, 9 Jul 2016 01:08:52 -0700 Subject: [PATCH] Don't speculatively refill I$ in uncacheable regions --- rocket/src/main/scala/dcache.scala | 3 ++- rocket/src/main/scala/frontend.scala | 13 +++++++++++-- rocket/src/main/scala/icache.scala | 6 ++++-- rocket/src/main/scala/rocket.scala | 13 ++++++++----- rocket/src/main/scala/tlb.scala | 2 ++ 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/rocket/src/main/scala/dcache.scala b/rocket/src/main/scala/dcache.scala index 70d45a2d..ea5a42cb 100644 --- a/rocket/src/main/scala/dcache.scala +++ b/rocket/src/main/scala/dcache.scala @@ -129,9 +129,11 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) { val releaseInFlight = s1_probe || s2_probe || release_state =/= s_ready val s2_valid_masked = s2_valid && Reg(next = !s1_nack) val s2_req = Reg(io.cpu.req.bits) + val s2_uncached = Reg(Bool()) when (s1_valid_not_nacked || s1_flush_valid) { s2_req := s1_req s2_req.addr := s1_paddr + s2_uncached := !tlb.io.resp.cacheable } val s2_read = isRead(s2_req.cmd) val s2_write = isWrite(s2_req.cmd) @@ -145,7 +147,6 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) { val s2_hit = s2_hit_state.isHit(s2_req.cmd) val s2_valid_hit = s2_valid_masked && s2_readwrite && s2_hit val s2_valid_miss = s2_valid_masked && s2_readwrite && !s2_hit && !(pstore1_valid || pstore2_valid) && !release_ack_wait - val s2_uncached = !addrMap.isCacheable(s2_req.addr) val s2_valid_cached_miss = s2_valid_miss && !s2_uncached val s2_victimize = s2_valid_cached_miss || s2_flush_valid val s2_valid_uncached = s2_valid_miss && s2_uncached diff --git a/rocket/src/main/scala/frontend.scala b/rocket/src/main/scala/frontend.scala index 83189a94..f7b1fb1a 100644 --- a/rocket/src/main/scala/frontend.scala +++ b/rocket/src/main/scala/frontend.scala @@ -7,6 +7,7 @@ import cde.{Parameters, Field} class FrontendReq(implicit p: Parameters) extends CoreBundle()(p) { val pc = UInt(width = vaddrBitsExtended) + val speculative = Bool() } class FrontendResp(implicit p: Parameters) extends CoreBundle()(p) { @@ -14,6 +15,7 @@ class FrontendResp(implicit p: Parameters) extends CoreBundle()(p) { val data = Vec(fetchWidth, Bits(width = coreInstBits)) val mask = Bits(width = fetchWidth) val xcpt_if = Bool() + val replay = Bool() } class FrontendIO(implicit p: Parameters) extends CoreBundle()(p) { @@ -40,13 +42,15 @@ class Frontend(implicit p: Parameters) extends CoreModule()(p) with HasL1CachePa val s1_pc_ = Reg(UInt(width=vaddrBitsExtended)) val s1_pc = ~(~s1_pc_ | (coreInstBytes-1)) // discard PC LSBS (this propagates down the pipeline) + val s1_speculative = Reg(Bool()) val s1_same_block = Reg(Bool()) val s2_valid = Reg(init=Bool(true)) val s2_pc = Reg(init=UInt(p(ResetVector))) val s2_btb_resp_valid = Reg(init=Bool(false)) val s2_btb_resp_bits = Reg(new BTBResp) val s2_xcpt_if = Reg(init=Bool(false)) - val s2_resp_valid = Wire(init=Bool(false)) + val s2_speculative = Reg(init=Bool(false)) + val s2_resp_valid = Wire(Bool()) val s2_resp_data = Wire(UInt(width = rowBits)) val ntpc_0 = ~(~s1_pc | (coreInstBytes*fetchWidth-1)) + UInt(coreInstBytes*fetchWidth) @@ -62,15 +66,18 @@ class Frontend(implicit p: Parameters) extends CoreModule()(p) with HasL1CachePa when (!stall) { s1_same_block := s0_same_block && !tlb.io.resp.miss s1_pc_ := npc + s1_speculative := Mux(icmiss, s2_speculative, true) s2_valid := !icmiss when (!icmiss) { s2_pc := s1_pc + s2_speculative := s1_speculative && !tlb.io.resp.cacheable s2_xcpt_if := tlb.io.resp.xcpt_if } } when (io.cpu.req.valid) { s1_same_block := Bool(false) s1_pc_ := io.cpu.req.bits.pc + s1_speculative := io.cpu.req.bits.speculative s2_valid := Bool(false) } @@ -105,8 +112,9 @@ class Frontend(implicit p: Parameters) extends CoreModule()(p) with HasL1CachePa icache.io.invalidate := io.cpu.flush_icache icache.io.s1_ppn := tlb.io.resp.ppn icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || tlb.io.resp.xcpt_if || icmiss || io.cpu.flush_tlb + icache.io.s2_kill := s2_speculative - io.cpu.resp.valid := s2_valid && (s2_xcpt_if || s2_resp_valid) + io.cpu.resp.valid := s2_valid && (s2_resp_valid || s2_speculative || s2_xcpt_if) io.cpu.resp.bits.pc := s2_pc io.cpu.npc := Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc) @@ -138,6 +146,7 @@ class Frontend(implicit p: Parameters) extends CoreModule()(p) with HasL1CachePa val msk_pc = if (fetchWidth == 1) all_ones else all_ones << s2_pc(log2Up(fetchWidth) -1+2,2) io.cpu.resp.bits.mask := msk_pc io.cpu.resp.bits.xcpt_if := s2_xcpt_if + io.cpu.resp.bits.replay := s2_speculative && !s2_resp_valid && !s2_xcpt_if io.cpu.btb_resp.valid := s2_btb_resp_valid io.cpu.btb_resp.bits := s2_btb_resp_bits diff --git a/rocket/src/main/scala/icache.scala b/rocket/src/main/scala/icache.scala index f41995d2..32e1b9b7 100644 --- a/rocket/src/main/scala/icache.scala +++ b/rocket/src/main/scala/icache.scala @@ -30,6 +30,7 @@ class ICache(implicit p: Parameters) extends CoreModule()(p) with HasL1CachePara val req = Valid(new ICacheReq).flip val s1_ppn = UInt(INPUT, ppnBits) // delayed one cycle w.r.t. req val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req + val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission val resp = Decoupled(new ICacheResp) val invalidate = Bool(INPUT) @@ -67,7 +68,7 @@ class ICache(implicit p: Parameters) extends CoreModule()(p) with HasL1CachePara val s1_miss = out_valid && !s1_any_tag_hit rdy := state === s_ready && !s1_miss - when (s1_valid && state === s_ready && s1_miss) { + when (s1_miss && state === s_ready) { refill_addr := s1_paddr } val refill_tag = refill_addr(tagBits+untagBits-1,untagBits) @@ -135,7 +136,7 @@ class ICache(implicit p: Parameters) extends CoreModule()(p) with HasL1CachePara io.resp.bits.datablock := Mux1H(s1_tag_hit, s1_dout) io.resp.valid := s1_hit } - io.mem.acquire.valid := (state === s_request) + io.mem.acquire.valid := state === s_request && !io.s2_kill io.mem.acquire.bits := GetBlock(addr_block = refill_addr >> blockOffBits) // control state machine @@ -146,6 +147,7 @@ class ICache(implicit p: Parameters) extends CoreModule()(p) with HasL1CachePara } is (s_request) { when (io.mem.acquire.ready) { state := s_refill_wait } + when (io.s2_kill) { state := s_ready } } is (s_refill_wait) { when (io.mem.grant.valid) { state := s_refill } diff --git a/rocket/src/main/scala/rocket.scala b/rocket/src/main/scala/rocket.scala index 7425f724..726053d3 100644 --- a/rocket/src/main/scala/rocket.scala +++ b/rocket/src/main/scala/rocket.scala @@ -157,6 +157,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) { val ex_reg_flush_pipe = Reg(Bool()) val ex_reg_load_use = Reg(Bool()) val ex_reg_cause = Reg(UInt()) + val ex_reg_replay = Reg(Bool()) val ex_reg_pc = Reg(UInt()) val ex_reg_inst = Reg(Bits()) @@ -295,8 +296,9 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) { div.io.req.bits.tag := ex_waddr ex_reg_valid := !ctrl_killd + ex_reg_replay := !take_pc && io.imem.resp.valid && io.imem.resp.bits.replay ex_reg_xcpt := !ctrl_killd && id_xcpt - ex_reg_xcpt_interrupt := csr.io.interrupt && !take_pc && io.imem.resp.valid + ex_reg_xcpt_interrupt := !take_pc && io.imem.resp.valid && csr.io.interrupt when (id_xcpt) { ex_reg_cause := id_cause } when (!ctrl_killd) { @@ -323,18 +325,18 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) { } } } - when (!ctrl_killd || csr.io.interrupt) { + when (!ctrl_killd || csr.io.interrupt || io.imem.resp.bits.replay) { ex_reg_inst := id_inst ex_reg_pc := id_pc } // replay inst in ex stage? - val ex_pc_valid = ex_reg_valid || ex_reg_xcpt_interrupt + val ex_pc_valid = ex_reg_valid || ex_reg_replay || ex_reg_xcpt_interrupt val wb_dcache_miss = wb_ctrl.mem && !io.dmem.resp.valid val replay_ex_structural = ex_ctrl.mem && !io.dmem.req.ready || ex_ctrl.div && !div.io.req.ready val replay_ex_load_use = wb_dcache_miss && ex_reg_load_use - val replay_ex = ex_reg_valid && (replay_ex_structural || replay_ex_load_use) + val replay_ex = ex_reg_replay || (ex_reg_valid && (replay_ex_structural || replay_ex_load_use)) val ctrl_killx = take_pc_mem_wb || replay_ex || !ex_reg_valid // detect 2-cycle load-use delay for LB/LH/SC val ex_slow_bypass = ex_ctrl.mem_cmd === M_XSC || Vec(MT_B, MT_BU, MT_H, MT_HU).contains(ex_ctrl.mem_type) @@ -536,9 +538,10 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) { id_ctrl.rocc && rocc_blocked || // reduce activity while RoCC is busy id_do_fence || csr.io.csr_stall - ctrl_killd := !io.imem.resp.valid || take_pc || ctrl_stalld || csr.io.interrupt + ctrl_killd := !io.imem.resp.valid || io.imem.resp.bits.replay || take_pc || ctrl_stalld || csr.io.interrupt io.imem.req.valid := take_pc + io.imem.req.bits.speculative := !take_pc_wb io.imem.req.bits.pc := Mux(wb_xcpt || csr.io.eret, csr.io.evec, // exception or [m|s]ret Mux(replay_wb, wb_reg_pc, // replay diff --git a/rocket/src/main/scala/tlb.scala b/rocket/src/main/scala/tlb.scala index e99873b1..aca9aed2 100644 --- a/rocket/src/main/scala/tlb.scala +++ b/rocket/src/main/scala/tlb.scala @@ -32,6 +32,7 @@ class TLBResp(implicit p: Parameters) extends CoreBundle()(p) { val xcpt_ld = Bool(OUTPUT) val xcpt_st = Bool(OUTPUT) val xcpt_if = Bool(OUTPUT) + val cacheable = Bool(OUTPUT) } class TLB(implicit val p: Parameters) extends Module with HasTLBParameters { @@ -111,6 +112,7 @@ class TLB(implicit val p: Parameters) extends Module with HasTLBParameters { io.resp.xcpt_ld := bad_va || (!tlb_miss && !addr_prot.r) || (tlb_hit && !(r_array & hits).orR) io.resp.xcpt_st := bad_va || (!tlb_miss && !addr_prot.w) || (tlb_hit && !(w_array & hits).orR) io.resp.xcpt_if := bad_va || (!tlb_miss && !addr_prot.x) || (tlb_hit && !(x_array & hits).orR) + io.resp.cacheable := addrMap.isCacheable(paddr) io.resp.miss := tlb_miss io.resp.ppn := Mux(vm_enabled, Mux1H(hitsVec, ppns), io.req.bits.vpn(ppnBits-1,0))