From 061a0adceb26a9f539bc12b8d57dfcb1477df280 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Tue, 18 Apr 2017 17:55:04 -0700 Subject: [PATCH] Fetch smaller parcels from the I$ --- src/main/scala/rocket/Frontend.scala | 12 +++--------- src/main/scala/rocket/ICache.scala | 29 +++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/main/scala/rocket/Frontend.scala b/src/main/scala/rocket/Frontend.scala index 3590855c..0d3b3e10 100644 --- a/src/main/scala/rocket/Frontend.scala +++ b/src/main/scala/rocket/Frontend.scala @@ -68,7 +68,6 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) val s1_pc_ = Reg(UInt(width=vaddrBitsExtended)) val s1_pc = ~(~s1_pc_ | (coreInstBytes-1)) // discard PC LSBS (this propagates down the pipeline) val s1_speculative = Reg(Bool()) - val s1_same_block = Reg(Bool()) val s2_valid = Reg(init=Bool(true)) val s2_pc = Reg(init=io.resetVector) val s2_btb_resp_valid = Reg(init=Bool(false)) @@ -83,16 +82,13 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) val s2_cacheable = Reg(init=Bool(false)) val ntpc = ~(~s1_pc | (coreInstBytes*fetchWidth-1)) + UInt(coreInstBytes*fetchWidth) - val ntpc_same_block = (ntpc & rowBytes) === (s1_pc & rowBytes) val predicted_npc = Wire(init = ntpc) val predicted_taken = Wire(init = Bool(false)) val icmiss = s2_valid && !icache.io.resp.valid val npc = Mux(icmiss, s2_pc, predicted_npc) - val s0_same_block = !predicted_taken && !icmiss && !io.cpu.req.valid && ntpc_same_block val stall = io.cpu.resp.valid && !io.cpu.resp.ready when (!stall) { - s1_same_block := s0_same_block && !tlb.io.resp.miss s1_pc_ := io.cpu.npc // consider RVC fetches across blocks to be non-speculative if the first // part was non-speculative @@ -111,7 +107,6 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) } } when (io.cpu.req.valid) { - s1_same_block := Bool(false) s1_pc_ := io.cpu.npc s1_speculative := io.cpu.req.bits.speculative s2_valid := Bool(false) @@ -144,21 +139,20 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) tlb.io.req.bits.sfence := io.cpu.sfence tlb.io.req.bits.size := log2Ceil(coreInstBytes*fetchWidth) - icache.io.req.valid := !stall && !s0_same_block + icache.io.req.valid := !stall icache.io.req.bits.addr := io.cpu.npc icache.io.invalidate := io.cpu.flush_icache icache.io.s1_paddr := tlb.io.resp.paddr icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || icmiss || s1_speculative && !tlb.io.resp.cacheable || tlb.io.resp.pf.inst || tlb.io.resp.ae.inst icache.io.s2_kill := false - icache.io.resp.ready := !stall && !s1_same_block + icache.io.resp.ready := !stall val s2_kill = s2_speculative && !s2_cacheable || s2_xcpt io.cpu.resp.valid := s2_valid && (icache.io.resp.valid || s2_kill) io.cpu.resp.bits.pc := s2_pc io.cpu.npc := Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc) - require(fetchWidth * coreInstBytes <= rowBytes && isPow2(fetchWidth)) - io.cpu.resp.bits.data := icache.io.resp.bits.datablock >> (s2_pc.extract(log2Ceil(rowBytes)-1,log2Ceil(fetchWidth*coreInstBytes)) << log2Ceil(fetchWidth*coreInstBits)) + io.cpu.resp.bits.data := icache.io.resp.bits io.cpu.resp.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes)) io.cpu.resp.bits.pf := s2_pf io.cpu.resp.bits.ae := s2_ae diff --git a/src/main/scala/rocket/ICache.scala b/src/main/scala/rocket/ICache.scala index 16934347..6c0b8708 100644 --- a/src/main/scala/rocket/ICache.scala +++ b/src/main/scala/rocket/ICache.scala @@ -47,7 +47,7 @@ class ICacheBundle(outer: ICache) extends CoreBundle()(outer.p) { val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission - val resp = Decoupled(new ICacheResp) + val resp = Decoupled(UInt(width = coreInstBits * fetchWidth)) val invalidate = Bool(INPUT) val mem = outer.node.bundleOut } @@ -110,7 +110,8 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) } val s1_tag_disparity = Wire(Vec(nWays, Bool())) - val s1_dout = Wire(Vec(nWays, UInt(width = code.width(rowBits)))) + val wordBits = coreInstBits * fetchWidth + val s1_dout = Wire(Vec(nWays, UInt(width = code.width(wordBits)))) val s1_dout_valid = RegNext(s0_valid) for (i <- 0 until nWays) { @@ -119,7 +120,24 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) s1_tag_hit(i) := s1_vb && ((code.decode(tag_rdata(i)).uncorrected === s1_tag) holdUnless s1_dout_valid) } - val data_arrays = Seq.fill(nWays) { SeqMem(nSets * refillCycles, Bits(width = code.width(rowBits))) } + require(rowBits % wordBits == 0) + val data_arrays = Seq.fill(rowBits / wordBits) { SeqMem(nSets * refillCycles, Vec(nWays, UInt(width = code.width(wordBits)))) } + for ((data_array, i) <- data_arrays zipWithIndex) { + val wen = tl_out.d.valid + when (wen) { + val idx = (refill_idx << log2Ceil(refillCycles)) | refill_cnt + val data = tl_out.d.bits.data(wordBits*(i+1)-1, wordBits*i) + data_array.write(idx, Vec.fill(nWays)(code.encode(data)), (0 until nWays).map(repl_way === _)) + } + def wordMatch(addr: UInt) = addr.extract(log2Ceil(rowBytes)-1, log2Ceil(wordBits/8)) === i + val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles)) + val dout = data_array.read(s0_raddr, !wen && (s0_valid && wordMatch(s0_vaddr))) holdUnless s1_dout_valid + when (wordMatch(io.s1_paddr)) { + s1_dout := dout + } + } + +/* for ((data_array, i) <- data_arrays zipWithIndex) { val wen = tl_out.d.valid && repl_way === UInt(i) when (wen) { @@ -129,12 +147,13 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles)) s1_dout(i) := data_array.read(s0_raddr, !wen && s0_valid) holdUnless s1_dout_valid } +*/ // output signals outer.latency match { case 1 => require(code.width(rowBits) == rowBits) // no ECC - io.resp.bits.datablock := Mux1H(s1_tag_hit, s1_dout) + io.resp.bits := Mux1H(s1_tag_hit, s1_dout) io.resp.valid := s1_hit case 2 => val s2_valid = RegEnable(out_valid, Bool(false), !stall) @@ -148,7 +167,7 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) val s2_disparity = s2_tag_disparity || s2_data_disparity when (s2_valid && s2_disparity) { invalidate := true } - io.resp.bits.datablock := code.decode(s2_way_mux).uncorrected + io.resp.bits := code.decode(s2_way_mux).uncorrected io.resp.valid := s2_hit && !s2_disparity } tl_out.a.valid := state === s_request && !io.s2_kill