From 061a0adceb26a9f539bc12b8d57dfcb1477df280 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Tue, 18 Apr 2017 17:55:04 -0700 Subject: [PATCH 01/14] Fetch smaller parcels from the I$ --- src/main/scala/rocket/Frontend.scala | 12 +++--------- src/main/scala/rocket/ICache.scala | 29 +++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/main/scala/rocket/Frontend.scala b/src/main/scala/rocket/Frontend.scala index 3590855c..0d3b3e10 100644 --- a/src/main/scala/rocket/Frontend.scala +++ b/src/main/scala/rocket/Frontend.scala @@ -68,7 +68,6 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) val s1_pc_ = Reg(UInt(width=vaddrBitsExtended)) val s1_pc = ~(~s1_pc_ | (coreInstBytes-1)) // discard PC LSBS (this propagates down the pipeline) val s1_speculative = Reg(Bool()) - val s1_same_block = Reg(Bool()) val s2_valid = Reg(init=Bool(true)) val s2_pc = Reg(init=io.resetVector) val s2_btb_resp_valid = Reg(init=Bool(false)) @@ -83,16 +82,13 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) val s2_cacheable = Reg(init=Bool(false)) val ntpc = ~(~s1_pc | (coreInstBytes*fetchWidth-1)) + UInt(coreInstBytes*fetchWidth) - val ntpc_same_block = (ntpc & rowBytes) === (s1_pc & rowBytes) val predicted_npc = Wire(init = ntpc) val predicted_taken = Wire(init = Bool(false)) val icmiss = s2_valid && !icache.io.resp.valid val npc = Mux(icmiss, s2_pc, predicted_npc) - val s0_same_block = !predicted_taken && !icmiss && !io.cpu.req.valid && ntpc_same_block val stall = io.cpu.resp.valid && !io.cpu.resp.ready when (!stall) { - s1_same_block := s0_same_block && !tlb.io.resp.miss s1_pc_ := io.cpu.npc // consider RVC fetches across blocks to be non-speculative if the first // part was non-speculative @@ -111,7 +107,6 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) } } when (io.cpu.req.valid) { - s1_same_block := Bool(false) s1_pc_ := io.cpu.npc s1_speculative := io.cpu.req.bits.speculative s2_valid := Bool(false) @@ -144,21 +139,20 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) tlb.io.req.bits.sfence := io.cpu.sfence tlb.io.req.bits.size := log2Ceil(coreInstBytes*fetchWidth) - icache.io.req.valid := !stall && !s0_same_block + icache.io.req.valid := !stall icache.io.req.bits.addr := io.cpu.npc icache.io.invalidate := io.cpu.flush_icache icache.io.s1_paddr := tlb.io.resp.paddr icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || icmiss || s1_speculative && !tlb.io.resp.cacheable || tlb.io.resp.pf.inst || tlb.io.resp.ae.inst icache.io.s2_kill := false - icache.io.resp.ready := !stall && !s1_same_block + icache.io.resp.ready := !stall val s2_kill = s2_speculative && !s2_cacheable || s2_xcpt io.cpu.resp.valid := s2_valid && (icache.io.resp.valid || s2_kill) io.cpu.resp.bits.pc := s2_pc io.cpu.npc := Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc) - require(fetchWidth * coreInstBytes <= rowBytes && isPow2(fetchWidth)) - io.cpu.resp.bits.data := icache.io.resp.bits.datablock >> (s2_pc.extract(log2Ceil(rowBytes)-1,log2Ceil(fetchWidth*coreInstBytes)) << log2Ceil(fetchWidth*coreInstBits)) + io.cpu.resp.bits.data := icache.io.resp.bits io.cpu.resp.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes)) io.cpu.resp.bits.pf := s2_pf io.cpu.resp.bits.ae := s2_ae diff --git a/src/main/scala/rocket/ICache.scala b/src/main/scala/rocket/ICache.scala index 16934347..6c0b8708 100644 --- a/src/main/scala/rocket/ICache.scala +++ b/src/main/scala/rocket/ICache.scala @@ -47,7 +47,7 @@ class ICacheBundle(outer: ICache) extends CoreBundle()(outer.p) { val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission - val resp = Decoupled(new ICacheResp) + val resp = Decoupled(UInt(width = coreInstBits * fetchWidth)) val invalidate = Bool(INPUT) val mem = outer.node.bundleOut } @@ -110,7 +110,8 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) } val s1_tag_disparity = Wire(Vec(nWays, Bool())) - val s1_dout = Wire(Vec(nWays, UInt(width = code.width(rowBits)))) + val wordBits = coreInstBits * fetchWidth + val s1_dout = Wire(Vec(nWays, UInt(width = code.width(wordBits)))) val s1_dout_valid = RegNext(s0_valid) for (i <- 0 until nWays) { @@ -119,7 +120,24 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) s1_tag_hit(i) := s1_vb && ((code.decode(tag_rdata(i)).uncorrected === s1_tag) holdUnless s1_dout_valid) } - val data_arrays = Seq.fill(nWays) { SeqMem(nSets * refillCycles, Bits(width = code.width(rowBits))) } + require(rowBits % wordBits == 0) + val data_arrays = Seq.fill(rowBits / wordBits) { SeqMem(nSets * refillCycles, Vec(nWays, UInt(width = code.width(wordBits)))) } + for ((data_array, i) <- data_arrays zipWithIndex) { + val wen = tl_out.d.valid + when (wen) { + val idx = (refill_idx << log2Ceil(refillCycles)) | refill_cnt + val data = tl_out.d.bits.data(wordBits*(i+1)-1, wordBits*i) + data_array.write(idx, Vec.fill(nWays)(code.encode(data)), (0 until nWays).map(repl_way === _)) + } + def wordMatch(addr: UInt) = addr.extract(log2Ceil(rowBytes)-1, log2Ceil(wordBits/8)) === i + val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles)) + val dout = data_array.read(s0_raddr, !wen && (s0_valid && wordMatch(s0_vaddr))) holdUnless s1_dout_valid + when (wordMatch(io.s1_paddr)) { + s1_dout := dout + } + } + +/* for ((data_array, i) <- data_arrays zipWithIndex) { val wen = tl_out.d.valid && repl_way === UInt(i) when (wen) { @@ -129,12 +147,13 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles)) s1_dout(i) := data_array.read(s0_raddr, !wen && s0_valid) holdUnless s1_dout_valid } +*/ // output signals outer.latency match { case 1 => require(code.width(rowBits) == rowBits) // no ECC - io.resp.bits.datablock := Mux1H(s1_tag_hit, s1_dout) + io.resp.bits := Mux1H(s1_tag_hit, s1_dout) io.resp.valid := s1_hit case 2 => val s2_valid = RegEnable(out_valid, Bool(false), !stall) @@ -148,7 +167,7 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) val s2_disparity = s2_tag_disparity || s2_data_disparity when (s2_valid && s2_disparity) { invalidate := true } - io.resp.bits.datablock := code.decode(s2_way_mux).uncorrected + io.resp.bits := code.decode(s2_way_mux).uncorrected io.resp.valid := s2_hit && !s2_disparity } tl_out.a.valid := state === s_request && !io.s2_kill From d24d8ff84badd3bf10303598db415d429ef331ff Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Wed, 19 Apr 2017 16:51:39 -0700 Subject: [PATCH 02/14] Don't stall the frontend, making it easier to add more features later --- src/main/scala/rocket/Frontend.scala | 80 ++++++++++++++-------------- src/main/scala/rocket/IBuf.scala | 2 +- src/main/scala/rocket/ICache.scala | 25 +++++---- src/main/scala/rocket/Rocket.scala | 2 +- 4 files changed, 55 insertions(+), 54 deletions(-) diff --git a/src/main/scala/rocket/Frontend.scala b/src/main/scala/rocket/Frontend.scala index 0d3b3e10..c6f14053 100644 --- a/src/main/scala/rocket/Frontend.scala +++ b/src/main/scala/rocket/Frontend.scala @@ -5,6 +5,7 @@ package rocket import Chisel._ import Chisel.ImplicitConversions._ +import chisel3.core.withReset import config._ import coreplex._ import diplomacy._ @@ -64,7 +65,9 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) val icache = outer.icache.module val tlb = Module(new TLB(log2Ceil(coreInstBytes*fetchWidth), nTLBEntries)) + val fq = withReset(reset || io.cpu.req.valid) { Module(new Queue(new FrontendResp, 3, flow = true)) } + val s0_valid = io.cpu.req.valid || fq.io.enq.ready val s1_pc_ = Reg(UInt(width=vaddrBitsExtended)) val s1_pc = ~(~s1_pc_ | (coreInstBytes-1)) // discard PC LSBS (this propagates down the pipeline) val s1_speculative = Reg(Bool()) @@ -84,32 +87,30 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) val ntpc = ~(~s1_pc | (coreInstBytes*fetchWidth-1)) + UInt(coreInstBytes*fetchWidth) val predicted_npc = Wire(init = ntpc) val predicted_taken = Wire(init = Bool(false)) - val icmiss = s2_valid && !icache.io.resp.valid - val npc = Mux(icmiss, s2_pc, predicted_npc) - val stall = io.cpu.resp.valid && !io.cpu.resp.ready - when (!stall) { - s1_pc_ := io.cpu.npc - // consider RVC fetches across blocks to be non-speculative if the first - // part was non-speculative - val s0_speculative = - if (usingCompressed) s1_speculative || s2_valid && !s2_speculative || predicted_taken - else Bool(true) - s1_speculative := Mux(icmiss, s2_speculative, s0_speculative) - s2_valid := !icmiss - when (!icmiss) { - s2_pc := s1_pc - s2_speculative := s1_speculative - s2_cacheable := tlb.io.resp.cacheable - s2_maybe_pf := tlb.io.resp.pf.inst - s2_maybe_ae := tlb.io.resp.ae.inst - s2_tlb_miss := tlb.io.resp.miss - } - } - when (io.cpu.req.valid) { - s1_pc_ := io.cpu.npc - s1_speculative := io.cpu.req.bits.speculative - s2_valid := Bool(false) + val s2_replay = Wire(Bool()) + s2_replay := + (s2_valid && (!icache.io.resp.valid || (fq.io.enq.valid && !fq.io.enq.ready))) || + RegNext(s2_replay && !s0_valid) + val npc = Mux(s2_replay, s2_pc, predicted_npc) + + s1_pc_ := io.cpu.npc + // consider RVC fetches across blocks to be non-speculative if the first + // part was non-speculative + val s0_speculative = + if (usingCompressed) s1_speculative || s2_valid && !s2_speculative || predicted_taken + else Bool(true) + s1_speculative := Mux(io.cpu.req.valid, io.cpu.req.bits.speculative, Mux(s2_replay, s2_speculative, s0_speculative)) + + s2_valid := false + when (!s2_replay && !io.cpu.req.valid) { + s2_valid := true + s2_pc := s1_pc + s2_speculative := s1_speculative + s2_cacheable := tlb.io.resp.cacheable + s2_maybe_pf := tlb.io.resp.pf.inst + s2_maybe_ae := tlb.io.resp.ae.inst + s2_tlb_miss := tlb.io.resp.miss } if (usingBTB) { @@ -119,7 +120,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) btb.io.btb_update := io.cpu.btb_update btb.io.bht_update := io.cpu.bht_update btb.io.ras_update := io.cpu.ras_update - when (!stall && !icmiss) { + when (!s2_replay) { btb.io.req.valid := true s2_btb_resp_valid := btb.io.resp.valid s2_btb_resp_bits := btb.io.resp.bits @@ -131,7 +132,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) } io.ptw <> tlb.io.ptw - tlb.io.req.valid := !stall && !icmiss + tlb.io.req.valid := !s2_replay tlb.io.req.bits.vaddr := s1_pc tlb.io.req.bits.passthrough := Bool(false) tlb.io.req.bits.instruction := Bool(true) @@ -139,26 +140,27 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) tlb.io.req.bits.sfence := io.cpu.sfence tlb.io.req.bits.size := log2Ceil(coreInstBytes*fetchWidth) - icache.io.req.valid := !stall + icache.io.req.valid := s0_valid icache.io.req.bits.addr := io.cpu.npc icache.io.invalidate := io.cpu.flush_icache icache.io.s1_paddr := tlb.io.resp.paddr - icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || icmiss || s1_speculative && !tlb.io.resp.cacheable || tlb.io.resp.pf.inst || tlb.io.resp.ae.inst + icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || s2_replay || s1_speculative && !tlb.io.resp.cacheable || tlb.io.resp.pf.inst || tlb.io.resp.ae.inst icache.io.s2_kill := false - icache.io.resp.ready := !stall val s2_kill = s2_speculative && !s2_cacheable || s2_xcpt - io.cpu.resp.valid := s2_valid && (icache.io.resp.valid || s2_kill) - io.cpu.resp.bits.pc := s2_pc + fq.io.enq.valid := s2_valid && (icache.io.resp.valid || s2_kill) + fq.io.enq.bits.pc := s2_pc io.cpu.npc := Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc) - io.cpu.resp.bits.data := icache.io.resp.bits - io.cpu.resp.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes)) - io.cpu.resp.bits.pf := s2_pf - io.cpu.resp.bits.ae := s2_ae - io.cpu.resp.bits.replay := s2_kill && !icache.io.resp.valid && !s2_xcpt - io.cpu.resp.bits.btb.valid := s2_btb_resp_valid - io.cpu.resp.bits.btb.bits := s2_btb_resp_bits + fq.io.enq.bits.data := icache.io.resp.bits + fq.io.enq.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes)) + fq.io.enq.bits.pf := s2_pf + fq.io.enq.bits.ae := s2_ae + fq.io.enq.bits.replay := s2_kill && !icache.io.resp.valid && !s2_xcpt + fq.io.enq.bits.btb.valid := s2_btb_resp_valid + fq.io.enq.bits.btb.bits := s2_btb_resp_bits + + io.cpu.resp <> fq.io.deq // performance events io.cpu.acquire := edge.done(icache.io.mem(0).a) diff --git a/src/main/scala/rocket/IBuf.scala b/src/main/scala/rocket/IBuf.scala index ac50ac10..1f209729 100644 --- a/src/main/scala/rocket/IBuf.scala +++ b/src/main/scala/rocket/IBuf.scala @@ -84,7 +84,7 @@ class IBuf(implicit p: Parameters) extends CoreModule { val ae = valid & (Mux(buf.ae, bufMask, UInt(0)) | Mux(io.imem.bits.ae, ~bufMask, UInt(0))) val ic_replay = valid & (Mux(buf.replay, bufMask, UInt(0)) | Mux(io.imem.bits.replay, ~bufMask, UInt(0))) val ibufBTBHitMask = Mux(ibufBTBHit, UIntToOH(ibufBTBResp.bridx), UInt(0)) - assert(!io.imem.bits.btb.valid || io.imem.bits.btb.bits.bridx >= pcWordBits) + assert(!io.imem.valid || !io.imem.bits.btb.valid || io.imem.bits.btb.bits.bridx >= pcWordBits) val icBTBHitMask = Mux(io.imem.bits.btb.valid, UIntToOH(io.imem.bits.btb.bits.bridx +& nBufValid - pcWordBits), UInt(0)) val btbHitMask = ibufBTBHitMask & bufMask | icBTBHitMask & ~bufMask diff --git a/src/main/scala/rocket/ICache.scala b/src/main/scala/rocket/ICache.scala index 6c0b8708..160a472e 100644 --- a/src/main/scala/rocket/ICache.scala +++ b/src/main/scala/rocket/ICache.scala @@ -47,7 +47,7 @@ class ICacheBundle(outer: ICache) extends CoreBundle()(outer.p) { val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission - val resp = Decoupled(UInt(width = coreInstBits * fetchWidth)) + val resp = Valid(UInt(width = coreInstBits * fetchWidth)) val invalidate = Bool(INPUT) val mem = outer.node.bundleOut } @@ -65,7 +65,6 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) val s_ready :: s_request :: s_refill :: Nil = Enum(UInt(), 3) val state = Reg(init=s_ready) val invalidated = Reg(Bool()) - val stall = !io.resp.ready val refill_addr = Reg(UInt(width = paddrBits)) val s1_tag_hit = Wire(Vec(nWays, Bool())) @@ -78,10 +77,10 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) val s1_hit = out_valid && s1_any_tag_hit val s1_miss = s1_valid && state === s_ready && !s1_any_tag_hit - val s0_valid = io.req.valid && state === s_ready && !(s1_valid && stall) + val s0_valid = io.req.valid && state === s_ready val s0_vaddr = io.req.bits.addr - s1_valid := s0_valid || out_valid && stall + s1_valid := s0_valid when (s1_miss) { refill_addr := io.s1_paddr } val refill_tag = refill_addr(tagBits+untagBits-1,untagBits) @@ -116,8 +115,8 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) for (i <- 0 until nWays) { val s1_vb = vb_array(Cat(UInt(i), io.s1_paddr(untagBits-1,blockOffBits))).toBool - s1_tag_disparity(i) := (code.decode(tag_rdata(i)).error holdUnless s1_dout_valid) - s1_tag_hit(i) := s1_vb && ((code.decode(tag_rdata(i)).uncorrected === s1_tag) holdUnless s1_dout_valid) + s1_tag_disparity(i) := code.decode(tag_rdata(i)).error + s1_tag_hit(i) := s1_vb && code.decode(tag_rdata(i)).uncorrected === s1_tag } require(rowBits % wordBits == 0) @@ -131,7 +130,7 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) } def wordMatch(addr: UInt) = addr.extract(log2Ceil(rowBytes)-1, log2Ceil(wordBits/8)) === i val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles)) - val dout = data_array.read(s0_raddr, !wen && (s0_valid && wordMatch(s0_vaddr))) holdUnless s1_dout_valid + val dout = data_array.read(s0_raddr, !wen && (s0_valid && wordMatch(s0_vaddr))) when (wordMatch(io.s1_paddr)) { s1_dout := dout } @@ -145,7 +144,7 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) data_array.write((refill_idx << log2Ceil(refillCycles)) | refill_cnt, e_d) } val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles)) - s1_dout(i) := data_array.read(s0_raddr, !wen && s0_valid) holdUnless s1_dout_valid + s1_dout(i) := data_array.read(s0_raddr, !wen && s0_valid) } */ @@ -156,13 +155,13 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer) io.resp.bits := Mux1H(s1_tag_hit, s1_dout) io.resp.valid := s1_hit case 2 => - val s2_valid = RegEnable(out_valid, Bool(false), !stall) - val s2_hit = RegEnable(s1_hit, Bool(false), !stall) - val s2_tag_hit = RegEnable(s1_tag_hit, !stall) - val s2_dout = RegEnable(s1_dout, !stall) + val s2_valid = RegNext(out_valid, Bool(false)) + val s2_hit = RegNext(s1_hit, Bool(false)) + val s2_tag_hit = RegEnable(s1_tag_hit, s1_valid) + val s2_dout = RegEnable(s1_dout, s1_valid) val s2_way_mux = Mux1H(s2_tag_hit, s2_dout) - val s2_tag_disparity = RegEnable(s1_tag_disparity, !stall).asUInt.orR + val s2_tag_disparity = RegEnable(s1_tag_disparity, s1_valid).asUInt.orR val s2_data_disparity = code.decode(s2_way_mux).error val s2_disparity = s2_tag_disparity || s2_data_disparity when (s2_valid && s2_disparity) { invalidate := true } diff --git a/src/main/scala/rocket/Rocket.scala b/src/main/scala/rocket/Rocket.scala index fde726aa..b3c92cfe 100644 --- a/src/main/scala/rocket/Rocket.scala +++ b/src/main/scala/rocket/Rocket.scala @@ -173,7 +173,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) val ibuf = Module(new IBuf) val id_expanded_inst = ibuf.io.inst.map(_.bits.inst) val id_inst = id_expanded_inst.map(_.bits) - ibuf.io.imem <> (if (usingCompressed) withReset(reset || take_pc) { Queue(io.imem.resp, 1, flow = true) } else io.imem.resp) + ibuf.io.imem <> io.imem.resp ibuf.io.kill := take_pc require(decodeWidth == 1 /* TODO */ && retireWidth == decodeWidth) From bf861293d968ff16712b6950996b15656c167c2b Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 21 Apr 2017 18:01:09 -0700 Subject: [PATCH 03/14] Add ShiftQueue; use it --- src/main/scala/rocket/Frontend.scala | 2 +- src/main/scala/util/ShiftQueue.scala | 61 ++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 src/main/scala/util/ShiftQueue.scala diff --git a/src/main/scala/rocket/Frontend.scala b/src/main/scala/rocket/Frontend.scala index c6f14053..126f2b25 100644 --- a/src/main/scala/rocket/Frontend.scala +++ b/src/main/scala/rocket/Frontend.scala @@ -65,7 +65,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) val icache = outer.icache.module val tlb = Module(new TLB(log2Ceil(coreInstBytes*fetchWidth), nTLBEntries)) - val fq = withReset(reset || io.cpu.req.valid) { Module(new Queue(new FrontendResp, 3, flow = true)) } + val fq = withReset(reset || io.cpu.req.valid) { Module(new ShiftQueue(new FrontendResp, 3, flow = true)) } val s0_valid = io.cpu.req.valid || fq.io.enq.ready val s1_pc_ = Reg(UInt(width=vaddrBitsExtended)) diff --git a/src/main/scala/util/ShiftQueue.scala b/src/main/scala/util/ShiftQueue.scala new file mode 100644 index 00000000..ca7e198d --- /dev/null +++ b/src/main/scala/util/ShiftQueue.scala @@ -0,0 +1,61 @@ +// See LICENSE.SiFive for license details. + +package util + +import Chisel._ + +/** Implements the same interface as chisel3.util.Queue, but uses a shift + * register internally. It is less energy efficient whenever the queue + * has more than one entry populated, but is faster on the dequeue side. + * It is efficient for usually-empty flow-through queues. */ +class ShiftQueue[T <: Data](gen: T, + val entries: Int, + pipe: Boolean = false, + flow: Boolean = false) + extends Module { + val io = IO(new QueueIO(gen, entries) { + val mask = UInt(OUTPUT, entries) + }) + + private val ram = Mem(entries, gen) + private val valid = RegInit(UInt(0, entries)) + private val elts = Reg(Vec(entries, gen)) + + private val do_enq = Wire(init=io.enq.fire()) + private val do_deq = Wire(init=io.deq.fire()) + + when (do_deq) { + when (!do_enq) { valid := (valid >> 1) } + for (i <- 1 until entries) + when (valid(i)) { elts(i-1) := elts(i) } + } + when (do_enq && do_deq) { + for (i <- 0 until entries) + when (valid(i) && (if (i == entries-1) true.B else !valid(i+1))) { elts(i) := io.enq.bits } + } + when (do_enq && !do_deq) { + valid := (valid << 1) | UInt(1) + for (i <- 0 until entries) + when (!valid(i) && (if (i == 0) true.B else valid(i-1))) { elts(i) := io.enq.bits } + } + + io.enq.ready := !valid(entries-1) + io.deq.valid := valid(0) + io.deq.bits := elts.head + + if (flow) { + when (io.enq.valid) { io.deq.valid := true.B } + when (!valid(0)) { + io.deq.bits := io.enq.bits + do_deq := false.B + when (io.deq.ready) { do_enq := false.B } + } + } + + if (pipe) { + when (io.deq.ready) { io.enq.ready := true.B } + } + + io.count := PopCount(valid) + io.mask := valid +} From c36c171202aedc1c78568aa234c1751a23e23be2 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 21 Apr 2017 18:01:32 -0700 Subject: [PATCH 04/14] Use correct interrupt priority order --- src/main/scala/rocket/CSR.scala | 14 +++++++++++--- src/main/scala/util/Package.scala | 10 +++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/main/scala/rocket/CSR.scala b/src/main/scala/rocket/CSR.scala index 1ec4db24..661cb1b8 100644 --- a/src/main/scala/rocket/CSR.scala +++ b/src/main/scala/rocket/CSR.scala @@ -300,10 +300,10 @@ class CSRFile(perfEventSets: EventSets = new EventSets(Seq()))(implicit p: Param val pending_interrupts = read_mip & reg_mie val m_interrupts = Mux(reg_mstatus.prv <= PRV.S || (reg_mstatus.prv === PRV.M && reg_mstatus.mie), pending_interrupts & ~reg_mideleg, UInt(0)) val s_interrupts = Mux(m_interrupts === 0 && (reg_mstatus.prv < PRV.S || (reg_mstatus.prv === PRV.S && reg_mstatus.sie)), pending_interrupts & reg_mideleg, UInt(0)) - val all_interrupts = m_interrupts | s_interrupts + val (anyInterrupt, whichInterrupt) = chooseInterrupt(Seq(s_interrupts, m_interrupts)) val interruptMSB = BigInt(1) << (xLen-1) - val interruptCause = UInt(interruptMSB) + PriorityEncoder(all_interrupts) - io.interrupt := all_interrupts.orR && !reg_debug && !io.singleStep || reg_singleStepped + val interruptCause = UInt(interruptMSB) + whichInterrupt + io.interrupt := anyInterrupt && !reg_debug && !io.singleStep || reg_singleStepped io.interrupt_cause := interruptCause io.bp := reg_bp take nBreakpoints io.pmp := reg_pmp.map(PMP(_)) @@ -758,6 +758,14 @@ class CSRFile(perfEventSets: EventSets = new EventSets(Seq()))(implicit p: Param } } + def chooseInterrupt(masks: Seq[UInt]) = { + // we can't simply choose the highest-numbered interrupt, because timer + // interrupts are in the wrong place in mip. + val timerMask = UInt(0xF0, xLen) + val masked = masks.map(m => Cat(m.padTo(xLen) & ~timerMask, m.padTo(xLen) & timerMask)) + (masks.map(_.orR).reduce(_||_), Log2(masked.asUInt)(log2Ceil(xLen)-1, 0)) + } + def readModifyWriteCSR(cmd: UInt, rdata: UInt, wdata: UInt) = (Mux(cmd.isOneOf(CSR.S, CSR.C), rdata, UInt(0)) | wdata) & ~Mux(cmd === CSR.C, wdata, UInt(0)) diff --git a/src/main/scala/util/Package.scala b/src/main/scala/util/Package.scala index d0983c40..85667677 100644 --- a/src/main/scala/util/Package.scala +++ b/src/main/scala/util/Package.scala @@ -38,9 +38,17 @@ package object util { implicit def wcToUInt(c: WideCounter): UInt = c.value implicit class UIntToAugmentedUInt(val x: UInt) extends AnyVal { - def sextTo(n: Int): UInt = + def sextTo(n: Int): UInt = { + require(x.getWidth <= n) if (x.getWidth == n) x else Cat(Fill(n - x.getWidth, x(x.getWidth-1)), x) + } + + def padTo(n: Int): UInt = { + require(x.getWidth <= n) + if (x.getWidth == n) x + else Cat(UInt(0, n - x.getWidth), x) + } def extract(hi: Int, lo: Int): UInt = { if (hi == lo-1) UInt(0) From 3b2c15b64890c17de30085f67a35c0a85aab6f2c Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 21 Apr 2017 18:01:56 -0700 Subject: [PATCH 05/14] Use tininess-after-rounding in FPU --- src/main/scala/tile/FPU.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main/scala/tile/FPU.scala b/src/main/scala/tile/FPU.scala index 0006f786..4fcc088f 100644 --- a/src/main/scala/tile/FPU.scala +++ b/src/main/scala/tile/FPU.scala @@ -450,6 +450,7 @@ class IntToFP(val latency: Int)(implicit p: Parameters) extends FPUModule()(p) { l2s.io.signedIn := ~in.bits.typ(0) l2s.io.in := intValue l2s.io.roundingMode := in.bits.rm + l2s.io.detectTininess := hardfloat.consts.tininess_afterRounding mux.data := sanitizeNaN(l2s.io.out, FType.S) mux.exc := l2s.io.exceptionFlags @@ -460,6 +461,7 @@ class IntToFP(val latency: Int)(implicit p: Parameters) extends FPUModule()(p) { l2d.io.signedIn := ~in.bits.typ(0) l2d.io.in := intValue l2d.io.roundingMode := in.bits.rm + l2d.io.detectTininess := hardfloat.consts.tininess_afterRounding mux.data := Cat(l2d.io.out >> l2s.io.out.getWidth, l2s.io.out) when (!in.bits.singleIn) { mux.data := sanitizeNaN(l2d.io.out, FType.D) @@ -511,11 +513,13 @@ class FPToFP(val latency: Int)(implicit p: Parameters) extends FPUModule()(p) { val d2s = Module(new hardfloat.RecFNToRecFN(dExpWidth, dSigWidth, sExpWidth, sSigWidth)) d2s.io.in := in.bits.in1 d2s.io.roundingMode := in.bits.rm + d2s.io.detectTininess := hardfloat.consts.tininess_afterRounding val d2sOut = sanitizeNaN(d2s.io.out, FType.S) val s2d = Module(new hardfloat.RecFNToRecFN(sExpWidth, sSigWidth, dExpWidth, dSigWidth)) s2d.io.in := maxType.unsafeConvert(in.bits.in1, FType.S) s2d.io.roundingMode := in.bits.rm + s2d.io.detectTininess := hardfloat.consts.tininess_afterRounding val s2dOut = sanitizeNaN(s2d.io.out, FType.D) when (in.bits.singleOut) { @@ -554,6 +558,7 @@ class FPUFMAPipe(val latency: Int, t: FType)(implicit p: Parameters) extends FPU val fma = Module(new hardfloat.MulAddRecFN(t.exp, t.sig)) fma.io.op := in.fmaCmd fma.io.roundingMode := in.rm + fma.io.detectTininess := hardfloat.consts.tininess_afterRounding fma.io.a := in.in1 fma.io.b := in.in2 fma.io.c := in.in3 @@ -775,6 +780,7 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) { divSqrt.io.a := fpiu.io.out.bits.in.in1 divSqrt.io.b := fpiu.io.out.bits.in.in2 divSqrt.io.roundingMode := fpiu.io.out.bits.in.rm + divSqrt.io.detectTininess := hardfloat.consts.tininess_afterRounding when (divSqrt.io.inValid && divSqrt_inReady) { divSqrt_in_flight := true @@ -794,6 +800,7 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) { val divSqrt_toSingle = Module(new hardfloat.RecFNToRecFN(11, 53, 8, 24)) divSqrt_toSingle.io.in := divSqrt_wdata_double divSqrt_toSingle.io.roundingMode := divSqrt_rm + divSqrt_toSingle.io.detectTininess := hardfloat.consts.tininess_afterRounding divSqrt_wdata := Mux(divSqrt_single, Cat(divSqrt_wdata_double >> divSqrt_toSingle.io.out.getWidth, sanitizeNaN(divSqrt_toSingle.io.out, FType.S)), divSqrt_wdata_double) divSqrt_flags := divSqrt_flags_double | Mux(divSqrt_single, divSqrt_toSingle.io.exceptionFlags, Bits(0)) } else { From f2d4cb8152218ff6ce2c01d7ba4bb90ccd01a1b2 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sat, 22 Apr 2017 21:35:19 -0700 Subject: [PATCH 06/14] Update RAS speculatively from fetch stage --- src/main/scala/rocket/BTB.scala | 34 ++++++++++++++++------------ src/main/scala/rocket/Frontend.scala | 12 +++++++++- src/main/scala/rocket/Rocket.scala | 12 ++++------ 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/src/main/scala/rocket/BTB.scala b/src/main/scala/rocket/BTB.scala index 15cb5d24..1d9491bc 100644 --- a/src/main/scala/rocket/BTB.scala +++ b/src/main/scala/rocket/BTB.scala @@ -86,6 +86,15 @@ class BHT(nbht: Int)(implicit val p: Parameters) extends HasCoreParameters { val history = Reg(UInt(width = nbhtbits)) } +object CFIType { + def SZ = 2 + def apply() = UInt(width = SZ) + def branch = 0.U + def jump = 1.U + def call = 2.U + def ret = 3.U +} + // BTB update occurs during branch resolution (and only on a mispredict). // - "pc" is what future fetch PCs will tag match against. // - "br_pc" is the PC of the branch instruction. @@ -95,9 +104,8 @@ class BTBUpdate(implicit p: Parameters) extends BtbBundle()(p) { val target = UInt(width = vaddrBits) val taken = Bool() val isValid = Bool() - val isJump = Bool() - val isReturn = Bool() val br_pc = UInt(width = vaddrBits) + val cfiType = CFIType() } // BHT update occurs during branch resolution on all conditional branches. @@ -110,8 +118,7 @@ class BHTUpdate(implicit p: Parameters) extends BtbBundle()(p) { } class RASUpdate(implicit p: Parameters) extends BtbBundle()(p) { - val isCall = Bool() - val isReturn = Bool() + val cfiType = CFIType() val returnAddr = UInt(width = vaddrBits) val prediction = Valid(new BTBResp) } @@ -121,6 +128,7 @@ class RASUpdate(implicit p: Parameters) extends BtbBundle()(p) { // - "mask" provides a mask of valid instructions (instructions are // masked off by the predicted taken branch from the BTB). class BTBResp(implicit p: Parameters) extends BtbBundle()(p) { + val cfiType = CFIType() val taken = Bool() val mask = Bits(width = fetchWidth) val bridx = Bits(width = log2Up(fetchWidth)) @@ -154,8 +162,7 @@ class BTB(implicit p: Parameters) extends BtbModule { val pageValid = Reg(init = UInt(0, nPages)) val isValid = Reg(init = UInt(0, entries)) - val isReturn = Reg(UInt(width = entries)) - val isJump = Reg(UInt(width = entries)) + val cfiType = Reg(Vec(entries, CFIType())) val brIdx = Reg(Vec(entries, UInt(width=log2Up(fetchWidth)))) private def page(addr: UInt) = addr >> matchBits @@ -210,9 +217,8 @@ class BTB(implicit p: Parameters) extends BtbModule { tgts(waddr) := update_target(matchBits-1, log2Up(coreInstBytes)) idxPages(waddr) := idxPageUpdate +& 1 // the +1 corresponds to the <<1 on io.resp.valid tgtPages(waddr) := tgtPageUpdate + cfiType(waddr) := r_btb_update.bits.cfiType isValid := Mux(r_btb_update.bits.isValid, isValid | mask, isValid & ~mask) - isReturn := Mux(r_btb_update.bits.isReturn, isReturn | mask, isReturn & ~mask) - isJump := Mux(r_btb_update.bits.isJump, isJump | mask, isJump & ~mask) if (fetchWidth > 1) brIdx(waddr) := r_btb_update.bits.br_pc >> log2Up(coreInstBytes) @@ -236,6 +242,7 @@ class BTB(implicit p: Parameters) extends BtbModule { io.resp.bits.entry := OHToUInt(idxHit) io.resp.bits.bridx := (if (fetchWidth > 1) Mux1H(idxHit, brIdx) else UInt(0)) io.resp.bits.mask := Cat((UInt(1) << ~Mux(io.resp.bits.taken, ~io.resp.bits.bridx, UInt(0)))-1, UInt(1)) + io.resp.bits.cfiType := Mux1H(idxHit, cfiType) // if multiple entries for same PC land in BTB, zap them when (PopCountAtLeast(idxHit, 2)) { @@ -244,7 +251,7 @@ class BTB(implicit p: Parameters) extends BtbModule { if (nBHT > 0) { val bht = new BHT(nBHT) - val isBranch = !(idxHit & isJump).orR + val isBranch = (idxHit & cfiType.map(_ === CFIType.branch).asUInt).orR val res = bht.get(io.req.bits.addr, io.req.valid && io.resp.valid && isBranch) val update_btb_hit = io.bht_update.bits.prediction.valid when (io.bht_update.valid && update_btb_hit) { @@ -256,17 +263,14 @@ class BTB(implicit p: Parameters) extends BtbModule { if (nRAS > 0) { val ras = new RAS(nRAS) - val doPeek = (idxHit & isReturn).orR + val doPeek = (idxHit & cfiType.map(_ === CFIType.ret).asUInt).orR when (!ras.isEmpty && doPeek) { io.resp.bits.target := ras.peek } when (io.ras_update.valid) { - when (io.ras_update.bits.isCall) { + when (io.ras_update.bits.cfiType === CFIType.call) { ras.push(io.ras_update.bits.returnAddr) - when (doPeek) { - io.resp.bits.target := io.ras_update.bits.returnAddr - } - }.elsewhen (io.ras_update.bits.isReturn && io.ras_update.bits.prediction.valid) { + }.elsewhen (io.ras_update.bits.cfiType === CFIType.ret && io.ras_update.bits.prediction.valid) { ras.pop() } } diff --git a/src/main/scala/rocket/Frontend.scala b/src/main/scala/rocket/Frontend.scala index 126f2b25..27a02fd3 100644 --- a/src/main/scala/rocket/Frontend.scala +++ b/src/main/scala/rocket/Frontend.scala @@ -84,7 +84,9 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) val s2_speculative = Reg(init=Bool(false)) val s2_cacheable = Reg(init=Bool(false)) - val ntpc = ~(~s1_pc | (coreInstBytes*fetchWidth-1)) + UInt(coreInstBytes*fetchWidth) + val fetchBytes = coreInstBytes * fetchWidth + val s1_base_pc = ~(~s1_pc | (fetchBytes - 1)) + val ntpc = s1_base_pc + fetchBytes.U val predicted_npc = Wire(init = ntpc) val predicted_taken = Wire(init = Bool(false)) @@ -129,6 +131,14 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) predicted_npc := btb.io.resp.bits.target.sextTo(vaddrBitsExtended) predicted_taken := Bool(true) } + + // push RAS speculatively + btb.io.ras_update.valid := btb.io.req.valid && btb.io.resp.valid && btb.io.resp.bits.cfiType.isOneOf(CFIType.call, CFIType.ret) + val returnAddrLSBs = btb.io.resp.bits.bridx +& 1 + btb.io.ras_update.bits.returnAddr := + Mux(returnAddrLSBs(log2Ceil(fetchWidth)), ntpc, s1_base_pc | ((returnAddrLSBs << log2Ceil(coreInstBytes)) & (fetchBytes - 1))) + btb.io.ras_update.bits.cfiType := btb.io.resp.bits.cfiType + btb.io.ras_update.bits.prediction.valid := true } io.ptw <> tlb.io.ptw diff --git a/src/main/scala/rocket/Rocket.scala b/src/main/scala/rocket/Rocket.scala index b3c92cfe..e7005050 100644 --- a/src/main/scala/rocket/Rocket.scala +++ b/src/main/scala/rocket/Rocket.scala @@ -587,8 +587,10 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) io.imem.btb_update.valid := (mem_reg_replay && mem_reg_btb_hit) || (mem_reg_valid && !take_pc_wb && (((mem_cfi_taken || !mem_cfi) && mem_wrong_npc) || (Bool(fastJAL) && mem_ctrl.jal && !mem_reg_btb_hit))) io.imem.btb_update.bits.isValid := !mem_reg_replay && mem_cfi - io.imem.btb_update.bits.isJump := mem_ctrl.jal || mem_ctrl.jalr - io.imem.btb_update.bits.isReturn := mem_ctrl.jalr && mem_reg_inst(19,15) === BitPat("b00?01") + io.imem.btb_update.bits.cfiType := + Mux(mem_ctrl.jalr && mem_reg_inst(19,15) === BitPat("b00?01"), CFIType.ret, + Mux(mem_ctrl.jal || mem_ctrl.jalr, Mux(mem_waddr(0), CFIType.call, CFIType.jump), + CFIType.branch)) io.imem.btb_update.bits.target := io.imem.req.bits.pc io.imem.btb_update.bits.br_pc := (if (usingCompressed) mem_reg_pc + Mux(mem_reg_rvc, UInt(0), UInt(2)) else mem_reg_pc) io.imem.btb_update.bits.pc := ~(~io.imem.btb_update.bits.br_pc | (coreInstBytes*fetchWidth-1)) @@ -601,12 +603,6 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) io.imem.bht_update.bits.mispredict := mem_wrong_npc io.imem.bht_update.bits.prediction := io.imem.btb_update.bits.prediction - io.imem.ras_update.valid := mem_reg_valid && !take_pc_wb - io.imem.ras_update.bits.returnAddr := mem_int_wdata - io.imem.ras_update.bits.isCall := io.imem.btb_update.bits.isJump && mem_waddr(0) - io.imem.ras_update.bits.isReturn := io.imem.btb_update.bits.isReturn - io.imem.ras_update.bits.prediction := io.imem.btb_update.bits.prediction - io.fpu.valid := !ctrl_killd && id_ctrl.fp io.fpu.killx := ctrl_killx io.fpu.killm := killm_common From 845e6f7458d1e5f04bb0c6117007eeb47097fb9a Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sat, 22 Apr 2017 22:12:26 -0700 Subject: [PATCH 07/14] Filter out duplicate test suites I botched the refactoring in 5934c7b4b91c0e19d4f8f7ae8baa6ec034d59391 --- src/main/scala/rocketchip/RocketTestSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/scala/rocketchip/RocketTestSuite.scala b/src/main/scala/rocketchip/RocketTestSuite.scala index 1cdcf317..32e7b664 100644 --- a/src/main/scala/rocketchip/RocketTestSuite.scala +++ b/src/main/scala/rocketchip/RocketTestSuite.scala @@ -4,7 +4,7 @@ package rocketchip import Chisel._ -import scala.collection.mutable.{LinkedHashSet, ArrayBuffer} +import scala.collection.mutable.LinkedHashSet abstract class RocketTestSuite { val dir: String @@ -56,9 +56,9 @@ class RegressionTestSuite(val names: LinkedHashSet[String]) extends RocketTestSu } object TestGeneration { - private val suites = ArrayBuffer[RocketTestSuite]() + private val suites = collection.mutable.ListMap[String, RocketTestSuite]() - def addSuite(s: RocketTestSuite) { suites += s } + def addSuite(s: RocketTestSuite) { suites += (s.makeTargetName -> s) } def addSuites(s: Seq[RocketTestSuite]) { s.foreach(addSuite) } @@ -93,7 +93,7 @@ run-$kind-tests-fast: $$(addprefix $$(output_dir)/, $$(addsuffix .run, $targets) } else { "\n" } } - suites.groupBy(_.kind).map { case (kind, s) => gen(kind, s) }.mkString("\n") + suites.values.toSeq.groupBy(_.kind).map { case (kind, s) => gen(kind, s) }.mkString("\n") } } From 36a79719759bb5c5c0115b90f440d090ae2ea51b Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sat, 22 Apr 2017 22:13:40 -0700 Subject: [PATCH 08/14] Bypass scoreboard to reduce MMIO latency --- src/main/scala/rocket/Rocket.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/rocket/Rocket.scala b/src/main/scala/rocket/Rocket.scala index e7005050..89c6f297 100644 --- a/src/main/scala/rocket/Rocket.scala +++ b/src/main/scala/rocket/Rocket.scala @@ -520,7 +520,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) val sboard = new Scoreboard(32, true) sboard.clear(ll_wen, ll_waddr) - val id_sboard_hazard = checkHazards(hazard_targets, sboard.read _) + val id_sboard_hazard = checkHazards(hazard_targets, rd => sboard.read(rd) && !(ll_wen && ll_waddr === rd)) sboard.set(wb_set_sboard && wb_wen, wb_waddr) // stall for RAW/WAW hazards on CSRs, loads, AMOs, and mul/div in execute stage. From 65928dc6a0779f3f89fffcfbc7aa6d2fbda1b23e Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Mon, 24 Apr 2017 01:16:21 -0700 Subject: [PATCH 09/14] Don't push RAS for "auipc ra, X; jalr ra, ra, Y" --- src/main/scala/rocket/Rocket.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/scala/rocket/Rocket.scala b/src/main/scala/rocket/Rocket.scala index 89c6f297..853fe734 100644 --- a/src/main/scala/rocket/Rocket.scala +++ b/src/main/scala/rocket/Rocket.scala @@ -588,9 +588,10 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) io.imem.btb_update.valid := (mem_reg_replay && mem_reg_btb_hit) || (mem_reg_valid && !take_pc_wb && (((mem_cfi_taken || !mem_cfi) && mem_wrong_npc) || (Bool(fastJAL) && mem_ctrl.jal && !mem_reg_btb_hit))) io.imem.btb_update.bits.isValid := !mem_reg_replay && mem_cfi io.imem.btb_update.bits.cfiType := + Mux((mem_ctrl.jal || mem_ctrl.jalr) && mem_waddr(0), CFIType.call, Mux(mem_ctrl.jalr && mem_reg_inst(19,15) === BitPat("b00?01"), CFIType.ret, - Mux(mem_ctrl.jal || mem_ctrl.jalr, Mux(mem_waddr(0), CFIType.call, CFIType.jump), - CFIType.branch)) + Mux(mem_ctrl.jal || mem_ctrl.jalr, CFIType.jump, + CFIType.branch))) io.imem.btb_update.bits.target := io.imem.req.bits.pc io.imem.btb_update.bits.br_pc := (if (usingCompressed) mem_reg_pc + Mux(mem_reg_rvc, UInt(0), UInt(2)) else mem_reg_pc) io.imem.btb_update.bits.pc := ~(~io.imem.btb_update.bits.br_pc | (coreInstBytes*fetchWidth-1)) From d0f3004097336bfff9222bf7e472308c159e7a12 Mon Sep 17 00:00:00 2001 From: "Wesley W. Terpstra" Date: Mon, 24 Apr 2017 15:13:58 -0700 Subject: [PATCH 10/14] tilelink2: help tools save some registers in the WidthWidget (#691) --- src/main/scala/uncore/tilelink2/WidthWidget.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/scala/uncore/tilelink2/WidthWidget.scala b/src/main/scala/uncore/tilelink2/WidthWidget.scala index 36e154b3..5c84b8b3 100644 --- a/src/main/scala/uncore/tilelink2/WidthWidget.scala +++ b/src/main/scala/uncore/tilelink2/WidthWidget.scala @@ -132,7 +132,13 @@ class TLWidthWidget(innerBeatBytes: Int)(implicit p: Parameters) extends LazyMod } else if (edgeIn.manager.beatBytes > edgeOut.manager.beatBytes) { // split input to output val repeat = Wire(Bool()) - repeat := split(edgeIn, Repeater(in, repeat), edgeOut, out) + val repeated = Repeater(in, repeat) + val cated = Wire(repeated) + cated <> repeated + edgeIn.data(cated.bits) := Cat( + edgeIn.data(repeated.bits)(edgeIn.manager.beatBytes*8-1, edgeOut.manager.beatBytes*8), + edgeIn.data(in.bits)(edgeOut.manager.beatBytes*8-1, 0)) + repeat := split(edgeIn, cated, edgeOut, out) } else { // merge input to output merge(edgeIn, in, edgeOut, out) From 11ff4dfbb9bf3702a6113990907e021442618685 Mon Sep 17 00:00:00 2001 From: "Wesley W. Terpstra" Date: Mon, 24 Apr 2017 15:58:33 -0700 Subject: [PATCH 11/14] rocket: seip (int 9) is only present if VM is enabled (#699) --- src/main/scala/tile/Interrupts.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/scala/tile/Interrupts.scala b/src/main/scala/tile/Interrupts.scala index 6210e3eb..4d76c2c4 100644 --- a/src/main/scala/tile/Interrupts.scala +++ b/src/main/scala/tile/Interrupts.scala @@ -30,7 +30,8 @@ trait HasExternalInterrupts extends HasTileParameters { // debug, msip, mtip, meip, seip, lip offsets in CSRs def csrIntMap: List[Int] = { val nlips = tileParams.core.nLocalInterrupts - List(65535, 3, 7, 11, 9) ++ List.tabulate(nlips)(_ + 16) + val seip = if (usingVM) Seq(9) else Nil + List(65535, 3, 7, 11) ++ seip ++ List.tabulate(nlips)(_ + 16) } } From 9c1d12696552c70313c0c0ba22fef49287187d88 Mon Sep 17 00:00:00 2001 From: "Wesley W. Terpstra" Date: Mon, 24 Apr 2017 19:12:37 -0700 Subject: [PATCH 12/14] Allow speculative fetch to uncacheable memory if it hits in I$ (#700) @aswaterman it's in --- src/main/scala/rocket/Frontend.scala | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/main/scala/rocket/Frontend.scala b/src/main/scala/rocket/Frontend.scala index 27a02fd3..4784cd0e 100644 --- a/src/main/scala/rocket/Frontend.scala +++ b/src/main/scala/rocket/Frontend.scala @@ -91,9 +91,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) val predicted_taken = Wire(init = Bool(false)) val s2_replay = Wire(Bool()) - s2_replay := - (s2_valid && (!icache.io.resp.valid || (fq.io.enq.valid && !fq.io.enq.ready))) || - RegNext(s2_replay && !s0_valid) + s2_replay := (s2_valid && !fq.io.enq.fire()) || RegNext(s2_replay && !s0_valid) val npc = Mux(s2_replay, s2_pc, predicted_npc) s1_pc_ := io.cpu.npc @@ -154,11 +152,10 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) icache.io.req.bits.addr := io.cpu.npc icache.io.invalidate := io.cpu.flush_icache icache.io.s1_paddr := tlb.io.resp.paddr - icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || s2_replay || s1_speculative && !tlb.io.resp.cacheable || tlb.io.resp.pf.inst || tlb.io.resp.ae.inst - icache.io.s2_kill := false + icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || s2_replay + icache.io.s2_kill := s2_speculative && !s2_cacheable || s2_xcpt - val s2_kill = s2_speculative && !s2_cacheable || s2_xcpt - fq.io.enq.valid := s2_valid && (icache.io.resp.valid || s2_kill) + fq.io.enq.valid := s2_valid && (icache.io.resp.valid || icache.io.s2_kill) fq.io.enq.bits.pc := s2_pc io.cpu.npc := Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc) @@ -166,7 +163,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer) fq.io.enq.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes)) fq.io.enq.bits.pf := s2_pf fq.io.enq.bits.ae := s2_ae - fq.io.enq.bits.replay := s2_kill && !icache.io.resp.valid && !s2_xcpt + fq.io.enq.bits.replay := icache.io.s2_kill && !icache.io.resp.valid && !s2_xcpt fq.io.enq.bits.btb.valid := s2_btb_resp_valid fq.io.enq.bits.btb.bits := s2_btb_resp_bits From 4807ce7cedae3a3394f9aae9349a8faa67011af0 Mon Sep 17 00:00:00 2001 From: "Wesley W. Terpstra" Date: Mon, 24 Apr 2017 23:28:04 -0700 Subject: [PATCH 13/14] dcache: put a flow Q to absorb back-pressure without restarting pipeline (#701) * dcache: put a flow Q to absorb back-pressure without restarting pipeline When used with a RationalCrossing, pipelined MMIO does not come out cleanly. The first beat works, but if the second beat gets stalled, the pipeline is restarted. This is a quick hacky test to absorb the beats. Perhaps a better fix can be made to achieve the same effect. * dcache: provision as few stages as possible --- src/main/scala/rocket/DCache.scala | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index 72b68fac..868c31c5 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -11,6 +11,7 @@ import uncore.tilelink2._ import uncore.util._ import util._ import TLMessages._ +import scala.math.min class DCacheDataReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) { val addr = Bits(width = untagBits) @@ -57,6 +58,19 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { data.io.req <> dataArb.io.out dataArb.io.out.ready := true + val rational = p(coreplex.RocketCrossing) match { + case coreplex.SynchronousCrossing(_) => true + case _ => false + } + + val tl_out_a = Wire(tl_out.a) + val q_depth = if (rational) min(2, maxUncachedInFlight-1) else 0 + if (q_depth <= 0) { + tl_out.a <> tl_out_a + } else { + tl_out.a <> Queue(tl_out_a, q_depth, flow = true, pipe = true) + } + val s1_valid = Reg(next=io.cpu.req.fire(), init=Bool(false)) val s1_probe = Reg(next=tl_out.b.fire(), init=Bool(false)) val probe_bits = RegEnable(tl_out.b.bits, tl_out.b.fire()) // TODO has data now :( @@ -176,7 +190,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val (s2_prb_ack_data, s2_report_param, probeNewCoh)= s2_probe_state.onProbe(probe_bits.param) val (s2_victim_dirty, s2_shrink_param, voluntaryNewCoh) = s2_victim_state.onCacheControl(M_FLUSH) val s2_update_meta = s2_hit_state =/= s2_new_hit_state - io.cpu.s2_nack := s2_valid && !s2_valid_hit && !(s2_valid_uncached && tl_out.a.ready && !uncachedInFlight.asUInt.andR) + io.cpu.s2_nack := s2_valid && !s2_valid_hit && !(s2_valid_uncached && tl_out_a.ready && !uncachedInFlight.asUInt.andR) when (io.cpu.s2_nack || (s2_valid_hit && s2_update_meta)) { s1_nack := true } val s3_valid = Reg(next = s2_valid, init=Bool(false)) @@ -285,17 +299,17 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { M_XA_MAXU -> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.MAXU)._2)) } else { // If no managers support atomics, assert fail if processor asks for them - assert (!(tl_out.a.valid && pstore1_amo && s2_write && s2_uncached)) + assert (!(tl_out_a.valid && pstore1_amo && s2_write && s2_uncached)) Wire(new TLBundleA(edge.bundle)) } - tl_out.a.valid := (s2_valid_cached_miss && !s2_victim_dirty) || + tl_out_a.valid := (s2_valid_cached_miss && !s2_victim_dirty) || (s2_valid_uncached && !uncachedInFlight.asUInt.andR) - tl_out.a.bits := Mux(!s2_uncached, acquire, Mux(!s2_write, get, Mux(!pstore1_amo, put, atomics))) + tl_out_a.bits := Mux(!s2_uncached, acquire, Mux(!s2_write, get, Mux(!pstore1_amo, put, atomics))) // Set pending bits for outstanding TileLink transaction val a_sel = UIntToOH(a_source, maxUncachedInFlight+mmioOffset) >> mmioOffset - when (tl_out.a.fire()) { + when (tl_out_a.fire()) { when (s2_uncached) { (a_sel.toBools zip (uncachedInFlight zip uncachedReqs)) foreach { case (s, (f, r)) => when (s) { @@ -518,7 +532,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val flushed = Reg(init=Bool(true)) val flushing = Reg(init=Bool(false)) val flushCounter = Counter(nSets * nWays) - when (tl_out.a.fire() && !s2_uncached) { flushed := false } + when (tl_out_a.fire() && !s2_uncached) { flushed := false } when (s2_valid_masked && s2_req.cmd === M_FLUSH_ALL) { io.cpu.s2_nack := !flushed when (!flushed) { @@ -542,6 +556,6 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { } // performance events - io.cpu.acquire := edge.done(tl_out.a) + io.cpu.acquire := edge.done(tl_out_a) io.cpu.release := edge.done(tl_out.c) } From f3ab23d0681d01db9de480450b679e1729153247 Mon Sep 17 00:00:00 2001 From: "Wesley W. Terpstra" Date: Tue, 25 Apr 2017 09:18:41 -0700 Subject: [PATCH 14/14] dcache: fix stupidly wrong crossing comparison (#703) --- src/main/scala/rocket/DCache.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index 868c31c5..63affc8d 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -59,7 +59,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { dataArb.io.out.ready := true val rational = p(coreplex.RocketCrossing) match { - case coreplex.SynchronousCrossing(_) => true + case coreplex.RationalCrossing(_) => true case _ => false }