Don't stall the frontend, making it easier to add more features later
This commit is contained in:
parent
061a0adceb
commit
d24d8ff84b
@ -5,6 +5,7 @@ package rocket
|
|||||||
|
|
||||||
import Chisel._
|
import Chisel._
|
||||||
import Chisel.ImplicitConversions._
|
import Chisel.ImplicitConversions._
|
||||||
|
import chisel3.core.withReset
|
||||||
import config._
|
import config._
|
||||||
import coreplex._
|
import coreplex._
|
||||||
import diplomacy._
|
import diplomacy._
|
||||||
@ -64,7 +65,9 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
|
|||||||
val icache = outer.icache.module
|
val icache = outer.icache.module
|
||||||
|
|
||||||
val tlb = Module(new TLB(log2Ceil(coreInstBytes*fetchWidth), nTLBEntries))
|
val tlb = Module(new TLB(log2Ceil(coreInstBytes*fetchWidth), nTLBEntries))
|
||||||
|
val fq = withReset(reset || io.cpu.req.valid) { Module(new Queue(new FrontendResp, 3, flow = true)) }
|
||||||
|
|
||||||
|
val s0_valid = io.cpu.req.valid || fq.io.enq.ready
|
||||||
val s1_pc_ = Reg(UInt(width=vaddrBitsExtended))
|
val s1_pc_ = Reg(UInt(width=vaddrBitsExtended))
|
||||||
val s1_pc = ~(~s1_pc_ | (coreInstBytes-1)) // discard PC LSBS (this propagates down the pipeline)
|
val s1_pc = ~(~s1_pc_ | (coreInstBytes-1)) // discard PC LSBS (this propagates down the pipeline)
|
||||||
val s1_speculative = Reg(Bool())
|
val s1_speculative = Reg(Bool())
|
||||||
@ -84,20 +87,24 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
|
|||||||
val ntpc = ~(~s1_pc | (coreInstBytes*fetchWidth-1)) + UInt(coreInstBytes*fetchWidth)
|
val ntpc = ~(~s1_pc | (coreInstBytes*fetchWidth-1)) + UInt(coreInstBytes*fetchWidth)
|
||||||
val predicted_npc = Wire(init = ntpc)
|
val predicted_npc = Wire(init = ntpc)
|
||||||
val predicted_taken = Wire(init = Bool(false))
|
val predicted_taken = Wire(init = Bool(false))
|
||||||
val icmiss = s2_valid && !icache.io.resp.valid
|
|
||||||
val npc = Mux(icmiss, s2_pc, predicted_npc)
|
|
||||||
|
|
||||||
val stall = io.cpu.resp.valid && !io.cpu.resp.ready
|
val s2_replay = Wire(Bool())
|
||||||
when (!stall) {
|
s2_replay :=
|
||||||
|
(s2_valid && (!icache.io.resp.valid || (fq.io.enq.valid && !fq.io.enq.ready))) ||
|
||||||
|
RegNext(s2_replay && !s0_valid)
|
||||||
|
val npc = Mux(s2_replay, s2_pc, predicted_npc)
|
||||||
|
|
||||||
s1_pc_ := io.cpu.npc
|
s1_pc_ := io.cpu.npc
|
||||||
// consider RVC fetches across blocks to be non-speculative if the first
|
// consider RVC fetches across blocks to be non-speculative if the first
|
||||||
// part was non-speculative
|
// part was non-speculative
|
||||||
val s0_speculative =
|
val s0_speculative =
|
||||||
if (usingCompressed) s1_speculative || s2_valid && !s2_speculative || predicted_taken
|
if (usingCompressed) s1_speculative || s2_valid && !s2_speculative || predicted_taken
|
||||||
else Bool(true)
|
else Bool(true)
|
||||||
s1_speculative := Mux(icmiss, s2_speculative, s0_speculative)
|
s1_speculative := Mux(io.cpu.req.valid, io.cpu.req.bits.speculative, Mux(s2_replay, s2_speculative, s0_speculative))
|
||||||
s2_valid := !icmiss
|
|
||||||
when (!icmiss) {
|
s2_valid := false
|
||||||
|
when (!s2_replay && !io.cpu.req.valid) {
|
||||||
|
s2_valid := true
|
||||||
s2_pc := s1_pc
|
s2_pc := s1_pc
|
||||||
s2_speculative := s1_speculative
|
s2_speculative := s1_speculative
|
||||||
s2_cacheable := tlb.io.resp.cacheable
|
s2_cacheable := tlb.io.resp.cacheable
|
||||||
@ -105,12 +112,6 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
|
|||||||
s2_maybe_ae := tlb.io.resp.ae.inst
|
s2_maybe_ae := tlb.io.resp.ae.inst
|
||||||
s2_tlb_miss := tlb.io.resp.miss
|
s2_tlb_miss := tlb.io.resp.miss
|
||||||
}
|
}
|
||||||
}
|
|
||||||
when (io.cpu.req.valid) {
|
|
||||||
s1_pc_ := io.cpu.npc
|
|
||||||
s1_speculative := io.cpu.req.bits.speculative
|
|
||||||
s2_valid := Bool(false)
|
|
||||||
}
|
|
||||||
|
|
||||||
if (usingBTB) {
|
if (usingBTB) {
|
||||||
val btb = Module(new BTB)
|
val btb = Module(new BTB)
|
||||||
@ -119,7 +120,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
|
|||||||
btb.io.btb_update := io.cpu.btb_update
|
btb.io.btb_update := io.cpu.btb_update
|
||||||
btb.io.bht_update := io.cpu.bht_update
|
btb.io.bht_update := io.cpu.bht_update
|
||||||
btb.io.ras_update := io.cpu.ras_update
|
btb.io.ras_update := io.cpu.ras_update
|
||||||
when (!stall && !icmiss) {
|
when (!s2_replay) {
|
||||||
btb.io.req.valid := true
|
btb.io.req.valid := true
|
||||||
s2_btb_resp_valid := btb.io.resp.valid
|
s2_btb_resp_valid := btb.io.resp.valid
|
||||||
s2_btb_resp_bits := btb.io.resp.bits
|
s2_btb_resp_bits := btb.io.resp.bits
|
||||||
@ -131,7 +132,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
|
|||||||
}
|
}
|
||||||
|
|
||||||
io.ptw <> tlb.io.ptw
|
io.ptw <> tlb.io.ptw
|
||||||
tlb.io.req.valid := !stall && !icmiss
|
tlb.io.req.valid := !s2_replay
|
||||||
tlb.io.req.bits.vaddr := s1_pc
|
tlb.io.req.bits.vaddr := s1_pc
|
||||||
tlb.io.req.bits.passthrough := Bool(false)
|
tlb.io.req.bits.passthrough := Bool(false)
|
||||||
tlb.io.req.bits.instruction := Bool(true)
|
tlb.io.req.bits.instruction := Bool(true)
|
||||||
@ -139,26 +140,27 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
|
|||||||
tlb.io.req.bits.sfence := io.cpu.sfence
|
tlb.io.req.bits.sfence := io.cpu.sfence
|
||||||
tlb.io.req.bits.size := log2Ceil(coreInstBytes*fetchWidth)
|
tlb.io.req.bits.size := log2Ceil(coreInstBytes*fetchWidth)
|
||||||
|
|
||||||
icache.io.req.valid := !stall
|
icache.io.req.valid := s0_valid
|
||||||
icache.io.req.bits.addr := io.cpu.npc
|
icache.io.req.bits.addr := io.cpu.npc
|
||||||
icache.io.invalidate := io.cpu.flush_icache
|
icache.io.invalidate := io.cpu.flush_icache
|
||||||
icache.io.s1_paddr := tlb.io.resp.paddr
|
icache.io.s1_paddr := tlb.io.resp.paddr
|
||||||
icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || icmiss || s1_speculative && !tlb.io.resp.cacheable || tlb.io.resp.pf.inst || tlb.io.resp.ae.inst
|
icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || s2_replay || s1_speculative && !tlb.io.resp.cacheable || tlb.io.resp.pf.inst || tlb.io.resp.ae.inst
|
||||||
icache.io.s2_kill := false
|
icache.io.s2_kill := false
|
||||||
icache.io.resp.ready := !stall
|
|
||||||
|
|
||||||
val s2_kill = s2_speculative && !s2_cacheable || s2_xcpt
|
val s2_kill = s2_speculative && !s2_cacheable || s2_xcpt
|
||||||
io.cpu.resp.valid := s2_valid && (icache.io.resp.valid || s2_kill)
|
fq.io.enq.valid := s2_valid && (icache.io.resp.valid || s2_kill)
|
||||||
io.cpu.resp.bits.pc := s2_pc
|
fq.io.enq.bits.pc := s2_pc
|
||||||
io.cpu.npc := Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc)
|
io.cpu.npc := Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc)
|
||||||
|
|
||||||
io.cpu.resp.bits.data := icache.io.resp.bits
|
fq.io.enq.bits.data := icache.io.resp.bits
|
||||||
io.cpu.resp.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes))
|
fq.io.enq.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes))
|
||||||
io.cpu.resp.bits.pf := s2_pf
|
fq.io.enq.bits.pf := s2_pf
|
||||||
io.cpu.resp.bits.ae := s2_ae
|
fq.io.enq.bits.ae := s2_ae
|
||||||
io.cpu.resp.bits.replay := s2_kill && !icache.io.resp.valid && !s2_xcpt
|
fq.io.enq.bits.replay := s2_kill && !icache.io.resp.valid && !s2_xcpt
|
||||||
io.cpu.resp.bits.btb.valid := s2_btb_resp_valid
|
fq.io.enq.bits.btb.valid := s2_btb_resp_valid
|
||||||
io.cpu.resp.bits.btb.bits := s2_btb_resp_bits
|
fq.io.enq.bits.btb.bits := s2_btb_resp_bits
|
||||||
|
|
||||||
|
io.cpu.resp <> fq.io.deq
|
||||||
|
|
||||||
// performance events
|
// performance events
|
||||||
io.cpu.acquire := edge.done(icache.io.mem(0).a)
|
io.cpu.acquire := edge.done(icache.io.mem(0).a)
|
||||||
|
@ -84,7 +84,7 @@ class IBuf(implicit p: Parameters) extends CoreModule {
|
|||||||
val ae = valid & (Mux(buf.ae, bufMask, UInt(0)) | Mux(io.imem.bits.ae, ~bufMask, UInt(0)))
|
val ae = valid & (Mux(buf.ae, bufMask, UInt(0)) | Mux(io.imem.bits.ae, ~bufMask, UInt(0)))
|
||||||
val ic_replay = valid & (Mux(buf.replay, bufMask, UInt(0)) | Mux(io.imem.bits.replay, ~bufMask, UInt(0)))
|
val ic_replay = valid & (Mux(buf.replay, bufMask, UInt(0)) | Mux(io.imem.bits.replay, ~bufMask, UInt(0)))
|
||||||
val ibufBTBHitMask = Mux(ibufBTBHit, UIntToOH(ibufBTBResp.bridx), UInt(0))
|
val ibufBTBHitMask = Mux(ibufBTBHit, UIntToOH(ibufBTBResp.bridx), UInt(0))
|
||||||
assert(!io.imem.bits.btb.valid || io.imem.bits.btb.bits.bridx >= pcWordBits)
|
assert(!io.imem.valid || !io.imem.bits.btb.valid || io.imem.bits.btb.bits.bridx >= pcWordBits)
|
||||||
val icBTBHitMask = Mux(io.imem.bits.btb.valid, UIntToOH(io.imem.bits.btb.bits.bridx +& nBufValid - pcWordBits), UInt(0))
|
val icBTBHitMask = Mux(io.imem.bits.btb.valid, UIntToOH(io.imem.bits.btb.bits.bridx +& nBufValid - pcWordBits), UInt(0))
|
||||||
val btbHitMask = ibufBTBHitMask & bufMask | icBTBHitMask & ~bufMask
|
val btbHitMask = ibufBTBHitMask & bufMask | icBTBHitMask & ~bufMask
|
||||||
|
|
||||||
|
@ -47,7 +47,7 @@ class ICacheBundle(outer: ICache) extends CoreBundle()(outer.p) {
|
|||||||
val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req
|
val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req
|
||||||
val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission
|
val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission
|
||||||
|
|
||||||
val resp = Decoupled(UInt(width = coreInstBits * fetchWidth))
|
val resp = Valid(UInt(width = coreInstBits * fetchWidth))
|
||||||
val invalidate = Bool(INPUT)
|
val invalidate = Bool(INPUT)
|
||||||
val mem = outer.node.bundleOut
|
val mem = outer.node.bundleOut
|
||||||
}
|
}
|
||||||
@ -65,7 +65,6 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
|
|||||||
val s_ready :: s_request :: s_refill :: Nil = Enum(UInt(), 3)
|
val s_ready :: s_request :: s_refill :: Nil = Enum(UInt(), 3)
|
||||||
val state = Reg(init=s_ready)
|
val state = Reg(init=s_ready)
|
||||||
val invalidated = Reg(Bool())
|
val invalidated = Reg(Bool())
|
||||||
val stall = !io.resp.ready
|
|
||||||
|
|
||||||
val refill_addr = Reg(UInt(width = paddrBits))
|
val refill_addr = Reg(UInt(width = paddrBits))
|
||||||
val s1_tag_hit = Wire(Vec(nWays, Bool()))
|
val s1_tag_hit = Wire(Vec(nWays, Bool()))
|
||||||
@ -78,10 +77,10 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
|
|||||||
val s1_hit = out_valid && s1_any_tag_hit
|
val s1_hit = out_valid && s1_any_tag_hit
|
||||||
val s1_miss = s1_valid && state === s_ready && !s1_any_tag_hit
|
val s1_miss = s1_valid && state === s_ready && !s1_any_tag_hit
|
||||||
|
|
||||||
val s0_valid = io.req.valid && state === s_ready && !(s1_valid && stall)
|
val s0_valid = io.req.valid && state === s_ready
|
||||||
val s0_vaddr = io.req.bits.addr
|
val s0_vaddr = io.req.bits.addr
|
||||||
|
|
||||||
s1_valid := s0_valid || out_valid && stall
|
s1_valid := s0_valid
|
||||||
|
|
||||||
when (s1_miss) { refill_addr := io.s1_paddr }
|
when (s1_miss) { refill_addr := io.s1_paddr }
|
||||||
val refill_tag = refill_addr(tagBits+untagBits-1,untagBits)
|
val refill_tag = refill_addr(tagBits+untagBits-1,untagBits)
|
||||||
@ -116,8 +115,8 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
|
|||||||
|
|
||||||
for (i <- 0 until nWays) {
|
for (i <- 0 until nWays) {
|
||||||
val s1_vb = vb_array(Cat(UInt(i), io.s1_paddr(untagBits-1,blockOffBits))).toBool
|
val s1_vb = vb_array(Cat(UInt(i), io.s1_paddr(untagBits-1,blockOffBits))).toBool
|
||||||
s1_tag_disparity(i) := (code.decode(tag_rdata(i)).error holdUnless s1_dout_valid)
|
s1_tag_disparity(i) := code.decode(tag_rdata(i)).error
|
||||||
s1_tag_hit(i) := s1_vb && ((code.decode(tag_rdata(i)).uncorrected === s1_tag) holdUnless s1_dout_valid)
|
s1_tag_hit(i) := s1_vb && code.decode(tag_rdata(i)).uncorrected === s1_tag
|
||||||
}
|
}
|
||||||
|
|
||||||
require(rowBits % wordBits == 0)
|
require(rowBits % wordBits == 0)
|
||||||
@ -131,7 +130,7 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
|
|||||||
}
|
}
|
||||||
def wordMatch(addr: UInt) = addr.extract(log2Ceil(rowBytes)-1, log2Ceil(wordBits/8)) === i
|
def wordMatch(addr: UInt) = addr.extract(log2Ceil(rowBytes)-1, log2Ceil(wordBits/8)) === i
|
||||||
val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles))
|
val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles))
|
||||||
val dout = data_array.read(s0_raddr, !wen && (s0_valid && wordMatch(s0_vaddr))) holdUnless s1_dout_valid
|
val dout = data_array.read(s0_raddr, !wen && (s0_valid && wordMatch(s0_vaddr)))
|
||||||
when (wordMatch(io.s1_paddr)) {
|
when (wordMatch(io.s1_paddr)) {
|
||||||
s1_dout := dout
|
s1_dout := dout
|
||||||
}
|
}
|
||||||
@ -145,7 +144,7 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
|
|||||||
data_array.write((refill_idx << log2Ceil(refillCycles)) | refill_cnt, e_d)
|
data_array.write((refill_idx << log2Ceil(refillCycles)) | refill_cnt, e_d)
|
||||||
}
|
}
|
||||||
val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles))
|
val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles))
|
||||||
s1_dout(i) := data_array.read(s0_raddr, !wen && s0_valid) holdUnless s1_dout_valid
|
s1_dout(i) := data_array.read(s0_raddr, !wen && s0_valid)
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -156,13 +155,13 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
|
|||||||
io.resp.bits := Mux1H(s1_tag_hit, s1_dout)
|
io.resp.bits := Mux1H(s1_tag_hit, s1_dout)
|
||||||
io.resp.valid := s1_hit
|
io.resp.valid := s1_hit
|
||||||
case 2 =>
|
case 2 =>
|
||||||
val s2_valid = RegEnable(out_valid, Bool(false), !stall)
|
val s2_valid = RegNext(out_valid, Bool(false))
|
||||||
val s2_hit = RegEnable(s1_hit, Bool(false), !stall)
|
val s2_hit = RegNext(s1_hit, Bool(false))
|
||||||
val s2_tag_hit = RegEnable(s1_tag_hit, !stall)
|
val s2_tag_hit = RegEnable(s1_tag_hit, s1_valid)
|
||||||
val s2_dout = RegEnable(s1_dout, !stall)
|
val s2_dout = RegEnable(s1_dout, s1_valid)
|
||||||
val s2_way_mux = Mux1H(s2_tag_hit, s2_dout)
|
val s2_way_mux = Mux1H(s2_tag_hit, s2_dout)
|
||||||
|
|
||||||
val s2_tag_disparity = RegEnable(s1_tag_disparity, !stall).asUInt.orR
|
val s2_tag_disparity = RegEnable(s1_tag_disparity, s1_valid).asUInt.orR
|
||||||
val s2_data_disparity = code.decode(s2_way_mux).error
|
val s2_data_disparity = code.decode(s2_way_mux).error
|
||||||
val s2_disparity = s2_tag_disparity || s2_data_disparity
|
val s2_disparity = s2_tag_disparity || s2_data_disparity
|
||||||
when (s2_valid && s2_disparity) { invalidate := true }
|
when (s2_valid && s2_disparity) { invalidate := true }
|
||||||
|
@ -173,7 +173,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
|
|||||||
val ibuf = Module(new IBuf)
|
val ibuf = Module(new IBuf)
|
||||||
val id_expanded_inst = ibuf.io.inst.map(_.bits.inst)
|
val id_expanded_inst = ibuf.io.inst.map(_.bits.inst)
|
||||||
val id_inst = id_expanded_inst.map(_.bits)
|
val id_inst = id_expanded_inst.map(_.bits)
|
||||||
ibuf.io.imem <> (if (usingCompressed) withReset(reset || take_pc) { Queue(io.imem.resp, 1, flow = true) } else io.imem.resp)
|
ibuf.io.imem <> io.imem.resp
|
||||||
ibuf.io.kill := take_pc
|
ibuf.io.kill := take_pc
|
||||||
|
|
||||||
require(decodeWidth == 1 /* TODO */ && retireWidth == decodeWidth)
|
require(decodeWidth == 1 /* TODO */ && retireWidth == decodeWidth)
|
||||||
|
Loading…
Reference in New Issue
Block a user