Merge pull request #849 from freechipsproject/l2-tlb
L1 memory system improvements
This commit is contained in:
commit
a0cbc376b4
@ -146,6 +146,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
|
||||
tlb.io.req.bits.sfence.bits.rs1 := s1_req.typ(0)
|
||||
tlb.io.req.bits.sfence.bits.rs2 := s1_req.typ(1)
|
||||
tlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data.data
|
||||
tlb.io.req.bits.sfence.bits.addr := s1_req.addr
|
||||
tlb.io.req.bits.passthrough := s1_req.phys
|
||||
tlb.io.req.bits.vaddr := s1_req.addr
|
||||
tlb.io.req.bits.instruction := false
|
||||
@ -473,7 +474,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
|
||||
probe_bits.address := Cat(s2_victim_tag, s2_req.addr(idxMSB, idxLSB)) << idxLSB
|
||||
}
|
||||
when (s2_probe) {
|
||||
s1_nack := true
|
||||
val probeNack = Wire(init = true.B)
|
||||
when (s2_meta_error) {
|
||||
release_state := s_probe_retry
|
||||
}.elsewhen (s2_prb_ack_data) {
|
||||
@ -484,9 +485,10 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
|
||||
release_state := Mux(releaseDone, s_probe_write_meta, s_probe_rep_clean)
|
||||
}.otherwise {
|
||||
tl_out.c.valid := true
|
||||
s1_nack := !releaseDone
|
||||
probeNack := !releaseDone
|
||||
release_state := Mux(releaseDone, s_ready, s_probe_rep_miss)
|
||||
}
|
||||
when (probeNack) { s1_nack := true }
|
||||
}
|
||||
when (release_state === s_probe_retry) {
|
||||
metaArb.io.in(6).valid := true
|
||||
|
@ -168,12 +168,13 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
|
||||
fq.io.enq.bits.pc := s2_pc
|
||||
io.cpu.npc := ~(~Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc) | (coreInstBytes-1)) // discard LSB(s)
|
||||
|
||||
fq.io.enq.bits.data := icache.io.resp.bits
|
||||
fq.io.enq.bits.data := icache.io.resp.bits.data
|
||||
fq.io.enq.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes))
|
||||
fq.io.enq.bits.xcpt := s2_tlb_resp
|
||||
fq.io.enq.bits.replay := icache.io.s2_kill && !icache.io.resp.valid && !s2_xcpt
|
||||
fq.io.enq.bits.btb.valid := s2_btb_resp_valid
|
||||
fq.io.enq.bits.btb.bits := s2_btb_resp_bits
|
||||
fq.io.enq.bits.xcpt := s2_tlb_resp
|
||||
when (icache.io.resp.valid && icache.io.resp.bits.ae) { fq.io.enq.bits.xcpt.ae.inst := true }
|
||||
|
||||
io.cpu.resp <> fq.io.deq
|
||||
|
||||
|
@ -58,6 +58,13 @@ class ICache(val icacheParams: ICacheParams, val hartid: Int)(implicit p: Parame
|
||||
}
|
||||
}
|
||||
|
||||
/** Payload carried by the instruction cache's `resp` port.
  *
  * Bundles the fetched bits together with an error flag so that a consumer
  * can tell a fetch that hit an errored refill apart from a clean one.
  *
  * @param outer the enclosing ICache; only `icacheParams.fetchBytes` is read
  *              here, to size `data`.
  */
class ICacheResp(outer: ICache) extends Bundle {
|
||||
// Fetched bits: fetchBytes bytes wide (fetchBytes * 8 bits).
val data = UInt(width = outer.icacheParams.fetchBytes*8)
|
||||
// Error flag for this response. The ICacheModule code in this diff drives it
// from the per-way "tl_error" bit stored alongside the tag, and the frontend
// uses it to set xcpt.ae.inst.
val ae = Bool()
|
||||
|
||||
// Builds a fresh ICacheResp from the same `outer`.
// NOTE(review): presumably required so Chisel can clone a Bundle that takes
// a constructor argument — confirm against the Chisel version in use.
override def cloneType = new ICacheResp(outer).asInstanceOf[this.type]
|
||||
}
|
||||
|
||||
class ICacheBundle(outer: ICache) extends CoreBundle()(outer.p) {
|
||||
val hartid = UInt(INPUT, hartIdLen)
|
||||
val req = Decoupled(new ICacheReq).flip
|
||||
@ -66,7 +73,7 @@ class ICacheBundle(outer: ICache) extends CoreBundle()(outer.p) {
|
||||
val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req
|
||||
val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission
|
||||
|
||||
val resp = Valid(UInt(width = outer.icacheParams.fetchBytes*8))
|
||||
val resp = Valid(new ICacheResp(outer))
|
||||
val invalidate = Bool(INPUT)
|
||||
val tl_out = outer.masterNode.bundleOut
|
||||
val tl_in = outer.slaveNode.map(_.bundleIn)
|
||||
@ -145,15 +152,18 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
|
||||
v
|
||||
}
|
||||
|
||||
val tag_array = SeqMem(nSets, Vec(nWays, Bits(width = tECC.width(tagBits))))
|
||||
val tag_array = SeqMem(nSets, Vec(nWays, UInt(width = tECC.width(1 + tagBits))))
|
||||
val tag_rdata = tag_array.read(s0_vaddr(untagBits-1,blockOffBits), !refill_done && s0_valid)
|
||||
val accruedRefillError = Reg(Bool())
|
||||
val refillError = tl_out.d.bits.error || (refill_cnt > 0 && accruedRefillError)
|
||||
when (refill_done) {
|
||||
val tag = tECC.encode(refill_tag)
|
||||
tag_array.write(refill_idx, Vec.fill(nWays)(tag), Vec.tabulate(nWays)(repl_way === _))
|
||||
val enc_tag = tECC.encode(Cat(refillError, refill_tag))
|
||||
tag_array.write(refill_idx, Vec.fill(nWays)(enc_tag), Seq.tabulate(nWays)(repl_way === _))
|
||||
}
|
||||
|
||||
val vb_array = Reg(init=Bits(0, nSets*nWays))
|
||||
when (tl_out.d.fire()) {
|
||||
accruedRefillError := refillError
|
||||
// clear bit when refill starts so hit-under-miss doesn't fetch bad data
|
||||
vb_array := vb_array.bitSet(Cat(repl_way, refill_idx), refill_done && !invalidated)
|
||||
}
|
||||
@ -164,6 +174,7 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
|
||||
}
|
||||
|
||||
val s1_tag_disparity = Wire(Vec(nWays, Bool()))
|
||||
val s1_tl_error = Wire(Vec(nWays, Bool()))
|
||||
val wordBits = outer.icacheParams.fetchBytes*8
|
||||
val s1_dout = Wire(Vec(nWays, UInt(width = dECC.width(wordBits))))
|
||||
|
||||
@ -179,8 +190,12 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
|
||||
lineInScratchpad(scratchpadLine(s1s3_slaveAddr)) && scratchpadWay(s1s3_slaveAddr) === i,
|
||||
addrInScratchpad(io.s1_paddr) && scratchpadWay(io.s1_paddr) === i)
|
||||
val s1_vb = vb_array(Cat(UInt(i), s1_idx)) && !s1_slaveValid
|
||||
s1_tag_disparity(i) := s1_vb && tECC.decode(tag_rdata(i)).error
|
||||
s1_tag_hit(i) := scratchpadHit || (s1_vb && tECC.decode(tag_rdata(i)).uncorrected === s1_tag)
|
||||
val enc_tag = tECC.decode(tag_rdata(i))
|
||||
val (tl_error, tag) = Split(enc_tag.uncorrected, tagBits)
|
||||
val tagMatch = s1_vb && tag === s1_tag
|
||||
s1_tag_disparity(i) := s1_vb && enc_tag.error
|
||||
s1_tl_error(i) := tagMatch && tl_error.toBool
|
||||
s1_tag_hit(i) := tagMatch || scratchpadHit
|
||||
}
|
||||
assert(!(s1_valid || s1_slaveValid) || PopCount(s1_tag_hit zip s1_tag_disparity map { case (h, d) => h && !d }) <= 1)
|
||||
|
||||
@ -212,7 +227,8 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
|
||||
require(tECC.isInstanceOf[uncore.util.IdentityCode])
|
||||
require(dECC.isInstanceOf[uncore.util.IdentityCode])
|
||||
require(outer.icacheParams.itimAddr.isEmpty)
|
||||
io.resp.bits := Mux1H(s1_tag_hit, s1_dout)
|
||||
io.resp.bits.data := Mux1H(s1_tag_hit, s1_dout)
|
||||
io.resp.bits.ae := s1_tl_error.asUInt.orR
|
||||
io.resp.valid := s1_valid && s1_hit
|
||||
|
||||
case 2 =>
|
||||
@ -221,11 +237,13 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
|
||||
val s2_way_mux = Mux1H(s2_tag_hit, s2_dout)
|
||||
|
||||
val s2_tag_disparity = RegEnable(s1_tag_disparity, s1_valid || s1_slaveValid).asUInt.orR
|
||||
val s2_tl_error = RegEnable(s1_tl_error.asUInt.orR, s1_valid || s1_slaveValid)
|
||||
val s2_data_decoded = dECC.decode(s2_way_mux)
|
||||
val s2_disparity = s2_tag_disparity || s2_data_decoded.error
|
||||
when (s2_valid && s2_disparity) { invalidate := true }
|
||||
|
||||
io.resp.bits := s2_data_decoded.uncorrected
|
||||
io.resp.bits.data := s2_data_decoded.uncorrected
|
||||
io.resp.bits.ae := s2_tl_error
|
||||
io.resp.valid := s2_valid && s2_hit && !s2_disparity
|
||||
|
||||
tl_in.map { tl =>
|
||||
|
@ -708,6 +708,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule
|
||||
dtlb.io.req.bits.sfence.valid := s1_sfence
|
||||
dtlb.io.req.bits.sfence.bits.rs1 := s1_req.typ(0)
|
||||
dtlb.io.req.bits.sfence.bits.rs2 := s1_req.typ(1)
|
||||
dtlb.io.req.bits.sfence.bits.addr := s1_req.addr
|
||||
dtlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data.data
|
||||
dtlb.io.req.bits.passthrough := s1_req.phys
|
||||
dtlb.io.req.bits.vaddr := s1_req.addr
|
||||
|
@ -11,6 +11,7 @@ import coreplex.CacheBlockBytes
|
||||
import uncore.constants._
|
||||
import uncore.tilelink2._
|
||||
import util._
|
||||
import uncore.util.ParityCode
|
||||
|
||||
import scala.collection.mutable.ListBuffer
|
||||
|
||||
@ -37,7 +38,7 @@ class TLBPTWIO(implicit p: Parameters) extends CoreBundle()(p)
|
||||
class DatapathPTWIO(implicit p: Parameters) extends CoreBundle()(p)
|
||||
with HasRocketCoreParameters {
|
||||
val ptbr = new PTBR().asInput
|
||||
val invalidate = Bool(INPUT)
|
||||
val sfence = Valid(new SFenceReq).flip
|
||||
val status = new MStatus().asInput
|
||||
val pmp = Vec(nPMPs, new PMP).asInput
|
||||
}
|
||||
@ -125,17 +126,69 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()(
|
||||
data(r) := pte.ppn
|
||||
}
|
||||
when (hit && state === s_req) { plru.access(OHToUInt(hits)) }
|
||||
when (io.dpath.invalidate) { valid := 0 }
|
||||
when (io.dpath.sfence.valid && !io.dpath.sfence.bits.rs1) { valid := 0 }
|
||||
|
||||
(hit && count < pgLevels-1, Mux1H(hits, data))
|
||||
}
|
||||
|
||||
io.mem.req.valid := state === s_req
|
||||
val l2_refill = RegNext(false.B)
|
||||
val (l2_hit, l2_pte) = if (coreParams.nL2TLBEntries == 0) (false.B, Wire(new PTE)) else {
|
||||
// Per-entry payload stored in the L2 TLB RAM.
// On refill an Entry is assigned from r_pte, concatenated with the tag and
// passed through code.encode before being written; on lookup the decoded
// payload is converted back with asTypeOf(new Entry) and assigned to a PTE.
// The PTE's g bit is not part of this bundle (it is kept in the separate `g`
// register) and v is forced true on the lookup path.
class Entry extends Bundle {
|
||||
// Physical page number, ppnBits wide.
val ppn = UInt(width = ppnBits)
|
||||
// The flags below are filled by `entry := r_pte`.
// NOTE(review): presumably the PTE dirty / accessed / user / execute /
// write / read bits, judging by the matching field names — confirm against PTE.
val d = Bool()
|
||||
val a = Bool()
|
||||
val u = Bool()
|
||||
val x = Bool()
|
||||
val w = Bool()
|
||||
val r = Bool()
|
||||
}
|
||||
|
||||
val code = new ParityCode
|
||||
require(isPow2(coreParams.nL2TLBEntries))
|
||||
val idxBits = log2Ceil(coreParams.nL2TLBEntries)
|
||||
val tagBits = vpnBits - idxBits
|
||||
val ram = SeqMem(coreParams.nL2TLBEntries, UInt(width = code.width(new Entry().getWidth + tagBits)))
|
||||
val g = Reg(UInt(width = coreParams.nL2TLBEntries))
|
||||
val valid = RegInit(UInt(0, coreParams.nL2TLBEntries))
|
||||
val (r_tag, r_idx) = Split(r_req.addr, idxBits)
|
||||
when (l2_refill) {
|
||||
val entry = Wire(new Entry)
|
||||
entry := r_pte
|
||||
ram.write(r_idx, code.encode(Cat(entry.asUInt, r_tag)))
|
||||
|
||||
val mask = UIntToOH(r_idx)
|
||||
valid := valid | mask
|
||||
g := Mux(r_pte.g, g | mask, g & ~mask)
|
||||
}
|
||||
when (io.dpath.sfence.valid) {
|
||||
valid :=
|
||||
Mux(io.dpath.sfence.bits.rs1, valid & ~UIntToOH(io.dpath.sfence.bits.addr(idxBits+pgIdxBits-1, pgIdxBits)),
|
||||
Mux(io.dpath.sfence.bits.rs2, valid & g, 0.U))
|
||||
}
|
||||
|
||||
val s0_valid = !l2_refill && arb.io.out.fire()
|
||||
val s1_valid = RegNext(s0_valid)
|
||||
val s2_valid = RegNext(s1_valid && valid(r_idx))
|
||||
val s1_rdata = ram.read(arb.io.out.bits.addr(idxBits-1, 0), s0_valid)
|
||||
val s2_rdata = code.decode(RegEnable(s1_rdata, s1_valid))
|
||||
when (s2_valid && s2_rdata.error) { valid := 0.U }
|
||||
|
||||
val (s2_entry, s2_tag) = Split(s2_rdata.uncorrected, tagBits)
|
||||
val s2_hit = s2_valid && !s2_rdata.error && r_tag === s2_tag
|
||||
val s2_pte = Wire(new PTE)
|
||||
s2_pte := s2_entry.asTypeOf(new Entry)
|
||||
s2_pte.g := g(r_idx)
|
||||
s2_pte.v := true
|
||||
|
||||
(s2_hit, s2_pte)
|
||||
}
|
||||
|
||||
io.mem.req.valid := state === s_req && !l2_hit
|
||||
io.mem.req.bits.phys := Bool(true)
|
||||
io.mem.req.bits.cmd := M_XRD
|
||||
io.mem.req.bits.typ := log2Ceil(xLen/8)
|
||||
io.mem.req.bits.addr := pte_addr
|
||||
io.mem.s1_kill := s1_kill
|
||||
io.mem.s1_kill := s1_kill || l2_hit
|
||||
io.mem.invalidate_lr := Bool(false)
|
||||
|
||||
val pmaPgLevelHomogeneous = (0 until pgLevels) map { i =>
|
||||
@ -159,7 +212,7 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()(
|
||||
// control state machine
|
||||
switch (state) {
|
||||
is (s_ready) {
|
||||
when (arb.io.out.valid) {
|
||||
when (arb.io.out.fire()) {
|
||||
state := s_req
|
||||
}
|
||||
count := UInt(0)
|
||||
@ -186,6 +239,7 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()(
|
||||
state := s_req
|
||||
count := count + 1
|
||||
}.otherwise {
|
||||
l2_refill := pte.v && !invalid_paddr && count === pgLevels-1
|
||||
resp_ae := pte.v && invalid_paddr
|
||||
state := s_ready
|
||||
resp_valid(r_req_dest) := true
|
||||
@ -198,6 +252,12 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()(
|
||||
}
|
||||
}
|
||||
}
|
||||
when (l2_hit) {
|
||||
state := s_ready
|
||||
resp_valid(r_req_dest) := true
|
||||
resp_ae := false
|
||||
r_pte := l2_pte
|
||||
}
|
||||
}
|
||||
|
||||
/** Mix-ins for constructing tiles that might have a PTW */
|
||||
|
@ -25,6 +25,7 @@ case class RocketCoreParams(
|
||||
nPMPs: Int = 8,
|
||||
nPerfCounters: Int = 0,
|
||||
nCustomMRWCSRs: Int = 0,
|
||||
nL2TLBEntries: Int = 0,
|
||||
mtvecInit: Option[BigInt] = Some(BigInt(0)),
|
||||
mtvecWritable: Boolean = true,
|
||||
fastLoadWord: Boolean = true,
|
||||
@ -588,8 +589,9 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
|
||||
io.imem.sfence.valid := wb_reg_valid && wb_reg_sfence
|
||||
io.imem.sfence.bits.rs1 := wb_ctrl.mem_type(0)
|
||||
io.imem.sfence.bits.rs2 := wb_ctrl.mem_type(1)
|
||||
io.imem.sfence.bits.addr := wb_reg_wdata
|
||||
io.imem.sfence.bits.asid := wb_reg_rs2
|
||||
io.ptw.invalidate := io.imem.sfence.valid && !io.imem.sfence.bits.rs1
|
||||
io.ptw.sfence := io.imem.sfence
|
||||
|
||||
ibuf.io.inst(0).ready := !ctrl_stalld || csr.io.interrupt
|
||||
|
||||
|
@ -20,6 +20,7 @@ case object ASIdBits extends Field[Int]
|
||||
/** Description of an sfence (TLB flush) request.
  *
  * In the code visible in this diff the core fills one of these in and
  * forwards it to the frontend and the PTW, and the data caches fill one in
  * for their own TLBs.
  */
class SFenceReq(implicit p: Parameters) extends CoreBundle()(p) {
|
||||
// Selects a targeted flush. In the TLB, when set only the entries in `hits`
// are invalidated; the PTW clears its whole cache only when this is clear.
// NOTE(review): presumably "rs1 operand specified" — confirm against decode.
val rs1 = Bool()
|
||||
// Consulted only when rs1 is clear: when set, the TLB keeps entries whose g
// bit is set and drops the rest; when clear, every entry is invalidated.
val rs2 = Bool()
|
||||
// Virtual address of the flush, vaddrBits wide. The TLB asserts that
// addr >> pgIdxBits equals the VPN being looked up, and the L2 TLB uses the
// bits just above the page offset to pick the entry to invalidate.
val addr = UInt(width = vaddrBits)
|
||||
// Address-space ID; at least one bit wide even when asIdBits is 0.
val asid = UInt(width = asIdBits max 1) // TODO zero-width
|
||||
}
|
||||
|
||||
@ -252,6 +253,7 @@ class TLB(lgMaxSize: Int, nEntries: Int)(implicit edge: TLEdgeOut, p: Parameters
|
||||
}
|
||||
|
||||
when (sfence) {
|
||||
assert((io.req.bits.sfence.bits.addr >> pgIdxBits) === vpn(vpnBits-1,0))
|
||||
valid := Mux(io.req.bits.sfence.bits.rs1, valid & ~hits(totalEntries-1, 0),
|
||||
Mux(io.req.bits.sfence.bits.rs2, valid & entries.map(_.g).asUInt, 0))
|
||||
}
|
||||
|
@ -24,6 +24,7 @@ trait CoreParams {
|
||||
val retireWidth: Int
|
||||
val instBits: Int
|
||||
val nLocalInterrupts: Int
|
||||
val nL2TLBEntries: Int
|
||||
}
|
||||
|
||||
trait HasCoreParameters extends HasTileParameters {
|
||||
|
Loading…
Reference in New Issue
Block a user