1
0

Merge pull request #849 from freechipsproject/l2-tlb

L1 memory system improvements
This commit is contained in:
Andrew Waterman 2017-07-06 13:03:06 -07:00 committed by GitHub
commit a0cbc376b4
8 changed files with 105 additions and 18 deletions

View File

@ -146,6 +146,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
tlb.io.req.bits.sfence.bits.rs1 := s1_req.typ(0)
tlb.io.req.bits.sfence.bits.rs2 := s1_req.typ(1)
tlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data.data
tlb.io.req.bits.sfence.bits.addr := s1_req.addr
tlb.io.req.bits.passthrough := s1_req.phys
tlb.io.req.bits.vaddr := s1_req.addr
tlb.io.req.bits.instruction := false
@ -473,7 +474,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
probe_bits.address := Cat(s2_victim_tag, s2_req.addr(idxMSB, idxLSB)) << idxLSB
}
when (s2_probe) {
s1_nack := true
val probeNack = Wire(init = true.B)
when (s2_meta_error) {
release_state := s_probe_retry
}.elsewhen (s2_prb_ack_data) {
@ -484,9 +485,10 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
release_state := Mux(releaseDone, s_probe_write_meta, s_probe_rep_clean)
}.otherwise {
tl_out.c.valid := true
s1_nack := !releaseDone
probeNack := !releaseDone
release_state := Mux(releaseDone, s_ready, s_probe_rep_miss)
}
when (probeNack) { s1_nack := true }
}
when (release_state === s_probe_retry) {
metaArb.io.in(6).valid := true

View File

@ -168,12 +168,13 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
fq.io.enq.bits.pc := s2_pc
io.cpu.npc := ~(~Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc) | (coreInstBytes-1)) // discard LSB(s)
fq.io.enq.bits.data := icache.io.resp.bits
fq.io.enq.bits.data := icache.io.resp.bits.data
fq.io.enq.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes))
fq.io.enq.bits.xcpt := s2_tlb_resp
fq.io.enq.bits.replay := icache.io.s2_kill && !icache.io.resp.valid && !s2_xcpt
fq.io.enq.bits.btb.valid := s2_btb_resp_valid
fq.io.enq.bits.btb.bits := s2_btb_resp_bits
fq.io.enq.bits.xcpt := s2_tlb_resp
when (icache.io.resp.valid && icache.io.resp.bits.ae) { fq.io.enq.bits.xcpt.ae.inst := true }
io.cpu.resp <> fq.io.deq

View File

@ -58,6 +58,13 @@ class ICache(val icacheParams: ICacheParams, val hartid: Int)(implicit p: Parame
}
}
/** Payload of an I-cache response: the fetched data word plus an error flag.
  *
  * @param outer the ICache whose `icacheParams.fetchBytes` sets the width of `data`
  */
class ICacheResp(outer: ICache) extends Bundle {
// Fetched data, fetchBytes*8 bits wide.
val data = UInt(width = outer.icacheParams.fetchBytes*8)
// Error flag. ICacheModule drives it from the OR of the per-way s1_tl_error bits
// (directly, or registered as s2_tl_error), and the frontend sets
// fq.io.enq.bits.xcpt.ae.inst when a valid response carries it.
val ae = Bool()
// Builds a fresh ICacheResp from the same `outer` so the bundle can be cloned.
override def cloneType = new ICacheResp(outer).asInstanceOf[this.type]
}
class ICacheBundle(outer: ICache) extends CoreBundle()(outer.p) {
val hartid = UInt(INPUT, hartIdLen)
val req = Decoupled(new ICacheReq).flip
@ -66,7 +73,7 @@ class ICacheBundle(outer: ICache) extends CoreBundle()(outer.p) {
val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req
val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission
val resp = Valid(UInt(width = outer.icacheParams.fetchBytes*8))
val resp = Valid(new ICacheResp(outer))
val invalidate = Bool(INPUT)
val tl_out = outer.masterNode.bundleOut
val tl_in = outer.slaveNode.map(_.bundleIn)
@ -145,15 +152,18 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
v
}
val tag_array = SeqMem(nSets, Vec(nWays, Bits(width = tECC.width(tagBits))))
val tag_array = SeqMem(nSets, Vec(nWays, UInt(width = tECC.width(1 + tagBits))))
val tag_rdata = tag_array.read(s0_vaddr(untagBits-1,blockOffBits), !refill_done && s0_valid)
val accruedRefillError = Reg(Bool())
val refillError = tl_out.d.bits.error || (refill_cnt > 0 && accruedRefillError)
when (refill_done) {
val tag = tECC.encode(refill_tag)
tag_array.write(refill_idx, Vec.fill(nWays)(tag), Vec.tabulate(nWays)(repl_way === _))
val enc_tag = tECC.encode(Cat(refillError, refill_tag))
tag_array.write(refill_idx, Vec.fill(nWays)(enc_tag), Seq.tabulate(nWays)(repl_way === _))
}
val vb_array = Reg(init=Bits(0, nSets*nWays))
when (tl_out.d.fire()) {
accruedRefillError := refillError
// clear bit when refill starts so hit-under-miss doesn't fetch bad data
vb_array := vb_array.bitSet(Cat(repl_way, refill_idx), refill_done && !invalidated)
}
@ -164,6 +174,7 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
}
val s1_tag_disparity = Wire(Vec(nWays, Bool()))
val s1_tl_error = Wire(Vec(nWays, Bool()))
val wordBits = outer.icacheParams.fetchBytes*8
val s1_dout = Wire(Vec(nWays, UInt(width = dECC.width(wordBits))))
@ -179,8 +190,12 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
lineInScratchpad(scratchpadLine(s1s3_slaveAddr)) && scratchpadWay(s1s3_slaveAddr) === i,
addrInScratchpad(io.s1_paddr) && scratchpadWay(io.s1_paddr) === i)
val s1_vb = vb_array(Cat(UInt(i), s1_idx)) && !s1_slaveValid
s1_tag_disparity(i) := s1_vb && tECC.decode(tag_rdata(i)).error
s1_tag_hit(i) := scratchpadHit || (s1_vb && tECC.decode(tag_rdata(i)).uncorrected === s1_tag)
val enc_tag = tECC.decode(tag_rdata(i))
val (tl_error, tag) = Split(enc_tag.uncorrected, tagBits)
val tagMatch = s1_vb && tag === s1_tag
s1_tag_disparity(i) := s1_vb && enc_tag.error
s1_tl_error(i) := tagMatch && tl_error.toBool
s1_tag_hit(i) := tagMatch || scratchpadHit
}
assert(!(s1_valid || s1_slaveValid) || PopCount(s1_tag_hit zip s1_tag_disparity map { case (h, d) => h && !d }) <= 1)
@ -212,7 +227,8 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
require(tECC.isInstanceOf[uncore.util.IdentityCode])
require(dECC.isInstanceOf[uncore.util.IdentityCode])
require(outer.icacheParams.itimAddr.isEmpty)
io.resp.bits := Mux1H(s1_tag_hit, s1_dout)
io.resp.bits.data := Mux1H(s1_tag_hit, s1_dout)
io.resp.bits.ae := s1_tl_error.asUInt.orR
io.resp.valid := s1_valid && s1_hit
case 2 =>
@ -221,11 +237,13 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
val s2_way_mux = Mux1H(s2_tag_hit, s2_dout)
val s2_tag_disparity = RegEnable(s1_tag_disparity, s1_valid || s1_slaveValid).asUInt.orR
val s2_tl_error = RegEnable(s1_tl_error.asUInt.orR, s1_valid || s1_slaveValid)
val s2_data_decoded = dECC.decode(s2_way_mux)
val s2_disparity = s2_tag_disparity || s2_data_decoded.error
when (s2_valid && s2_disparity) { invalidate := true }
io.resp.bits := s2_data_decoded.uncorrected
io.resp.bits.data := s2_data_decoded.uncorrected
io.resp.bits.ae := s2_tl_error
io.resp.valid := s2_valid && s2_hit && !s2_disparity
tl_in.map { tl =>

View File

@ -708,6 +708,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule
dtlb.io.req.bits.sfence.valid := s1_sfence
dtlb.io.req.bits.sfence.bits.rs1 := s1_req.typ(0)
dtlb.io.req.bits.sfence.bits.rs2 := s1_req.typ(1)
dtlb.io.req.bits.sfence.bits.addr := s1_req.addr
dtlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data.data
dtlb.io.req.bits.passthrough := s1_req.phys
dtlb.io.req.bits.vaddr := s1_req.addr

View File

@ -11,6 +11,7 @@ import coreplex.CacheBlockBytes
import uncore.constants._
import uncore.tilelink2._
import util._
import uncore.util.ParityCode
import scala.collection.mutable.ListBuffer
@ -37,7 +38,7 @@ class TLBPTWIO(implicit p: Parameters) extends CoreBundle()(p)
/** Signals that the PTW receives from the core datapath; every field is an input.
  *
  * NOTE(review): this text looks like a rendered diff in which the `invalidate`
  * line is being replaced by the `sfence` line (the PTW body and the Rocket core
  * show the same old/new pairing) -- confirm against the real file before
  * relying on both fields existing together.
  */
class DatapathPTWIO(implicit p: Parameters) extends CoreBundle()(p)
with HasRocketCoreParameters {
// PTBR supplied by the datapath.
val ptbr = new PTBR().asInput
// Single-bit flush request; the PTW clears its PTE-cache valid bits when it is set.
val invalidate = Bool(INPUT)
// SFENCE request (valid + SFenceReq payload); the PTW reads rs1, rs2 and addr from it.
val sfence = Valid(new SFenceReq).flip
// MStatus supplied by the datapath.
val status = new MStatus().asInput
// One PMP entry per configured PMP (nPMPs of them).
val pmp = Vec(nPMPs, new PMP).asInput
}
@ -125,17 +126,69 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()(
data(r) := pte.ppn
}
when (hit && state === s_req) { plru.access(OHToUInt(hits)) }
when (io.dpath.invalidate) { valid := 0 }
when (io.dpath.sfence.valid && !io.dpath.sfence.bits.rs1) { valid := 0 }
(hit && count < pgLevels-1, Mux1H(hits, data))
}
io.mem.req.valid := state === s_req
// Optional L2 TLB inside the PTW.
//
// l2_refill is a register that defaults to false every cycle; the control state
// machine sets it when a walk finishes at the last level with a valid PTE and a
// valid physical address, so it is high for the one cycle in which the entry is
// written below.
val l2_refill = RegNext(false.B)
// (l2_hit, l2_pte): whether the current request hit in the L2 TLB and, if so, the
// PTE to return. With nL2TLBEntries == 0 no hardware is built: the hit is the
// constant false and the PTE is a wire that nothing drives.
val (l2_hit, l2_pte) = if (coreParams.nL2TLBEntries == 0) (false.B, Wire(new PTE)) else {
// Fields stored per entry. They are filled from r_pte on refill and copied back
// into a PTE on a hit; the g and v bits are handled separately (see below).
class Entry extends Bundle {
val ppn = UInt(width = ppnBits)
val d = Bool()
val a = Bool()
val u = Bool()
val x = Bool()
val w = Bool()
val r = Bool()
}
// Every stored word is protected with this code; a decode error flushes the TLB.
val code = new ParityCode
// The index is a plain bit slice of the address, so the entry count must be 2^k.
require(isPow2(coreParams.nL2TLBEntries))
val idxBits = log2Ceil(coreParams.nL2TLBEntries)
val tagBits = vpnBits - idxBits
// One encoded word per entry, holding Cat(entry fields, tag).
val ram = SeqMem(coreParams.nL2TLBEntries, UInt(width = code.width(new Entry().getWidth + tagBits)))
// Per-entry "global" bits, held in a register rather than in the RAM (no reset value).
val g = Reg(UInt(width = coreParams.nL2TLBEntries))
// Per-entry valid bits, reset to all-invalid.
val valid = RegInit(UInt(0, coreParams.nL2TLBEntries))
// Split the registered request address into tag and index.
// presumably Split returns (upper bits, lower idxBits bits) -- consistent with the
// read address addr(idxBits-1, 0) used below; verify against util.Split.
val (r_tag, r_idx) = Split(r_req.addr, idxBits)
// Refill: write the entry and its tag, mark it valid, and record its global bit.
when (l2_refill) {
val entry = Wire(new Entry)
entry := r_pte
ram.write(r_idx, code.encode(Cat(entry.asUInt, r_tag)))
val mask = UIntToOH(r_idx)
valid := valid | mask
g := Mux(r_pte.g, g | mask, g & ~mask)
}
// SFENCE handling:
//   rs1 set            -> invalidate only the entry indexed by the fence address
//   rs1 clear, rs2 set -> keep only entries whose global bit is set
//   both clear         -> invalidate everything
// NOTE(review): this assignment comes after the refill's `valid :=`, so it should
// take precedence if both fire in the same cycle -- confirm this is intended.
when (io.dpath.sfence.valid) {
valid :=
Mux(io.dpath.sfence.bits.rs1, valid & ~UIntToOH(io.dpath.sfence.bits.addr(idxBits+pgIdxBits-1, pgIdxBits)),
Mux(io.dpath.sfence.bits.rs2, valid & g, 0.U))
}
// Lookup pipeline.
// s0: a request is accepted by the arbiter while no refill is in progress;
//     the RAM read is issued with the low idxBits of the incoming address.
val s0_valid = !l2_refill && arb.io.out.fire()
// s1: RAM data becomes available; the valid bit of the indexed entry is sampled.
val s1_valid = RegNext(s0_valid)
// s2: registered RAM data is decoded and compared against the request tag.
val s2_valid = RegNext(s1_valid && valid(r_idx))
val s1_rdata = ram.read(arb.io.out.bits.addr(idxBits-1, 0), s0_valid)
val s2_rdata = code.decode(RegEnable(s1_rdata, s1_valid))
// A decode error on a valid entry invalidates the whole L2 TLB.
when (s2_valid && s2_rdata.error) { valid := 0.U }
// Undo the Cat(entry, tag) packing used on refill.
val (s2_entry, s2_tag) = Split(s2_rdata.uncorrected, tagBits)
// Hit only if the entry was valid, decoded without error, and the tags match.
val s2_hit = s2_valid && !s2_rdata.error && r_tag === s2_tag
// Rebuild a PTE from the stored fields; g comes from the register array and v is
// forced true.
val s2_pte = Wire(new PTE)
s2_pte := s2_entry.asTypeOf(new Entry)
s2_pte.g := g(r_idx)
s2_pte.v := true
(s2_hit, s2_pte)
}
io.mem.req.valid := state === s_req && !l2_hit
io.mem.req.bits.phys := Bool(true)
io.mem.req.bits.cmd := M_XRD
io.mem.req.bits.typ := log2Ceil(xLen/8)
io.mem.req.bits.addr := pte_addr
io.mem.s1_kill := s1_kill
io.mem.s1_kill := s1_kill || l2_hit
io.mem.invalidate_lr := Bool(false)
val pmaPgLevelHomogeneous = (0 until pgLevels) map { i =>
@ -159,7 +212,7 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()(
// control state machine
switch (state) {
is (s_ready) {
when (arb.io.out.valid) {
when (arb.io.out.fire()) {
state := s_req
}
count := UInt(0)
@ -186,6 +239,7 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()(
state := s_req
count := count + 1
}.otherwise {
l2_refill := pte.v && !invalid_paddr && count === pgLevels-1
resp_ae := pte.v && invalid_paddr
state := s_ready
resp_valid(r_req_dest) := true
@ -198,6 +252,12 @@ class PTW(n: Int)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()(
}
}
}
// L2 TLB hit: complete the request straight away.
when (l2_hit) {
// Return the state machine to idle.
state := s_ready
// Signal a response to the requester recorded in r_req_dest.
resp_valid(r_req_dest) := true
// No access exception is reported on this path.
resp_ae := false
// The returned PTE is the one reconstructed from the L2 TLB entry.
r_pte := l2_pte
}
}
/** Mix-ins for constructing tiles that might have a PTW */

View File

@ -25,6 +25,7 @@ case class RocketCoreParams(
nPMPs: Int = 8,
nPerfCounters: Int = 0,
nCustomMRWCSRs: Int = 0,
nL2TLBEntries: Int = 0,
mtvecInit: Option[BigInt] = Some(BigInt(0)),
mtvecWritable: Boolean = true,
fastLoadWord: Boolean = true,
@ -588,8 +589,9 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
io.imem.sfence.valid := wb_reg_valid && wb_reg_sfence
io.imem.sfence.bits.rs1 := wb_ctrl.mem_type(0)
io.imem.sfence.bits.rs2 := wb_ctrl.mem_type(1)
io.imem.sfence.bits.addr := wb_reg_wdata
io.imem.sfence.bits.asid := wb_reg_rs2
io.ptw.invalidate := io.imem.sfence.valid && !io.imem.sfence.bits.rs1
io.ptw.sfence := io.imem.sfence
ibuf.io.inst(0).ready := !ctrl_stalld || csr.io.interrupt

View File

@ -20,6 +20,7 @@ case object ASIdBits extends Field[Int]
/** Payload of an SFENCE request sent to the TLBs and the PTW.
  *
  * How the consumers visible in this file use the flags:
  *  - `rs1` set: only the entries selected by `addr` are invalidated.
  *  - `rs1` clear, `rs2` set: entries marked global (g) are kept, the rest invalidated.
  *  - both clear: all entries are invalidated.
  */
class SFenceReq(implicit p: Parameters) extends CoreBundle()(p) {
// Selects address-specific invalidation (see class comment).
val rs1 = Bool()
// Selects "keep global entries" invalidation when rs1 is clear (see class comment).
val rs2 = Bool()
// Fence address, vaddrBits wide; the TLB asserts that its page number matches
// the vpn of the accompanying request.
val addr = UInt(width = vaddrBits)
// Address-space ID; the width is clamped to at least 1 bit.
val asid = UInt(width = asIdBits max 1) // TODO zero-width
}
@ -252,6 +253,7 @@ class TLB(lgMaxSize: Int, nEntries: Int)(implicit edge: TLEdgeOut, p: Parameters
}
when (sfence) {
assert((io.req.bits.sfence.bits.addr >> pgIdxBits) === vpn(vpnBits-1,0))
valid := Mux(io.req.bits.sfence.bits.rs1, valid & ~hits(totalEntries-1, 0),
Mux(io.req.bits.sfence.bits.rs2, valid & entries.map(_.g).asUInt, 0))
}

View File

@ -24,6 +24,7 @@ trait CoreParams {
val retireWidth: Int
val instBits: Int
val nLocalInterrupts: Int
val nL2TLBEntries: Int
}
trait HasCoreParameters extends HasTileParameters {