
Reduce latency of page table walks

A small cache in the PTW holds non-leaf PTEs, reducing walk latency and D$ misses.
Andrew Waterman 2015-03-21 20:12:25 -07:00
parent 31d17cbf86
commit 0332c1e7fe
2 changed files with 55 additions and 19 deletions
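For context on the change below: a page-table walk normally issues one D$ load per level of the radix tree. This commit adds a tiny fully-associative cache inside the PTW, indexed by the physical address of a PTE, that remembers the next-level page numbers of valid non-leaf (pointer) PTEs; on a hit, the walker advances a level in the s_req state without touching the memory port. The sketch below is a behavioral model of that idea in plain Scala, not the Chisel RTL in this commit: the names NonLeafPteCache, lookup, refill, and walk are hypothetical, and the FIFO victim choice plus the 4 KiB-page / 8-byte-PTE address arithmetic are simplifying assumptions, whereas the real design uses log2Up(levels * 2) entries, pseudo-LRU replacement, and a flush on io.dpath.invalidate.

object PteCacheSketch {
  // Hypothetical behavioral model of a small non-leaf PTE cache.
  class NonLeafPteCache(entries: Int = 4) {
    private case class Entry(pteAddr: Long, ppn: Long)
    private val buf = scala.collection.mutable.ArrayBuffer.empty[Entry]

    // Return the cached next-level PPN for the PTE stored at pteAddr, if any.
    def lookup(pteAddr: Long): Option[Long] =
      buf.find(_.pteAddr == pteAddr).map(_.ppn)

    // Refill from a memory response, but only for valid non-leaf (pointer) PTEs.
    def refill(pteAddr: Long, ppn: Long): Unit =
      if (lookup(pteAddr).isEmpty) {
        if (buf.size >= entries) buf.remove(0) // simplified FIFO victim; the RTL uses pseudo-LRU
        buf += Entry(pteAddr, ppn)
      }

    // Drop everything, e.g. when the datapath signals an invalidate.
    def invalidate(): Unit = buf.clear()
  }

  // One radix walk that consults the cache at each non-final level.
  // readPte stands in for the D$ access and returns (isLeaf, nextPpn).
  def walk(vpnIdx: Seq[Long], rootPpn: Long, cache: NonLeafPteCache,
           readPte: Long => (Boolean, Long)): Long = {
    var ppn = rootPpn
    for ((idx, level) <- vpnIdx.zipWithIndex) {
      val pteAddr = (ppn << 12) | (idx << 3) // assumes 4 KiB pages, 8-byte PTEs
      val last = level == vpnIdx.size - 1
      cache.lookup(pteAddr) match {
        case Some(next) if !last =>
          ppn = next                          // hit: skip the memory access for this level
        case _ =>
          val (leaf, next) = readPte(pteAddr) // miss (or leaf level): go to memory
          if (!leaf && !last) cache.refill(pteAddr, next)
          ppn = next
      }
    }
    ppn
  }
}

With a three-level table, hits on the two non-leaf levels leave only the leaf PTE access going through the D$, which is where the latency and D$-miss reduction described in the commit message comes from.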


@@ -13,9 +13,7 @@ class PTWReq extends CoreBundle {
 class PTWResp extends CoreBundle {
   val error = Bool()
-  val ppn = UInt(width = ppnBits)
-  val perm = Bits(width = permBits)
-  val dirty = Bool()
+  val pte = new PTE
 }
 
 class TLBPTWIO extends CoreBundle {
@@ -31,6 +29,17 @@ class DatapathPTWIO extends CoreBundle {
   val status = new MStatus().asInput
 }
 
+class PTE extends CoreBundle {
+  val ppn = Bits(width = ppnBits)
+  val reserved = Bits(width = 2)
+  val d = Bool()
+  val r = Bool()
+  val perm = Bits(width = 6)
+  val g = Bool()
+  val t = Bool()
+  val v = Bool()
+}
+
 class PTW(n: Int) extends CoreModule
 {
   val io = new Bundle {
@@ -49,7 +58,7 @@ class PTW(n: Int) extends CoreModule
   val r_req = Reg(new PTWReq)
   val r_req_dest = Reg(Bits())
-  val r_pte = Reg(Bits())
+  val r_pte = Reg(new PTE)
 
   val vpn_idx = Vec((0 until levels).map(i => (r_req.addr >> (levels-i-1)*bitsPerLevel)(bitsPerLevel-1,0)))(count)
@@ -57,16 +66,39 @@ class PTW(n: Int) extends CoreModule
   arb.io.in <> io.requestor.map(_.req)
   arb.io.out.ready := state === s_ready
 
-  val pte = io.mem.resp.bits.data
+  val pte = new PTE().fromBits(io.mem.resp.bits.data)
+  val pte_addr = Cat(r_pte.ppn, vpn_idx).toUInt << log2Up(xLen/8)
 
   when (arb.io.out.fire()) {
     r_req := arb.io.out.bits
     r_req_dest := arb.io.chosen
-    r_pte := Cat(io.dpath.ptbr(paddrBits-1,pgIdxBits), pte(pgIdxBits-1,0))
+    r_pte.ppn := io.dpath.ptbr(paddrBits-1,pgIdxBits)
   }
 
-  val perm_ok = (pte(8,3) & r_req.perm).orR
+  val (pte_cache_hit, pte_cache_data) = {
+    val size = log2Up(levels * 2)
+    val plru = new PseudoLRU(size)
+    val valid = Reg(init = Bits(0, size))
+    val tags = Mem(UInt(width = paddrBits), size)
+    val data = Mem(UInt(width = paddrBits - pgIdxBits), size)
+
+    val hits = Vec(tags.map(_ === pte_addr)).toBits & valid
+    val hit = hits.orR
+    when (io.mem.resp.valid && io.mem.resp.bits.data(1,0).andR && !hit) {
+      val r = Mux(valid.andR, plru.replace, PriorityEncoder(~valid))
+      valid(r) := true
+      tags(r) := pte_addr
+      data(r) := io.mem.resp.bits.data(paddrBits-1,pgIdxBits)
+    }
+
+    when (hit && state === s_req) { plru.access(OHToUInt(hits)) }
+    when (io.dpath.invalidate) { valid := 0 }
+
+    (hit, Mux1H(hits, data))
+  }
+
+  val perm_ok = (pte.perm & r_req.perm).orR
   val is_store = r_req.perm(1) || r_req.perm(4)
-  val set_dirty_bit = perm_ok && !pte(1) && (!pte(9) || (is_store && !pte(10)))
+  val set_dirty_bit = perm_ok && !pte.t && (!pte.r || (is_store && !pte.d))
   when (io.mem.resp.valid && state === s_wait && !set_dirty_bit) {
     r_pte := pte
   }
@@ -75,7 +107,7 @@ class PTW(n: Int) extends CoreModule
   io.mem.req.bits.phys := Bool(true)
   io.mem.req.bits.cmd := Mux(state === s_set_dirty, M_XA_OR, M_XRD)
   io.mem.req.bits.typ := MT_D
-  io.mem.req.bits.addr := Cat(r_pte(paddrBits-1,pgIdxBits), vpn_idx).toUInt << log2Up(xLen/8)
+  io.mem.req.bits.addr := pte_addr
   io.mem.req.bits.kill := Bool(false)
   io.mem.req.bits.data := UInt(1<<9) | Mux(is_store, UInt(1<<10), UInt(0))
@@ -89,9 +121,8 @@ class PTW(n: Int) extends CoreModule
     val me = r_req_dest === UInt(i)
     io.requestor(i).resp.valid := resp_val && me
     io.requestor(i).resp.bits.error := resp_err
-    io.requestor(i).resp.bits.perm := r_pte(8,3)
-    io.requestor(i).resp.bits.dirty := r_pte(10)
-    io.requestor(i).resp.bits.ppn := resp_ppn.toUInt
+    io.requestor(i).resp.bits.pte := r_pte
+    io.requestor(i).resp.bits.pte.ppn := resp_ppn
     io.requestor(i).invalidate := io.dpath.invalidate
     io.requestor(i).status := io.dpath.status
   }
@@ -105,7 +136,12 @@ class PTW(n: Int) extends CoreModule
       count := UInt(0)
     }
     is (s_req) {
-      when (io.mem.req.ready) {
+      when (pte_cache_hit && count < levels-1) {
+        io.mem.req.valid := false
+        state := s_req
+        count := count + 1
+        r_pte.ppn := pte_cache_data
+      }.elsewhen (io.mem.req.ready) {
         state := s_wait
       }
     }
@@ -115,10 +151,10 @@ class PTW(n: Int) extends CoreModule
       }
       when (io.mem.resp.valid) {
        state := s_error
-        when (pte(0)) {
+        when (pte.v) {
          when (set_dirty_bit) {
            state := s_set_dirty
-          }.elsewhen (!pte(1)) {
+          }.elsewhen (!pte.t) {
            state := s_done
          }.elsewhen (count < levels-1) {
            state := s_req


@@ -109,7 +109,7 @@ class TLB extends TLBModule {
   val r_req = Reg(new TLBReq)
 
   val tag_cam = Module(new RocketCAM)
-  val tag_ram = Mem(io.ptw.resp.bits.ppn.clone, entries)
+  val tag_ram = Mem(io.ptw.resp.bits.pte.ppn.clone, entries)
 
   val lookup_tag = Cat(io.req.bits.asid, io.req.bits.vpn).toUInt
   tag_cam.io.tag := lookup_tag
@@ -128,8 +128,8 @@ class TLB extends TLBModule {
   val sx_array = Reg(Bits()) // supervisor execute permission
   val dirty_array = Reg(Bits()) // PTE dirty bit
   when (io.ptw.resp.valid) {
-    val perm = io.ptw.resp.bits.perm & ~io.ptw.resp.bits.error.toSInt
-    tag_ram(r_refill_waddr) := io.ptw.resp.bits.ppn
+    val perm = io.ptw.resp.bits.pte.perm & ~io.ptw.resp.bits.error.toSInt
+    tag_ram(r_refill_waddr) := io.ptw.resp.bits.pte.ppn
     valid_array := valid_array.bitSet(r_refill_waddr, !io.ptw.resp.bits.error)
     ur_array := ur_array.bitSet(r_refill_waddr, perm(0) || perm(2))
     uw_array := uw_array.bitSet(r_refill_waddr, perm(1))
@@ -137,7 +137,7 @@ class TLB extends TLBModule {
     sr_array := sr_array.bitSet(r_refill_waddr, perm(3) || perm(5))
     sw_array := sw_array.bitSet(r_refill_waddr, perm(4))
     sx_array := sx_array.bitSet(r_refill_waddr, perm(5))
-    dirty_array := dirty_array.bitSet(r_refill_waddr, io.ptw.resp.bits.dirty)
+    dirty_array := dirty_array.bitSet(r_refill_waddr, io.ptw.resp.bits.pte.d)
   }
 
   // high if there are any unused (invalid) entries in the TLB