Reduce latency of page table walks
A small cache in the page table walker (PTW) holds non-leaf page table entries (PTEs), reducing walk latency and data-cache (D$) misses.
This commit is contained in:
		@@ -13,9 +13,7 @@ class PTWReq extends CoreBundle {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
class PTWResp extends CoreBundle {
 | 
					class PTWResp extends CoreBundle {
 | 
				
			||||||
  val error = Bool()
 | 
					  val error = Bool()
 | 
				
			||||||
  val ppn = UInt(width = ppnBits)
 | 
					  val pte = new PTE
 | 
				
			||||||
  val perm = Bits(width = permBits)
 | 
					 | 
				
			||||||
  val dirty = Bool()
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class TLBPTWIO extends CoreBundle {
 | 
					class TLBPTWIO extends CoreBundle {
 | 
				
			||||||
@@ -31,6 +29,17 @@ class DatapathPTWIO extends CoreBundle {
 | 
				
			|||||||
  val status = new MStatus().asInput
 | 
					  val status = new MStatus().asInput
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class PTE extends CoreBundle {
 | 
				
			||||||
 | 
					  val ppn = Bits(width = ppnBits)
 | 
				
			||||||
 | 
					  val reserved = Bits(width = 2)
 | 
				
			||||||
 | 
					  val d = Bool()
 | 
				
			||||||
 | 
					  val r = Bool()
 | 
				
			||||||
 | 
					  val perm = Bits(width = 6)
 | 
				
			||||||
 | 
					  val g = Bool()
 | 
				
			||||||
 | 
					  val t = Bool()
 | 
				
			||||||
 | 
					  val v = Bool()
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class PTW(n: Int) extends CoreModule
 | 
					class PTW(n: Int) extends CoreModule
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  val io = new Bundle {
 | 
					  val io = new Bundle {
 | 
				
			||||||
@@ -49,7 +58,7 @@ class PTW(n: Int) extends CoreModule
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  val r_req = Reg(new PTWReq)
 | 
					  val r_req = Reg(new PTWReq)
 | 
				
			||||||
  val r_req_dest = Reg(Bits())
 | 
					  val r_req_dest = Reg(Bits())
 | 
				
			||||||
  val r_pte = Reg(Bits())
 | 
					  val r_pte = Reg(new PTE)
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  val vpn_idx = Vec((0 until levels).map(i => (r_req.addr >> (levels-i-1)*bitsPerLevel)(bitsPerLevel-1,0)))(count)
 | 
					  val vpn_idx = Vec((0 until levels).map(i => (r_req.addr >> (levels-i-1)*bitsPerLevel)(bitsPerLevel-1,0)))(count)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -57,16 +66,39 @@ class PTW(n: Int) extends CoreModule
 | 
				
			|||||||
  arb.io.in <> io.requestor.map(_.req)
 | 
					  arb.io.in <> io.requestor.map(_.req)
 | 
				
			||||||
  arb.io.out.ready := state === s_ready
 | 
					  arb.io.out.ready := state === s_ready
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  val pte = io.mem.resp.bits.data
 | 
					  val pte = new PTE().fromBits(io.mem.resp.bits.data)
 | 
				
			||||||
 | 
					  val pte_addr = Cat(r_pte.ppn, vpn_idx).toUInt << log2Up(xLen/8)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  when (arb.io.out.fire()) {
 | 
					  when (arb.io.out.fire()) {
 | 
				
			||||||
    r_req := arb.io.out.bits
 | 
					    r_req := arb.io.out.bits
 | 
				
			||||||
    r_req_dest := arb.io.chosen
 | 
					    r_req_dest := arb.io.chosen
 | 
				
			||||||
    r_pte := Cat(io.dpath.ptbr(paddrBits-1,pgIdxBits), pte(pgIdxBits-1,0))
 | 
					    r_pte.ppn := io.dpath.ptbr(paddrBits-1,pgIdxBits)
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  val perm_ok = (pte(8,3) & r_req.perm).orR
 | 
					  val (pte_cache_hit, pte_cache_data) = {
 | 
				
			||||||
 | 
					    val size = log2Up(levels * 2)
 | 
				
			||||||
 | 
					    val plru = new PseudoLRU(size)
 | 
				
			||||||
 | 
					    val valid = Reg(init = Bits(0, size))
 | 
				
			||||||
 | 
					    val tags = Mem(UInt(width = paddrBits), size)
 | 
				
			||||||
 | 
					    val data = Mem(UInt(width = paddrBits - pgIdxBits), size)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    val hits = Vec(tags.map(_ === pte_addr)).toBits & valid
 | 
				
			||||||
 | 
					    val hit = hits.orR
 | 
				
			||||||
 | 
					    when (io.mem.resp.valid && io.mem.resp.bits.data(1,0).andR && !hit) {
 | 
				
			||||||
 | 
					      val r = Mux(valid.andR, plru.replace, PriorityEncoder(~valid))
 | 
				
			||||||
 | 
					      valid(r) := true
 | 
				
			||||||
 | 
					      tags(r) := pte_addr
 | 
				
			||||||
 | 
					      data(r) := io.mem.resp.bits.data(paddrBits-1,pgIdxBits)
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    when (hit && state === s_req) { plru.access(OHToUInt(hits)) }
 | 
				
			||||||
 | 
					    when (io.dpath.invalidate) { valid := 0 }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    (hit, Mux1H(hits, data))
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  val perm_ok = (pte.perm & r_req.perm).orR
 | 
				
			||||||
  val is_store = r_req.perm(1) || r_req.perm(4)
 | 
					  val is_store = r_req.perm(1) || r_req.perm(4)
 | 
				
			||||||
  val set_dirty_bit = perm_ok && !pte(1) && (!pte(9) || (is_store && !pte(10)))
 | 
					  val set_dirty_bit = perm_ok && !pte.t && (!pte.r || (is_store && !pte.d))
 | 
				
			||||||
  when (io.mem.resp.valid && state === s_wait && !set_dirty_bit) {
 | 
					  when (io.mem.resp.valid && state === s_wait && !set_dirty_bit) {
 | 
				
			||||||
    r_pte := pte
 | 
					    r_pte := pte
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
@@ -75,7 +107,7 @@ class PTW(n: Int) extends CoreModule
 | 
				
			|||||||
  io.mem.req.bits.phys := Bool(true)
 | 
					  io.mem.req.bits.phys := Bool(true)
 | 
				
			||||||
  io.mem.req.bits.cmd  := Mux(state === s_set_dirty, M_XA_OR, M_XRD)
 | 
					  io.mem.req.bits.cmd  := Mux(state === s_set_dirty, M_XA_OR, M_XRD)
 | 
				
			||||||
  io.mem.req.bits.typ  := MT_D
 | 
					  io.mem.req.bits.typ  := MT_D
 | 
				
			||||||
  io.mem.req.bits.addr := Cat(r_pte(paddrBits-1,pgIdxBits), vpn_idx).toUInt << log2Up(xLen/8)
 | 
					  io.mem.req.bits.addr := pte_addr
 | 
				
			||||||
  io.mem.req.bits.kill := Bool(false)
 | 
					  io.mem.req.bits.kill := Bool(false)
 | 
				
			||||||
  io.mem.req.bits.data := UInt(1<<9) | Mux(is_store, UInt(1<<10), UInt(0))
 | 
					  io.mem.req.bits.data := UInt(1<<9) | Mux(is_store, UInt(1<<10), UInt(0))
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
@@ -89,9 +121,8 @@ class PTW(n: Int) extends CoreModule
 | 
				
			|||||||
    val me = r_req_dest === UInt(i)
 | 
					    val me = r_req_dest === UInt(i)
 | 
				
			||||||
    io.requestor(i).resp.valid := resp_val && me
 | 
					    io.requestor(i).resp.valid := resp_val && me
 | 
				
			||||||
    io.requestor(i).resp.bits.error := resp_err
 | 
					    io.requestor(i).resp.bits.error := resp_err
 | 
				
			||||||
    io.requestor(i).resp.bits.perm := r_pte(8,3)
 | 
					    io.requestor(i).resp.bits.pte := r_pte
 | 
				
			||||||
    io.requestor(i).resp.bits.dirty := r_pte(10)
 | 
					    io.requestor(i).resp.bits.pte.ppn := resp_ppn
 | 
				
			||||||
    io.requestor(i).resp.bits.ppn := resp_ppn.toUInt
 | 
					 | 
				
			||||||
    io.requestor(i).invalidate := io.dpath.invalidate
 | 
					    io.requestor(i).invalidate := io.dpath.invalidate
 | 
				
			||||||
    io.requestor(i).status := io.dpath.status
 | 
					    io.requestor(i).status := io.dpath.status
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
@@ -105,7 +136,12 @@ class PTW(n: Int) extends CoreModule
 | 
				
			|||||||
      count := UInt(0)
 | 
					      count := UInt(0)
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    is (s_req) {
 | 
					    is (s_req) {
 | 
				
			||||||
      when (io.mem.req.ready) {
 | 
					      when (pte_cache_hit && count < levels-1) {
 | 
				
			||||||
 | 
					        io.mem.req.valid := false
 | 
				
			||||||
 | 
					        state := s_req
 | 
				
			||||||
 | 
					        count := count + 1
 | 
				
			||||||
 | 
					        r_pte.ppn := pte_cache_data
 | 
				
			||||||
 | 
					      }.elsewhen (io.mem.req.ready) {
 | 
				
			||||||
        state := s_wait
 | 
					        state := s_wait
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -115,10 +151,10 @@ class PTW(n: Int) extends CoreModule
 | 
				
			|||||||
      }
 | 
					      }
 | 
				
			||||||
      when (io.mem.resp.valid) {
 | 
					      when (io.mem.resp.valid) {
 | 
				
			||||||
        state := s_error
 | 
					        state := s_error
 | 
				
			||||||
        when (pte(0)) {
 | 
					        when (pte.v) {
 | 
				
			||||||
          when (set_dirty_bit) {
 | 
					          when (set_dirty_bit) {
 | 
				
			||||||
            state := s_set_dirty
 | 
					            state := s_set_dirty
 | 
				
			||||||
          }.elsewhen (!pte(1)) {
 | 
					          }.elsewhen (!pte.t) {
 | 
				
			||||||
            state := s_done
 | 
					            state := s_done
 | 
				
			||||||
          }.elsewhen (count < levels-1) {
 | 
					          }.elsewhen (count < levels-1) {
 | 
				
			||||||
            state := s_req
 | 
					            state := s_req
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -109,7 +109,7 @@ class TLB extends TLBModule {
 | 
				
			|||||||
  val r_req = Reg(new TLBReq)
 | 
					  val r_req = Reg(new TLBReq)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  val tag_cam = Module(new RocketCAM)
 | 
					  val tag_cam = Module(new RocketCAM)
 | 
				
			||||||
  val tag_ram = Mem(io.ptw.resp.bits.ppn.clone, entries)
 | 
					  val tag_ram = Mem(io.ptw.resp.bits.pte.ppn.clone, entries)
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  val lookup_tag = Cat(io.req.bits.asid, io.req.bits.vpn).toUInt
 | 
					  val lookup_tag = Cat(io.req.bits.asid, io.req.bits.vpn).toUInt
 | 
				
			||||||
  tag_cam.io.tag := lookup_tag
 | 
					  tag_cam.io.tag := lookup_tag
 | 
				
			||||||
@@ -128,8 +128,8 @@ class TLB extends TLBModule {
 | 
				
			|||||||
  val sx_array = Reg(Bits()) // supervisor execute permission
 | 
					  val sx_array = Reg(Bits()) // supervisor execute permission
 | 
				
			||||||
  val dirty_array = Reg(Bits()) // PTE dirty bit
 | 
					  val dirty_array = Reg(Bits()) // PTE dirty bit
 | 
				
			||||||
  when (io.ptw.resp.valid) {
 | 
					  when (io.ptw.resp.valid) {
 | 
				
			||||||
    val perm = io.ptw.resp.bits.perm & ~io.ptw.resp.bits.error.toSInt
 | 
					    val perm = io.ptw.resp.bits.pte.perm & ~io.ptw.resp.bits.error.toSInt
 | 
				
			||||||
    tag_ram(r_refill_waddr) := io.ptw.resp.bits.ppn
 | 
					    tag_ram(r_refill_waddr) := io.ptw.resp.bits.pte.ppn
 | 
				
			||||||
    valid_array := valid_array.bitSet(r_refill_waddr, !io.ptw.resp.bits.error)
 | 
					    valid_array := valid_array.bitSet(r_refill_waddr, !io.ptw.resp.bits.error)
 | 
				
			||||||
    ur_array := ur_array.bitSet(r_refill_waddr, perm(0) || perm(2))
 | 
					    ur_array := ur_array.bitSet(r_refill_waddr, perm(0) || perm(2))
 | 
				
			||||||
    uw_array := uw_array.bitSet(r_refill_waddr, perm(1))
 | 
					    uw_array := uw_array.bitSet(r_refill_waddr, perm(1))
 | 
				
			||||||
@@ -137,7 +137,7 @@ class TLB extends TLBModule {
 | 
				
			|||||||
    sr_array := sr_array.bitSet(r_refill_waddr, perm(3) || perm(5))
 | 
					    sr_array := sr_array.bitSet(r_refill_waddr, perm(3) || perm(5))
 | 
				
			||||||
    sw_array := sw_array.bitSet(r_refill_waddr, perm(4))
 | 
					    sw_array := sw_array.bitSet(r_refill_waddr, perm(4))
 | 
				
			||||||
    sx_array := sx_array.bitSet(r_refill_waddr, perm(5))
 | 
					    sx_array := sx_array.bitSet(r_refill_waddr, perm(5))
 | 
				
			||||||
    dirty_array := dirty_array.bitSet(r_refill_waddr, io.ptw.resp.bits.dirty)
 | 
					    dirty_array := dirty_array.bitSet(r_refill_waddr, io.ptw.resp.bits.pte.d)
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
  // high if there are any unused (invalid) entries in the TLB
 | 
					  // high if there are any unused (invalid) entries in the TLB
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user