1
0

don't wastefully read 2x the bits from D$ RAMs

This commit is contained in:
Andrew Waterman 2012-11-26 20:34:30 -08:00
parent 352bb464b5
commit 608f65e716
2 changed files with 71 additions and 28 deletions

View File

@ -8,7 +8,8 @@ import Util._
case class DCacheConfig(sets: Int, ways: Int, co: CoherencePolicy, case class DCacheConfig(sets: Int, ways: Int, co: CoherencePolicy,
nmshr: Int, nrpq: Int, nsdq: Int, ntlb: Int, nmshr: Int, nrpq: Int, nsdq: Int, ntlb: Int,
reqtagbits: Int = -1, databits: Int = -1) reqtagbits: Int = -1, databits: Int = -1,
narrowRead: Boolean = true)
{ {
require(isPow2(sets)) require(isPow2(sets))
require(isPow2(ways)) // TODO: relax this require(isPow2(ways)) // TODO: relax this
@ -27,6 +28,7 @@ case class DCacheConfig(sets: Int, ways: Int, co: CoherencePolicy,
def ramoffbits = log2Up(MEM_DATA_BITS/8) def ramoffbits = log2Up(MEM_DATA_BITS/8)
def databytes = databits/8 def databytes = databits/8
def wordoffbits = log2Up(databytes) def wordoffbits = log2Up(databytes)
def isNarrowRead = narrowRead && databits*ways % MEM_DATA_BITS == 0
} }
abstract class ReplacementPolicy abstract class ReplacementPolicy
@ -595,6 +597,32 @@ class DataArray(implicit conf: DCacheConfig) extends Component {
val waddr = io.write.bits.addr >> conf.ramoffbits val waddr = io.write.bits.addr >> conf.ramoffbits
val raddr = io.read.bits.addr >> conf.ramoffbits val raddr = io.read.bits.addr >> conf.ramoffbits
if (conf.isNarrowRead) {
val waysPerMem = MEM_DATA_BITS/conf.databits
for (w <- 0 until conf.ways by waysPerMem) {
val resp = Vec(MEM_DATA_BITS/conf.databits){Reg{Bits(width = MEM_DATA_BITS)}}
val r_raddr = RegEn(io.read.bits.addr, io.read.valid)
for (p <- 0 until resp.size) {
val array = Mem(conf.sets*REFILL_CYCLES, seqRead = true){ Bits(width=MEM_DATA_BITS) }
val way_en = io.write.bits.way_en(w+waysPerMem-1,w)
when (way_en.orR && io.write.valid && io.write.bits.wmask(p)) {
val data = Fill(waysPerMem, io.write.bits.data(conf.databits*(p+1)-1,conf.databits*p))
val mask = FillInterleaved(conf.databits, way_en)
array.write(waddr, data, mask)
}
when (way_en.orR && io.read.valid) {
resp(p) := array(raddr)
}
}
for (dw <- 0 until waysPerMem) {
val r = AVec(resp.map(_(conf.databits*(dw+1)-1,conf.databits*dw)))
val resp_mux =
if (r.size == 1) r
else AVec(r(r_raddr(conf.ramoffbits-1,conf.wordoffbits)), r.tail:_*)
io.resp(w+dw) := resp_mux.toBits
}
}
} else {
for (w <- 0 until conf.ways) { for (w <- 0 until conf.ways) {
val rdata = Reg() { Bits() } val rdata = Reg() { Bits() }
val array = Mem(conf.sets*REFILL_CYCLES, seqRead = true){ Bits(width=MEM_DATA_BITS) } val array = Mem(conf.sets*REFILL_CYCLES, seqRead = true){ Bits(width=MEM_DATA_BITS) }
@ -606,6 +634,7 @@ class DataArray(implicit conf: DCacheConfig) extends Component {
} }
io.resp(w) := rdata io.resp(w) := rdata
} }
}
io.read.ready := Bool(true) io.read.ready := Bool(true)
io.write.ready := Bool(true) io.write.ready := Bool(true)
@ -714,24 +743,16 @@ class HellaCache(implicit conf: DCacheConfig) extends Component {
val s1_req = Reg{io.cpu.req.bits.clone} val s1_req = Reg{io.cpu.req.bits.clone}
val s1_valid_masked = s1_valid && !io.cpu.req.bits.kill val s1_valid_masked = s1_valid && !io.cpu.req.bits.kill
val s1_replay = Reg(resetVal = Bool(false)) val s1_replay = Reg(resetVal = Bool(false))
val s1_store_bypass = Bool()
val s2_valid = Reg(s1_valid_masked, resetVal = Bool(false)) val s2_valid = Reg(s1_valid_masked, resetVal = Bool(false))
val s2_req = Reg{io.cpu.req.bits.clone} val s2_req = Reg{io.cpu.req.bits.clone}
val s2_replay = Reg(s1_replay, resetVal = Bool(false)) val s2_replay = Reg(s1_replay, resetVal = Bool(false))
val s2_valid_masked = Bool() val s2_valid_masked = Bool()
val s2_nack_hit = Bool()
val s2_store_bypass = Reg{Bool()}
val s2_store_bypass_data = Reg{Bits(width = conf.databits)}
val s2_store_bypass_mask = Reg{Bits(width = conf.databytes)}
val s3_valid = Reg(resetVal = Bool(false)) val s3_valid = Reg(resetVal = Bool(false))
val s3_req = Reg{io.cpu.req.bits.clone} val s3_req = Reg{io.cpu.req.bits.clone}
val s3_way = Reg{Bits()} val s3_way = Reg{Bits()}
val s4_valid = Reg(s3_valid, resetVal = Bool(false))
val s4_req = RegEn(s3_req, s3_valid)
val s1_read = isRead(s1_req.cmd) val s1_read = isRead(s1_req.cmd)
val s1_write = isWrite(s1_req.cmd) val s1_write = isWrite(s1_req.cmd)
val s1_readwrite = s1_read || s1_write || isPrefetch(s1_req.cmd) val s1_readwrite = s1_read || s1_write || isPrefetch(s1_req.cmd)
@ -766,7 +787,6 @@ class HellaCache(implicit conf: DCacheConfig) extends Component {
s2_req.typ := s1_req.typ s2_req.typ := s1_req.typ
s2_req.cmd := s1_req.cmd s2_req.cmd := s1_req.cmd
s2_req.tag := s1_req.tag s2_req.tag := s1_req.tag
s2_store_bypass := s1_store_bypass
when (s1_write) { when (s1_write) {
s2_req.data := Mux(s1_replay, mshr.io.replay.bits.data, io.cpu.req.bits.data) s2_req.data := Mux(s1_replay, mshr.io.replay.bits.data, io.cpu.req.bits.data)
} }
@ -813,11 +833,22 @@ class HellaCache(implicit conf: DCacheConfig) extends Component {
val s1_tag_eq_way = wayMap((w: Int) => meta.io.resp(w).tag === (s1_addr >> conf.untagbits)){Bits()}.toBits val s1_tag_eq_way = wayMap((w: Int) => meta.io.resp(w).tag === (s1_addr >> conf.untagbits)){Bits()}.toBits
val s1_tag_match_way = wayMap((w: Int) => s1_tag_eq_way(w) && conf.co.isValid(meta.io.resp(w).state)){Bits()}.toBits val s1_tag_match_way = wayMap((w: Int) => s1_tag_eq_way(w) && conf.co.isValid(meta.io.resp(w).state)){Bits()}.toBits
val s1_clk_en = Reg(metaReadArb.io.out.valid) val s1_clk_en = Reg(metaReadArb.io.out.valid)
val s1_writeback = s1_clk_en && !s1_valid && !s1_replay
val s2_tag_match_way = RegEn(s1_tag_match_way, s1_clk_en) val s2_tag_match_way = RegEn(s1_tag_match_way, s1_clk_en)
val s2_tag_match = s2_tag_match_way.orR val s2_tag_match = s2_tag_match_way.orR
val s2_hit_state = Mux1H(s2_tag_match_way, wayMap((w: Int) => RegEn(meta.io.resp(w).state, s1_clk_en && s1_tag_eq_way(w))){Bits()}) val s2_hit_state = Mux1H(s2_tag_match_way, wayMap((w: Int) => RegEn(meta.io.resp(w).state, s1_clk_en && s1_tag_eq_way(w))){Bits()})
val s2_hit = conf.co.isHit(s2_req.cmd, s2_hit_state) && s2_hit_state === conf.co.newStateOnHit(s2_req.cmd, s2_hit_state) val s2_hit = conf.co.isHit(s2_req.cmd, s2_hit_state) && s2_hit_state === conf.co.newStateOnHit(s2_req.cmd, s2_hit_state)
val s2_data = wayMap((w: Int) => RegEn(data.io.resp(w), s1_clk_en && s1_tag_eq_way(w))){Bits()}
val s2_data = Vec(conf.ways){Bits(width = MEM_DATA_BITS)}
for (w <- 0 until conf.ways) {
val regs = Vec(MEM_DATA_BITS/conf.databits){Reg{Bits(width = conf.databits)}}
val en1 = s1_clk_en && s1_tag_eq_way(w)
for (i <- 0 until regs.size) {
val en = en1 && (Bool(i == 0 || !conf.isNarrowRead) || s1_writeback)
when (en) { regs(i) := data.io.resp(w) >> conf.databits*i }
}
s2_data(w) := Cat(regs.last, regs.init.reverse:_*)
}
val data_resp_mux = Mux1H(s2_tag_match_way, s2_data) val data_resp_mux = Mux1H(s2_tag_match_way, s2_data)
// store/amo hits // store/amo hits
@ -843,7 +874,7 @@ class HellaCache(implicit conf: DCacheConfig) extends Component {
val s2_repl_tag = Mux1H(s2_replaced_way_en, wayMap((w: Int) => RegEn(meta.io.resp(w).tag, s1_clk_en && s1_replaced_way_en(w))){Bits()}) val s2_repl_tag = Mux1H(s2_replaced_way_en, wayMap((w: Int) => RegEn(meta.io.resp(w).tag, s1_clk_en && s1_replaced_way_en(w))){Bits()})
// miss handling // miss handling
mshr.io.req.valid := s2_valid_masked && !s2_hit && (isPrefetch(s2_req.cmd) || isRead(s2_req.cmd) || isWrite(s2_req.cmd)) && !s2_nack_hit mshr.io.req.valid := s2_valid_masked && !s2_hit && (isPrefetch(s2_req.cmd) || isRead(s2_req.cmd) || isWrite(s2_req.cmd))
mshr.io.req.bits := s2_req mshr.io.req.bits := s2_req
mshr.io.req.bits.tag_match := s2_tag_match mshr.io.req.bits.tag_match := s2_tag_match
mshr.io.req.bits.old_meta.state := s2_repl_state mshr.io.req.bits.old_meta.state := s2_repl_state
@ -893,20 +924,29 @@ class HellaCache(implicit conf: DCacheConfig) extends Component {
wb.io.probe_rep_data <> io.mem.probe_rep_data wb.io.probe_rep_data <> io.mem.probe_rep_data
// store->load bypassing // store->load bypassing
val s4_valid = Reg(s3_valid, resetVal = Bool(false))
val s4_req = RegEn(s3_req, s3_valid && metaReadArb.io.out.valid)
val bypasses = List( val bypasses = List(
(s2_valid_masked || s2_replay, s2_req, amoalu.io.out), (s2_valid_masked || s2_replay, s2_req, amoalu.io.out),
(s3_valid, s3_req, s3_req.data), (s3_valid, s3_req, s3_req.data),
(s4_valid, s4_req, s4_req.data) (s4_valid, s4_req, s4_req.data)
).map(r => (r._1 && (s1_addr >> conf.wordoffbits === r._2.addr >> conf.wordoffbits) && isWrite(r._2.cmd), r._3, StoreGen(r._2).mask)) ).map(r => (r._1 && (s1_addr >> conf.wordoffbits === r._2.addr >> conf.wordoffbits) && isWrite(r._2.cmd), r._3, StoreGen(r._2).mask))
s1_store_bypass := bypasses.map(_._1).reduce(_||_) val s2_store_bypass_data = Reg{Bits(width = conf.databits)}
when (s1_clk_en && s1_store_bypass) { val s2_store_bypass_mask = Reg{Bits(width = conf.databytes)}
when (s1_clk_en) {
when (bypasses.map(_._1).reduce(_||_)) {
s2_store_bypass_data := PriorityMux(bypasses.map(x => (x._1, x._2))) s2_store_bypass_data := PriorityMux(bypasses.map(x => (x._1, x._2)))
s2_store_bypass_mask := PriorityMux(bypasses.map(x => (x._1, x._3))) s2_store_bypass_mask := PriorityMux(bypasses.map(x => (x._1, x._3)))
}.otherwise {
s2_store_bypass_mask := Bits(0)
}
} }
// load data subword mux/sign extension // load data subword mux/sign extension
val s2_data_word_prebypass = data_resp_mux >> Cat(s2_req.addr(log2Up(MEM_DATA_BITS/8)-1,3), Bits(0,log2Up(conf.databits))) val s2_data_word_prebypass =
val s2_data_word = Cat(null, (0 until conf.databytes).map(i => Mux(s2_store_bypass && s2_store_bypass_mask(i), s2_store_bypass_data, s2_data_word_prebypass)(8*(i+1)-1,8*i)).reverse:_*) if (conf.isNarrowRead) data_resp_mux(conf.databits-1,0)
else data_resp_mux >> Cat(s2_req.addr(log2Up(MEM_DATA_BITS/8)-1,3), Bits(0,log2Up(conf.databits)))
val s2_data_word = Cat(null, (0 until conf.databytes).map(i => Mux(s2_store_bypass_mask(i), s2_store_bypass_data, s2_data_word_prebypass)(8*(i+1)-1,8*i)).reverse:_*)
val loadgen = new LoadGen(s2_req.typ, s2_req.addr, s2_data_word) val loadgen = new LoadGen(s2_req.typ, s2_req.addr, s2_data_word)
amoalu.io := s2_req amoalu.io := s2_req
@ -916,10 +956,12 @@ class HellaCache(implicit conf: DCacheConfig) extends Component {
// nack it like it's hot // nack it like it's hot
val s1_nack = dtlb.io.req.valid && dtlb.io.resp.miss || val s1_nack = dtlb.io.req.valid && dtlb.io.resp.miss ||
s1_req.addr(indexmsb,indexlsb) === prober.io.meta_write.bits.idx && !prober.io.req.ready s1_req.addr(indexmsb,indexlsb) === prober.io.meta_write.bits.idx && !prober.io.req.ready
s2_nack_hit := Reg(s1_nack) || s2_hit && mshr.io.secondary_miss val s2_nack_hit = RegEn(s1_nack, s1_valid || s1_replay)
when (s2_nack_hit) { mshr.io.req.valid := Bool(false) }
val s2_nack_victim = s2_hit && mshr.io.secondary_miss
val s2_nack_miss = !s2_hit && !mshr.io.req.ready val s2_nack_miss = !s2_hit && !mshr.io.req.ready
val s2_nack_fence = s2_req.cmd === M_FENCE && !mshr.io.fence_rdy val s2_nack_fence = s2_req.cmd === M_FENCE && !mshr.io.fence_rdy
val s2_nack = s2_nack_hit || s2_nack_miss || s2_nack_fence val s2_nack = s2_nack_hit || s2_nack_victim || s2_nack_miss || s2_nack_fence
s2_valid_masked := s2_valid && !s2_nack s2_valid_masked := s2_valid && !s2_nack
// after a nack, block until nack condition resolves to save energy // after a nack, block until nack condition resolves to save energy

View File

@ -18,6 +18,7 @@ object AVec
require(elts.tail.forall(elts.head.getClass == _.getClass)) require(elts.tail.forall(elts.head.getClass == _.getClass))
Vec(elts) { elts.head.clone } Vec(elts) { elts.head.clone }
} }
def apply[T <: Data](elts: Vec[T]): Vec[T] = apply(elts.toSeq)
def apply[T <: Data](elt0: T, elts: T*): Vec[T] = apply(elt0 :: elts.toList) def apply[T <: Data](elt0: T, elts: T*): Vec[T] = apply(elt0 :: elts.toList)
} }