From 1da8ef2ddf8aeb78a0faa12252741beae5721328 Mon Sep 17 00:00:00 2001
From: Henry Cook
Date: Mon, 7 Apr 2014 18:22:46 -0700
Subject: [PATCH] Added serdes to decouple cache row size from tilelink data
 size

---
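Notes: this change lets a cache row be narrower than one TileLink data beat.
A Grant carrying tl.dataBits of data is fed through the new
FlowThroughSerializer, which presents it as refillcycles = tl.dataBits/rowbits
row-sized beats, and the WritebackUnit performs the inverse, coalescing
refillcycles row reads into a single full-width Release. A quick worked
example of the arithmetic as a minimal plain-Scala sketch; tlDataBits and
databits are assumed values for illustration, only the rowwords = 2 default
comes from this patch:

    // Plain-Scala sketch of DCacheConfig's refillcycles calculation.
    object RefillCyclesExample extends App {
      val tlDataBits = 128                  // bits per TileLink data beat (assumed)
      val databits   = 64                   // bits per data word (assumed)
      val rowwords   = 2                    // words per cache row (this patch's default)
      val rowbits    = rowwords * databits  // 128 bits: one row fills a beat exactly
      require(rowbits <= tlDataBits)        // mirrors the new require in DCacheConfig
      val refillcycles = tlDataBits / rowbits
      println(s"refillcycles = $refillcycles") // 1 here; rowwords = 1 would give 2
    }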
 rocket/src/main/scala/icache.scala   |  40 ++++++---
 rocket/src/main/scala/nbdcache.scala | 120 ++++++++++++++++-----------
 rocket/src/main/scala/util.scala     |  47 +++++++++++
 3 files changed, 146 insertions(+), 61 deletions(-)

diff --git a/rocket/src/main/scala/icache.scala b/rocket/src/main/scala/icache.scala
index 8ed0091f..742905b7 100644
--- a/rocket/src/main/scala/icache.scala
+++ b/rocket/src/main/scala/icache.scala
@@ -5,7 +5,7 @@ import uncore._
 import Util._
 
 case class ICacheConfig(sets: Int, assoc: Int,
-                        ibytes: Int = 4, rowbytes: Int = 64,
+                        ibytes: Int = 4, rowbytes: Int = 16,
                         ntlb: Int = 8,
                         tl: TileLinkConfiguration,
                         as: AddressSpaceConfiguration,
@@ -187,10 +187,25 @@ class ICache(implicit c: ICacheConfig) extends Module
   val s2_miss = s2_valid && !s2_any_tag_hit
   rdy := state === s_ready && !s2_miss
 
-  //assert(!co.isVoluntary(io.mem.grant.bits.payload) || !io.mem.grant.valid, "UncachedRequestors shouldn't get voluntary grants.")
-  val (rf_cnt, refill_done) = (if(c.refillcycles > 1) Counter(io.mem.grant.valid, c.refillcycles) else (UInt(0), state === s_refill))
-  val repl_way = if (c.dm) UInt(0) else LFSR16(s2_miss)(log2Up(c.assoc)-1,0)
+  var refill_cnt = UInt(0)
+  var refill_done = state === s_refill
+  var refill_valid = io.mem.grant.valid
+  var refill_bits = io.mem.grant.bits
+  def doRefill(g: Grant): Bool = Bool(true)
+  if(c.refillcycles > 1) {
+    val ser = Module(new FlowThroughSerializer(io.mem.grant.bits, c.refillcycles, doRefill))
+    ser.io.in <> io.mem.grant
+    refill_cnt = ser.io.cnt
+    refill_done = ser.io.done
+    refill_valid = ser.io.out.valid
+    refill_bits = ser.io.out.bits
+    ser.io.out.ready := Bool(true)
+  } else {
+    io.mem.grant.ready := Bool(true)
+  }
+  //assert(!c.tlco.isVoluntary(refill_bits.payload) || !refill_valid, "UncachedRequestors shouldn't get voluntary grants.")
+  val repl_way = if (c.dm) UInt(0) else LFSR16(s2_miss)(log2Up(c.assoc)-1,0)
 
   val enc_tagbits = c.code.width(c.tagbits)
   val tag_array = Mem(Bits(width = enc_tagbits*c.assoc), c.sets, seqRead = true)
   val tag_raddr = Reg(UInt())
@@ -240,14 +255,14 @@ class ICache(implicit c: ICacheConfig) extends Module
   for (i <- 0 until c.assoc) {
     val data_array = Mem(Bits(width = c.code.width(c.rowbits)), c.sets*c.refillcycles, seqRead = true)
     val s1_raddr = Reg(UInt())
-    when (io.mem.grant.valid && repl_way === UInt(i)) {
-      val d = io.mem.grant.bits.payload.data
-      if(c.refillcycles > 1) data_array(Cat(s2_idx,rf_cnt)) := c.code.encode(d)
-      else data_array(s2_idx) := c.code.encode(d)
+    when (refill_valid && repl_way === UInt(i)) {
+      val e_d = c.code.encode(refill_bits.payload.data)
+      if(c.refillcycles > 1) data_array(Cat(s2_idx,refill_cnt)) := e_d
+      else data_array(s2_idx) := e_d
     }
 //    /*.else*/when (s0_valid) { // uncomment ".else" to infer 6T SRAM
     .elsewhen (s0_valid) {
-      s1_raddr := s0_pgoff(c.untagbits-1,c.offbits-(if(c.refillcycles > 1) rf_cnt.getWidth else 0))
+      s1_raddr := s0_pgoff(c.untagbits-1,c.offbits-(if(c.refillcycles > 1) refill_cnt.getWidth else 0))
     }
     // if s1_tag_match is critical, replace with partial tag check
     when (s1_valid && rdy && !stall && (Bool(c.dm) || s1_tag_match(i))) { s2_dout(i) := data_array(s1_raddr) }
@@ -257,16 +272,15 @@ class ICache(implicit c: ICacheConfig) extends Module
   io.resp.bits.datablock := Mux1H(s2_tag_hit, s2_dout)
 
   val ack_q = Module(new Queue(new LogicalNetworkIO(new GrantAck), 1))
-  ack_q.io.enq.valid := refill_done && tl.co.requiresAckForGrant(io.mem.grant.bits.payload.g_type)
-  ack_q.io.enq.bits.payload.master_xact_id := io.mem.grant.bits.payload.master_xact_id
-  ack_q.io.enq.bits.header.dst := io.mem.grant.bits.header.src
+  ack_q.io.enq.valid := refill_done && tl.co.requiresAckForGrant(refill_bits.payload.g_type)
+  ack_q.io.enq.bits.payload.master_xact_id := refill_bits.payload.master_xact_id
+  ack_q.io.enq.bits.header.dst := refill_bits.header.src
 
   // output signals
   io.resp.valid := s2_hit
   io.mem.acquire.valid := (state === s_request) && ack_q.io.enq.ready
   io.mem.acquire.bits.payload := Acquire(tl.co.getUncachedReadAcquireType, s2_addr >> UInt(c.offbits), UInt(0))
   io.mem.grant_ack <> ack_q.io.deq
-  io.mem.grant.ready := Bool(true)
 
   // control state machine
   switch (state) {
diff --git a/rocket/src/main/scala/nbdcache.scala b/rocket/src/main/scala/nbdcache.scala
index 39390aae..a4b686e5 100644
--- a/rocket/src/main/scala/nbdcache.scala
+++ b/rocket/src/main/scala/nbdcache.scala
@@ -9,13 +9,10 @@ case class DCacheConfig(sets: Int, ways: Int,
                         tl: TileLinkConfiguration,
                         as: AddressSpaceConfiguration,
                         reqtagbits: Int, databits: Int,
-                        rowwords: Int = 8,
+                        rowwords: Int = 2,
                         code: Code = new IdentityCode,
                         narrowRead: Boolean = true) {
-  require(states > 0)
-  require(isPow2(sets))
-  require(isPow2(ways)) // TODO: relax this
 
   def states = tl.co.nClientStates
   def lines = sets*ways
   def dm = ways == 1
@@ -35,7 +32,7 @@ case class DCacheConfig(sets: Int, ways: Int,
   def rowbits = rowwords*databits
   def rowbytes = rowwords*databytes
   def rowoffbits = log2Up(rowbytes)
-  def refillcycles = tl.dataBits/(rowwords*databits)
+  def refillcycles = tl.dataBits/(rowbits)
   def isNarrowRead = narrowRead && databits*ways % rowbits == 0
   val statebits = log2Up(states)
   val metabits = statebits + tagbits
@@ -43,6 +40,11 @@ case class DCacheConfig(sets: Int, ways: Int,
   val encmetabits = code.width(metabits)
   val encrowbits = rowwords*encdatabits
   val lrsc_cycles = 32 // ISA requires 16-insn LRSC sequences to succeed
+
+  require(states > 0)
+  require(isPow2(sets))
+  require(isPow2(ways)) // TODO: relax this
+  require(rowbits <= tl.dataBits)
 }
 
 abstract trait DCacheBundle extends Bundle {
@@ -200,6 +202,7 @@ class MSHR(id: Int)(implicit conf: DCacheConfig) extends Module {
   val idx_match = req_idx === io.req_bits.addr(conf.untagbits-1,conf.offbits)
   val sec_rdy = idx_match && (state === s_wb_req || state === s_wb_resp || state === s_meta_clear ||
                   (state === s_refill_req || state === s_refill_resp) && !tl.co.needsTransactionOnSecondaryMiss(req_cmd, io.mem_req.bits))
+  require(isPow2(conf.refillcycles))
   val reply = io.mem_grant.valid && io.mem_grant.bits.payload.client_xact_id === UInt(id)
   val refill_done = reply && (if(conf.refillcycles > 1) refill_count.andR else Bool(true))
   val wb_done = reply && (state === s_wb_resp)
@@ -302,7 +305,6 @@ class MSHR(id: Int)(implicit conf: DCacheConfig) extends Module {
   io.mem_req.bits.addr := Cat(io.tag, req_idx).toUInt
   io.mem_req.bits.client_xact_id := Bits(id)
   io.mem_finish <> ackq.io.deq
-  io.mem_req.bits.client_xact_id := Bits(id)
 
   io.meta_read.valid := state === s_drain_rpq
   io.meta_read.bits.addr := io.mem_req.bits.addr << conf.offbits
@@ -430,59 +432,73 @@ class WritebackUnit(implicit conf: DCacheConfig) extends Module {
     val release = Decoupled(new Release)
   }
 
-  require(conf.refillcycles == 1) // TODO Currently will issue refillcycles distinct releases; need to merge if rowsize < tilelink.dataSize
-
-  val valid = Reg(init=Bool(false))
+  val active = Reg(init=Bool(false))
   val r1_data_req_fired = Reg(init=Bool(false))
   val r2_data_req_fired = Reg(init=Bool(false))
-  val cmd_sent = Reg(Bool())
-  val cnt = Reg(UInt(width = log2Up(conf.refillcycles+1)))
+  val cnt = Reg(init = UInt(0, width = log2Up(conf.refillcycles+1)))
   val req = Reg(new WritebackReq)
 
-  when (valid) {
+  io.release.valid := false
+  when (active) {
     r1_data_req_fired := false
     r2_data_req_fired := r1_data_req_fired
     when (io.data_req.fire() && io.meta_read.fire()) {
       r1_data_req_fired := true
       cnt := cnt + 1
     }
-    when (r2_data_req_fired && !io.release.ready) {
-      r1_data_req_fired := false
-      r2_data_req_fired := false
-      cnt := (if(conf.refillcycles > 1) cnt - Mux[UInt](r1_data_req_fired, 2, 1) else UInt(0))
+    if(conf.refillcycles > 1) { // Coalescing buffer inserted
+      when (!r1_data_req_fired && !r2_data_req_fired && cnt === conf.refillcycles) {
+        io.release.valid := true
+        active := !io.release.ready
+      }
+    } else { // No buffer, data released a cycle earlier
+      when (r2_data_req_fired) {
+        io.release.valid := true
+        when(!io.release.ready) {
+          r1_data_req_fired := false
+          r2_data_req_fired := false
+          cnt := UInt(0)
+        } .otherwise {
+          active := false
+        }
+      }
     }
-    when (io.release.fire()) {
-      cmd_sent := true
-    }
-    when (!r1_data_req_fired && !r2_data_req_fired && cmd_sent && cnt === conf.refillcycles) {
-      valid := false
-    }
   }
   when (io.req.fire()) {
-    valid := true
-    cmd_sent := false
+    active := true
    cnt := 0
    req := io.req.bits
   }
 
-  val fire = valid && cnt < UInt(conf.refillcycles)
-  io.req.ready := !valid
-  io.data_req.valid := fire
-  io.data_req.bits.way_en := req.way_en
-  io.data_req.bits.addr := (if(conf.refillcycles > 1) Cat(req.idx, cnt(log2Up(conf.refillcycles)-1,0))
-                            else req.idx) << conf.rowoffbits
-
-  io.release.valid := valid && r2_data_req_fired
-  io.release.bits.r_type := req.r_type
-  io.release.bits.addr := Cat(req.tag, req.idx).toUInt
-  io.release.bits.client_xact_id := req.client_xact_id
-  io.release.bits.master_xact_id := req.master_xact_id
-  io.release.bits.data := io.data_resp
+  val fire = active && cnt < UInt(conf.refillcycles)
+  io.req.ready := !active
 
   // We reissue the meta read as it sets up the muxing for s2_data_muxed
   io.meta_read.valid := fire
   io.meta_read.bits.addr := io.release.bits.addr << conf.offbits
+
+  io.data_req.valid := fire
+  io.data_req.bits.way_en := req.way_en
+  if(conf.refillcycles > 1) {
+    io.data_req.bits.addr := Cat(req.idx, cnt(log2Up(conf.refillcycles)-1,0)) << conf.rowoffbits
+  } else {
+    io.data_req.bits.addr := req.idx << conf.rowoffbits
+  }
+
+  io.release.bits.r_type := req.r_type
+  io.release.bits.addr := Cat(req.tag, req.idx).toUInt
+  io.release.bits.client_xact_id := req.client_xact_id
+  io.release.bits.master_xact_id := req.master_xact_id
+  if(conf.refillcycles > 1) {
+    val data_buf = Reg(Bits())
+    when(active && r2_data_req_fired) {
+      data_buf := Cat(io.data_resp, data_buf(conf.refillcycles*conf.encrowbits-1, conf.encrowbits))
+    }
+    io.release.bits.data := data_buf
+  } else {
+    io.release.bits.data := io.data_resp
+  }
+
 }
 
 class ProbeUnit(implicit conf: DCacheConfig) extends Module {
@@ -541,7 +557,7 @@ class ProbeUnit(implicit conf: DCacheConfig) extends Module {
   }
 
   io.req.ready := state === s_invalid
-  io.rep.valid := state === s_release && !tl.co.needsWriteback(line_state)
+  io.rep.valid := state === s_release && !(hit && tl.co.needsWriteback(line_state))
   io.rep.bits := Release(tl.co.getReleaseTypeOnProbe(req, Mux(hit, line_state, tl.co.newStateOnFlush)), req.addr, req.client_xact_id, req.master_xact_id)
 
   io.meta_read.valid := state === s_meta_read
@@ -827,8 +843,6 @@ class HellaCache(implicit conf: DCacheConfig) extends Module {
   // data
   val data = Module(new DataArray)
   val readArb = Module(new Arbiter(new DataReadReq, 4))
-  readArb.io.out.ready := !io.mem.grant.valid || io.mem.grant.ready // insert bubble if refill gets blocked
-  readArb.io.out <> data.io.read
 
   val writeArb = Module(new Arbiter(new DataWriteReq, 2))
   data.io.write.valid := writeArb.io.out.valid
@@ -912,7 +926,9 @@ class HellaCache(implicit conf: DCacheConfig) extends Module {
   }
 
   writeArb.io.in(0).bits.addr := s3_req.addr
-  writeArb.io.in(0).bits.wmask := UInt(1) << s3_req.addr(conf.rowoffbits-1,offsetlsb).toUInt
+  writeArb.io.in(0).bits.wmask := UInt(1) << (if(conf.rowoffbits > offsetlsb)
+                                                s3_req.addr(conf.rowoffbits-1,offsetlsb).toUInt
+                                              else UInt(0))
   writeArb.io.in(0).bits.data := Fill(conf.rowwords, s3_req.data)
   writeArb.io.in(0).valid := s3_valid
   writeArb.io.in(0).bits.way_en := s3_way
@@ -932,8 +948,6 @@ class HellaCache(implicit conf: DCacheConfig) extends Module {
   mshrs.io.req.bits.data := s2_req.data
   when (mshrs.io.req.fire()) { replacer.miss }
 
-  mshrs.io.mem_grant.valid := io.mem.grant.fire()
-  mshrs.io.mem_grant.bits := io.mem.grant.bits
   io.mem.acquire <> DecoupledLogicalNetworkIOWrapper(mshrs.io.mem_req)
 
   // replays
@@ -944,6 +958,7 @@ class HellaCache(implicit conf: DCacheConfig) extends Module {
   s1_replay := mshrs.io.replay.valid && readArb.io.in(1).ready
   metaReadArb.io.in(1) <> mshrs.io.meta_read
   metaWriteArb.io.in(0) <> mshrs.io.meta_write
+
   // probes
   val releaseArb = Module(new Arbiter(new Release, 2))
   DecoupledLogicalNetworkIOWrapper(releaseArb.io.out) <> io.mem.release
@@ -960,12 +975,21 @@ class HellaCache(implicit conf: DCacheConfig) extends Module {
   prober.io.mshr_rdy := mshrs.io.probe_rdy
 
   // refills
-  val refill = tl.co.messageUpdatesDataArray(io.mem.grant.bits.payload)
-  writeArb.io.in(1).valid := io.mem.grant.valid && refill
-  io.mem.grant.ready := writeArb.io.in(1).ready || !refill
+  def doRefill(g: Grant): Bool = tl.co.messageUpdatesDataArray(g)
+  val refill = if(conf.refillcycles > 1) {
+    val ser = Module(new FlowThroughSerializer(io.mem.grant.bits, conf.refillcycles, doRefill))
+    ser.io.in <> io.mem.grant
+    ser.io.out
+  } else io.mem.grant
+  mshrs.io.mem_grant.valid := refill.fire()
+  mshrs.io.mem_grant.bits := refill.bits
+  refill.ready := writeArb.io.in(1).ready || !doRefill(refill.bits.payload)
+  writeArb.io.in(1).valid := refill.valid && doRefill(refill.bits.payload)
   writeArb.io.in(1).bits := mshrs.io.mem_resp
   writeArb.io.in(1).bits.wmask := SInt(-1)
-  writeArb.io.in(1).bits.data := io.mem.grant.bits.payload.data
+  writeArb.io.in(1).bits.data := refill.bits.payload.data(conf.encrowbits-1,0)
+  readArb.io.out.ready := !refill.valid || refill.ready // insert bubble if refill gets blocked
+  readArb.io.out <> data.io.read
 
   // writebacks
   val wbArb = Module(new Arbiter(new WritebackReq, 2))
diff --git a/rocket/src/main/scala/util.scala b/rocket/src/main/scala/util.scala
index 24016763..e912bb1b 100644
--- a/rocket/src/main/scala/util.scala
+++ b/rocket/src/main/scala/util.scala
@@ -1,6 +1,7 @@
 package rocket
 
 import Chisel._
+import uncore._
 import scala.math._
 
 class BooleanToInt(x: Int) {
@@ -161,3 +162,49 @@ object Random
   private def partition(value: UInt, slices: Int) =
     Vec.tabulate(slices)(i => value < round((i << value.getWidth).toDouble / slices))
 }
+
+class FlowThroughSerializer[T <: HasTileLinkData](gen: LogicalNetworkIO[T], n: Int, doSer: T => Bool) extends Module {
+  val io = new Bundle {
+    val in = Decoupled(gen.clone).flip
+    val out = Decoupled(gen.clone)
+    val cnt = UInt(OUTPUT, log2Up(n))
+    val done = Bool(OUTPUT)
+  }
+  require(io.in.bits.payload.data.getWidth % n == 0)
+  val narrowWidth = io.in.bits.payload.data.getWidth / n
+  val cnt = Reg(init=UInt(0, width = log2Up(n)))
+  val wrap = cnt === UInt(n-1)
+  val rbits = Reg(init=io.in.bits)
+  val active = Reg(init=Bool(false))
+
+  val shifter = Vec.fill(n){Bits(width = narrowWidth)}
+  (0 until n).foreach {
+    i => shifter(i) := rbits.payload.data((i+1)*narrowWidth-1,i*narrowWidth)
+  }
+
+  io.done := Bool(false)
+  io.cnt := cnt
+  io.in.ready := !active
+  io.out.valid := active || io.in.valid
+  io.out.bits := io.in.bits
+  when(!active && io.in.valid) {
+    when(doSer(io.in.bits.payload)) {
+      cnt := Mux(io.out.ready, UInt(1), UInt(0))
+      rbits := io.in.bits
+      active := Bool(true)
+    }
+    io.done := !doSer(io.in.bits.payload)
+  }
+  when(active) {
+    io.out.bits := rbits
+    io.out.bits.payload.data := shifter(cnt)
+    when(io.out.ready) {
+      cnt := cnt + UInt(1)
+      when(wrap) {
+        io.done := Bool(true)
+        active := Bool(false)
+      }
+    }
+  }
+}
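A minimal usage sketch of the serializer, mirroring the icache hookup in the
diff above. The wrapper name RefillPath and its ports are hypothetical, and
the implicit TileLinkConfiguration/LogicalNetworkConfiguration values are
assumed to be in scope as elsewhere in the codebase; only FlowThroughSerializer
itself comes from this patch. Each wide Grant beat is re-emitted as n
row-sized beats on io.out, io.cnt names the row within the block, and io.done
pulses with the last beat (immediately for messages the predicate exempts,
which flow through in a single cycle, hence the name):

    // Hypothetical wrapper feeding a TileLink Grant stream through the serializer.
    class RefillPath(n: Int)(implicit tl: TileLinkConfiguration) extends Module {
      implicit val ln = tl.ln  // LogicalNetworkIO needs the logical network config
      val io = new Bundle {
        val grant = Decoupled(new LogicalNetworkIO(new Grant)).flip
        val row   = Decoupled(new LogicalNetworkIO(new Grant)) // payload.data holds 1/n of a beat
        val done  = Bool(OUTPUT)
      }
      def doRefill(g: Grant): Bool = Bool(true) // serialize every grant in this sketch
      val ser = Module(new FlowThroughSerializer(io.grant.bits, n, doRefill))
      ser.io.in <> io.grant  // upstream grants, tl.dataBits of data per beat
      io.row <> ser.io.out   // n beats per grant, one cache row each
      io.done := ser.io.done // high with the final beat of each serialized grant
    }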