
Add support for L1 data scratchpads instead of caches

They fit in the part of the address space that DRAM would otherwise
occupy, and they are coherent (because they are not cacheable).

They are currently limited to single-core systems without DRAM.  We
intend to lift both restrictions, probably when we add support for
heterogeneous tiles.
Author: Andrew Waterman
Date: 2016-09-02 15:59:16 -07:00
parent dc9ae19936
commit 63679bb019
8 changed files with 140 additions and 36 deletions
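The diffs below introduce a DataScratchpadSize parameter; any nonzero value sets usingDataScratchpad and replaces the D$ with a scratchpad. As a rough sketch of how a build might opt in (a hypothetical cde config fragment, not part of this commit; the exact Config constructor varies across rocket-chip revisions):

// Hypothetical sketch: select a 16 KiB data scratchpad via the new field.
class WithDataScratchpad(size: Int) extends Config(
  (pname, site, here) => pname match {
    case DataScratchpadSize => size // e.g. 16384 for 16 KiB
  })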

@@ -54,11 +54,8 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) {
   // tags
   val replacer = p(Replacer)()
   def onReset = L1Metadata(UInt(0), ClientMetadata.onReset)
-  val meta = Module(new MetadataArray(onReset _))
   val metaReadArb = Module(new Arbiter(new MetaReadReq, 3))
   val metaWriteArb = Module(new Arbiter(new L1MetaWriteReq, 3))
-  meta.io.read <> metaReadArb.io.out
-  meta.io.write <> metaWriteArb.io.out

   // data
   val data = Module(new DCacheDataArray)
@@ -116,13 +113,28 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) {
   val s1_paddr = Cat(tlb.io.resp.ppn, s1_req.addr(pgIdxBits-1,0))
   val s1_tag = Mux(s1_probe, probe_bits.addr_block >> idxBits, s1_paddr(paddrBits-1, untagBits))
-  val s1_hit_way = meta.io.resp.map(r => r.coh.isValid() && r.tag === s1_tag).asUInt
-  val s1_hit_state = ClientMetadata.onReset.fromBits(
-    meta.io.resp.map(r => Mux(r.tag === s1_tag, r.coh.asUInt, UInt(0)))
-      .reduce (_|_))
+  val s1_victim_way = Wire(init = replacer.way)
+  val (s1_hit_way, s1_hit_state, s1_victim_meta) =
+    if (usingDataScratchpad) {
+      require(nWays == 1)
+      metaWriteArb.io.out.ready := true
+      metaReadArb.io.out.ready := !metaWriteArb.io.out.valid
+      val inScratchpad = addrMap(s"io:int:dmem${tileId}").containsAddress(s1_paddr)
+      val hitState = Mux(inScratchpad, ClientMetadata.onReset.onHit(M_XWR), ClientMetadata.onReset)
+      (inScratchpad, hitState, L1Metadata(UInt(0), ClientMetadata.onReset))
+    } else {
+      val meta = Module(new MetadataArray(onReset _))
+      meta.io.read <> metaReadArb.io.out
+      meta.io.write <> metaWriteArb.io.out
+      val s1_meta = meta.io.resp
+      val s1_hit_way = s1_meta.map(r => r.coh.isValid() && r.tag === s1_tag).asUInt
+      val s1_hit_state = ClientMetadata.onReset.fromBits(
+        s1_meta.map(r => Mux(r.tag === s1_tag, r.coh.asUInt, UInt(0)))
+          .reduce (_|_))
+      (s1_hit_way, s1_hit_state, s1_meta(s1_victim_way))
+    }
   val s1_data_way = Mux(inWriteback, releaseWay, s1_hit_way)
   val s1_data = Mux1H(s1_data_way, data.io.resp) // retime into s2 if critical
-  val s1_victim_way = Wire(init = replacer.way)

   val s2_valid = Reg(next=s1_valid_masked, init=Bool(false))
   val s2_probe = Reg(next=s1_probe, init=Bool(false))
@@ -133,7 +145,7 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) {
   when (s1_valid_not_nacked || s1_flush_valid) {
     s2_req := s1_req
     s2_req.addr := s1_paddr
-    s2_uncached := !tlb.io.resp.cacheable
+    s2_uncached := !tlb.io.resp.cacheable || Bool(usingDataScratchpad)
   }
   val s2_read = isRead(s2_req.cmd)
   val s2_write = isWrite(s2_req.cmd)
@@ -151,8 +163,8 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) {
   val s2_victimize = s2_valid_cached_miss || s2_flush_valid
   val s2_valid_uncached = s2_valid_miss && s2_uncached
   val s2_victim_way = Mux(s2_hit_state.isValid() && !s2_flush_valid, s2_hit_way, UIntToOH(RegEnable(s1_victim_way, s1_valid_not_nacked || s1_flush_valid)))
-  val s2_victim_tag = RegEnable(meta.io.resp(s1_victim_way).tag, s1_valid_not_nacked || s1_flush_valid)
-  val s2_victim_state = Mux(s2_hit_state.isValid() && !s2_flush_valid, s2_hit_state, RegEnable(meta.io.resp(s1_victim_way).coh, s1_valid_not_nacked || s1_flush_valid))
+  val s2_victim_tag = RegEnable(s1_victim_meta.tag, s1_valid_not_nacked || s1_flush_valid)
+  val s2_victim_state = Mux(s2_hit_state.isValid() && !s2_flush_valid, s2_hit_state, RegEnable(s1_victim_meta.coh, s1_valid_not_nacked || s1_flush_valid))
   val s2_victim_valid = s2_victim_state.isValid()
   val s2_victim_dirty = s2_victim_state.requiresVoluntaryWriteback()
   val s2_new_hit_state = s2_hit_state.onHit(s2_req.cmd)
@@ -261,7 +273,8 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) {
   io.mem.acquire.valid := ((s2_valid_cached_miss && !s2_victim_dirty) || s2_valid_uncached) && fq.io.enq.ready
   io.mem.acquire.bits := cachedGetMessage
   when (s2_uncached) {
-    assert(!s2_valid_masked || !s2_hit_state.isValid(), "cache hit on uncached access")
+    if (!usingDataScratchpad)
+      assert(!s2_valid_masked || !s2_hit_state.isValid(), "cache hit on uncached access")
     io.mem.acquire.bits := uncachedGetMessage
     when (s2_write) {
       io.mem.acquire.bits := uncachedPutMessage
@@ -419,7 +432,7 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) {
   val flushed = Reg(init=Bool(true))
   val flushing = Reg(init=Bool(false))
   val flushCounter = Counter(nSets * nWays)
-  when (io.mem.acquire.fire()) { flushed := false }
+  when (io.mem.acquire.fire() && !s2_uncached) { flushed := false }
   when (s2_valid_masked && s2_req.cmd === M_FLUSH_ALL) {
     io.cpu.s2_nack := !flushed
     when (!flushed) {
@@ -442,3 +455,61 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) {
     }
   }
 }
+
+class ScratchpadSlavePort(implicit p: Parameters) extends CoreModule()(p) {
+  val io = new Bundle {
+    val tl = new ClientUncachedTileLinkIO().flip
+    val dmem = new HellaCacheIO
+  }
+
+  val s_ready :: s_wait :: s_replay :: s_grant :: Nil = Enum(UInt(), 4)
+  val state = Reg(init = s_ready)
+  when (io.dmem.resp.valid) { state := s_grant }
+  when (io.tl.grant.fire()) { state := s_ready }
+  when (io.dmem.s2_nack) { state := s_replay }
+  when (io.dmem.req.fire()) { state := s_wait }
+
+  val acq = Reg(io.tl.acquire.bits)
+  when (io.dmem.resp.valid) { acq.data := io.dmem.resp.bits.data }
+  when (io.tl.acquire.fire()) { acq := io.tl.acquire.bits }
+
+  val isRead = acq.isBuiltInType(Acquire.getType)
+  val isWrite = acq.isBuiltInType(Acquire.putType)
+  assert(state === s_ready || isRead || isWrite)
+  require(coreDataBits == acq.tlDataBits)
+  require(usingDataScratchpad)
+
+  def formCacheReq(acq: Acquire) = {
+    val req = Wire(new HellaCacheReq)
+    // treat all loads as full words, so bytes appear in correct lane
+    req.typ := Mux(isRead, log2Ceil(acq.tlDataBytes), acq.op_size())
+    req.cmd := acq.op_code()
+    req.addr := Mux(isRead, ~(~acq.full_addr() | (acq.tlDataBytes-1)), acq.full_addr())
+    req.tag := UInt(0)
+    req
+  }
+
+  val ready = state === s_ready || io.tl.grant.fire()
+  io.dmem.req.valid := (io.tl.acquire.valid && ready) || state === s_replay
+  io.tl.acquire.ready := io.dmem.req.ready && ready
+  io.dmem.req.bits := formCacheReq(Mux(state === s_replay, acq, io.tl.acquire.bits))
+  // this blows. the TL data is already in the correct byte lane, but the D$
+  // expects right-justified store data, so that it can steer the bytes.
+  io.dmem.s1_data := new LoadGen(acq.op_size(), Bool(false), acq.addr_byte(), acq.data, Bool(false), acq.tlDataBytes).data
+  io.dmem.s1_kill := false
+  io.dmem.invalidate_lr := false
+
+  // place AMO data in correct word lane
+  val minAMOBytes = 4
+  val grantData = Mux(io.dmem.resp.valid, io.dmem.resp.bits.data, acq.data)
+  val alignedGrantData = Mux(acq.op_size() <= log2Ceil(minAMOBytes), Fill(coreDataBytes/minAMOBytes, grantData(8*minAMOBytes-1, 0)), grantData)
+
+  io.tl.grant.valid := io.dmem.resp.valid || state === s_grant
+  io.tl.grant.bits := Grant(
+    is_builtin_type = Bool(true),
+    g_type = acq.getBuiltInGrantType(),
+    client_xact_id = acq.client_xact_id,
+    manager_xact_id = UInt(0),
+    addr_beat = acq.addr_beat,
+    data = alignedGrantData)
+}
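To make the "this blows" comment concrete: a 1-byte TileLink put to byte offset 5 carries its payload in byte lane 5 of the beat, while HellaCacheIO expects store data right-justified in lane 0 so the D$ can steer the bytes itself; the LoadGen above performs that shift. A plain-Scala model of the fixup (hypothetical helper, not code from this commit):

// Hypothetical model of the store-data fixup: shift lane-aligned
// TileLink data down so it is right-justified for the D$.
def rightJustify(beat: BigInt, addrByte: Int, sizeBytes: Int): BigInt = {
  val shifted = beat >> (8 * addrByte)            // drop lanes below the access
  shifted & ((BigInt(1) << (8 * sizeBytes)) - 1)  // keep only the accessed bytes
}

// A 1-byte put of 0xAB at byte offset 5 of an 8-byte beat:
val beat = BigInt(0xAB) << (8 * 5)
assert(rightJustify(beat, addrByte = 5, sizeBytes = 1) == 0xAB)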

@@ -30,6 +30,7 @@ case object ResetVector extends Field[BigInt]
 case object NBreakpoints extends Field[Int]
 case object NPerfCounters extends Field[Int]
 case object NPerfEvents extends Field[Int]
+case object DataScratchpadSize extends Field[Int]

 trait HasCoreParameters extends HasAddrMapParameters {
   implicit val p: Parameters
@@ -48,6 +49,7 @@ trait HasCoreParameters extends HasAddrMapParameters {
   val nBreakpoints = p(NBreakpoints)
   val nPerfCounters = p(NPerfCounters)
   val nPerfEvents = p(NPerfEvents)
+  val usingDataScratchpad = p(DataScratchpadSize) > 0
   val retireWidth = p(RetireWidth)
   val fetchWidth = p(FetchWidth)
@@ -55,7 +57,7 @@ trait HasCoreParameters extends HasAddrMapParameters {
   val coreInstBytes = coreInstBits/8
   val coreDataBits = xLen
   val coreDataBytes = coreDataBits/8
-  val dcacheArbPorts = 1 + (if (usingVM) 1 else 0) + p(BuildRoCC).size
+  val dcacheArbPorts = 1 + usingVM.toInt + usingDataScratchpad.toInt + p(BuildRoCC).size
   val coreDCacheReqTagBits = 6
   val dcacheReqTagBits = coreDCacheReqTagBits + log2Ceil(dcacheArbPorts)
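The rewritten dcacheArbPorts line counts one port for the core, one for the PTW when VM is enabled, one for the new scratchpad slave port, and one per RoCC accelerator (usingVM.toInt appears to rely on a Util implicit converting Boolean to 0 or 1). A plain-Scala sanity check, assuming one RoCC accelerator:

// Hypothetical check: VM + scratchpad + one RoCC gives four arbiter ports.
val usingVM = true; val usingDataScratchpad = true; val nRocc = 1
val dcacheArbPorts =
  1 + (if (usingVM) 1 else 0) + (if (usingDataScratchpad) 1 else 0) + nRocc
assert(dcacheArbPorts == 4) // core + PTW + scratchpad slave + RoCC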

@@ -5,6 +5,7 @@ package rocket
 import Chisel._
 import uncore.tilelink._
 import uncore.agents._
+import uncore.converters._
 import uncore.devices._
 import Util._
 import cde.{Parameters, Field}
@@ -31,6 +32,7 @@ abstract class Tile(clockSignal: Clock = null, resetSignal: Bool = null)
     val cached = Vec(nCachedTileLinkPorts, new ClientTileLinkIO)
     val uncached = Vec(nUncachedTileLinkPorts, new ClientUncachedTileLinkIO)
     val prci = new PRCITileIO().flip
+    val slave = (p(DataScratchpadSize) > 0).option(new ClientUncachedTileLinkIO().flip)
   }

   val io = new TileIO
@@ -121,6 +123,12 @@ class RocketTile(clockSignal: Clock = null, resetSignal: Bool = null)
     core.io.ptw <> ptw.io.dpath
   }

+  io.slave foreach { case slavePort =>
+    val adapter = Module(new ScratchpadSlavePort()(dcacheParams))
+    adapter.io.tl <> TileLinkFragmenter(slavePort)
+    adapter.io.dmem +=: dcPorts
+  }
+
   require(dcPorts.size == core.dcacheArbPorts)
   val dcArb = Module(new HellaCacheArbiter(dcPorts.size)(dcacheParams))
   dcArb.io.requestor <> dcPorts
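One Scala subtlety in the wiring above: +=: prepends to a mutable buffer (operator names ending in a colon bind to the right-hand operand), so the scratchpad adapter's HellaCache port becomes dcPorts.head rather than being appended. A standalone illustration with placeholder strings:

import scala.collection.mutable.ListBuffer

// `x +=: buf` prepends, so the adapter lands at the head of the list.
val dcPorts = ListBuffer("core", "ptw")
"scratchpad" +=: dcPorts
assert(dcPorts == ListBuffer("scratchpad", "core", "ptw"))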