From 63679bb01921075ddd052d1c09af815167f1ec91 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 2 Sep 2016 15:59:16 -0700 Subject: [PATCH] Add support for L1 data scratchpads instead of caches The scratchpads occupy the same part of the address space that DRAM otherwise would, and are coherent (because they are not cacheable). They are currently limited to single cores without DRAM. We intend to lift both restrictions, probably when we add support for heterogeneous tiles. --- chisel3 | 2 +- riscv-tools | 2 +- src/main/scala/coreplex/Configs.scala | 9 +++ src/main/scala/coreplex/Coreplex.scala | 15 ++-- src/main/scala/rocket/dcache.scala | 97 +++++++++++++++++++++---- src/main/scala/rocket/rocket.scala | 4 +- src/main/scala/rocket/tile.scala | 8 ++ src/main/scala/rocketchip/Configs.scala | 39 ++++++---- 8 files changed, 140 insertions(+), 36 deletions(-) diff --git a/chisel3 b/chisel3 index 2a074c82..16426b3a 160000 --- a/chisel3 +++ b/chisel3 @@ -1 +1 @@ -Subproject commit 2a074c828ddd8e6c20fa21d618664d50120f3d7a +Subproject commit 16426b3a68d85ce7dd9655b0ce773431eb69fc74 diff --git a/riscv-tools b/riscv-tools index 705db10f..61d74b58 160000 --- a/riscv-tools +++ b/riscv-tools @@ -1 +1 @@ -Subproject commit 705db10fd14b313d5cc96f193d1271256bff75da +Subproject commit 61d74b5837d270f116fc21e907ed78f582361688 diff --git a/src/main/scala/coreplex/Configs.scala b/src/main/scala/coreplex/Configs.scala index 0767cafc..7dc26198 100644 --- a/src/main/scala/coreplex/Configs.scala +++ b/src/main/scala/coreplex/Configs.scala @@ -63,6 +63,7 @@ class BaseCoreplexConfig extends Config ( case BtbKey => BtbParameters() //L1DataCache case DCacheKey => DCacheConfig(nMSHRs = site(Knob("L1D_MSHRS"))) + case DataScratchpadSize => 0 //L2 Memory System Params case AmoAluOperandBits => site(XLen) case NAcquireTransactors => 7 @@ -239,6 +240,13 @@ class WithNBanksPerMemChannel(n: Int) extends Config( case _ => throw new CDEMatchError }) +class WithDataScratchpad(n: Int) extends Config( + 
(pname,site,here) => pname match { + case DataScratchpadSize => n + case NSets if site(CacheName) == "L1D" => n / site(CacheBlockBytes) + case _ => throw new CDEMatchError + }) + class WithL2Cache extends Config( (pname,site,here) => pname match { case "L2_CAPACITY_IN_KB" => Knob("L2_CAPACITY_IN_KB") @@ -330,6 +338,7 @@ class WithRV32 extends Config( "rv32mi-p-csr", "rv32ui-p-sh", "rv32ui-p-lh", + "rv32uc-p-rvc", "rv32mi-p-sbreak", "rv32ui-p-sll") case _ => throw new CDEMatchError diff --git a/src/main/scala/coreplex/Coreplex.scala b/src/main/scala/coreplex/Coreplex.scala index 7ef9f275..023b471b 100644 --- a/src/main/scala/coreplex/Coreplex.scala +++ b/src/main/scala/coreplex/Coreplex.scala @@ -66,6 +66,7 @@ class Uncore(implicit val p: Parameters) extends Module val mem = Vec(nMemChannels, new ClientUncachedTileLinkIO()(outermostParams)) val tiles_cached = Vec(nCachedTilePorts, new ClientTileLinkIO).flip val tiles_uncached = Vec(nUncachedTilePorts, new ClientUncachedTileLinkIO).flip + val tiles_slave = Vec(nTiles, new ClientUncachedTileLinkIO) val ext_uncached = Vec(nExtClients, new ClientUncachedTileLinkIO()(innerParams)).flip val prci = Vec(nTiles, new PRCITileIO).asOutput val mmio = exportMMIO.option(new ClientUncachedTileLinkIO()(outermostMMIOParams)) @@ -92,7 +93,8 @@ class Uncore(implicit val p: Parameters) extends Module rom.order(ByteOrder.LITTLE_ENDIAN) // for now, have the reset vector jump straight to memory - val resetToMemDist = p(GlobalAddrMap)("mem").start - p(ResetVector) + val memBase = (if (p(GlobalAddrMap) contains "mem") p(GlobalAddrMap)("mem") else p(GlobalAddrMap)("io:int:dmem0")).start + val resetToMemDist = memBase - p(ResetVector) require(resetToMemDist == (resetToMemDist.toInt >> 12 << 12)) val configStringAddr = p(ResetVector).toInt + rom.capacity @@ -134,6 +136,10 @@ class Uncore(implicit val p: Parameters) extends Module io.prci(i).reset := reset } + val tileSlavePorts = (0 until nTiles) map (i => s"int:dmem$i") filter (ioAddrMap 
contains _) + for ((t, m) <- io.tiles_slave zip (tileSlavePorts map (mmioNetwork port _))) + t <> ClientUncachedTileLinkEnqueuer(m, 1) + val bootROM = Module(new ROMSlave(makeBootROM())) bootROM.io <> mmioNetwork.port("int:bootrom") @@ -174,11 +180,9 @@ class DefaultOuterMemorySystem(implicit p: Parameters) extends OuterMemorySystem // Cached ports are first in client list, making sharerToClientId just an indentity function // addrToBank is sed to hash physical addresses (of cache blocks) to banks (and thereby memory channels) def sharerToClientId(sharerId: UInt) = sharerId - def addrToBank(addr: UInt): UInt = { + def addrToBank(addr: UInt): UInt = if (nBanks == 0) UInt(0) else { val isMemory = p(GlobalAddrMap).isInRegion("mem", addr << log2Up(p(CacheBlockBytes))) - Mux(isMemory, - if (nBanks > 1) addr(lsb + log2Up(nBanks) - 1, lsb) else UInt(0), - UInt(nBanks)) + Mux(isMemory, addr.extract(lsb + log2Ceil(nBanks) - 1, lsb), UInt(nBanks)) } val preBuffering = TileLinkDepths(1,1,2,2,0) val l1tol2net = Module(new PortedTileLinkCrossbar(addrToBank, sharerToClientId, preBuffering)) @@ -274,6 +278,7 @@ class DefaultCoreplex(topParams: Parameters) extends Coreplex()(topParams) { // Connect the uncore to the tile memory ports, HostIO and MemIO uncore.io.tiles_cached <> tileList.map(_.io.cached).flatten uncore.io.tiles_uncached <> tileList.map(_.io.uncached).flatten + (tileList.map(_.io.slave).flatten zip uncore.io.tiles_slave) foreach { case (x, y) => x <> y } uncore.io.interrupts <> io.interrupts uncore.io.debug <> io.debug uncore.io.ext_uncached <> io.ext_clients diff --git a/src/main/scala/rocket/dcache.scala b/src/main/scala/rocket/dcache.scala index e2edb148..68dab522 100644 --- a/src/main/scala/rocket/dcache.scala +++ b/src/main/scala/rocket/dcache.scala @@ -54,11 +54,8 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) { // tags val replacer = p(Replacer)() def onReset = L1Metadata(UInt(0), ClientMetadata.onReset) - val meta = Module(new 
MetadataArray(onReset _)) val metaReadArb = Module(new Arbiter(new MetaReadReq, 3)) val metaWriteArb = Module(new Arbiter(new L1MetaWriteReq, 3)) - meta.io.read <> metaReadArb.io.out - meta.io.write <> metaWriteArb.io.out // data val data = Module(new DCacheDataArray) @@ -116,13 +113,28 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) { val s1_paddr = Cat(tlb.io.resp.ppn, s1_req.addr(pgIdxBits-1,0)) val s1_tag = Mux(s1_probe, probe_bits.addr_block >> idxBits, s1_paddr(paddrBits-1, untagBits)) - val s1_hit_way = meta.io.resp.map(r => r.coh.isValid() && r.tag === s1_tag).asUInt - val s1_hit_state = ClientMetadata.onReset.fromBits( - meta.io.resp.map(r => Mux(r.tag === s1_tag, r.coh.asUInt, UInt(0))) - .reduce (_|_)) + val s1_victim_way = Wire(init = replacer.way) + val (s1_hit_way, s1_hit_state, s1_victim_meta) = + if (usingDataScratchpad) { + require(nWays == 1) + metaWriteArb.io.out.ready := true + metaReadArb.io.out.ready := !metaWriteArb.io.out.valid + val inScratchpad = addrMap(s"io:int:dmem${tileId}").containsAddress(s1_paddr) + val hitState = Mux(inScratchpad, ClientMetadata.onReset.onHit(M_XWR), ClientMetadata.onReset) + (inScratchpad, hitState, L1Metadata(UInt(0), ClientMetadata.onReset)) + } else { + val meta = Module(new MetadataArray(onReset _)) + meta.io.read <> metaReadArb.io.out + meta.io.write <> metaWriteArb.io.out + val s1_meta = meta.io.resp + val s1_hit_way = s1_meta.map(r => r.coh.isValid() && r.tag === s1_tag).asUInt + val s1_hit_state = ClientMetadata.onReset.fromBits( + s1_meta.map(r => Mux(r.tag === s1_tag, r.coh.asUInt, UInt(0))) + .reduce (_|_)) + (s1_hit_way, s1_hit_state, s1_meta(s1_victim_way)) + } val s1_data_way = Mux(inWriteback, releaseWay, s1_hit_way) val s1_data = Mux1H(s1_data_way, data.io.resp) // retime into s2 if critical - val s1_victim_way = Wire(init = replacer.way) val s2_valid = Reg(next=s1_valid_masked, init=Bool(false)) val s2_probe = Reg(next=s1_probe, init=Bool(false)) @@ -133,7 +145,7 @@ class 
DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) { when (s1_valid_not_nacked || s1_flush_valid) { s2_req := s1_req s2_req.addr := s1_paddr - s2_uncached := !tlb.io.resp.cacheable + s2_uncached := !tlb.io.resp.cacheable || Bool(usingDataScratchpad) } val s2_read = isRead(s2_req.cmd) val s2_write = isWrite(s2_req.cmd) @@ -151,8 +163,8 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) { val s2_victimize = s2_valid_cached_miss || s2_flush_valid val s2_valid_uncached = s2_valid_miss && s2_uncached val s2_victim_way = Mux(s2_hit_state.isValid() && !s2_flush_valid, s2_hit_way, UIntToOH(RegEnable(s1_victim_way, s1_valid_not_nacked || s1_flush_valid))) - val s2_victim_tag = RegEnable(meta.io.resp(s1_victim_way).tag, s1_valid_not_nacked || s1_flush_valid) - val s2_victim_state = Mux(s2_hit_state.isValid() && !s2_flush_valid, s2_hit_state, RegEnable(meta.io.resp(s1_victim_way).coh, s1_valid_not_nacked || s1_flush_valid)) + val s2_victim_tag = RegEnable(s1_victim_meta.tag, s1_valid_not_nacked || s1_flush_valid) + val s2_victim_state = Mux(s2_hit_state.isValid() && !s2_flush_valid, s2_hit_state, RegEnable(s1_victim_meta.coh, s1_valid_not_nacked || s1_flush_valid)) val s2_victim_valid = s2_victim_state.isValid() val s2_victim_dirty = s2_victim_state.requiresVoluntaryWriteback() val s2_new_hit_state = s2_hit_state.onHit(s2_req.cmd) @@ -261,7 +273,8 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) { io.mem.acquire.valid := ((s2_valid_cached_miss && !s2_victim_dirty) || s2_valid_uncached) && fq.io.enq.ready io.mem.acquire.bits := cachedGetMessage when (s2_uncached) { - assert(!s2_valid_masked || !s2_hit_state.isValid(), "cache hit on uncached access") + if (!usingDataScratchpad) + assert(!s2_valid_masked || !s2_hit_state.isValid(), "cache hit on uncached access") io.mem.acquire.bits := uncachedGetMessage when (s2_write) { io.mem.acquire.bits := uncachedPutMessage @@ -419,7 +432,7 @@ class DCache(implicit p: Parameters) 
extends L1HellaCacheModule()(p) { val flushed = Reg(init=Bool(true)) val flushing = Reg(init=Bool(false)) val flushCounter = Counter(nSets * nWays) - when (io.mem.acquire.fire()) { flushed := false } + when (io.mem.acquire.fire() && !s2_uncached) { flushed := false } when (s2_valid_masked && s2_req.cmd === M_FLUSH_ALL) { io.cpu.s2_nack := !flushed when (!flushed) { @@ -442,3 +455,61 @@ class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) { } } } + +class ScratchpadSlavePort(implicit p: Parameters) extends CoreModule()(p) { + val io = new Bundle { + val tl = new ClientUncachedTileLinkIO().flip + val dmem = new HellaCacheIO + } + + val s_ready :: s_wait :: s_replay :: s_grant :: Nil = Enum(UInt(), 4) + val state = Reg(init = s_ready) + when (io.dmem.resp.valid) { state := s_grant } + when (io.tl.grant.fire()) { state := s_ready } + when (io.dmem.s2_nack) { state := s_replay } + when (io.dmem.req.fire()) { state := s_wait } + + val acq = Reg(io.tl.acquire.bits) + when (io.dmem.resp.valid) { acq.data := io.dmem.resp.bits.data } + when (io.tl.acquire.fire()) { acq := io.tl.acquire.bits } + + val isRead = acq.isBuiltInType(Acquire.getType) + val isWrite = acq.isBuiltInType(Acquire.putType) + assert(state === s_ready || isRead || isWrite) + require(coreDataBits == acq.tlDataBits) + require(usingDataScratchpad) + + def formCacheReq(acq: Acquire) = { + val req = Wire(new HellaCacheReq) + // treat all loads as full words, so bytes appear in correct lane + req.typ := Mux(isRead, log2Ceil(acq.tlDataBytes), acq.op_size()) + req.cmd := acq.op_code() + req.addr := Mux(isRead, ~(~acq.full_addr() | (acq.tlDataBytes-1)), acq.full_addr()) + req.tag := UInt(0) + req + } + + val ready = state === s_ready || io.tl.grant.fire() + io.dmem.req.valid := (io.tl.acquire.valid && ready) || state === s_replay + io.tl.acquire.ready := io.dmem.req.ready && ready + io.dmem.req.bits := formCacheReq(Mux(state === s_replay, acq, io.tl.acquire.bits)) + // this blows. 
the TL data is already in the correct byte lane, but the D$ + // expects right-justified store data, so that it can steer the bytes. + io.dmem.s1_data := new LoadGen(acq.op_size(), Bool(false), acq.addr_byte(), acq.data, Bool(false), acq.tlDataBytes).data + io.dmem.s1_kill := false + io.dmem.invalidate_lr := false + + // place AMO data in correct word lane + val minAMOBytes = 4 + val grantData = Mux(io.dmem.resp.valid, io.dmem.resp.bits.data, acq.data) + val alignedGrantData = Mux(acq.op_size() <= log2Ceil(minAMOBytes), Fill(coreDataBytes/minAMOBytes, grantData(8*minAMOBytes-1, 0)), grantData) + + io.tl.grant.valid := io.dmem.resp.valid || state === s_grant + io.tl.grant.bits := Grant( + is_builtin_type = Bool(true), + g_type = acq.getBuiltInGrantType(), + client_xact_id = acq.client_xact_id, + manager_xact_id = UInt(0), + addr_beat = acq.addr_beat, + data = alignedGrantData) +} diff --git a/src/main/scala/rocket/rocket.scala b/src/main/scala/rocket/rocket.scala index 3c4ed21c..49b32be7 100644 --- a/src/main/scala/rocket/rocket.scala +++ b/src/main/scala/rocket/rocket.scala @@ -30,6 +30,7 @@ case object ResetVector extends Field[BigInt] case object NBreakpoints extends Field[Int] case object NPerfCounters extends Field[Int] case object NPerfEvents extends Field[Int] +case object DataScratchpadSize extends Field[Int] trait HasCoreParameters extends HasAddrMapParameters { implicit val p: Parameters @@ -48,6 +49,7 @@ trait HasCoreParameters extends HasAddrMapParameters { val nBreakpoints = p(NBreakpoints) val nPerfCounters = p(NPerfCounters) val nPerfEvents = p(NPerfEvents) + val usingDataScratchpad = p(DataScratchpadSize) > 0 val retireWidth = p(RetireWidth) val fetchWidth = p(FetchWidth) @@ -55,7 +57,7 @@ trait HasCoreParameters extends HasAddrMapParameters { val coreInstBytes = coreInstBits/8 val coreDataBits = xLen val coreDataBytes = coreDataBits/8 - val dcacheArbPorts = 1 + (if (usingVM) 1 else 0) + p(BuildRoCC).size + val dcacheArbPorts = 1 + usingVM.toInt + 
usingDataScratchpad.toInt + p(BuildRoCC).size val coreDCacheReqTagBits = 6 val dcacheReqTagBits = coreDCacheReqTagBits + log2Ceil(dcacheArbPorts) diff --git a/src/main/scala/rocket/tile.scala b/src/main/scala/rocket/tile.scala index bea82e65..df055f90 100644 --- a/src/main/scala/rocket/tile.scala +++ b/src/main/scala/rocket/tile.scala @@ -5,6 +5,7 @@ package rocket import Chisel._ import uncore.tilelink._ import uncore.agents._ +import uncore.converters._ import uncore.devices._ import Util._ import cde.{Parameters, Field} @@ -31,6 +32,7 @@ abstract class Tile(clockSignal: Clock = null, resetSignal: Bool = null) val cached = Vec(nCachedTileLinkPorts, new ClientTileLinkIO) val uncached = Vec(nUncachedTileLinkPorts, new ClientUncachedTileLinkIO) val prci = new PRCITileIO().flip + val slave = (p(DataScratchpadSize) > 0).option(new ClientUncachedTileLinkIO().flip) } val io = new TileIO @@ -121,6 +123,12 @@ class RocketTile(clockSignal: Clock = null, resetSignal: Bool = null) core.io.ptw <> ptw.io.dpath } + io.slave foreach { case slavePort => + val adapter = Module(new ScratchpadSlavePort()(dcacheParams)) + adapter.io.tl <> TileLinkFragmenter(slavePort) + adapter.io.dmem +=: dcPorts + } + require(dcPorts.size == core.dcacheArbPorts) val dcArb = Module(new HellaCacheArbiter(dcPorts.size)(dcacheParams)) dcArb.io.requestor <> dcPorts diff --git a/src/main/scala/rocketchip/Configs.scala b/src/main/scala/rocketchip/Configs.scala index 72c72631..916188ee 100644 --- a/src/main/scala/rocketchip/Configs.scala +++ b/src/main/scala/rocketchip/Configs.scala @@ -5,6 +5,7 @@ package rocketchip import Chisel._ import junctions._ import rocket._ +import rocket.Util._ import uncore.agents._ import uncore.tilelink._ import uncore.devices._ @@ -26,6 +27,11 @@ class BasePlatformConfig extends Config ( entries += AddrMapEntry("bootrom", MemSize(4096, MemAttr(AddrMapProt.RX))) entries += AddrMapEntry("plic", MemRange(0x40000000, 0x4000000, MemAttr(AddrMapProt.RW))) entries += 
AddrMapEntry("prci", MemSize(0x4000000, MemAttr(AddrMapProt.RW))) + if (site(DataScratchpadSize) > 0) { // TODO heterogeneous tiles + require(site(NTiles) == 1) // TODO relax this + require(site(NMemoryChannels) == 0) // TODO allow both scratchpad & DRAM + entries += AddrMapEntry("dmem0", MemRange(0x80000000L, site[Int](DataScratchpadSize), MemAttr(AddrMapProt.RWX))) + } new AddrMap(entries) } lazy val externalAddrMap = new AddrMap( @@ -38,13 +44,11 @@ class BasePlatformConfig extends Config ( val intern = AddrMapEntry("int", internalIOAddrMap) val extern = AddrMapEntry("ext", externalAddrMap) - val ioMap = if (site(ExportMMIOPort)) AddrMap(intern, extern) else AddrMap(intern) + val io = AddrMapEntry("io", AddrMap((intern +: site(ExportMMIOPort).option(extern).toSeq):_*)) + val mem = AddrMapEntry("mem", MemRange(memBase, memSize, MemAttr(AddrMapProt.RWX, true))) + val addrMap = AddrMap((io +: (site(NMemoryChannels) > 0).option(mem).toSeq):_*) - val addrMap = AddrMap( - AddrMapEntry("io", ioMap), - AddrMapEntry("mem", MemRange(memBase, memSize, MemAttr(AddrMapProt.RWX, true)))) - - Dump("MEM_BASE", addrMap("mem").start) + Dump("MEM_BASE", memBase) addrMap } def makeConfigString() = { @@ -62,15 +66,17 @@ class BasePlatformConfig extends Config ( res append "rtc {\n" res append s" addr 0x${(prciAddr + PRCI.time).toString(16)};\n" res append "};\n" - res append "ram {\n" - res append " 0 {\n" - res append s" addr 0x${addrMap("mem").start.toString(16)};\n" - res append s" size 0x${addrMap("mem").size.toString(16)};\n" - res append " };\n" - res append "};\n" - res append "core {\n" - for (i <- 0 until site(NTiles)) { - val isa = s"rv${site(XLen)}im${if (site(UseAtomics)) "a" else ""}${if (site(FPUKey).nonEmpty) "fd" else ""}" + if (addrMap contains "mem") { + res append "ram {\n" + res append " 0 {\n" + res append s" addr 0x${addrMap("mem").start.toString(16)};\n" + res append s" size 0x${addrMap("mem").size.toString(16)};\n" + res append " };\n" + res append "};\n" + 
res append "core {\n" + } + for (i <- 0 until site(NTiles)) { // TODO heterogeneous tiles + val isa = s"rv${site(XLen)}i${site(MulDivKey).map(x=>"m").mkString}${if (site(UseAtomics)) "a" else ""}${if (site(FPUKey).nonEmpty) "fd" else ""}" res append s" $i {\n" res append " 0 {\n" res append s" isa $isa;\n" @@ -204,6 +210,8 @@ class WithTL extends Config( case NExtMMIOTLChannels => 1 }) +class WithScratchpads extends Config(new WithNMemoryChannels(0) ++ new WithDataScratchpad(16384)) + class DefaultFPGASmallConfig extends Config(new WithSmallCores ++ new DefaultFPGAConfig) class DefaultSmallConfig extends Config(new WithSmallCores ++ new BaseConfig) class DefaultRV32Config extends Config(new WithRV32 ++ new DefaultSmallConfig) @@ -253,6 +261,7 @@ class DualCoreConfig extends Config( new WithNCores(2) ++ new WithL2Cache ++ new BaseConfig) class TinyConfig extends Config( + new WithScratchpads ++ new WithRV32 ++ new WithSmallCores ++ new WithStatelessBridge ++ new BaseConfig)