
[tl2] convert NBDcache to TL2 (WIP; compiles but untested)

Henry Cook 2016-11-18 19:01:36 -08:00
parent 5f1cc19d71
commit 8b908465e0
8 changed files with 1049 additions and 1019 deletions

View File

@@ -0,0 +1,167 @@
// See LICENSE for license details.

package rocket

import Chisel._
import cde.{Parameters, Field}
import diplomacy._
import uncore.tilelink2._
import uncore.agents._
import uncore.constants._
import uncore.tilelink.{TLKey, TLId}
import util.ParameterizedBundle

case class DCacheConfig(
  nMSHRs: Int = 1,
  nSDQ: Int = 17,
  nRPQ: Int = 16,
  nMMIOs: Int = 1)

case object DCacheKey extends Field[DCacheConfig]
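These knobs are looked up through the `DCacheKey` field. A minimal sketch of overriding them via cde's `alterPartial`; the call site and the value chosen are assumptions for illustration, not part of this commit:

// Hypothetical override, illustration only: a Parameters view with four MSHRs.
val q = p.alterPartial({ case DCacheKey => DCacheConfig(nMSHRs = 4) })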
trait HasL1HellaCacheParameters extends HasCacheParameters with HasCoreParameters {
  val outerDataBeats = p(TLKey(p(TLId))).dataBeats
  val outerDataBits = p(TLKey(p(TLId))).dataBitsPerBeat
  val refillCyclesPerBeat = outerDataBits/rowBits
  require(refillCyclesPerBeat == 1)
  val refillCycles = refillCyclesPerBeat*outerDataBeats

  val cacheBlockBytes = p(CacheBlockBytes)
  val lgCacheBlockBytes = log2Up(cacheBlockBytes)

  val wordBits = xLen // really, xLen max
  val wordBytes = wordBits/8
  val wordOffBits = log2Up(wordBytes)
  val beatBytes = cacheBlockBytes / outerDataBeats
  val beatWords = beatBytes / wordBytes
  val beatOffBits = log2Up(beatBytes)
  val idxMSB = untagBits-1
  val idxLSB = blockOffBits
  val offsetmsb = idxLSB-1
  val offsetlsb = wordOffBits
  val rowWords = rowBits/wordBits
  val doNarrowRead = coreDataBits * nWays % rowBits == 0
  val encDataBits = code.width(coreDataBits)
  val encRowBits = encDataBits*rowWords
  val nIOMSHRs = 1
  val lrscCycles = 32 // ISA requires 16-insn LRSC sequences to succeed

  require(isPow2(nSets))
  require(rowBits >= coreDataBits)
  require(rowBits <= outerDataBits)
  require(xLen <= outerDataBits) // would need offset addr for puts if data width < xlen
  require(!usingVM || untagBits <= pgIdxBits)
}
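To make the derived geometry concrete, a worked sketch with assumed numbers (64-byte blocks, four 128-bit outer beats, 128-bit rows, 64-bit xLen); none of these values come from the diff itself:

// Illustrative arithmetic only; every value here is an assumption.
val cacheBlockBytes = 64
val outerDataBeats  = 4
val outerDataBits   = 128
val rowBits         = 128
val xLen            = 64
val refillCyclesPerBeat = outerDataBits / rowBits              // 128/128 = 1, as required
val refillCycles        = refillCyclesPerBeat * outerDataBeats // 4 cycles per refill
val wordBytes = xLen / 8                                       // 8
val beatBytes = cacheBlockBytes / outerDataBeats               // 16
val beatWords = beatBytes / wordBytes                          // 2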
abstract class L1HellaCacheModule(implicit val p: Parameters) extends Module
  with HasL1HellaCacheParameters

abstract class L1HellaCacheBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
  with HasL1HellaCacheParameters

class L1Metadata(implicit p: Parameters) extends Metadata()(p) with HasL1HellaCacheParameters {
  val coh = new ClientMetadata
}

object L1Metadata {
  def apply(tag: Bits, coh: ClientMetadata)(implicit p: Parameters) = {
    val meta = Wire(new L1Metadata)
    meta.tag := tag
    meta.coh := coh
    meta
  }
}

class L1MetaReadReq(implicit p: Parameters) extends MetaReadReq {
  val tag = Bits(width = tagBits)
  override def cloneType = new L1MetaReadReq()(p).asInstanceOf[this.type] //TODO remove
}

class L1MetaWriteReq(implicit p: Parameters) extends
  MetaWriteReq[L1Metadata](new L1Metadata)

trait HasCoreMemOp extends HasCoreParameters {
  val addr = UInt(width = coreMaxAddrBits)
  val tag = Bits(width = dcacheReqTagBits)
  val cmd = Bits(width = M_SZ)
  val typ = Bits(width = MT_SZ)
}

trait HasCoreData extends HasCoreParameters {
  val data = Bits(width = coreDataBits)
}

class HellaCacheReq(implicit p: Parameters) extends HellaCacheReqInternal()(p) with HasCoreData

class HellaCacheResp(implicit p: Parameters) extends CoreBundle()(p)
    with HasCoreMemOp
    with HasCoreData {
  val replay = Bool()
  val has_data = Bool()
  val data_word_bypass = Bits(width = coreDataBits)
  val store_data = Bits(width = coreDataBits)
}

class AlignmentExceptions extends Bundle {
  val ld = Bool()
  val st = Bool()
}

class HellaCacheExceptions extends Bundle {
  val ma = new AlignmentExceptions
  val pf = new AlignmentExceptions
}

// interface between D$ and processor/DTLB
class HellaCacheIO(implicit p: Parameters) extends CoreBundle()(p) {
  val req = Decoupled(new HellaCacheReq)
  val s1_kill = Bool(OUTPUT) // kill previous cycle's req
  val s1_data = Bits(OUTPUT, coreDataBits) // data for previous cycle's req
  val s2_nack = Bool(INPUT) // req from two cycles ago is rejected
  val resp = Valid(new HellaCacheResp).flip
  val replay_next = Bool(INPUT)
  val xcpt = (new HellaCacheExceptions).asInput
  val invalidate_lr = Bool(OUTPUT)
  val ordered = Bool(INPUT)
}
abstract class HellaCache(val cfg: DCacheConfig)(implicit val p: Parameters) extends LazyModule {
  val node = TLClientNode(TLClientParameters(
    sourceId = IdRange(0, cfg.nMSHRs + cfg.nMMIOs),
    supportsProbe = TransferSizes(p(CacheBlockBytes))))
  val module: HellaCacheModule
}

class HellaCacheBundle(outer: HellaCache)(implicit p: Parameters) extends Bundle {
  val cpu = (new HellaCacheIO).flip
  val ptw = new TLBPTWIO()
  val mem = outer.node.bundleOut
}

class HellaCacheModule(outer: HellaCache)(implicit val p: Parameters) extends LazyModuleImp(outer)
    with HasL1HellaCacheParameters {
  implicit val cfg = outer.cfg
  val io = new HellaCacheBundle(outer)
  val edge = outer.node.edgesOut(0)
  val tl_out = io.mem(0)

  /* TODO
  edge.manager.managers.foreach { m =>
    if (m.supportsGet) {
      require (m.supportsGet.contains(TransferSizes(1, tlDataBytes)))
      ....etc
    }
  }
  */
}

object HellaCache {
  def apply(cfg: DCacheConfig)(implicit p: Parameters) = {
    if (cfg.nMSHRs == 0) LazyModule(new DCache(cfg))
    else LazyModule(new NonBlockingDCache(cfg))
  }
}
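A sketch of using the factory from a tile; the `DCacheKey` lookup matches the field defined above, while the node hookup is an assumption about the surrounding diplomacy graph:

// Hypothetical instantiation, illustration only.
val dcache = HellaCache(p(DCacheKey))   // DCache if nMSHRs == 0, else NonBlockingDCache
// the cache's TL2 client node would then be attached to the outer interconnect, e.g.:
// xbar.node := dcache.node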

View File

@@ -0,0 +1,107 @@
// See LICENSE for license details.

package rocket

import Chisel._
import Chisel.ImplicitConversions._
import cde.Parameters
import junctions._
import diplomacy._
import uncore.constants._
import uncore.tilelink2._
import uncore.util._

class ScratchpadSlavePort(implicit val p: Parameters) extends LazyModule with HasCoreParameters {
  val node = TLManagerNode(TLManagerPortParameters(
    Seq(TLManagerParameters(
      address = List(AddressSet(0x80000000L, BigInt(p(DataScratchpadSize)-1))),
      regionType = RegionType.UNCACHED,
      executable = true,
      supportsArithmetic = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none,
      supportsLogical = if (p(UseAtomics)) TransferSizes(1, coreDataBytes) else TransferSizes.none,
      supportsPutPartial = TransferSizes(1, coreDataBytes),
      supportsPutFull = TransferSizes(1, coreDataBytes),
      supportsGet = TransferSizes(1, coreDataBytes),
      fifoId = Some(0))), // requests handled in FIFO order
    beatBytes = coreDataBytes,
    minLatency = 1))

  // Make sure this ends up with the same name as before
  override def name = "dmem0"

  lazy val module = new LazyModuleImp(this) {
    val io = new Bundle {
      val tl_in = node.bundleIn
      val dmem = new HellaCacheIO
    }
    val tl_in = io.tl_in(0)
    val edge = node.edgesIn(0)

    require(usingDataScratchpad)

    val s_ready :: s_wait :: s_replay :: s_grant :: Nil = Enum(UInt(), 4)
    val state = Reg(init = s_ready)
    when (io.dmem.resp.valid) { state := s_grant }
    when (tl_in.d.fire()) { state := s_ready }
    when (io.dmem.s2_nack) { state := s_replay }
    when (io.dmem.req.fire()) { state := s_wait }

    val acq = Reg(tl_in.a.bits)
    when (io.dmem.resp.valid) { acq.data := io.dmem.resp.bits.data }
    when (tl_in.a.fire()) { acq := tl_in.a.bits }

    val isWrite = acq.opcode === TLMessages.PutFullData || acq.opcode === TLMessages.PutPartialData
    val isRead = !edge.hasData(acq)

    def formCacheReq(acq: TLBundleA) = {
      val req = Wire(new HellaCacheReq)
      req.cmd := MuxLookup(acq.opcode, Wire(M_XRD), Array(
        TLMessages.PutFullData    -> M_XWR,
        TLMessages.PutPartialData -> M_XWR,
        TLMessages.ArithmeticData -> MuxLookup(acq.param, Wire(M_XRD), Array(
          TLAtomics.MIN  -> M_XA_MIN,
          TLAtomics.MAX  -> M_XA_MAX,
          TLAtomics.MINU -> M_XA_MINU,
          TLAtomics.MAXU -> M_XA_MAXU,
          TLAtomics.ADD  -> M_XA_ADD)),
        TLMessages.LogicalData -> MuxLookup(acq.param, Wire(M_XRD), Array(
          TLAtomics.XOR  -> M_XA_XOR,
          TLAtomics.OR   -> M_XA_OR,
          TLAtomics.AND  -> M_XA_AND,
          TLAtomics.SWAP -> M_XA_SWAP)),
        TLMessages.Get -> M_XRD))
      // treat all loads as full words, so bytes appear in correct lane
      req.typ := Mux(isRead, log2Ceil(coreDataBytes), acq.size)
      req.addr := Mux(isRead, ~(~acq.address | (coreDataBytes-1)), acq.address)
      req.tag := UInt(0)
      req
    }

    val ready = state === s_ready || tl_in.d.fire()
    io.dmem.req.valid := (tl_in.a.valid && ready) || state === s_replay
    tl_in.a.ready := io.dmem.req.ready && ready
    io.dmem.req.bits := formCacheReq(Mux(state === s_replay, acq, tl_in.a.bits))
    // the TL data is already in the correct byte lane, but the D$
    // expects right-justified store data, so that it can steer the bytes.
    io.dmem.s1_data := new LoadGen(acq.size, Bool(false), acq.address(log2Ceil(coreDataBytes)-1, 0), acq.data, Bool(false), coreDataBytes).data
    io.dmem.s1_kill := false
    io.dmem.invalidate_lr := false

    // place AMO data in correct word lane
    val minAMOBytes = 4
    val grantData = Mux(io.dmem.resp.valid, io.dmem.resp.bits.data, acq.data)
    val alignedGrantData =
      Mux(acq.size <= log2Ceil(minAMOBytes),
        Fill(coreDataBytes/minAMOBytes, grantData(8*minAMOBytes-1, 0)),
        grantData)
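    // Worked example (numbers illustrative, not from this commit): with
    // coreDataBytes = 8, a one-byte Put to an address ending in 0x5 arrives
    // with its payload in byte lane 5 of acq.data; the LoadGen above shifts it
    // right by 5 bytes so s1_data is right-justified for the D$'s store path.
    // In the response direction, a 4-byte (or smaller) result is replicated
    // across both 32-bit halves by alignedGrantData, so the requester finds
    // valid data in whichever lane its address selects.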
    tl_in.d.valid := io.dmem.resp.valid || state === s_grant
    tl_in.d.bits := Mux(isWrite,
      edge.AccessAck(acq, UInt(0)),
      edge.AccessAck(acq, UInt(0), UInt(0)))
    tl_in.d.bits.data := alignedGrantData

    // Tie off unused channels
    tl_in.b.valid := Bool(false)
    tl_in.c.ready := Bool(true)
    tl_in.e.ready := Bool(true)
  }
}
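A sketch of how this port might be attached; `xbar` and `cacheIF` are hypothetical names, and the adapter step is an assumption that mirrors the SimpleHellaCacheIF shim elsewhere in this commit:

// Hypothetical wiring, illustration only.
val spad = LazyModule(new ScratchpadSlavePort)
spad.node := xbar.node                       // expose the D$ array as a TL2 slave region
// inside the module hierarchy, its HellaCacheIO is adapted onto the cache:
// cacheIF.io.requestor <> spad.module.io.dmem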

View File

@@ -0,0 +1,136 @@
// See LICENSE for license details.

package rocket

import Chisel._
import Chisel.ImplicitConversions._
import cde.Parameters
import util._

/**
 * This module buffers requests made by the SimpleHellaCacheIF in case they
 * are nacked. Nacked requests must be replayed in order, and no other
 * requests may proceed until the replayed requests complete successfully.
 */
class SimpleHellaCacheIFReplayQueue(depth: Int)
    (implicit val p: Parameters) extends Module
    with HasL1HellaCacheParameters {
  val io = new Bundle {
    val req = Decoupled(new HellaCacheReq).flip
    val nack = Valid(Bits(width = coreDCacheReqTagBits)).flip
    val resp = Valid(new HellaCacheResp).flip
    val replay = Decoupled(new HellaCacheReq)
  }

  // Registers to store in-flight requests.
  // When a request is first sent, it is stored in one of the reqs registers
  // and the corresponding inflight bit is set. The reqs entry is deallocated
  // once the request completes successfully.
  val inflight = Reg(init = UInt(0, depth))
  val reqs = Reg(Vec(depth, new HellaCacheReq))

  // The nack queue stores the index of nacked requests (in the reqs vector)
  // in the order that they were nacked. A request is enqueued onto nackq
  // when it is newly nacked (i.e. not a nack for a previous replay).
  // The head of the nack queue will be replayed until it is
  // successfully completed, at which time the request is dequeued.
  // No new requests will be made or other replays attempted until the head
  // of the nackq is successfully completed.
  val nackq = Module(new Queue(UInt(width = log2Up(depth)), depth))
  val replaying = Reg(init = Bool(false))

  val next_inflight_onehot = PriorityEncoderOH(~inflight)
  val next_inflight = OHToUInt(next_inflight_onehot)
  val next_replay = nackq.io.deq.bits
  val next_replay_onehot = UIntToOH(next_replay)
  val next_replay_req = reqs(next_replay)

  // Keep sending the head of the nack queue until it succeeds
  io.replay.valid := nackq.io.deq.valid && !replaying
  io.replay.bits := next_replay_req

  // Don't allow new requests if there are replays waiting
  // or something being nacked.
  io.req.ready := !inflight.andR && !nackq.io.deq.valid && !io.nack.valid

  // Match on the tags to determine the index of nacks or responses
  val nack_onehot = Cat(reqs.map(_.tag === io.nack.bits).reverse) & inflight
  val resp_onehot = Cat(reqs.map(_.tag === io.resp.bits.tag).reverse) & inflight

  val replay_complete = io.resp.valid && replaying && io.resp.bits.tag === next_replay_req.tag
  val nack_head = io.nack.valid && nackq.io.deq.valid && io.nack.bits === next_replay_req.tag

  // Enqueue to the nack queue if there is a nack that is not in response to
  // the previous replay
  nackq.io.enq.valid := io.nack.valid && !nack_head
  nackq.io.enq.bits := OHToUInt(nack_onehot)
  assert(!nackq.io.enq.valid || nackq.io.enq.ready,
    "SimpleHellaCacheIF: ReplayQueue nack queue overflow")

  // Dequeue from the nack queue if the last replay was successfully completed
  nackq.io.deq.ready := replay_complete
  assert(!nackq.io.deq.ready || nackq.io.deq.valid,
    "SimpleHellaCacheIF: ReplayQueue nack queue underflow")

  // Set the inflight bit when a request is made;
  // clear it when the request completes successfully.
  inflight := (inflight | Mux(io.req.fire(), next_inflight_onehot, UInt(0))) &
              ~Mux(io.resp.valid, resp_onehot, UInt(0))

  when (io.req.fire()) {
    reqs(next_inflight) := io.req.bits
  }

  // Only one replay outstanding at a time
  when (io.replay.fire()) { replaying := Bool(true) }
  when (nack_head || replay_complete) { replaying := Bool(false) }
}
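A hypothetical timeline (tags A and B, cycle numbers illustrative) makes the replay ordering concrete:

// cycle 0: req A fires            -> reqs(0) := A, inflight = 01
// cycle 1: req B fires            -> reqs(1) := B, inflight = 11
// cycle 3: A is nacked            -> nackq enqueues index 0; io.req.ready drops
// cycle 4: A replays from reqs(0) -> replaying := true
// later:   resp with A's tag      -> nackq dequeues, replaying and inflight(0) clear
// only now does io.req.ready allow fresh requests again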
// exposes a sane decoupled request interface
class SimpleHellaCacheIF(implicit p: Parameters) extends Module {
  val io = new Bundle {
    val requestor = new HellaCacheIO().flip
    val cache = new HellaCacheIO
  }

  val replayq = Module(new SimpleHellaCacheIFReplayQueue(2))
  val req_arb = Module(new Arbiter(new HellaCacheReq, 2))

  val req_helper = DecoupledHelper(
    req_arb.io.in(1).ready,
    replayq.io.req.ready,
    io.requestor.req.valid)

  req_arb.io.in(0) <> replayq.io.replay
  req_arb.io.in(1).valid := req_helper.fire(req_arb.io.in(1).ready)
  req_arb.io.in(1).bits := io.requestor.req.bits
  io.requestor.req.ready := req_helper.fire(io.requestor.req.valid)
  replayq.io.req.valid := req_helper.fire(replayq.io.req.ready)
  replayq.io.req.bits := io.requestor.req.bits

  val s0_req_fire = io.cache.req.fire()
  val s1_req_fire = Reg(next = s0_req_fire)
  val s2_req_fire = Reg(next = s1_req_fire)
  val s1_req_tag = Reg(next = io.cache.req.bits.tag)
  val s2_req_tag = Reg(next = s1_req_tag)
  val s2_kill = Reg(next = io.cache.s1_kill)

  io.cache.invalidate_lr := io.requestor.invalidate_lr
  io.cache.req <> req_arb.io.out
  io.cache.s1_kill := io.cache.s2_nack
  io.cache.s1_data := RegEnable(req_arb.io.out.bits.data, s0_req_fire)

  replayq.io.nack.valid := (io.cache.s2_nack || s2_kill) && s2_req_fire
  replayq.io.nack.bits := s2_req_tag
  replayq.io.resp := io.cache.resp
  io.requestor.resp := io.cache.resp

  assert(!Reg(next = io.cache.req.fire()) ||
    !(io.cache.xcpt.ma.ld || io.cache.xcpt.ma.st ||
      io.cache.xcpt.pf.ld || io.cache.xcpt.pf.st),
    "SimpleHellaCacheIF exception")
}
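A sketch of where the shim sits; `ptw` stands in for any client that expects simple decoupled semantics and is an assumption, not part of this commit:

// Hypothetical wiring, illustration only.
val ifc = Module(new SimpleHellaCacheIF)
ifc.io.requestor <> ptw.io.mem   // the client never sees nacks or replay timing
io.dmem <> ifc.io.cache          // raw HellaCacheIO with s1/s2 staging underneath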

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -13,6 +13,14 @@ object TLArbiter
   val lowestIndexFirst: Policy = (valids, granted) =>
     valids.scanLeft(Bool(true))(_ && !_).init

+  def lowestFromSeq[T <: TLChannel](edge: TLEdge, sink: DecoupledIO[T], sources: Seq[DecoupledIO[T]]) {
+    apply(lowestIndexFirst)(sink, sources.map(s => (edge.numBeats1(s.bits), s)):_*)
+  }
+
+  def lowest[T <: TLChannel](edge: TLEdge, sink: DecoupledIO[T], sources: DecoupledIO[T]*) {
+    apply(lowestIndexFirst)(sink, sources.toList.map(s => (edge.numBeats1(s.bits), s)):_*)
+  }
+
   def apply[T <: Data](policy: Policy)(sink: DecoupledIO[T], sources: (UInt, DecoupledIO[T])*) {
     if (sources.isEmpty) {
       sink.valid := Bool(false)
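The two new helpers are thin wrappers over `apply` with the lowest-index policy, computing each source's beat count from the edge; the TLBroadcast hunk below shows the real call sites. A hypothetical use (`edge`, `out`, `a0`, `a1` assumed):

// Hypothetical usage, illustration only.
TLArbiter.lowest(edge, out.a, a0, a1)
// shorthand for:
// TLArbiter(TLArbiter.lowestIndexFirst)(out.a,
//   (edge.numBeats1(a0.bits), a0), (edge.numBeats1(a1.bits), a1))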

View File

@@ -136,11 +136,9 @@ class TLBroadcast(lineBytes: Int, numTrackers: Int = 4, bufferless: Boolean = fa
   putfull.bits := edgeOut.Put(Cat(put_what, in.c.bits.source), in.c.bits.address, in.c.bits.size, in.c.bits.data)._2

   // Combine ReleaseAck or the modified D
-  TLArbiter(TLArbiter.lowestIndexFirst)(in.d, (UInt(0), releaseack), (edgeOut.numBeats1(d_normal.bits), d_normal))
+  TLArbiter.lowest(edgeOut, in.d, releaseack, d_normal)
   // Combine the PutFull with the trackers
-  TLArbiter(TLArbiter.lowestIndexFirst)(out.a,
-    ((edgeOut.numBeats1(putfull.bits), putfull) +:
-     trackers.map { t => (edgeOut.numBeats1(t.out_a.bits), t.out_a) }):_*)
+  TLArbiter.lowestFromSeq(edgeOut, out.a, putfull +: trackers.map(_.out_a))

   // The Probe FSM walks all caches and probes them
   val probe_todo = RegInit(UInt(0, width = max(1, caches.size)))

View File

@@ -191,27 +191,32 @@ class TLEdge(
   def first(bits: TLChannel, fire: Bool): Bool = firstlastHelper(bits, fire)._1
   def first(x: DecoupledIO[TLChannel]): Bool = first(x.bits, x.fire())
+  def first(x: ValidIO[TLChannel]): Bool = first(x.bits, x.valid)

   def last(bits: TLChannel, fire: Bool): Bool = firstlastHelper(bits, fire)._2
   def last(x: DecoupledIO[TLChannel]): Bool = last(x.bits, x.fire())
+  def last(x: ValidIO[TLChannel]): Bool = last(x.bits, x.valid)

   def firstlast(bits: TLChannel, fire: Bool): (Bool, Bool, Bool) = {
     val r = firstlastHelper(bits, fire)
     (r._1, r._2, r._3)
   }
   def firstlast(x: DecoupledIO[TLChannel]): (Bool, Bool, Bool) = firstlast(x.bits, x.fire())
+  def firstlast(x: ValidIO[TLChannel]): (Bool, Bool, Bool) = firstlast(x.bits, x.valid)

   def count(bits: TLChannel, fire: Bool): (Bool, Bool, Bool, UInt) = {
     val r = firstlastHelper(bits, fire)
     (r._1, r._2, r._3, r._4)
   }
   def count(x: DecoupledIO[TLChannel]): (Bool, Bool, Bool, UInt) = count(x.bits, x.fire())
+  def count(x: ValidIO[TLChannel]): (Bool, Bool, Bool, UInt) = count(x.bits, x.valid)

   def addr_inc(bits: TLChannel, fire: Bool): (Bool, Bool, Bool, UInt) = {
     val r = firstlastHelper(bits, fire)
     (r._1, r._2, r._3, r._4 << log2Ceil(manager.beatBytes))
   }
   def addr_inc(x: DecoupledIO[TLChannel]): (Bool, Bool, Bool, UInt) = addr_inc(x.bits, x.fire())
+  def addr_inc(x: ValidIO[TLChannel]): (Bool, Bool, Bool, UInt) = addr_inc(x.bits, x.valid)
 }

 class TLEdgeOut(
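The new `ValidIO` overloads mirror the `DecoupledIO` ones but qualify beats with `valid` alone rather than `fire()` (valid && ready); a minimal sketch, where `d_valid` is an assumed `ValidIO[TLBundleD]`:

// Hypothetical usage, illustration only.
val refill_done = edge.last(d_valid)   // true on the final beat of a burst
val (d_first, d_last, d_done, d_count) = edge.count(d_valid)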