From 972953868cd100d22f8e2513d72ad562bb496af7 Mon Sep 17 00:00:00 2001 From: "Wesley W. Terpstra" Date: Sun, 29 Jan 2017 15:17:52 -0800 Subject: [PATCH] uncore: switch to new diplomacy Node API Most adapters should work on multiple ports. This patch changes them all. --- .../scala/rocket/ScratchpadSlavePort.scala | 4 +- src/main/scala/uncore/ahb/Nodes.scala | 18 +- .../scala/uncore/ahb/RegisterRouter.scala | 4 +- src/main/scala/uncore/ahb/SRAM.scala | 4 +- src/main/scala/uncore/ahb/Xbar.scala | 2 +- src/main/scala/uncore/apb/Nodes.scala | 14 +- .../scala/uncore/apb/RegisterRouter.scala | 4 +- src/main/scala/uncore/apb/SRAM.scala | 4 +- src/main/scala/uncore/apb/Xbar.scala | 2 +- src/main/scala/uncore/axi4/Buffer.scala | 4 +- src/main/scala/uncore/axi4/Fragmenter.scala | 453 +++++++++-------- src/main/scala/uncore/axi4/Nodes.scala | 15 +- .../scala/uncore/axi4/RegisterRouter.scala | 4 +- src/main/scala/uncore/axi4/SRAM.scala | 4 +- src/main/scala/uncore/axi4/ToTL.scala | 222 ++++----- src/main/scala/uncore/devices/Plic.scala | 2 +- .../uncore/tilelink2/AtomicAutomata.scala | 424 ++++++++-------- .../scala/uncore/tilelink2/Broadcast.scala | 290 ++++++----- src/main/scala/uncore/tilelink2/Buffer.scala | 4 +- .../scala/uncore/tilelink2/CacheCork.scala | 170 ++++--- src/main/scala/uncore/tilelink2/Filter.scala | 4 +- .../scala/uncore/tilelink2/Fragmenter.scala | 381 +++++++------- .../scala/uncore/tilelink2/HintHandler.scala | 123 +++-- .../scala/uncore/tilelink2/IntNodes.scala | 16 +- src/main/scala/uncore/tilelink2/Nodes.scala | 58 +-- .../scala/uncore/tilelink2/RAMModel.scala | 464 +++++++++--------- .../uncore/tilelink2/RegisterRouter.scala | 4 +- src/main/scala/uncore/tilelink2/SRAM.scala | 4 +- .../uncore/tilelink2/SourceShrinker.scala | 81 ++- src/main/scala/uncore/tilelink2/ToAHB.scala | 162 +++--- src/main/scala/uncore/tilelink2/ToAPB.scala | 86 ++-- src/main/scala/uncore/tilelink2/ToAXI4.scala | 329 ++++++------- .../scala/uncore/tilelink2/WidthWidget.scala | 
41 +- src/main/scala/uncore/tilelink2/Xbar.scala | 2 +- 34 files changed, 1681 insertions(+), 1722 deletions(-) diff --git a/src/main/scala/rocket/ScratchpadSlavePort.scala b/src/main/scala/rocket/ScratchpadSlavePort.scala index ccec47fe..7acf9e16 100644 --- a/src/main/scala/rocket/ScratchpadSlavePort.scala +++ b/src/main/scala/rocket/ScratchpadSlavePort.scala @@ -13,7 +13,7 @@ import uncore.util._ class ScratchpadSlavePort(implicit p: Parameters) extends LazyModule { val coreDataBytes = p(XLen)/8 - val node = TLManagerNode(TLManagerPortParameters( + val node = TLManagerNode(Seq(TLManagerPortParameters( Seq(TLManagerParameters( address = List(AddressSet(0x80000000L, BigInt(p(DataScratchpadSize)-1))), regionType = RegionType.UNCACHED, @@ -25,7 +25,7 @@ class ScratchpadSlavePort(implicit p: Parameters) extends LazyModule { supportsGet = TransferSizes(1, coreDataBytes), fifoId = Some(0))), // requests handled in FIFO order beatBytes = coreDataBytes, - minLatency = 1)) + minLatency = 1))) lazy val module = new LazyModuleImp(this) { val io = new Bundle { diff --git a/src/main/scala/uncore/ahb/Nodes.scala b/src/main/scala/uncore/ahb/Nodes.scala index 7ed88dd8..44f4b669 100644 --- a/src/main/scala/uncore/ahb/Nodes.scala +++ b/src/main/scala/uncore/ahb/Nodes.scala @@ -31,16 +31,14 @@ object AHBImp extends NodeImp[AHBMasterPortParameters, AHBSlavePortParameters, A // Nodes implemented inside modules case class AHBIdentityNode() extends IdentityNode(AHBImp) -case class AHBMasterNode(portParams: AHBMasterPortParameters, numPorts: Range.Inclusive = 1 to 1) - extends SourceNode(AHBImp)(portParams, numPorts) -case class AHBSlaveNode(portParams: AHBSlavePortParameters, numPorts: Range.Inclusive = 1 to 1) - extends SinkNode(AHBImp)(portParams, numPorts) -case class AHBAdapterNode( - masterFn: Seq[AHBMasterPortParameters] => AHBMasterPortParameters, - slaveFn: Seq[AHBSlavePortParameters] => AHBSlavePortParameters, - numMasterPorts: Range.Inclusive = 1 to 1, - numSlavePorts: 
Range.Inclusive = 1 to 1) - extends InteriorNode(AHBImp)(masterFn, slaveFn, numMasterPorts, numSlavePorts) +case class AHBMasterNode(portParams: Seq[AHBMasterPortParameters]) extends SourceNode(AHBImp)(portParams) +case class AHBSlaveNode(portParams: Seq[AHBSlavePortParameters]) extends SinkNode(AHBImp)(portParams) +case class AHBNexusNode( + masterFn: Seq[AHBMasterPortParameters] => AHBMasterPortParameters, + slaveFn: Seq[AHBSlavePortParameters] => AHBSlavePortParameters, + numMasterPorts: Range.Inclusive = 1 to 999, + numSlavePorts: Range.Inclusive = 1 to 999) + extends NexusNode(AHBImp)(masterFn, slaveFn, numMasterPorts, numSlavePorts) // Nodes passed from an inner module case class AHBOutputNode() extends OutputNode(AHBImp) diff --git a/src/main/scala/uncore/ahb/RegisterRouter.scala b/src/main/scala/uncore/ahb/RegisterRouter.scala index 88528bf4..034df6c8 100644 --- a/src/main/scala/uncore/ahb/RegisterRouter.scala +++ b/src/main/scala/uncore/ahb/RegisterRouter.scala @@ -9,13 +9,13 @@ import regmapper._ import scala.math.{min,max} class AHBRegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int = 4, undefZero: Boolean = true, executable: Boolean = false) - extends AHBSlaveNode(AHBSlavePortParameters( + extends AHBSlaveNode(Seq(AHBSlavePortParameters( Seq(AHBSlaveParameters( address = Seq(address), executable = executable, supportsWrite = TransferSizes(1, min(address.alignment.toInt, beatBytes * AHBParameters.maxTransfer)), supportsRead = TransferSizes(1, min(address.alignment.toInt, beatBytes * AHBParameters.maxTransfer)))), - beatBytes = beatBytes)) + beatBytes = beatBytes))) { require (address.contiguous) diff --git a/src/main/scala/uncore/ahb/SRAM.scala b/src/main/scala/uncore/ahb/SRAM.scala index 6c60c52b..99fe4597 100644 --- a/src/main/scala/uncore/ahb/SRAM.scala +++ b/src/main/scala/uncore/ahb/SRAM.scala @@ -8,14 +8,14 @@ import diplomacy._ class AHBRAM(address: AddressSet, executable: Boolean = true, beatBytes: Int = 4)(implicit p: 
Parameters) extends LazyModule { - val node = AHBSlaveNode(AHBSlavePortParameters( + val node = AHBSlaveNode(Seq(AHBSlavePortParameters( Seq(AHBSlaveParameters( address = List(address), regionType = RegionType.UNCACHED, executable = executable, supportsRead = TransferSizes(1, beatBytes * AHBParameters.maxTransfer), supportsWrite = TransferSizes(1, beatBytes * AHBParameters.maxTransfer))), - beatBytes = beatBytes)) + beatBytes = beatBytes))) // We require the address range to include an entire beat (for the write mask) require ((address.mask & (beatBytes-1)) == beatBytes-1) diff --git a/src/main/scala/uncore/ahb/Xbar.scala b/src/main/scala/uncore/ahb/Xbar.scala index e3128a6d..8e1378eb 100644 --- a/src/main/scala/uncore/ahb/Xbar.scala +++ b/src/main/scala/uncore/ahb/Xbar.scala @@ -9,7 +9,7 @@ import regmapper._ import scala.math.{min,max} class AHBFanout()(implicit p: Parameters) extends LazyModule { - val node = AHBAdapterNode( + val node = AHBNexusNode( numSlavePorts = 1 to 1, numMasterPorts = 1 to 32, masterFn = { case Seq(m) => m }, diff --git a/src/main/scala/uncore/apb/Nodes.scala b/src/main/scala/uncore/apb/Nodes.scala index 7d22743b..98bb598a 100644 --- a/src/main/scala/uncore/apb/Nodes.scala +++ b/src/main/scala/uncore/apb/Nodes.scala @@ -31,16 +31,14 @@ object APBImp extends NodeImp[APBMasterPortParameters, APBSlavePortParameters, A // Nodes implemented inside modules case class APBIdentityNode() extends IdentityNode(APBImp) -case class APBMasterNode(portParams: APBMasterPortParameters, numPorts: Range.Inclusive = 1 to 1) - extends SourceNode(APBImp)(portParams, numPorts) -case class APBSlaveNode(portParams: APBSlavePortParameters, numPorts: Range.Inclusive = 1 to 1) - extends SinkNode(APBImp)(portParams, numPorts) -case class APBAdapterNode( - masterFn: Seq[APBMasterPortParameters] => APBMasterPortParameters, - slaveFn: Seq[APBSlavePortParameters] => APBSlavePortParameters, +case class APBMasterNode(portParams: Seq[APBMasterPortParameters]) extends 
SourceNode(APBImp)(portParams) +case class APBSlaveNode(portParams: Seq[APBSlavePortParameters]) extends SinkNode(APBImp)(portParams) +case class APBNexusNode( + masterFn: Seq[APBMasterPortParameters] => APBMasterPortParameters, + slaveFn: Seq[APBSlavePortParameters] => APBSlavePortParameters, numMasterPorts: Range.Inclusive = 1 to 1, numSlavePorts: Range.Inclusive = 1 to 1) - extends InteriorNode(APBImp)(masterFn, slaveFn, numMasterPorts, numSlavePorts) + extends NexusNode(APBImp)(masterFn, slaveFn, numMasterPorts, numSlavePorts) // Nodes passed from an inner module case class APBOutputNode() extends OutputNode(APBImp) diff --git a/src/main/scala/uncore/apb/RegisterRouter.scala b/src/main/scala/uncore/apb/RegisterRouter.scala index 20bf4928..2f8bfa86 100644 --- a/src/main/scala/uncore/apb/RegisterRouter.scala +++ b/src/main/scala/uncore/apb/RegisterRouter.scala @@ -9,13 +9,13 @@ import regmapper._ import scala.math.{min,max} class APBRegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int = 4, undefZero: Boolean = true, executable: Boolean = false) - extends APBSlaveNode(APBSlavePortParameters( + extends APBSlaveNode(Seq(APBSlavePortParameters( Seq(APBSlaveParameters( address = Seq(address), executable = executable, supportsWrite = true, supportsRead = true)), - beatBytes = beatBytes)) + beatBytes = beatBytes))) { require (address.contiguous) diff --git a/src/main/scala/uncore/apb/SRAM.scala b/src/main/scala/uncore/apb/SRAM.scala index c2bf6d1f..10cd7e60 100644 --- a/src/main/scala/uncore/apb/SRAM.scala +++ b/src/main/scala/uncore/apb/SRAM.scala @@ -8,14 +8,14 @@ import diplomacy._ class APBRAM(address: AddressSet, executable: Boolean = true, beatBytes: Int = 4)(implicit p: Parameters) extends LazyModule { - val node = APBSlaveNode(APBSlavePortParameters( + val node = APBSlaveNode(Seq(APBSlavePortParameters( Seq(APBSlaveParameters( address = List(address), regionType = RegionType.UNCACHED, executable = executable, supportsRead = true, supportsWrite 
= true)), - beatBytes = beatBytes)) + beatBytes = beatBytes))) // We require the address range to include an entire beat (for the write mask) require ((address.mask & (beatBytes-1)) == beatBytes-1) diff --git a/src/main/scala/uncore/apb/Xbar.scala b/src/main/scala/uncore/apb/Xbar.scala index 2809fb96..c5b78f48 100644 --- a/src/main/scala/uncore/apb/Xbar.scala +++ b/src/main/scala/uncore/apb/Xbar.scala @@ -9,7 +9,7 @@ import regmapper._ import scala.math.{min,max} class APBFanout()(implicit p: Parameters) extends LazyModule { - val node = APBAdapterNode( + val node = APBNexusNode( numSlavePorts = 1 to 1, numMasterPorts = 1 to 32, masterFn = { case Seq(m) => m }, diff --git a/src/main/scala/uncore/axi4/Buffer.scala b/src/main/scala/uncore/axi4/Buffer.scala index 322699c2..ee3304dd 100644 --- a/src/main/scala/uncore/axi4/Buffer.scala +++ b/src/main/scala/uncore/axi4/Buffer.scala @@ -18,8 +18,8 @@ class AXI4Buffer(aw: Int = 2, w: Int = 2, b: Int = 2, ar: Int = 2, r: Int = 2, p require (r >= 0) val node = AXI4AdapterNode( - masterFn = { case Seq(p) => p }, - slaveFn = { case Seq(p) => p.copy(minLatency = p.minLatency + min(1,min(aw,ar)) + min(1,min(r,b))) }) + masterFn = { p => p }, + slaveFn = { p => p.copy(minLatency = p.minLatency + min(1,min(aw,ar)) + min(1,min(r,b))) }) lazy val module = new LazyModuleImp(this) { val io = new Bundle { diff --git a/src/main/scala/uncore/axi4/Fragmenter.scala b/src/main/scala/uncore/axi4/Fragmenter.scala index f6cdf372..42cf8fd7 100644 --- a/src/main/scala/uncore/axi4/Fragmenter.scala +++ b/src/main/scala/uncore/axi4/Fragmenter.scala @@ -23,8 +23,8 @@ class AXI4Fragmenter(lite: Boolean = false, maxInFlight: => Int = 32, combinatio def mapMaster(m: AXI4MasterParameters) = m.copy(aligned = true) val node = AXI4AdapterNode( - masterFn = { case Seq(mp) => mp.copy(masters = mp.masters.map(m => mapMaster(m))) }, - slaveFn = { case Seq(sp) => sp.copy(slaves = sp.slaves .map(s => mapSlave(s, sp.beatBytes))) }) + masterFn = { mp => 
mp.copy(masters = mp.masters.map(m => mapMaster(m))) }, + slaveFn = { sp => sp.copy(slaves = sp.slaves .map(s => mapSlave(s, sp.beatBytes))) }) lazy val module = new LazyModuleImp(this) { val io = new Bundle { @@ -32,256 +32,253 @@ class AXI4Fragmenter(lite: Boolean = false, maxInFlight: => Int = 32, combinatio val out = node.bundleOut } - val edgeOut = node.edgesOut(0) - val edgeIn = node.edgesIn(0) - val slave = edgeOut.slave - val slaves = slave.slaves - val beatBytes = slave.beatBytes - val lgBytes = log2Ceil(beatBytes) - val master = edgeIn.master - val masters = master.masters + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + val slave = edgeOut.slave + val slaves = slave.slaves + val beatBytes = slave.beatBytes + val lgBytes = log2Ceil(beatBytes) + val master = edgeIn.master + val masters = master.masters - // If the user claimed this was a lite interface, then there must be only one Id - require (!lite || master.endId == 1) + // If the user claimed this was a lite interface, then there must be only one Id + require (!lite || master.endId == 1) - // We don't support fragmenting to sub-beat accesses - slaves.foreach { s => - require (!s.supportsRead || s.supportsRead.contains(beatBytes)) - require (!s.supportsWrite || s.supportsWrite.contains(beatBytes)) - } + // We don't support fragmenting to sub-beat accesses + slaves.foreach { s => + require (!s.supportsRead || s.supportsRead.contains(beatBytes)) + require (!s.supportsWrite || s.supportsWrite.contains(beatBytes)) + } - /* We need to decompose a request into - * FIXED => each beat is a new request - * WRAP/INCR => take xfr up to next power of two, capped by max size of target - * - * On AR and AW, we fragment one request into many - * On W we set 'last' on beats which are fragment boundaries - * On R we clear 'last' on the fragments being reassembled - * On B we clear 'valid' on the responses for the injected fragments - * - * AR=>R and AW+W=>B 
are completely independent state machines. - */ - - /* Returns the number of beats to execute and the new address */ - def fragment(a: IrrevocableIO[AXI4BundleA], supportedSizes1: Seq[Int]): (IrrevocableIO[AXI4BundleA], Bool, UInt) = { - val out = Wire(a) - - val busy = RegInit(Bool(false)) - val r_addr = Reg(UInt(width = a.bits.params.addrBits)) - val r_len = Reg(UInt(width = AXI4Parameters.lenBits)) - - val len = Mux(busy, r_len, a.bits.len) - val addr = Mux(busy, r_addr, a.bits.addr) - - val lo = if (lgBytes == 0) UInt(0) else addr(lgBytes-1, 0) - val hi = addr >> lgBytes - val alignment = hi(AXI4Parameters.lenBits-1,0) - - val allSame = supportedSizes1.filter(_ >= 0).distinct.size <= 1 - val dynamic1 = Mux1H(slave.findFast(addr), supportedSizes1.map(s => UInt(max(0, s)))) - val fixed1 = UInt(supportedSizes1.filter(_ >= 0).headOption.getOrElse(0)) - - /* We need to compute the largest transfer allowed by the AXI len. - * len+1 is the number of beats to execute. - * We want the MSB(len+1)-1; one less than the largest power of two we could execute. - * There are two cases; either len is 2^n-1 in which case we leave it unchanged, ELSE - * fill the bits from highest to lowest, and shift right by one bit. + /* We need to decompose a request into + * FIXED => each beat is a new request + * WRAP/INCR => take xfr up to next power of two, capped by max size of target + * + * On AR and AW, we fragment one request into many + * On W we set 'last' on beats which are fragment boundaries + * On R we clear 'last' on the fragments being reassembled + * On B we clear 'valid' on the responses for the injected fragments + * + * AR=>R and AW+W=>B are completely independent state machines. 
*/ - val fillLow = rightOR(len) >> 1 // set all bits in positions < a set bit - val wipeHigh = ~leftOR(~len) // clear all bits in position >= a cleared bit - val remain1 = fillLow | wipeHigh // MSB(a.len+1)-1 - val align1 = ~leftOR(alignment) // transfer size limited by address alignment - val support1 = if (allSame) fixed1 else dynamic1 // maximum supported size-1 based on target address - val maxSupported1 = remain1 & align1 & support1 // Take the minimum of all the limits - // Things that cause us to degenerate to a single beat - val fixed = a.bits.burst === AXI4Parameters.BURST_FIXED - val narrow = a.bits.size =/= UInt(lgBytes) - val bad = fixed || narrow + /* Returns the number of beats to execute and the new address */ + def fragment(a: IrrevocableIO[AXI4BundleA], supportedSizes1: Seq[Int]): (IrrevocableIO[AXI4BundleA], Bool, UInt) = { + val out = Wire(a) - // The number of beats-1 to execute - val beats1 = Mux(bad, UInt(0), maxSupported1) - val beats = OH1ToOH(beats1) // beats1 + 1 + val busy = RegInit(Bool(false)) + val r_addr = Reg(UInt(width = a.bits.params.addrBits)) + val r_len = Reg(UInt(width = AXI4Parameters.lenBits)) - val inc_addr = addr + (beats << a.bits.size) // address after adding transfer - val wrapMask = a.bits.bytes1() // only these bits may change, if wrapping - val mux_addr = Wire(init = inc_addr) - when (a.bits.burst === AXI4Parameters.BURST_WRAP) { - mux_addr := (inc_addr & wrapMask) | ~(~a.bits.addr | wrapMask) - } - when (a.bits.burst === AXI4Parameters.BURST_FIXED) { - mux_addr := a.bits.addr + val len = Mux(busy, r_len, a.bits.len) + val addr = Mux(busy, r_addr, a.bits.addr) + + val lo = if (lgBytes == 0) UInt(0) else addr(lgBytes-1, 0) + val hi = addr >> lgBytes + val alignment = hi(AXI4Parameters.lenBits-1,0) + + val allSame = supportedSizes1.filter(_ >= 0).distinct.size <= 1 + val dynamic1 = Mux1H(slave.findFast(addr), supportedSizes1.map(s => UInt(max(0, s)))) + val fixed1 = UInt(supportedSizes1.filter(_ >= 
0).headOption.getOrElse(0)) + + /* We need to compute the largest transfer allowed by the AXI len. + * len+1 is the number of beats to execute. + * We want the MSB(len+1)-1; one less than the largest power of two we could execute. + * There are two cases; either len is 2^n-1 in which case we leave it unchanged, ELSE + * fill the bits from highest to lowest, and shift right by one bit. + */ + val fillLow = rightOR(len) >> 1 // set all bits in positions < a set bit + val wipeHigh = ~leftOR(~len) // clear all bits in position >= a cleared bit + val remain1 = fillLow | wipeHigh // MSB(a.len+1)-1 + val align1 = ~leftOR(alignment) // transfer size limited by address alignment + val support1 = if (allSame) fixed1 else dynamic1 // maximum supported size-1 based on target address + val maxSupported1 = remain1 & align1 & support1 // Take the minimum of all the limits + + // Things that cause us to degenerate to a single beat + val fixed = a.bits.burst === AXI4Parameters.BURST_FIXED + val narrow = a.bits.size =/= UInt(lgBytes) + val bad = fixed || narrow + + // The number of beats-1 to execute + val beats1 = Mux(bad, UInt(0), maxSupported1) + val beats = OH1ToOH(beats1) // beats1 + 1 + + val inc_addr = addr + (beats << a.bits.size) // address after adding transfer + val wrapMask = a.bits.bytes1() // only these bits may change, if wrapping + val mux_addr = Wire(init = inc_addr) + when (a.bits.burst === AXI4Parameters.BURST_WRAP) { + mux_addr := (inc_addr & wrapMask) | ~(~a.bits.addr | wrapMask) + } + when (a.bits.burst === AXI4Parameters.BURST_FIXED) { + mux_addr := a.bits.addr + } + + val last = beats1 === len + a.ready := out.ready && last + out.valid := a.valid + + out.bits := a.bits + out.bits.len := beats1 + + // We forcibly align every access. If the first beat was misaligned, the strb bits + // for the lower addresses must not have been set. Therefore, rounding the address + // down is harmless. 
We can do this after the address update algorithm, because the + // incremented values will be rounded down the same way. Furthermore, a subword + // offset cannot cause a premature wrap-around. + out.bits.addr := ~(~addr | UIntToOH1(a.bits.size, lgBytes)) + + when (out.fire()) { + busy := !last + r_addr := mux_addr + r_len := len - beats + } + + (out, last, beats) } - val last = beats1 === len - a.ready := out.ready && last - out.valid := a.valid + // The size to which we will fragment the access + val readSizes1 = slaves.map(s => s.supportsRead .max/beatBytes-1) + val writeSizes1 = slaves.map(s => s.supportsWrite.max/beatBytes-1) - out.bits := a.bits - out.bits.len := beats1 + // Indirection variables for inputs and outputs; makes transformation application easier + val (in_ar, ar_last, _) = fragment(Queue.irrevocable(in.ar, 1, flow=true), readSizes1) + val (in_aw, aw_last, w_beats) = fragment(Queue.irrevocable(in.aw, 1, flow=true), writeSizes1) + val in_w = in.w + val in_r = in.r + val in_b = in.b + val out_ar = Wire(out.ar) + val out_aw = out.aw + val out_w = out.w + val out_r = Wire(out.r) + val out_b = Wire(out.b) - // We forcibly align every access. If the first beat was misaligned, the strb bits - // for the lower addresses must not have been set. Therefore, rounding the address - // down is harmless. We can do this after the address update algorithm, because the - // incremented values will be rounded down the same way. Furthermore, a subword - // offset cannot cause a premature wrap-around. - out.bits.addr := ~(~addr | UIntToOH1(a.bits.size, lgBytes)) - - when (out.fire()) { - busy := !last - r_addr := mux_addr - r_len := len - beats + val depth = if (combinational) 1 else 2 + // In case a slave ties arready := rready, we need a queue to break the combinational loop + // between the two branches (in_ar => {out_ar => out_r, sideband} => in_r). 
+ if (in.ar.bits.getWidth < in.r.bits.getWidth) { + out.ar <> Queue(out_ar, depth, flow=combinational) + out_r <> out.r + } else { + out.ar <> out_ar + out_r <> Queue(out.r, depth, flow=combinational) } + // In case a slave ties awready := bready or wready := bready, we need this queue + out_b <> Queue(out.b, depth, flow=combinational) - (out, last, beats) - } + // Sideband to track which transfers were the last fragment + def sideband() = if (lite) { + Module(new Queue(Bool(), maxInFlight, flow=combinational)).io + } else { + Module(new AXI4FragmenterSideband(maxInFlight, flow=combinational)).io + } + val sideband_ar_r = sideband() + val sideband_aw_b = sideband() - val in = io.in(0) - val out = io.out(0) + // AR flow control + out_ar.valid := in_ar.valid && sideband_ar_r.enq.ready + in_ar.ready := sideband_ar_r.enq.ready && out_ar.ready + sideband_ar_r.enq.valid := in_ar.valid && out_ar.ready + out_ar.bits := in_ar.bits + sideband_ar_r.enq.bits := ar_last - // The size to which we will fragment the access - val readSizes1 = slaves.map(s => s.supportsRead .max/beatBytes-1) - val writeSizes1 = slaves.map(s => s.supportsWrite.max/beatBytes-1) + // When does W channel start counting a new transfer + val wbeats_latched = RegInit(Bool(false)) + val wbeats_ready = Wire(Bool()) + val wbeats_valid = Wire(Bool()) + when (wbeats_valid && wbeats_ready) { wbeats_latched := Bool(true) } + when (out_aw.fire()) { wbeats_latched := Bool(false) } - // Indirection variables for inputs and outputs; makes transformation application easier - val (in_ar, ar_last, _) = fragment(Queue.irrevocable(in.ar, 1, flow=true), readSizes1) - val (in_aw, aw_last, w_beats) = fragment(Queue.irrevocable(in.aw, 1, flow=true), writeSizes1) - val in_w = in.w - val in_r = in.r - val in_b = in.b - val out_ar = Wire(out.ar) - val out_aw = out.aw - val out_w = out.w - val out_r = Wire(out.r) - val out_b = Wire(out.b) + // AW flow control + out_aw.valid := in_aw.valid && sideband_aw_b.enq.ready && 
(wbeats_ready || wbeats_latched) + in_aw.ready := sideband_aw_b.enq.ready && out_aw.ready && (wbeats_ready || wbeats_latched) + sideband_aw_b.enq.valid := in_aw.valid && out_aw.ready && (wbeats_ready || wbeats_latched) + wbeats_valid := in_aw.valid && !wbeats_latched + out_aw.bits := in_aw.bits + sideband_aw_b.enq.bits := aw_last - val depth = if (combinational) 1 else 2 - // In case a slave ties arready := rready, we need a queue to break the combinational loop - // between the two branches (in_ar => {out_ar => out_r, sideband} => in_r). - if (in.ar.bits.getWidth < in.r.bits.getWidth) { - out.ar <> Queue(out_ar, depth, flow=combinational) - out_r <> out.r - } else { - out.ar <> out_ar - out_r <> Queue(out.r, depth, flow=combinational) - } - // In case a slave ties awready := bready or wready := bready, we need this queue - out_b <> Queue(out.b, depth, flow=combinational) + // We need to inject 'last' into the W channel fragments, count! + val w_counter = RegInit(UInt(0, width = AXI4Parameters.lenBits+1)) + val w_idle = w_counter === UInt(0) + val w_todo = Mux(w_idle, Mux(wbeats_valid, w_beats, UInt(0)), w_counter) + val w_last = w_todo === UInt(1) + w_counter := w_todo - out_w.fire() + assert (!out_w.fire() || w_todo =/= UInt(0)) // underflow impossible - // Sideband to track which transfers were the last fragment - def sideband() = if (lite) { - Module(new Queue(Bool(), maxInFlight, flow=combinational)).io - } else { - Module(new AXI4FragmenterSideband(maxInFlight, flow=combinational)).io - } - val sideband_ar_r = sideband() - val sideband_aw_b = sideband() + // W flow control + wbeats_ready := w_idle + out_w.valid := in_w.valid && (!wbeats_ready || wbeats_valid) + in_w.ready := out_w.ready && (!wbeats_ready || wbeats_valid) + out_w.bits := in_w.bits + out_w.bits.last := w_last + // We should also recreate the last last + assert (!out_w.valid || !in_w.bits.last || w_last) - // AR flow control - out_ar.valid := in_ar.valid && sideband_ar_r.enq.ready - in_ar.ready 
:= sideband_ar_r.enq.ready && out_ar.ready - sideband_ar_r.enq.valid := in_ar.valid && out_ar.ready - out_ar.bits := in_ar.bits - sideband_ar_r.enq.bits := ar_last + // R flow control + val r_last = out_r.bits.last + in_r.valid := out_r.valid && (!r_last || sideband_ar_r.deq.valid) + out_r.ready := in_r.ready && (!r_last || sideband_ar_r.deq.valid) + sideband_ar_r.deq.ready := r_last && out_r.valid && in_r.ready + in_r.bits := out_r.bits + in_r.bits.last := r_last && sideband_ar_r.deq.bits - // When does W channel start counting a new transfer - val wbeats_latched = RegInit(Bool(false)) - val wbeats_ready = Wire(Bool()) - val wbeats_valid = Wire(Bool()) - when (wbeats_valid && wbeats_ready) { wbeats_latched := Bool(true) } - when (out_aw.fire()) { wbeats_latched := Bool(false) } + // B flow control + val b_last = sideband_aw_b.deq.bits + in_b.valid := out_b.valid && sideband_aw_b.deq.valid && b_last + out_b.ready := sideband_aw_b.deq.valid && (!b_last || in_b.ready) + sideband_aw_b.deq.ready := out_b.valid && (!b_last || in_b.ready) + in_b.bits := out_b.bits - // AW flow control - out_aw.valid := in_aw.valid && sideband_aw_b.enq.ready && (wbeats_ready || wbeats_latched) - in_aw.ready := sideband_aw_b.enq.ready && out_aw.ready && (wbeats_ready || wbeats_latched) - sideband_aw_b.enq.valid := in_aw.valid && out_aw.ready && (wbeats_ready || wbeats_latched) - wbeats_valid := in_aw.valid && !wbeats_latched - out_aw.bits := in_aw.bits - sideband_aw_b.enq.bits := aw_last - - // We need to inject 'last' into the W channel fragments, count! 
- val w_counter = RegInit(UInt(0, width = AXI4Parameters.lenBits+1)) - val w_idle = w_counter === UInt(0) - val w_todo = Mux(w_idle, Mux(wbeats_valid, w_beats, UInt(0)), w_counter) - val w_last = w_todo === UInt(1) - w_counter := w_todo - out_w.fire() - assert (!out_w.fire() || w_todo =/= UInt(0)) // underflow impossible - - // W flow control - wbeats_ready := w_idle - out_w.valid := in_w.valid && (!wbeats_ready || wbeats_valid) - in_w.ready := out_w.ready && (!wbeats_ready || wbeats_valid) - out_w.bits := in_w.bits - out_w.bits.last := w_last - // We should also recreate the last last - assert (!out_w.valid || !in_w.bits.last || w_last) - - // R flow control - val r_last = out_r.bits.last - in_r.valid := out_r.valid && (!r_last || sideband_ar_r.deq.valid) - out_r.ready := in_r.ready && (!r_last || sideband_ar_r.deq.valid) - sideband_ar_r.deq.ready := r_last && out_r.valid && in_r.ready - in_r.bits := out_r.bits - in_r.bits.last := r_last && sideband_ar_r.deq.bits - - // B flow control - val b_last = sideband_aw_b.deq.bits - in_b.valid := out_b.valid && sideband_aw_b.deq.valid && b_last - out_b.ready := sideband_aw_b.deq.valid && (!b_last || in_b.ready) - sideband_aw_b.deq.ready := out_b.valid && (!b_last || in_b.ready) - in_b.bits := out_b.bits - - // Merge errors from dropped B responses - val r_resp = RegInit(UInt(0, width = AXI4Parameters.respBits)) - val resp = out_b.bits.resp | r_resp - when (out_b.fire()) { r_resp := Mux(b_last, UInt(0), resp) } - in_b.bits.resp := resp - } -} - -/* We want to put barriers between the fragments of a fragmented transfer and all other transfers. - * This lets us use very little state to reassemble the fragments (else we need one FIFO per ID). - * Furthermore, because all the fragments share the same AXI ID, they come back contiguously. - * This guarantees that no other R responses might get mixed between fragments, ensuring that the - * interleavedId for the slaves remains unaffected by the fragmentation transformation. 
- * Of course, if you need to fragment, this means there is a potentially hefty serialization cost. - * However, this design allows full concurrency in the common no-fragmentation-needed scenario. - */ -class AXI4FragmenterSideband(maxInFlight: Int, flow: Boolean = false) extends Module -{ - val io = new QueueIO(Bool(), maxInFlight) - io.count := UInt(0) - - val PASS = UInt(2, width = 2) // allow 'last=1' bits to enque, on 'last=0' if count>0 block else accept+FIND - val FIND = UInt(0, width = 2) // allow 'last=0' bits to enque, accept 'last=1' and switch to WAIT - val WAIT = UInt(1, width = 2) // block all access till count=0 - - val state = RegInit(PASS) - val count = RegInit(UInt(0, width = log2Up(maxInFlight))) - val full = count === UInt(maxInFlight-1) - val empty = count === UInt(0) - val last = count === UInt(1) - - io.deq.bits := state(1) || (last && state(0)) // PASS || (last && WAIT) - io.deq.valid := !empty - - io.enq.ready := !full && (empty || (state === FIND) || (state === PASS && io.enq.bits)) - - // WAIT => count > 0 - assert (state =/= WAIT || count =/= UInt(0)) - - if (flow) { - when (io.enq.valid) { - io.deq.valid := Bool(true) - when (empty) { io.deq.bits := io.enq.bits } + // Merge errors from dropped B responses + val r_resp = RegInit(UInt(0, width = AXI4Parameters.respBits)) + val resp = out_b.bits.resp | r_resp + when (out_b.fire()) { r_resp := Mux(b_last, UInt(0), resp) } + in_b.bits.resp := resp } } - count := count + io.enq.fire() - io.deq.fire() - switch (state) { - is(PASS) { when (io.enq.valid && !io.enq.bits && empty) { state := FIND } } - is(FIND) { when (io.enq.valid && io.enq.bits && !full) { state := Mux(empty, PASS, WAIT) } } - is(WAIT) { when (last && io.deq.ready) { state := PASS } } + /* We want to put barriers between the fragments of a fragmented transfer and all other transfers. + * This lets us use very little state to reassemble the fragments (else we need one FIFO per ID). 
+ * Furthermore, because all the fragments share the same AXI ID, they come back contiguously. + * This guarantees that no other R responses might get mixed between fragments, ensuring that the + * interleavedId for the slaves remains unaffected by the fragmentation transformation. + * Of course, if you need to fragment, this means there is a potentially hefty serialization cost. + * However, this design allows full concurrency in the common no-fragmentation-needed scenario. + */ + class AXI4FragmenterSideband(maxInFlight: Int, flow: Boolean = false) extends Module + { + val io = new QueueIO(Bool(), maxInFlight) + io.count := UInt(0) + + val PASS = UInt(2, width = 2) // allow 'last=1' bits to enque, on 'last=0' if count>0 block else accept+FIND + val FIND = UInt(0, width = 2) // allow 'last=0' bits to enque, accept 'last=1' and switch to WAIT + val WAIT = UInt(1, width = 2) // block all access till count=0 + + val state = RegInit(PASS) + val count = RegInit(UInt(0, width = log2Up(maxInFlight))) + val full = count === UInt(maxInFlight-1) + val empty = count === UInt(0) + val last = count === UInt(1) + + io.deq.bits := state(1) || (last && state(0)) // PASS || (last && WAIT) + io.deq.valid := !empty + + io.enq.ready := !full && (empty || (state === FIND) || (state === PASS && io.enq.bits)) + + // WAIT => count > 0 + assert (state =/= WAIT || count =/= UInt(0)) + + if (flow) { + when (io.enq.valid) { + io.deq.valid := Bool(true) + when (empty) { io.deq.bits := io.enq.bits } + } + } + + count := count + io.enq.fire() - io.deq.fire() + switch (state) { + is(PASS) { when (io.enq.valid && !io.enq.bits && empty) { state := FIND } } + is(FIND) { when (io.enq.valid && io.enq.bits && !full) { state := Mux(empty, PASS, WAIT) } } + is(WAIT) { when (last && io.deq.ready) { state := PASS } } + } } } diff --git a/src/main/scala/uncore/axi4/Nodes.scala b/src/main/scala/uncore/axi4/Nodes.scala index 03c9e840..097c0aed 100644 --- a/src/main/scala/uncore/axi4/Nodes.scala +++ 
b/src/main/scala/uncore/axi4/Nodes.scala @@ -31,16 +31,13 @@ object AXI4Imp extends NodeImp[AXI4MasterPortParameters, AXI4SlavePortParameters // Nodes implemented inside modules case class AXI4IdentityNode() extends IdentityNode(AXI4Imp) -case class AXI4MasterNode(portParams: AXI4MasterPortParameters, numPorts: Range.Inclusive = 1 to 1) - extends SourceNode(AXI4Imp)(portParams, numPorts) -case class AXI4SlaveNode(portParams: AXI4SlavePortParameters, numPorts: Range.Inclusive = 1 to 1) - extends SinkNode(AXI4Imp)(portParams, numPorts) +case class AXI4MasterNode(portParams: Seq[AXI4MasterPortParameters]) extends SourceNode(AXI4Imp)(portParams) +case class AXI4SlaveNode(portParams: Seq[AXI4SlavePortParameters]) extends SinkNode(AXI4Imp)(portParams) case class AXI4AdapterNode( - masterFn: Seq[AXI4MasterPortParameters] => AXI4MasterPortParameters, - slaveFn: Seq[AXI4SlavePortParameters] => AXI4SlavePortParameters, - numMasterPorts: Range.Inclusive = 1 to 1, - numSlavePorts: Range.Inclusive = 1 to 1) - extends InteriorNode(AXI4Imp)(masterFn, slaveFn, numMasterPorts, numSlavePorts) + masterFn: AXI4MasterPortParameters => AXI4MasterPortParameters, + slaveFn: AXI4SlavePortParameters => AXI4SlavePortParameters, + numPorts: Range.Inclusive = 0 to 999) + extends AdapterNode(AXI4Imp)(masterFn, slaveFn, numPorts) // Nodes passed from an inner module case class AXI4OutputNode() extends OutputNode(AXI4Imp) diff --git a/src/main/scala/uncore/axi4/RegisterRouter.scala b/src/main/scala/uncore/axi4/RegisterRouter.scala index 679c16c1..326f65eb 100644 --- a/src/main/scala/uncore/axi4/RegisterRouter.scala +++ b/src/main/scala/uncore/axi4/RegisterRouter.scala @@ -9,7 +9,7 @@ import regmapper._ import scala.math.{min,max} class AXI4RegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int = 4, undefZero: Boolean = true, executable: Boolean = false) - extends AXI4SlaveNode(AXI4SlavePortParameters( + extends AXI4SlaveNode(Seq(AXI4SlavePortParameters( Seq(AXI4SlaveParameters( 
address = Seq(address), executable = executable, @@ -17,7 +17,7 @@ class AXI4RegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int supportsRead = TransferSizes(1, beatBytes), interleavedId = Some(0))), beatBytes = beatBytes, - minLatency = min(concurrency, 1))) // the Queue adds at most one cycle + minLatency = min(concurrency, 1)))) // the Queue adds at most one cycle { require (address.contiguous) diff --git a/src/main/scala/uncore/axi4/SRAM.scala b/src/main/scala/uncore/axi4/SRAM.scala index c5ecf566..26a08ca2 100644 --- a/src/main/scala/uncore/axi4/SRAM.scala +++ b/src/main/scala/uncore/axi4/SRAM.scala @@ -8,7 +8,7 @@ import diplomacy._ class AXI4RAM(address: AddressSet, executable: Boolean = true, beatBytes: Int = 4)(implicit p: Parameters) extends LazyModule { - val node = AXI4SlaveNode(AXI4SlavePortParameters( + val node = AXI4SlaveNode(Seq(AXI4SlavePortParameters( Seq(AXI4SlaveParameters( address = List(address), regionType = RegionType.UNCACHED, @@ -17,7 +17,7 @@ class AXI4RAM(address: AddressSet, executable: Boolean = true, beatBytes: Int = supportsWrite = TransferSizes(1, beatBytes), interleavedId = Some(0))), beatBytes = beatBytes, - minLatency = 0)) // B responds on same cycle + minLatency = 0))) // B responds on same cycle // We require the address range to include an entire beat (for the write mask) require ((address.mask & (beatBytes-1)) == beatBytes-1) diff --git a/src/main/scala/uncore/axi4/ToTL.scala b/src/main/scala/uncore/axi4/ToTL.scala index 0f62b5f8..6561d1fd 100644 --- a/src/main/scala/uncore/axi4/ToTL.scala +++ b/src/main/scala/uncore/axi4/ToTL.scala @@ -8,15 +8,15 @@ import config._ import diplomacy._ import uncore.tilelink2._ -case class AXI4ToTLNode() extends MixedNode(AXI4Imp, TLImp)( - dFn = { case (1, Seq(AXI4MasterPortParameters(masters))) => - Seq(TLClientPortParameters(clients = masters.map { m => +case class AXI4ToTLNode() extends MixedAdapterNode(AXI4Imp, TLImp)( + dFn = { case AXI4MasterPortParameters(masters) 
=> + TLClientPortParameters(clients = masters.map { m => TLClientParameters( sourceId = IdRange(m.id.start << 1, m.id.end << 1), // R+W ids are distinct nodePath = m.nodePath) - })) + }) }, - uFn = { case (1, Seq(mp)) => Seq(AXI4SlavePortParameters( + uFn = { mp => AXI4SlavePortParameters( slaves = mp.managers.map { m => AXI4SlaveParameters( address = m.address, @@ -27,10 +27,8 @@ case class AXI4ToTLNode() extends MixedNode(AXI4Imp, TLImp)( supportsRead = m.supportsGet, interleavedId = Some(0))}, // TL2 never interleaves D beats beatBytes = mp.beatBytes, - minLatency = mp.minLatency)) - }, - numPO = 1 to 1, - numPI = 1 to 1) + minLatency = mp.minLatency) + }) class AXI4ToTL()(implicit p: Parameters) extends LazyModule { @@ -42,131 +40,129 @@ class AXI4ToTL()(implicit p: Parameters) extends LazyModule val out = node.bundleOut } - val in = io.in(0) - val out = io.out(0) - val edgeIn = node.edgesIn(0) - val edgeOut = node.edgesOut(0) - val numIds = edgeIn.master.endId - val beatBytes = edgeOut.manager.beatBytes - val countBits = AXI4Parameters.lenBits + (1 << AXI4Parameters.sizeBits) - 1 + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + val numIds = edgeIn.master.endId + val beatBytes = edgeOut.manager.beatBytes + val countBits = AXI4Parameters.lenBits + (1 << AXI4Parameters.sizeBits) - 1 - require (edgeIn.master.masters(0).aligned) + require (edgeIn.master.masters(0).aligned) - val r_out = Wire(out.a) - val r_inflight = RegInit(UInt(0, width = numIds)) - val r_block = r_inflight(in.ar.bits.id) - val r_size1 = in.ar.bits.bytes1() - val r_size = OH1ToUInt(r_size1) - val r_addr = in.ar.bits.addr - val r_ok = edgeOut.manager.supportsGetSafe(r_addr, r_size) - val r_err_in = Wire(Decoupled(new AXI4BundleRError(in.ar.bits.params))) - val r_err_out = Queue(r_err_in, 2) - val r_count = RegInit(UInt(0, width = in.ar.bits.params.lenBits)) - val r_last = r_count === in.ar.bits.len + val r_out = Wire(out.a) + val 
r_inflight = RegInit(UInt(0, width = numIds)) + val r_block = r_inflight(in.ar.bits.id) + val r_size1 = in.ar.bits.bytes1() + val r_size = OH1ToUInt(r_size1) + val r_addr = in.ar.bits.addr + val r_ok = edgeOut.manager.supportsGetSafe(r_addr, r_size) + val r_err_in = Wire(Decoupled(new AXI4BundleRError(in.ar.bits.params))) + val r_err_out = Queue(r_err_in, 2) + val r_count = RegInit(UInt(0, width = in.ar.bits.params.lenBits)) + val r_last = r_count === in.ar.bits.len - assert (!in.ar.valid || r_size1 === UIntToOH1(r_size, countBits)) // because aligned - in.ar.ready := Mux(r_ok, r_out.ready, r_err_in.ready && r_last) && !r_block - r_out.valid := in.ar.valid && !r_block && r_ok - r_out.bits := edgeOut.Get(in.ar.bits.id << 1 | UInt(1), r_addr, r_size)._2 - r_err_in.valid := in.ar.valid && !r_block && !r_ok - r_err_in.bits.last := r_last - r_err_in.bits.id := in.ar.bits.id + assert (!in.ar.valid || r_size1 === UIntToOH1(r_size, countBits)) // because aligned + in.ar.ready := Mux(r_ok, r_out.ready, r_err_in.ready && r_last) && !r_block + r_out.valid := in.ar.valid && !r_block && r_ok + r_out.bits := edgeOut.Get(in.ar.bits.id << 1 | UInt(1), r_addr, r_size)._2 + r_err_in.valid := in.ar.valid && !r_block && !r_ok + r_err_in.bits.last := r_last + r_err_in.bits.id := in.ar.bits.id - when (r_err_in.fire()) { r_count := Mux(r_last, UInt(0), r_count + UInt(1)) } + when (r_err_in.fire()) { r_count := Mux(r_last, UInt(0), r_count + UInt(1)) } - val w_out = Wire(out.a) - val w_inflight = RegInit(UInt(0, width = numIds)) - val w_block = w_inflight(in.aw.bits.id) - val w_size1 = in.aw.bits.bytes1() - val w_size = OH1ToUInt(w_size1) - val w_addr = in.aw.bits.addr - val w_ok = edgeOut.manager.supportsPutPartialSafe(w_addr, w_size) - val w_err_in = Wire(Decoupled(in.aw.bits.id)) - val w_err_out = Queue(w_err_in, 2) + val w_out = Wire(out.a) + val w_inflight = RegInit(UInt(0, width = numIds)) + val w_block = w_inflight(in.aw.bits.id) + val w_size1 = in.aw.bits.bytes1() + val w_size = 
OH1ToUInt(w_size1) + val w_addr = in.aw.bits.addr + val w_ok = edgeOut.manager.supportsPutPartialSafe(w_addr, w_size) + val w_err_in = Wire(Decoupled(in.aw.bits.id)) + val w_err_out = Queue(w_err_in, 2) - assert (!in.aw.valid || w_size1 === UIntToOH1(w_size, countBits)) // because aligned - assert (!in.aw.valid || in.aw.bits.len === UInt(0) || in.aw.bits.size === UInt(log2Ceil(beatBytes))) // because aligned - in.aw.ready := Mux(w_ok, w_out.ready, w_err_in.ready) && in.w.valid && in.w.bits.last && !w_block - in.w.ready := Mux(w_ok, w_out.ready, w_err_in.ready || !in.w.bits.last) && in.aw.valid && !w_block - w_out.valid := in.aw.valid && in.w.valid && !w_block && w_ok - w_out.bits := edgeOut.Put(in.aw.bits.id << 1, w_addr, w_size, in.w.bits.data, in.w.bits.strb)._2 - w_err_in.valid := in.aw.valid && in.w.valid && !w_block && !w_ok && in.w.bits.last - w_err_in.bits := in.aw.bits.id + assert (!in.aw.valid || w_size1 === UIntToOH1(w_size, countBits)) // because aligned + assert (!in.aw.valid || in.aw.bits.len === UInt(0) || in.aw.bits.size === UInt(log2Ceil(beatBytes))) // because aligned + in.aw.ready := Mux(w_ok, w_out.ready, w_err_in.ready) && in.w.valid && in.w.bits.last && !w_block + in.w.ready := Mux(w_ok, w_out.ready, w_err_in.ready || !in.w.bits.last) && in.aw.valid && !w_block + w_out.valid := in.aw.valid && in.w.valid && !w_block && w_ok + w_out.bits := edgeOut.Put(in.aw.bits.id << 1, w_addr, w_size, in.w.bits.data, in.w.bits.strb)._2 + w_err_in.valid := in.aw.valid && in.w.valid && !w_block && !w_ok && in.w.bits.last + w_err_in.bits := in.aw.bits.id - TLArbiter(TLArbiter.lowestIndexFirst)(out.a, (UInt(0), r_out), (in.aw.bits.len, w_out)) + TLArbiter(TLArbiter.lowestIndexFirst)(out.a, (UInt(0), r_out), (in.aw.bits.len, w_out)) - val ok_b = Wire(in.b) - val err_b = Wire(in.b) - val mux_b = Wire(in.b) - val ok_r = Wire(in.r) - val err_r = Wire(in.r) - val mux_r = Wire(in.r) + val ok_b = Wire(in.b) + val err_b = Wire(in.b) + val mux_b = Wire(in.b) + val ok_r = 
Wire(in.r) + val err_r = Wire(in.r) + val mux_r = Wire(in.r) - val d_resp = Mux(out.d.bits.error, AXI4Parameters.RESP_SLVERR, AXI4Parameters.RESP_OKAY) - val d_hasData = edgeOut.hasData(out.d.bits) - val d_last = edgeOut.last(out.d) + val d_resp = Mux(out.d.bits.error, AXI4Parameters.RESP_SLVERR, AXI4Parameters.RESP_OKAY) + val d_hasData = edgeOut.hasData(out.d.bits) + val d_last = edgeOut.last(out.d) - out.d.ready := Mux(d_hasData, ok_r.ready, ok_b.ready) - ok_r.valid := out.d.valid && d_hasData - ok_b.valid := out.d.valid && !d_hasData + out.d.ready := Mux(d_hasData, ok_r.ready, ok_b.ready) + ok_r.valid := out.d.valid && d_hasData + ok_b.valid := out.d.valid && !d_hasData - ok_r.bits.id := out.d.bits.source >> 1 - ok_r.bits.data := out.d.bits.data - ok_r.bits.resp := d_resp - ok_r.bits.last := d_last + ok_r.bits.id := out.d.bits.source >> 1 + ok_r.bits.data := out.d.bits.data + ok_r.bits.resp := d_resp + ok_r.bits.last := d_last - r_err_out.ready := err_r.ready - err_r.valid := r_err_out.valid - err_r.bits.id := r_err_out.bits.id - err_r.bits.data := out.d.bits.data // don't care - err_r.bits.resp := AXI4Parameters.RESP_DECERR - err_r.bits.last := r_err_out.bits.last + r_err_out.ready := err_r.ready + err_r.valid := r_err_out.valid + err_r.bits.id := r_err_out.bits.id + err_r.bits.data := out.d.bits.data // don't care + err_r.bits.resp := AXI4Parameters.RESP_DECERR + err_r.bits.last := r_err_out.bits.last - // AXI4 must hold R to one source until last - val mux_lock_ok = RegInit(Bool(false)) - val mux_lock_err = RegInit(Bool(false)) - when (ok_r .fire()) { mux_lock_ok := !ok_r .bits.last } - when (err_r.fire()) { mux_lock_err := !err_r.bits.last } - assert (!mux_lock_ok || !mux_lock_err) + // AXI4 must hold R to one source until last + val mux_lock_ok = RegInit(Bool(false)) + val mux_lock_err = RegInit(Bool(false)) + when (ok_r .fire()) { mux_lock_ok := !ok_r .bits.last } + when (err_r.fire()) { mux_lock_err := !err_r.bits.last } + assert (!mux_lock_ok || 
!mux_lock_err) - // Prioritize err over ok (b/c err_r.valid comes from a register) - mux_r.valid := (!mux_lock_err && ok_r.valid) || (!mux_lock_ok && err_r.valid) - mux_r.bits := Mux(!mux_lock_ok && err_r.valid, err_r.bits, ok_r.bits) - ok_r.ready := mux_r.ready && (mux_lock_ok || !err_r.valid) - err_r.ready := mux_r.ready && !mux_lock_ok + // Prioritize err over ok (b/c err_r.valid comes from a register) + mux_r.valid := (!mux_lock_err && ok_r.valid) || (!mux_lock_ok && err_r.valid) + mux_r.bits := Mux(!mux_lock_ok && err_r.valid, err_r.bits, ok_r.bits) + ok_r.ready := mux_r.ready && (mux_lock_ok || !err_r.valid) + err_r.ready := mux_r.ready && !mux_lock_ok - // AXI4 needs irrevocable behaviour - in.r <> Queue.irrevocable(mux_r, 1, flow=true) + // AXI4 needs irrevocable behaviour + in.r <> Queue.irrevocable(mux_r, 1, flow=true) - ok_b.bits.id := out.d.bits.source >> 1 - ok_b.bits.resp := d_resp + ok_b.bits.id := out.d.bits.source >> 1 + ok_b.bits.resp := d_resp - w_err_out.ready := err_b.ready - err_b.valid := w_err_out.valid - err_b.bits.id := w_err_out.bits - err_b.bits.resp := AXI4Parameters.RESP_DECERR + w_err_out.ready := err_b.ready + err_b.valid := w_err_out.valid + err_b.bits.id := w_err_out.bits + err_b.bits.resp := AXI4Parameters.RESP_DECERR - // Prioritize err over ok (b/c err_b.valid comes from a register) - mux_b.valid := ok_b.valid || err_b.valid - mux_b.bits := Mux(err_b.valid, err_b.bits, ok_b.bits) - ok_b.ready := mux_b.ready && !err_b.valid - err_b.ready := mux_b.ready + // Prioritize err over ok (b/c err_b.valid comes from a register) + mux_b.valid := ok_b.valid || err_b.valid + mux_b.bits := Mux(err_b.valid, err_b.bits, ok_b.bits) + ok_b.ready := mux_b.ready && !err_b.valid + err_b.ready := mux_b.ready - // AXI4 needs irrevocable behaviour - in.b <> Queue.irrevocable(mux_b, 1, flow=true) + // AXI4 needs irrevocable behaviour + in.b <> Queue.irrevocable(mux_b, 1, flow=true) - // Update flight trackers - val r_set = in.ar.fire().asUInt << 
in.ar.bits.id - val r_clr = (in.r.fire() && in.r.bits.last).asUInt << in.r.bits.id - r_inflight := (r_inflight | r_set) & ~r_clr - val w_set = in.aw.fire().asUInt << in.aw.bits.id - val w_clr = in.b.fire().asUInt << in.b.bits.id - w_inflight := (w_inflight | w_set) & ~w_clr + // Update flight trackers + val r_set = in.ar.fire().asUInt << in.ar.bits.id + val r_clr = (in.r.fire() && in.r.bits.last).asUInt << in.r.bits.id + r_inflight := (r_inflight | r_set) & ~r_clr + val w_set = in.aw.fire().asUInt << in.aw.bits.id + val w_clr = in.b.fire().asUInt << in.b.bits.id + w_inflight := (w_inflight | w_set) & ~w_clr - // Unused channels - out.b.ready := Bool(true) - out.c.valid := Bool(false) - out.e.valid := Bool(false) + // Unused channels + out.b.ready := Bool(true) + out.c.valid := Bool(false) + out.e.valid := Bool(false) + } } } diff --git a/src/main/scala/uncore/devices/Plic.scala b/src/main/scala/uncore/devices/Plic.scala index 25d39fa0..bbdf69f8 100644 --- a/src/main/scala/uncore/devices/Plic.scala +++ b/src/main/scala/uncore/devices/Plic.scala @@ -62,7 +62,7 @@ class TLPLIC(supervisor: Boolean, maxPriorities: Int, address: BigInt = 0xC00000 beatBytes = p(rocket.XLen)/8, undefZero = false) - val intnode = IntAdapterNode( + val intnode = IntNexusNode( numSourcePorts = 0 to 1024, numSinkPorts = 0 to 1024, sourceFn = { _ => IntSourcePortParameters(Seq(IntSourceParameters(contextsPerHart))) }, diff --git a/src/main/scala/uncore/tilelink2/AtomicAutomata.scala b/src/main/scala/uncore/tilelink2/AtomicAutomata.scala index 1674f45d..1752a9fc 100644 --- a/src/main/scala/uncore/tilelink2/AtomicAutomata.scala +++ b/src/main/scala/uncore/tilelink2/AtomicAutomata.scala @@ -6,6 +6,7 @@ import Chisel._ import chisel3.internal.sourceinfo.SourceInfo import config._ import diplomacy._ +import util.GenericParameterizedBundle import scala.math.{min,max} // Ensures that all downstream RW managers support Atomic operationss. 
@@ -15,8 +16,8 @@ class TLAtomicAutomata(logical: Boolean = true, arithmetic: Boolean = true, conc require (concurrency >= 1) val node = TLAdapterNode( - clientFn = { case Seq(cp) => require (!cp.unsafeAtomics); cp.copy(unsafeAtomics = true) }, - managerFn = { case Seq(mp) => mp.copy(managers = mp.managers.map { m => + clientFn = { case cp => require (!cp.unsafeAtomics); cp.copy(unsafeAtomics = true) }, + managerFn = { case mp => mp.copy(managers = mp.managers.map { m => val ourSupport = TransferSizes(1, mp.beatBytes) def widen(x: TransferSizes) = if (passthrough && x.min <= 2*mp.beatBytes) TransferSizes(1, max(mp.beatBytes, x.max)) else ourSupport val canDoit = m.supportsPutFull.contains(ourSupport) && m.supportsGet.contains(ourSupport) @@ -33,245 +34,232 @@ class TLAtomicAutomata(logical: Boolean = true, arithmetic: Boolean = true, conc val out = node.bundleOut } - val in = io.in(0) - val out = io.out(0) - val edgeIn = node.edgesIn(0) - val edgeOut = node.edgesOut(0) - val managers = edgeOut.manager.managers - val beatBytes = edgeOut.manager.beatBytes + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + val managers = edgeOut.manager.managers + val beatBytes = edgeOut.manager.beatBytes - // To which managers are we adding atomic support? 
- val ourSupport = TransferSizes(1, edgeOut.manager.beatBytes) - val managersNeedingHelp = managers.filter { m => - m.supportsPutFull.contains(ourSupport) && - m.supportsGet.contains(ourSupport) && - ((logical && !m.supportsLogical .contains(ourSupport)) || - (arithmetic && !m.supportsArithmetic.contains(ourSupport)) || - !passthrough) // we will do atomics for everyone we can - } - // We cannot add atomcis to a non-FIFO manager - managersNeedingHelp foreach { m => require (m.fifoId.isDefined) } - // We need to preserve FIFO semantics across FIFO domains, not managers - // Suppose you have Put(42) Atomic(+1) both inflight; valid results: 42 or 43 - // If we allow Put(42) Get() Put(+1) concurrent; valid results: 42 43 OR undef - // Making non-FIFO work requires waiting for all Acks to come back (=> use FIFOFixer) - val domainsNeedingHelp = managersNeedingHelp.map(_.fifoId.get).distinct - // Don't overprovision the CAM - val camSize = min(domainsNeedingHelp.size, concurrency) - // Compact the fifoIds to only those we care about - val camFifoIds = managers.map(m => UInt(m.fifoId.map(id => max(0, domainsNeedingHelp.indexOf(id))).getOrElse(0))) - - // CAM entry state machine - val FREE = UInt(0) // unused waiting on Atomic from A - val GET = UInt(3) // Get sent down A waiting on AccessDataAck from D - val AMO = UInt(2) // AccessDataAck sent up D waiting for A availability - val ACK = UInt(1) // Put sent down A waiting for PutAck from D - - def helper(select: Seq[Bool], x: Seq[TransferSizes], lgSize: UInt) = - if (!passthrough) Bool(false) else - if (x.map(_ == x(0)).reduce(_ && _)) x(0).containsLg(lgSize) else - Mux1H(select, x.map(_.containsLg(lgSize))) - - // Do we need to do anything at all? 
- if (camSize > 0) { - class CAM_S extends Bundle { - val state = UInt(width = 2) - } - class CAM_A extends Bundle { - val bits = new TLBundleA(out.a.bits.params) - val fifoId = UInt(width = log2Up(domainsNeedingHelp.size)) - val lut = UInt(width = 4) - } - class CAM_D extends Bundle { - val data = UInt(width = out.a.bits.params.dataBits) + // To which managers are we adding atomic support? + val ourSupport = TransferSizes(1, edgeOut.manager.beatBytes) + val managersNeedingHelp = managers.filter { m => + m.supportsPutFull.contains(ourSupport) && + m.supportsGet.contains(ourSupport) && + ((logical && !m.supportsLogical .contains(ourSupport)) || + (arithmetic && !m.supportsArithmetic.contains(ourSupport)) || + !passthrough) // we will do atomics for everyone we can } + // We cannot add atomcis to a non-FIFO manager + managersNeedingHelp foreach { m => require (m.fifoId.isDefined) } + // We need to preserve FIFO semantics across FIFO domains, not managers + // Suppose you have Put(42) Atomic(+1) both inflight; valid results: 42 or 43 + // If we allow Put(42) Get() Put(+1) concurrent; valid results: 42 43 OR undef + // Making non-FIFO work requires waiting for all Acks to come back (=> use FIFOFixer) + val domainsNeedingHelp = managersNeedingHelp.map(_.fifoId.get).distinct + // Don't overprovision the CAM + val camSize = min(domainsNeedingHelp.size, concurrency) + // Compact the fifoIds to only those we care about + val camFifoIds = managers.map(m => UInt(m.fifoId.map(id => max(0, domainsNeedingHelp.indexOf(id))).getOrElse(0))) - val initval = Wire(new CAM_S) - initval.state := FREE - val cam_s = RegInit(Vec.fill(camSize)(initval)) - val cam_a = Reg(Vec(camSize, new CAM_A)) - val cam_d = Reg(Vec(camSize, new CAM_D)) + // CAM entry state machine + val FREE = UInt(0) // unused waiting on Atomic from A + val GET = UInt(3) // Get sent down A waiting on AccessDataAck from D + val AMO = UInt(2) // AccessDataAck sent up D waiting for A availability + val ACK = UInt(1) // Put 
sent down A waiting for PutAck from D - val cam_free = cam_s.map(_.state === FREE) - val cam_amo = cam_s.map(_.state === AMO) - val cam_abusy = cam_s.map(e => e.state === GET || e.state === AMO) // A is blocked - val cam_dmatch = cam_s.map(e => e.state =/= FREE) // D should inspect these entries + def helper(select: Seq[Bool], x: Seq[TransferSizes], lgSize: UInt) = + if (!passthrough) Bool(false) else + if (x.map(_ == x(0)).reduce(_ && _)) x(0).containsLg(lgSize) else + Mux1H(select, x.map(_.containsLg(lgSize))) - // Can the manager already handle this message? - val a_size = edgeIn.size(in.a.bits) - val a_select = edgeOut.manager.findFast(edgeIn.address(in.a.bits)) - val a_canLogical = helper(a_select, managers.map(_.supportsLogical), a_size) - val a_canArithmetic = helper(a_select, managers.map(_.supportsArithmetic), a_size) - val a_isLogical = in.a.bits.opcode === TLMessages.LogicalData - val a_isArithmetic = in.a.bits.opcode === TLMessages.ArithmeticData - val a_isSupported = Mux(a_isLogical, a_canLogical, Mux(a_isArithmetic, a_canArithmetic, Bool(true))) + val params = TLAtomicAutomata.CAMParams(out.a.bits.params, domainsNeedingHelp.size) + // Do we need to do anything at all? + if (camSize > 0) { + val initval = Wire(new TLAtomicAutomata.CAM_S(params)) + initval.state := FREE + val cam_s = RegInit(Vec.fill(camSize)(initval)) + val cam_a = Reg(Vec(camSize, new TLAtomicAutomata.CAM_A(params))) + val cam_d = Reg(Vec(camSize, new TLAtomicAutomata.CAM_D(params))) - // Must we do a Put? 
- val a_cam_any_put = cam_amo.reduce(_ || _) - val a_cam_por_put = cam_amo.scanLeft(Bool(false))(_||_).init - val a_cam_sel_put = (cam_amo zip a_cam_por_put) map { case (a, b) => a && !b } - val a_cam_a = PriorityMux(cam_amo, cam_a) - val a_cam_d = PriorityMux(cam_amo, cam_d) - val a_a = a_cam_a.bits.data - val a_d = a_cam_d.data + val cam_free = cam_s.map(_.state === FREE) + val cam_amo = cam_s.map(_.state === AMO) + val cam_abusy = cam_s.map(e => e.state === GET || e.state === AMO) // A is blocked + val cam_dmatch = cam_s.map(e => e.state =/= FREE) // D should inspect these entries - // Does the A request conflict with an inflight AMO? - val a_fifoId = Mux1H(a_select, camFifoIds) - val a_cam_busy = (cam_abusy zip cam_a.map(_.fifoId === a_fifoId)) map { case (a,b) => a&&b } reduce (_||_) + // Can the manager already handle this message? + val a_size = edgeIn.size(in.a.bits) + val a_select = edgeOut.manager.findFast(edgeIn.address(in.a.bits)) + val a_canLogical = helper(a_select, managers.map(_.supportsLogical), a_size) + val a_canArithmetic = helper(a_select, managers.map(_.supportsArithmetic), a_size) + val a_isLogical = in.a.bits.opcode === TLMessages.LogicalData + val a_isArithmetic = in.a.bits.opcode === TLMessages.ArithmeticData + val a_isSupported = Mux(a_isLogical, a_canLogical, Mux(a_isArithmetic, a_canArithmetic, Bool(true))) - // (Where) are we are allocating in the CAM? - val a_cam_any_free = cam_free.reduce(_ || _) - val a_cam_por_free = cam_free.scanLeft(Bool(false))(_||_).init - val a_cam_sel_free = (cam_free zip a_cam_por_free) map { case (a,b) => a && !b } + // Must we do a Put? 
+ val a_cam_any_put = cam_amo.reduce(_ || _) + val a_cam_por_put = cam_amo.scanLeft(Bool(false))(_||_).init + val a_cam_sel_put = (cam_amo zip a_cam_por_put) map { case (a, b) => a && !b } + val a_cam_a = PriorityMux(cam_amo, cam_a) + val a_cam_d = PriorityMux(cam_amo, cam_d) + val a_a = a_cam_a.bits.data + val a_d = a_cam_d.data - // Logical AMO - val indexes = Seq.tabulate(beatBytes*8) { i => Cat(a_a(i,i), a_d(i,i)) } - val logic_out = Cat(indexes.map(x => a_cam_a.lut(x).asUInt).reverse) + // Does the A request conflict with an inflight AMO? + val a_fifoId = Mux1H(a_select, camFifoIds) + val a_cam_busy = (cam_abusy zip cam_a.map(_.fifoId === a_fifoId)) map { case (a,b) => a&&b } reduce (_||_) - // Arithmetic AMO - val unsigned = a_cam_a.bits.param(1) - val take_max = a_cam_a.bits.param(0) - val adder = a_cam_a.bits.param(2) - val mask = a_cam_a.bits.mask - val signSel = ~(~mask | (mask >> 1)) - val signbits_a = Cat(Seq.tabulate(beatBytes) { i => a_a(8*i+7,8*i+7) } .reverse) - val signbits_d = Cat(Seq.tabulate(beatBytes) { i => a_d(8*i+7,8*i+7) } .reverse) - // Move the selected sign bit into the first byte position it will extend - val signbit_a = ((signbits_a & signSel) << 1)(beatBytes-1, 0) - val signbit_d = ((signbits_d & signSel) << 1)(beatBytes-1, 0) - val signext_a = FillInterleaved(8, leftOR(signbit_a)) - val signext_d = FillInterleaved(8, leftOR(signbit_d)) - // NOTE: sign-extension does not change the relative ordering in EITHER unsigned or signed arithmetic - val wide_mask = FillInterleaved(8, mask) - val a_a_ext = (a_a & wide_mask) | signext_a - val a_d_ext = (a_d & wide_mask) | signext_d - val a_d_inv = Mux(adder, a_d_ext, ~a_d_ext) - val adder_out = a_a_ext + a_d_inv - val h = 8*beatBytes-1 // now sign-extended; use biggest bit - val a_bigger_uneq = unsigned === a_a_ext(h) // result if high bits are unequal - val a_bigger = Mux(a_a_ext(h) === a_d_ext(h), !adder_out(h), a_bigger_uneq) - val pick_a = take_max === a_bigger - val arith_out = Mux(adder, 
adder_out, Mux(pick_a, a_a, a_d)) + // (Where) are we are allocating in the CAM? + val a_cam_any_free = cam_free.reduce(_ || _) + val a_cam_por_free = cam_free.scanLeft(Bool(false))(_||_).init + val a_cam_sel_free = (cam_free zip a_cam_por_free) map { case (a,b) => a && !b } - // AMO result data - val amo_data = - if (!logical) arith_out else - if (!arithmetic) logic_out else - Mux(a_cam_a.bits.opcode(0), logic_out, arith_out) + // Logical AMO + val indexes = Seq.tabulate(beatBytes*8) { i => Cat(a_a(i,i), a_d(i,i)) } + val logic_out = Cat(indexes.map(x => a_cam_a.lut(x).asUInt).reverse) - // Potentially mutate the message from inner - val source_i = Wire(in.a) - val a_allow = !a_cam_busy && (a_isSupported || a_cam_any_free) - in.a.ready := source_i.ready && a_allow - source_i.valid := in.a.valid && a_allow - source_i.bits := in.a.bits - when (!a_isSupported) { // minimal mux difference - source_i.bits.opcode := TLMessages.Get - source_i.bits.param := UInt(0) - } + // Arithmetic AMO + val unsigned = a_cam_a.bits.param(1) + val take_max = a_cam_a.bits.param(0) + val adder = a_cam_a.bits.param(2) + val mask = a_cam_a.bits.mask + val signSel = ~(~mask | (mask >> 1)) + val signbits_a = Cat(Seq.tabulate(beatBytes) { i => a_a(8*i+7,8*i+7) } .reverse) + val signbits_d = Cat(Seq.tabulate(beatBytes) { i => a_d(8*i+7,8*i+7) } .reverse) + // Move the selected sign bit into the first byte position it will extend + val signbit_a = ((signbits_a & signSel) << 1)(beatBytes-1, 0) + val signbit_d = ((signbits_d & signSel) << 1)(beatBytes-1, 0) + val signext_a = FillInterleaved(8, leftOR(signbit_a)) + val signext_d = FillInterleaved(8, leftOR(signbit_d)) + // NOTE: sign-extension does not change the relative ordering in EITHER unsigned or signed arithmetic + val wide_mask = FillInterleaved(8, mask) + val a_a_ext = (a_a & wide_mask) | signext_a + val a_d_ext = (a_d & wide_mask) | signext_d + val a_d_inv = Mux(adder, a_d_ext, ~a_d_ext) + val adder_out = a_a_ext + a_d_inv + val h = 
8*beatBytes-1 // now sign-extended; use biggest bit + val a_bigger_uneq = unsigned === a_a_ext(h) // result if high bits are unequal + val a_bigger = Mux(a_a_ext(h) === a_d_ext(h), !adder_out(h), a_bigger_uneq) + val pick_a = take_max === a_bigger + val arith_out = Mux(adder, adder_out, Mux(pick_a, a_a, a_d)) - // Potentially take the message from the CAM - val source_c = Wire(in.a) - source_c.valid := a_cam_any_put - source_c.bits := edgeOut.Put(a_cam_a.bits.source, edgeIn.address(a_cam_a.bits), a_cam_a.bits.size, amo_data)._2 + // AMO result data + val amo_data = + if (!logical) arith_out else + if (!arithmetic) logic_out else + Mux(a_cam_a.bits.opcode(0), logic_out, arith_out) - // Finishing an AMO from the CAM has highest priority - TLArbiter(TLArbiter.lowestIndexFirst)(out.a, (UInt(0), source_c), (edgeOut.numBeats1(in.a.bits), source_i)) + // Potentially mutate the message from inner + val source_i = Wire(in.a) + val a_allow = !a_cam_busy && (a_isSupported || a_cam_any_free) + in.a.ready := source_i.ready && a_allow + source_i.valid := in.a.valid && a_allow + source_i.bits := in.a.bits + when (!a_isSupported) { // minimal mux difference + source_i.bits.opcode := TLMessages.Get + source_i.bits.param := UInt(0) + } - // Capture the A state into the CAM - when (source_i.fire() && !a_isSupported) { - (a_cam_sel_free zip cam_a) foreach { case (en, r) => - when (en) { - r.fifoId := a_fifoId - r.bits := in.a.bits - r.lut := MuxLookup(in.a.bits.param(1, 0), UInt(0, width = 4), Array( - TLAtomics.AND -> UInt(0x8), - TLAtomics.OR -> UInt(0xe), - TLAtomics.XOR -> UInt(0x6), - TLAtomics.SWAP -> UInt(0xc))) + // Potentially take the message from the CAM + val source_c = Wire(in.a) + source_c.valid := a_cam_any_put + source_c.bits := edgeOut.Put(a_cam_a.bits.source, edgeIn.address(a_cam_a.bits), a_cam_a.bits.size, amo_data)._2 + + // Finishing an AMO from the CAM has highest priority + TLArbiter(TLArbiter.lowestIndexFirst)(out.a, (UInt(0), source_c), 
(edgeOut.numBeats1(in.a.bits), source_i)) + + // Capture the A state into the CAM + when (source_i.fire() && !a_isSupported) { + (a_cam_sel_free zip cam_a) foreach { case (en, r) => + when (en) { + r.fifoId := a_fifoId + r.bits := in.a.bits + r.lut := MuxLookup(in.a.bits.param(1, 0), UInt(0, width = 4), Array( + TLAtomics.AND -> UInt(0x8), + TLAtomics.OR -> UInt(0xe), + TLAtomics.XOR -> UInt(0x6), + TLAtomics.SWAP -> UInt(0xc))) + } + } + (a_cam_sel_free zip cam_s) foreach { case (en, r) => + when (en) { + r.state := GET + } } } - (a_cam_sel_free zip cam_s) foreach { case (en, r) => - when (en) { - r.state := GET + + // Advance the put state + when (source_c.fire()) { + (a_cam_sel_put zip cam_s) foreach { case (en, r) => + when (en) { + r.state := ACK + } } } - } - // Advance the put state - when (source_c.fire()) { - (a_cam_sel_put zip cam_s) foreach { case (en, r) => - when (en) { - r.state := ACK + // We need to deal with a potential D response in the same cycle as the A request + val d_cam_sel_raw = cam_a.map(_.bits.source === in.d.bits.source) + val d_cam_sel_match = (d_cam_sel_raw zip cam_dmatch) map { case (a,b) => a&&b } + val d_cam_data = Mux1H(d_cam_sel_match, cam_d.map(_.data)) + val d_cam_sel_bypass = if (edgeOut.manager.minLatency > 0) Bool(false) else + out.d.bits.source === in.a.bits.source && in.a.valid && !a_isSupported + val d_cam_sel = (a_cam_sel_free zip d_cam_sel_match) map { case (a,d) => Mux(d_cam_sel_bypass, a, d) } + val d_cam_sel_any = d_cam_sel_bypass || d_cam_sel_match.reduce(_ || _) + val d_ackd = out.d.bits.opcode === TLMessages.AccessAckData + val d_ack = out.d.bits.opcode === TLMessages.AccessAck + + when (out.d.fire()) { + (d_cam_sel zip cam_d) foreach { case (en, r) => + when (en && d_ackd) { + r.data := out.d.bits.data + } + } + (d_cam_sel zip cam_s) foreach { case (en, r) => + when (en) { + // Note: it is important that this comes AFTER the := GET, so we can go FREE=>GET=>AMO in one cycle + r.state := Mux(d_ackd, AMO, FREE) + } } 
} - } - // We need to deal with a potential D response in the same cycle as the A request - val d_cam_sel_raw = cam_a.map(_.bits.source === in.d.bits.source) - val d_cam_sel_match = (d_cam_sel_raw zip cam_dmatch) map { case (a,b) => a&&b } - val d_cam_data = Mux1H(d_cam_sel_match, cam_d.map(_.data)) - val d_cam_sel_bypass = if (edgeOut.manager.minLatency > 0) Bool(false) else - out.d.bits.source === in.a.bits.source && in.a.valid && !a_isSupported - val d_cam_sel = (a_cam_sel_free zip d_cam_sel_match) map { case (a,d) => Mux(d_cam_sel_bypass, a, d) } - val d_cam_sel_any = d_cam_sel_bypass || d_cam_sel_match.reduce(_ || _) - val d_ackd = out.d.bits.opcode === TLMessages.AccessAckData - val d_ack = out.d.bits.opcode === TLMessages.AccessAck + val d_drop = d_ackd && d_cam_sel_any + val d_replace = d_ack && d_cam_sel_match.reduce(_ || _) - when (out.d.fire()) { - (d_cam_sel zip cam_d) foreach { case (en, r) => - when (en && d_ackd) { - r.data := out.d.bits.data - } - } - (d_cam_sel zip cam_s) foreach { case (en, r) => - when (en) { - // Note: it is important that this comes AFTER the := GET, so we can go FREE=>GET=>AMO in one cycle - r.state := Mux(d_ackd, AMO, FREE) - } + in.d.valid := out.d.valid && !d_drop + out.d.ready := in.d.ready || d_drop + + in.d.bits := out.d.bits + when (d_replace) { // minimal muxes + in.d.bits.opcode := TLMessages.AccessAckData + in.d.bits.data := d_cam_data } + } else { + out.a.valid := in.a.valid + in.a.ready := out.a.ready + out.a.bits := in.a.bits + + in.d.valid := out.d.valid + out.d.ready := in.d.ready + in.d.bits := out.d.bits } - val d_drop = d_ackd && d_cam_sel_any - val d_replace = d_ack && d_cam_sel_match.reduce(_ || _) + if (edgeOut.manager.anySupportAcquireB && edgeIn.client.anySupportProbe) { + in.b.valid := out.b.valid + out.b.ready := in.b.ready + in.b.bits := out.b.bits - in.d.valid := out.d.valid && !d_drop - out.d.ready := in.d.ready || d_drop + out.c.valid := in.c.valid + in.c.ready := out.c.ready + out.c.bits := 
in.c.bits - in.d.bits := out.d.bits - when (d_replace) { // minimal muxes - in.d.bits.opcode := TLMessages.AccessAckData - in.d.bits.data := d_cam_data + out.e.valid := in.e.valid + in.e.ready := out.e.ready + out.e.bits := in.e.bits + } else { + in.b.valid := Bool(false) + in.c.ready := Bool(true) + in.e.ready := Bool(true) + out.b.ready := Bool(true) + out.c.valid := Bool(false) + out.e.valid := Bool(false) } - } else { - out.a.valid := in.a.valid - in.a.ready := out.a.ready - out.a.bits := in.a.bits - - in.d.valid := out.d.valid - out.d.ready := in.d.ready - in.d.bits := out.d.bits - } - - if (edgeOut.manager.anySupportAcquireB && edgeIn.client.anySupportProbe) { - in.b.valid := out.b.valid - out.b.ready := in.b.ready - in.b.bits := out.b.bits - - out.c.valid := in.c.valid - in.c.ready := out.c.ready - out.c.bits := in.c.bits - - out.e.valid := in.e.valid - in.e.ready := out.e.ready - out.e.bits := in.e.bits - } else { - in.b.valid := Bool(false) - in.c.ready := Bool(true) - in.e.ready := Bool(true) - out.b.ready := Bool(true) - out.c.valid := Bool(false) - out.e.valid := Bool(false) } } } @@ -284,6 +272,20 @@ object TLAtomicAutomata atomics.node := x atomics.node } + + case class CAMParams(a: TLBundleParameters, domainsNeedingHelp: Int) + + class CAM_S(params: CAMParams) extends GenericParameterizedBundle(params) { + val state = UInt(width = 2) + } + class CAM_A(params: CAMParams) extends GenericParameterizedBundle(params) { + val bits = new TLBundleA(params.a) + val fifoId = UInt(width = log2Up(params.domainsNeedingHelp)) + val lut = UInt(width = 4) + } + class CAM_D(params: CAMParams) extends GenericParameterizedBundle(params) { + val data = UInt(width = params.a.dataBits) + } } /** Synthesizeable unit tests */ diff --git a/src/main/scala/uncore/tilelink2/Broadcast.scala b/src/main/scala/uncore/tilelink2/Broadcast.scala index 8e7f9629..77fd6c52 100644 --- a/src/main/scala/uncore/tilelink2/Broadcast.scala +++ b/src/main/scala/uncore/tilelink2/Broadcast.scala 
@@ -13,11 +13,11 @@ class TLBroadcast(lineBytes: Int, numTrackers: Int = 4, bufferless: Boolean = fa require (numTrackers > 0) val node = TLAdapterNode( - clientFn = { case Seq(cp) => + clientFn = { cp => cp.copy(clients = Seq(TLClientParameters( sourceId = IdRange(0, 1 << log2Ceil(cp.endSourceId*4))))) }, - managerFn = { case Seq(mp) => + managerFn = { mp => mp.copy( endSinkId = numTrackers, managers = mp.managers.map { m => @@ -56,154 +56,152 @@ class TLBroadcast(lineBytes: Int, numTrackers: Int = 4, bufferless: Boolean = fa val out = node.bundleOut } - val in = io.in(0) - val out = io.out(0) - val edgeIn = node.edgesIn(0) - val edgeOut = node.edgesOut(0) - val clients = edgeIn.client.clients - val managers = edgeOut.manager.managers - val lineShift = log2Ceil(lineBytes) + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + val clients = edgeIn.client.clients + val managers = edgeOut.manager.managers + val lineShift = log2Ceil(lineBytes) - import TLBroadcastConstants._ + import TLBroadcastConstants._ - require (lineBytes >= edgeOut.manager.beatBytes) - // For the probe walker, we need to identify all the caches - val caches = clients.filter(_.supportsProbe).map(_.sourceId) - val cache_targets = caches.map(c => UInt(c.start)) + require (lineBytes >= edgeOut.manager.beatBytes) + // For the probe walker, we need to identify all the caches + val caches = clients.filter(_.supportsProbe).map(_.sourceId) + val cache_targets = caches.map(c => UInt(c.start)) - // Create the request tracker queues - val trackers = Seq.tabulate(numTrackers) { id => - Module(new TLBroadcastTracker(id, lineBytes, log2Up(caches.size+1), bufferless, edgeIn, edgeOut)).io + // Create the request tracker queues + val trackers = Seq.tabulate(numTrackers) { id => + Module(new TLBroadcastTracker(id, lineBytes, log2Up(caches.size+1), bufferless, edgeIn, edgeOut)).io + } + + // We always accept E + in.e.ready := Bool(true) + (trackers zip 
UIntToOH(in.e.bits.sink).toBools) foreach { case (tracker, select) => + tracker.e_last := select && in.e.fire() + } + + // Depending on the high source bits, we might transform D + val d_high = log2Ceil(edgeIn.client.endSourceId) + val d_what = out.d.bits.source(d_high+1, d_high) + val d_drop = d_what === DROP + val d_hasData = edgeOut.hasData(out.d.bits) + val d_normal = Wire(in.d) + val d_trackerOH = Vec(trackers.map { t => !t.idle && t.source === d_normal.bits.source }).asUInt + + assert (!out.d.valid || !d_drop || out.d.bits.opcode === TLMessages.AccessAck) + + out.d.ready := d_normal.ready || d_drop + d_normal.valid := out.d.valid && !d_drop + d_normal.bits := out.d.bits // truncates source + when (d_what(1)) { // TRANSFORM_* + d_normal.bits.opcode := Mux(d_hasData, TLMessages.GrantData, TLMessages.ReleaseAck) + d_normal.bits.param := Mux(d_hasData, Mux(d_what(0), TLPermissions.toT, TLPermissions.toB), UInt(0)) + } + d_normal.bits.sink := OHToUInt(d_trackerOH) + assert (!d_normal.valid || (d_trackerOH.orR() || d_normal.bits.opcode === TLMessages.ReleaseAck)) + + // A tracker response is anything neither dropped nor a ReleaseAck + val d_response = d_hasData || !d_what(1) + val d_last = edgeIn.last(d_normal) + (trackers zip d_trackerOH.toBools) foreach { case (tracker, select) => + tracker.d_last := select && d_normal.fire() && d_response && d_last + tracker.probedack := select && out.d.fire() && d_drop + } + + // Incoming C can be: + // ProbeAck => decrement tracker, drop + // ProbeAckData => decrement tracker, send out A as PutFull(DROP) + // ReleaseData => send out A as PutFull(TRANSFORM) + // Release => send out D as ReleaseAck + + val c_probeack = in.c.bits.opcode === TLMessages.ProbeAck + val c_probeackdata = in.c.bits.opcode === TLMessages.ProbeAckData + val c_releasedata = in.c.bits.opcode === TLMessages.ReleaseData + val c_release = in.c.bits.opcode === TLMessages.Release + val c_trackerOH = trackers.map { t => t.line === (in.c.bits.address >> 
lineShift) } + val c_trackerSrc = Mux1H(c_trackerOH, trackers.map { _.source }) + + // Decrement the tracker's outstanding probe counter + (trackers zip c_trackerOH) foreach { case (tracker, select) => + tracker.probenack := in.c.fire() && c_probeack && select + } + + val releaseack = Wire(in.d) + val putfull = Wire(out.a) + + in.c.ready := c_probeack || Mux(c_release, releaseack.ready, putfull.ready) + + releaseack.valid := in.c.valid && c_release + releaseack.bits := edgeIn.ReleaseAck(in.c.bits.address, UInt(0), in.c.bits.source, in.c.bits.size) + + val put_what = Mux(c_releasedata, TRANSFORM_B, DROP) + val put_who = Mux(c_releasedata, in.c.bits.source, c_trackerSrc) + putfull.valid := in.c.valid && (c_probeackdata || c_releasedata) + putfull.bits := edgeOut.Put(Cat(put_what, put_who), in.c.bits.address, in.c.bits.size, in.c.bits.data)._2 + + // Combine ReleaseAck or the modified D + TLArbiter.lowest(edgeOut, in.d, releaseack, d_normal) + // Combine the PutFull with the trackers + TLArbiter.lowestFromSeq(edgeOut, out.a, putfull +: trackers.map(_.out_a)) + + // The Probe FSM walks all caches and probes them + val probe_todo = RegInit(UInt(0, width = max(1, caches.size))) + val probe_line = Reg(UInt()) + val probe_perms = Reg(UInt(width = 2)) + val probe_next = probe_todo & ~(leftOR(probe_todo) << 1) + val probe_busy = probe_todo.orR() + val probe_target = if (caches.size == 0) UInt(0) else Mux1H(probe_next, cache_targets) + + // Probe whatever the FSM wants to do next + in.b.valid := probe_busy + if (caches.size != 0) { + in.b.bits := edgeIn.Probe(probe_line << lineShift, probe_target, UInt(lineShift), probe_perms)._2 + } + when (in.b.fire()) { probe_todo := probe_todo & ~probe_next } + + // Which cache does a request come from? 
+ val a_cache = if (caches.size == 0) UInt(1) else Vec(caches.map(_.contains(in.a.bits.source))).asUInt + val a_first = edgeIn.first(in.a) + + // To accept a request from A, the probe FSM must be idle and there must be a matching tracker + val freeTrackers = Vec(trackers.map { t => t.idle }).asUInt + val freeTracker = freeTrackers.orR() + val matchTrackers = Vec(trackers.map { t => t.line === in.a.bits.address >> lineShift }).asUInt + val matchTracker = matchTrackers.orR() + val allocTracker = freeTrackers & ~(leftOR(freeTrackers) << 1) + val selectTracker = Mux(matchTracker, matchTrackers, allocTracker) + + val trackerReady = Vec(trackers.map(_.in_a.ready)).asUInt + in.a.ready := (!a_first || !probe_busy) && (selectTracker & trackerReady).orR() + (trackers zip selectTracker.toBools) foreach { case (t, select) => + t.in_a.valid := in.a.valid && select && (!a_first || !probe_busy) + t.in_a.bits := in.a.bits + t.in_a_first := a_first + t.probe := (if (caches.size == 0) UInt(0) else Mux(a_cache.orR(), UInt(caches.size-1), UInt(caches.size))) + } + + when (in.a.fire() && a_first) { + probe_todo := ~a_cache // probe all but the cache who poked us + probe_line := in.a.bits.address >> lineShift + probe_perms := MuxLookup(in.a.bits.opcode, Wire(UInt(width = 2)), Array( + TLMessages.PutFullData -> TLPermissions.toN, + TLMessages.PutPartialData -> TLPermissions.toN, + TLMessages.ArithmeticData -> TLPermissions.toN, + TLMessages.LogicalData -> TLPermissions.toN, + TLMessages.Get -> TLPermissions.toB, + TLMessages.Hint -> MuxLookup(in.a.bits.param, Wire(UInt(width = 2)), Array( + TLHints.PREFETCH_READ -> TLPermissions.toB, + TLHints.PREFETCH_WRITE -> TLPermissions.toN)), + TLMessages.Acquire -> MuxLookup(in.a.bits.param, Wire(UInt(width = 2)), Array( + TLPermissions.NtoB -> TLPermissions.toB, + TLPermissions.NtoT -> TLPermissions.toN, + TLPermissions.BtoT -> TLPermissions.toN)))) + } + + // The outer TL connections may not be cached + out.b.ready := Bool(true) + out.c.valid := 
Bool(false) + out.e.valid := Bool(false) } - - // We always accept E - in.e.ready := Bool(true) - (trackers zip UIntToOH(in.e.bits.sink).toBools) foreach { case (tracker, select) => - tracker.e_last := select && in.e.fire() - } - - // Depending on the high source bits, we might transform D - val d_high = log2Ceil(edgeIn.client.endSourceId) - val d_what = out.d.bits.source(d_high+1, d_high) - val d_drop = d_what === DROP - val d_hasData = edgeOut.hasData(out.d.bits) - val d_normal = Wire(in.d) - val d_trackerOH = Vec(trackers.map { t => !t.idle && t.source === d_normal.bits.source }).asUInt - - assert (!out.d.valid || !d_drop || out.d.bits.opcode === TLMessages.AccessAck) - - out.d.ready := d_normal.ready || d_drop - d_normal.valid := out.d.valid && !d_drop - d_normal.bits := out.d.bits // truncates source - when (d_what(1)) { // TRANSFORM_* - d_normal.bits.opcode := Mux(d_hasData, TLMessages.GrantData, TLMessages.ReleaseAck) - d_normal.bits.param := Mux(d_hasData, Mux(d_what(0), TLPermissions.toT, TLPermissions.toB), UInt(0)) - } - d_normal.bits.sink := OHToUInt(d_trackerOH) - assert (!d_normal.valid || (d_trackerOH.orR() || d_normal.bits.opcode === TLMessages.ReleaseAck)) - - // A tracker response is anything neither dropped nor a ReleaseAck - val d_response = d_hasData || !d_what(1) - val d_last = edgeIn.last(d_normal) - (trackers zip d_trackerOH.toBools) foreach { case (tracker, select) => - tracker.d_last := select && d_normal.fire() && d_response && d_last - tracker.probedack := select && out.d.fire() && d_drop - } - - // Incoming C can be: - // ProbeAck => decrement tracker, drop - // ProbeAckData => decrement tracker, send out A as PutFull(DROP) - // ReleaseData => send out A as PutFull(TRANSFORM) - // Release => send out D as ReleaseAck - - val c_probeack = in.c.bits.opcode === TLMessages.ProbeAck - val c_probeackdata = in.c.bits.opcode === TLMessages.ProbeAckData - val c_releasedata = in.c.bits.opcode === TLMessages.ReleaseData - val c_release = 
in.c.bits.opcode === TLMessages.Release - val c_trackerOH = trackers.map { t => t.line === (in.c.bits.address >> lineShift) } - val c_trackerSrc = Mux1H(c_trackerOH, trackers.map { _.source }) - - // Decrement the tracker's outstanding probe counter - (trackers zip c_trackerOH) foreach { case (tracker, select) => - tracker.probenack := in.c.fire() && c_probeack && select - } - - val releaseack = Wire(in.d) - val putfull = Wire(out.a) - - in.c.ready := c_probeack || Mux(c_release, releaseack.ready, putfull.ready) - - releaseack.valid := in.c.valid && c_release - releaseack.bits := edgeIn.ReleaseAck(in.c.bits.address, UInt(0), in.c.bits.source, in.c.bits.size) - - val put_what = Mux(c_releasedata, TRANSFORM_B, DROP) - val put_who = Mux(c_releasedata, in.c.bits.source, c_trackerSrc) - putfull.valid := in.c.valid && (c_probeackdata || c_releasedata) - putfull.bits := edgeOut.Put(Cat(put_what, put_who), in.c.bits.address, in.c.bits.size, in.c.bits.data)._2 - - // Combine ReleaseAck or the modified D - TLArbiter.lowest(edgeOut, in.d, releaseack, d_normal) - // Combine the PutFull with the trackers - TLArbiter.lowestFromSeq(edgeOut, out.a, putfull +: trackers.map(_.out_a)) - - // The Probe FSM walks all caches and probes them - val probe_todo = RegInit(UInt(0, width = max(1, caches.size))) - val probe_line = Reg(UInt()) - val probe_perms = Reg(UInt(width = 2)) - val probe_next = probe_todo & ~(leftOR(probe_todo) << 1) - val probe_busy = probe_todo.orR() - val probe_target = if (caches.size == 0) UInt(0) else Mux1H(probe_next, cache_targets) - - // Probe whatever the FSM wants to do next - in.b.valid := probe_busy - if (caches.size != 0) { - in.b.bits := edgeIn.Probe(probe_line << lineShift, probe_target, UInt(lineShift), probe_perms)._2 - } - when (in.b.fire()) { probe_todo := probe_todo & ~probe_next } - - // Which cache does a request come from? 
- val a_cache = if (caches.size == 0) UInt(1) else Vec(caches.map(_.contains(in.a.bits.source))).asUInt - val a_first = edgeIn.first(in.a) - - // To accept a request from A, the probe FSM must be idle and there must be a matching tracker - val freeTrackers = Vec(trackers.map { t => t.idle }).asUInt - val freeTracker = freeTrackers.orR() - val matchTrackers = Vec(trackers.map { t => t.line === in.a.bits.address >> lineShift }).asUInt - val matchTracker = matchTrackers.orR() - val allocTracker = freeTrackers & ~(leftOR(freeTrackers) << 1) - val selectTracker = Mux(matchTracker, matchTrackers, allocTracker) - - val trackerReady = Vec(trackers.map(_.in_a.ready)).asUInt - in.a.ready := (!a_first || !probe_busy) && (selectTracker & trackerReady).orR() - (trackers zip selectTracker.toBools) foreach { case (t, select) => - t.in_a.valid := in.a.valid && select && (!a_first || !probe_busy) - t.in_a.bits := in.a.bits - t.in_a_first := a_first - t.probe := (if (caches.size == 0) UInt(0) else Mux(a_cache.orR(), UInt(caches.size-1), UInt(caches.size))) - } - - when (in.a.fire() && a_first) { - probe_todo := ~a_cache // probe all but the cache who poked us - probe_line := in.a.bits.address >> lineShift - probe_perms := MuxLookup(in.a.bits.opcode, Wire(UInt(width = 2)), Array( - TLMessages.PutFullData -> TLPermissions.toN, - TLMessages.PutPartialData -> TLPermissions.toN, - TLMessages.ArithmeticData -> TLPermissions.toN, - TLMessages.LogicalData -> TLPermissions.toN, - TLMessages.Get -> TLPermissions.toB, - TLMessages.Hint -> MuxLookup(in.a.bits.param, Wire(UInt(width = 2)), Array( - TLHints.PREFETCH_READ -> TLPermissions.toB, - TLHints.PREFETCH_WRITE -> TLPermissions.toN)), - TLMessages.Acquire -> MuxLookup(in.a.bits.param, Wire(UInt(width = 2)), Array( - TLPermissions.NtoB -> TLPermissions.toB, - TLPermissions.NtoT -> TLPermissions.toN, - TLPermissions.BtoT -> TLPermissions.toN)))) - } - - // The outer TL connections may not be cached - out.b.ready := Bool(true) - out.c.valid := 
Bool(false) - out.e.valid := Bool(false) } } diff --git a/src/main/scala/uncore/tilelink2/Buffer.scala b/src/main/scala/uncore/tilelink2/Buffer.scala index c25f5561..8a864ec3 100644 --- a/src/main/scala/uncore/tilelink2/Buffer.scala +++ b/src/main/scala/uncore/tilelink2/Buffer.scala @@ -18,8 +18,8 @@ class TLBuffer(a: Int = 2, b: Int = 2, c: Int = 2, d: Int = 2, e: Int = 2, pipe: require (e >= 0) val node = TLAdapterNode( - clientFn = { case Seq(p) => p.copy(minLatency = p.minLatency + min(1,b) + min(1,c)) }, - managerFn = { case Seq(p) => p.copy(minLatency = p.minLatency + min(1,a) + min(1,d)) }) + clientFn = { p => p.copy(minLatency = p.minLatency + min(1,b) + min(1,c)) }, + managerFn = { p => p.copy(minLatency = p.minLatency + min(1,a) + min(1,d)) }) lazy val module = new LazyModuleImp(this) { val io = new Bundle { diff --git a/src/main/scala/uncore/tilelink2/CacheCork.scala b/src/main/scala/uncore/tilelink2/CacheCork.scala index 60e7305d..5b245b0b 100644 --- a/src/main/scala/uncore/tilelink2/CacheCork.scala +++ b/src/main/scala/uncore/tilelink2/CacheCork.scala @@ -12,10 +12,10 @@ import TLMessages._ class TLCacheCork(unsafe: Boolean = false)(implicit p: Parameters) extends LazyModule { val node = TLAdapterNode( - clientFn = { case Seq(cp) => + clientFn = { case cp => cp.copy(clients = cp.clients.map { c => c.copy( sourceId = IdRange(c.sourceId.start*2, c.sourceId.end*2))})}, - managerFn = { case Seq(mp) => + managerFn = { case mp => mp.copy(managers = mp.managers.map { m => m.copy( regionType = if (m.regionType == RegionType.UNCACHED) RegionType.TRACKED else m.regionType, supportsAcquireB = m.supportsGet, @@ -27,93 +27,89 @@ class TLCacheCork(unsafe: Boolean = false)(implicit p: Parameters) extends LazyM val out = node.bundleOut } - val edgeIn = node.edgesIn(0) - val edgeOut = node.edgesOut(0) + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + require (edgeIn.client.clients.size == 1 || unsafe, "Only 
one client can safely use a TLCacheCork") + require (edgeIn.client.clients.filter(_.supportsProbe).size == 1, "Only one caching client allowed") + edgeOut.manager.managers.foreach { case m => + require (!m.supportsAcquireB, "Cannot support caches beyond the Cork") + } - require (edgeIn.client.clients.size == 1 || unsafe, "Only one client can safely use a TLCacheCork") - require (edgeIn.client.clients.filter(_.supportsProbe).size == 1, "Only one caching client allowed") - edgeOut.manager.managers.foreach { case m => - require (!m.supportsAcquireB, "Cannot support caches beyond the Cork") + // The Cork turns [Acquire=>Get] => [AccessAckData=>GrantData] + // and [ReleaseData=>PutFullData] => [AccessAck=>ReleaseAck] + // We need to encode information sufficient to reverse the transformation in output. + // A caveat is that we get Acquire+Release with the same source and must keep the + // source unique after transformation onto the A channel. + // The coding scheme is: + // Put: 1, Release: 0 => AccessAck + // *: 0, Acquire: 1 => AccessAckData + + // Take requests from A to A + val isPut = in.a.bits.opcode === PutFullData || in.a.bits.opcode === PutPartialData + val a_a = Wire(out.a) + a_a <> in.a + a_a.bits.source := in.a.bits.source << 1 | Mux(isPut, UInt(1), UInt(0)) + + // Transform Acquire into Get + when (in.a.bits.opcode === Acquire) { + a_a.bits.opcode := Get + a_a.bits.param := UInt(0) + a_a.bits.source := in.a.bits.source << 1 | UInt(1) + } + + // Take ReleaseData from C to A; Release from C to D + val c_a = Wire(out.a) + c_a.valid := in.c.valid && in.c.bits.opcode === ReleaseData + c_a.bits.opcode := PutFullData + c_a.bits.param := UInt(0) + c_a.bits.size := in.c.bits.size + c_a.bits.source := in.c.bits.source << 1 + c_a.bits.address := in.c.bits.address + c_a.bits.mask := edgeOut.mask(in.c.bits.address, in.c.bits.size) + c_a.bits.data := in.c.bits.data + + val c_d = Wire(in.d) + c_d.valid := in.c.valid && in.c.bits.opcode === Release + c_d.bits.opcode := 
ReleaseAck + c_d.bits.param := UInt(0) + c_d.bits.size := in.c.bits.size + c_d.bits.source := in.c.bits.source + c_d.bits.sink := UInt(0) + c_d.bits.addr_lo := in.c.bits.address + c_d.bits.data := UInt(0) + c_d.bits.error := Bool(false) + + assert (!in.c.valid || in.c.bits.opcode === Release || in.c.bits.opcode === ReleaseData) + in.c.ready := Mux(in.c.bits.opcode === Release, c_d.ready, c_a.ready) + + // Discard E + in.e.ready := Bool(true) + + // Block B; should never happen + out.b.ready := Bool(false) + assert (!out.b.valid) + + // Take responses from D and transform them + val d_d = Wire(in.d) + d_d <> out.d + d_d.bits.source := out.d.bits.source >> 1 + + when (out.d.bits.opcode === AccessAckData && out.d.bits.source(0)) { + d_d.bits.opcode := GrantData + d_d.bits.param := TLPermissions.toT + } + when (out.d.bits.opcode === AccessAck && !out.d.bits.source(0)) { + d_d.bits.opcode := ReleaseAck + } + + // Combine the sources of messages into the channels + TLArbiter(TLArbiter.lowestIndexFirst)(out.a, (edgeOut.numBeats1(c_a.bits), c_a), (edgeOut.numBeats1(a_a.bits), a_a)) + TLArbiter(TLArbiter.lowestIndexFirst)(in.d, (edgeIn .numBeats1(d_d.bits), d_d), (UInt(0), Queue(c_d, 2))) + + // Tie off unused ports + in.b.valid := Bool(false) + out.c.valid := Bool(false) + out.e.valid := Bool(false) } - - val out = io.out(0) - val in = io.in(0) - - // The Cork turns [Acquire=>Get] => [AccessAckData=>GrantData] - // and [ReleaseData=>PutFullData] => [AccessAck=>ReleaseAck] - // We need to encode information sufficient to reverse the transformation in output. - // A caveat is that we get Acquire+Release with the same source and must keep the - // source unique after transformation onto the A channel. 
- // The coding scheme is: - // Put: 1, Release: 0 => AccessAck - // *: 0, Acquire: 1 => AccessAckData - - // Take requests from A to A - val isPut = in.a.bits.opcode === PutFullData || in.a.bits.opcode === PutPartialData - val a_a = Wire(out.a) - a_a <> in.a - a_a.bits.source := in.a.bits.source << 1 | Mux(isPut, UInt(1), UInt(0)) - - // Transform Acquire into Get - when (in.a.bits.opcode === Acquire) { - a_a.bits.opcode := Get - a_a.bits.param := UInt(0) - a_a.bits.source := in.a.bits.source << 1 | UInt(1) - } - - // Take ReleaseData from C to A; Release from C to D - val c_a = Wire(out.a) - c_a.valid := in.c.valid && in.c.bits.opcode === ReleaseData - c_a.bits.opcode := PutFullData - c_a.bits.param := UInt(0) - c_a.bits.size := in.c.bits.size - c_a.bits.source := in.c.bits.source << 1 - c_a.bits.address := in.c.bits.address - c_a.bits.mask := edgeOut.mask(in.c.bits.address, in.c.bits.size) - c_a.bits.data := in.c.bits.data - - val c_d = Wire(in.d) - c_d.valid := in.c.valid && in.c.bits.opcode === Release - c_d.bits.opcode := ReleaseAck - c_d.bits.param := UInt(0) - c_d.bits.size := in.c.bits.size - c_d.bits.source := in.c.bits.source - c_d.bits.sink := UInt(0) - c_d.bits.addr_lo := in.c.bits.address - c_d.bits.data := UInt(0) - c_d.bits.error := Bool(false) - - assert (!in.c.valid || in.c.bits.opcode === Release || in.c.bits.opcode === ReleaseData) - in.c.ready := Mux(in.c.bits.opcode === Release, c_d.ready, c_a.ready) - - // Discard E - in.e.ready := Bool(true) - - // Block B; should never happen - out.b.ready := Bool(false) - assert (!out.b.valid) - - // Take responses from D and transform them - val d_d = Wire(in.d) - d_d <> out.d - d_d.bits.source := out.d.bits.source >> 1 - - when (out.d.bits.opcode === AccessAckData && out.d.bits.source(0)) { - d_d.bits.opcode := GrantData - d_d.bits.param := TLPermissions.toT - } - when (out.d.bits.opcode === AccessAck && !out.d.bits.source(0)) { - d_d.bits.opcode := ReleaseAck - } - - // Combine the sources of messages 
into the channels - TLArbiter(TLArbiter.lowestIndexFirst)(out.a, (edgeOut.numBeats1(c_a.bits), c_a), (edgeOut.numBeats1(a_a.bits), a_a)) - TLArbiter(TLArbiter.lowestIndexFirst)(in.d, (edgeIn .numBeats1(d_d.bits), d_d), (UInt(0), Queue(c_d, 2))) - - // Tie off unused ports - in.b.valid := Bool(false) - out.c.valid := Bool(false) - out.e.valid := Bool(false) } } diff --git a/src/main/scala/uncore/tilelink2/Filter.scala b/src/main/scala/uncore/tilelink2/Filter.scala index 3c0a04b7..7c3899ff 100644 --- a/src/main/scala/uncore/tilelink2/Filter.scala +++ b/src/main/scala/uncore/tilelink2/Filter.scala @@ -11,8 +11,8 @@ import scala.math.{min,max} class TLFilter(select: AddressSet)(implicit p: Parameters) extends LazyModule { val node = TLAdapterNode( - clientFn = { case Seq(cp) => cp }, - managerFn = { case Seq(mp) => + clientFn = { cp => cp }, + managerFn = { mp => mp.copy(managers = mp.managers.map { m => val filtered = m.address.map(_.intersect(select)).flatten val alignment = select.alignment /* alignment 0 means 'select' selected everything */ diff --git a/src/main/scala/uncore/tilelink2/Fragmenter.scala b/src/main/scala/uncore/tilelink2/Fragmenter.scala index 934fb9b1..1b5a1538 100644 --- a/src/main/scala/uncore/tilelink2/Fragmenter.scala +++ b/src/main/scala/uncore/tilelink2/Fragmenter.scala @@ -41,8 +41,8 @@ class TLFragmenter(val minSize: Int, val maxSize: Int, val alwaysMin: Boolean = sourceId = IdRange(c.sourceId.start << fragmentBits, c.sourceId.end << fragmentBits)) val node = TLAdapterNode( - clientFn = { case Seq(c) => c.copy(clients = c.clients.map(mapClient)) }, - managerFn = { case Seq(m) => m.copy(managers = m.managers.map(mapManager)) }) + clientFn = { c => c.copy(clients = c.clients.map(mapClient)) }, + managerFn = { m => m.copy(managers = m.managers.map(mapManager)) }) lazy val module = new LazyModuleImp(this) { val io = new Bundle { @@ -50,204 +50,201 @@ class TLFragmenter(val minSize: Int, val maxSize: Int, val alwaysMin: Boolean = val out = 
node.bundleOut } - // All managers must share a common FIFO domain (responses might end up interleaved) - val edgeOut = node.edgesOut(0) - val edgeIn = node.edgesIn(0) - val manager = edgeOut.manager - val managers = manager.managers - val beatBytes = manager.beatBytes - val fifoId = managers(0).fifoId - require (fifoId.isDefined && managers.map(_.fifoId == fifoId).reduce(_ && _)) + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + // All managers must share a common FIFO domain (responses might end up interleaved) + val manager = edgeOut.manager + val managers = manager.managers + val beatBytes = manager.beatBytes + val fifoId = managers(0).fifoId + require (fifoId.isDefined && managers.map(_.fifoId == fifoId).reduce(_ && _)) - // We don't support fragmenting to sub-beat accesses - require (minSize >= beatBytes) - // We can't support devices which are cached on both sides of us - require (!edgeOut.manager.anySupportAcquireB || !edgeIn.client.anySupportProbe) + // We don't support fragmenting to sub-beat accesses + require (minSize >= beatBytes) + // We can't support devices which are cached on both sides of us + require (!edgeOut.manager.anySupportAcquireB || !edgeIn.client.anySupportProbe) - /* The Fragmenter is a bit tricky, because there are 5 sizes in play: - * max size -- the maximum transfer size possible - * orig size -- the original pre-fragmenter size - * frag size -- the modified post-fragmenter size - * min size -- the threshold below which frag=orig - * beat size -- the amount transfered on any given beat - * - * The relationships are as follows: - * max >= orig >= frag - * max > min >= beat - * It IS possible that orig <= min (then frag=orig; ie: no fragmentation) - * - * The fragment# (sent via TL.source) is measured in multiples of min size. - * Meanwhile, to track the progress, counters measure in multiples of beat size. 
- * - * Here is an example of a bus with max=256, min=8, beat=4 and a device supporting 16. - * - * in.A out.A (frag#) out.D (frag#) in.D gen# ack# - * get64 get16 6 ackD16 6 ackD64 12 15 - * ackD16 6 ackD64 14 - * ackD16 6 ackD64 13 - * ackD16 6 ackD64 12 - * get16 4 ackD16 4 ackD64 8 11 - * ackD16 4 ackD64 10 - * ackD16 4 ackD64 9 - * ackD16 4 ackD64 8 - * get16 2 ackD16 2 ackD64 4 7 - * ackD16 2 ackD64 6 - * ackD16 2 ackD64 5 - * ackD16 2 ackD64 4 - * get16 0 ackD16 0 ackD64 0 3 - * ackD16 0 ackD64 2 - * ackD16 0 ackD64 1 - * ackD16 0 ackD64 0 - * - * get8 get8 0 ackD8 0 ackD8 0 1 - * ackD8 0 ackD8 0 - * - * get4 get4 0 ackD4 0 ackD4 0 0 - * get1 get1 0 ackD1 0 ackD1 0 0 - * - * put64 put16 6 15 - * put64 put16 6 14 - * put64 put16 6 13 - * put64 put16 6 ack16 6 12 12 - * put64 put16 4 11 - * put64 put16 4 10 - * put64 put16 4 9 - * put64 put16 4 ack16 4 8 8 - * put64 put16 2 7 - * put64 put16 2 6 - * put64 put16 2 5 - * put64 put16 2 ack16 2 4 4 - * put64 put16 0 3 - * put64 put16 0 2 - * put64 put16 0 1 - * put64 put16 0 ack16 0 ack64 0 0 - * - * put8 put8 0 1 - * put8 put8 0 ack8 0 ack8 0 0 - * - * put4 put4 0 ack4 0 ack4 0 0 - * put1 put1 0 ack1 0 ack1 0 0 - */ + /* The Fragmenter is a bit tricky, because there are 5 sizes in play: + * max size -- the maximum transfer size possible + * orig size -- the original pre-fragmenter size + * frag size -- the modified post-fragmenter size + * min size -- the threshold below which frag=orig + * beat size -- the amount transfered on any given beat + * + * The relationships are as follows: + * max >= orig >= frag + * max > min >= beat + * It IS possible that orig <= min (then frag=orig; ie: no fragmentation) + * + * The fragment# (sent via TL.source) is measured in multiples of min size. + * Meanwhile, to track the progress, counters measure in multiples of beat size. + * + * Here is an example of a bus with max=256, min=8, beat=4 and a device supporting 16. 
+ * + * in.A out.A (frag#) out.D (frag#) in.D gen# ack# + * get64 get16 6 ackD16 6 ackD64 12 15 + * ackD16 6 ackD64 14 + * ackD16 6 ackD64 13 + * ackD16 6 ackD64 12 + * get16 4 ackD16 4 ackD64 8 11 + * ackD16 4 ackD64 10 + * ackD16 4 ackD64 9 + * ackD16 4 ackD64 8 + * get16 2 ackD16 2 ackD64 4 7 + * ackD16 2 ackD64 6 + * ackD16 2 ackD64 5 + * ackD16 2 ackD64 4 + * get16 0 ackD16 0 ackD64 0 3 + * ackD16 0 ackD64 2 + * ackD16 0 ackD64 1 + * ackD16 0 ackD64 0 + * + * get8 get8 0 ackD8 0 ackD8 0 1 + * ackD8 0 ackD8 0 + * + * get4 get4 0 ackD4 0 ackD4 0 0 + * get1 get1 0 ackD1 0 ackD1 0 0 + * + * put64 put16 6 15 + * put64 put16 6 14 + * put64 put16 6 13 + * put64 put16 6 ack16 6 12 12 + * put64 put16 4 11 + * put64 put16 4 10 + * put64 put16 4 9 + * put64 put16 4 ack16 4 8 8 + * put64 put16 2 7 + * put64 put16 2 6 + * put64 put16 2 5 + * put64 put16 2 ack16 2 4 4 + * put64 put16 0 3 + * put64 put16 0 2 + * put64 put16 0 1 + * put64 put16 0 ack16 0 ack64 0 0 + * + * put8 put8 0 1 + * put8 put8 0 ack8 0 ack8 0 0 + * + * put4 put4 0 ack4 0 ack4 0 0 + * put1 put1 0 ack1 0 ack1 0 0 + */ - val in = io.in(0) - val out = io.out(0) + val counterBits = log2Up(maxSize/beatBytes) + val maxDownSize = if (alwaysMin) minSize else manager.maxTransfer - val counterBits = log2Up(maxSize/beatBytes) - val maxDownSize = if (alwaysMin) minSize else manager.maxTransfer + // First, handle the return path + val acknum = RegInit(UInt(0, width = counterBits)) + val dOrig = Reg(UInt()) + val dFragnum = out.d.bits.source(fragmentBits-1, 0) + val dFirst = acknum === UInt(0) + val dsizeOH = UIntToOH (out.d.bits.size, log2Ceil(maxDownSize)+1) + val dsizeOH1 = UIntToOH1(out.d.bits.size, log2Up(maxDownSize)) + val dHasData = edgeOut.hasData(out.d.bits) - // First, handle the return path - val acknum = RegInit(UInt(0, width = counterBits)) - val dOrig = Reg(UInt()) - val dFragnum = out.d.bits.source(fragmentBits-1, 0) - val dFirst = acknum === UInt(0) - val dsizeOH = UIntToOH (out.d.bits.size, 
log2Ceil(maxDownSize)+1) - val dsizeOH1 = UIntToOH1(out.d.bits.size, log2Up(maxDownSize)) - val dHasData = edgeOut.hasData(out.d.bits) + // calculate new acknum + val acknum_fragment = dFragnum << log2Ceil(minSize/beatBytes) + val acknum_size = dsizeOH1 >> log2Ceil(beatBytes) + assert (!out.d.valid || (acknum_fragment & acknum_size) === UInt(0)) + val dFirst_acknum = acknum_fragment | Mux(dHasData, acknum_size, UInt(0)) + val ack_decrement = Mux(dHasData, UInt(1), dsizeOH >> log2Ceil(beatBytes)) + // calculate the original size + val dFirst_size = OH1ToUInt((dFragnum << log2Ceil(minSize)) | dsizeOH1) - // calculate new acknum - val acknum_fragment = dFragnum << log2Ceil(minSize/beatBytes) - val acknum_size = dsizeOH1 >> log2Ceil(beatBytes) - assert (!out.d.valid || (acknum_fragment & acknum_size) === UInt(0)) - val dFirst_acknum = acknum_fragment | Mux(dHasData, acknum_size, UInt(0)) - val ack_decrement = Mux(dHasData, UInt(1), dsizeOH >> log2Ceil(beatBytes)) - // calculate the original size - val dFirst_size = OH1ToUInt((dFragnum << log2Ceil(minSize)) | dsizeOH1) + when (out.d.fire()) { + acknum := Mux(dFirst, dFirst_acknum, acknum - ack_decrement) + when (dFirst) { dOrig := dFirst_size } + } - when (out.d.fire()) { - acknum := Mux(dFirst, dFirst_acknum, acknum - ack_decrement) - when (dFirst) { dOrig := dFirst_size } + // Swallow up non-data ack fragments + val drop = !dHasData && (dFragnum =/= UInt(0)) + out.d.ready := in.d.ready || drop + in.d.valid := out.d.valid && !drop + in.d.bits := out.d.bits // pass most stuff unchanged + in.d.bits.addr_lo := out.d.bits.addr_lo & ~dsizeOH1 + in.d.bits.source := out.d.bits.source >> fragmentBits + in.d.bits.size := Mux(dFirst, dFirst_size, dOrig) + + // Combine the error flag + val r_error = RegInit(Bool(false)) + val d_error = r_error | out.d.bits.error + when (out.d.fire()) { r_error := Mux(drop, d_error, UInt(0)) } + in.d.bits.error := d_error + + // What maximum transfer sizes do downstream devices support? 
+ val maxArithmetics = managers.map(_.supportsArithmetic.max) + val maxLogicals = managers.map(_.supportsLogical.max) + val maxGets = managers.map(_.supportsGet.max) + val maxPutFulls = managers.map(_.supportsPutFull.max) + val maxPutPartials = managers.map(_.supportsPutPartial.max) + val maxHints = managers.map(m => if (m.supportsHint) maxDownSize else 0) + + // We assume that the request is valid => size 0 is impossible + val lgMinSize = UInt(log2Ceil(minSize)) + val maxLgArithmetics = maxArithmetics.map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) + val maxLgLogicals = maxLogicals .map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) + val maxLgGets = maxGets .map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) + val maxLgPutFulls = maxPutFulls .map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) + val maxLgPutPartials = maxPutPartials.map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) + val maxLgHints = maxHints .map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) + + // Make the request repeatable + val repeater = Module(new Repeater(in.a.bits)) + repeater.io.enq <> in.a + val in_a = repeater.io.deq + + // If this is in front of a single manager, these become constants + val find = manager.findFast(edgeIn.address(in_a.bits)) + val maxLgArithmetic = Mux1H(find, maxLgArithmetics) + val maxLgLogical = Mux1H(find, maxLgLogicals) + val maxLgGet = Mux1H(find, maxLgGets) + val maxLgPutFull = Mux1H(find, maxLgPutFulls) + val maxLgPutPartial = Mux1H(find, maxLgPutPartials) + val maxLgHint = Mux1H(find, maxLgHints) + + val limit = if (alwaysMin) lgMinSize else + MuxLookup(in_a.bits.opcode, lgMinSize, Array( + TLMessages.PutFullData -> maxLgPutFull, + TLMessages.PutPartialData -> maxLgPutPartial, + TLMessages.ArithmeticData -> maxLgArithmetic, + TLMessages.LogicalData -> maxLgLogical, + TLMessages.Get -> maxLgGet, + TLMessages.Hint -> maxLgHint)) + + val aOrig = in_a.bits.size + val aFrag = Mux(aOrig > limit, limit, aOrig) + val aOrigOH1 = 
UIntToOH1(aOrig, log2Ceil(maxSize)) + val aFragOH1 = UIntToOH1(aFrag, log2Up(maxDownSize)) + val aHasData = node.edgesIn(0).hasData(in_a.bits) + val aMask = Mux(aHasData, UInt(0), aFragOH1) + + val gennum = RegInit(UInt(0, width = counterBits)) + val aFirst = gennum === UInt(0) + val old_gennum1 = Mux(aFirst, aOrigOH1 >> log2Ceil(beatBytes), gennum - UInt(1)) + val new_gennum = ~(~old_gennum1 | (aMask >> log2Ceil(beatBytes))) // ~(~x|y) is width safe + val aFragnum = ~(~(old_gennum1 >> log2Ceil(minSize/beatBytes)) | (aFragOH1 >> log2Ceil(minSize))) + + when (out.a.fire()) { gennum := new_gennum } + + repeater.io.repeat := !aHasData && aFragnum =/= UInt(0) + out.a <> in_a + out.a.bits.address := in_a.bits.address | (~aFragnum << log2Ceil(minSize) & aOrigOH1) + out.a.bits.source := Cat(in_a.bits.source, aFragnum) + out.a.bits.size := aFrag + + // Optimize away some of the Repeater's registers + assert (!repeater.io.full || !aHasData) + out.a.bits.data := in.a.bits.data + val fullMask = UInt((BigInt(1) << beatBytes) - 1) + assert (!repeater.io.full || in_a.bits.mask === fullMask) + out.a.bits.mask := Mux(repeater.io.full, fullMask, in.a.bits.mask) + + // Tie off unused channels + in.b.valid := Bool(false) + in.c.ready := Bool(true) + in.e.ready := Bool(true) + out.b.ready := Bool(true) + out.c.valid := Bool(false) + out.e.valid := Bool(false) } - - // Swallow up non-data ack fragments - val drop = !dHasData && (dFragnum =/= UInt(0)) - out.d.ready := in.d.ready || drop - in.d.valid := out.d.valid && !drop - in.d.bits := out.d.bits // pass most stuff unchanged - in.d.bits.addr_lo := out.d.bits.addr_lo & ~dsizeOH1 - in.d.bits.source := out.d.bits.source >> fragmentBits - in.d.bits.size := Mux(dFirst, dFirst_size, dOrig) - - // Combine the error flag - val r_error = RegInit(Bool(false)) - val d_error = r_error | out.d.bits.error - when (out.d.fire()) { r_error := Mux(drop, d_error, UInt(0)) } - in.d.bits.error := d_error - - // What maximum transfer sizes do downstream 
devices support? - val maxArithmetics = managers.map(_.supportsArithmetic.max) - val maxLogicals = managers.map(_.supportsLogical.max) - val maxGets = managers.map(_.supportsGet.max) - val maxPutFulls = managers.map(_.supportsPutFull.max) - val maxPutPartials = managers.map(_.supportsPutPartial.max) - val maxHints = managers.map(m => if (m.supportsHint) maxDownSize else 0) - - // We assume that the request is valid => size 0 is impossible - val lgMinSize = UInt(log2Ceil(minSize)) - val maxLgArithmetics = maxArithmetics.map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) - val maxLgLogicals = maxLogicals .map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) - val maxLgGets = maxGets .map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) - val maxLgPutFulls = maxPutFulls .map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) - val maxLgPutPartials = maxPutPartials.map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) - val maxLgHints = maxHints .map(m => if (m == 0) lgMinSize else UInt(log2Ceil(m))) - - // Make the request repeatable - val repeater = Module(new Repeater(in.a.bits)) - repeater.io.enq <> in.a - val in_a = repeater.io.deq - - // If this is infront of a single manager, these become constants - val find = manager.findFast(edgeIn.address(in_a.bits)) - val maxLgArithmetic = Mux1H(find, maxLgArithmetics) - val maxLgLogical = Mux1H(find, maxLgLogicals) - val maxLgGet = Mux1H(find, maxLgGets) - val maxLgPutFull = Mux1H(find, maxLgPutFulls) - val maxLgPutPartial = Mux1H(find, maxLgPutPartials) - val maxLgHint = Mux1H(find, maxLgHints) - - val limit = if (alwaysMin) lgMinSize else - MuxLookup(in_a.bits.opcode, lgMinSize, Array( - TLMessages.PutFullData -> maxLgPutFull, - TLMessages.PutPartialData -> maxLgPutPartial, - TLMessages.ArithmeticData -> maxLgArithmetic, - TLMessages.LogicalData -> maxLgLogical, - TLMessages.Get -> maxLgGet, - TLMessages.Hint -> maxLgHint)) - - val aOrig = in_a.bits.size - val aFrag = Mux(aOrig > limit, limit, aOrig) - val 
aOrigOH1 = UIntToOH1(aOrig, log2Ceil(maxSize)) - val aFragOH1 = UIntToOH1(aFrag, log2Up(maxDownSize)) - val aHasData = node.edgesIn(0).hasData(in_a.bits) - val aMask = Mux(aHasData, UInt(0), aFragOH1) - - val gennum = RegInit(UInt(0, width = counterBits)) - val aFirst = gennum === UInt(0) - val old_gennum1 = Mux(aFirst, aOrigOH1 >> log2Ceil(beatBytes), gennum - UInt(1)) - val new_gennum = ~(~old_gennum1 | (aMask >> log2Ceil(beatBytes))) // ~(~x|y) is width safe - val aFragnum = ~(~(old_gennum1 >> log2Ceil(minSize/beatBytes)) | (aFragOH1 >> log2Ceil(minSize))) - - when (out.a.fire()) { gennum := new_gennum } - - repeater.io.repeat := !aHasData && aFragnum =/= UInt(0) - out.a <> in_a - out.a.bits.address := in_a.bits.address | (~aFragnum << log2Ceil(minSize) & aOrigOH1) - out.a.bits.source := Cat(in_a.bits.source, aFragnum) - out.a.bits.size := aFrag - - // Optimize away some of the Repeater's registers - assert (!repeater.io.full || !aHasData) - out.a.bits.data := in.a.bits.data - val fullMask = UInt((BigInt(1) << beatBytes) - 1) - assert (!repeater.io.full || in_a.bits.mask === fullMask) - out.a.bits.mask := Mux(repeater.io.full, fullMask, in.a.bits.mask) - - // Tie off unused channels - in.b.valid := Bool(false) - in.c.ready := Bool(true) - in.e.ready := Bool(true) - out.b.ready := Bool(true) - out.c.valid := Bool(false) - out.e.valid := Bool(false) } } diff --git a/src/main/scala/uncore/tilelink2/HintHandler.scala b/src/main/scala/uncore/tilelink2/HintHandler.scala index 623d4887..ff31beb5 100644 --- a/src/main/scala/uncore/tilelink2/HintHandler.scala +++ b/src/main/scala/uncore/tilelink2/HintHandler.scala @@ -12,8 +12,8 @@ import diplomacy._ class TLHintHandler(supportManagers: Boolean = true, supportClients: Boolean = false, passthrough: Boolean = true)(implicit p: Parameters) extends LazyModule { val node = TLAdapterNode( - clientFn = { case Seq(c) => if (!supportClients) c else c.copy(minLatency = min(1, c.minLatency), clients = c.clients 
.map(_.copy(supportsHint = TransferSizes(1, c.maxTransfer)))) }, - managerFn = { case Seq(m) => if (!supportManagers) m else m.copy(minLatency = min(1, m.minLatency), managers = m.managers.map(_.copy(supportsHint = TransferSizes(1, m.maxTransfer)))) }) + clientFn = { c => if (!supportClients) c else c.copy(minLatency = min(1, c.minLatency), clients = c.clients .map(_.copy(supportsHint = TransferSizes(1, c.maxTransfer)))) }, + managerFn = { m => if (!supportManagers) m else m.copy(minLatency = min(1, m.minLatency), managers = m.managers.map(_.copy(supportsHint = TransferSizes(1, m.maxTransfer)))) }) lazy val module = new LazyModuleImp(this) { val io = new Bundle { @@ -21,79 +21,76 @@ class TLHintHandler(supportManagers: Boolean = true, supportClients: Boolean = f val out = node.bundleOut } - val in = io.in(0) - val out = io.out(0) - val edgeIn = node.edgesIn(0) - val edgeOut = node.edgesOut(0) + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + // Don't add support for clients if there is no BCE channel + val bce = edgeOut.manager.anySupportAcquireB && edgeIn.client.anySupportProbe + require (!supportClients || bce) - // Don't add support for clients if there is no BCE channel - val bce = edgeOut.manager.anySupportAcquireB && edgeIn.client.anySupportProbe - require (!supportClients || bce) + // Does it even make sense to add the HintHandler? + val smartClients = edgeIn.client.clients.map(_.supportsHint.max == edgeIn.client.maxTransfer).reduce(_&&_) + val smartManagers = edgeOut.manager.managers.map(_.supportsHint.max == edgeOut.manager.maxTransfer).reduce(_&&_) - // Does it even make sense to add the HintHandler? 
- val smartClients = edgeIn.client.clients.map(_.supportsHint.max == edgeIn.client.maxTransfer).reduce(_&&_) - val smartManagers = edgeOut.manager.managers.map(_.supportsHint.max == edgeOut.manager.maxTransfer).reduce(_&&_) + if (supportManagers && !(passthrough && smartManagers)) { + val address = edgeIn.address(in.a.bits) + val handleA = if (passthrough) !edgeOut.manager.supportsHintFast(address, edgeIn.size(in.a.bits)) else Bool(true) + val hintBitsAtA = handleA && in.a.bits.opcode === TLMessages.Hint + val hint = Wire(out.d) - if (supportManagers && !(passthrough && smartManagers)) { - val address = edgeIn.address(in.a.bits) - val handleA = if (passthrough) !edgeOut.manager.supportsHintFast(address, edgeIn.size(in.a.bits)) else Bool(true) - val hintBitsAtA = handleA && in.a.bits.opcode === TLMessages.Hint - val hint = Wire(out.d) + hint.valid := in.a.valid && hintBitsAtA + out.a.valid := in.a.valid && !hintBitsAtA + in.a.ready := Mux(hintBitsAtA, hint.ready, out.a.ready) - hint.valid := in.a.valid && hintBitsAtA - out.a.valid := in.a.valid && !hintBitsAtA - in.a.ready := Mux(hintBitsAtA, hint.ready, out.a.ready) + hint.bits := edgeIn.HintAck(in.a.bits, UInt(0)) + out.a.bits := in.a.bits - hint.bits := edgeIn.HintAck(in.a.bits, UInt(0)) - out.a.bits := in.a.bits + TLArbiter(TLArbiter.lowestIndexFirst)(in.d, (edgeOut.numBeats1(out.d.bits), out.d), (UInt(0), Queue(hint, 1))) + } else { + out.a.valid := in.a.valid + in.a.ready := out.a.ready + out.a.bits := in.a.bits - TLArbiter(TLArbiter.lowestIndexFirst)(in.d, (edgeOut.numBeats1(out.d.bits), out.d), (UInt(0), Queue(hint, 1))) - } else { - out.a.valid := in.a.valid - in.a.ready := out.a.ready - out.a.bits := in.a.bits + in.d.valid := out.d.valid + out.d.ready := in.d.ready + in.d.bits := out.d.bits + } - in.d.valid := out.d.valid - out.d.ready := in.d.ready - in.d.bits := out.d.bits - } + if (supportClients && !(passthrough && smartClients)) { + val handleB = if (passthrough) 
!edgeIn.client.supportsHint(out.b.bits.source, edgeOut.size(out.b.bits)) else Bool(true) + val hintBitsAtB = handleB && out.b.bits.opcode === TLMessages.Hint + val hint = Wire(in.c) - if (supportClients && !(passthrough && smartClients)) { - val handleB = if (passthrough) !edgeIn.client.supportsHint(out.b.bits.source, edgeOut.size(out.b.bits)) else Bool(true) - val hintBitsAtB = handleB && out.b.bits.opcode === TLMessages.Hint - val hint = Wire(in.c) + hint.valid := out.b.valid && hintBitsAtB + in.b.valid := out.b.valid && !hintBitsAtB + out.b.ready := Mux(hintBitsAtB, hint.ready, in.b.ready) - hint.valid := out.b.valid && hintBitsAtB - in.b.valid := out.b.valid && !hintBitsAtB - out.b.ready := Mux(hintBitsAtB, hint.ready, in.b.ready) + hint.bits := edgeOut.HintAck(out.b.bits) + in.b.bits := out.b.bits - hint.bits := edgeOut.HintAck(out.b.bits) - in.b.bits := out.b.bits + TLArbiter(TLArbiter.lowestIndexFirst)(out.c, (edgeIn.numBeats1(in.c.bits), in.c), (UInt(0), Queue(hint, 1))) + } else if (bce) { + in.b.valid := out.b.valid + out.b.ready := in.b.ready + in.b.bits := out.b.bits - TLArbiter(TLArbiter.lowestIndexFirst)(out.c, (edgeIn.numBeats1(in.c.bits), in.c), (UInt(0), Queue(hint, 1))) - } else if (bce) { - in.b.valid := out.b.valid - out.b.ready := in.b.ready - in.b.bits := out.b.bits + out.c.valid := in.c.valid + in.c.ready := out.c.ready + out.c.bits := in.c.bits + } else { + in.b.valid := Bool(false) + in.c.ready := Bool(true) + out.b.ready := Bool(true) + out.c.valid := Bool(false) + } - out.c.valid := in.c.valid - in.c.ready := out.c.ready - out.c.bits := in.c.bits - } else { - in.b.valid := Bool(false) - in.c.ready := Bool(true) - out.b.ready := Bool(true) - out.c.valid := Bool(false) - } - - if (bce) { - // Pass E through unchanged - out.e.valid := in.e.valid - in.e.ready := out.e.ready - out.e.bits := in.e.bits - } else { - in.e.ready := Bool(true) - out.e.valid := Bool(false) + if (bce) { + // Pass E through unchanged + out.e.valid := in.e.valid + 
in.e.ready := out.e.ready + out.e.bits := in.e.bits + } else { + in.e.ready := Bool(true) + out.e.valid := Bool(false) + } } } } diff --git a/src/main/scala/uncore/tilelink2/IntNodes.scala b/src/main/scala/uncore/tilelink2/IntNodes.scala index 18d0553e..ddfa56e1 100644 --- a/src/main/scala/uncore/tilelink2/IntNodes.scala +++ b/src/main/scala/uncore/tilelink2/IntNodes.scala @@ -81,16 +81,16 @@ object IntImp extends NodeImp[IntSourcePortParameters, IntSinkPortParameters, In case class IntIdentityNode() extends IdentityNode(IntImp) case class IntSourceNode(num: Int) extends SourceNode(IntImp)( - IntSourcePortParameters(Seq(IntSourceParameters(num))), (if (num == 0) 0 else 1) to 1) + if (num == 0) Seq() else Seq(IntSourcePortParameters(Seq(IntSourceParameters(num))))) case class IntSinkNode() extends SinkNode(IntImp)( - IntSinkPortParameters(Seq(IntSinkParameters()))) + Seq(IntSinkPortParameters(Seq(IntSinkParameters())))) -case class IntAdapterNode( +case class IntNexusNode( sourceFn: Seq[IntSourcePortParameters] => IntSourcePortParameters, sinkFn: Seq[IntSinkPortParameters] => IntSinkPortParameters, - numSourcePorts: Range.Inclusive = 1 to 1, - numSinkPorts: Range.Inclusive = 1 to 1) - extends InteriorNode(IntImp)(sourceFn, sinkFn, numSourcePorts, numSinkPorts) + numSourcePorts: Range.Inclusive = 0 to 128, + numSinkPorts: Range.Inclusive = 0 to 128) + extends NexusNode(IntImp)(sourceFn, sinkFn, numSourcePorts, numSinkPorts) case class IntOutputNode() extends OutputNode(IntImp) case class IntInputNode() extends InputNode(IntImp) @@ -103,9 +103,7 @@ case class IntInternalInputNode(num: Int) extends InternalInputNode(IntImp)(Seq( class IntXbar()(implicit p: Parameters) extends LazyModule { - val intnode = IntAdapterNode( - numSourcePorts = 0 to 128, - numSinkPorts = 0 to 128, + val intnode = IntNexusNode( sinkFn = { _ => IntSinkPortParameters(Seq(IntSinkParameters())) }, sourceFn = { seq => IntSourcePortParameters((seq zip seq.map(_.num).scanLeft(0)(_+_).init).map { 
diff --git a/src/main/scala/uncore/tilelink2/Nodes.scala b/src/main/scala/uncore/tilelink2/Nodes.scala index 3c2bd137..3cd6e26d 100644 --- a/src/main/scala/uncore/tilelink2/Nodes.scala +++ b/src/main/scala/uncore/tilelink2/Nodes.scala @@ -86,29 +86,33 @@ object TLImp extends NodeImp[TLClientPortParameters, TLManagerPortParameters, TL // Nodes implemented inside modules case class TLIdentityNode() extends IdentityNode(TLImp) -case class TLClientNode(portParams: TLClientPortParameters, numPorts: Range.Inclusive = 1 to 1) - extends SourceNode(TLImp)(portParams, numPorts) -case class TLManagerNode(portParams: TLManagerPortParameters, numPorts: Range.Inclusive = 1 to 1) - extends SinkNode(TLImp)(portParams, numPorts) +case class TLClientNode(portParams: Seq[TLClientPortParameters]) extends SourceNode(TLImp)(portParams) +case class TLManagerNode(portParams: Seq[TLManagerPortParameters]) extends SinkNode(TLImp)(portParams) object TLClientNode { def apply(params: TLClientParameters) = - new TLClientNode(TLClientPortParameters(Seq(params)), 1 to 1) + new TLClientNode(Seq(TLClientPortParameters(Seq(params)))) } object TLManagerNode { def apply(beatBytes: Int, params: TLManagerParameters) = - new TLManagerNode(TLManagerPortParameters(Seq(params), beatBytes, minLatency = 0), 1 to 1) + new TLManagerNode(Seq(TLManagerPortParameters(Seq(params), beatBytes, minLatency = 0))) } case class TLAdapterNode( + clientFn: TLClientPortParameters => TLClientPortParameters, + managerFn: TLManagerPortParameters => TLManagerPortParameters, + num: Range.Inclusive = 0 to 999) + extends AdapterNode(TLImp)(clientFn, managerFn, num) + +case class TLNexusNode( clientFn: Seq[TLClientPortParameters] => TLClientPortParameters, managerFn: Seq[TLManagerPortParameters] => TLManagerPortParameters, - numClientPorts: Range.Inclusive = 1 to 1, - numManagerPorts: Range.Inclusive = 1 to 1) - extends InteriorNode(TLImp)(clientFn, managerFn, numClientPorts, numManagerPorts) + numClientPorts: Range.Inclusive = 1 
to 999, + numManagerPorts: Range.Inclusive = 1 to 999) + extends NexusNode(TLImp)(clientFn, managerFn, numClientPorts, numManagerPorts) // Nodes passed from an inner module case class TLOutputNode() extends OutputNode(TLImp) @@ -169,17 +173,15 @@ case class TLAsyncIdentityNode() extends IdentityNode(TLAsyncImp) case class TLAsyncOutputNode() extends OutputNode(TLAsyncImp) case class TLAsyncInputNode() extends InputNode(TLAsyncImp) -case class TLAsyncSourceNode(sync: Int) extends MixedNode(TLImp, TLAsyncImp)( - dFn = { case (1, Seq(p)) => Seq(TLAsyncClientPortParameters(p)) }, - uFn = { case (1, Seq(p)) => Seq(p.base.copy(minLatency = sync+1)) }, // discard cycles in other clock domain - numPO = 1 to 1, - numPI = 1 to 1) +case class TLAsyncSourceNode(sync: Int) + extends MixedAdapterNode(TLImp, TLAsyncImp)( + dFn = { p => TLAsyncClientPortParameters(p) }, + uFn = { p => p.base.copy(minLatency = sync+1) }) // discard cycles in other clock domain -case class TLAsyncSinkNode(depth: Int, sync: Int) extends MixedNode(TLAsyncImp, TLImp)( - dFn = { case (1, Seq(p)) => Seq(p.base.copy(minLatency = sync+1)) }, - uFn = { case (1, Seq(p)) => Seq(TLAsyncManagerPortParameters(depth, p)) }, - numPO = 1 to 1, - numPI = 1 to 1) +case class TLAsyncSinkNode(depth: Int, sync: Int) + extends MixedAdapterNode(TLAsyncImp, TLImp)( + dFn = { p => p.base.copy(minLatency = sync+1) }, + uFn = { p => TLAsyncManagerPortParameters(depth, p) }) object TLRationalImp extends NodeImp[TLClientPortParameters, TLManagerPortParameters, TLEdgeParameters, TLEdgeParameters, TLRationalBundle] { @@ -205,14 +207,12 @@ case class TLRationalIdentityNode() extends IdentityNode(TLRationalImp) case class TLRationalOutputNode() extends OutputNode(TLRationalImp) case class TLRationalInputNode() extends InputNode(TLRationalImp) -case class TLRationalSourceNode() extends MixedNode(TLImp, TLRationalImp)( - dFn = { case (_, s) => s }, - uFn = { case (_, s) => s.map(p => p.copy(minLatency = 1)) }, // discard cycles from 
other clock domain - numPO = 0 to 999, - numPI = 0 to 999) +case class TLRationalSourceNode() + extends MixedAdapterNode(TLImp, TLRationalImp)( + dFn = { p => p }, + uFn = { p => p.copy(minLatency = 1) }) // discard cycles from other clock domain -case class TLRationalSinkNode() extends MixedNode(TLRationalImp, TLImp)( - dFn = { case (_, s) => s.map(p => p.copy(minLatency = 1)) }, - uFn = { case (_, s) => s }, - numPO = 0 to 999, - numPI = 0 to 999) +case class TLRationalSinkNode() + extends MixedAdapterNode(TLRationalImp, TLImp)( + dFn = { p => p.copy(minLatency = 1) }, + uFn = { p => p }) diff --git a/src/main/scala/uncore/tilelink2/RAMModel.scala b/src/main/scala/uncore/tilelink2/RAMModel.scala index cee51935..0cfe05f6 100644 --- a/src/main/scala/uncore/tilelink2/RAMModel.scala +++ b/src/main/scala/uncore/tilelink2/RAMModel.scala @@ -5,6 +5,7 @@ package uncore.tilelink2 import Chisel._ import config._ import diplomacy._ +import util.GenericParameterizedBundle // We detect concurrent puts that put memory into an undefined state. // put0, put0Ack, put1, put1Ack => ok: defined @@ -31,268 +32,271 @@ class TLRAMModel(log: String = "")(implicit p: Parameters) extends LazyModule val out = node.bundleOut } - // !!! 
support multiple clients via clock division - require (io.out.size == 1) + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + val edge = edgeIn + val endAddress = edge.manager.maxAddress + 1 + val endSourceId = edge.client.endSourceId + val maxTransfer = edge.manager.maxTransfer + val beatBytes = edge.manager.beatBytes + val endAddressHi = (endAddress / beatBytes).intValue + val maxLgBeats = log2Up(maxTransfer/beatBytes) + val shift = log2Ceil(beatBytes) + val decTrees = log2Up(maxTransfer/beatBytes) + val addressBits = log2Up(endAddress) + val countBits = log2Up(endSourceId) + val sizeBits = edge.bundle.sizeBits - val in = io.in(0) - val out = io.out(0) + // Reset control logic + val wipeIndex = RegInit(UInt(0, width = log2Ceil(endAddressHi) + 1)) + val wipe = !wipeIndex(log2Ceil(endAddressHi)) + wipeIndex := wipeIndex + wipe.asUInt - val edge = node.edgesIn(0) - val endAddress = edge.manager.maxAddress + 1 - val endSourceId = edge.client.endSourceId - val maxTransfer = edge.manager.maxTransfer - val beatBytes = edge.manager.beatBytes - val endAddressHi = (endAddress / beatBytes).intValue - val maxLgBeats = log2Up(maxTransfer/beatBytes) - val shift = log2Ceil(beatBytes) - val decTrees = log2Up(maxTransfer/beatBytes) - val addressBits = log2Up(endAddress) - val countBits = log2Up(endSourceId) - val sizeBits = edge.bundle.sizeBits + // Block traffic while wiping Mems + in.a.ready := out.a.ready && !wipe + out.a.valid := in.a.valid && !wipe + out.a.bits := in.a.bits + out.d.ready := in.d.ready && !wipe + in.d.valid := out.d.valid && !wipe + in.d.bits := out.d.bits - // Reset control logic - val wipeIndex = RegInit(UInt(0, width = log2Ceil(endAddressHi) + 1)) - val wipe = !wipeIndex(log2Ceil(endAddressHi)) - wipeIndex := wipeIndex + wipe.asUInt + // BCE unsupported + in.b.valid := Bool(false) + out.c.valid := Bool(false) + out.e.valid := Bool(false) + out.b.ready := Bool(true) + in.c.ready := Bool(true) + 
in.e.ready := Bool(true) - // Block traffic while wiping Mems - in.a.ready := out.a.ready && !wipe - out.a.valid := in.a.valid && !wipe - out.a.bits := in.a.bits - out.d.ready := in.d.ready && !wipe - in.d.valid := out.d.valid && !wipe - in.d.bits := out.d.bits + val params = TLRAMModel.MonitorParameters(addressBits, sizeBits) - // BCE unsupported - in.b.valid := Bool(false) - out.c.valid := Bool(false) - out.e.valid := Bool(false) - out.b.ready := Bool(true) - in.c.ready := Bool(true) - in.e.ready := Bool(true) + // Infer as simple dual port BRAM/M10k with write-first/new-data semantics (bypass needed) + val shadow = Seq.fill(beatBytes) { Mem(endAddressHi, new TLRAMModel.ByteMonitor(params)) } + val inc_bytes = Seq.fill(beatBytes) { Mem(endAddressHi, UInt(width = countBits)) } + val dec_bytes = Seq.fill(beatBytes) { Mem(endAddressHi, UInt(width = countBits)) } + val inc_trees = Seq.tabulate(decTrees) { i => Mem(endAddressHi >> (i+1), UInt(width = countBits)) } + val dec_trees = Seq.tabulate(decTrees) { i => Mem(endAddressHi >> (i+1), UInt(width = countBits)) } - class ByteMonitor extends Bundle { - val valid = Bool() - val value = UInt(width = 8) - } - class FlightMonitor extends Bundle { - val base = UInt(width = addressBits) - val size = UInt(width = sizeBits) - val opcode = UInt(width = 3) - } + val shadow_wen = Wire(init = Fill(beatBytes, wipe)) + val inc_bytes_wen = Wire(init = Fill(beatBytes, wipe)) + val dec_bytes_wen = Wire(init = Fill(beatBytes, wipe)) + val inc_trees_wen = Wire(init = Fill(decTrees, wipe)) + val dec_trees_wen = Wire(init = Fill(decTrees, wipe)) - // Infer as simple dual port BRAM/M10k with write-first/new-data semantics (bypass needed) - val shadow = Seq.fill(beatBytes) { Mem(endAddressHi, new ByteMonitor) } - val inc_bytes = Seq.fill(beatBytes) { Mem(endAddressHi, UInt(width = countBits)) } - val dec_bytes = Seq.fill(beatBytes) { Mem(endAddressHi, UInt(width = countBits)) } - val inc_trees = Seq.tabulate(decTrees) { i => 
Mem(endAddressHi >> (i+1), UInt(width = countBits)) } - val dec_trees = Seq.tabulate(decTrees) { i => Mem(endAddressHi >> (i+1), UInt(width = countBits)) } + // This must be registers b/c we build a CAM from it + val flight = Reg(Vec(endSourceId, new TLRAMModel.FlightMonitor(params))) + val valid = Reg(Vec(endSourceId, Bool())) - val shadow_wen = Wire(init = Fill(beatBytes, wipe)) - val inc_bytes_wen = Wire(init = Fill(beatBytes, wipe)) - val dec_bytes_wen = Wire(init = Fill(beatBytes, wipe)) - val inc_trees_wen = Wire(init = Fill(decTrees, wipe)) - val dec_trees_wen = Wire(init = Fill(decTrees, wipe)) + // We want to cross flight data from A to D in the same cycle (for combinational TL2 devices) + val a_flight = Wire(new TLRAMModel.FlightMonitor(params)) + a_flight.base := edge.address(in.a.bits) + a_flight.size := edge.size(in.a.bits) + a_flight.opcode := in.a.bits.opcode - // This must be registers b/c we build a CAM from it - val flight = Reg(Vec(endSourceId, new FlightMonitor)) - val valid = Reg(Vec(endSourceId, Bool())) + when (in.a.fire()) { flight(in.a.bits.source) := a_flight } + val bypass = if (edge.manager.minLatency > 0) Bool(false) else in.a.valid && in.a.bits.source === out.d.bits.source + val d_flight = RegNext(Mux(bypass, a_flight, flight(out.d.bits.source))) - // We want to cross flight data from A to D in the same cycle (for combinational TL2 devices) - val a_flight = Wire(new FlightMonitor) - a_flight.base := edge.address(in.a.bits) - a_flight.size := edge.size(in.a.bits) - a_flight.opcode := in.a.bits.opcode + // Process A access requests + val a = Reg(next = in.a.bits) + val a_fire = Reg(next = in.a.fire(), init = Bool(false)) + val (a_first, a_last, _, a_address_inc) = edge.addr_inc(a, a_fire) + val a_size = edge.size(a) + val a_sizeOH = UIntToOH(a_size) + val a_address = a.address | a_address_inc + val a_addr_hi = edge.addr_hi(a_address) + val a_base = edge.address(a) + val a_mask = edge.mask(a_base, a_size) + val a_fifo = 
edge.manager.hasFifoIdFast(a_base) - when (in.a.fire()) { flight(in.a.bits.source) := a_flight } - val bypass = if (edge.manager.minLatency > 0) Bool(false) else in.a.valid && in.a.bits.source === out.d.bits.source - val d_flight = RegNext(Mux(bypass, a_flight, flight(out.d.bits.source))) + // Grab the concurrency state we need + val a_inc_bytes = inc_bytes.map(_.read(a_addr_hi)) + val a_dec_bytes = dec_bytes.map(_.read(a_addr_hi)) + val a_inc_trees = inc_trees.zipWithIndex.map{ case (m, i) => m.read(a_addr_hi >> (i+1)) } + val a_dec_trees = dec_trees.zipWithIndex.map{ case (m, i) => m.read(a_addr_hi >> (i+1)) } + val a_inc_tree = a_inc_trees.fold(UInt(0))(_ + _) + val a_dec_tree = a_dec_trees.fold(UInt(0))(_ + _) + val a_inc = a_inc_bytes.map(_ + a_inc_tree) + val a_dec = a_dec_bytes.map(_ + a_dec_tree) - // Process A access requests - val a = Reg(next = in.a.bits) - val a_fire = Reg(next = in.a.fire(), init = Bool(false)) - val (a_first, a_last, _, a_address_inc) = edge.addr_inc(a, a_fire) - val a_size = edge.size(a) - val a_sizeOH = UIntToOH(a_size) - val a_address = a.address | a_address_inc - val a_addr_hi = edge.addr_hi(a_address) - val a_base = edge.address(a) - val a_mask = edge.mask(a_base, a_size) - val a_fifo = edge.manager.hasFifoIdFast(a_base) + when (a_fire) { + // Record the request so we can handle its response + assert (a.opcode =/= TLMessages.Acquire) - // Grab the concurrency state we need - val a_inc_bytes = inc_bytes.map(_.read(a_addr_hi)) - val a_dec_bytes = dec_bytes.map(_.read(a_addr_hi)) - val a_inc_trees = inc_trees.zipWithIndex.map{ case (m, i) => m.read(a_addr_hi >> (i+1)) } - val a_dec_trees = dec_trees.zipWithIndex.map{ case (m, i) => m.read(a_addr_hi >> (i+1)) } - val a_inc_tree = a_inc_trees.fold(UInt(0))(_ + _) - val a_dec_tree = a_dec_trees.fold(UInt(0))(_ + _) - val a_inc = a_inc_bytes.map(_ + a_inc_tree) - val a_dec = a_dec_bytes.map(_ + a_dec_tree) + // Mark the operation as valid + valid(a.source) := Bool(true) - when (a_fire)
{ - // Record the request so we can handle it's response - assert (a.opcode =/= TLMessages.Acquire) - - // Mark the operation as valid - valid(a.source) := Bool(true) - - // Increase the per-byte flight counter for the whole transaction - when (a_first && a.opcode =/= TLMessages.Hint && a.opcode =/= TLMessages.Get) { - when (a_size <= UInt(shift)) { - inc_bytes_wen := a_mask + // Increase the per-byte flight counter for the whole transaction + when (a_first && a.opcode =/= TLMessages.Hint && a.opcode =/= TLMessages.Get) { + when (a_size <= UInt(shift)) { + inc_bytes_wen := a_mask + } + inc_trees_wen := a_sizeOH >> (shift+1) } - inc_trees_wen := a_sizeOH >> (shift+1) - } - when (a.opcode === TLMessages.PutFullData || a.opcode === TLMessages.PutPartialData || - a.opcode === TLMessages.ArithmeticData || a.opcode === TLMessages.LogicalData) { - shadow_wen := a.mask - for (i <- 0 until beatBytes) { - val busy = a_inc(i) - a_dec(i) - (!a_first).asUInt - val byte = a.data(8*(i+1)-1, 8*i) - when (a.mask(i)) { - printf(log + " ") - when (a.opcode === TLMessages.PutFullData) { printf("PF") } - when (a.opcode === TLMessages.PutPartialData) { printf("PP") } - when (a.opcode === TLMessages.ArithmeticData) { printf("A ") } - when (a.opcode === TLMessages.LogicalData) { printf("L ") } - printf(" 0x%x := 0x%x #%d %x\n", a_addr_hi << shift | UInt(i), byte, busy, a.param) + when (a.opcode === TLMessages.PutFullData || a.opcode === TLMessages.PutPartialData || + a.opcode === TLMessages.ArithmeticData || a.opcode === TLMessages.LogicalData) { + shadow_wen := a.mask + for (i <- 0 until beatBytes) { + val busy = a_inc(i) - a_dec(i) - (!a_first).asUInt + val byte = a.data(8*(i+1)-1, 8*i) + when (a.mask(i)) { + printf(log + " ") + when (a.opcode === TLMessages.PutFullData) { printf("PF") } + when (a.opcode === TLMessages.PutPartialData) { printf("PP") } + when (a.opcode === TLMessages.ArithmeticData) { printf("A ") } + when (a.opcode === TLMessages.LogicalData) { printf("L ") } + printf(" 
0x%x := 0x%x #%d %x\n", a_addr_hi << shift | UInt(i), byte, busy, a.param) + } } } - } - when (a.opcode === TLMessages.Get) { - printf(log + " G 0x%x - 0%x\n", a_base, a_base | UIntToOH1(a_size, addressBits)) - } - } - - val a_waddr = Mux(wipe, wipeIndex, a_addr_hi) - for (i <- 0 until beatBytes) { - val data = Wire(new ByteMonitor) - val busy = a_inc(i) =/= a_dec(i) + (!a_first).asUInt - val amo = a.opcode === TLMessages.ArithmeticData || a.opcode === TLMessages.LogicalData - data.valid := Mux(wipe, Bool(false), (!busy || a_fifo) && !amo) - // !!! calculate the AMO? - data.value := a.data(8*(i+1)-1, 8*i) - when (shadow_wen(i)) { - shadow(i).write(a_waddr, data) - } - } - - for (i <- 0 until beatBytes) { - val data = Mux(wipe, UInt(0), a_inc_bytes(i) + UInt(1)) - when (inc_bytes_wen(i)) { - inc_bytes(i).write(a_waddr, data) - } - } - - for (i <- 0 until inc_trees.size) { - val data = Mux(wipe, UInt(0), a_inc_trees(i) + UInt(1)) - when (inc_trees_wen(i)) { - inc_trees(i).write(a_waddr >> (i+1), data) - } - } - - // Process D access responses - val d = RegNext(out.d.bits) - val d_fire = Reg(next = out.d.fire(), init = Bool(false)) - val (d_first, d_last, _, d_address_inc) = edge.addr_inc(d, d_fire) - val d_size = edge.size(d) - val d_sizeOH = UIntToOH(d_size) - val d_base = d_flight.base - val d_address = d_base | d_address_inc - val d_addr_hi = edge.addr_hi(d_address) - val d_mask = edge.mask(d_base, d_size) - val d_fifo = edge.manager.hasFifoIdFast(d_flight.base) - - // Grab the concurrency state we need - val d_inc_bytes = inc_bytes.map(_.read(d_addr_hi)) - val d_dec_bytes = dec_bytes.map(_.read(d_addr_hi)) - val d_inc_trees = inc_trees.zipWithIndex.map{ case (m, i) => m.read(d_addr_hi >> (i+1)) } - val d_dec_trees = dec_trees.zipWithIndex.map{ case (m, i) => m.read(d_addr_hi >> (i+1)) } - val d_inc_tree = d_inc_trees.fold(UInt(0))(_ + _) - val d_dec_tree = d_dec_trees.fold(UInt(0))(_ + _) - val d_inc = d_inc_bytes.map(_ + d_inc_tree) - val d_dec = 
d_dec_bytes.map(_ + d_dec_tree) - val d_shadow = shadow.map(_.read(d_addr_hi)) - val d_valid = valid(d.source) - - when (d_fire) { - // Check the response is correct - assert (d_size === d_flight.size) - // addr_lo is allowed to differ - - when (d_flight.opcode === TLMessages.Hint) { - assert (d.opcode === TLMessages.HintAck) - } - - // Decrease the per-byte flight counter for the whole transaction - when (d_last && d_flight.opcode =/= TLMessages.Hint && d_flight.opcode =/= TLMessages.Get) { - when (d_size <= UInt(shift)) { - dec_bytes_wen := d_mask - } - dec_trees_wen := d_sizeOH >> (shift+1) - // NOTE: D channel carries uninterrupted multibeast op, so updating on last is fine - for (i <- 0 until endSourceId) { - // Does this modification overlap a Get? => wipe it's valid - val f_base = flight(i).base - val f_size = flight(i).size - val f_bits = UIntToOH1(f_size, addressBits) - val d_bits = UIntToOH1(d_size, addressBits) - val overlap = ~(~(f_base ^ d_base) | (f_bits | d_bits)) === UInt(0) - when (overlap) { valid(i) := Bool(false) } + when (a.opcode === TLMessages.Get) { + printf(log + " G 0x%x - 0%x\n", a_base, a_base | UIntToOH1(a_size, addressBits)) } } - when (d_flight.opcode === TLMessages.PutFullData || d_flight.opcode === TLMessages.PutPartialData) { - assert (d.opcode === TLMessages.AccessAck) - printf(log + " ") - when (d_flight.opcode === TLMessages.PutFullData) { printf("pf") } - when (d_flight.opcode === TLMessages.PutPartialData) { printf("pp") } - printf(" 0x%x - 0x%x\n", d_base, d_base | UIntToOH1(d_size, addressBits)) + val a_waddr = Mux(wipe, wipeIndex, a_addr_hi) + for (i <- 0 until beatBytes) { + val data = Wire(new TLRAMModel.ByteMonitor(params)) + val busy = a_inc(i) =/= a_dec(i) + (!a_first).asUInt + val amo = a.opcode === TLMessages.ArithmeticData || a.opcode === TLMessages.LogicalData + data.valid := Mux(wipe, Bool(false), (!busy || a_fifo) && !amo) + // !!! calculate the AMO? 
+ data.value := a.data(8*(i+1)-1, 8*i) + when (shadow_wen(i)) { + shadow(i).write(a_waddr, data) + } } - when (d_flight.opcode === TLMessages.Get || d_flight.opcode === TLMessages.ArithmeticData || d_flight.opcode === TLMessages.LogicalData) { - assert (d.opcode === TLMessages.AccessAckData) - for (i <- 0 until beatBytes) { - val got = d.data(8*(i+1)-1, 8*i) - val shadow = Wire(init = d_shadow(i)) - when (d_mask(i)) { - val d_addr = d_addr_hi << shift | UInt(i) - printf(log + " ") - when (d_flight.opcode === TLMessages.Get) { printf("g ") } - when (d_flight.opcode === TLMessages.ArithmeticData) { printf("a ") } - when (d_flight.opcode === TLMessages.LogicalData) { printf("l ") } - printf(" 0x%x := 0x%x", d_addr, got) - when (!shadow.valid) { - printf(", undefined (uninitialized or prior overlapping puts)\n") - } .elsewhen (d_inc(i) =/= d_dec(i)) { - printf(", undefined (concurrent incomplete puts #%d)\n", d_inc(i) - d_dec(i)) - } .elsewhen (!d_fifo && !d_valid) { - printf(", undefined (concurrent completed put)\n") - } .otherwise { - printf("\n") - assert (shadow.value === got) + for (i <- 0 until beatBytes) { + val data = Mux(wipe, UInt(0), a_inc_bytes(i) + UInt(1)) + when (inc_bytes_wen(i)) { + inc_bytes(i).write(a_waddr, data) + } + } + + for (i <- 0 until inc_trees.size) { + val data = Mux(wipe, UInt(0), a_inc_trees(i) + UInt(1)) + when (inc_trees_wen(i)) { + inc_trees(i).write(a_waddr >> (i+1), data) + } + } + + // Process D access responses + val d = RegNext(out.d.bits) + val d_fire = Reg(next = out.d.fire(), init = Bool(false)) + val (d_first, d_last, _, d_address_inc) = edge.addr_inc(d, d_fire) + val d_size = edge.size(d) + val d_sizeOH = UIntToOH(d_size) + val d_base = d_flight.base + val d_address = d_base | d_address_inc + val d_addr_hi = edge.addr_hi(d_address) + val d_mask = edge.mask(d_base, d_size) + val d_fifo = edge.manager.hasFifoIdFast(d_flight.base) + + // Grab the concurrency state we need + val d_inc_bytes = inc_bytes.map(_.read(d_addr_hi)) + 
val d_dec_bytes = dec_bytes.map(_.read(d_addr_hi)) + val d_inc_trees = inc_trees.zipWithIndex.map{ case (m, i) => m.read(d_addr_hi >> (i+1)) } + val d_dec_trees = dec_trees.zipWithIndex.map{ case (m, i) => m.read(d_addr_hi >> (i+1)) } + val d_inc_tree = d_inc_trees.fold(UInt(0))(_ + _) + val d_dec_tree = d_dec_trees.fold(UInt(0))(_ + _) + val d_inc = d_inc_bytes.map(_ + d_inc_tree) + val d_dec = d_dec_bytes.map(_ + d_dec_tree) + val d_shadow = shadow.map(_.read(d_addr_hi)) + val d_valid = valid(d.source) + + when (d_fire) { + // Check the response is correct + assert (d_size === d_flight.size) + // addr_lo is allowed to differ + + when (d_flight.opcode === TLMessages.Hint) { + assert (d.opcode === TLMessages.HintAck) + } + + // Decrease the per-byte flight counter for the whole transaction + when (d_last && d_flight.opcode =/= TLMessages.Hint && d_flight.opcode =/= TLMessages.Get) { + when (d_size <= UInt(shift)) { + dec_bytes_wen := d_mask + } + dec_trees_wen := d_sizeOH >> (shift+1) + // NOTE: D channel carries uninterrupted multibeast op, so updating on last is fine + for (i <- 0 until endSourceId) { + // Does this modification overlap a Get? 
=> wipe it's valid + val f_base = flight(i).base + val f_size = flight(i).size + val f_bits = UIntToOH1(f_size, addressBits) + val d_bits = UIntToOH1(d_size, addressBits) + val overlap = ~(~(f_base ^ d_base) | (f_bits | d_bits)) === UInt(0) + when (overlap) { valid(i) := Bool(false) } + } + } + + when (d_flight.opcode === TLMessages.PutFullData || d_flight.opcode === TLMessages.PutPartialData) { + assert (d.opcode === TLMessages.AccessAck) + printf(log + " ") + when (d_flight.opcode === TLMessages.PutFullData) { printf("pf") } + when (d_flight.opcode === TLMessages.PutPartialData) { printf("pp") } + printf(" 0x%x - 0x%x\n", d_base, d_base | UIntToOH1(d_size, addressBits)) + } + + when (d_flight.opcode === TLMessages.Get || d_flight.opcode === TLMessages.ArithmeticData || d_flight.opcode === TLMessages.LogicalData) { + assert (d.opcode === TLMessages.AccessAckData) + for (i <- 0 until beatBytes) { + val got = d.data(8*(i+1)-1, 8*i) + val shadow = Wire(init = d_shadow(i)) + when (d_mask(i)) { + val d_addr = d_addr_hi << shift | UInt(i) + printf(log + " ") + when (d_flight.opcode === TLMessages.Get) { printf("g ") } + when (d_flight.opcode === TLMessages.ArithmeticData) { printf("a ") } + when (d_flight.opcode === TLMessages.LogicalData) { printf("l ") } + printf(" 0x%x := 0x%x", d_addr, got) + when (!shadow.valid) { + printf(", undefined (uninitialized or prior overlapping puts)\n") + } .elsewhen (d_inc(i) =/= d_dec(i)) { + printf(", undefined (concurrent incomplete puts #%d)\n", d_inc(i) - d_dec(i)) + } .elsewhen (!d_fifo && !d_valid) { + printf(", undefined (concurrent completed put)\n") + } .otherwise { + printf("\n") + assert (shadow.value === got) + } } } } } - } - val d_waddr = Mux(wipe, wipeIndex, d_addr_hi) - for (i <- 0 until beatBytes) { - val data = Mux(wipe, UInt(0), d_dec_bytes(i) + UInt(1)) - when (dec_bytes_wen(i)) { - dec_bytes(i).write(d_waddr, data) + val d_waddr = Mux(wipe, wipeIndex, d_addr_hi) + for (i <- 0 until beatBytes) { + val data = 
Mux(wipe, UInt(0), d_dec_bytes(i) + UInt(1)) + when (dec_bytes_wen(i)) { + dec_bytes(i).write(d_waddr, data) + } } - } - for (i <- 0 until dec_trees.size) { - val data = Mux(wipe, UInt(0), d_dec_trees(i) + UInt(1)) - when (dec_trees_wen(i)) { - dec_trees(i).write(d_waddr >> (i+1), data) + for (i <- 0 until dec_trees.size) { + val data = Mux(wipe, UInt(0), d_dec_trees(i) + UInt(1)) + when (dec_trees_wen(i)) { + dec_trees(i).write(d_waddr >> (i+1), data) + } } } } } + +object TLRAMModel +{ + case class MonitorParameters(addressBits: Int, sizeBits: Int) + + class ByteMonitor(params: MonitorParameters) extends GenericParameterizedBundle(params) { + val valid = Bool() + val value = UInt(width = 8) + } + class FlightMonitor(params: MonitorParameters) extends GenericParameterizedBundle(params) { + val base = UInt(width = params.addressBits) + val size = UInt(width = params.sizeBits) + val opcode = UInt(width = 3) + } +} diff --git a/src/main/scala/uncore/tilelink2/RegisterRouter.scala b/src/main/scala/uncore/tilelink2/RegisterRouter.scala index 0508c930..2faa08dc 100644 --- a/src/main/scala/uncore/tilelink2/RegisterRouter.scala +++ b/src/main/scala/uncore/tilelink2/RegisterRouter.scala @@ -9,7 +9,7 @@ import regmapper._ import scala.math.{min,max} class TLRegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int = 4, undefZero: Boolean = true, executable: Boolean = false) - extends TLManagerNode(TLManagerPortParameters( + extends TLManagerNode(Seq(TLManagerPortParameters( Seq(TLManagerParameters( address = Seq(address), executable = executable, @@ -18,7 +18,7 @@ class TLRegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int = supportsPutFull = TransferSizes(1, beatBytes), fifoId = Some(0))), // requests are handled in order beatBytes = beatBytes, - minLatency = min(concurrency, 1))) // the Queue adds at most one cycle + minLatency = min(concurrency, 1)))) // the Queue adds at most one cycle { require (address.contiguous) diff --git 
a/src/main/scala/uncore/tilelink2/SRAM.scala b/src/main/scala/uncore/tilelink2/SRAM.scala index 86c2730b..e9846750 100644 --- a/src/main/scala/uncore/tilelink2/SRAM.scala +++ b/src/main/scala/uncore/tilelink2/SRAM.scala @@ -8,7 +8,7 @@ import diplomacy._ class TLRAM(address: AddressSet, executable: Boolean = true, beatBytes: Int = 4)(implicit p: Parameters) extends LazyModule { - val node = TLManagerNode(TLManagerPortParameters( + val node = TLManagerNode(Seq(TLManagerPortParameters( Seq(TLManagerParameters( address = List(address), regionType = RegionType.UNCACHED, @@ -18,7 +18,7 @@ class TLRAM(address: AddressSet, executable: Boolean = true, beatBytes: Int = 4) supportsPutFull = TransferSizes(1, beatBytes), fifoId = Some(0))), // requests are handled in order beatBytes = beatBytes, - minLatency = 1)) // no bypass needed for this device + minLatency = 1))) // no bypass needed for this device // We require the address range to include an entire beat (for the write mask) require ((address.mask & (beatBytes-1)) == beatBytes-1) diff --git a/src/main/scala/uncore/tilelink2/SourceShrinker.scala b/src/main/scala/uncore/tilelink2/SourceShrinker.scala index 5e4788cf..9d0c164c 100644 --- a/src/main/scala/uncore/tilelink2/SourceShrinker.scala +++ b/src/main/scala/uncore/tilelink2/SourceShrinker.scala @@ -15,8 +15,8 @@ class TLSourceShrinker(maxInFlight: Int)(implicit p: Parameters) extends LazyMod private val client = TLClientParameters(sourceId = IdRange(0, maxInFlight)) val node = TLAdapterNode( // We erase all client information since we crush the source Ids - clientFn = { case _ => TLClientPortParameters(clients = Seq(client)) }, - managerFn = { case Seq(mp) => mp }) + clientFn = { _ => TLClientPortParameters(clients = Seq(client)) }, + managerFn = { mp => mp }) lazy val module = new LazyModuleImp(this) { val io = new Bundle { @@ -24,54 +24,51 @@ class TLSourceShrinker(maxInFlight: Int)(implicit p: Parameters) extends LazyMod val out = node.bundleOut } - val edgeIn = 
node.edgesIn(0) - val edgeOut = node.edgesOut(0) - val in = io.in(0) - val out = io.out(0) + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + // Acquires cannot pass this adapter; it makes Probes impossible + require (!edgeIn.client.anySupportProbe || + !edgeOut.manager.anySupportAcquireB) - // Acquires cannot pass this adapter; it makes Probes impossible - require (!edgeIn.client.anySupportProbe || - !edgeOut.manager.anySupportAcquireB) + out.b.ready := Bool(true) + out.c.valid := Bool(false) + out.e.valid := Bool(false) + in.b.valid := Bool(false) + in.c.ready := Bool(true) + in.e.ready := Bool(true) - out.b.ready := Bool(true) - out.c.valid := Bool(false) - out.e.valid := Bool(false) - in.b.valid := Bool(false) - in.c.ready := Bool(true) - in.e.ready := Bool(true) + if (maxInFlight >= edgeIn.client.endSourceId) { + out.a <> in.a + in.d <> out.d + } else { + // State tracking + val sourceIdMap = Mem(maxInFlight, in.a.bits.source) + val allocated = RegInit(UInt(0, width = maxInFlight)) + val nextFreeOH = ~(leftOR(~allocated) << 1) & ~allocated + val nextFree = OHToUInt(nextFreeOH) + val full = allocated.andR() - if (maxInFlight >= edgeIn.client.endSourceId) { - out.a <> in.a - in.d <> out.d - } else { - // State tracking - val sourceIdMap = Mem(maxInFlight, in.a.bits.source) - val allocated = RegInit(UInt(0, width = maxInFlight)) - val nextFreeOH = ~(leftOR(~allocated) << 1) & ~allocated - val nextFree = OHToUInt(nextFreeOH) - val full = allocated.andR() + val a_first = edgeIn.first(in.a) + val d_last = edgeIn.last(in.d) - val a_first = edgeIn.first(in.a) - val d_last = edgeIn.last(in.d) + val block = a_first && full + in.a.ready := out.a.ready && !block + out.a.valid := in.a.valid && !block + out.a.bits := in.a.bits + out.a.bits.source := holdUnless(nextFree, a_first) - val block = a_first && full - in.a.ready := out.a.ready && !block - out.a.valid := in.a.valid && !block - out.a.bits := in.a.bits - 
out.a.bits.source := holdUnless(nextFree, a_first) + in.d <> out.d + in.d.bits.source := sourceIdMap(out.d.bits.source) - in.d <> out.d - in.d.bits.source := sourceIdMap(out.d.bits.source) + when (a_first && in.a.fire()) { + sourceIdMap(nextFree) := in.a.bits.source + } - when (a_first && in.a.fire()) { - sourceIdMap(nextFree) := in.a.bits.source + val alloc = a_first && in.a.fire() + val free = d_last && in.d.fire() + val alloc_id = Mux(alloc, nextFreeOH, UInt(0)) + val free_id = Mux(free, UIntToOH(out.d.bits.source), UInt(0)) + allocated := (allocated | alloc_id) & ~free_id } - - val alloc = a_first && in.a.fire() - val free = d_last && in.d.fire() - val alloc_id = Mux(alloc, nextFreeOH, UInt(0)) - val free_id = Mux(free, UIntToOH(out.d.bits.source), UInt(0)) - allocated := (allocated | alloc_id) & ~free_id } } } diff --git a/src/main/scala/uncore/tilelink2/ToAHB.scala b/src/main/scala/uncore/tilelink2/ToAHB.scala index 7ceb86f7..4d7f8c81 100644 --- a/src/main/scala/uncore/tilelink2/ToAHB.scala +++ b/src/main/scala/uncore/tilelink2/ToAHB.scala @@ -10,12 +10,12 @@ import uncore.ahb._ import scala.math.{min, max} import AHBParameters._ -case class TLToAHBNode() extends MixedNode(TLImp, AHBImp)( - dFn = { case (1, Seq(TLClientPortParameters(clients, unsafeAtomics, minLatency))) => +case class TLToAHBNode() extends MixedAdapterNode(TLImp, AHBImp)( + dFn = { case TLClientPortParameters(clients, unsafeAtomics, minLatency) => val masters = clients.map { case c => AHBMasterParameters(nodePath = c.nodePath) } - Seq(AHBMasterPortParameters(masters)) + AHBMasterPortParameters(masters) }, - uFn = { case (1, Seq(AHBSlavePortParameters(slaves, beatBytes))) => + uFn = { case AHBSlavePortParameters(slaves, beatBytes) => val managers = slaves.map { case s => TLManagerParameters( address = s.address, @@ -26,10 +26,8 @@ case class TLToAHBNode() extends MixedNode(TLImp, AHBImp)( supportsPutFull = s.supportsWrite, // but not PutPartial fifoId = Some(0)) // a common FIFO domain } - 
Seq(TLManagerPortParameters(managers, beatBytes, 1, 1)) - }, - numPO = 1 to 1, - numPI = 1 to 1) + TLManagerPortParameters(managers, beatBytes, 1, 1) + }) class TLToAHB(combinational: Boolean = true)(implicit p: Parameters) extends LazyModule { @@ -41,91 +39,89 @@ class TLToAHB(combinational: Boolean = true)(implicit p: Parameters) extends Laz val out = node.bundleOut } - val in = io.in(0) - val out = io.out(0) - val edgeIn = node.edgesIn(0) - val edgeOut = node.edgesOut(0) - val beatBytes = edgeOut.slave.beatBytes - val maxTransfer = edgeOut.slave.maxTransfer - val lgMax = log2Ceil(maxTransfer) - val lgBytes = log2Ceil(beatBytes) + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + val beatBytes = edgeOut.slave.beatBytes + val maxTransfer = edgeOut.slave.maxTransfer + val lgMax = log2Ceil(maxTransfer) + val lgBytes = log2Ceil(beatBytes) - // AHB has no cache coherence - in.b.valid := Bool(false) - in.c.ready := Bool(true) - in.e.ready := Bool(true) + // AHB has no cache coherence + in.b.valid := Bool(false) + in.c.ready := Bool(true) + in.e.ready := Bool(true) - // We need a skidpad to capture D output: - // We cannot know if the D response will be accepted until we have - // presented it on D as valid. We also can't back-pressure AHB in the - // data phase. Therefore, we must have enough space to save the data - // phase result. Whenever we have a queued response, we can not allow - // AHB to present new responses, so we must quash the address phase. - val d = Wire(in.d) - in.d <> Queue(d, 1, flow = true) - val a_quash = in.d.valid && !in.d.ready + // We need a skidpad to capture D output: + // We cannot know if the D response will be accepted until we have + // presented it on D as valid. We also can't back-pressure AHB in the + // data phase. Therefore, we must have enough space to save the data + // phase result. 
Whenever we have a queued response, we can not allow + // AHB to present new responses, so we must quash the address phase. + val d = Wire(in.d) + in.d <> Queue(d, 1, flow = true) + val a_quash = in.d.valid && !in.d.ready - // Record what is coming out in d_phase - val d_valid = RegInit(Bool(false)) - val d_hasData = Reg(Bool()) - val d_error = Reg(Bool()) - val d_addr_lo = Reg(UInt(width = lgBytes)) - val d_source = Reg(UInt()) - val d_size = Reg(UInt()) + // Record what is coming out in d_phase + val d_valid = RegInit(Bool(false)) + val d_hasData = Reg(Bool()) + val d_error = Reg(Bool()) + val d_addr_lo = Reg(UInt(width = lgBytes)) + val d_source = Reg(UInt()) + val d_size = Reg(UInt()) - when (out.hreadyout) { d_error := d_error || out.hresp } - when (d.fire()) { d_valid := Bool(false) } + when (out.hreadyout) { d_error := d_error || out.hresp } + when (d.fire()) { d_valid := Bool(false) } - d.valid := d_valid && out.hreadyout - d.bits := edgeIn.AccessAck(d_addr_lo, UInt(0), d_source, d_size, out.hrdata, out.hresp || d_error) - d.bits.opcode := Mux(d_hasData, TLMessages.AccessAckData, TLMessages.AccessAck) + d.valid := d_valid && out.hreadyout + d.bits := edgeIn.AccessAck(d_addr_lo, UInt(0), d_source, d_size, out.hrdata, out.hresp || d_error) + d.bits.opcode := Mux(d_hasData, TLMessages.AccessAckData, TLMessages.AccessAck) - // We need an irrevocable input for AHB to stall on read bursts - // We also need the values to NOT change when valid goes low => 1 entry only - val a = Queue(in.a, 1, flow = combinational, pipe = !combinational) - val a_valid = a.valid && !a_quash + // We need an irrevocable input for AHB to stall on read bursts + // We also need the values to NOT change when valid goes low => 1 entry only + val a = Queue(in.a, 1, flow = combinational, pipe = !combinational) + val a_valid = a.valid && !a_quash - // This is lot like TLEdge.firstlast, but counts beats also for single-beat TL types - val a_size = edgeIn.size(a.bits) - val a_beats1 = 
UIntToOH1(a_size, lgMax) >> lgBytes - val a_counter = RegInit(UInt(0, width = log2Up(maxTransfer/beatBytes))) - val a_counter1 = a_counter - UInt(1) - val a_first = a_counter === UInt(0) - val a_last = a_counter === UInt(1) || a_beats1 === UInt(0) - val a_offset = (a_beats1 & ~a_counter1) << lgBytes - val a_hasData = edgeIn.hasData(a.bits) + // This is lot like TLEdge.firstlast, but counts beats also for single-beat TL types + val a_size = edgeIn.size(a.bits) + val a_beats1 = UIntToOH1(a_size, lgMax) >> lgBytes + val a_counter = RegInit(UInt(0, width = log2Up(maxTransfer/beatBytes))) + val a_counter1 = a_counter - UInt(1) + val a_first = a_counter === UInt(0) + val a_last = a_counter === UInt(1) || a_beats1 === UInt(0) + val a_offset = (a_beats1 & ~a_counter1) << lgBytes + val a_hasData = edgeIn.hasData(a.bits) - // Expand no-data A-channel requests into multiple beats - a.ready := (a_hasData || a_last) && out.hreadyout && !a_quash - when (a_valid && out.hreadyout) { - a_counter := Mux(a_first, a_beats1, a_counter1) - d_valid := !a_hasData || a_last - // Record what will be in the data phase - when (a_first) { - d_hasData := !a_hasData - d_error := Bool(false) - d_addr_lo := a.bits.address - d_source := a.bits.source - d_size := a.bits.size + // Expand no-data A-channel requests into multiple beats + a.ready := (a_hasData || a_last) && out.hreadyout && !a_quash + when (a_valid && out.hreadyout) { + a_counter := Mux(a_first, a_beats1, a_counter1) + d_valid := !a_hasData || a_last + // Record what will be in the data phase + when (a_first) { + d_hasData := !a_hasData + d_error := Bool(false) + d_addr_lo := a.bits.address + d_source := a.bits.source + d_size := a.bits.size + } } + + // Transform TL size into AHB hsize+hburst + val a_size_bits = a_size.getWidth + val a_sizeDelta = Cat(UInt(0, width = 1), a_size) - UInt(lgBytes+1) + val a_singleBeat = a_sizeDelta(a_size_bits) + val a_logBeats1 = a_sizeDelta(a_size_bits-1, 0) + + out.hmastlock := Bool(false) // for now + 
out.htrans := Mux(a_valid, Mux(a_first, TRANS_NONSEQ, TRANS_SEQ), Mux(a_first, TRANS_IDLE, TRANS_BUSY)) + out.hsel := a_valid || !a_first + out.hready := out.hreadyout + out.hwrite := a_hasData + out.haddr := a.bits.address | a_offset + out.hsize := Mux(a_singleBeat, a.bits.size, UInt(lgBytes)) + out.hburst := Mux(a_singleBeat, BURST_SINGLE, (a_logBeats1<<1) | UInt(1)) + out.hprot := PROT_DEFAULT + out.hwdata := RegEnable(a.bits.data, a.fire()) } - - // Transform TL size into AHB hsize+hburst - val a_size_bits = a_size.getWidth - val a_sizeDelta = Cat(UInt(0, width = 1), a_size) - UInt(lgBytes+1) - val a_singleBeat = a_sizeDelta(a_size_bits) - val a_logBeats1 = a_sizeDelta(a_size_bits-1, 0) - - out.hmastlock := Bool(false) // for now - out.htrans := Mux(a_valid, Mux(a_first, TRANS_NONSEQ, TRANS_SEQ), Mux(a_first, TRANS_IDLE, TRANS_BUSY)) - out.hsel := a_valid || !a_first - out.hready := out.hreadyout - out.hwrite := a_hasData - out.haddr := a.bits.address | a_offset - out.hsize := Mux(a_singleBeat, a.bits.size, UInt(lgBytes)) - out.hburst := Mux(a_singleBeat, BURST_SINGLE, (a_logBeats1<<1) | UInt(1)) - out.hprot := PROT_DEFAULT - out.hwdata := RegEnable(a.bits.data, a.fire()) } } diff --git a/src/main/scala/uncore/tilelink2/ToAPB.scala b/src/main/scala/uncore/tilelink2/ToAPB.scala index 218892f6..c2966751 100644 --- a/src/main/scala/uncore/tilelink2/ToAPB.scala +++ b/src/main/scala/uncore/tilelink2/ToAPB.scala @@ -10,12 +10,12 @@ import uncore.apb._ import scala.math.{min, max} import APBParameters._ -case class TLToAPBNode() extends MixedNode(TLImp, APBImp)( - dFn = { case (1, Seq(TLClientPortParameters(clients, unsafeAtomics, minLatency))) => +case class TLToAPBNode() extends MixedAdapterNode(TLImp, APBImp)( + dFn = { case TLClientPortParameters(clients, unsafeAtomics, minLatency) => val masters = clients.map { case c => APBMasterParameters(nodePath = c.nodePath) } - Seq(APBMasterPortParameters(masters)) + APBMasterPortParameters(masters) }, - uFn = { case (1, 
Seq(APBSlavePortParameters(slaves, beatBytes))) => + uFn = { case APBSlavePortParameters(slaves, beatBytes) => val managers = slaves.map { case s => TLManagerParameters( address = s.address, @@ -27,10 +27,8 @@ case class TLToAPBNode() extends MixedNode(TLImp, APBImp)( supportsPutFull = if (s.supportsWrite) TransferSizes(1, beatBytes) else TransferSizes.none, fifoId = Some(0)) // a common FIFO domain } - Seq(TLManagerPortParameters(managers, beatBytes, 1, 0)) - }, - numPO = 1 to 1, - numPI = 1 to 1) + TLManagerPortParameters(managers, beatBytes, 1, 0) + }) class TLToAPB(combinational: Boolean = true)(implicit p: Parameters) extends LazyModule { @@ -42,51 +40,49 @@ class TLToAPB(combinational: Boolean = true)(implicit p: Parameters) extends Laz val out = node.bundleOut } - val in = io.in(0) - val out = io.out(0) - val edgeIn = node.edgesIn(0) - val edgeOut = node.edgesOut(0) - val beatBytes = edgeOut.slave.beatBytes - val lgBytes = log2Ceil(beatBytes) + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + val beatBytes = edgeOut.slave.beatBytes + val lgBytes = log2Ceil(beatBytes) - // APB has no cache coherence - in.b.valid := Bool(false) - in.c.ready := Bool(true) - in.e.ready := Bool(true) + // APB has no cache coherence + in.b.valid := Bool(false) + in.c.ready := Bool(true) + in.e.ready := Bool(true) - // We need a skidpad to capture D output: - // We cannot know if the D response will be accepted until we have - // presented it on D as valid. We also can't back-pressure APB in the - // data phase. Therefore, we must have enough space to save the data - // phase result. Whenever we have a queued response, we can not allow - // APB to present new responses, so we must quash the address phase. - val d = Wire(in.d) - in.d <> Queue(d, 1, flow = true) + // We need a skidpad to capture D output: + // We cannot know if the D response will be accepted until we have + // presented it on D as valid. 
We also can't back-pressure APB in the + // data phase. Therefore, we must have enough space to save the data + // phase result. Whenever we have a queued response, we can not allow + // APB to present new responses, so we must quash the address phase. + val d = Wire(in.d) + in.d <> Queue(d, 1, flow = true) - // We need an irrevocable input for APB to stall - val a = Queue(in.a, 1, flow = combinational, pipe = !combinational) + // We need an irrevocable input for APB to stall + val a = Queue(in.a, 1, flow = combinational, pipe = !combinational) - val a_enable = RegInit(Bool(false)) - val a_sel = a.valid && RegNext(!in.d.valid || in.d.ready) - val a_write = edgeIn.hasData(a.bits) + val a_enable = RegInit(Bool(false)) + val a_sel = a.valid && RegNext(!in.d.valid || in.d.ready) + val a_write = edgeIn.hasData(a.bits) - when (a_sel) { a_enable := Bool(true) } - when (d.fire()) { a_enable := Bool(false) } + when (a_sel) { a_enable := Bool(true) } + when (d.fire()) { a_enable := Bool(false) } - out.psel := a_sel - out.penable := a_enable - out.pwrite := a_write - out.paddr := a.bits.address - out.pprot := PROT_DEFAULT - out.pwdata := a.bits.data - out.pstrb := Mux(a_write, a.bits.mask, UInt(0)) + out.psel := a_sel + out.penable := a_enable + out.pwrite := a_write + out.paddr := a.bits.address + out.pprot := PROT_DEFAULT + out.pwdata := a.bits.data + out.pstrb := Mux(a_write, a.bits.mask, UInt(0)) - a.ready := a_enable && out.pready - d.valid := a_enable && out.pready - assert (!d.valid || d.ready) + a.ready := a_enable && out.pready + d.valid := a_enable && out.pready + assert (!d.valid || d.ready) - d.bits := edgeIn.AccessAck(a.bits, UInt(0), out.prdata, out.pslverr) - d.bits.opcode := Mux(a_write, TLMessages.AccessAck, TLMessages.AccessAckData) + d.bits := edgeIn.AccessAck(a.bits, UInt(0), out.prdata, out.pslverr) + d.bits.opcode := Mux(a_write, TLMessages.AccessAck, TLMessages.AccessAckData) + } } } diff --git a/src/main/scala/uncore/tilelink2/ToAXI4.scala 
b/src/main/scala/uncore/tilelink2/ToAXI4.scala index 3cad2a31..c0b7a6c8 100644 --- a/src/main/scala/uncore/tilelink2/ToAXI4.scala +++ b/src/main/scala/uncore/tilelink2/ToAXI4.scala @@ -10,16 +10,16 @@ import util.PositionalMultiQueue import uncore.axi4._ import scala.math.{min, max} -case class TLToAXI4Node(idBits: Int) extends MixedNode(TLImp, AXI4Imp)( - dFn = { case (1, _) => +case class TLToAXI4Node(idBits: Int) extends MixedAdapterNode(TLImp, AXI4Imp)( + dFn = { _ => // We must erase all client information, because we crush their source Ids val masters = Seq( AXI4MasterParameters( id = IdRange(0, 1 << idBits), aligned = true)) - Seq(AXI4MasterPortParameters(masters)) + AXI4MasterPortParameters(masters) }, - uFn = { case (1, Seq(p)) => Seq(TLManagerPortParameters( + uFn = { p => TLManagerPortParameters( managers = p.slaves.map { case s => TLManagerParameters( address = s.address, @@ -31,10 +31,8 @@ case class TLToAXI4Node(idBits: Int) extends MixedNode(TLImp, AXI4Imp)( supportsPutPartial = s.supportsWrite)}, // AXI4 is NEVER fifo in TL sense (R+W are independent) beatBytes = p.beatBytes, - minLatency = p.minLatency)) - }, - numPO = 1 to 1, - numPI = 1 to 1) + minLatency = p.minLatency) + }) class TLToAXI4(idBits: Int, combinational: Boolean = true)(implicit p: Parameters) extends LazyModule { @@ -46,185 +44,182 @@ class TLToAXI4(idBits: Int, combinational: Boolean = true)(implicit p: Parameter val out = node.bundleOut } - val in = io.in(0) - val out = io.out(0) + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + val slaves = edgeOut.slave.slaves - val edgeIn = node.edgesIn(0) - val edgeOut = node.edgesOut(0) - val slaves = edgeOut.slave.slaves + // All pairs of slaves must promise that they will never interleave data + require (slaves(0).interleavedId.isDefined) + slaves.foreach { s => require (s.interleavedId == slaves(0).interleavedId) } - // All pairs of slaves must promise that they will never 
interleave data - require (slaves(0).interleavedId.isDefined) - slaves.foreach { s => require (s.interleavedId == slaves(0).interleavedId) } + // We need to ensure that a slave does not stall trying to send B while we need to receive R + // Since R&W have independent flow control, it is possible for a W to cut in-line and get into + // a slave's buffers, preventing us from getting all the R responses we need to release D for B. + // This risk is compounded by an AXI fragmentation. Even a slave which responds completely to + // AR before working on AW might have an AW slipped between two AR fragments. + val out_b = Queue.irrevocable(out.b, entries=edgeIn.client.endSourceId, flow=combinational) - // We need to ensure that a slave does not stall trying to send B while we need to receive R - // Since R&W have independent flow control, it is possible for a W to cut in-line and get into - // a slave's buffers, preventing us from getting all the R responses we need to release D for B. - // This risk is compounded by an AXI fragmentation. Even a slave which responds completely to - // AR before working on AW might have an AW slipped between two AR fragments. - val out_b = Queue.irrevocable(out.b, entries=edgeIn.client.endSourceId, flow=combinational) + // We need to keep the following state from A => D: (addr_lo, size, source) + // All of those fields could potentially require 0 bits (argh. Chisel.) + // We will pack as many of the lowest bits of state as fit into the AXI ID. + // Any bits left-over must be put into a bank of Queues. + // The Queues are indexed by as many of the source bits as fit into the AXI ID. + // The Queues are deep enough that every source has guaranteed space in its Queue. - // We need to keep the following state from A => D: (addr_lo, size, source) - // All of those fields could potentially require 0 bits (argh. Chisel.) - // We will pack as many of the lowest bits of state as fit into the AXI ID. 
- // Any bits left-over must be put into a bank of Queues. - // The Queues are indexed by as many of the source bits as fit into the AXI ID. - // The Queues are deep enough that every source has guaranteed space in its Queue. + val sourceBits = log2Ceil(edgeIn.client.endSourceId) + val sizeBits = log2Ceil(edgeIn.maxLgSize+1) + val addrBits = log2Ceil(edgeIn.manager.beatBytes) + val stateBits = addrBits + sizeBits + sourceBits // could be 0 - val sourceBits = log2Ceil(edgeIn.client.endSourceId) - val sizeBits = log2Ceil(edgeIn.maxLgSize+1) - val addrBits = log2Ceil(edgeIn.manager.beatBytes) - val stateBits = addrBits + sizeBits + sourceBits // could be 0 + val a_address = edgeIn.address(in.a.bits) + val a_addr_lo = edgeIn.addr_lo(a_address) + val a_source = in.a.bits.source + val a_size = edgeIn.size(in.a.bits) + val a_isPut = edgeIn.hasData(in.a.bits) + val a_last = edgeIn.last(in.a) - val a_address = edgeIn.address(in.a.bits) - val a_addr_lo = edgeIn.addr_lo(a_address) - val a_source = in.a.bits.source - val a_size = edgeIn.size(in.a.bits) - val a_isPut = edgeIn.hasData(in.a.bits) - val a_last = edgeIn.last(in.a) + // Make sure the fields are within the bounds we assumed + assert (a_source < UInt(BigInt(1) << sourceBits)) + assert (a_size < UInt(BigInt(1) << sizeBits)) + assert (a_addr_lo < UInt(BigInt(1) << addrBits)) - // Make sure the fields are within the bounds we assumed - assert (a_source < UInt(BigInt(1) << sourceBits)) - assert (a_size < UInt(BigInt(1) << sizeBits)) - assert (a_addr_lo < UInt(BigInt(1) << addrBits)) + // Carefully pack/unpack fields into the state we send + val baseEnd = 0 + val (sourceEnd, sourceOff) = (sourceBits + baseEnd, baseEnd) + val (sizeEnd, sizeOff) = (sizeBits + sourceEnd, sourceEnd) + val (addrEnd, addrOff) = (addrBits + sizeEnd, sizeEnd) + require (addrEnd == stateBits) - // Carefully pack/unpack fields into the state we send - val baseEnd = 0 - val (sourceEnd, sourceOff) = (sourceBits + baseEnd, baseEnd) - val (sizeEnd, 
sizeOff) = (sizeBits + sourceEnd, sourceEnd) - val (addrEnd, addrOff) = (addrBits + sizeEnd, sizeEnd) - require (addrEnd == stateBits) + val a_state = (a_source << sourceOff) | (a_size << sizeOff) | (a_addr_lo << addrOff) + val a_id = if (idBits == 0) UInt(0) else a_state - val a_state = (a_source << sourceOff) | (a_size << sizeOff) | (a_addr_lo << addrOff) - val a_id = if (idBits == 0) UInt(0) else a_state + val r_state = Wire(UInt(width = stateBits)) + val r_source = if (sourceBits > 0) r_state(sourceEnd-1, sourceOff) else UInt(0) + val r_size = if (sizeBits > 0) r_state(sizeEnd -1, sizeOff) else UInt(0) + val r_addr_lo = if (addrBits > 0) r_state(addrEnd -1, addrOff) else UInt(0) - val r_state = Wire(UInt(width = stateBits)) - val r_source = if (sourceBits > 0) r_state(sourceEnd-1, sourceOff) else UInt(0) - val r_size = if (sizeBits > 0) r_state(sizeEnd -1, sizeOff) else UInt(0) - val r_addr_lo = if (addrBits > 0) r_state(addrEnd -1, addrOff) else UInt(0) + val b_state = Wire(UInt(width = stateBits)) + val b_source = if (sourceBits > 0) b_state(sourceEnd-1, sourceOff) else UInt(0) + val b_size = if (sizeBits > 0) b_state(sizeEnd -1, sizeOff) else UInt(0) + val b_addr_lo = if (addrBits > 0) b_state(addrEnd -1, addrOff) else UInt(0) - val b_state = Wire(UInt(width = stateBits)) - val b_source = if (sourceBits > 0) b_state(sourceEnd-1, sourceOff) else UInt(0) - val b_size = if (sizeBits > 0) b_state(sizeEnd -1, sizeOff) else UInt(0) - val b_addr_lo = if (addrBits > 0) b_state(addrEnd -1, addrOff) else UInt(0) + val r_last = out.r.bits.last + val r_id = out.r.bits.id + val b_id = out_b.bits.id - val r_last = out.r.bits.last - val r_id = out.r.bits.id - val b_id = out_b.bits.id + if (stateBits <= idBits) { // No need for any state tracking + r_state := r_id + b_state := b_id + } else { + val bankIndexBits = min(sourceBits, idBits) + val posBits = max(0, sourceBits - idBits) + val implicitBits = max(idBits, sourceBits) + val bankBits = stateBits - implicitBits + val 
numBanks = min(1 << bankIndexBits, edgeIn.client.endSourceId) + def bankEntries(i: Int) = (edgeIn.client.endSourceId+numBanks-i-1) / numBanks - if (stateBits <= idBits) { // No need for any state tracking - r_state := r_id - b_state := b_id - } else { - val bankIndexBits = min(sourceBits, idBits) - val posBits = max(0, sourceBits - idBits) - val implicitBits = max(idBits, sourceBits) - val bankBits = stateBits - implicitBits - val numBanks = min(1 << bankIndexBits, edgeIn.client.endSourceId) - def bankEntries(i: Int) = (edgeIn.client.endSourceId+numBanks-i-1) / numBanks + val banks = Seq.tabulate(numBanks) { i => + // We know there can only be as many outstanding requests as TL sources + // However, AXI read and write queues are not mutually FIFO. + // Therefore, we want to pop them individually, but share the storage. + val bypass = combinational && edgeOut.slave.minLatency == 0 + PositionalMultiQueue(UInt(width=max(1,bankBits)), positions=bankEntries(i), ways=2, combinational=bypass) + } - val banks = Seq.tabulate(numBanks) { i => - // We know there can only be as many outstanding requests as TL sources - // However, AXI read and write queues are not mutually FIFO. - // Therefore, we want to pop them individually, but share the storage. 
- val bypass = combinational && edgeOut.slave.minLatency == 0 - PositionalMultiQueue(UInt(width=max(1,bankBits)), positions=bankEntries(i), ways=2, combinational=bypass) + val a_bankPosition = if (posBits == 0) UInt(0) else a_source(sourceBits-1, idBits) + val a_bankIndex = if (bankIndexBits == 0) UInt(0) else a_source(bankIndexBits-1, 0) + val r_bankIndex = if (bankIndexBits == 0) UInt(0) else r_id(bankIndexBits-1, 0) + val b_bankIndex = if (bankIndexBits == 0) UInt(0) else b_id(bankIndexBits-1, 0) + val a_bankSelect = UIntToOH(a_bankIndex, numBanks) + val r_bankSelect = UIntToOH(r_bankIndex, numBanks) + val b_bankSelect = UIntToOH(b_bankIndex, numBanks) + + banks.zipWithIndex.foreach { case (q, i) => + // Push a_state into the banks + q.io.enq.valid := in.a.fire() && a_last && a_bankSelect(i) + q.io.enq.bits.pos := a_bankPosition + q.io.enq.bits.data := a_state >> implicitBits + q.io.enq.bits.way := Mux(a_isPut, UInt(0), UInt(1)) + // Pop the bank's ways + q.io.deq(0).ready := out_b.fire() && b_bankSelect(i) + q.io.deq(1).ready := out.r.fire() && r_bankSelect(i) && r_last + // The FIFOs must be valid when we're ready to pop them... 
+ assert (q.io.deq(0).valid || !q.io.deq(0).ready) + assert (q.io.deq(1).valid || !q.io.deq(1).ready) + } + + val b_bankData = Vec(banks.map(_.io.deq(0).bits.data))(b_bankIndex) + val b_bankPos = Vec(banks.map(_.io.deq(0).bits.pos ))(b_bankIndex) + val r_bankData = Vec(banks.map(_.io.deq(1).bits.data))(r_bankIndex) + val r_bankPos = Vec(banks.map(_.io.deq(1).bits.pos ))(r_bankIndex) + + def optCat(x: (Boolean, UInt)*) = { Cat(x.toList.filter(_._1).map(_._2)) } + b_state := optCat((bankBits > 0, b_bankData), (posBits > 0, b_bankPos), (idBits > 0, b_id)) + r_state := optCat((bankBits > 0, r_bankData), (posBits > 0, r_bankPos), (idBits > 0, r_id)) } - val a_bankPosition = if (posBits == 0) UInt(0) else a_source(sourceBits-1, idBits) - val a_bankIndex = if (bankIndexBits == 0) UInt(0) else a_source(bankIndexBits-1, 0) - val r_bankIndex = if (bankIndexBits == 0) UInt(0) else r_id(bankIndexBits-1, 0) - val b_bankIndex = if (bankIndexBits == 0) UInt(0) else b_id(bankIndexBits-1, 0) - val a_bankSelect = UIntToOH(a_bankIndex, numBanks) - val r_bankSelect = UIntToOH(r_bankIndex, numBanks) - val b_bankSelect = UIntToOH(b_bankIndex, numBanks) + // We need these Queues because AXI4 queues are irrevocable + val depth = if (combinational) 1 else 2 + val out_arw = Wire(Decoupled(new AXI4BundleARW(out.params))) + val out_w = Wire(out.w) + out.w <> Queue.irrevocable(out_w, entries=depth, flow=combinational) + val queue_arw = Queue.irrevocable(out_arw, entries=depth, flow=combinational) - banks.zipWithIndex.foreach { case (q, i) => - // Push a_state into the banks - q.io.enq.valid := in.a.fire() && a_last && a_bankSelect(i) - q.io.enq.bits.pos := a_bankPosition - q.io.enq.bits.data := a_state >> implicitBits - q.io.enq.bits.way := Mux(a_isPut, UInt(0), UInt(1)) - // Pop the bank's ways - q.io.deq(0).ready := out_b.fire() && b_bankSelect(i) - q.io.deq(1).ready := out.r.fire() && r_bankSelect(i) && r_last - // The FIFOs must be valid when we're ready to pop them... 
- assert (q.io.deq(0).valid || !q.io.deq(0).ready) - assert (q.io.deq(1).valid || !q.io.deq(1).ready) - } + // Fan out the ARW channel to AR and AW + out.ar.bits := queue_arw.bits + out.aw.bits := queue_arw.bits + out.ar.valid := queue_arw.valid && !queue_arw.bits.wen + out.aw.valid := queue_arw.valid && queue_arw.bits.wen + queue_arw.ready := Mux(queue_arw.bits.wen, out.aw.ready, out.ar.ready) - val b_bankData = Vec(banks.map(_.io.deq(0).bits.data))(b_bankIndex) - val b_bankPos = Vec(banks.map(_.io.deq(0).bits.pos ))(b_bankIndex) - val r_bankData = Vec(banks.map(_.io.deq(1).bits.data))(r_bankIndex) - val r_bankPos = Vec(banks.map(_.io.deq(1).bits.pos ))(r_bankIndex) + val beatBytes = edgeIn.manager.beatBytes + val maxSize = UInt(log2Ceil(beatBytes)) + val doneAW = RegInit(Bool(false)) + when (in.a.fire()) { doneAW := !a_last } - def optCat(x: (Boolean, UInt)*) = { Cat(x.toList.filter(_._1).map(_._2)) } - b_state := optCat((bankBits > 0, b_bankData), (posBits > 0, b_bankPos), (idBits > 0, b_id)) - r_state := optCat((bankBits > 0, r_bankData), (posBits > 0, r_bankPos), (idBits > 0, r_id)) + val arw = out_arw.bits + arw.wen := a_isPut + arw.id := a_id // truncated + arw.addr := a_address + arw.len := UIntToOH1(a_size, AXI4Parameters.lenBits + log2Ceil(beatBytes)) >> log2Ceil(beatBytes) + arw.size := Mux(a_size >= maxSize, maxSize, a_size) + arw.burst := AXI4Parameters.BURST_INCR + arw.lock := UInt(0) // not exclusive (LR/SC unsupported b/c no forward progress guarantee) + arw.cache := UInt(0) // do not allow AXI to modify our transactions + arw.prot := AXI4Parameters.PROT_PRIVILEDGED + arw.qos := UInt(0) // no QoS + + in.a.ready := Mux(a_isPut, (doneAW || out_arw.ready) && out_w.ready, out_arw.ready) + out_arw.valid := in.a.valid && Mux(a_isPut, !doneAW && out_w.ready, Bool(true)) + + out_w.valid := in.a.valid && a_isPut && (doneAW || out_arw.ready) + out_w.bits.data := in.a.bits.data + out_w.bits.strb := in.a.bits.mask + out_w.bits.last := a_last + + // R and B => D 
arbitration + val r_holds_d = RegInit(Bool(false)) + when (out.r.fire()) { r_holds_d := !out.r.bits.last } + // Give R higher priority than B + val r_wins = out.r.valid || r_holds_d + + out.r.ready := in.d.ready + out_b.ready := in.d.ready && !r_wins + in.d.valid := Mux(r_wins, out.r.valid, out_b.valid) + + val r_error = out.r.bits.resp =/= AXI4Parameters.RESP_OKAY + val b_error = out_b.bits.resp =/= AXI4Parameters.RESP_OKAY + + val r_d = edgeIn.AccessAck(r_addr_lo, UInt(0), r_source, r_size, UInt(0), r_error) + val b_d = edgeIn.AccessAck(b_addr_lo, UInt(0), b_source, b_size, b_error) + + in.d.bits := Mux(r_wins, r_d, b_d) + in.d.bits.data := out.r.bits.data // avoid a costly Mux + + // Tie off unused channels + in.b.valid := Bool(false) + in.c.ready := Bool(true) + in.e.ready := Bool(true) } - - // We need these Queues because AXI4 queues are irrevocable - val depth = if (combinational) 1 else 2 - val out_arw = Wire(Decoupled(new AXI4BundleARW(out.params))) - val out_w = Wire(out.w) - out.w <> Queue.irrevocable(out_w, entries=depth, flow=combinational) - val queue_arw = Queue.irrevocable(out_arw, entries=depth, flow=combinational) - - // Fan out the ARW channel to AR and AW - out.ar.bits := queue_arw.bits - out.aw.bits := queue_arw.bits - out.ar.valid := queue_arw.valid && !queue_arw.bits.wen - out.aw.valid := queue_arw.valid && queue_arw.bits.wen - queue_arw.ready := Mux(queue_arw.bits.wen, out.aw.ready, out.ar.ready) - - val beatBytes = edgeIn.manager.beatBytes - val maxSize = UInt(log2Ceil(beatBytes)) - val doneAW = RegInit(Bool(false)) - when (in.a.fire()) { doneAW := !a_last } - - val arw = out_arw.bits - arw.wen := a_isPut - arw.id := a_id // truncated - arw.addr := a_address - arw.len := UIntToOH1(a_size, AXI4Parameters.lenBits + log2Ceil(beatBytes)) >> log2Ceil(beatBytes) - arw.size := Mux(a_size >= maxSize, maxSize, a_size) - arw.burst := AXI4Parameters.BURST_INCR - arw.lock := UInt(0) // not exclusive (LR/SC unsupported b/c no forward progress guarantee) 
- arw.cache := UInt(0) // do not allow AXI to modify our transactions - arw.prot := AXI4Parameters.PROT_PRIVILEDGED - arw.qos := UInt(0) // no QoS - - in.a.ready := Mux(a_isPut, (doneAW || out_arw.ready) && out_w.ready, out_arw.ready) - out_arw.valid := in.a.valid && Mux(a_isPut, !doneAW && out_w.ready, Bool(true)) - - out_w.valid := in.a.valid && a_isPut && (doneAW || out_arw.ready) - out_w.bits.data := in.a.bits.data - out_w.bits.strb := in.a.bits.mask - out_w.bits.last := a_last - - // R and B => D arbitration - val r_holds_d = RegInit(Bool(false)) - when (out.r.fire()) { r_holds_d := !out.r.bits.last } - // Give R higher priority than B - val r_wins = out.r.valid || r_holds_d - - out.r.ready := in.d.ready - out_b.ready := in.d.ready && !r_wins - in.d.valid := Mux(r_wins, out.r.valid, out_b.valid) - - val r_error = out.r.bits.resp =/= AXI4Parameters.RESP_OKAY - val b_error = out_b.bits.resp =/= AXI4Parameters.RESP_OKAY - - val r_d = edgeIn.AccessAck(r_addr_lo, UInt(0), r_source, r_size, UInt(0), r_error) - val b_d = edgeIn.AccessAck(b_addr_lo, UInt(0), b_source, b_size, b_error) - - in.d.bits := Mux(r_wins, r_d, b_d) - in.d.bits.data := out.r.bits.data // avoid a costly Mux - - // Tie off unused channels - in.b.valid := Bool(false) - in.c.ready := Bool(true) - in.e.ready := Bool(true) } } diff --git a/src/main/scala/uncore/tilelink2/WidthWidget.scala b/src/main/scala/uncore/tilelink2/WidthWidget.scala index 7c24d082..f431cc31 100644 --- a/src/main/scala/uncore/tilelink2/WidthWidget.scala +++ b/src/main/scala/uncore/tilelink2/WidthWidget.scala @@ -12,8 +12,8 @@ import scala.math.{min,max} class TLWidthWidget(innerBeatBytes: Int)(implicit p: Parameters) extends LazyModule { val node = TLAdapterNode( - clientFn = { case Seq(c) => c }, - managerFn = { case Seq(m) => m.copy(beatBytes = innerBeatBytes) }) + clientFn = { case c => c }, + managerFn = { case m => m.copy(beatBytes = innerBeatBytes) }) lazy val module = new LazyModuleImp(this) { val io = new Bundle { @@ 
-139,27 +139,24 @@ class TLWidthWidget(innerBeatBytes: Int)(implicit p: Parameters) extends LazyMod } } - val edgeOut = node.edgesOut(0) - val edgeIn = node.edgesIn(0) - val in = io.in(0) - val out = io.out(0) + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + splice(edgeIn, in.a, edgeOut, out.a) + splice(edgeOut, out.d, edgeIn, in.d) - splice(edgeIn, in.a, edgeOut, out.a) - splice(edgeOut, out.d, edgeIn, in.d) - - if (edgeOut.manager.anySupportAcquireB && edgeIn.client.anySupportProbe) { - splice(edgeOut, out.b, edgeIn, in.b) - splice(edgeIn, in.c, edgeOut, out.c) - in.e.ready := out.e.ready - out.e.valid := in.e.valid - out.e.bits := in.e.bits - } else { - in.b.valid := Bool(false) - in.c.ready := Bool(true) - in.e.ready := Bool(true) - out.b.ready := Bool(true) - out.c.valid := Bool(false) - out.e.valid := Bool(false) + if (edgeOut.manager.anySupportAcquireB && edgeIn.client.anySupportProbe) { + splice(edgeOut, out.b, edgeIn, in.b) + splice(edgeIn, in.c, edgeOut, out.c) + in.e.ready := out.e.ready + out.e.valid := in.e.valid + out.e.bits := in.e.bits + } else { + in.b.valid := Bool(false) + in.c.ready := Bool(true) + in.e.ready := Bool(true) + out.b.ready := Bool(true) + out.c.valid := Bool(false) + out.e.valid := Bool(false) + } } } } diff --git a/src/main/scala/uncore/tilelink2/Xbar.scala b/src/main/scala/uncore/tilelink2/Xbar.scala index cd18c9c6..a030c8ef 100644 --- a/src/main/scala/uncore/tilelink2/Xbar.scala +++ b/src/main/scala/uncore/tilelink2/Xbar.scala @@ -34,7 +34,7 @@ class TLXbar(policy: TLArbiter.Policy = TLArbiter.lowestIndexFirst)(implicit p: } } - val node = TLAdapterNode( + val node = TLNexusNode( numClientPorts = 1 to 32, numManagerPorts = 1 to 32, clientFn = { seq =>