diff --git a/src/main/scala/coreplex/CoreplexNetwork.scala b/src/main/scala/coreplex/CoreplexNetwork.scala index 5368d2cf..8c99e189 100644 --- a/src/main/scala/coreplex/CoreplexNetwork.scala +++ b/src/main/scala/coreplex/CoreplexNetwork.scala @@ -32,7 +32,9 @@ trait CoreplexNetwork extends HasCoreplexParameters { // Allows a variable number of inputs from outside to the Xbar private val l2in_buffer = LazyModule(new TLBuffer) - l1tol2.node :=* l2in_buffer.node + private val l2in_fifo = LazyModule(new TLFIFOFixer) + l1tol2.node :=* l2in_fifo.node + l2in_fifo.node :=* l2in_buffer.node l2in_buffer.node :=* l2in private val l2out_buffer = LazyModule(new TLBuffer(BufferParams.flow, BufferParams.none)) diff --git a/src/main/scala/diplomacy/Parameters.scala b/src/main/scala/diplomacy/Parameters.scala index 795f98a9..4e8ef4c1 100644 --- a/src/main/scala/diplomacy/Parameters.scala +++ b/src/main/scala/diplomacy/Parameters.scala @@ -17,14 +17,16 @@ object RegionType { } // A non-empty half-open range; [start, end) -case class IdRange(start: Int, end: Int) +case class IdRange(start: Int, end: Int) extends Ordered[IdRange] { require (start >= 0, s"Ids cannot be negative, but got: $start.") require (start < end, "Id ranges cannot be empty.") - // This is a strict partial ordering - def <(x: IdRange) = end <= x.start - def >(x: IdRange) = x < this + def compare(x: IdRange) = { + val primary = (this.start - x.start).signum + val secondary = (x.end - this.end).signum + if (primary != 0) primary else secondary + } def overlaps(x: IdRange) = start < x.end && x.start < end def contains(x: IdRange) = start <= x.start && x.end <= end @@ -43,6 +45,14 @@ case class IdRange(start: Int, end: Int) def range = start until end } +object IdRange +{ + def overlaps(s: Seq[IdRange]) = if (s.isEmpty) None else { + val ranges = s.sorted + (ranges.tail zip ranges.init) find { case (a, b) => a overlaps b } + } +} + // An potentially empty inclusive range of 2-powers [min, max] (in bytes) case class TransferSizes(min: Int, max: Int) { diff --git a/src/main/scala/rocket/ScratchpadSlavePort.scala b/src/main/scala/rocket/ScratchpadSlavePort.scala index de8702ac..df9312df 100644 --- a/src/main/scala/rocket/ScratchpadSlavePort.scala +++ b/src/main/scala/rocket/ScratchpadSlavePort.scala @@ -104,7 +104,7 @@ trait CanHaveScratchpad extends HasHellaCache with HasICacheFrontend with HasCor val slaveNode = TLInputNode() // Up to two uses for this input node: // 1) Frontend always exists, but may or may not have a scratchpad node - val fg = LazyModule(new TLFragmenter(fetchWidth*coreInstBytes, p(CacheBlockBytes), true)) + val fg = LazyModule(new TLFragmenter(fetchWidth*coreInstBytes, p(CacheBlockBytes), earlyAck=true)) val ww = LazyModule(new TLWidthWidget(xLen/8)) frontend.slaveNode :*= fg.node fg.node :*= ww.node @@ -113,7 +113,7 @@ trait CanHaveScratchpad extends HasHellaCache with HasICacheFrontend with HasCor // 2) ScratchpadSlavePort always has a node, but only exists when the HellaCache has a scratchpad val scratch = tileParams.dcache.flatMap(d => d.scratch.map(s => LazyModule(new ScratchpadSlavePort(AddressSet(s, d.dataScratchpadBytes-1))))) - scratch foreach { lm => lm.node := TLFragmenter(xLen/8, p(CacheBlockBytes))(slaveNode) } + scratch foreach { lm => lm.node := TLFragmenter(xLen/8, p(CacheBlockBytes), earlyAck=true)(slaveNode) } def findScratchpadFromICache: Option[AddressSet] = scratch.map { s => val finalNode = frontend.masterNode.edgesOut.head.manager.managers.find(_.nodePath.last == s.node) diff --git a/src/main/scala/rocketchip/Configs.scala b/src/main/scala/rocketchip/Configs.scala index c5c883ff..9068d1b7 100644 --- a/src/main/scala/rocketchip/Configs.scala +++ b/src/main/scala/rocketchip/Configs.scala @@ -39,9 +39,10 @@ class BasePlatformConfig extends Config((site, here, up) => { case IncludeJtagDTM => false case JtagDTMKey => new JtagDTMKeyDefault() case ZeroConfig => ZeroConfig(base=0xa000000L, size=0x2000000L, beatBytes=8) + case ErrorConfig => ErrorConfig(Seq(AddressSet(0x3000, 0xfff))) case ExtMem => MasterConfig(base=0x80000000L, size=0x10000000L, beatBytes=8, idBits=4) case ExtBus => MasterConfig(base=0x60000000L, size=0x20000000L, beatBytes=8, idBits=4) - case ExtIn => SlaveConfig(beatBytes=8, idBits=8, sourceBits=2) + case ExtIn => SlaveConfig(beatBytes=8, idBits=8, sourceBits=4) }) /** Actual elaboratable target Configs */ diff --git a/src/main/scala/rocketchip/ExampleTop.scala b/src/main/scala/rocketchip/ExampleTop.scala index 96eabc2c..cdf49871 100644 --- a/src/main/scala/rocketchip/ExampleTop.scala +++ b/src/main/scala/rocketchip/ExampleTop.scala @@ -10,6 +10,7 @@ import rocketchip._ /** Example Top with Periphery (w/o coreplex) */ abstract class ExampleTop(implicit p: Parameters) extends BaseTop with PeripheryAsyncExtInterrupts + with PeripheryErrorSlave with PeripheryMasterAXI4Mem with PeripheryMasterAXI4MMIO with PeripherySlaveAXI4 { @@ -18,12 +19,14 @@ abstract class ExampleTop(implicit p: Parameters) extends BaseTop class ExampleTopBundle[+L <: ExampleTop](_outer: L) extends BaseTopBundle(_outer) with PeripheryExtInterruptsBundle + with PeripheryErrorSlaveBundle with PeripheryMasterAXI4MemBundle with PeripheryMasterAXI4MMIOBundle with PeripherySlaveAXI4Bundle class ExampleTopModule[+L <: ExampleTop, +B <: ExampleTopBundle[L]](_outer: L, _io: () => B) extends BaseTopModule(_outer, _io) with PeripheryExtInterruptsModule + with PeripheryErrorSlaveModule with PeripheryMasterAXI4MemModule with PeripheryMasterAXI4MMIOModule with PeripherySlaveAXI4Module diff --git a/src/main/scala/rocketchip/Periphery.scala b/src/main/scala/rocketchip/Periphery.scala index 75071cf0..2fac8023 100644 --- a/src/main/scala/rocketchip/Periphery.scala +++ b/src/main/scala/rocketchip/Periphery.scala @@ -13,7 +13,7 @@ import uncore.converters._ import uncore.devices._ import uncore.util._ import util._ -import scala.math.max +import scala.math.{min,max} /** Specifies the size of external memory */ case class MasterConfig(base: Long, size: Long, beatBytes: Int, idBits: Int) @@ -33,6 +33,9 @@ case object SOCBusConfig extends Field[TLBusConfig] /* Specifies the location of the Zero device */ case class ZeroConfig(base: Long, size: Long, beatBytes: Int) case object ZeroConfig extends Field[ZeroConfig] +/* Specifies the location of the Error device */ +case class ErrorConfig(address: Seq[AddressSet]) +case object ErrorConfig extends Field[ErrorConfig] /** Utility trait for quick access to some relevant parameters */ trait HasPeripheryParameters { @@ -131,12 +134,16 @@ trait PeripheryMasterAXI4Mem { beatBytes = config.beatBytes) }) - private val converter = LazyModule(new TLToAXI4(config.idBits)) + private val converter = LazyModule(new TLToAXI4(config.beatBytes)) + private val trim = LazyModule(new AXI4IdIndexer(config.idBits)) + private val yank = LazyModule(new AXI4UserYanker) private val buffer = LazyModule(new AXI4Buffer) mem foreach { case xbar => converter.node := xbar.node - buffer.node := converter.node + trim.node := converter.node + yank.node := trim.node + buffer.node := yank.node mem_axi4 := buffer.node } } @@ -199,16 +206,17 @@ trait PeripheryMasterAXI4MMIO { resources = device.reg, executable = true, // Can we run programs on this memory? supportsWrite = TransferSizes(1, 256), // The slave supports 1-256 byte transfers - supportsRead = TransferSizes(1, 256), - interleavedId = Some(0))), // slave does not interleave read responses + supportsRead = TransferSizes(1, 256))), beatBytes = config.beatBytes))) mmio_axi4 := AXI4Buffer()( - // AXI4Fragmenter(lite=false, maxInFlight = 20)( // beef device up to support awlen = 0xff - TLToAXI4(idBits = config.idBits)( // use idBits = 0 for AXI4-Lite + AXI4UserYanker()( + AXI4Deinterleaver(cacheBlockBytes)( + AXI4IdIndexer(config.idBits)( + TLToAXI4(config.beatBytes)( TLWidthWidget(socBusConfig.beatBytes)( // convert width before attaching to socBus - socBus.node))) + socBus.node)))))) } trait PeripheryMasterAXI4MMIOBundle { @@ -235,12 +243,14 @@ trait PeripherySlaveAXI4 extends HasTopLevelNetworks { masters = Seq(AXI4MasterParameters( id = IdRange(0, 1 << config.idBits)))))) + private val fifoBits = 1 fsb.node := - TLSourceShrinker(1 << config.sourceBits)( TLWidthWidget(config.beatBytes)( AXI4ToTL()( + AXI4UserYanker(Some(1 << (config.sourceBits - fifoBits - 1)))( AXI4Fragmenter()( - l2FrontendAXI4Node)))) + AXI4IdIndexer(fifoBits)( + l2FrontendAXI4Node))))) } trait PeripherySlaveAXI4Bundle extends HasTopLevelNetworksBundle { @@ -388,3 +398,26 @@ trait PeripheryTestBusMasterModule { val io: PeripheryTestBusMasterBundle } => } + +///// + +trait PeripheryErrorSlave { + this: HasTopLevelNetworks => + private val config = p(ErrorConfig) + private val maxXfer = min(config.address.map(_.alignment).max.toInt, 4096) + val error = LazyModule(new TLError(config.address, peripheryBusConfig.beatBytes)) + error.node := TLFragmenter(peripheryBusConfig.beatBytes, maxXfer)(peripheryBus.node) +} + +trait PeripheryErrorSlaveBundle { + this: HasTopLevelNetworksBundle { + val outer: PeripheryErrorSlave + } => +} + +trait PeripheryErrorSlaveModule { + this: HasTopLevelNetworksModule { + val outer: PeripheryErrorSlave + val io: PeripheryErrorSlaveBundle + } => +} diff --git a/src/main/scala/rocketchip/TestHarness.scala b/src/main/scala/rocketchip/TestHarness.scala index 5c1c360e..12db2e00 100644 --- a/src/main/scala/rocketchip/TestHarness.scala +++ b/src/main/scala/rocketchip/TestHarness.scala @@ -53,7 +53,7 @@ class SimAXIMem(channels: Int, forceSize: BigInt = 0)(implicit p: Parameters) ex for (i <- 0 until channels) { val sram = LazyModule(new AXI4RAM(AddressSet(0, size-1), beatBytes = config.beatBytes)) - sram.node := AXI4Buffer()(AXI4Fragmenter(maxInFlight = 4)(node)) + sram.node := AXI4Buffer()(AXI4Fragmenter()(node)) } lazy val module = new LazyModuleImp(this) { diff --git a/src/main/scala/uncore/axi4/Bundles.scala b/src/main/scala/uncore/axi4/Bundles.scala index f8c9ddc6..75f43c66 100644 --- a/src/main/scala/uncore/axi4/Bundles.scala +++ b/src/main/scala/uncore/axi4/Bundles.scala @@ -19,6 +19,7 @@ abstract class AXI4BundleA(params: AXI4BundleParameters) extends AXI4BundleBase( val cache = UInt(width = params.cacheBits) val prot = UInt(width = params.protBits) val qos = UInt(width = params.qosBits) // 0=no QoS, bigger = higher priority + val user = if (params.userBits > 0) Some(UInt(width = params.userBits)) else None // val region = UInt(width = 4) // optional // Number of bytes-1 in this operation @@ -51,6 +52,7 @@ class AXI4BundleR(params: AXI4BundleParameters) extends AXI4BundleBase(params) val id = UInt(width = params.idBits) val data = UInt(width = params.dataBits) val resp = UInt(width = params.respBits) + val user = if (params.userBits > 0) Some(UInt(width = params.userBits)) else None val last = Bool() } @@ -58,6 +60,7 @@ class AXI4BundleB(params: AXI4BundleParameters) extends AXI4BundleBase(params) { val id = UInt(width = params.idBits) val resp = UInt(width = params.respBits) + val user = if (params.userBits > 0) Some(UInt(width = params.userBits)) else None } class AXI4Bundle(params: AXI4BundleParameters) extends AXI4BundleBase(params) diff --git a/src/main/scala/uncore/axi4/Deinterleaver.scala b/src/main/scala/uncore/axi4/Deinterleaver.scala new file mode 100644 index 00000000..bc926c69 --- /dev/null +++ b/src/main/scala/uncore/axi4/Deinterleaver.scala @@ -0,0 +1,102 @@ +// See LICENSE.SiFive for license details. + +package uncore.axi4 + +import Chisel._ +import chisel3.internal.sourceinfo.SourceInfo +import chisel3.util.IrrevocableIO +import config._ +import diplomacy._ +import scala.math.{min,max} +import uncore.tilelink2.{leftOR, rightOR, UIntToOH1, OH1ToOH} + +class AXI4Deinterleaver(maxReadBytes: Int)(implicit p: Parameters) extends LazyModule +{ + require (maxReadBytes >= 1 && isPow2(maxReadBytes)) + + val node = AXI4AdapterNode( + masterFn = { mp => mp }, + slaveFn = { sp => sp.copy(slaves = sp.slaves.map(s => s.copy( + supportsRead = s.supportsRead.intersect(TransferSizes(1, maxReadBytes)), + interleavedId = Some(0)))) + }) + + lazy val module = new LazyModuleImp(this) { + val io = new Bundle { + val in = node.bundleIn + val out = node.bundleOut + } + + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + val queues = edgeOut.master.endId + val beatBytes = edgeOut.slave.beatBytes + val beats = (maxReadBytes+beatBytes-1) / beatBytes + + // This adapter leaves the control + write paths completely untouched + out.ar <> in.ar + out.aw <> in.aw + out.w <> in.w + in.b <> out.b + + if (queues == 1) { + // Gracefully do nothing + in.r <> out.r + } else { + // Buffer R response + val count = RegInit(Vec.fill(queues) { UInt(0, width=log2Ceil(beats+1)) }) + val qs = Seq.fill(queues) { Module(new Queue(out.r.bits, beats)) } + + // Which ID is being enqueued and dequeued? + val locked = RegInit(Bool(false)) + val deq_id = Reg(UInt(width=log2Ceil(queues))) + val enq_id = out.r.bits.id + val deq_OH = UIntToOH(deq_id, queues) + val enq_OH = UIntToOH(enq_id, queues) + + // Track the number of completely received bursts per FIFO id + val next_count = Wire(count) + ((count zip next_count) zip (enq_OH.toBools zip deq_OH.toBools)) foreach { case ((p, n), (i, d)) => + val inc = i && out.r.fire() && out.r.bits.last + val dec = d && in.r.fire() && in.r.bits.last + n := p + inc.asUInt - dec.asUInt + // Bounds checking + assert (!dec || p =/= UInt(0)) + assert (!inc || p =/= UInt(beats)) + } + count := next_count + + // Select which Q will we start sending next cycle + val pending = Cat(next_count.map(_ =/= UInt(0)).reverse) + val winner = pending & ~(leftOR(pending) << 1) + when (!locked || (in.r.fire() && in.r.bits.last)) { + locked := pending.orR + deq_id := OHToUInt(winner) + } + + // Transmit the selected burst to inner + in.r.valid := locked + in.r.bits := Vec(qs.map(_.io.deq.bits))(deq_id) + (deq_OH.toBools zip qs) foreach { case (s, q) => + q.io.deq.ready := s && in.r.fire() + } + + // Feed response into matching Q + out.r.ready := Vec(qs.map(_.io.enq.ready))(enq_id) + (enq_OH.toBools zip qs) foreach { case (s, q) => + q.io.enq.valid := s && out.r.valid + q.io.enq.bits := out.r.bits + } + } + } + } +} + +object AXI4Deinterleaver +{ + // applied to the AXI4 source node; y.node := AXI4Deinterleaver()(x.node) + def apply(maxReadBytes: Int)(x: AXI4OutwardNode)(implicit p: Parameters, sourceInfo: SourceInfo): AXI4OutwardNode = { + val deinterleaver = LazyModule(new AXI4Deinterleaver(maxReadBytes)) + deinterleaver.node := x + deinterleaver.node + } +} diff --git a/src/main/scala/uncore/axi4/Fragmenter.scala b/src/main/scala/uncore/axi4/Fragmenter.scala index 42cf8fd7..d29f553d 100644 --- a/src/main/scala/uncore/axi4/Fragmenter.scala +++ b/src/main/scala/uncore/axi4/Fragmenter.scala @@ -10,8 +10,7 @@ import diplomacy._ import scala.math.{min,max} import uncore.tilelink2.{leftOR, rightOR, UIntToOH1, OH1ToOH} -// lite: masters all use only one ID => reads will not be interleaved -class AXI4Fragmenter(lite: Boolean = false, maxInFlight: => Int = 32, combinational: Boolean = true)(implicit p: Parameters) extends LazyModule +class AXI4Fragmenter()(implicit p: Parameters) extends LazyModule { val maxBeats = 1 << AXI4Parameters.lenBits def expandTransfer(x: TransferSizes, beatBytes: Int, alignment: BigInt) = @@ -19,11 +18,11 @@ class AXI4Fragmenter(lite: Boolean = false, maxInFlight: => Int = 32, combinatio def mapSlave(s: AXI4SlaveParameters, beatBytes: Int) = s.copy( supportsWrite = expandTransfer(s.supportsWrite, beatBytes, s.minAlignment), supportsRead = expandTransfer(s.supportsRead, beatBytes, s.minAlignment), - interleavedId = if (lite) Some(0) else s.interleavedId) // see AXI4FragmenterSideband for !lite case + interleavedId = None) // this breaks interleaving guarantees def mapMaster(m: AXI4MasterParameters) = m.copy(aligned = true) val node = AXI4AdapterNode( - masterFn = { mp => mp.copy(masters = mp.masters.map(m => mapMaster(m))) }, + masterFn = { mp => mp.copy(masters = mp.masters.map(m => mapMaster(m)), userBits = mp.userBits + 1) }, slaveFn = { sp => sp.copy(slaves = sp.slaves .map(s => mapSlave(s, sp.beatBytes))) }) lazy val module = new LazyModuleImp(this) { @@ -40,9 +39,6 @@ class AXI4Fragmenter(lite: Boolean = false, maxInFlight: => Int = 32, combinatio val master = edgeIn.master val masters = master.masters - // If the user claimed this was a lite interface, then there must be only one Id - require (!lite || master.endId == 1) - // We don't support fragmenting to sub-beat accesses slaves.foreach { s => require (!s.supportsRead || s.supportsRead.contains(beatBytes)) @@ -139,154 +135,77 @@ class AXI4Fragmenter(lite: Boolean = false, maxInFlight: => Int = 32, combinatio val readSizes1 = slaves.map(s => s.supportsRead .max/beatBytes-1) val writeSizes1 = slaves.map(s => s.supportsWrite.max/beatBytes-1) - // Indirection variables for inputs and outputs; makes transformation application easier + // Irrevocable queues in front because we want to accept the request before responses come back val (in_ar, ar_last, _) = fragment(Queue.irrevocable(in.ar, 1, flow=true), readSizes1) val (in_aw, aw_last, w_beats) = fragment(Queue.irrevocable(in.aw, 1, flow=true), writeSizes1) - val in_w = in.w - val in_r = in.r - val in_b = in.b - val out_ar = Wire(out.ar) - val out_aw = out.aw - val out_w = out.w - val out_r = Wire(out.r) - val out_b = Wire(out.b) - val depth = if (combinational) 1 else 2 - // In case a slave ties arready := rready, we need a queue to break the combinational loop - // between the two branches (in_ar => {out_ar => out_r, sideband} => in_r). - if (in.ar.bits.getWidth < in.r.bits.getWidth) { - out.ar <> Queue(out_ar, depth, flow=combinational) - out_r <> out.r - } else { - out.ar <> out_ar - out_r <> Queue(out.r, depth, flow=combinational) - } - // In case a slave ties awready := bready or wready := bready, we need this queue - out_b <> Queue(out.b, depth, flow=combinational) + // AXI ready may not depend on valid of other channels + // We cut wready here along with awready and arready before AXI4ToTL + val in_w = Queue.irrevocable(in.w, 1, flow=true) - // Sideband to track which transfers were the last fragment - def sideband() = if (lite) { - Module(new Queue(Bool(), maxInFlight, flow=combinational)).io - } else { - Module(new AXI4FragmenterSideband(maxInFlight, flow=combinational)).io - } - val sideband_ar_r = sideband() - val sideband_aw_b = sideband() - - // AR flow control - out_ar.valid := in_ar.valid && sideband_ar_r.enq.ready - in_ar.ready := sideband_ar_r.enq.ready && out_ar.ready - sideband_ar_r.enq.valid := in_ar.valid && out_ar.ready - out_ar.bits := in_ar.bits - sideband_ar_r.enq.bits := ar_last + // AR flow control; super easy + out.ar <> in_ar + out.ar.bits.user.get := Cat(in_ar.bits.user.toList ++ Seq(ar_last)) // When does W channel start counting a new transfer val wbeats_latched = RegInit(Bool(false)) val wbeats_ready = Wire(Bool()) val wbeats_valid = Wire(Bool()) when (wbeats_valid && wbeats_ready) { wbeats_latched := Bool(true) } - when (out_aw.fire()) { wbeats_latched := Bool(false) } + when (out.aw.fire()) { wbeats_latched := Bool(false) } // AW flow control - out_aw.valid := in_aw.valid && sideband_aw_b.enq.ready && (wbeats_ready || wbeats_latched) - in_aw.ready := sideband_aw_b.enq.ready && out_aw.ready && (wbeats_ready || wbeats_latched) - sideband_aw_b.enq.valid := in_aw.valid && out_aw.ready && (wbeats_ready || wbeats_latched) + out.aw.valid := in_aw.valid && (wbeats_ready || wbeats_latched) + in_aw.ready := out.aw.ready && (wbeats_ready || wbeats_latched) wbeats_valid := in_aw.valid && !wbeats_latched - out_aw.bits := in_aw.bits - sideband_aw_b.enq.bits := aw_last + out.aw.bits := in_aw.bits + out.aw.bits.user.get := Cat(in_aw.bits.user.toList ++ Seq(aw_last)) // We need to inject 'last' into the W channel fragments, count! val w_counter = RegInit(UInt(0, width = AXI4Parameters.lenBits+1)) val w_idle = w_counter === UInt(0) val w_todo = Mux(w_idle, Mux(wbeats_valid, w_beats, UInt(0)), w_counter) val w_last = w_todo === UInt(1) - w_counter := w_todo - out_w.fire() - assert (!out_w.fire() || w_todo =/= UInt(0)) // underflow impossible + w_counter := w_todo - out.w.fire() + assert (!out.w.fire() || w_todo =/= UInt(0)) // underflow impossible // W flow control wbeats_ready := w_idle - out_w.valid := in_w.valid && (!wbeats_ready || wbeats_valid) - in_w.ready := out_w.ready && (!wbeats_ready || wbeats_valid) - out_w.bits := in_w.bits - out_w.bits.last := w_last + out.w.valid := in_w.valid && (!wbeats_ready || wbeats_valid) + in_w.ready := out.w.ready && (!wbeats_ready || wbeats_valid) + out.w.bits := in_w.bits + out.w.bits.last := w_last // We should also recreate the last last - assert (!out_w.valid || !in_w.bits.last || w_last) + assert (!out.w.valid || !in_w.bits.last || w_last) // R flow control - val r_last = out_r.bits.last - in_r.valid := out_r.valid && (!r_last || sideband_ar_r.deq.valid) - out_r.ready := in_r.ready && (!r_last || sideband_ar_r.deq.valid) - sideband_ar_r.deq.ready := r_last && out_r.valid && in_r.ready - in_r.bits := out_r.bits - in_r.bits.last := r_last && sideband_ar_r.deq.bits + val r_last = out.r.bits.user.get(0) + in.r <> out.r + in.r.bits.last := out.r.bits.last && r_last + in.r.bits.user.foreach { _ := out.r.bits.user.get >> 1 } // B flow control - val b_last = sideband_aw_b.deq.bits - in_b.valid := out_b.valid && sideband_aw_b.deq.valid && b_last - out_b.ready := sideband_aw_b.deq.valid && (!b_last || in_b.ready) - sideband_aw_b.deq.ready := out_b.valid && (!b_last || in_b.ready) - in_b.bits := out_b.bits + val b_last = out.b.bits.user.get(0) + in.b <> out.b + in.b.valid := out.b.valid && b_last + out.b.ready := in.b.ready || !b_last + in.b.bits.user.foreach { _ := out.b.bits.user.get >> 1 } // Merge errors from dropped B responses - val r_resp = RegInit(UInt(0, width = AXI4Parameters.respBits)) - val resp = out_b.bits.resp | r_resp - when (out_b.fire()) { r_resp := Mux(b_last, UInt(0), resp) } - in_b.bits.resp := resp - } - } - - /* We want to put barriers between the fragments of a fragmented transfer and all other transfers. - * This lets us use very little state to reassemble the fragments (else we need one FIFO per ID). - * Furthermore, because all the fragments share the same AXI ID, they come back contiguously. - * This guarantees that no other R responses might get mixed between fragments, ensuring that the - * interleavedId for the slaves remains unaffected by the fragmentation transformation. - * Of course, if you need to fragment, this means there is a potentially hefty serialization cost. - * However, this design allows full concurrency in the common no-fragmentation-needed scenario. - */ - class AXI4FragmenterSideband(maxInFlight: Int, flow: Boolean = false) extends Module - { - val io = new QueueIO(Bool(), maxInFlight) - io.count := UInt(0) - - val PASS = UInt(2, width = 2) // allow 'last=1' bits to enque, on 'last=0' if count>0 block else accept+FIND - val FIND = UInt(0, width = 2) // allow 'last=0' bits to enque, accept 'last=1' and switch to WAIT - val WAIT = UInt(1, width = 2) // block all access till count=0 - - val state = RegInit(PASS) - val count = RegInit(UInt(0, width = log2Up(maxInFlight))) - val full = count === UInt(maxInFlight-1) - val empty = count === UInt(0) - val last = count === UInt(1) - - io.deq.bits := state(1) || (last && state(0)) // PASS || (last && WAIT) - io.deq.valid := !empty - - io.enq.ready := !full && (empty || (state === FIND) || (state === PASS && io.enq.bits)) - - // WAIT => count > 0 - assert (state =/= WAIT || count =/= UInt(0)) - - if (flow) { - when (io.enq.valid) { - io.deq.valid := Bool(true) - when (empty) { io.deq.bits := io.enq.bits } + val error = RegInit(Vec.fill(edgeIn.master.endId) { UInt(0, width = AXI4Parameters.respBits)}) + in.b.bits.resp := out.b.bits.resp | error(out.b.bits.id) + (error zip UIntToOH(out.b.bits.id, edgeIn.master.endId).toBools) foreach { case (reg, sel) => + when (sel && out.b.fire()) { reg := Mux(b_last, UInt(0), reg | out.b.bits.resp) } } } - - count := count + io.enq.fire() - io.deq.fire() - switch (state) { - is(PASS) { when (io.enq.valid && !io.enq.bits && empty) { state := FIND } } - is(FIND) { when (io.enq.valid && io.enq.bits && !full) { state := Mux(empty, PASS, WAIT) } } - is(WAIT) { when (last && io.deq.ready) { state := PASS } } - } } } object AXI4Fragmenter { // applied to the AXI4 source node; y.node := AXI4Fragmenter()(x.node) - def apply(lite: Boolean = false, maxInFlight: => Int = 32, combinational: Boolean = true)(x: AXI4OutwardNode)(implicit p: Parameters, sourceInfo: SourceInfo): AXI4OutwardNode = { - val fragmenter = LazyModule(new AXI4Fragmenter(lite, maxInFlight, combinational)) + def apply()(x: AXI4OutwardNode)(implicit p: Parameters, sourceInfo: SourceInfo): AXI4OutwardNode = { + val fragmenter = LazyModule(new AXI4Fragmenter) fragmenter.node := x fragmenter.node } diff --git a/src/main/scala/uncore/axi4/IdIndexer.scala b/src/main/scala/uncore/axi4/IdIndexer.scala new file mode 100644 index 00000000..b7283417 --- /dev/null +++ b/src/main/scala/uncore/axi4/IdIndexer.scala @@ -0,0 +1,78 @@ +// See LICENSE.SiFive for license details. + +package uncore.axi4 + +import Chisel._ +import chisel3.internal.sourceinfo.SourceInfo +import config._ +import diplomacy._ +import scala.math.{min,max} + +class AXI4IdIndexer(idBits: Int)(implicit p: Parameters) extends LazyModule +{ + require (idBits >= 0) + + val node = AXI4AdapterNode( + masterFn = { mp => + // Create one new "master" per ID + val masters = Array.tabulate(1 << idBits) { i => AXI4MasterParameters( + id = IdRange(i, i+1), + aligned = true, + maxFlight = Some(0)) + } + // Squash the information from original masters into new ID masters + mp.masters.foreach { m => + for (i <- m.id.start until m.id.end) { + val j = i % (1 << idBits) + val old = masters(j) + masters(j) = old.copy( + aligned = old.aligned && m.aligned, + maxFlight = old.maxFlight.flatMap { o => m.maxFlight.map { n => o+n } }) + } + } + mp.copy( + userBits = mp.userBits + max(0, log2Ceil(mp.endId) - idBits), + masters = masters) + }, + slaveFn = { sp => sp.copy( + slaves = sp.slaves.map(s => s.copy( + interleavedId = if (idBits == 0) Some(0) else s.interleavedId))) + }) + + lazy val module = new LazyModuleImp(this) { + val io = new Bundle { + val in = node.bundleIn + val out = node.bundleOut + } + + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + + // Leave everything mostly untouched + out.ar <> in.ar + out.aw <> in.aw + out.w <> in.w + in.b <> out.b + in.r <> out.r + + val bits = log2Ceil(edgeIn.master.endId) - idBits + if (bits > 0) { + out.ar.bits.user.get := Cat(in.ar.bits.user.toList ++ Seq(in.ar.bits.id >> idBits)) + out.aw.bits.user.get := Cat(in.aw.bits.user.toList ++ Seq(in.aw.bits.id >> idBits)) + in.r.bits.user.foreach { _ := out.r.bits.user.get >> bits } + in.b.bits.user.foreach { _ := out.b.bits.user.get >> bits } + in.r.bits.id := Cat(out.r.bits.user.get, out.r.bits.id) + in.b.bits.id := Cat(out.b.bits.user.get, out.b.bits.id) + } + } + } +} + +object AXI4IdIndexer +{ + // applied to the AXI4 source node; y.node := AXI4IdIndexer(idBits)(x.node) + def apply(idBits: Int)(x: AXI4OutwardNode)(implicit p: Parameters, sourceInfo: SourceInfo): AXI4OutwardNode = { + val indexer = LazyModule(new AXI4IdIndexer(idBits)) + indexer.node := x + indexer.node + } +} diff --git a/src/main/scala/uncore/axi4/Parameters.scala b/src/main/scala/uncore/axi4/Parameters.scala index 017bc001..8642613e 100644 --- a/src/main/scala/uncore/axi4/Parameters.scala +++ b/src/main/scala/uncore/axi4/Parameters.scala @@ -62,26 +62,33 @@ case class AXI4SlavePortParameters( } case class AXI4MasterParameters( - id: IdRange = IdRange(0, 1), - aligned: Boolean = false, - nodePath: Seq[BaseNode] = Seq()) + id: IdRange = IdRange(0, 1), + aligned: Boolean = false, + maxFlight: Option[Int] = None, // None = infinite, else is a per-ID cap + nodePath: Seq[BaseNode] = Seq()) { val name = nodePath.lastOption.map(_.lazyModule.name).getOrElse("disconnected") + maxFlight.foreach { m => require (m >= 0) } } case class AXI4MasterPortParameters( - masters: Seq[AXI4MasterParameters]) + masters: Seq[AXI4MasterParameters], + userBits: Int = 0) { val endId = masters.map(_.id.end).max + require (userBits >= 0) // Require disjoint ranges for ids - masters.combinations(2).foreach { case Seq(x,y) => require (!x.id.overlaps(y.id), s"$x and $y overlap") } + IdRange.overlaps(masters.map(_.id)).foreach { case (x, y) => + require (!x.overlaps(y), s"AXI4MasterParameters.id $x and $y overlap") + } } case class AXI4BundleParameters( addrBits: Int, dataBits: Int, - idBits: Int) + idBits: Int, + userBits: Int) { require (dataBits >= 8, s"AXI4 data bits must be >= 8 (got $dataBits)") require (addrBits >= 1, s"AXI4 addr bits must be >= 1 (got $addrBits)") @@ -102,19 +109,21 @@ case class AXI4BundleParameters( AXI4BundleParameters( max(addrBits, x.addrBits), max(dataBits, x.dataBits), - max(idBits, x.idBits)) + max(idBits, x.idBits), + max(userBits, x.userBits)) } object AXI4BundleParameters { - val emptyBundleParams = AXI4BundleParameters(addrBits=1, dataBits=8, idBits=1) + val emptyBundleParams = AXI4BundleParameters(addrBits=1, dataBits=8, idBits=1, userBits=0) def union(x: Seq[AXI4BundleParameters]) = x.foldLeft(emptyBundleParams)((x,y) => x.union(y)) def apply(master: AXI4MasterPortParameters, slave: AXI4SlavePortParameters) = new AXI4BundleParameters( addrBits = log2Up(slave.maxAddress+1), dataBits = slave.beatBytes * 8, - idBits = log2Up(master.endId)) + idBits = log2Up(master.endId), + userBits = master.userBits) } case class AXI4EdgeParameters( diff --git a/src/main/scala/uncore/axi4/RegisterRouter.scala b/src/main/scala/uncore/axi4/RegisterRouter.scala index b7d51611..7ac32fae 100644 --- a/src/main/scala/uncore/axi4/RegisterRouter.scala +++ b/src/main/scala/uncore/axi4/RegisterRouter.scala @@ -17,7 +17,7 @@ class AXI4RegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int supportsRead = TransferSizes(1, beatBytes), interleavedId = Some(0))), beatBytes = beatBytes, - minLatency = min(concurrency, 1)))) // the Queue adds at most one cycle + minLatency = 1))) { require (address.contiguous) @@ -30,7 +30,7 @@ class AXI4RegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int val r = bundleIn(0).r val b = bundleIn(0).b - val params = RegMapperParams(log2Up((address.mask+1)/beatBytes), beatBytes, ar.bits.params.idBits) + val params = RegMapperParams(log2Up((address.mask+1)/beatBytes), beatBytes, ar.bits.params.idBits + ar.bits.params.userBits) val in = Wire(Decoupled(new RegMapperInput(params))) // Prefer to execute reads first @@ -39,34 +39,39 @@ class AXI4RegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int aw.ready := in.ready && !ar.valid && w .valid w .ready := in.ready && !ar.valid && aw.valid - val addr = Mux(ar.valid, ar.bits.addr, aw.bits.addr) - val in_id = Mux(ar.valid, ar.bits.id, aw.bits.id) + val ar_extra = Cat(Seq(ar.bits.id) ++ ar.bits.user.toList) + val aw_extra = Cat(Seq(aw.bits.id) ++ aw.bits.user.toList) + val in_extra = Mux(ar.valid, ar_extra, aw_extra) + val addr = Mux(ar.valid, ar.bits.addr, aw.bits.addr) val mask = uncore.tilelink2.maskGen(ar.bits.addr, ar.bits.size, beatBytes) in.bits.read := ar.valid in.bits.index := addr >> log2Ceil(beatBytes) in.bits.data := w.bits.data in.bits.mask := Mux(ar.valid, mask, w.bits.strb) - in.bits.extra := in_id + in.bits.extra := in_extra // Invoke the register map builder and make it Irrevocable val out = Queue.irrevocable( RegMapper(beatBytes, concurrency, undefZero, in, mapping:_*), - entries = 1, flow = true) + entries = 2) // No flow control needed out.ready := Mux(out.bits.read, r.ready, b.ready) r.valid := out.valid && out.bits.read b.valid := out.valid && !out.bits.read - val out_id = if (r.bits.params.idBits == 0) UInt(0) else out.bits.extra + val out_id = if (r.bits.params.idBits == 0) UInt(0) else (out.bits.extra >> ar.bits.params.userBits) r.bits.id := out_id r.bits.data := out.bits.data r.bits.last := Bool(true) r.bits.resp := AXI4Parameters.RESP_OKAY + r.bits.user.foreach { _ := out.bits.extra } + b.bits.id := out_id b.bits.resp := AXI4Parameters.RESP_OKAY + b.bits.user.foreach { _ := out.bits.extra } } } diff --git a/src/main/scala/uncore/axi4/SRAM.scala b/src/main/scala/uncore/axi4/SRAM.scala index 38251361..e0ae971b 100644 --- a/src/main/scala/uncore/axi4/SRAM.scala +++ b/src/main/scala/uncore/axi4/SRAM.scala @@ -18,7 +18,7 @@ class AXI4RAM(address: AddressSet, executable: Boolean = true, beatBytes: Int = supportsWrite = TransferSizes(1, beatBytes), interleavedId = Some(0))), beatBytes = beatBytes, - minLatency = 0))) // B responds on same cycle + minLatency = 1))) // We require the address range to include an entire beat (for the write mask) require ((address.mask & (beatBytes-1)) == beatBytes-1) @@ -38,36 +38,53 @@ class AXI4RAM(address: AddressSet, executable: Boolean = true, beatBytes: Int = val r_addr = Cat((mask zip (in.ar.bits.addr >> log2Ceil(beatBytes)).toBools).filter(_._1).map(_._2).reverse) val w_addr = Cat((mask zip (in.aw.bits.addr >> log2Ceil(beatBytes)).toBools).filter(_._1).map(_._2).reverse) - in.aw.ready := in. w.valid && in.b.ready - in. w.ready := in.aw.valid && in.b.ready - in. b.valid := in.w.valid && in.aw.valid + val w_full = RegInit(Bool(false)) + val w_id = Reg(UInt()) + val w_user = Reg(UInt()) + + when (in. b.fire()) { w_full := Bool(false) } + when (in.aw.fire()) { w_full := Bool(true) } + + when (in.aw.fire()) { + w_id := in.aw.bits.id + in.aw.bits.user.foreach { w_user := _ } + } - in.b.bits.id := in.aw.bits.id - in.b.bits.resp := AXI4Parameters.RESP_OKAY val wdata = Vec.tabulate(beatBytes) { i => in.w.bits.data(8*(i+1)-1, 8*i) } - when (in.b.fire()) { + when (in.aw.fire()) { mem.write(w_addr, wdata, in.w.bits.strb.toBools) } + in. b.valid := w_full + in.aw.ready := in. w.valid && (in.b.ready || !w_full) + in. w.ready := in.aw.valid && (in.b.ready || !w_full) + + in.b.bits.id := w_id + in.b.bits.resp := AXI4Parameters.RESP_OKAY + in.b.bits.user.foreach { _ := w_user } + val r_full = RegInit(Bool(false)) val r_id = Reg(UInt()) + val r_user = Reg(UInt()) when (in. r.fire()) { r_full := Bool(false) } when (in.ar.fire()) { r_full := Bool(true) } - in. r.valid := r_full - in.ar.ready := in.r.ready || !r_full - when (in.ar.fire()) { r_id := in.ar.bits.id + in.ar.bits.user.foreach { r_user := _ } } val ren = in.ar.fire() val rdata = mem.readAndHold(r_addr, ren) + in. r.valid := r_full + in.ar.ready := in.r.ready || !r_full + in.r.bits.id := r_id in.r.bits.resp := AXI4Parameters.RESP_OKAY in.r.bits.data := Cat(rdata.reverse) + in.r.bits.user.foreach { _ := r_user } in.r.bits.last := Bool(true) } } diff --git a/src/main/scala/uncore/axi4/Test.scala b/src/main/scala/uncore/axi4/Test.scala index 11bf11a8..69c97580 100644 --- a/src/main/scala/uncore/axi4/Test.scala +++ b/src/main/scala/uncore/axi4/Test.scala @@ -26,8 +26,8 @@ class AXI4LiteFuzzRAM()(implicit p: Parameters) extends LazyModule model.node := fuzz.node xbar.node := TLDelayer(0.1)(TLBuffer(BufferParams.flow)(TLDelayer(0.2)(model.node))) - ram.node := AXI4Fragmenter(lite=true)(TLToAXI4(0, true )(xbar.node)) - gpio.node := AXI4Fragmenter(lite=true)(TLToAXI4(0, false)(xbar.node)) + ram.node := AXI4Fragmenter()(AXI4Deinterleaver(16)(TLToAXI4(4, true )(xbar.node))) + gpio.node := AXI4Fragmenter()(AXI4Deinterleaver(16)(TLToAXI4(4, false)(xbar.node))) lazy val module = new LazyModuleImp(this) with HasUnitTestIO { io.finished := fuzz.module.io.finished @@ -49,8 +49,8 @@ class AXI4FullFuzzRAM()(implicit p: Parameters) extends LazyModule model.node := fuzz.node xbar.node := TLDelayer(0.1)(TLBuffer(BufferParams.flow)(TLDelayer(0.2)(model.node))) - ram.node := AXI4Fragmenter(lite=false, maxInFlight = 2)(TLToAXI4(4,false)(xbar.node)) - gpio.node := AXI4Fragmenter(lite=false, maxInFlight = 5)(TLToAXI4(4,true )(xbar.node)) + ram.node := AXI4Fragmenter()(AXI4Deinterleaver(16)(TLToAXI4(4,false)(xbar.node))) + gpio.node := AXI4Fragmenter()(AXI4Deinterleaver(16)(TLToAXI4(4,true )(xbar.node))) lazy val module = new LazyModuleImp(this) with HasUnitTestIO { io.finished := fuzz.module.io.finished @@ -70,11 +70,13 @@ class AXI4FuzzMaster()(implicit p: Parameters) extends LazyModule model.node := fuzz.node node := + AXI4UserYanker()( + AXI4Deinterleaver(64)( TLToAXI4(4)( TLDelayer(0.1)( TLBuffer(BufferParams.flow)( TLDelayer(0.1)( - model.node)))) + model.node)))))) lazy val module = new LazyModuleImp(this) { val io = new Bundle { @@ -89,16 +91,23 @@ class AXI4FuzzMaster()(implicit p: Parameters) extends LazyModule class AXI4FuzzSlave()(implicit p: Parameters) extends LazyModule { val node = AXI4InputNode() - val ram = LazyModule(new TLTestRAM(AddressSet(0x0, 0xfff))) + val xbar = LazyModule(new TLXbar) + val ram = LazyModule(new TLRAM(AddressSet(0x0, 0xfff))) + val error= LazyModule(new TLError(Seq(AddressSet(0x1800, 0xff)))) - ram.node := - TLFragmenter(4, 16)( + ram.node := TLFragmenter(4, 16)(xbar.node) + error.node := TLFragmenter(4, 16)(xbar.node) + + xbar.node := + TLFIFOFixer()( TLDelayer(0.1)( TLBuffer(BufferParams.flow)( TLDelayer(0.1)( AXI4ToTL()( + AXI4UserYanker(Some(4))( AXI4Fragmenter()( - node)))))) + AXI4IdIndexer(2)( + node)))))))) lazy val module = new LazyModuleImp(this) { val io = new Bundle { diff --git a/src/main/scala/uncore/axi4/ToTL.scala b/src/main/scala/uncore/axi4/ToTL.scala index a7612b36..bce1c60b 100644 --- a/src/main/scala/uncore/axi4/ToTL.scala +++ b/src/main/scala/uncore/axi4/ToTL.scala @@ -9,23 +9,29 @@ import diplomacy._ import uncore.tilelink2._ case class AXI4ToTLNode() extends MixedAdapterNode(AXI4Imp, TLImp)( - dFn = { case AXI4MasterPortParameters(masters) => - TLClientPortParameters(clients = masters.map { m => - TLClientParameters( - sourceId = IdRange(m.id.start << 1, m.id.end << 1), // R+W ids are distinct - nodePath = m.nodePath) - }) + dFn = { case AXI4MasterPortParameters(masters, userBits) => + masters.foreach { m => require (m.maxFlight.isDefined, "AXI4 must include a transaction maximum per ID to convert to TL") } + val maxFlight = masters.map(_.maxFlight.get).max + TLClientPortParameters( + clients = masters.flatMap { m => + for (id <- m.id.start until m.id.end) + yield TLClientParameters( + sourceId = IdRange(id * maxFlight*2, (id+1) * maxFlight*2), // R+W ids are distinct + nodePath = m.nodePath, + requestFifo = true) + }) }, uFn = { mp => AXI4SlavePortParameters( slaves = mp.managers.map { m => + val maxXfer = TransferSizes(1, mp.beatBytes * (1 << AXI4Parameters.lenBits)) AXI4SlaveParameters( address = m.address, resources = m.resources, regionType = m.regionType, executable = m.executable, nodePath = m.nodePath, - supportsWrite = m.supportsPutPartial, - supportsRead = m.supportsGet, + supportsWrite = m.supportsPutPartial.intersect(maxXfer), + supportsRead = m.supportsGet.intersect(maxXfer), interleavedId = Some(0))}, // TL2 never interleaves D beats beatBytes = mp.beatBytes, minLatency = mp.minLatency) @@ -45,58 +51,64 @@ class AXI4ToTL()(implicit p: Parameters) extends LazyModule val numIds = edgeIn.master.endId val beatBytes = edgeOut.manager.beatBytes val countBits = AXI4Parameters.lenBits + (1 << AXI4Parameters.sizeBits) - 1 + val maxFlight = edgeIn.master.masters.map(_.maxFlight.get).max + val addedBits = log2Ceil(maxFlight) + 1 + require (edgeIn.master.userBits == 0, "AXI4 user bits cannot be transported by TL") require (edgeIn.master.masters(0).aligned) + edgeOut.manager.requireFifo() + + // Look for an Error device to redirect bad requests + val errorDevs = edgeOut.manager.managers.filter(_.nodePath.last.lazyModule.className == "TLError") + require (!errorDevs.isEmpty, "There is no TLError reachable from AXI4ToTL. One must be instantiated.") + val error = errorDevs.head.address.head.base + require (errorDevs.head.supportsPutPartial.contains(edgeOut.manager.maxTransfer), + s"Error device supports ${errorDevs.head.supportsPutPartial} PutPartial but must support ${edgeOut.manager.maxTransfer}") + require (errorDevs.head.supportsGet.contains(edgeOut.manager.maxTransfer), + s"Error device supports ${errorDevs.head.supportsGet} Get but must support ${edgeOut.manager.maxTransfer}") val r_out = Wire(out.a) - val r_inflight = RegInit(UInt(0, width = numIds)) - val r_block = r_inflight(in.ar.bits.id) val r_size1 = in.ar.bits.bytes1() val r_size = OH1ToUInt(r_size1) - val r_addr = in.ar.bits.addr - val r_ok = edgeOut.manager.supportsGetSafe(r_addr, r_size) - val r_err_in = Wire(Decoupled(new AXI4BundleRError(in.ar.bits.params))) - val r_err_out = Queue(r_err_in, 2) - val r_count = RegInit(UInt(0, width = in.ar.bits.params.lenBits)) - val r_last = r_count === in.ar.bits.len + val r_ok = edgeOut.manager.supportsGetSafe(in.ar.bits.addr, r_size) + val r_addr = Mux(r_ok, in.ar.bits.addr, UInt(error) | in.ar.bits.addr(log2Up(beatBytes)-1, 0)) + val r_count = RegInit(Vec.fill(numIds) { UInt(0, width = log2Ceil(maxFlight)) }) + val r_id = Cat(in.ar.bits.id, r_count(in.ar.bits.id), UInt(0, width=1)) assert (!in.ar.valid || r_size1 === UIntToOH1(r_size, countBits)) // because aligned - in.ar.ready := Mux(r_ok, r_out.ready, r_err_in.ready && r_last) && !r_block - r_out.valid := in.ar.valid && !r_block && r_ok - r_out.bits := edgeOut.Get(in.ar.bits.id << 1 | UInt(1), r_addr, r_size)._2 - r_err_in.valid := in.ar.valid && !r_block && !r_ok - r_err_in.bits.last := r_last - r_err_in.bits.id := in.ar.bits.id + in.ar.ready := r_out.ready + r_out.valid := in.ar.valid + r_out.bits := edgeOut.Get(r_id, r_addr, r_size)._2 - when (r_err_in.fire()) { r_count := Mux(r_last, UInt(0), r_count + UInt(1)) } + val r_sel = UIntToOH(in.ar.bits.id, numIds) + (r_sel.toBools zip r_count) foreach { case (s, r) => + when (in.ar.fire() && s) { r := r + UInt(1) } + } val w_out = Wire(out.a) - val w_inflight = RegInit(UInt(0, width = numIds)) - val w_block = w_inflight(in.aw.bits.id) val w_size1 = in.aw.bits.bytes1() val w_size = OH1ToUInt(w_size1) - val w_addr = in.aw.bits.addr - val w_ok = edgeOut.manager.supportsPutPartialSafe(w_addr, w_size) - val w_err_in = Wire(Decoupled(in.aw.bits.id)) - val w_err_out = Queue(w_err_in, 2) + val w_ok = edgeOut.manager.supportsPutPartialSafe(in.aw.bits.addr, w_size) + val w_addr = Mux(w_ok, in.aw.bits.addr, UInt(error) | in.aw.bits.addr(log2Up(beatBytes)-1, 0)) + val w_count = RegInit(Vec.fill(numIds) { UInt(0, width = log2Ceil(maxFlight)) }) + val w_id = Cat(in.aw.bits.id, w_count(in.aw.bits.id), UInt(1, width=1)) assert (!in.aw.valid || w_size1 === UIntToOH1(w_size, countBits)) // because aligned assert (!in.aw.valid || in.aw.bits.len === UInt(0) || in.aw.bits.size === UInt(log2Ceil(beatBytes))) // because aligned - in.aw.ready := Mux(w_ok, w_out.ready, w_err_in.ready) && in.w.valid && in.w.bits.last && !w_block - in.w.ready := Mux(w_ok, w_out.ready, w_err_in.ready || !in.w.bits.last) && in.aw.valid && !w_block - w_out.valid := in.aw.valid && in.w.valid && !w_block && w_ok - w_out.bits := edgeOut.Put(in.aw.bits.id << 1, w_addr, w_size, in.w.bits.data, in.w.bits.strb)._2 - w_err_in.valid := in.aw.valid && in.w.valid && !w_block && !w_ok && in.w.bits.last - w_err_in.bits := in.aw.bits.id + in.aw.ready := w_out.ready && in.w.valid && in.w.bits.last + in.w.ready := w_out.ready && in.aw.valid + w_out.valid := in.aw.valid && in.w.valid + w_out.bits := edgeOut.Put(w_id, w_addr, w_size, in.w.bits.data, in.w.bits.strb)._2 - TLArbiter(TLArbiter.lowestIndexFirst)(out.a, (UInt(0), r_out), (in.aw.bits.len, w_out)) + val w_sel = UIntToOH(in.aw.bits.id, numIds) + (w_sel.toBools zip w_count) foreach { case (s, r) => + when (in.aw.fire() && s) { r := r + UInt(1) } + } + + TLArbiter(TLArbiter.roundRobin)(out.a, (UInt(0), r_out), (in.aw.bits.len, w_out)) val ok_b = Wire(in.b) - val err_b = Wire(in.b) - val mux_b = Wire(in.b) val ok_r = Wire(in.r) - val err_r = Wire(in.r) - val mux_r = Wire(in.r) val d_resp = Mux(out.d.bits.error, AXI4Parameters.RESP_SLVERR, AXI4Parameters.RESP_OKAY) val d_hasData = edgeOut.hasData(out.d.bits) @@ -106,58 +118,33 @@ class AXI4ToTL()(implicit p: Parameters) extends LazyModule ok_r.valid := out.d.valid && d_hasData ok_b.valid := out.d.valid && !d_hasData - ok_r.bits.id := out.d.bits.source >> 1 + ok_r.bits.id := out.d.bits.source >> addedBits ok_r.bits.data := out.d.bits.data ok_r.bits.resp := d_resp ok_r.bits.last := d_last - r_err_out.ready := err_r.ready - err_r.valid := r_err_out.valid - err_r.bits.id := r_err_out.bits.id - err_r.bits.data := out.d.bits.data // don't care - err_r.bits.resp := AXI4Parameters.RESP_DECERR - err_r.bits.last := r_err_out.bits.last - - // AXI4 must hold R to one source until last - val mux_lock_ok = RegInit(Bool(false)) - val mux_lock_err = RegInit(Bool(false)) - when (ok_r .fire()) { mux_lock_ok := !ok_r .bits.last } - when (err_r.fire()) { mux_lock_err := !err_r.bits.last } - assert (!mux_lock_ok || !mux_lock_err) - - // Prioritize err over ok (b/c err_r.valid comes from a register) - mux_r.valid := (!mux_lock_err && ok_r.valid) || (!mux_lock_ok && err_r.valid) - mux_r.bits := Mux(!mux_lock_ok && err_r.valid, err_r.bits, ok_r.bits) - ok_r.ready := mux_r.ready && (mux_lock_ok || !err_r.valid) - err_r.ready := mux_r.ready && !mux_lock_ok - // AXI4 needs irrevocable behaviour - in.r <> Queue.irrevocable(mux_r, 1, flow=true) + in.r <> Queue.irrevocable(ok_r, 1, flow=true) - ok_b.bits.id := out.d.bits.source >> 1 + ok_b.bits.id := out.d.bits.source >> addedBits ok_b.bits.resp := d_resp - w_err_out.ready := err_b.ready - err_b.valid := w_err_out.valid - err_b.bits.id := w_err_out.bits - err_b.bits.resp := AXI4Parameters.RESP_DECERR - - // Prioritize err over ok (b/c err_b.valid comes from a register) - mux_b.valid := ok_b.valid || err_b.valid - mux_b.bits := Mux(err_b.valid, err_b.bits, ok_b.bits) - ok_b.ready := mux_b.ready && !err_b.valid - err_b.ready := mux_b.ready - // AXI4 needs irrevocable behaviour - in.b <> Queue.irrevocable(mux_b, 1, flow=true) + val q_b = Queue.irrevocable(ok_b, 1, flow=true) - // Update flight trackers - val r_set = in.ar.fire().asUInt << in.ar.bits.id - val r_clr = (in.r.fire() && in.r.bits.last).asUInt << in.r.bits.id - r_inflight := (r_inflight | r_set) & ~r_clr - val w_set = in.aw.fire().asUInt << in.aw.bits.id - val w_clr = in.b.fire().asUInt << in.b.bits.id - w_inflight := (w_inflight | w_set) & ~w_clr + // We need to prevent sending B valid before the last W beat is accepted + // TileLink allows early acknowledgement of a write burst, but AXI does not. + val b_count = RegInit(Vec.fill(numIds) { UInt(0, width = log2Ceil(maxFlight)) }) + val b_allow = b_count(in.b.bits.id) =/= w_count(in.b.bits.id) + val b_sel = UIntToOH(in.b.bits.id, numIds) + + (b_sel.toBools zip b_count) foreach { case (s, r) => + when (in.b.fire() && s) { r := r + UInt(1) } + } + + in.b.bits := q_b.bits + in.b.valid := q_b.valid && b_allow + q_b.ready := in.b.ready && b_allow // Unused channels out.b.ready := Bool(true) diff --git a/src/main/scala/uncore/axi4/UserYanker.scala b/src/main/scala/uncore/axi4/UserYanker.scala new file mode 100644 index 00000000..72bd7bdc --- /dev/null +++ b/src/main/scala/uncore/axi4/UserYanker.scala @@ -0,0 +1,106 @@ +// See LICENSE.SiFive for license details. + +package uncore.axi4 + +import Chisel._ +import chisel3.internal.sourceinfo.SourceInfo +import config._ +import diplomacy._ +import uncore.tilelink2.UIntToOH1 + +class AXI4UserYanker(capMaxFlight: Option[Int] = None)(implicit p: Parameters) extends LazyModule +{ + val node = AXI4AdapterNode( + masterFn = { mp => mp.copy( + userBits = 0, + masters = mp.masters.map { m => m.copy( + maxFlight = (m.maxFlight, capMaxFlight) match { + case (Some(x), Some(y)) => Some(x min y) + case (Some(x), None) => Some(x) + case (None, Some(y)) => Some(y) + case (None, None) => None })})}, + slaveFn = { sp => sp }) + + lazy val module = new LazyModuleImp(this) { + val io = new Bundle { + val in = node.bundleIn + val out = node.bundleOut + } + + ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) => + val bits = edgeIn.bundle.userBits + val need_bypass = edgeOut.slave.minLatency < 1 + require (bits > 0) // useless UserYanker! + + edgeOut.master.masters.foreach { m => + require (m.maxFlight.isDefined, "UserYanker needs a flight cap on each ID") + } + + def queue(id: Int) = { + val depth = edgeOut.master.masters.find(_.id.contains(id)).flatMap(_.maxFlight).getOrElse(0) + if (depth == 0) { + Wire(new QueueIO(UInt(width = bits), 1)) // unused ID => undefined value + } else { + Module(new Queue(UInt(width = bits), depth, flow=need_bypass)).io + } + } + + val rqueues = Seq.tabulate(edgeIn.master.endId) { i => queue(i) } + val wqueues = Seq.tabulate(edgeIn.master.endId) { i => queue(i) } + + val arid = in.ar.bits.id + val ar_ready = Vec(rqueues.map(_.enq.ready))(arid) + in .ar.ready := out.ar.ready && ar_ready + out.ar.valid := in .ar.valid && ar_ready + out.ar.bits := in .ar.bits + + val rid = out.r.bits.id + val r_valid = Vec(rqueues.map(_.deq.valid))(rid) + val r_bits = Vec(rqueues.map(_.deq.bits))(rid) + assert (!out.r.valid || r_valid) // Q must be ready faster than the response + in.r <> out.r + in.r.bits.user.get := r_bits + + val arsel = UIntToOH(arid, edgeIn.master.endId).toBools + val rsel = UIntToOH(rid, edgeIn.master.endId).toBools + (rqueues zip (arsel zip rsel)) foreach { case (q, (ar, r)) => + q.deq.ready := out.r .valid && in .r .ready && r && out.r.bits.last + q.enq.valid := in .ar.valid && out.ar.ready && ar + q.enq.bits := in.ar.bits.user.get + } + + val awid = in.aw.bits.id + val aw_ready = Vec(wqueues.map(_.enq.ready))(awid) + in .aw.ready := out.aw.ready && aw_ready + out.aw.valid := in .aw.valid && aw_ready + out.aw.bits := in .aw.bits + + val bid = out.b.bits.id + val b_valid = Vec(wqueues.map(_.deq.valid))(bid) + val b_bits = Vec(wqueues.map(_.deq.bits))(bid) + assert (!out.b.valid || b_valid) // Q must be ready faster than the response + in.b <> out.b + in.b.bits.user.get := b_bits + + val awsel = UIntToOH(awid, edgeIn.master.endId).toBools + val bsel = UIntToOH(bid, edgeIn.master.endId).toBools + (wqueues zip (awsel zip bsel)) foreach { case (q, (aw, b)) => + q.deq.ready := out.b .valid && in .b .ready && b + q.enq.valid := in .aw.valid && out.aw.ready && aw + q.enq.bits := in.aw.bits.user.get + } + + out.w <> in.w + } + } +} + +object AXI4UserYanker +{ + // applied to the AXI4 source node; y.node := AXI4UserYanker(idBits, maxFlight)(x.node) + def apply(capMaxFlight: Option[Int] = None)(x: AXI4OutwardNode)(implicit p: Parameters, sourceInfo: SourceInfo): AXI4OutwardNode = { + val yanker = LazyModule(new AXI4UserYanker(capMaxFlight)) + yanker.node := x + yanker.node + } +} diff --git a/src/main/scala/uncore/tilelink2/Arbiter.scala b/src/main/scala/uncore/tilelink2/Arbiter.scala index f468c26a..2a99f843 100644 --- a/src/main/scala/uncore/tilelink2/Arbiter.scala +++ b/src/main/scala/uncore/tilelink2/Arbiter.scala @@ -3,15 +3,28 @@ package uncore.tilelink2 import Chisel._ +import config._ import diplomacy._ object TLArbiter { - // (valids, granted) => readys - type Policy = (Seq[Bool], Bool) => Seq[Bool] + // (valids, select) => readys + type Policy = (Integer, UInt, Bool) => UInt - val lowestIndexFirst: Policy = (valids, granted) => - valids.scanLeft(Bool(true))(_ && !_).init + val lowestIndexFirst: Policy = (width, valids, select) => ~(leftOR(valids) << 1)(width-1, 0) + + val roundRobin: Policy = (width, valids, select) => { + val valid = valids(width-1, 0) + assert (valid === valids) + val mask = RegInit(~UInt(0, width=width)) + val filter = Cat(valid & ~mask, valid) + val unready = (rightOR(filter, width*2) >> 1) | (mask << width) // last right shift unneeded + val readys = ~((unready >> width) & unready(width-1, 0)) + when (select && valid.orR) { + mask := leftOR(readys & valid, width) + } + readys(width-1, 0) + } def lowestFromSeq[T <: TLChannel](edge: TLEdge, sink: DecoupledIO[T], sources: Seq[DecoupledIO[T]]) { apply(lowestIndexFirst)(sink, sources.map(s => (edge.numBeats1(s.bits), s)):_*) @@ -21,6 +34,10 @@ object TLArbiter apply(lowestIndexFirst)(sink, sources.toList.map(s => (edge.numBeats1(s.bits), s)):_*) } + def robin[T <: TLChannel](edge: TLEdge, sink: DecoupledIO[T], sources: DecoupledIO[T]*) { + apply(roundRobin)(sink, sources.toList.map(s => (edge.numBeats1(s.bits), s)):_*) + } + def apply[T <: Data](policy: Policy)(sink: DecoupledIO[T], sources: (UInt, DecoupledIO[T])*) { if (sources.isEmpty) { sink.valid := Bool(false) @@ -37,13 +54,13 @@ object TLArbiter // Who wants access to the sink? val valids = sourcesIn.map(_.valid) // Arbitrate amongst the requests - val readys = Vec(policy(valids, latch)) + val readys = Vec(policy(valids.size, Cat(valids.reverse), latch).toBools) // Which request wins arbitration? val winner = Vec((readys zip valids) map { case (r,v) => r&&v }) // Confirm the policy works properly require (readys.size == valids.size) - // Never two winner + // Never two winners val prefixOR = winner.scanLeft(Bool(false))(_||_).init assert((prefixOR zip winner) map { case (p,w) => !p || !w } reduce {_ && _}) // If there was any request, there is a winner @@ -73,3 +90,32 @@ object TLArbiter } } } + +/** Synthesizeable unit tests */ +import unittest._ + +class TestRobin()(implicit p: Parameters) extends UnitTest(timeout = 500000) { + val sources = Wire(Vec(6, DecoupledIO(UInt(width=3)))) + val sink = Wire(DecoupledIO(UInt(width=3))) + val count = RegInit(UInt(0, width=8)) + + val lfsr = LFSR16(Bool(true)) + val valid = lfsr(0) + val ready = lfsr(15) + + sources.zipWithIndex.map { case (z, i) => z.bits := UInt(i) } + sources(0).valid := valid + sources(1).valid := Bool(false) + sources(2).valid := valid + sources(3).valid := valid + sources(4).valid := Bool(false) + sources(5).valid := valid + sink.ready := ready + + TLArbiter(TLArbiter.roundRobin)(sink, sources.zipWithIndex.map { case (z, i) => (UInt(i), z) }:_*) + when (sink.fire()) { printf("TestRobin: %d\n", sink.bits) } + when (!sink.fire()) { printf("TestRobin: idle (%d %d)\n", valid, ready) } + + count := count + UInt(1) + io.finished := count >= UInt(128) +} diff --git a/src/main/scala/uncore/tilelink2/Error.scala b/src/main/scala/uncore/tilelink2/Error.scala new file mode 100644 index 00000000..38ebf646 --- /dev/null +++ b/src/main/scala/uncore/tilelink2/Error.scala @@ -0,0 +1,56 @@ +// See LICENSE.SiFive for license details. + +package uncore.tilelink2 + +import Chisel._ +import config._ +import diplomacy._ +import util._ + +class TLError(address: Seq[AddressSet], beatBytes: Int = 4)(implicit p: Parameters) extends LazyModule +{ + val device = new SimpleDevice("error-device", Seq("sifive,error0")) + + val node = TLManagerNode(Seq(TLManagerPortParameters( + Seq(TLManagerParameters( + address = address, + resources = device.reg, + supportsGet = TransferSizes(1, beatBytes), + supportsPutPartial = TransferSizes(1, beatBytes), + supportsPutFull = TransferSizes(1, beatBytes), + supportsArithmetic = TransferSizes(1, beatBytes), + supportsLogical = TransferSizes(1, beatBytes), + supportsHint = TransferSizes(1, beatBytes), + fifoId = Some(0))), // requests are handled in order + beatBytes = beatBytes, + minLatency = 1))) // no bypass needed for this device + + lazy val module = new LazyModuleImp(this) { + val io = new Bundle { + val in = node.bundleIn + } + + import TLMessages._ + val opcodes = Vec(AccessAck, AccessAck, AccessAckData, AccessAckData, AccessAckData, HintAck) + + val in = io.in(0) + val a = Queue(in.a, 1) + val d = in.d + + a.ready := d.ready + d.valid := a.valid + d.bits.opcode := opcodes(a.bits.opcode) + d.bits.param := UInt(0) + d.bits.size := a.bits.size + d.bits.source := a.bits.source + d.bits.sink := UInt(0) + d.bits.addr_lo := a.bits.address + d.bits.data := UInt(0) + d.bits.error := a.bits.opcode =/= Hint // Hints may not error + + // Tie off unused channels + in.b.valid := Bool(false) + in.c.ready := Bool(true) + in.e.ready := Bool(true) + } +} diff --git a/src/main/scala/uncore/tilelink2/FIFOFixer.scala b/src/main/scala/uncore/tilelink2/FIFOFixer.scala index 32b2fa4f..8b3943ea 100644 --- a/src/main/scala/uncore/tilelink2/FIFOFixer.scala +++ b/src/main/scala/uncore/tilelink2/FIFOFixer.scala @@ -10,9 +10,8 @@ import scala.math.max class TLFIFOFixer(implicit p: Parameters) extends LazyModule { - // We request downstream FIFO so we can use the existing fifoId val node = TLAdapterNode( - clientFn = { cp => cp.copy(clients = cp.clients .map(c => c.copy(requestFifo = !c.supportsProbe))) }, + clientFn = { cp => cp }, managerFn = { mp => mp.copy(managers = mp.managers.map(m => m.copy(fifoId = Some(0)))) }) lazy val module = new LazyModuleImp(this) { diff --git a/src/main/scala/uncore/tilelink2/Fragmenter.scala b/src/main/scala/uncore/tilelink2/Fragmenter.scala index 7676aa2e..c726d7ff 100644 --- a/src/main/scala/uncore/tilelink2/Fragmenter.scala +++ b/src/main/scala/uncore/tilelink2/Fragmenter.scala @@ -14,7 +14,7 @@ import scala.math.{min,max} // Fragmenter modifies: PutFull, PutPartial, LogicalData, Get, Hint // Fragmenter passes: ArithmeticData (truncated to minSize if alwaysMin) // Fragmenter cannot modify acquire (could livelock); thus it is unsafe to put caches on both sides -class TLFragmenter(val minSize: Int, val maxSize: Int, val alwaysMin: Boolean = false)(implicit p: Parameters) extends LazyModule +class TLFragmenter(val minSize: Int, val maxSize: Int, val alwaysMin: Boolean = false, val earlyAck: Boolean = false)(implicit p: Parameters) extends LazyModule { require (isPow2 (maxSize)) require (isPow2 (minSize)) @@ -137,6 +137,7 @@ class TLFragmenter(val minSize: Int, val maxSize: Int, val alwaysMin: Boolean = val dOrig = Reg(UInt()) val dFragnum = out.d.bits.source(fragmentBits-1, 0) val dFirst = acknum === UInt(0) + val dLast = dFragnum === UInt(0) val dsizeOH = UIntToOH (out.d.bits.size, log2Ceil(maxDownSize)+1) val dsizeOH1 = UIntToOH1(out.d.bits.size, log2Up(maxDownSize)) val dHasData = edgeOut.hasData(out.d.bits) @@ -156,7 +157,7 @@ class TLFragmenter(val minSize: Int, val maxSize: Int, val alwaysMin: Boolean = } // Swallow up non-data ack fragments - val drop = !dHasData && (dFragnum =/= UInt(0)) + val drop = !dHasData && !(if (earlyAck) dFirst else dLast) out.d.ready := in.d.ready || drop in.d.valid := out.d.valid && !drop in.d.bits := out.d.bits // pass most stuff unchanged @@ -164,11 +165,18 @@ class TLFragmenter(val minSize: Int, val maxSize: Int, val alwaysMin: Boolean = in.d.bits.source := out.d.bits.source >> fragmentBits in.d.bits.size := Mux(dFirst, dFirst_size, dOrig) - // Combine the error flag - val r_error = RegInit(Bool(false)) - val d_error = r_error | out.d.bits.error - when (out.d.fire()) { r_error := Mux(drop, d_error, UInt(0)) } - in.d.bits.error := d_error + if (earlyAck) { + // If you do early Ack, errors may not be dropped + // ... which roughly means: Puts may not fail + assert (!out.d.bits.error || !drop) + in.d.bits.error := out.d.bits.error + } else { + // Combine the error flag + val r_error = RegInit(Bool(false)) + val d_error = r_error | out.d.bits.error + when (out.d.fire()) { r_error := Mux(drop, d_error, UInt(0)) } + in.d.bits.error := d_error + } // What maximum transfer sizes do downstream devices support? val maxArithmetics = managers.map(_.supportsArithmetic.max) @@ -252,8 +260,8 @@ class TLFragmenter(val minSize: Int, val maxSize: Int, val alwaysMin: Boolean = object TLFragmenter { // applied to the TL source node; y.node := TLFragmenter(x.node, 256, 4) - def apply(minSize: Int, maxSize: Int, alwaysMin: Boolean = false)(x: TLOutwardNode)(implicit p: Parameters, sourceInfo: SourceInfo): TLOutwardNode = { - val fragmenter = LazyModule(new TLFragmenter(minSize, maxSize, alwaysMin)) + def apply(minSize: Int, maxSize: Int, alwaysMin: Boolean = false, earlyAck: Boolean = false)(x: TLOutwardNode)(implicit p: Parameters, sourceInfo: SourceInfo): TLOutwardNode = { + val fragmenter = LazyModule(new TLFragmenter(minSize, maxSize, alwaysMin, earlyAck)) fragmenter.node := x fragmenter.node } diff --git a/src/main/scala/uncore/tilelink2/Parameters.scala b/src/main/scala/uncore/tilelink2/Parameters.scala index 3665b15a..aec47951 100644 --- a/src/main/scala/uncore/tilelink2/Parameters.scala +++ b/src/main/scala/uncore/tilelink2/Parameters.scala @@ -199,9 +199,9 @@ case class TLClientPortParameters( require (minLatency >= 0) // Require disjoint ranges for Ids - clients.combinations(2).foreach({ case Seq(x,y) => - require (!x.sourceId.overlaps(y.sourceId)) - }) + IdRange.overlaps(clients.map(_.sourceId)).foreach { case (x, y) => + require (!x.overlaps(y), s"TLClientParameters.sourceId ${x} overlaps ${y}") + } // Bounds on required sizes def endSourceId = clients.map(_.sourceId.end).max diff --git a/src/main/scala/uncore/tilelink2/ToAXI4.scala b/src/main/scala/uncore/tilelink2/ToAXI4.scala index 8cfb311e..75003eda 100644 --- a/src/main/scala/uncore/tilelink2/ToAXI4.scala +++ b/src/main/scala/uncore/tilelink2/ToAXI4.scala @@ -10,14 +10,20 @@ import util.PositionalMultiQueue import uncore.axi4._ import scala.math.{min, max} -case class TLToAXI4Node(idBits: Int) extends MixedAdapterNode(TLImp, AXI4Imp)( - dFn = { _ => - // We must erase all client information, because we crush their source Ids - val masters = Seq( +case class TLToAXI4Node(beatBytes: Int) extends MixedAdapterNode(TLImp, AXI4Imp)( + dFn = { p => + val idSize = p.clients.map { c => if (c.requestFifo) 1 else c.sourceId.size } + val idStart = idSize.scanLeft(0)(_+_).init + val masters = ((idStart zip idSize) zip p.clients) map { case ((start, size), c) => AXI4MasterParameters( - id = IdRange(0, 1 << idBits), - aligned = true)) - AXI4MasterPortParameters(masters) + id = IdRange(start, start+size), + aligned = true, + maxFlight = Some(if (c.requestFifo) c.sourceId.size else 1), + nodePath = c.nodePath) + } + AXI4MasterPortParameters( + masters = masters, + userBits = log2Ceil(p.endSourceId) + 4 + log2Ceil(beatBytes)) }, uFn = { p => TLManagerPortParameters( managers = p.slaves.map { case s => @@ -29,15 +35,15 @@ case class TLToAXI4Node(idBits: Int) extends MixedAdapterNode(TLImp, AXI4Imp)( nodePath = s.nodePath, supportsGet = s.supportsRead, supportsPutFull = s.supportsWrite, - supportsPutPartial = s.supportsWrite)}, - // AXI4 is NEVER fifo in TL sense (R+W are independent) + supportsPutPartial = s.supportsWrite, + fifoId = Some(0))}, beatBytes = p.beatBytes, minLatency = p.minLatency) }) -class TLToAXI4(val idBits: Int, val combinational: Boolean = true)(implicit p: Parameters) extends LazyModule +class TLToAXI4(beatBytes: Int, combinational: Boolean = true)(implicit p: Parameters) extends LazyModule { - val node = TLToAXI4Node(idBits) + val node = TLToAXI4Node(beatBytes) lazy val module = new LazyModuleImp(this) { val io = new Bundle { @@ -52,24 +58,26 @@ class TLToAXI4(val idBits: Int, val combinational: Boolean = true)(implicit p: P require (slaves(0).interleavedId.isDefined) slaves.foreach { s => require (s.interleavedId == slaves(0).interleavedId) } - // We need to ensure that a slave does not stall trying to send B while we need to receive R - // Since R&W have independent flow control, it is possible for a W to cut in-line and get into - // a slave's buffers, preventing us from getting all the R responses we need to release D for B. - // This risk is compounded by an AXI fragmentation. Even a slave which responds completely to - // AR before working on AW might have an AW slipped between two AR fragments. - val out_b = Queue.irrevocable(out.b, entries=edgeIn.client.endSourceId, flow=combinational) + // Construct the source=>ID mapping table + val idTable = Wire(Vec(edgeIn.client.endSourceId, out.aw.bits.id)) + var idCount = Array.fill(edgeOut.master.endId) { 0 } + (edgeIn.client.clients zip edgeOut.master.masters) foreach { case (c, m) => + for (i <- 0 until c.sourceId.size) { + val id = m.id.start + (if (c.requestFifo) 0 else i) + idTable(c.sourceId.start + i) := UInt(id) + idCount(id) = idCount(id) + 1 + } + } // We need to keep the following state from A => D: (addr_lo, size, source) // All of those fields could potentially require 0 bits (argh. Chisel.) - // We will pack as many of the lowest bits of state as fit into the AXI ID. - // Any bits left-over must be put into a bank of Queues. - // The Queues are indexed by as many of the source bits as fit into the AXI ID. - // The Queues are deep enough that every source has guaranteed space in its Queue. + // We will pack all of that extra information into the user bits. val sourceBits = log2Ceil(edgeIn.client.endSourceId) val sizeBits = log2Ceil(edgeIn.maxLgSize+1) val addrBits = log2Ceil(edgeIn.manager.beatBytes) val stateBits = addrBits + sizeBits + sourceBits // could be 0 + require (stateBits <= out.aw.bits.params.userBits) val a_address = edgeIn.address(in.a.bits) val a_addr_lo = edgeIn.addr_lo(a_address) @@ -91,73 +99,17 @@ class TLToAXI4(val idBits: Int, val combinational: Boolean = true)(implicit p: P require (addrEnd == stateBits) val a_state = (a_source << sourceOff) | (a_size << sizeOff) | (a_addr_lo << addrOff) - val a_id = if (idBits == 0) UInt(0) else a_state - val r_state = Wire(UInt(width = stateBits)) + val r_state = out.r.bits.user.getOrElse(UInt(0)) val r_source = if (sourceBits > 0) r_state(sourceEnd-1, sourceOff) else UInt(0) val r_size = if (sizeBits > 0) r_state(sizeEnd -1, sizeOff) else UInt(0) val r_addr_lo = if (addrBits > 0) r_state(addrEnd -1, addrOff) else UInt(0) - val b_state = Wire(UInt(width = stateBits)) + val b_state = out.b.bits.user.getOrElse(UInt(0)) val b_source = if (sourceBits > 0) b_state(sourceEnd-1, sourceOff) else UInt(0) val b_size = if (sizeBits > 0) b_state(sizeEnd -1, sizeOff) else UInt(0) val b_addr_lo = if (addrBits > 0) b_state(addrEnd -1, addrOff) else UInt(0) - val r_last = out.r.bits.last - val r_id = out.r.bits.id - val b_id = out_b.bits.id - - if (stateBits <= idBits) { // No need for any state tracking - r_state := r_id - b_state := b_id - } else { - val bankIndexBits = min(sourceBits, idBits) - val posBits = max(0, sourceBits - idBits) - val implicitBits = max(idBits, sourceBits) - val bankBits = stateBits - implicitBits - val numBanks = min(1 << bankIndexBits, edgeIn.client.endSourceId) - def bankEntries(i: Int) = (edgeIn.client.endSourceId+numBanks-i-1) / numBanks - - val banks = Seq.tabulate(numBanks) { i => - // We know there can only be as many outstanding requests as TL sources - // However, AXI read and write queues are not mutually FIFO. - // Therefore, we want to pop them individually, but share the storage. - val bypass = combinational && edgeOut.slave.minLatency == 0 - PositionalMultiQueue(UInt(width=max(1,bankBits)), positions=bankEntries(i), ways=2, combinational=bypass) - } - - val a_bankPosition = if (posBits == 0) UInt(0) else a_source(sourceBits-1, idBits) - val a_bankIndex = if (bankIndexBits == 0) UInt(0) else a_source(bankIndexBits-1, 0) - val r_bankIndex = if (bankIndexBits == 0) UInt(0) else r_id(bankIndexBits-1, 0) - val b_bankIndex = if (bankIndexBits == 0) UInt(0) else b_id(bankIndexBits-1, 0) - val a_bankSelect = UIntToOH(a_bankIndex, numBanks) - val r_bankSelect = UIntToOH(r_bankIndex, numBanks) - val b_bankSelect = UIntToOH(b_bankIndex, numBanks) - - banks.zipWithIndex.foreach { case (q, i) => - // Push a_state into the banks - q.io.enq.valid := in.a.fire() && a_last && a_bankSelect(i) - q.io.enq.bits.pos := a_bankPosition - q.io.enq.bits.data := a_state >> implicitBits - q.io.enq.bits.way := Mux(a_isPut, UInt(0), UInt(1)) - // Pop the bank's ways - q.io.deq(0).ready := out_b.fire() && b_bankSelect(i) - q.io.deq(1).ready := out.r.fire() && r_bankSelect(i) && r_last - // The FIFOs must be valid when we're ready to pop them... - assert (q.io.deq(0).valid || !q.io.deq(0).ready) - assert (q.io.deq(1).valid || !q.io.deq(1).ready) - } - - val b_bankData = Vec(banks.map(_.io.deq(0).bits.data))(b_bankIndex) - val b_bankPos = Vec(banks.map(_.io.deq(0).bits.pos ))(b_bankIndex) - val r_bankData = Vec(banks.map(_.io.deq(1).bits.data))(r_bankIndex) - val r_bankPos = Vec(banks.map(_.io.deq(1).bits.pos ))(r_bankIndex) - - def optCat(x: (Boolean, UInt)*) = { Cat(x.toList.filter(_._1).map(_._2)) } - b_state := optCat((bankBits > 0, b_bankData), (posBits > 0, b_bankPos), (idBits > 0, b_id)) - r_state := optCat((bankBits > 0, r_bankData), (posBits > 0, r_bankPos), (idBits > 0, r_id)) - } - // We need these Queues because AXI4 queues are irrevocable val depth = if (combinational) 1 else 2 val out_arw = Wire(Decoupled(new AXI4BundleARW(out.params))) @@ -179,7 +131,7 @@ class TLToAXI4(val idBits: Int, val combinational: Boolean = true)(implicit p: P val arw = out_arw.bits arw.wen := a_isPut - arw.id := a_id // truncated + arw.id := idTable(a_source) arw.addr := a_address arw.len := UIntToOH1(a_size, AXI4Parameters.lenBits + log2Ceil(beatBytes)) >> log2Ceil(beatBytes) arw.size := Mux(a_size >= maxSize, maxSize, a_size) @@ -188,11 +140,13 @@ class TLToAXI4(val idBits: Int, val combinational: Boolean = true)(implicit p: P arw.cache := UInt(0) // do not allow AXI to modify our transactions arw.prot := AXI4Parameters.PROT_PRIVILEDGED arw.qos := UInt(0) // no QoS + arw.user.foreach { _ := a_state } - in.a.ready := Mux(a_isPut, (doneAW || out_arw.ready) && out_w.ready, out_arw.ready) - out_arw.valid := in.a.valid && Mux(a_isPut, !doneAW && out_w.ready, Bool(true)) + val stall = Wire(Bool()) + in.a.ready := !stall && Mux(a_isPut, (doneAW || out_arw.ready) && out_w.ready, out_arw.ready) + out_arw.valid := !stall && in.a.valid && Mux(a_isPut, !doneAW && out_w.ready, Bool(true)) - out_w.valid := in.a.valid && a_isPut && (doneAW || out_arw.ready) + out_w.valid := !stall && in.a.valid && a_isPut && (doneAW || out_arw.ready) out_w.bits.data := in.a.bits.data out_w.bits.strb := in.a.bits.mask out_w.bits.last := a_last @@ -204,11 +158,11 @@ class TLToAXI4(val idBits: Int, val combinational: Boolean = true)(implicit p: P val r_wins = out.r.valid || r_holds_d out.r.ready := in.d.ready - out_b.ready := in.d.ready && !r_wins - in.d.valid := Mux(r_wins, out.r.valid, out_b.valid) + out.b.ready := in.d.ready && !r_wins + in.d.valid := Mux(r_wins, out.r.valid, out.b.valid) val r_error = out.r.bits.resp =/= AXI4Parameters.RESP_OKAY - val b_error = out_b.bits.resp =/= AXI4Parameters.RESP_OKAY + val b_error = out.b.bits.resp =/= AXI4Parameters.RESP_OKAY val r_d = edgeIn.AccessAck(r_addr_lo, UInt(0), r_source, r_size, UInt(0), r_error) val b_d = edgeIn.AccessAck(b_addr_lo, UInt(0), b_source, b_size, b_error) @@ -216,6 +170,31 @@ class TLToAXI4(val idBits: Int, val combinational: Boolean = true)(implicit p: P in.d.bits := Mux(r_wins, r_d, b_d) in.d.bits.data := out.r.bits.data // avoid a costly Mux + // We need to track if any reads or writes are inflight for a given ID. + // If the opposite type arrives, we must stall until it completes. + val a_sel = UIntToOH(arw.id, edgeOut.master.endId).toBools + val d_sel = UIntToOH(Mux(r_wins, out.r.bits.id, out.b.bits.id), edgeOut.master.endId).toBools + val d_last = Mux(r_wins, out.r.bits.last, Bool(true)) + val d_first = RegInit(Bool(true)) + when (in.d.fire()) { d_first := d_last } + val stalls = ((a_sel zip d_sel) zip idCount) filter { case (_, n) => n > 1 } map { case ((as, ds), n) => + val count = RegInit(UInt(0, width = log2Ceil(n + 1))) + val write = Reg(Bool()) + val idle = count === UInt(0) + + // Once we start getting the response, it's safe to already switch R/W + val inc = as && out_arw.fire() + val dec = ds && d_first && in.d.fire() + count := count + inc.asUInt - dec.asUInt + + assert (!dec || count =/= UInt(0)) // underflow + assert (!inc || count =/= UInt(n)) // overflow + + when (inc) { write := arw.wen } + !idle && write =/= arw.wen + } + stall := stalls.foldLeft(Bool(false))(_||_) + // Tie off unused channels in.b.valid := Bool(false) in.c.ready := Bool(true) @@ -226,9 +205,9 @@ class TLToAXI4(val idBits: Int, val combinational: Boolean = true)(implicit p: P object TLToAXI4 { - // applied to the TL source node; y.node := TLToAXI4(idBits)(x.node) - def apply(idBits: Int, combinational: Boolean = true)(x: TLOutwardNode)(implicit p: Parameters, sourceInfo: SourceInfo): AXI4OutwardNode = { - val axi4 = LazyModule(new TLToAXI4(idBits, combinational)) + // applied to the TL source node; y.node := TLToAXI4(beatBytes)(x.node) + def apply(beatBytes: Int, combinational: Boolean = true)(x: TLOutwardNode)(implicit p: Parameters, sourceInfo: SourceInfo): AXI4OutwardNode = { + val axi4 = LazyModule(new TLToAXI4(beatBytes, combinational)) axi4.node := x axi4.node } diff --git a/src/main/scala/uncore/tilelink2/package.scala b/src/main/scala/uncore/tilelink2/package.scala index 677e7248..7370b463 100644 --- a/src/main/scala/uncore/tilelink2/package.scala +++ b/src/main/scala/uncore/tilelink2/package.scala @@ -19,18 +19,18 @@ package object tilelink2 def UIntToOH1(x: UInt, width: Int) = ~(SInt(-1, width=width).asUInt << x)(width-1, 0) def trailingZeros(x: Int) = if (x > 0) Some(log2Ceil(x & -x)) else None // Fill 1s from low bits to high bits - def leftOR(x: UInt) = { - val w = x.getWidth + def leftOR(x: UInt): UInt = leftOR(x, x.getWidth) + def leftOR(x: UInt, w: Integer): UInt = { def helper(s: Int, x: UInt): UInt = if (s >= w) x else helper(s+s, x | (x << s)(w-1,0)) - helper(1, x) + helper(1, x)(w-1, 0) } // Fill 1s form high bits to low bits - def rightOR(x: UInt) = { - val w = x.getWidth + def rightOR(x: UInt): UInt = rightOR(x, x.getWidth) + def rightOR(x: UInt, w: Integer): UInt = { def helper(s: Int, x: UInt): UInt = if (s >= w) x else helper(s+s, x | (x >> s)) - helper(1, x) + helper(1, x)(w-1, 0) } // This gets used everywhere, so make the smallest circuit possible ... // Given an address and size, create a mask of beatBytes size