diff --git a/src/main/scala/diplomacy/Parameters.scala b/src/main/scala/diplomacy/Parameters.scala
index 2ce88b64..ddbe96d9 100644
--- a/src/main/scala/diplomacy/Parameters.scala
+++ b/src/main/scala/diplomacy/Parameters.scala
@@ -123,24 +123,15 @@ case class AddressSet(base: BigInt, mask: BigInt) extends Ordered[AddressSet]
 
 object AddressSet
 {
-  def misaligned(base: BigInt, size: BigInt): Seq[AddressSet] = {
-    val largestPow2 = BigInt(1) << log2Floor(size)
-    val mostZeros = (base + size - 1) & ~(largestPow2 - 1)
-    def splitLo(low: BigInt, high: BigInt, tail: Seq[AddressSet]): Seq[AddressSet] = {
-      if (low == high) tail else {
-        val toggleBits = low ^ high
-        val misalignment = toggleBits & (-toggleBits)
-        splitLo(low+misalignment, high, AddressSet(low, misalignment-1) +: tail)
-      }
+  def misaligned(base: BigInt, size: BigInt, tail: Seq[AddressSet] = Seq()): Seq[AddressSet] = {
+    if (size == 0) tail.reverse else {
+      val maxBaseAlignment = base & (-base) // 0 for infinite (LSB)
+      val maxSizeAlignment = BigInt(1) << log2Floor(size) // MSB of size
+      val step =
+        if (maxBaseAlignment == 0 || maxBaseAlignment > maxSizeAlignment)
+          maxSizeAlignment else maxBaseAlignment
+      misaligned(base+step, size-step, AddressSet(base, step-1) +: tail)
     }
-    def splitHi(low: BigInt, high: BigInt, tail: Seq[AddressSet]): Seq[AddressSet] = {
-      if (low == high) tail else {
-        val toggleBits = low ^ high
-        val misalignment = toggleBits & (-toggleBits)
-        splitHi(low, high-misalignment, AddressSet(high-misalignment, misalignment-1) +: tail)
-      }
-    }
-    splitLo(base, mostZeros, splitHi(mostZeros, base+size, Seq())).sorted
   }
 }
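For intuition, here is the same greedy decomposition modeled over plain BigInts, with one worked trace. This is an illustrative sketch only, not part of the patch; `log2Floor` is re-derived locally so the snippet is self-contained.

object MisalignedSketch {
  def log2Floor(x: BigInt): Int = x.bitLength - 1
  // Each (base, mask) pair mirrors AddressSet(base, step-1).
  def misaligned(base: BigInt, size: BigInt,
                 tail: List[(BigInt, BigInt)] = Nil): List[(BigInt, BigInt)] =
    if (size == 0) tail.reverse else {
      val maxBaseAlignment = base & (-base) // 0 when base == 0
      val maxSizeAlignment = BigInt(1) << log2Floor(size)
      val step =
        if (maxBaseAlignment == 0 || maxBaseAlignment > maxSizeAlignment)
          maxSizeAlignment else maxBaseAlignment
      misaligned(base + step, size - step, (base, step - 1) :: tail)
    }
  def main(args: Array[String]): Unit = {
    // [0x3, 0xC) decomposes into the single byte {0x3}, then [0x4,0x8), then [0x8,0xC):
    println(misaligned(3, 9)) // List((3,0), (4,3), (8,3))
  }
}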
diff --git a/src/main/scala/uncore/axi4/Buffer.scala b/src/main/scala/uncore/axi4/Buffer.scala
new file mode 100644
index 00000000..0b164ba9
--- /dev/null
+++ b/src/main/scala/uncore/axi4/Buffer.scala
@@ -0,0 +1,50 @@
+// See LICENSE for license details.
+
+package uncore.axi4
+
+import Chisel._
+import chisel3.internal.sourceinfo.SourceInfo
+import diplomacy._
+import scala.math.max
+
+// pipe is only used if a queue has depth = 1
+class AXI4Buffer(aw: Int = 2, w: Int = 2, b: Int = 2, ar: Int = 2, r: Int = 2, pipe: Boolean = true) extends LazyModule
+{
+  require (aw >= 0)
+  require (w  >= 0)
+  require (b  >= 0)
+  require (ar >= 0)
+  require (r  >= 0)
+
+  val node = AXI4IdentityNode()
+
+  lazy val module = new LazyModuleImp(this) {
+    val io = new Bundle {
+      val in  = node.bundleIn
+      val out = node.bundleOut
+    }
+
+    ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) =>
+      if (aw>0) { out.aw <> Queue(in .aw, aw, pipe && aw<2) } else { out.aw <> in .aw }
+      if (w >0) { out.w  <> Queue(in .w,  w,  pipe && w <2) } else { out.w  <> in .w  }
+      if (b >0) { in .b  <> Queue(out.b,  b,  pipe && b <2) } else { in .b  <> out.b  }
+      if (ar>0) { out.ar <> Queue(in .ar, ar, pipe && ar<2) } else { out.ar <> in .ar }
+      if (r >0) { in .r  <> Queue(out.r,  r,  pipe && r <2) } else { in .r  <> out.r  }
+    }
+  }
+}
+
+object AXI4Buffer
+{
+  // applied to the AXI4 source node; y.node := AXI4Buffer(x.node)
+  def apply()                               (x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = apply(2)(x)
+  def apply(entries: Int)                   (x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = apply(entries, true)(x)
+  def apply(entries: Int, pipe: Boolean)    (x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = apply(entries, entries, pipe)(x)
+  def apply(aw: Int, br: Int)               (x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = apply(aw, br, true)(x)
+  def apply(aw: Int, br: Int, pipe: Boolean)(x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = apply(aw, aw, br, aw, br, pipe)(x)
+  def apply(aw: Int, w: Int, b: Int, ar: Int, r: Int, pipe: Boolean = true)(x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = {
+    val buffer = LazyModule(new AXI4Buffer(aw, w, b, ar, r, pipe))
+    buffer.node := x
+    buffer.node
+  }
+}
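A hedged usage sketch of the overloads above. The `BufferedPath` module and its identity nodes are hypothetical stand-ins for real endpoints; a real design would attach actual master and slave nodes before elaboration.

import Chisel._
import diplomacy._
import uncore.axi4._

// Sketch: AXI4Buffer(aw, br) buffers the request channels (AW, W, AR) `aw`
// deep and the response channels (B, R) `br` deep, per the overload chain.
class BufferedPath extends LazyModule {
  val in  = AXI4IdentityNode() // hypothetical upstream attachment point
  val out = AXI4IdentityNode() // hypothetical downstream attachment point
  out := AXI4Buffer(2, 1)(in)  // expands to AXI4Buffer(2, 2, 1, 2, 1, true)
  lazy val module = new LazyModuleImp(this) { }
}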
diff --git a/src/main/scala/uncore/axi4/Fragmenter.scala b/src/main/scala/uncore/axi4/Fragmenter.scala
new file mode 100644
index 00000000..57015359
--- /dev/null
+++ b/src/main/scala/uncore/axi4/Fragmenter.scala
@@ -0,0 +1,295 @@
+// See LICENSE for license details.
+
+package uncore.axi4
+
+import Chisel._
+import chisel3.internal.sourceinfo.SourceInfo
+import chisel3.util.IrrevocableIO
+import diplomacy._
+import scala.math.{min,max}
+import uncore.tilelink2.{leftOR, rightOR, UIntToOH1}
+
+// lite: masters all use only one ID => reads will not be interleaved
+class AXI4Fragmenter(lite: Boolean = false, maxInFlight: Int = 32, combinational: Boolean = true) extends LazyModule
+{
+  val maxBeats = 1 << AXI4Parameters.lenBits
+  def expandTransfer(x: TransferSizes, beatBytes: Int, alignment: BigInt) =
+    if (!x) x else TransferSizes(x.min, alignment.min(maxBeats*beatBytes).intValue)
+  def mapSlave(s: AXI4SlaveParameters, beatBytes: Int) = s.copy(
+    supportsWrite = expandTransfer(s.supportsWrite, beatBytes, s.minAlignment),
+    supportsRead  = expandTransfer(s.supportsRead,  beatBytes, s.minAlignment),
+    interleavedId = if (lite) Some(0) else s.interleavedId) // see AXI4FragmenterSideband for !lite case
+  def mapMaster(m: AXI4MasterParameters) = m.copy(aligned = true)
+
+  val node = AXI4AdapterNode(
+    masterFn = { case Seq(mp) => mp.copy(masters = mp.masters.map(m => mapMaster(m))) },
+    slaveFn  = { case Seq(sp) => sp.copy(slaves  = sp.slaves .map(s => mapSlave(s, sp.beatBytes))) })
+
+  lazy val module = new LazyModuleImp(this) {
+    val io = new Bundle {
+      val in  = node.bundleIn
+      val out = node.bundleOut
+    }
+
+    val edgeOut   = node.edgesOut(0)
+    val edgeIn    = node.edgesIn(0)
+    val slave     = edgeOut.slave
+    val slaves    = slave.slaves
+    val beatBytes = slave.beatBytes
+    val lgBytes   = log2Ceil(beatBytes)
+    val master    = edgeIn.master
+    val masters   = master.masters
+
+    // If the user claimed this was a lite interface, then there must be only one Id
+    require (!lite || master.endId == 1)
+
+    // We don't support fragmenting to sub-beat accesses
+    slaves.foreach { s =>
+      require (!s.supportsRead  || s.supportsRead.contains(beatBytes))
+      require (!s.supportsWrite || s.supportsWrite.contains(beatBytes))
+    }
+
+    /* We need to decompose a request into
+     *   FIXED => each beat is a new request
+     *   WRAP/INCR => take xfr up to next power of two, capped by max size of target
+     *
+     * On AR and AW, we fragment one request into many
+     * On W we set 'last' on beats which are fragment boundaries
+     * On R we clear 'last' on the fragments being reassembled
+     * On B we clear 'valid' on the responses for the injected fragments
+     *
+     * AR=>R and AW+W=>B are completely independent state machines.
+     */
+
+    /* Returns the number of beats to execute and the new address */
+    def fragment(a: IrrevocableIO[AXI4BundleA], supportedSizes1: Seq[Int]): (IrrevocableIO[AXI4BundleA], Bool, UInt) = {
+      val out = Wire(a)
+
+      val busy   = RegInit(Bool(false))
+      val r_addr = Reg(UInt(width = a.bits.params.addrBits))
+      val r_len  = Reg(UInt(width = AXI4Parameters.lenBits))
+
+      val len  = Mux(busy, r_len,  a.bits.len)
+      val addr = Mux(busy, r_addr, a.bits.addr)
+
+      val lo = if (lgBytes == 0) UInt(0) else addr(lgBytes-1, 0)
+      val hi = addr >> lgBytes
+      val alignment = hi(AXI4Parameters.lenBits-1,0)
+
+      val allSame = supportedSizes1.filter(_ >= 0).distinct.size <= 1
+      val dynamic1 = Mux1H(slave.findFast(addr), supportedSizes1.map(s => UInt(max(0, s))))
+      val fixed1 = UInt(supportedSizes1.filter(_ >= 0).headOption.getOrElse(0))
+
+      /* We need to compute the largest transfer allowed by the AXI len.
+       * len+1 is the number of beats to execute.
+       * We want the MSB(len+1)-1; one less than the largest power of two we could execute.
+       * There are two cases; either len is 2^n-1, in which case we leave it unchanged, ELSE
+       * fill the bits from highest to lowest, and shift right by one bit.
+       */
+      val fillLow  = rightOR(len) >> 1   // set all bits in positions <  a set bit
+      val wipeHigh = ~leftOR(~len)       // clear all bits in positions >= a cleared bit
+      val remain1  = fillLow | wipeHigh  // MSB(a.len+1)-1
+      val align1   = ~leftOR(alignment)  // transfer size limited by address alignment
+      val support1 = if (allSame) fixed1 else dynamic1 // maximum supported size-1 based on target address
+      val maxSupported1 = remain1 & align1 & support1 // Take the minimum of all the limits
+
+      // Things that cause us to degenerate to a single beat
+      val fixed  = a.bits.burst === AXI4Parameters.BURST_FIXED
+      val narrow = a.bits.size =/= UInt(lgBytes)
+      val bad    = fixed || narrow
+
+      // The number of beats-1 to execute
+      val beats1 = Mux(bad, UInt(0), maxSupported1)
+      val beats = ~(~(beats1 << 1 | UInt(1)) | beats1) // beats1 + 1
+
+      val inc_addr = addr + (beats << a.bits.size) // address after adding transfer
+      val wrapMask = ~(~a.bits.len << a.bits.size) // only these bits may change, if wrapping
+      val mux_addr = Wire(init = inc_addr)
+      when (a.bits.burst === AXI4Parameters.BURST_WRAP) {
+        mux_addr := (inc_addr & wrapMask) | ~(~a.bits.addr | wrapMask)
+      }
+      when (a.bits.burst === AXI4Parameters.BURST_FIXED) {
+        mux_addr := a.bits.addr
+      }
+
+      val last = beats1 === len
+      a.ready := out.ready && last
+      out.valid := a.valid
+
+      out.bits := a.bits
+      out.bits.len := beats1
+
+      // We forcibly align every access. If the first beat was misaligned, the strb bits
+      // for the lower addresses must not have been set. Therefore, rounding the address
+      // down is harmless. We can do this after the address update algorithm, because the
+      // incremented values will be rounded down the same way. Furthermore, a subword
+      // offset cannot cause a premature wrap-around.
+      out.bits.addr := ~(~addr | UIntToOH1(a.bits.size, lgBytes))
+
+      when (out.fire()) {
+        busy := !last
+        r_addr := mux_addr
+        r_len  := len - beats
+      }
+
+      (out, last, beats)
+    }
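The MSB(len+1)-1 bit trick can be checked with a pure-Scala model. This is an illustrative sketch only, not part of the patch; an 8-bit len is assumed and the leftOR/rightOR helpers from uncore.tilelink2 are re-modeled over Int.

object Remain1Sketch {
  val w = 8
  def mask(x: Int) = x & ((1 << w) - 1)
  def leftOR(x: Int): Int = { // spread 1s from each set bit toward high bits
    def helper(s: Int, v: Int): Int = if (s >= w) v else helper(s + s, v | mask(v << s))
    helper(1, x)
  }
  def rightOR(x: Int): Int = { // spread 1s from each set bit toward low bits
    def helper(s: Int, v: Int): Int = if (s >= w) v else helper(s + s, v | (v >>> s))
    helper(1, x)
  }
  def main(args: Array[String]): Unit = {
    for (len <- Seq(5, 7)) { // 6 and 8 requested beats
      val fillLow  = rightOR(len) >> 1
      val wipeHigh = mask(~leftOR(mask(~len)))
      val remain1  = fillLow | wipeHigh                           // MSB(len+1)-1
      val beats    = mask(~(mask(~(remain1 << 1 | 1)) | remain1)) // remain1 + 1, as in hardware
      println(s"len=$len remain1=$remain1 beats=$beats") // len=5 -> 3,4; len=7 -> 7,8
    }
  }
}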
+
+    val in  = io.in(0)
+    val out = io.out(0)
+
+    // The size to which we will fragment the access
+    val readSizes1  = slaves.map(s => s.supportsRead .max/beatBytes-1)
+    val writeSizes1 = slaves.map(s => s.supportsWrite.max/beatBytes-1)
+
+    // Indirection variables for inputs and outputs; makes transformation application easier
+    val (in_ar, ar_last, _)       = fragment(in.ar, readSizes1)
+    val (in_aw, aw_last, w_beats) = fragment(in.aw, writeSizes1)
+    val in_w = in.w
+    val in_r = in.r
+    val in_b = in.b
+    val out_ar = Wire(out.ar)
+    val out_aw = out.aw
+    val out_w = out.w
+    val out_r = Wire(out.r)
+    val out_b = Wire(out.b)
+
+    val depth = if (combinational) 1 else 2
+    // In case a slave ties arready := rready, we need a queue to break the combinational loop
+    // between the two branches (in_ar => {out_ar => out_r, sideband} => in_r).
+    if (in.ar.bits.getWidth < in.r.bits.getWidth) {
+      out.ar <> Queue(out_ar, depth, flow=combinational)
+      out_r <> out.r
+    } else {
+      out.ar <> out_ar
+      out_r <> Queue(out.r, depth, flow=combinational)
+    }
+    // In case a slave ties awready := bready or wready := bready, we need this queue
+    out_b <> Queue(out.b, depth, flow=combinational)
+
+    // Sideband to track which transfers were the last fragment
+    def sideband() = if (lite) {
+      Module(new Queue(Bool(), maxInFlight, flow=combinational)).io
+    } else {
+      Module(new AXI4FragmenterSideband(maxInFlight, flow=combinational)).io
+    }
+    val sideband_ar_r = sideband()
+    val sideband_aw_b = sideband()
+
+    // AR flow control
+    out_ar.valid := in_ar.valid && sideband_ar_r.enq.ready
+    in_ar.ready := sideband_ar_r.enq.ready && out_ar.ready
+    sideband_ar_r.enq.valid := in_ar.valid && out_ar.ready
+    out_ar.bits := in_ar.bits
+    sideband_ar_r.enq.bits := ar_last
+
+    // When does the W channel start counting a new transfer
+    val wbeats_latched = RegInit(Bool(false))
+    val wbeats_ready = Wire(Bool())
+    val wbeats_valid = Wire(Bool())
+    when (wbeats_valid && wbeats_ready) { wbeats_latched := Bool(true) }
+    when (out_aw.fire()) { wbeats_latched := Bool(false) }
+
+    // AW flow control
+    out_aw.valid := in_aw.valid && sideband_aw_b.enq.ready && (wbeats_ready || wbeats_latched)
+    in_aw.ready := sideband_aw_b.enq.ready && out_aw.ready && (wbeats_ready || wbeats_latched)
+    sideband_aw_b.enq.valid := in_aw.valid && out_aw.ready && (wbeats_ready || wbeats_latched)
+    wbeats_valid := in_aw.valid && !wbeats_latched
+    out_aw.bits := in_aw.bits
+    sideband_aw_b.enq.bits := aw_last
+
+    // We need to inject 'last' into the W channel fragments, so count the beats
+    val w_counter = RegInit(UInt(0, width = AXI4Parameters.lenBits+1))
+    val w_idle = w_counter === UInt(0)
+    val w_todo = Mux(w_idle, Mux(wbeats_valid, w_beats, UInt(0)), w_counter)
+    val w_last = w_todo === UInt(1)
+    w_counter := w_todo - out_w.fire()
+    assert (!out_w.fire() || w_todo =/= UInt(0)) // underflow impossible
+
+    // W flow control
+    wbeats_ready := w_idle
+    out_w.valid := in_w.valid && (!wbeats_ready || wbeats_valid)
+    in_w.ready := out_w.ready && (!wbeats_ready || wbeats_valid)
+    out_w.bits := in_w.bits
+    out_w.bits.last := w_last
+    // We should also recreate the original 'last' on the final fragment
+    assert (!out_w.valid || !in_w.bits.last || w_last)
+
+    // R flow control
+    val r_last = out_r.bits.last
+    in_r.valid := out_r.valid && (!r_last || sideband_ar_r.deq.valid)
+    out_r.ready := in_r.ready && (!r_last || sideband_ar_r.deq.valid)
+    sideband_ar_r.deq.ready := r_last && out_r.valid && in_r.ready
+    in_r.bits := out_r.bits
+    in_r.bits.last := r_last && sideband_ar_r.deq.bits
+
+    // B flow control
+    val b_last = sideband_aw_b.deq.bits
+    in_b.valid := out_b.valid && sideband_aw_b.deq.valid && b_last
+    out_b.ready := sideband_aw_b.deq.valid && (!b_last || in_b.ready)
+    sideband_aw_b.deq.ready := out_b.valid && (!b_last || in_b.ready)
+    in_b.bits := out_b.bits
+
+    // Merge errors from dropped B responses
+    val r_resp = RegInit(UInt(0, width = AXI4Parameters.respBits))
+    val resp = out_b.bits.resp | r_resp
+    when (out_b.fire()) { r_resp := Mux(b_last, UInt(0), resp) }
+    in_b.bits.resp := resp
+  }
+}
+
+/* We want to put barriers between the fragments of a fragmented transfer and all other transfers.
+ * This lets us use very little state to reassemble the fragments (else we need one FIFO per ID).
+ * Furthermore, because all the fragments share the same AXI ID, they come back contiguously.
+ * This guarantees that no other R responses might get mixed between fragments, ensuring that the
+ * interleavedId for the slaves remains unaffected by the fragmentation transformation.
+ * Of course, if you need to fragment, this means there is a potentially hefty serialization cost.
+ * However, this design allows full concurrency in the common no-fragmentation-needed scenario.
+ */
+class AXI4FragmenterSideband(maxInFlight: Int, flow: Boolean = false) extends Module
+{
+  val io = new QueueIO(Bool(), maxInFlight)
+  io.count := UInt(0)
+
+  val PASS = UInt(2, width = 2) // allow 'last=1' bits to enqueue; on 'last=0', block if count>0, else accept and switch to FIND
+  val FIND = UInt(0, width = 2) // allow 'last=0' bits to enqueue; accept 'last=1' and switch to WAIT
+  val WAIT = UInt(1, width = 2) // block all access until count=0
+
+  val state = RegInit(PASS)
+  val count = RegInit(UInt(0, width = log2Up(maxInFlight)))
+  val full  = count === UInt(maxInFlight-1)
+  val empty = count === UInt(0)
+  val last  = count === UInt(1)
+
+  io.deq.bits := state(1) || (last && state(0)) // PASS || (last && WAIT)
+  io.deq.valid := !empty
+
+  io.enq.ready := !full && (empty || (state === FIND) || (state === PASS && io.enq.bits))
+
+  // WAIT => count > 0
+  assert (state =/= WAIT || count =/= UInt(0))
+
+  if (flow) {
+    when (io.enq.valid) {
+      io.deq.valid := Bool(true)
+      when (empty) { io.deq.bits := io.enq.bits }
+    }
+  }
+
+  count := count + io.enq.fire() - io.deq.fire()
+  switch (state) {
+    is(PASS) { when (io.enq.valid && !io.enq.bits && empty) { state := FIND } }
+    is(FIND) { when (io.enq.valid &&  io.enq.bits && !full) { state := Mux(empty, PASS, WAIT) } }
+    is(WAIT) { when (last && io.deq.ready)                  { state := PASS } }
+  }
+}
+
+object AXI4Fragmenter
+{
+  // applied to the AXI4 source node; y.node := AXI4Fragmenter()(x.node)
+  def apply(lite: Boolean = false, maxInFlight: Int = 32, combinational: Boolean = true)(x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = {
+    val fragmenter = LazyModule(new AXI4Fragmenter(lite, maxInFlight, combinational))
+    fragmenter.node := x
+    fragmenter.node
+  }
+}
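A pure-Scala model of the PASS/FIND/WAIT handshake above may help. This is an illustrative sketch only; the flow-through path and the dequeue side are not modeled, and the RTL keys its transitions on enq.valid exactly as mirrored here.

object SidebandSketch {
  sealed trait State
  case object PASS extends State // whole transfers stream through
  case object FIND extends State // admitting the fragments of one transfer
  case object WAIT extends State // fragments queued behind others: drain first
  def enqReady(s: State, last: Boolean, count: Int, maxInFlight: Int): Boolean = {
    val full  = count == maxInFlight - 1
    val empty = count == 0
    !full && (empty || s == FIND || (s == PASS && last))
  }
  def next(s: State, enqValid: Boolean, last: Boolean, count: Int,
           maxInFlight: Int, deqReady: Boolean): State = {
    val full  = count == maxInFlight - 1
    val empty = count == 0
    s match {
      case PASS if enqValid && !last && empty => FIND
      case FIND if enqValid &&  last && !full => if (empty) PASS else WAIT
      case WAIT if count == 1 && deqReady     => PASS
      case _                                  => s
    }
  }
}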
diff --git a/src/main/scala/uncore/axi4/Nodes.scala b/src/main/scala/uncore/axi4/Nodes.scala
index cafac4f1..0613a70b 100644
--- a/src/main/scala/uncore/axi4/Nodes.scala
+++ b/src/main/scala/uncore/axi4/Nodes.scala
@@ -40,8 +40,8 @@ case class AXI4SlaveNode(portParams: AXI4SlavePortParameters, numPorts: Range.In
   extends SinkNode(AXI4Imp)(portParams, numPorts)
 
 case class AXI4AdapterNode(
-  clientFn:       Seq[AXI4MasterPortParameters] => AXI4MasterPortParameters,
-  managerFn:      Seq[AXI4SlavePortParameters]  => AXI4SlavePortParameters,
+  masterFn:       Seq[AXI4MasterPortParameters] => AXI4MasterPortParameters,
+  slaveFn:        Seq[AXI4SlavePortParameters]  => AXI4SlavePortParameters,
   numMasterPorts: Range.Inclusive = 1 to 1,
   numSlavePorts:  Range.Inclusive = 1 to 1)
-  extends InteriorNode(AXI4Imp)(clientFn, managerFn, numMasterPorts, numSlavePorts)
+  extends InteriorNode(AXI4Imp)(masterFn, slaveFn, numMasterPorts, numSlavePorts)
diff --git a/src/main/scala/uncore/axi4/Parameters.scala b/src/main/scala/uncore/axi4/Parameters.scala
index dacc80ee..17a74140 100644
--- a/src/main/scala/uncore/axi4/Parameters.scala
+++ b/src/main/scala/uncore/axi4/Parameters.scala
@@ -21,9 +21,10 @@ case class AXI4SlaveParameters(
   val name = nodePath.lastOption.map(_.lazyModule.name).getOrElse("disconnected")
   val maxTransfer = max(supportsWrite.max, supportsRead.max)
   val maxAddress = address.map(_.max).max
+  val minAlignment = address.map(_.alignment).min
 
   // The device had better not support a transfer larger than its alignment
-  address.foreach { case a => require (a.alignment >= maxTransfer) }
+  require (minAlignment >= maxTransfer)
 }
 
 case class AXI4SlavePortParameters(
@@ -41,6 +42,10 @@ case class AXI4SlavePortParameters(
   // Check that the link can be implemented in AXI4
   require (maxTransfer <= beatBytes * (1 << AXI4Parameters.lenBits))
 
+  lazy val routingMask = AddressDecoder(slaves.map(_.address))
+  def findSafe(address: UInt) = Vec(slaves.map(_.address.map(_.contains(address)).reduce(_ || _)))
+  def findFast(address: UInt) = Vec(slaves.map(_.address.map(_.widen(~routingMask)).distinct.map(_.contains(address)).reduce(_ || _)))
+
   // Require disjoint ranges for addresses
   slaves.combinations(2).foreach { case Seq(x,y) =>
     x.address.foreach { a => y.address.foreach { b =>
@@ -51,6 +56,7 @@ case class AXI4SlavePortParameters(
 
 case class AXI4MasterParameters(
   id:       IdRange = IdRange(0, 1),
+  aligned:  Boolean = false,
   nodePath: Seq[BaseNode] = Seq())
 {
   val name = nodePath.lastOption.map(_.lazyModule.name).getOrElse("disconnected")
diff --git a/src/main/scala/uncore/axi4/RegisterRouter.scala b/src/main/scala/uncore/axi4/RegisterRouter.scala
index b46d69f0..97b598d5 100644
--- a/src/main/scala/uncore/axi4/RegisterRouter.scala
+++ b/src/main/scala/uncore/axi4/RegisterRouter.scala
@@ -49,7 +49,7 @@ class AXI4RegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int
     // Invoke the register map builder and make it Irrevocable
     val out = Queue.irrevocable(
       RegMapper(beatBytes, concurrency, undefZero, in, mapping:_*),
-      entries = 1, pipe = true, flow = true)
+      entries = 1, flow = true) // No flow control needed
 
     out.ready := Mux(out.bits.read, r.ready, b.ready)
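The new findFast leans on AddressSet.contains and widen; a pure BigInt model shows why widening by the complement of the routing mask is safe. This is a sketch only — the real classes live in diplomacy, and the routingMask value here is assumed, not computed by AddressDecoder.

object WidenSketch {
  case class AddrSet(base: BigInt, mask: BigInt) {
    def contains(x: BigInt): Boolean = ((x ^ base) & ~mask) == 0
    def widen(dontCare: BigInt): AddrSet = AddrSet(base & ~dontCare, mask | dontCare)
  }
  def main(args: Array[String]): Unit = {
    val slave0 = AddrSet(0x000, 0x3ff) // [0x000, 0x400)
    val slave1 = AddrSet(0x400, 0x3ff) // [0x400, 0x800)
    // Suppose the decoder proved only bit 10 matters: routingMask = 0x400.
    val dontCare = ~BigInt(0x400)
    println(slave0.widen(dontCare).contains(0x123)) // true:  bit 10 is clear
    println(slave1.widen(dontCare).contains(0x123)) // false: bit 10 selects slave1
  }
}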
diff --git a/src/main/scala/uncore/axi4/Test.scala b/src/main/scala/uncore/axi4/Test.scala
new file mode 100644
index 00000000..53620ff3
--- /dev/null
+++ b/src/main/scala/uncore/axi4/Test.scala
@@ -0,0 +1,61 @@
+// See LICENSE for license details.
+
+package uncore.axi4
+
+import Chisel._
+import diplomacy._
+import uncore.tilelink2._
+import unittest._
+
+class RRTest0(address: BigInt) extends AXI4RegisterRouter(address, 0, 32, 0, 4)(
+  new AXI4RegBundle((), _)    with RRTest0Bundle)(
+  new AXI4RegModule((), _, _) with RRTest0Module)
+
+class RRTest1(address: BigInt) extends AXI4RegisterRouter(address, 0, 32, 6, 4, false)(
+  new AXI4RegBundle((), _)    with RRTest1Bundle)(
+  new AXI4RegModule((), _, _) with RRTest1Module)
+
+class AXI4LiteFuzzRAM extends LazyModule
+{
+  val fuzz  = LazyModule(new TLFuzzer(5000))
+  val model = LazyModule(new TLRAMModel("AXI4LiteFuzzRAM"))
+  val xbar  = LazyModule(new TLXbar)
+  val gpio  = LazyModule(new RRTest1(0x400))
+  val ram   = LazyModule(new AXI4RAM(AddressSet(0x0, 0x3ff)))
+
+  model.node := fuzz.node
+  xbar.node  := model.node
+  ram.node   := AXI4Fragmenter(lite=true)(TLToAXI4(0, true )(xbar.node))
+  gpio.node  := AXI4Fragmenter(lite=true)(TLToAXI4(0, false)(xbar.node))
+
+  lazy val module = new LazyModuleImp(this) with HasUnitTestIO {
+    io.finished := fuzz.module.io.finished
+  }
+}
+
+class AXI4LiteFuzzRAMTest extends UnitTest(500000) {
+  val dut = Module(LazyModule(new AXI4LiteFuzzRAM).module)
+  io.finished := dut.io.finished
+}
+
+class AXI4FullFuzzRAM extends LazyModule
+{
+  val fuzz  = LazyModule(new TLFuzzer(5000))
+  val model = LazyModule(new TLRAMModel("AXI4FullFuzzRAM"))
+  val xbar  = LazyModule(new TLXbar)
+  val gpio  = LazyModule(new RRTest0(0x400))
+  val ram   = LazyModule(new AXI4RAM(AddressSet(0x0, 0x3ff)))
+
+  model.node := fuzz.node
+  xbar.node  := model.node
+  ram.node   := AXI4Fragmenter(lite=false, maxInFlight = 2)(TLToAXI4(4,false)(xbar.node))
+  gpio.node  := AXI4Fragmenter(lite=false, maxInFlight = 5)(TLToAXI4(4,true )(xbar.node))
+
+  lazy val module = new LazyModuleImp(this) with HasUnitTestIO {
+    io.finished := fuzz.module.io.finished
+  }
+}
+
+class AXI4FullFuzzRAMTest extends UnitTest(500000) {
+  val dut = Module(LazyModule(new AXI4FullFuzzRAM).module)
+  io.finished := dut.io.finished
+}
diff --git a/src/main/scala/uncore/tilelink2/AtomicAutomata.scala b/src/main/scala/uncore/tilelink2/AtomicAutomata.scala
index dd74f6e6..8bea60de 100644
--- a/src/main/scala/uncore/tilelink2/AtomicAutomata.scala
+++ b/src/main/scala/uncore/tilelink2/AtomicAutomata.scala
@@ -138,8 +138,8 @@ class TLAtomicAutomata(logical: Boolean = true, arithmetic: Boolean = true, conc
       // Move the selected sign bit into the first byte position it will extend
       val signbit_a = ((signbits_a & signSel) << 1)(beatBytes-1, 0)
       val signbit_d = ((signbits_d & signSel) << 1)(beatBytes-1, 0)
-      val signext_a = FillInterleaved(8, highOR(signbit_a))
-      val signext_d = FillInterleaved(8, highOR(signbit_d))
+      val signext_a = FillInterleaved(8, leftOR(signbit_a))
+      val signext_d = FillInterleaved(8, leftOR(signbit_d))
       // NOTE: sign-extension does not change the relative ordering in EITHER unsigned or signed arithmetic
       val wide_mask = FillInterleaved(8, mask)
       val a_a_ext = (a_a & wide_mask) | signext_a
diff --git a/src/main/scala/uncore/tilelink2/Fragmenter.scala b/src/main/scala/uncore/tilelink2/Fragmenter.scala
index 4de68af9..940342b9 100644
--- a/src/main/scala/uncore/tilelink2/Fragmenter.scala
+++ b/src/main/scala/uncore/tilelink2/Fragmenter.scala
@@ -12,7 +12,7 @@ import scala.math.{min,max}
 // alwaysMin: fragment all requests down to minSize (else fragment to maximum supported by manager)
 // Fragmenter modifies: PutFull, PutPartial, LogicalData, Get, Hint
 // Fragmenter passes: ArithmeticData (truncated to minSize if alwaysMin)
-// Fragmenter breaks: Acquire (and thus cuts BCE channels)
+// Fragmenter cannot modify Acquire (could livelock); thus it is unsafe to put caches on both sides
 class TLFragmenter(minSize: Int, maxSize: Int, alwaysMin: Boolean = false) extends LazyModule
 {
   require (isPow2 (maxSize))
@@ -30,7 +30,6 @@ class TLFragmenter(minSize: Int, maxSize: Int, alwaysMin: Boolean = false) exten
     if (x.min <= minSize) TransferSizes(x.min, min(minSize, x.max)) else TransferSizes.none
 
   def mapManager(m: TLManagerParameters) = m.copy(
-    supportsAcquire    = TransferSizes.none, // this adapter breaks acquires
     supportsArithmetic = shrinkTransfer(m.supportsArithmetic),
     supportsLogical    = expandTransfer(m.supportsLogical),
     supportsGet        = expandTransfer(m.supportsGet),
@@ -38,15 +37,7 @@ class TLFragmenter(minSize: Int, maxSize: Int, alwaysMin: Boolean = false) exten
     supportsPutPartial = expandTransfer(m.supportsPutPartial),
     supportsHint       = expandTransfer(m.supportsHint))
   def mapClient(c: TLClientParameters) = c.copy(
-    sourceId = IdRange(c.sourceId.start << fragmentBits, c.sourceId.end << fragmentBits),
-    // since we break Acquires, none of these work either:
-    supportsProbe      = TransferSizes.none,
-    supportsArithmetic = TransferSizes.none,
-    supportsLogical    = TransferSizes.none,
-    supportsGet        = TransferSizes.none,
-    supportsPutFull    = TransferSizes.none,
-    supportsPutPartial = TransferSizes.none,
-    supportsHint       = TransferSizes.none)
+    sourceId = IdRange(c.sourceId.start << fragmentBits, c.sourceId.end << fragmentBits))
 
   // Because the Fragmenter stalls inner A while serving outer, it can wipe away inner latency
   val node = TLAdapterNode(
@@ -70,6 +61,8 @@ class TLFragmenter(minSize: Int, maxSize: Int, alwaysMin: Boolean = false) exten
 
     // We don't support fragmenting to sub-beat accesses
     require (minSize >= beatBytes)
+    // We can't support devices which are cached on both sides of us
+    require (!edgeOut.manager.anySupportAcquire || !edgeIn.client.anySupportProbe)
 
     /* The Fragmenter is a bit tricky, because there are 5 sizes in play:
      *   max size -- the maximum transfer size possible
@@ -174,6 +167,12 @@ class TLFragmenter(minSize: Int, maxSize: Int, alwaysMin: Boolean = false) exten
     in.d.bits.source := out.d.bits.source >> fragmentBits
     in.d.bits.size   := Mux(dFirst, dFirst_size, dOrig)
 
+    // Combine the error flag
+    val r_error = RegInit(Bool(false))
+    val d_error = r_error | out.d.bits.error
+    when (out.d.fire()) { r_error := Mux(drop, d_error, UInt(0)) }
+    in.d.bits.error := d_error
+
     // What maximum transfer sizes do downstream devices support?
     val maxArithmetics = managers.map(_.supportsArithmetic.max)
     val maxLogicals    = managers.map(_.supportsLogical.max)
@@ -271,4 +270,3 @@ class TLRAMFragmenter(ramBeatBytes: Int, maxSize: Int) extends LazyModule {
 class TLRAMFragmenterTest(ramBeatBytes: Int, maxSize: Int) extends UnitTest(timeout = 500000) {
   io.finished := Module(LazyModule(new TLRAMFragmenter(ramBeatBytes,maxSize)).module).io.finished
 }
-
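The sourceId remapping above simply appends fragment-index bits below the original source id, and D responses strip them with a right shift. A pure-Int sketch (the fragmentBits value of 4 is hypothetical; the real adapter derives it from maxSize/minSize):

object SourceIdSketch {
  val fragmentBits = 4
  def pack(source: Int, fragment: Int): Int = (source << fragmentBits) | fragment
  def unpack(packed: Int): (Int, Int) =
    (packed >> fragmentBits, packed & ((1 << fragmentBits) - 1))
  def main(args: Array[String]): Unit = {
    println(pack(3, 5))  // 53: client source 3, fragment 5
    println(unpack(53))  // (3,5); D recovers the client via source >> fragmentBits
  }
}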
diff --git a/src/main/scala/uncore/tilelink2/Fuzzer.scala b/src/main/scala/uncore/tilelink2/Fuzzer.scala
index 4063985d..de9c9fa5 100644
--- a/src/main/scala/uncore/tilelink2/Fuzzer.scala
+++ b/src/main/scala/uncore/tilelink2/Fuzzer.scala
@@ -17,7 +17,7 @@ class IDMapGenerator(numIds: Int) extends Module {
   io.free.ready := Bool(true)
   assert (!io.free.valid || !bitmap(io.free.bits)) // No double freeing
 
-  val select = ~(highOR(bitmap) << 1) & bitmap
+  val select = ~(leftOR(bitmap) << 1) & bitmap
   io.alloc.bits := OHToUInt(select)
   io.alloc.valid := bitmap.orR()
 
@@ -206,7 +206,7 @@ import unittest._
 
 class TLFuzzRAM extends LazyModule
 {
-  val model = LazyModule(new TLRAMModel)
+  val model = LazyModule(new TLRAMModel("TLFuzzRAM"))
   val ram  = LazyModule(new TLRAM(AddressSet(0x800, 0x7ff)))
   val ram2 = LazyModule(new TLRAM(AddressSet(0, 0x3ff), beatBytes = 16))
   val gpio = LazyModule(new RRTest1(0x400))
diff --git a/src/main/scala/uncore/tilelink2/Parameters.scala b/src/main/scala/uncore/tilelink2/Parameters.scala
index 45b95c13..097f7a22 100644
--- a/src/main/scala/uncore/tilelink2/Parameters.scala
+++ b/src/main/scala/uncore/tilelink2/Parameters.scala
@@ -25,9 +25,7 @@ case class TLManagerParameters(
   customDTS: Option[String]= None)
 {
   address.foreach { a => require (a.finite) }
-  address.combinations(2).foreach({ case Seq(x,y) =>
-    require (!x.overlaps(y))
-  })
+  address.combinations(2).foreach { case Seq(x,y) => require (!x.overlaps(y)) }
   require (supportsPutFull.contains(supportsPutPartial))
 
   // Largest supported transfer of all types
@@ -38,6 +36,7 @@ case class TLManagerParameters(
     supportsGet.max,
     supportsPutFull.max,
     supportsPutPartial.max).max
+  val maxAddress = address.map(_.max).max
 
   val name = nodePath.lastOption.map(_.lazyModule.name).getOrElse("disconnected")
 
@@ -53,9 +52,8 @@ case class TLManagerParameters(
   }
 
   // The device had better not support a transfer larger than its alignment
-  address.foreach({ case a =>
-    require (a.alignment >= maxTransfer)
-  })
+  val minAlignment = address.map(_.alignment).min
+  require (minAlignment >= maxTransfer)
 }
 
 case class TLManagerPortParameters(
@@ -77,7 +75,7 @@ case class TLManagerPortParameters(
 
   // Bounds on required sizes
   def endSinkId   = managers.map(_.sinkId.end).max
-  def maxAddress  = managers.map(_.address.map(_.max).max).max
+  def maxAddress  = managers.map(_.maxAddress).max
   def maxTransfer = managers.map(_.maxTransfer).max
 
   // Operation sizes supported by all outward Managers
@@ -166,6 +164,13 @@ case class TLClientParameters(
   supportsHint:       TransferSizes = TransferSizes.none)
 {
   require (supportsPutFull.contains(supportsPutPartial))
+  // We only support these operations if we support Probe (ie: we're a cache)
+  require (supportsProbe.contains(supportsArithmetic))
+  require (supportsProbe.contains(supportsLogical))
+  require (supportsProbe.contains(supportsGet))
+  require (supportsProbe.contains(supportsPutFull))
+  require (supportsProbe.contains(supportsPutPartial))
+  require (supportsProbe.contains(supportsHint))
 
   val maxTransfer = List(
     supportsProbe.max,
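An approximate model of the TransferSizes.contains relation these new requires lean on. This is a sketch only — the real semantics live in uncore.tilelink2.TransferSizes; here (0,0) encodes "none".

object TransferSizesSketch {
  case class TS(min: Int, max: Int) {
    def none: Boolean = max == 0
    def contains(x: TS): Boolean = x.none || (!none && min <= x.min && x.max <= max)
  }
  def main(args: Array[String]): Unit = {
    val noProbe = TS(0, 0)
    println(noProbe.contains(TS(4, 64)))   // false: a non-cache client may not claim Get, etc.
    println(noProbe.contains(TS(0, 0)))    // true: supporting nothing is always allowed
    println(TS(4, 64).contains(TS(4, 16))) // true: a cache's ops fit inside its Probe range
  }
}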
diff --git a/src/main/scala/uncore/tilelink2/RAMModel.scala b/src/main/scala/uncore/tilelink2/RAMModel.scala
index cfeb8cce..2ee1d607 100644
--- a/src/main/scala/uncore/tilelink2/RAMModel.scala
+++ b/src/main/scala/uncore/tilelink2/RAMModel.scala
@@ -20,7 +20,7 @@ import diplomacy._
 //   put, get, getAck, putAck => ok: detected by getAck (it sees busy>0) impossible for FIFO
 //   If FIFO, the getAck should check data even if its validity was wiped
 
-class TLRAMModel extends LazyModule
+class TLRAMModel(log: String = "") extends LazyModule
 {
   val node = TLIdentityNode()
 
@@ -150,6 +150,7 @@ class TLRAMModel extends LazyModule
         val busy = a_inc(i) - a_dec(i) - (!a_first).asUInt
         val byte = a.data(8*(i+1)-1, 8*i)
         when (a.mask(i)) {
+          printf(log + " ")
          when (a.opcode === TLMessages.PutFullData)    { printf("PF") }
          when (a.opcode === TLMessages.PutPartialData) { printf("PP") }
          when (a.opcode === TLMessages.ArithmeticData) { printf("A ") }
@@ -160,7 +161,7 @@ class TLRAMModel extends LazyModule
       }
 
       when (a.opcode === TLMessages.Get) {
-        printf("G 0x%x - 0%x\n", a_base, a_base | UIntToOH1(a_size, addressBits))
+        printf(log + " G 0x%x - 0x%x\n", a_base, a_base | UIntToOH1(a_size, addressBits))
       }
     }
@@ -245,6 +246,7 @@ class TLRAMModel extends LazyModule
 
       when (d_flight.opcode === TLMessages.PutFullData || d_flight.opcode === TLMessages.PutPartialData) {
         assert (d.opcode === TLMessages.AccessAck)
+        printf(log + " ")
         when (d_flight.opcode === TLMessages.PutFullData)    { printf("pf") }
         when (d_flight.opcode === TLMessages.PutPartialData) { printf("pp") }
         printf(" 0x%x - 0x%x\n", d_base, d_base | UIntToOH1(d_size, addressBits))
@@ -257,6 +259,7 @@ class TLRAMModel extends LazyModule
         val shadow = Wire(init = d_shadow(i))
         when (d_mask(i)) {
           val d_addr = d_addr_hi << shift | UInt(i)
+          printf(log + " ")
          when (d_flight.opcode === TLMessages.Get)            { printf("g ") }
          when (d_flight.opcode === TLMessages.ArithmeticData) { printf("a ") }
          when (d_flight.opcode === TLMessages.LogicalData)    { printf("l ") }
diff --git a/src/main/scala/uncore/tilelink2/RegisterRouter.scala b/src/main/scala/uncore/tilelink2/RegisterRouter.scala
index dddd0a60..89c62021 100644
--- a/src/main/scala/uncore/tilelink2/RegisterRouter.scala
+++ b/src/main/scala/uncore/tilelink2/RegisterRouter.scala
@@ -44,7 +44,7 @@ class TLRegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int =
     // Invoke the register map builder and make it Irrevocable
     val out = Queue.irrevocable(
       RegMapper(beatBytes, concurrency, undefZero, in, mapping:_*),
-      entries = 1, pipe = true, flow = true)
+      entries = 1, flow = true) // No flow control needed
 
     in.valid := a.valid
diff --git a/src/main/scala/uncore/tilelink2/ToAXI4.scala b/src/main/scala/uncore/tilelink2/ToAXI4.scala
index f68d22a6..079f37dd 100644
--- a/src/main/scala/uncore/tilelink2/ToAXI4.scala
+++ b/src/main/scala/uncore/tilelink2/ToAXI4.scala
@@ -12,7 +12,11 @@ import scala.math.{min, max}
 case class TLToAXI4Node(idBits: Int) extends MixedNode(TLImp, AXI4Imp)(
   dFn = { case (1, _) =>
     // We must erase all client information, because we crush their source Ids
-    Seq(AXI4MasterPortParameters(Seq(AXI4MasterParameters(id = IdRange(0, 1 << idBits)))))
+    val masters = Seq(
+      AXI4MasterParameters(
+        id      = IdRange(0, 1 << idBits),
+        aligned = true))
+    Seq(AXI4MasterPortParameters(masters))
   },
   uFn = { case (1, Seq(AXI4SlavePortParameters(slaves, beatBytes))) =>
     val managers = slaves.zipWithIndex.map { case (s, id) =>
@@ -53,6 +57,13 @@ class TLToAXI4(idBits: Int, combinational: Boolean = true) extends LazyModule
     require (slaves(0).interleavedId.isDefined)
     slaves.foreach { s => require (s.interleavedId == slaves(0).interleavedId) }
 
+    // We need to ensure that a slave does not stall trying to send B while we need to receive R
+    // Since R&W have independent flow control, it is possible for a W to cut in-line and get into
+    // a slave's buffers, preventing us from getting all the R responses we need to release D for B.
+    // This risk is compounded by AXI fragmentation. Even a slave which responds completely to
+    // AR before working on AW might have an AW slipped between two AR fragments.
+    val out_b = Queue.irrevocable(out.b, entries=edgeIn.client.endSourceId, flow=combinational)
+
     // We need to keep the following state from A => D: (addr_lo, size, sink, source)
     // All of those fields could potentially require 0 bits (argh. Chisel.)
     // We will pack as many of the lowest bits of state as fit into the AXI ID.
@@ -113,7 +124,7 @@ class TLToAXI4(idBits: Int, combinational: Boolean = true) extends LazyModule
 
     val r_last = out.r.bits.last
     val r_id   = out.r.bits.id
-    val b_id   = out.b.bits.id
+    val b_id   = out_b.bits.id
 
     if (stateBits <= idBits) { // No need for any state tracking
       r_state := r_id
@@ -148,7 +159,7 @@ class TLToAXI4(idBits: Int, combinational: Boolean = true) extends LazyModule
       q.io.enq.bits.data := a_state >> implicitBits
       q.io.enq.bits.way  := Mux(a_isPut, UInt(0), UInt(1))
       // Pop the bank's ways
-      q.io.deq(0).ready := out.b.fire() && b_bankSelect(i)
+      q.io.deq(0).ready := out_b.fire() && b_bankSelect(i)
       q.io.deq(1).ready := out.r.fire() && r_bankSelect(i) && r_last
       // The FIFOs must be valid when we're ready to pop them...
       assert (q.io.deq(0).valid || !q.io.deq(0).ready)
@@ -169,8 +180,8 @@ class TLToAXI4(idBits: Int, combinational: Boolean = true) extends LazyModule
     val depth = if (combinational) 1 else 2
     val out_arw = Wire(Decoupled(new AXI4BundleARW(out.params)))
     val out_w = Wire(out.w)
-    out.w <> Queue.irrevocable(out_w, entries=depth, pipe=combinational, flow=combinational)
-    val queue_arw = Queue.irrevocable(out_arw, entries=depth, pipe=combinational, flow=combinational)
+    out.w <> Queue.irrevocable(out_w, entries=depth, flow=combinational)
+    val queue_arw = Queue.irrevocable(out_arw, entries=depth, flow=combinational)
 
     // Fan out the ARW channel to AR and AW
     out.ar.bits := queue_arw.bits
@@ -210,18 +221,21 @@ class TLToAXI4(idBits: Int, combinational: Boolean = true) extends LazyModule
     // Give R higher priority than B
     val r_wins = out.r.valid || r_holds_d
 
-    out.r.ready := in.d.ready
-    out.b.ready := in.d.ready && !r_wins
-    in.d.valid := Mux(r_wins, out.r.valid, out.b.valid)
+    val in_d = Wire(in.d)
+    in.d <> Queue.irrevocable(in_d, entries=1, flow=combinational)
+
+    out.r.ready := in_d.ready
+    out_b.ready := in_d.ready && !r_wins
+    in_d.valid := Mux(r_wins, out.r.valid, out_b.valid)
 
     val r_error = out.r.bits.resp =/= AXI4Parameters.RESP_OKAY
-    val b_error = out.b.bits.resp =/= AXI4Parameters.RESP_OKAY
+    val b_error = out_b.bits.resp =/= AXI4Parameters.RESP_OKAY
 
     val r_d = edgeIn.AccessAck(r_addr_lo, r_sink, r_source, r_size, UInt(0), r_error)
     val b_d = edgeIn.AccessAck(b_addr_lo, b_sink, b_source, b_size, b_error)
 
-    in.d.bits := Mux(r_wins, r_d, b_d)
-    in.d.bits.data := out.r.bits.data // avoid a costly Mux
+    in_d.bits := Mux(r_wins, r_d, b_d)
+    in_d.bits.data := out.r.bits.data // avoid a costly Mux
 
     // Tie off unused channels
     in.b.valid := Bool(false)
diff --git a/src/main/scala/uncore/tilelink2/package.scala b/src/main/scala/uncore/tilelink2/package.scala
index e996f2ba..415aa308 100644
--- a/src/main/scala/uncore/tilelink2/package.scala
+++ b/src/main/scala/uncore/tilelink2/package.scala
@@ -11,12 +11,20 @@ package object tilelink2
   def OH1ToUInt(x: UInt) = OHToUInt((x << 1 | UInt(1)) ^ x)
   def UIntToOH1(x: UInt, width: Int) = ~(SInt(-1, width=width).asUInt << x)(width-1, 0)
   def trailingZeros(x: Int) = if (x > 0) Some(log2Ceil(x & -x)) else None
-  def highOR(x: UInt) = {
+  // Fill 1s from low bits to high bits
+  def leftOR(x: UInt) = {
     val w = x.getWidth
     def helper(s: Int, x: UInt): UInt =
       if (s >= w) x else helper(s+s, x | (x << s)(w-1,0))
     helper(1, x)
   }
+  // Fill 1s from high bits to low bits
+  def rightOR(x: UInt) = {
+    val w = x.getWidth
+    def helper(s: Int, x: UInt): UInt =
+      if (s >= w) x else helper(s+s, x | (x >> s))
+    helper(1, x)
+  }
 
   // This gets used everywhere, so make the smallest circuit possible ...
   def maskGen(addr_lo: UInt, lgSize: UInt, beatBytes: Int): UInt = {
     val lgBytes = log2Ceil(beatBytes)
diff --git a/src/main/scala/unittest/Configs.scala b/src/main/scala/unittest/Configs.scala
index ac4ecfb0..ac0488af 100644
--- a/src/main/scala/unittest/Configs.scala
+++ b/src/main/scala/unittest/Configs.scala
@@ -25,7 +25,9 @@ class WithUncoreUnitTests extends Config(
   case UnitTests => (p: Parameters) => Seq(
     Module(new uncore.devices.ROMSlaveTest()(p)),
     Module(new uncore.devices.TileLinkRAMTest()(p)),
-    Module(new uncore.tilelink2.TLFuzzRAMTest))
+    Module(new uncore.tilelink2.TLFuzzRAMTest),
+    Module(new uncore.axi4.AXI4LiteFuzzRAMTest),
+    Module(new uncore.axi4.AXI4FullFuzzRAMTest))
   case _ => throw new CDEMatchError
 }
)
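To close, a pure-Int model of the renamed leftOR (formerly highOR) and the new rightOR, plus the lowest-set-bit trick that IDMapGenerator builds from leftOR. Illustrative sketch only; an 8-bit width is assumed.

object OrSpreadSketch {
  val w = 8
  def mask(x: Int) = x & ((1 << w) - 1)
  def leftOR(x: Int): Int = { // spread 1s from each set bit toward high bits
    def helper(s: Int, v: Int): Int = if (s >= w) v else helper(s + s, v | mask(v << s))
    helper(1, x)
  }
  def rightOR(x: Int): Int = { // spread 1s from each set bit toward low bits
    def helper(s: Int, v: Int): Int = if (s >= w) v else helper(s + s, v | (v >>> s))
    helper(1, x)
  }
  def main(args: Array[String]): Unit = {
    val x = 0x14 // 0b00010100
    println(leftOR(x).toBinaryString)  // 11111100
    println(rightOR(x).toBinaryString) // 11111 (0b00011111)
    // IDMapGenerator's select: exactly the lowest set bit of the bitmap
    val select = mask(~(leftOR(x) << 1)) & x
    println(select.toBinaryString)     // 100 (0b00000100)
  }
}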