diff --git a/src/main/scala/diplomacy/Parameters.scala b/src/main/scala/diplomacy/Parameters.scala
index 2ce88b64..ddbe96d9 100644
--- a/src/main/scala/diplomacy/Parameters.scala
+++ b/src/main/scala/diplomacy/Parameters.scala
@@ -123,24 +123,15 @@ case class AddressSet(base: BigInt, mask: BigInt) extends Ordered[AddressSet]
 
 object AddressSet
 {
-  def misaligned(base: BigInt, size: BigInt): Seq[AddressSet] = {
-    val largestPow2 = BigInt(1) << log2Floor(size)
-    val mostZeros = (base + size - 1) & ~(largestPow2 - 1)
-    def splitLo(low: BigInt, high: BigInt, tail: Seq[AddressSet]): Seq[AddressSet] = {
-      if (low == high) tail else {
-        val toggleBits = low ^ high
-        val misalignment = toggleBits & (-toggleBits)
-        splitLo(low+misalignment, high, AddressSet(low, misalignment-1) +: tail)
-      }
+  def misaligned(base: BigInt, size: BigInt, tail: Seq[AddressSet] = Seq()): Seq[AddressSet] = {
+    if (size == 0) tail.reverse else {
+      val maxBaseAlignment = base & (-base) // 0 for infinite (LSB)
+      val maxSizeAlignment = BigInt(1) << log2Floor(size) // MSB of size
+      val step =
+        if (maxBaseAlignment == 0 || maxBaseAlignment > maxSizeAlignment)
+          maxSizeAlignment else maxBaseAlignment
+      misaligned(base+step, size-step, AddressSet(base, step-1) +: tail)
     }
-    def splitHi(low: BigInt, high: BigInt, tail: Seq[AddressSet]): Seq[AddressSet] = {
-      if (low == high) tail else {
-        val toggleBits = low ^ high
-        val misalignment = toggleBits & (-toggleBits)
-        splitHi(low, high-misalignment, AddressSet(high-misalignment, misalignment-1) +: tail)
-      }
-    }
-    splitLo(base, mostZeros, splitHi(mostZeros, base+size, Seq())).sorted
   }
 }
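For intuition, here is the same greedy decomposition modeled over plain BigInts, with one worked trace. This is an illustrative sketch only, not part of the patch; `log2Floor` is re-derived locally so the snippet is self-contained.

object MisalignedSketch {
  def log2Floor(x: BigInt): Int = x.bitLength - 1
  // Each (base, mask) pair mirrors AddressSet(base, step-1).
  def misaligned(base: BigInt, size: BigInt,
                 tail: List[(BigInt, BigInt)] = Nil): List[(BigInt, BigInt)] =
    if (size == 0) tail.reverse else {
      val maxBaseAlignment = base & (-base) // 0 when base == 0
      val maxSizeAlignment = BigInt(1) << log2Floor(size)
      val step =
        if (maxBaseAlignment == 0 || maxBaseAlignment > maxSizeAlignment)
          maxSizeAlignment else maxBaseAlignment
      misaligned(base + step, size - step, (base, step - 1) :: tail)
    }
  def main(args: Array[String]): Unit = {
    // [0x3, 0xC) decomposes into the single byte {0x3}, then [0x4,0x8), then [0x8,0xC):
    println(misaligned(3, 9)) // List((3,0), (4,3), (8,3))
  }
}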
diff --git a/src/main/scala/uncore/axi4/Buffer.scala b/src/main/scala/uncore/axi4/Buffer.scala
new file mode 100644
index 00000000..0b164ba9
--- /dev/null
+++ b/src/main/scala/uncore/axi4/Buffer.scala
@@ -0,0 +1,50 @@
+// See LICENSE for license details.
+
+package uncore.axi4
+
+import Chisel._
+import chisel3.internal.sourceinfo.SourceInfo
+import diplomacy._
+import scala.math.max
+
+// pipe is only used if a queue has depth = 1
+class AXI4Buffer(aw: Int = 2, w: Int = 2, b: Int = 2, ar: Int = 2, r: Int = 2, pipe: Boolean = true) extends LazyModule
+{
+  require (aw >= 0)
+  require (w  >= 0)
+  require (b  >= 0)
+  require (ar >= 0)
+  require (r  >= 0)
+
+  val node = AXI4IdentityNode()
+
+  lazy val module = new LazyModuleImp(this) {
+    val io = new Bundle {
+      val in  = node.bundleIn
+      val out = node.bundleOut
+    }
+
+    ((io.in zip io.out) zip (node.edgesIn zip node.edgesOut)) foreach { case ((in, out), (edgeIn, edgeOut)) =>
+      if (aw>0) { out.aw <> Queue(in .aw, aw, pipe && aw<2) } else { out.aw <> in .aw }
+      if (w >0) { out.w  <> Queue(in .w,  w,  pipe && w <2) } else { out.w  <> in .w  }
+      if (b >0) { in .b  <> Queue(out.b,  b,  pipe && b <2) } else { in .b  <> out.b  }
+      if (ar>0) { out.ar <> Queue(in .ar, ar, pipe && ar<2) } else { out.ar <> in .ar }
+      if (r >0) { in .r  <> Queue(out.r,  r,  pipe && r <2) } else { in .r  <> out.r  }
+    }
+  }
+}
+
+object AXI4Buffer
+{
+  // applied to the AXI4 source node; y.node := AXI4Buffer(x.node)
+  def apply()                               (x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = apply(2)(x)
+  def apply(entries: Int)                   (x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = apply(entries, true)(x)
+  def apply(entries: Int, pipe: Boolean)    (x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = apply(entries, entries, pipe)(x)
+  def apply(aw: Int, br: Int)               (x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = apply(aw, br, true)(x)
+  def apply(aw: Int, br: Int, pipe: Boolean)(x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = apply(aw, aw, br, aw, br, pipe)(x)
+  def apply(aw: Int, w: Int, b: Int, ar: Int, r: Int, pipe: Boolean = true)(x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = {
+    val buffer = LazyModule(new AXI4Buffer(aw, w, b, ar, r, pipe))
+    buffer.node := x
+    buffer.node
+  }
+}
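A hedged usage sketch of the overloads above. The `BufferedPath` module and its identity nodes are hypothetical stand-ins for real endpoints; a real design would attach actual master and slave nodes before elaboration.

import Chisel._
import diplomacy._
import uncore.axi4._

// Sketch: AXI4Buffer(aw, br) buffers the request channels (AW, W, AR) `aw`
// deep and the response channels (B, R) `br` deep, per the overload chain.
class BufferedPath extends LazyModule {
  val in  = AXI4IdentityNode() // hypothetical upstream attachment point
  val out = AXI4IdentityNode() // hypothetical downstream attachment point
  out := AXI4Buffer(2, 1)(in)  // expands to AXI4Buffer(2, 2, 1, 2, 1, true)
  lazy val module = new LazyModuleImp(this) { }
}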
diff --git a/src/main/scala/uncore/axi4/Fragmenter.scala b/src/main/scala/uncore/axi4/Fragmenter.scala
new file mode 100644
index 00000000..57015359
--- /dev/null
+++ b/src/main/scala/uncore/axi4/Fragmenter.scala
@@ -0,0 +1,295 @@
+// See LICENSE for license details.
+
+package uncore.axi4
+
+import Chisel._
+import chisel3.internal.sourceinfo.SourceInfo
+import chisel3.util.IrrevocableIO
+import diplomacy._
+import scala.math.{min,max}
+import uncore.tilelink2.{leftOR, rightOR, UIntToOH1}
+
+// lite: masters all use only one ID => reads will not be interleaved
+class AXI4Fragmenter(lite: Boolean = false, maxInFlight: Int = 32, combinational: Boolean = true) extends LazyModule
+{
+  val maxBeats = 1 << AXI4Parameters.lenBits
+  def expandTransfer(x: TransferSizes, beatBytes: Int, alignment: BigInt) =
+    if (!x) x else TransferSizes(x.min, alignment.min(maxBeats*beatBytes).intValue)
+  def mapSlave(s: AXI4SlaveParameters, beatBytes: Int) = s.copy(
+    supportsWrite = expandTransfer(s.supportsWrite, beatBytes, s.minAlignment),
+    supportsRead  = expandTransfer(s.supportsRead,  beatBytes, s.minAlignment),
+    interleavedId = if (lite) Some(0) else s.interleavedId) // see AXI4FragmenterSideband for !lite case
+  def mapMaster(m: AXI4MasterParameters) = m.copy(aligned = true)
+
+  val node = AXI4AdapterNode(
+    masterFn = { case Seq(mp) => mp.copy(masters = mp.masters.map(m => mapMaster(m))) },
+    slaveFn  = { case Seq(sp) => sp.copy(slaves  = sp.slaves .map(s => mapSlave(s, sp.beatBytes))) })
+
+  lazy val module = new LazyModuleImp(this) {
+    val io = new Bundle {
+      val in  = node.bundleIn
+      val out = node.bundleOut
+    }
+
+    val edgeOut   = node.edgesOut(0)
+    val edgeIn    = node.edgesIn(0)
+    val slave     = edgeOut.slave
+    val slaves    = slave.slaves
+    val beatBytes = slave.beatBytes
+    val lgBytes   = log2Ceil(beatBytes)
+    val master    = edgeIn.master
+    val masters   = master.masters
+
+    // If the user claimed this was a lite interface, then there must be only one Id
+    require (!lite || master.endId == 1)
+
+    // We don't support fragmenting to sub-beat accesses
+    slaves.foreach { s =>
+      require (!s.supportsRead  || s.supportsRead.contains(beatBytes))
+      require (!s.supportsWrite || s.supportsWrite.contains(beatBytes))
+    }
+
+    /* We need to decompose a request into
+     *   FIXED => each beat is a new request
+     *   WRAP/INCR => take xfr up to next power of two, capped by max size of target
+     *
+     * On AR and AW, we fragment one request into many
+     * On W we set 'last' on beats which are fragment boundaries
+     * On R we clear 'last' on the fragments being reassembled
+     * On B we clear 'valid' on the responses for the injected fragments
+     *
+     * AR=>R and AW+W=>B are completely independent state machines.
+     */
+
+    /* Returns the number of beats to execute and the new address */
+    def fragment(a: IrrevocableIO[AXI4BundleA], supportedSizes1: Seq[Int]): (IrrevocableIO[AXI4BundleA], Bool, UInt) = {
+      val out = Wire(a)
+
+      val busy   = RegInit(Bool(false))
+      val r_addr = Reg(UInt(width = a.bits.params.addrBits))
+      val r_len  = Reg(UInt(width = AXI4Parameters.lenBits))
+
+      val len  = Mux(busy, r_len,  a.bits.len)
+      val addr = Mux(busy, r_addr, a.bits.addr)
+
+      val lo = if (lgBytes == 0) UInt(0) else addr(lgBytes-1, 0)
+      val hi = addr >> lgBytes
+      val alignment = hi(AXI4Parameters.lenBits-1,0)
+
+      val allSame = supportedSizes1.filter(_ >= 0).distinct.size <= 1
+      val dynamic1 = Mux1H(slave.findFast(addr), supportedSizes1.map(s => UInt(max(0, s))))
+      val fixed1 = UInt(supportedSizes1.filter(_ >= 0).headOption.getOrElse(0))
+
+      /* We need to compute the largest transfer allowed by the AXI len.
+       * len+1 is the number of beats to execute.
+       * We want the MSB(len+1)-1; one less than the largest power of two we could execute.
+       * There are two cases; either len is 2^n-1, in which case we leave it unchanged, ELSE
+       * fill the bits from highest to lowest, and shift right by one bit.
+       */
+      val fillLow  = rightOR(len) >> 1   // set all bits in positions <  a set bit
+      val wipeHigh = ~leftOR(~len)       // clear all bits in positions >= a cleared bit
+      val remain1  = fillLow | wipeHigh  // MSB(a.len+1)-1
+      val align1   = ~leftOR(alignment)  // transfer size limited by address alignment
+      val support1 = if (allSame) fixed1 else dynamic1 // maximum supported size-1 based on target address
+      val maxSupported1 = remain1 & align1 & support1 // Take the minimum of all the limits
+
+      // Things that cause us to degenerate to a single beat
+      val fixed  = a.bits.burst === AXI4Parameters.BURST_FIXED
+      val narrow = a.bits.size =/= UInt(lgBytes)
+      val bad    = fixed || narrow
+
+      // The number of beats-1 to execute
+      val beats1 = Mux(bad, UInt(0), maxSupported1)
+      val beats = ~(~(beats1 << 1 | UInt(1)) | beats1) // beats1 + 1
+
+      val inc_addr = addr + (beats << a.bits.size) // address after adding transfer
+      val wrapMask = ~(~a.bits.len << a.bits.size) // only these bits may change, if wrapping
+      val mux_addr = Wire(init = inc_addr)
+      when (a.bits.burst === AXI4Parameters.BURST_WRAP) {
+        mux_addr := (inc_addr & wrapMask) | ~(~a.bits.addr | wrapMask)
+      }
+      when (a.bits.burst === AXI4Parameters.BURST_FIXED) {
+        mux_addr := a.bits.addr
+      }
+
+      val last = beats1 === len
+      a.ready := out.ready && last
+      out.valid := a.valid
+
+      out.bits := a.bits
+      out.bits.len := beats1
+
+      // We forcibly align every access. If the first beat was misaligned, the strb bits
+      // for the lower addresses must not have been set. Therefore, rounding the address
+      // down is harmless. We can do this after the address update algorithm, because the
+      // incremented values will be rounded down the same way. Furthermore, a subword
+      // offset cannot cause a premature wrap-around.
+      out.bits.addr := ~(~addr | UIntToOH1(a.bits.size, lgBytes))
+
+      when (out.fire()) {
+        busy := !last
+        r_addr := mux_addr
+        r_len  := len - beats
+      }
+
+      (out, last, beats)
+    }
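The MSB(len+1)-1 bit trick can be checked with a pure-Scala model. This is an illustrative sketch only, not part of the patch; an 8-bit len is assumed and the leftOR/rightOR helpers from uncore.tilelink2 are re-modeled over Int.

object Remain1Sketch {
  val w = 8
  def mask(x: Int) = x & ((1 << w) - 1)
  def leftOR(x: Int): Int = { // spread 1s from each set bit toward high bits
    def helper(s: Int, v: Int): Int = if (s >= w) v else helper(s + s, v | mask(v << s))
    helper(1, x)
  }
  def rightOR(x: Int): Int = { // spread 1s from each set bit toward low bits
    def helper(s: Int, v: Int): Int = if (s >= w) v else helper(s + s, v | (v >>> s))
    helper(1, x)
  }
  def main(args: Array[String]): Unit = {
    for (len <- Seq(5, 7)) { // 6 and 8 requested beats
      val fillLow  = rightOR(len) >> 1
      val wipeHigh = mask(~leftOR(mask(~len)))
      val remain1  = fillLow | wipeHigh                           // MSB(len+1)-1
      val beats    = mask(~(mask(~(remain1 << 1 | 1)) | remain1)) // remain1 + 1, as in hardware
      println(s"len=$len remain1=$remain1 beats=$beats") // len=5 -> 3,4; len=7 -> 7,8
    }
  }
}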
+
+    val in  = io.in(0)
+    val out = io.out(0)
+
+    // The size to which we will fragment the access
+    val readSizes1  = slaves.map(s => s.supportsRead .max/beatBytes-1)
+    val writeSizes1 = slaves.map(s => s.supportsWrite.max/beatBytes-1)
+
+    // Indirection variables for inputs and outputs; makes transformation application easier
+    val (in_ar, ar_last, _)       = fragment(in.ar, readSizes1)
+    val (in_aw, aw_last, w_beats) = fragment(in.aw, writeSizes1)
+    val in_w = in.w
+    val in_r = in.r
+    val in_b = in.b
+    val out_ar = Wire(out.ar)
+    val out_aw = out.aw
+    val out_w = out.w
+    val out_r = Wire(out.r)
+    val out_b = Wire(out.b)
+
+    val depth = if (combinational) 1 else 2
+    // In case a slave ties arready := rready, we need a queue to break the combinational loop
+    // between the two branches (in_ar => {out_ar => out_r, sideband} => in_r).
+    if (in.ar.bits.getWidth < in.r.bits.getWidth) {
+      out.ar <> Queue(out_ar, depth, flow=combinational)
+      out_r <> out.r
+    } else {
+      out.ar <> out_ar
+      out_r <> Queue(out.r, depth, flow=combinational)
+    }
+    // In case a slave ties awready := bready or wready := bready, we need this queue
+    out_b <> Queue(out.b, depth, flow=combinational)
+
+    // Sideband to track which transfers were the last fragment
+    def sideband() = if (lite) {
+      Module(new Queue(Bool(), maxInFlight, flow=combinational)).io
+    } else {
+      Module(new AXI4FragmenterSideband(maxInFlight, flow=combinational)).io
+    }
+    val sideband_ar_r = sideband()
+    val sideband_aw_b = sideband()
+
+    // AR flow control
+    out_ar.valid := in_ar.valid && sideband_ar_r.enq.ready
+    in_ar.ready := sideband_ar_r.enq.ready && out_ar.ready
+    sideband_ar_r.enq.valid := in_ar.valid && out_ar.ready
+    out_ar.bits := in_ar.bits
+    sideband_ar_r.enq.bits := ar_last
+
+    // When does the W channel start counting a new transfer
+    val wbeats_latched = RegInit(Bool(false))
+    val wbeats_ready = Wire(Bool())
+    val wbeats_valid = Wire(Bool())
+    when (wbeats_valid && wbeats_ready) { wbeats_latched := Bool(true) }
+    when (out_aw.fire()) { wbeats_latched := Bool(false) }
+
+    // AW flow control
+    out_aw.valid := in_aw.valid && sideband_aw_b.enq.ready && (wbeats_ready || wbeats_latched)
+    in_aw.ready := sideband_aw_b.enq.ready && out_aw.ready && (wbeats_ready || wbeats_latched)
+    sideband_aw_b.enq.valid := in_aw.valid && out_aw.ready && (wbeats_ready || wbeats_latched)
+    wbeats_valid := in_aw.valid && !wbeats_latched
+    out_aw.bits := in_aw.bits
+    sideband_aw_b.enq.bits := aw_last
+
+    // We need to inject 'last' into the W channel fragments, so count the beats
+    val w_counter = RegInit(UInt(0, width = AXI4Parameters.lenBits+1))
+    val w_idle = w_counter === UInt(0)
+    val w_todo = Mux(w_idle, Mux(wbeats_valid, w_beats, UInt(0)), w_counter)
+    val w_last = w_todo === UInt(1)
+    w_counter := w_todo - out_w.fire()
+    assert (!out_w.fire() || w_todo =/= UInt(0)) // underflow impossible
+
+    // W flow control
+    wbeats_ready := w_idle
+    out_w.valid := in_w.valid && (!wbeats_ready || wbeats_valid)
+    in_w.ready := out_w.ready && (!wbeats_ready || wbeats_valid)
+    out_w.bits := in_w.bits
+    out_w.bits.last := w_last
+    // We should also recreate the original 'last' on the final fragment
+    assert (!out_w.valid || !in_w.bits.last || w_last)
+
+    // R flow control
+    val r_last = out_r.bits.last
+    in_r.valid := out_r.valid && (!r_last || sideband_ar_r.deq.valid)
+    out_r.ready := in_r.ready && (!r_last || sideband_ar_r.deq.valid)
+    sideband_ar_r.deq.ready := r_last && out_r.valid && in_r.ready
+    in_r.bits := out_r.bits
+    in_r.bits.last := r_last && sideband_ar_r.deq.bits
+
+    // B flow control
+    val b_last = sideband_aw_b.deq.bits
+    in_b.valid := out_b.valid && sideband_aw_b.deq.valid && b_last
+    out_b.ready := sideband_aw_b.deq.valid && (!b_last || in_b.ready)
+    sideband_aw_b.deq.ready := out_b.valid && (!b_last || in_b.ready)
+    in_b.bits := out_b.bits
+
+    // Merge errors from dropped B responses
+    val r_resp = RegInit(UInt(0, width = AXI4Parameters.respBits))
+    val resp = out_b.bits.resp | r_resp
+    when (out_b.fire()) { r_resp := Mux(b_last, UInt(0), resp) }
+    in_b.bits.resp := resp
+  }
+}
+
+/* We want to put barriers between the fragments of a fragmented transfer and all other transfers.
+ * This lets us use very little state to reassemble the fragments (else we need one FIFO per ID).
+ * Furthermore, because all the fragments share the same AXI ID, they come back contiguously.
+ * This guarantees that no other R responses might get mixed between fragments, ensuring that the
+ * interleavedId for the slaves remains unaffected by the fragmentation transformation.
+ * Of course, if you need to fragment, this means there is a potentially hefty serialization cost.
+ * However, this design allows full concurrency in the common no-fragmentation-needed scenario.
+ */
+class AXI4FragmenterSideband(maxInFlight: Int, flow: Boolean = false) extends Module
+{
+  val io = new QueueIO(Bool(), maxInFlight)
+  io.count := UInt(0)
+
+  val PASS = UInt(2, width = 2) // allow 'last=1' bits to enqueue; on 'last=0', block if count>0, else accept and switch to FIND
+  val FIND = UInt(0, width = 2) // allow 'last=0' bits to enqueue; accept 'last=1' and switch to WAIT
+  val WAIT = UInt(1, width = 2) // block all access until count=0
+
+  val state = RegInit(PASS)
+  val count = RegInit(UInt(0, width = log2Up(maxInFlight)))
+  val full  = count === UInt(maxInFlight-1)
+  val empty = count === UInt(0)
+  val last  = count === UInt(1)
+
+  io.deq.bits := state(1) || (last && state(0)) // PASS || (last && WAIT)
+  io.deq.valid := !empty
+
+  io.enq.ready := !full && (empty || (state === FIND) || (state === PASS && io.enq.bits))
+
+  // WAIT => count > 0
+  assert (state =/= WAIT || count =/= UInt(0))
+
+  if (flow) {
+    when (io.enq.valid) {
+      io.deq.valid := Bool(true)
+      when (empty) { io.deq.bits := io.enq.bits }
+    }
+  }
+
+  count := count + io.enq.fire() - io.deq.fire()
+  switch (state) {
+    is(PASS) { when (io.enq.valid && !io.enq.bits && empty) { state := FIND } }
+    is(FIND) { when (io.enq.valid &&  io.enq.bits && !full) { state := Mux(empty, PASS, WAIT) } }
+    is(WAIT) { when (last && io.deq.ready)                  { state := PASS } }
+  }
+}
+
+object AXI4Fragmenter
+{
+  // applied to the AXI4 source node; y.node := AXI4Fragmenter()(x.node)
+  def apply(lite: Boolean = false, maxInFlight: Int = 32, combinational: Boolean = true)(x: AXI4OutwardNode)(implicit sourceInfo: SourceInfo): AXI4OutwardNode = {
+    val fragmenter = LazyModule(new AXI4Fragmenter(lite, maxInFlight, combinational))
+    fragmenter.node := x
+    fragmenter.node
+  }
+}
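A pure-Scala model of the PASS/FIND/WAIT handshake above may help. This is an illustrative sketch only; the flow-through path and the dequeue side are not modeled, and the RTL keys its transitions on enq.valid exactly as mirrored here.

object SidebandSketch {
  sealed trait State
  case object PASS extends State // whole transfers stream through
  case object FIND extends State // admitting the fragments of one transfer
  case object WAIT extends State // fragments queued behind others: drain first
  def enqReady(s: State, last: Boolean, count: Int, maxInFlight: Int): Boolean = {
    val full  = count == maxInFlight - 1
    val empty = count == 0
    !full && (empty || s == FIND || (s == PASS && last))
  }
  def next(s: State, enqValid: Boolean, last: Boolean, count: Int,
           maxInFlight: Int, deqReady: Boolean): State = {
    val full  = count == maxInFlight - 1
    val empty = count == 0
    s match {
      case PASS if enqValid && !last && empty => FIND
      case FIND if enqValid &&  last && !full => if (empty) PASS else WAIT
      case WAIT if count == 1 && deqReady     => PASS
      case _                                  => s
    }
  }
}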
diff --git a/src/main/scala/uncore/axi4/Nodes.scala b/src/main/scala/uncore/axi4/Nodes.scala
index cafac4f1..0613a70b 100644
--- a/src/main/scala/uncore/axi4/Nodes.scala
+++ b/src/main/scala/uncore/axi4/Nodes.scala
@@ -40,8 +40,8 @@ case class AXI4SlaveNode(portParams: AXI4SlavePortParameters, numPorts: Range.In
   extends SinkNode(AXI4Imp)(portParams, numPorts)
 
 case class AXI4AdapterNode(
-  clientFn:       Seq[AXI4MasterPortParameters] => AXI4MasterPortParameters,
-  managerFn:      Seq[AXI4SlavePortParameters]  => AXI4SlavePortParameters,
+  masterFn:       Seq[AXI4MasterPortParameters] => AXI4MasterPortParameters,
+  slaveFn:        Seq[AXI4SlavePortParameters]  => AXI4SlavePortParameters,
   numMasterPorts: Range.Inclusive = 1 to 1,
   numSlavePorts:  Range.Inclusive = 1 to 1)
-  extends InteriorNode(AXI4Imp)(clientFn, managerFn, numMasterPorts, numSlavePorts)
+  extends InteriorNode(AXI4Imp)(masterFn, slaveFn, numMasterPorts, numSlavePorts)
diff --git a/src/main/scala/uncore/axi4/Parameters.scala b/src/main/scala/uncore/axi4/Parameters.scala
index dacc80ee..17a74140 100644
--- a/src/main/scala/uncore/axi4/Parameters.scala
+++ b/src/main/scala/uncore/axi4/Parameters.scala
@@ -21,9 +21,10 @@ case class AXI4SlaveParameters(
   val name = nodePath.lastOption.map(_.lazyModule.name).getOrElse("disconnected")
   val maxTransfer = max(supportsWrite.max, supportsRead.max)
   val maxAddress = address.map(_.max).max
+  val minAlignment = address.map(_.alignment).min
 
   // The device had better not support a transfer larger than its alignment
-  address.foreach { case a => require (a.alignment >= maxTransfer) }
+  require (minAlignment >= maxTransfer)
 }
 
 case class AXI4SlavePortParameters(
@@ -41,6 +42,10 @@ case class AXI4SlavePortParameters(
   // Check that the link can be implemented in AXI4
   require (maxTransfer <= beatBytes * (1 << AXI4Parameters.lenBits))
 
+  lazy val routingMask = AddressDecoder(slaves.map(_.address))
+  def findSafe(address: UInt) = Vec(slaves.map(_.address.map(_.contains(address)).reduce(_ || _)))
+  def findFast(address: UInt) = Vec(slaves.map(_.address.map(_.widen(~routingMask)).distinct.map(_.contains(address)).reduce(_ || _)))
+
   // Require disjoint ranges for addresses
   slaves.combinations(2).foreach { case Seq(x,y) =>
     x.address.foreach { a => y.address.foreach { b =>
@@ -51,6 +56,7 @@ case class AXI4SlavePortParameters(
 
 case class AXI4MasterParameters(
   id:       IdRange = IdRange(0, 1),
+  aligned:  Boolean = false,
   nodePath: Seq[BaseNode] = Seq())
 {
   val name = nodePath.lastOption.map(_.lazyModule.name).getOrElse("disconnected")
diff --git a/src/main/scala/uncore/axi4/RegisterRouter.scala b/src/main/scala/uncore/axi4/RegisterRouter.scala
index b46d69f0..97b598d5 100644
--- a/src/main/scala/uncore/axi4/RegisterRouter.scala
+++ b/src/main/scala/uncore/axi4/RegisterRouter.scala
@@ -49,7 +49,7 @@ class AXI4RegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int
     // Invoke the register map builder and make it Irrevocable
     val out = Queue.irrevocable(
       RegMapper(beatBytes, concurrency, undefZero, in, mapping:_*),
-      entries = 1, pipe = true, flow = true)
+      entries = 1, flow = true) // No flow control needed
 
     out.ready := Mux(out.bits.read, r.ready, b.ready)
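The new findFast leans on AddressSet.contains and widen; a pure BigInt model shows why widening by the complement of the routing mask is safe. This is a sketch only — the real classes live in diplomacy, and the routingMask value here is assumed, not computed by AddressDecoder.

object WidenSketch {
  case class AddrSet(base: BigInt, mask: BigInt) {
    def contains(x: BigInt): Boolean = ((x ^ base) & ~mask) == 0
    def widen(dontCare: BigInt): AddrSet = AddrSet(base & ~dontCare, mask | dontCare)
  }
  def main(args: Array[String]): Unit = {
    val slave0 = AddrSet(0x000, 0x3ff) // [0x000, 0x400)
    val slave1 = AddrSet(0x400, 0x3ff) // [0x400, 0x800)
    // Suppose the decoder proved only bit 10 matters: routingMask = 0x400.
    val dontCare = ~BigInt(0x400)
    println(slave0.widen(dontCare).contains(0x123)) // true:  bit 10 is clear
    println(slave1.widen(dontCare).contains(0x123)) // false: bit 10 selects slave1
  }
}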
diff --git a/src/main/scala/uncore/axi4/Test.scala b/src/main/scala/uncore/axi4/Test.scala
new file mode 100644
index 00000000..53620ff3
--- /dev/null
+++ b/src/main/scala/uncore/axi4/Test.scala
@@ -0,0 +1,61 @@
+// See LICENSE for license details.
+
+package uncore.axi4
+
+import Chisel._
+import diplomacy._
+import uncore.tilelink2._
+import unittest._
+
+class RRTest0(address: BigInt) extends AXI4RegisterRouter(address, 0, 32, 0, 4)(
+  new AXI4RegBundle((), _)    with RRTest0Bundle)(
+  new AXI4RegModule((), _, _) with RRTest0Module)
+
+class RRTest1(address: BigInt) extends AXI4RegisterRouter(address, 0, 32, 6, 4, false)(
+  new AXI4RegBundle((), _)    with RRTest1Bundle)(
+  new AXI4RegModule((), _, _) with RRTest1Module)
+
+class AXI4LiteFuzzRAM extends LazyModule
+{
+  val fuzz  = LazyModule(new TLFuzzer(5000))
+  val model = LazyModule(new TLRAMModel("AXI4LiteFuzzRAM"))
+  val xbar  = LazyModule(new TLXbar)
+  val gpio  = LazyModule(new RRTest1(0x400))
+  val ram   = LazyModule(new AXI4RAM(AddressSet(0x0, 0x3ff)))
+
+  model.node := fuzz.node
+  xbar.node  := model.node
+  ram.node   := AXI4Fragmenter(lite=true)(TLToAXI4(0, true )(xbar.node))
+  gpio.node  := AXI4Fragmenter(lite=true)(TLToAXI4(0, false)(xbar.node))
+
+  lazy val module = new LazyModuleImp(this) with HasUnitTestIO {
+    io.finished := fuzz.module.io.finished
+  }
+}
+
+class AXI4LiteFuzzRAMTest extends UnitTest(500000) {
+  val dut = Module(LazyModule(new AXI4LiteFuzzRAM).module)
+  io.finished := dut.io.finished
+}
+
+class AXI4FullFuzzRAM extends LazyModule
+{
+  val fuzz  = LazyModule(new TLFuzzer(5000))
+  val model = LazyModule(new TLRAMModel("AXI4FullFuzzRAM"))
+  val xbar  = LazyModule(new TLXbar)
+  val gpio  = LazyModule(new RRTest0(0x400))
+  val ram   = LazyModule(new AXI4RAM(AddressSet(0x0, 0x3ff)))
+
+  model.node := fuzz.node
+  xbar.node  := model.node
+  ram.node   := AXI4Fragmenter(lite=false, maxInFlight = 2)(TLToAXI4(4,false)(xbar.node))
+  gpio.node  := AXI4Fragmenter(lite=false, maxInFlight = 5)(TLToAXI4(4,true )(xbar.node))
+
+  lazy val module = new LazyModuleImp(this) with HasUnitTestIO {
+    io.finished := fuzz.module.io.finished
+  }
+}
+
+class AXI4FullFuzzRAMTest extends UnitTest(500000) {
+  val dut = Module(LazyModule(new AXI4FullFuzzRAM).module)
+  io.finished := dut.io.finished
+}
diff --git a/src/main/scala/uncore/tilelink2/AtomicAutomata.scala b/src/main/scala/uncore/tilelink2/AtomicAutomata.scala
index dd74f6e6..8bea60de 100644
--- a/src/main/scala/uncore/tilelink2/AtomicAutomata.scala
+++ b/src/main/scala/uncore/tilelink2/AtomicAutomata.scala
@@ -138,8 +138,8 @@ class TLAtomicAutomata(logical: Boolean = true, arithmetic: Boolean = true, conc
       // Move the selected sign bit into the first byte position it will extend
       val signbit_a = ((signbits_a & signSel) << 1)(beatBytes-1, 0)
       val signbit_d = ((signbits_d & signSel) << 1)(beatBytes-1, 0)
-      val signext_a = FillInterleaved(8, highOR(signbit_a))
-      val signext_d = FillInterleaved(8, highOR(signbit_d))
+      val signext_a = FillInterleaved(8, leftOR(signbit_a))
+      val signext_d = FillInterleaved(8, leftOR(signbit_d))
       // NOTE: sign-extension does not change the relative ordering in EITHER unsigned or signed arithmetic
       val wide_mask = FillInterleaved(8, mask)
       val a_a_ext = (a_a & wide_mask) | signext_a
diff --git a/src/main/scala/uncore/tilelink2/Fragmenter.scala b/src/main/scala/uncore/tilelink2/Fragmenter.scala
index 4de68af9..940342b9 100644
--- a/src/main/scala/uncore/tilelink2/Fragmenter.scala
+++ b/src/main/scala/uncore/tilelink2/Fragmenter.scala
@@ -12,7 +12,7 @@ import scala.math.{min,max}
 // alwaysMin: fragment all requests down to minSize (else fragment to maximum supported by manager)
 // Fragmenter modifies: PutFull, PutPartial, LogicalData, Get, Hint
 // Fragmenter passes: ArithmeticData (truncated to minSize if alwaysMin)
-// Fragmenter breaks: Acquire (and thus cuts BCE channels)
+// Fragmenter cannot modify Acquire (could livelock); thus it is unsafe to put caches on both sides
 class TLFragmenter(minSize: Int, maxSize: Int, alwaysMin: Boolean = false) extends LazyModule
 {
   require (isPow2 (maxSize))
@@ -30,7 +30,6 @@ class TLFragmenter(minSize: Int, maxSize: Int, alwaysMin: Boolean = false) exten
     if (x.min <= minSize) TransferSizes(x.min, min(minSize, x.max)) else TransferSizes.none
 
   def mapManager(m: TLManagerParameters) = m.copy(
-    supportsAcquire    = TransferSizes.none, // this adapter breaks acquires
     supportsArithmetic = shrinkTransfer(m.supportsArithmetic),
     supportsLogical    = expandTransfer(m.supportsLogical),
     supportsGet        = expandTransfer(m.supportsGet),
@@ -38,15 +37,7 @@ class TLFragmenter(minSize: Int, maxSize: Int, alwaysMin: Boolean = false) exten
     supportsPutPartial = expandTransfer(m.supportsPutPartial),
     supportsHint       = expandTransfer(m.supportsHint))
   def mapClient(c: TLClientParameters) = c.copy(
-    sourceId = IdRange(c.sourceId.start << fragmentBits, c.sourceId.end << fragmentBits),
-    // since we break Acquires, none of these work either:
-    supportsProbe      = TransferSizes.none,
-    supportsArithmetic = TransferSizes.none,
-    supportsLogical    = TransferSizes.none,
-    supportsGet        = TransferSizes.none,
-    supportsPutFull    = TransferSizes.none,
-    supportsPutPartial = TransferSizes.none,
-    supportsHint       = TransferSizes.none)
+    sourceId = IdRange(c.sourceId.start << fragmentBits, c.sourceId.end << fragmentBits))
 
   // Because the Fragmenter stalls inner A while serving outer, it can wipe away inner latency
   val node = TLAdapterNode(
@@ -70,6 +61,8 @@ class TLFragmenter(minSize: Int, maxSize: Int, alwaysMin: Boolean = false) exten
 
     // We don't support fragmenting to sub-beat accesses
     require (minSize >= beatBytes)
+    // We can't support devices which are cached on both sides of us
+    require (!edgeOut.manager.anySupportAcquire || !edgeIn.client.anySupportProbe)
 
     /* The Fragmenter is a bit tricky, because there are 5 sizes in play:
      *   max size -- the maximum transfer size possible
@@ -174,6 +167,12 @@ class TLFragmenter(minSize: Int, maxSize: Int, alwaysMin: Boolean = false) exten
     in.d.bits.source := out.d.bits.source >> fragmentBits
     in.d.bits.size   := Mux(dFirst, dFirst_size, dOrig)
 
+    // Combine the error flag
+    val r_error = RegInit(Bool(false))
+    val d_error = r_error | out.d.bits.error
+    when (out.d.fire()) { r_error := Mux(drop, d_error, UInt(0)) }
+    in.d.bits.error := d_error
+
     // What maximum transfer sizes do downstream devices support?
     val maxArithmetics = managers.map(_.supportsArithmetic.max)
     val maxLogicals    = managers.map(_.supportsLogical.max)
@@ -271,4 +270,3 @@ class TLRAMFragmenter(ramBeatBytes: Int, maxSize: Int) extends LazyModule {
 class TLRAMFragmenterTest(ramBeatBytes: Int, maxSize: Int) extends UnitTest(timeout = 500000) {
   io.finished := Module(LazyModule(new TLRAMFragmenter(ramBeatBytes,maxSize)).module).io.finished
 }
-
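The sourceId remapping above simply appends fragment-index bits below the original source id, and D responses strip them with a right shift. A pure-Int sketch (the fragmentBits value of 4 is hypothetical; the real adapter derives it from maxSize/minSize):

object SourceIdSketch {
  val fragmentBits = 4
  def pack(source: Int, fragment: Int): Int = (source << fragmentBits) | fragment
  def unpack(packed: Int): (Int, Int) =
    (packed >> fragmentBits, packed & ((1 << fragmentBits) - 1))
  def main(args: Array[String]): Unit = {
    println(pack(3, 5))  // 53: client source 3, fragment 5
    println(unpack(53))  // (3,5); D recovers the client via source >> fragmentBits
  }
}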
diff --git a/src/main/scala/uncore/tilelink2/Fuzzer.scala b/src/main/scala/uncore/tilelink2/Fuzzer.scala
index 4063985d..de9c9fa5 100644
--- a/src/main/scala/uncore/tilelink2/Fuzzer.scala
+++ b/src/main/scala/uncore/tilelink2/Fuzzer.scala
@@ -17,7 +17,7 @@ class IDMapGenerator(numIds: Int) extends Module {
   io.free.ready := Bool(true)
   assert (!io.free.valid || !bitmap(io.free.bits)) // No double freeing
 
-  val select = ~(highOR(bitmap) << 1) & bitmap
+  val select = ~(leftOR(bitmap) << 1) & bitmap
   io.alloc.bits := OHToUInt(select)
   io.alloc.valid := bitmap.orR()
 
@@ -206,7 +206,7 @@ import unittest._
 
 class TLFuzzRAM extends LazyModule
 {
-  val model = LazyModule(new TLRAMModel)
+  val model = LazyModule(new TLRAMModel("TLFuzzRAM"))
   val ram  = LazyModule(new TLRAM(AddressSet(0x800, 0x7ff)))
   val ram2 = LazyModule(new TLRAM(AddressSet(0, 0x3ff), beatBytes = 16))
   val gpio = LazyModule(new RRTest1(0x400))
diff --git a/src/main/scala/uncore/tilelink2/Parameters.scala b/src/main/scala/uncore/tilelink2/Parameters.scala
index 45b95c13..097f7a22 100644
--- a/src/main/scala/uncore/tilelink2/Parameters.scala
+++ b/src/main/scala/uncore/tilelink2/Parameters.scala
@@ -25,9 +25,7 @@ case class TLManagerParameters(
   customDTS: Option[String]= None)
 {
   address.foreach { a => require (a.finite) }
-  address.combinations(2).foreach({ case Seq(x,y) =>
-    require (!x.overlaps(y))
-  })
+  address.combinations(2).foreach { case Seq(x,y) => require (!x.overlaps(y)) }
   require (supportsPutFull.contains(supportsPutPartial))
 
   // Largest supported transfer of all types
@@ -38,6 +36,7 @@ case class TLManagerParameters(
     supportsGet.max,
     supportsPutFull.max,
     supportsPutPartial.max).max
+  val maxAddress = address.map(_.max).max
 
   val name = nodePath.lastOption.map(_.lazyModule.name).getOrElse("disconnected")
 
@@ -53,9 +52,8 @@ case class TLManagerParameters(
   }
 
   // The device had better not support a transfer larger than its alignment
-  address.foreach({ case a =>
-    require (a.alignment >= maxTransfer)
-  })
+  val minAlignment = address.map(_.alignment).min
+  require (minAlignment >= maxTransfer)
 }
 
 case class TLManagerPortParameters(
@@ -77,7 +75,7 @@ case class TLManagerPortParameters(
 
   // Bounds on required sizes
   def endSinkId   = managers.map(_.sinkId.end).max
-  def maxAddress  = managers.map(_.address.map(_.max).max).max
+  def maxAddress  = managers.map(_.maxAddress).max
   def maxTransfer = managers.map(_.maxTransfer).max
 
   // Operation sizes supported by all outward Managers
@@ -166,6 +164,13 @@ case class TLClientParameters(
   supportsHint:       TransferSizes = TransferSizes.none)
 {
   require (supportsPutFull.contains(supportsPutPartial))
+  // We only support these operations if we support Probe (ie: we're a cache)
+  require (supportsProbe.contains(supportsArithmetic))
+  require (supportsProbe.contains(supportsLogical))
+  require (supportsProbe.contains(supportsGet))
+  require (supportsProbe.contains(supportsPutFull))
+  require (supportsProbe.contains(supportsPutPartial))
+  require (supportsProbe.contains(supportsHint))
 
   val maxTransfer = List(
     supportsProbe.max,
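An approximate model of the TransferSizes.contains relation these new requires lean on. This is a sketch only — the real semantics live in uncore.tilelink2.TransferSizes; here (0,0) encodes "none".

object TransferSizesSketch {
  case class TS(min: Int, max: Int) {
    def none: Boolean = max == 0
    def contains(x: TS): Boolean = x.none || (!none && min <= x.min && x.max <= max)
  }
  def main(args: Array[String]): Unit = {
    val noProbe = TS(0, 0)
    println(noProbe.contains(TS(4, 64)))   // false: a non-cache client may not claim Get, etc.
    println(noProbe.contains(TS(0, 0)))    // true: supporting nothing is always allowed
    println(TS(4, 64).contains(TS(4, 16))) // true: a cache's ops fit inside its Probe range
  }
}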
diff --git a/src/main/scala/uncore/tilelink2/RAMModel.scala b/src/main/scala/uncore/tilelink2/RAMModel.scala
index cfeb8cce..2ee1d607 100644
--- a/src/main/scala/uncore/tilelink2/RAMModel.scala
+++ b/src/main/scala/uncore/tilelink2/RAMModel.scala
@@ -20,7 +20,7 @@ import diplomacy._
 //   put, get, getAck, putAck => ok: detected by getAck (it sees busy>0) impossible for FIFO
 //   If FIFO, the getAck should check data even if its validity was wiped
 
-class TLRAMModel extends LazyModule
+class TLRAMModel(log: String = "") extends LazyModule
 {
   val node = TLIdentityNode()
 
@@ -150,6 +150,7 @@ class TLRAMModel extends LazyModule
         val busy = a_inc(i) - a_dec(i) - (!a_first).asUInt
         val byte = a.data(8*(i+1)-1, 8*i)
         when (a.mask(i)) {
+          printf(log + " ")
          when (a.opcode === TLMessages.PutFullData)    { printf("PF") }
          when (a.opcode === TLMessages.PutPartialData) { printf("PP") }
          when (a.opcode === TLMessages.ArithmeticData) { printf("A ") }
@@ -160,7 +161,7 @@ class TLRAMModel extends LazyModule
       }
 
       when (a.opcode === TLMessages.Get) {
-        printf("G 0x%x - 0%x\n", a_base, a_base | UIntToOH1(a_size, addressBits))
+        printf(log + " G 0x%x - 0x%x\n", a_base, a_base | UIntToOH1(a_size, addressBits))
       }
     }
@@ -245,6 +246,7 @@ class TLRAMModel extends LazyModule
 
       when (d_flight.opcode === TLMessages.PutFullData || d_flight.opcode === TLMessages.PutPartialData) {
         assert (d.opcode === TLMessages.AccessAck)
+        printf(log + " ")
         when (d_flight.opcode === TLMessages.PutFullData)    { printf("pf") }
         when (d_flight.opcode === TLMessages.PutPartialData) { printf("pp") }
         printf(" 0x%x - 0x%x\n", d_base, d_base | UIntToOH1(d_size, addressBits))
@@ -257,6 +259,7 @@ class TLRAMModel extends LazyModule
         val shadow = Wire(init = d_shadow(i))
         when (d_mask(i)) {
           val d_addr = d_addr_hi << shift | UInt(i)
+          printf(log + " ")
          when (d_flight.opcode === TLMessages.Get)            { printf("g ") }
          when (d_flight.opcode === TLMessages.ArithmeticData) { printf("a ") }
          when (d_flight.opcode === TLMessages.LogicalData)    { printf("l ") }
diff --git a/src/main/scala/uncore/tilelink2/RegisterRouter.scala b/src/main/scala/uncore/tilelink2/RegisterRouter.scala
index dddd0a60..89c62021 100644
--- a/src/main/scala/uncore/tilelink2/RegisterRouter.scala
+++ b/src/main/scala/uncore/tilelink2/RegisterRouter.scala
@@ -44,7 +44,7 @@ class TLRegisterNode(address: AddressSet, concurrency: Int = 0, beatBytes: Int =
     // Invoke the register map builder and make it Irrevocable
     val out = Queue.irrevocable(
       RegMapper(beatBytes, concurrency, undefZero, in, mapping:_*),
-      entries = 1, pipe = true, flow = true)
+      entries = 1, flow = true) // No flow control needed
 
     in.valid := a.valid
diff --git a/src/main/scala/uncore/tilelink2/ToAXI4.scala b/src/main/scala/uncore/tilelink2/ToAXI4.scala
index f68d22a6..079f37dd 100644
--- a/src/main/scala/uncore/tilelink2/ToAXI4.scala
+++ b/src/main/scala/uncore/tilelink2/ToAXI4.scala
@@ -12,7 +12,11 @@ import scala.math.{min, max}
 case class TLToAXI4Node(idBits: Int) extends MixedNode(TLImp, AXI4Imp)(
   dFn = { case (1, _) =>
     // We must erase all client information, because we crush their source Ids
-    Seq(AXI4MasterPortParameters(Seq(AXI4MasterParameters(id = IdRange(0, 1 << idBits)))))
+    val masters = Seq(
+      AXI4MasterParameters(
+        id      = IdRange(0, 1 << idBits),
+        aligned = true))
+    Seq(AXI4MasterPortParameters(masters))
   },
   uFn = { case (1, Seq(AXI4SlavePortParameters(slaves, beatBytes))) =>
     val managers = slaves.zipWithIndex.map { case (s, id) =>
@@ -53,6 +57,13 @@ class TLToAXI4(idBits: Int, combinational: Boolean = true) extends LazyModule
     require (slaves(0).interleavedId.isDefined)
     slaves.foreach { s => require (s.interleavedId == slaves(0).interleavedId) }
 
+    // We need to ensure that a slave does not stall trying to send B while we need to receive R
+    // Since R&W have independent flow control, it is possible for a W to cut in-line and get into
+    // a slave's buffers, preventing us from getting all the R responses we need to release D for B.
+    // This risk is compounded by AXI fragmentation. Even a slave which responds completely to
+    // AR before working on AW might have an AW slipped between two AR fragments.
+    val out_b = Queue.irrevocable(out.b, entries=edgeIn.client.endSourceId, flow=combinational)
+
     // We need to keep the following state from A => D: (addr_lo, size, sink, source)
     // All of those fields could potentially require 0 bits (argh. Chisel.)
     // We will pack as many of the lowest bits of state as fit into the AXI ID.
@@ -113,7 +124,7 @@ class TLToAXI4(idBits: Int, combinational: Boolean = true) extends LazyModule
 
     val r_last = out.r.bits.last
     val r_id   = out.r.bits.id
-    val b_id   = out.b.bits.id
+    val b_id   = out_b.bits.id
 
     if (stateBits <= idBits) { // No need for any state tracking
       r_state := r_id
@@ -148,7 +159,7 @@ class TLToAXI4(idBits: Int, combinational: Boolean = true) extends LazyModule
       q.io.enq.bits.data := a_state >> implicitBits
       q.io.enq.bits.way  := Mux(a_isPut, UInt(0), UInt(1))
       // Pop the bank's ways
-      q.io.deq(0).ready := out.b.fire() && b_bankSelect(i)
+      q.io.deq(0).ready := out_b.fire() && b_bankSelect(i)
       q.io.deq(1).ready := out.r.fire() && r_bankSelect(i) && r_last
       // The FIFOs must be valid when we're ready to pop them...
       assert (q.io.deq(0).valid || !q.io.deq(0).ready)
@@ -169,8 +180,8 @@ class TLToAXI4(idBits: Int, combinational: Boolean = true) extends LazyModule
     val depth = if (combinational) 1 else 2
     val out_arw = Wire(Decoupled(new AXI4BundleARW(out.params)))
     val out_w = Wire(out.w)
-    out.w <> Queue.irrevocable(out_w, entries=depth, pipe=combinational, flow=combinational)
-    val queue_arw = Queue.irrevocable(out_arw, entries=depth, pipe=combinational, flow=combinational)
+    out.w <> Queue.irrevocable(out_w, entries=depth, flow=combinational)
+    val queue_arw = Queue.irrevocable(out_arw, entries=depth, flow=combinational)
 
     // Fan out the ARW channel to AR and AW
     out.ar.bits := queue_arw.bits
@@ -210,18 +221,21 @@ class TLToAXI4(idBits: Int, combinational: Boolean = true) extends LazyModule
     // Give R higher priority than B
     val r_wins = out.r.valid || r_holds_d
 
-    out.r.ready := in.d.ready
-    out.b.ready := in.d.ready && !r_wins
-    in.d.valid := Mux(r_wins, out.r.valid, out.b.valid)
+    val in_d = Wire(in.d)
+    in.d <> Queue.irrevocable(in_d, entries=1, flow=combinational)
+
+    out.r.ready := in_d.ready
+    out_b.ready := in_d.ready && !r_wins
+    in_d.valid := Mux(r_wins, out.r.valid, out_b.valid)
 
     val r_error = out.r.bits.resp =/= AXI4Parameters.RESP_OKAY
-    val b_error = out.b.bits.resp =/= AXI4Parameters.RESP_OKAY
+    val b_error = out_b.bits.resp =/= AXI4Parameters.RESP_OKAY
 
     val r_d = edgeIn.AccessAck(r_addr_lo, r_sink, r_source, r_size, UInt(0), r_error)
     val b_d = edgeIn.AccessAck(b_addr_lo, b_sink, b_source, b_size, b_error)
 
-    in.d.bits := Mux(r_wins, r_d, b_d)
-    in.d.bits.data := out.r.bits.data // avoid a costly Mux
+    in_d.bits := Mux(r_wins, r_d, b_d)
+    in_d.bits.data := out.r.bits.data // avoid a costly Mux
 
     // Tie off unused channels
     in.b.valid := Bool(false)
diff --git a/src/main/scala/uncore/tilelink2/package.scala b/src/main/scala/uncore/tilelink2/package.scala
index e996f2ba..415aa308 100644
--- a/src/main/scala/uncore/tilelink2/package.scala
+++ b/src/main/scala/uncore/tilelink2/package.scala
@@ -11,12 +11,20 @@ package object tilelink2
   def OH1ToUInt(x: UInt) = OHToUInt((x << 1 | UInt(1)) ^ x)
   def UIntToOH1(x: UInt, width: Int) = ~(SInt(-1, width=width).asUInt << x)(width-1, 0)
   def trailingZeros(x: Int) = if (x > 0) Some(log2Ceil(x & -x)) else None
-  def highOR(x: UInt) = {
+  // Fill 1s from low bits to high bits
+  def leftOR(x: UInt) = {
     val w = x.getWidth
     def helper(s: Int, x: UInt): UInt =
       if (s >= w) x else helper(s+s, x | (x << s)(w-1,0))
     helper(1, x)
   }
+  // Fill 1s from high bits to low bits
+  def rightOR(x: UInt) = {
+    val w = x.getWidth
+    def helper(s: Int, x: UInt): UInt =
+      if (s >= w) x else helper(s+s, x | (x >> s))
+    helper(1, x)
+  }
 
   // This gets used everywhere, so make the smallest circuit possible ...
   def maskGen(addr_lo: UInt, lgSize: UInt, beatBytes: Int): UInt = {
     val lgBytes = log2Ceil(beatBytes)
diff --git a/src/main/scala/unittest/Configs.scala b/src/main/scala/unittest/Configs.scala
index ac4ecfb0..ac0488af 100644
--- a/src/main/scala/unittest/Configs.scala
+++ b/src/main/scala/unittest/Configs.scala
@@ -25,7 +25,9 @@ class WithUncoreUnitTests extends Config(
   case UnitTests => (p: Parameters) => Seq(
     Module(new uncore.devices.ROMSlaveTest()(p)),
     Module(new uncore.devices.TileLinkRAMTest()(p)),
-    Module(new uncore.tilelink2.TLFuzzRAMTest))
+    Module(new uncore.tilelink2.TLFuzzRAMTest),
+    Module(new uncore.axi4.AXI4LiteFuzzRAMTest),
+    Module(new uncore.axi4.AXI4FullFuzzRAMTest))
   case _ => throw new CDEMatchError
 }
)
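To close, a pure-Int model of the renamed leftOR (formerly highOR) and the new rightOR, plus the lowest-set-bit trick that IDMapGenerator builds from leftOR. Illustrative sketch only; an 8-bit width is assumed.

object OrSpreadSketch {
  val w = 8
  def mask(x: Int) = x & ((1 << w) - 1)
  def leftOR(x: Int): Int = { // spread 1s from each set bit toward high bits
    def helper(s: Int, v: Int): Int = if (s >= w) v else helper(s + s, v | mask(v << s))
    helper(1, x)
  }
  def rightOR(x: Int): Int = { // spread 1s from each set bit toward low bits
    def helper(s: Int, v: Int): Int = if (s >= w) v else helper(s + s, v | (v >>> s))
    helper(1, x)
  }
  def main(args: Array[String]): Unit = {
    val x = 0x14 // 0b00010100
    println(leftOR(x).toBinaryString)  // 11111100
    println(rightOR(x).toBinaryString) // 11111 (0b00011111)
    // IDMapGenerator's select: exactly the lowest set bit of the bitmap
    val select = mask(~(leftOR(x) << 1)) & x
    println(select.toBinaryString)     // 100 (0b00000100)
  }
}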