Merge remote-tracking branch 'junctions/master' into mono-repo

2016-07-28 11:27:47 -07:00
parent 373fd427dc 59d700bf66
commit a5b88d0bdc
17 changed files with 3246 additions and 0 deletions
--- a/junctions/.gitignore
+++ b/junctions/.gitignore
@@ -0,0 +1,17 @@
+*.class
+*.log
+
+# sbt specific
+.cache
+.history
+.lib/
+dist/*
+target/
+lib_managed/
+src_managed/
+project/boot/
+project/plugins/project/
+
+# Scala-IDE specific
+.scala_dependencies
+.worksheet
--- a/junctions/LICENSE
+++ b/junctions/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2015, The Regents of the University of California (Regents)
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of junctions nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
+SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING
+OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS
+BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED
+HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE
+MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+
--- a/junctions/README.md
+++ b/junctions/README.md
@@ -0,0 +1,6 @@
+# junctions
+A repository for peripheral components and IO devices associated with the RocketChip project.
+
+To use these modules, include this repo as a git submodule within your chip repository and add it as a Project in your chip's build.scala. These components are only dependent on Chisel, i.e.
+
+    lazy val junctions = project.dependsOn(chisel)
--- a/junctions/build.sbt
+++ b/junctions/build.sbt
@@ -0,0 +1,19 @@
+organization := "edu.berkeley.cs"
+
+version := "1.0"
+
+name := "junctions"
+
+scalaVersion := "2.11.6"
+
+// Provide managed dependencies on chisel and cde when -DchiselVersion=... and/or -DcdeVersion=... are supplied on the command line.
+libraryDependencies ++= (Seq("chisel","cde").map {
+  dep: String => sys.props.get(dep + "Version") map { "edu.berkeley.cs" %% dep % _ }}).flatten
+
+site.settings
+
+site.includeScaladoc()
+
+ghpages.settings
+
+git.remoteRepo := "git@github.com:ucb-bar/junctions.git"
--- a/junctions/project/plugins.sbt
+++ b/junctions/project/plugins.sbt
@@ -0,0 +1,5 @@
+resolvers += "jgit-repo" at "http://download.eclipse.org/jgit/maven"
+
+addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.5.3")
+
+addSbtPlugin("com.typesafe.sbt" % "sbt-site" % "0.8.1")
--- a/junctions/src/main/scala/addrmap.scala
+++ b/junctions/src/main/scala/addrmap.scala
@@ -0,0 +1,150 @@
+// See LICENSE for license details.
+
+package junctions
+
+import Chisel._
+import cde.{Parameters, Field}
+import scala.collection.mutable.HashMap
+
+case object PAddrBits extends Field[Int]
+case object VAddrBits extends Field[Int]
+case object PgIdxBits extends Field[Int]
+case object PgLevels extends Field[Int]
+case object PgLevelBits extends Field[Int]
+case object ASIdBits extends Field[Int]
+case object PPNBits extends Field[Int]
+case object VPNBits extends Field[Int]
+
+case object GlobalAddrMap extends Field[AddrMap]
+
+trait HasAddrMapParameters {
+  implicit val p: Parameters
+
+  val paddrBits = p(PAddrBits)
+  val vaddrBits = p(VAddrBits)
+  val pgIdxBits = p(PgIdxBits)
+  val ppnBits = p(PPNBits)
+  val vpnBits = p(VPNBits)
+  val pgLevels = p(PgLevels)
+  val pgLevelBits = p(PgLevelBits)
+  val asIdBits = p(ASIdBits)
+
+  val addrMap = p(GlobalAddrMap)
+}
+
+case class MemAttr(prot: Int, cacheable: Boolean = false)
+
+sealed abstract class MemRegion { // a region covering the addresses [start, start + size)
+  def start: BigInt  // first address of the region (inclusive lower bound in containsAddress)
+  def size: BigInt  // extent of the region; start + size is the exclusive upper bound
+  def numSlaves: Int  // number of slave ports the region accounts for (1 for MemSize/MemRange, a sum for AddrMap)
+  def attr: MemAttr  // protection bits and cacheability of the region
+  // Builds a hardware comparison that is true when x lies in [start, start + size).
+  def containsAddress(x: UInt) = UInt(start) <= x && x < UInt(start + size)
+}
+
+case class MemSize(size: BigInt, attr: MemAttr) extends MemRegion {
+  def start = 0
+  def numSlaves = 1
+}
+
+case class MemRange(start: BigInt, size: BigInt, attr: MemAttr) extends MemRegion {
+  def numSlaves = 1
+}
+
+object AddrMapProt { // permission masks: bit 0 = read, bit 1 = write, bit 2 = execute
+  val R = 1 << 0
+  val W = 1 << 1
+  val X = 1 << 2
+  val RW = W | R
+  val RX = X | R
+  val RWX = X | W | R
+  val SZ = 3 // width in bits of a permission mask
+}
+
+class AddrMapProt extends Bundle {
+  val x = Bool()
+  val w = Bool()
+  val r = Bool()
+}
+
+case class AddrMapEntry(name: String, region: MemRegion)
+
+object AddrMap {
+  def apply(elems: AddrMapEntry*): AddrMap = new AddrMap(elems)
+}
+
+class AddrMap(entriesIn: Seq[AddrMapEntry], val start: BigInt = BigInt(0)) extends MemRegion {
+  def isEmpty = entries.isEmpty
+  def length = entries.size
+  def numSlaves = entries.map(_.region.numSlaves).sum
+  def attr = ??? // no single attribute describes a whole map; evaluating this throws NotImplementedError
+  // Leaf path -> slave port index and path -> rebased region; both are populated while `entries` is computed.
+  private val slavePorts = HashMap[String, Int]()
+  private val mapping = HashMap[String, MemRegion]()
+  // Places the entries in order beginning at `start`; yields (total address span, rebased entries).
+  val (size: BigInt, entries: Seq[AddrMapEntry]) = {
+    var ind = 0 // next free slave port index
+    var base = start // running placement address
+    val rebasedEntries = collection.mutable.ArrayBuffer[AddrMapEntry]()
+    for (AddrMapEntry(name, r) <- entriesIn) {
+      if (r.start != 0) { // explicitly placed: must not precede the running base and must be aligned to its size rounded up to a power of two
+        val align = BigInt(1) << log2Ceil(r.size)
+        require(r.start >= base, s"region $name base address 0x${r.start.toString(16)} overlaps previous base 0x${base.toString(16)}")
+        require(r.start % align == 0, s"region $name base address 0x${r.start.toString(16)} not aligned to 0x${align.toString(16)}")
+        base = r.start
+      } else { // unplaced: round the running base up to a multiple of the region size
+        base = (base + r.size - 1) / r.size * r.size
+      }
+      // Nested maps are rebuilt at `base` and re-exported under a "name:" prefix; anything else becomes a MemRange leaf.
+      r match {
+        case r: AddrMap =>
+          val subMap = new AddrMap(r.entries, base) // NOTE(review): r.entries is already rebased from r.start - confirm nested maps relocate as intended
+          rebasedEntries += AddrMapEntry(name, subMap)
+          mapping += name -> subMap
+          mapping ++= subMap.mapping.map { case (k, v) => s"$name:$k" -> v }
+          slavePorts ++= subMap.slavePorts.map { case (k, v) => s"$name:$k" -> (ind + v) }
+        case _ =>
+          val e = MemRange(base, r.size, r.attr)
+          rebasedEntries += AddrMapEntry(name, e)
+          mapping += name -> e
+          slavePorts += name -> ind
+      }
+      // Step the port index and the running base past this region.
+      ind += r.numSlaves
+      base += r.size
+    }
+    (base - start, rebasedEntries)
+  }
+  // Leaf regions indexed by slave port number, each paired with its path name.
+  val flatten: Seq[(String, MemRange)] = {
+    val arr = new Array[(String, MemRange)](slavePorts.size)
+    for ((name, port) <- slavePorts)
+      arr(port) = (name, mapping(name).asInstanceOf[MemRange])
+    arr
+  }
+
+  def apply(name: String): MemRegion = mapping(name)
+  def port(name: String): Int = slavePorts(name)
+  def subMap(name: String): AddrMap = mapping(name).asInstanceOf[AddrMap]
+  def isInRegion(name: String, addr: UInt): Bool = mapping(name).containsAddress(addr)
+
+  def isCacheable(addr: UInt): Bool = { // true when addr falls in any leaf region marked cacheable
+    flatten.filter(_._2.attr.cacheable).map { case (_, region) =>
+      region.containsAddress(addr)
+    }.foldLeft(Bool(false))(_ || _)
+  }
+
+  def isValid(addr: UInt): Bool = { // true when addr falls in any leaf region
+    flatten.map { case (_, region) =>
+      region.containsAddress(addr)
+    }.foldLeft(Bool(false))(_ || _)
+  }
+
+  def getProt(addr: UInt): AddrMapProt = { // OR of the protection bits of every leaf region containing addr
+    val protForRegion = flatten.map { case (_, region) =>
+      Mux(region.containsAddress(addr), UInt(region.attr.prot, AddrMapProt.SZ), UInt(0))
+    }
+    new AddrMapProt().fromBits(protForRegion.foldLeft(UInt(0, AddrMapProt.SZ))(_|_)) // seeded with 0 so an empty map yields no permissions instead of failing in reduce
+  }
+}
--- a/junctions/src/main/scala/atos.scala
+++ b/junctions/src/main/scala/atos.scala
@@ -0,0 +1,333 @@
+package junctions
+
+import Chisel._
+import scala.math.max
+import cde.{Parameters, Field}
+
+trait HasAtosParameters extends HasNastiParameters {
+  // round n (>= 1) up to the next multiple of 32
+  def roundup(n: Int) = 32 * ((n - 1) / 32 + 1)
+  // The union must hold either a write-data beat (id, data, strobe, last bit) or an address request (id, burst, size, len, addr).
+  val atosUnionBits = max(
+    nastiXIdBits + nastiXDataBits + nastiWStrobeBits + 1,
+    nastiXIdBits + nastiXBurstBits +
+    nastiXSizeBits + nastiXLenBits + nastiXAddrBits)
+  val atosIdBits = nastiXIdBits
+  val atosTypBits = 2
+  val atosRespBits = nastiXRespBits
+  val atosDataBits = nastiXDataBits
+  // Bit offsets inside the union for address requests: id at bit 0, then addr, len, size, burst.
+  val atosAddrOffset = atosIdBits
+  val atosLenOffset = atosIdBits + nastiXAddrBits
+  val atosSizeOffset = atosLenOffset + nastiXLenBits
+  val atosBurstOffset = atosSizeOffset + nastiXSizeBits
+  // Bit offsets inside the union for write data: id at bit 0, then data, strobe, last.
+  val atosDataOffset = atosIdBits
+  val atosStrobeOffset = nastiXDataBits + atosIdBits
+  val atosLastOffset = atosStrobeOffset + nastiWStrobeBits
+  // Message sizes padded to a multiple of 32 bits, also expressed in bytes and in 32-bit words.
+  val atosRequestBits = roundup(atosTypBits + atosUnionBits)
+  val atosResponseBits = roundup(atosTypBits + atosIdBits + atosRespBits + atosDataBits + 1)
+  val atosRequestBytes = atosRequestBits / 8
+  val atosResponseBytes = atosResponseBits / 8
+  val atosRequestWords = atosRequestBytes / 4
+  val atosResponseWords = atosResponseBytes / 4
+}
+
+abstract class AtosModule(implicit val p: Parameters)
+  extends Module with HasAtosParameters
+abstract class AtosBundle(implicit val p: Parameters)
+  extends ParameterizedBundle()(p) with HasAtosParameters
+
+object AtosRequest {
+  def arType = UInt("b00")
+  def awType = UInt("b01")
+  def wType  = UInt("b10")
+
+  def apply(typ: UInt, union: UInt)(implicit p: Parameters): AtosRequest = {
+    val areq = Wire(new AtosRequest)
+    areq.typ := typ
+    areq.union := union
+    areq
+  }
+
+  def apply(ar: NastiReadAddressChannel)(implicit p: Parameters): AtosRequest =
+    apply(arType, Cat(ar.burst, ar.size, ar.len, ar.addr, ar.id))
+
+  def apply(aw: NastiWriteAddressChannel)(implicit p: Parameters): AtosRequest =
+    apply(awType, Cat(aw.burst, aw.size, aw.len, aw.addr, aw.id))
+
+  def apply(w: NastiWriteDataChannel)(implicit p: Parameters): AtosRequest =
+    apply(wType, Cat(w.last, w.strb, w.data, w.id))
+}
+
+class AtosRequest(implicit p: Parameters)
+    extends AtosBundle()(p) with Serializable {
+  val typ = UInt(width = atosTypBits)
+  val union = UInt(width = atosUnionBits)
+  // Address-request view of the union (typ = ar/aw): id, addr, len, size, burst from bit 0 upward.
+  def burst(dummy: Int = 0) = // takes every bit from atosBurstOffset up to the top of the union
+    union(atosUnionBits - 1, atosBurstOffset)
+  // NOTE(review): the unused 'dummy' parameters presumably keep these accessors from being treated as bundle fields - confirm
+  def size(dummy: Int = 0) =
+    union(atosBurstOffset - 1, atosSizeOffset)
+
+  def len(dummy: Int = 0) =
+    union(atosSizeOffset - 1, atosLenOffset)
+
+  def addr(dummy: Int = 0) =
+    union(atosLenOffset - 1, atosAddrOffset)
+
+  def id(dummy: Int = 0) =
+    union(atosIdBits - 1, 0)
+  // Write-data view of the union (typ = w): id, data, strb, last from bit 0 upward.
+  def data(dummy: Int = 0) =
+    union(atosStrobeOffset - 1, atosDataOffset)
+
+  def strb(dummy: Int = 0) =
+    union(atosLastOffset - 1, atosStrobeOffset)
+
+  def last(dummy: Int = 0) =
+    union(atosLastOffset)
+  // Predicates on the request type.
+  def has_addr(dummy: Int = 0) =
+    typ === AtosRequest.arType || typ === AtosRequest.awType
+
+  def has_data(dummy: Int = 0) =
+    typ === AtosRequest.wType
+  // True for an ar, or for a w whose last bit is set; never true for an aw.
+  def is_last(dummy: Int = 0) =
+    typ === AtosRequest.arType || (typ === AtosRequest.wType && last())
+  // Padded message size in bits.
+  def nbits: Int = atosRequestBits
+  // Response length for this request: len + 1 for an ar, 1 for an aw, 0 for any other type.
+  def resp_len(dummy: Int = 0) =
+    MuxLookup(typ, UInt(0), Seq(
+      AtosRequest.arType -> (len() + UInt(1)),
+      AtosRequest.awType -> UInt(1)))
+}
+
+object AtosResponse {
+  def rType = UInt("b00")
+  def bType = UInt("b01")
+
+  def apply(typ: UInt, id: UInt, resp: UInt, data: UInt, last: Bool)
+      (implicit p: Parameters): AtosResponse = {
+    val aresp = Wire(new AtosResponse)
+    aresp.typ := typ
+    aresp.id := id
+    aresp.resp := resp
+    aresp.data := data
+    aresp.last := last
+    aresp
+  }
+
+  def apply(r: NastiReadDataChannel)(implicit p: Parameters): AtosResponse =
+    apply(rType, r.id, r.resp, r.data, r.last)
+
+  def apply(b: NastiWriteResponseChannel)(implicit p: Parameters): AtosResponse =
+    apply(bType, b.id, b.resp, UInt(0), Bool(false))
+}
+
+class AtosResponse(implicit p: Parameters)
+    extends AtosBundle()(p) with Serializable {
+  val typ = UInt(width = atosTypBits)
+  val id = UInt(width = atosIdBits)
+  val resp = UInt(width = atosRespBits)
+  val last = Bool()
+  val data = UInt(width = atosDataBits)
+
+  def has_data(dummy: Int = 0) = typ === AtosResponse.rType
+
+  def is_last(dummy: Int = 0) = !has_data() || last
+
+  def nbits: Int = atosResponseBits
+}
+
+class AtosIO(implicit p: Parameters) extends AtosBundle()(p) {
+  val req = Decoupled(new AtosRequest)
+  val resp = Decoupled(new AtosResponse).flip
+}
+
+class AtosRequestEncoder(implicit p: Parameters) extends AtosModule()(p) { // merges the NASTI ar/aw/w channels into one AtosRequest stream
+  val io = new Bundle {
+    val ar = Decoupled(new NastiReadAddressChannel).flip
+    val aw = Decoupled(new NastiWriteAddressChannel).flip
+    val w  = Decoupled(new NastiWriteDataChannel).flip
+    val req = Decoupled(new AtosRequest)
+  }
+  // Set when an aw is accepted and cleared when the last w beat is accepted; only w is forwarded while it is set.
+  val writing = Reg(init = Bool(false))
+  // Outside a write burst ar has priority over aw; w is accepted only during a write burst.
+  io.ar.ready := !writing && io.req.ready
+  io.aw.ready := !writing && !io.ar.valid && io.req.ready
+  io.w.ready  := writing && io.req.ready
+  // Forward whichever channel is currently selected.
+  io.req.valid := Mux(writing, io.w.valid, io.ar.valid || io.aw.valid)
+  io.req.bits := Mux(writing, AtosRequest(io.w.bits),
+    Mux(io.ar.valid, AtosRequest(io.ar.bits), AtosRequest(io.aw.bits)))
+
+  when (io.aw.fire()) { writing := Bool(true) }
+  when (io.w.fire() && io.w.bits.last) { writing := Bool(false) }
+}
+
+class AtosResponseDecoder(implicit p: Parameters) extends AtosModule()(p) {
+  val io = new Bundle {
+    val resp = Decoupled(new AtosResponse).flip
+    val b = Decoupled(new NastiWriteResponseChannel)
+    val r = Decoupled(new NastiReadDataChannel)
+  }
+
+  val is_b = io.resp.bits.typ === AtosResponse.bType
+  val is_r = io.resp.bits.typ === AtosResponse.rType
+
+  io.b.valid := io.resp.valid && is_b
+  io.b.bits := NastiWriteResponseChannel(
+    id = io.resp.bits.id,
+    resp = io.resp.bits.resp)
+
+  io.r.valid := io.resp.valid && is_r
+  io.r.bits := NastiReadDataChannel(
+    id = io.resp.bits.id,
+    data = io.resp.bits.data,
+    last = io.resp.bits.last,
+    resp = io.resp.bits.resp)
+
+  io.resp.ready := (is_b && io.b.ready) || (is_r && io.r.ready)
+}
+
+class AtosClientConverter(implicit p: Parameters) extends AtosModule()(p) {
+  val io = new Bundle {
+    val nasti = (new NastiIO).flip
+    val atos = new AtosIO
+  }
+
+  val req_enc = Module(new AtosRequestEncoder)
+  req_enc.io.ar <> io.nasti.ar
+  req_enc.io.aw <> io.nasti.aw
+  req_enc.io.w  <> io.nasti.w
+  io.atos.req <> req_enc.io.req
+
+  val resp_dec = Module(new AtosResponseDecoder)
+  resp_dec.io.resp <> io.atos.resp
+  io.nasti.b <> resp_dec.io.b
+  io.nasti.r <> resp_dec.io.r
+}
+
+class AtosRequestDecoder(implicit p: Parameters) extends AtosModule()(p) {
+  val io = new Bundle {
+    val req = Decoupled(new AtosRequest).flip
+    val ar = Decoupled(new NastiReadAddressChannel)
+    val aw = Decoupled(new NastiWriteAddressChannel)
+    val w  = Decoupled(new NastiWriteDataChannel)
+  }
+
+  val is_ar = io.req.bits.typ === AtosRequest.arType
+  val is_aw = io.req.bits.typ === AtosRequest.awType
+  val is_w  = io.req.bits.typ === AtosRequest.wType
+
+  io.ar.valid := io.req.valid && is_ar
+  io.ar.bits := NastiReadAddressChannel(
+    id = io.req.bits.id(),
+    addr = io.req.bits.addr(),
+    size = io.req.bits.size(),
+    len = io.req.bits.len(),
+    burst = io.req.bits.burst())
+
+  io.aw.valid := io.req.valid && is_aw
+  io.aw.bits := NastiWriteAddressChannel(
+    id = io.req.bits.id(),
+    addr = io.req.bits.addr(),
+    size = io.req.bits.size(),
+    len = io.req.bits.len(),
+    burst = io.req.bits.burst())
+
+  io.w.valid := io.req.valid && is_w
+  io.w.bits := NastiWriteDataChannel(
+    id = io.req.bits.id(),
+    data = io.req.bits.data(),
+    strb = Some(io.req.bits.strb()),
+    last = io.req.bits.last())
+
+  io.req.ready := (io.ar.ready && is_ar) ||
+                  (io.aw.ready && is_aw) ||
+                  (io.w.ready  && is_w)
+}
+
+class AtosResponseEncoder(implicit p: Parameters) extends AtosModule()(p) { // merges the NASTI b and r channels into one AtosResponse stream
+  val io = new Bundle {
+    val b = Decoupled(new NastiWriteResponseChannel).flip
+    val r = Decoupled(new NastiReadDataChannel).flip
+    val resp = Decoupled(new AtosResponse)
+  }
+  // Set after a non-last r beat is accepted and cleared by the last one, so b cannot be interleaved into a read burst.
+  val locked = Reg(init = Bool(false))
+  // r always wins over b; b is forwarded only when no r beat is valid and no read burst is locked.
+  io.resp.valid := (io.b.valid && !locked) || io.r.valid
+  io.resp.bits := Mux(io.r.valid,
+    AtosResponse(io.r.bits), AtosResponse(io.b.bits))
+
+  io.b.ready := !locked && !io.r.valid && io.resp.ready
+  io.r.ready := io.resp.ready
+
+  when (io.r.fire() && !io.r.bits.last) { locked := Bool(true) }
+  when (io.r.fire() && io.r.bits.last) { locked := Bool(false) }
+}
+
+class AtosManagerConverter(implicit p: Parameters) extends AtosModule()(p) {
+  val io = new Bundle {
+    val atos = (new AtosIO).flip
+    val nasti = new NastiIO
+  }
+
+  val req_dec = Module(new AtosRequestDecoder)
+  val resp_enc = Module(new AtosResponseEncoder)
+
+  req_dec.io.req <> io.atos.req
+  io.atos.resp <> resp_enc.io.resp
+
+  io.nasti.ar <> req_dec.io.ar
+  io.nasti.aw <> req_dec.io.aw
+  io.nasti.w  <> req_dec.io.w
+
+  resp_enc.io.b <> io.nasti.b
+  resp_enc.io.r <> io.nasti.r
+}
+
+class AtosSerializedIO(w: Int)(implicit p: Parameters) extends ParameterizedBundle()(p) {
+  val req = Decoupled(Bits(width = w))
+  val resp = Decoupled(Bits(width = w)).flip
+  val clk = Bool(OUTPUT)
+  val clk_edge = Bool(OUTPUT)
+  override def cloneType = new AtosSerializedIO(w)(p).asInstanceOf[this.type]
+}
+
+class AtosSerdes(w: Int)(implicit p: Parameters) extends AtosModule()(p) {
+  val io = new Bundle {
+    val wide = (new AtosIO).flip
+    val narrow = new AtosSerializedIO(w)
+  }
+
+  val ser = Module(new Serializer(w, new AtosRequest))
+  ser.io.in <> io.wide.req
+  io.narrow.req <> ser.io.out
+
+  val des = Module(new Deserializer(w, new AtosResponse))
+  des.io.in <> io.narrow.resp
+  io.wide.resp <> des.io.out
+}
+
+class AtosDesser(w: Int)(implicit p: Parameters) extends AtosModule()(p) {
+  val io = new Bundle {
+    val narrow = new AtosSerializedIO(w).flip
+    val wide = new AtosIO
+  }
+
+  val des = Module(new Deserializer(w, new AtosRequest))
+  des.io.in <> io.narrow.req
+  io.wide.req <> des.io.out
+
+  val ser = Module(new Serializer(w, new AtosResponse))
+  ser.io.in <> io.wide.resp
+  io.narrow.resp <> ser.io.out
+}
--- a/junctions/src/main/scala/crossing.scala
+++ b/junctions/src/main/scala/crossing.scala
@@ -0,0 +1,150 @@
+package junctions
+import Chisel._
+
+class Crossing[T <: Data](gen: T, enq_sync: Boolean, deq_sync: Boolean) extends Bundle {
+    val enq = Decoupled(gen).flip()
+    val deq = Decoupled(gen)
+    val enq_clock = if (enq_sync) Some(Clock(INPUT)) else None
+    val deq_clock = if (deq_sync) Some(Clock(INPUT)) else None
+    val enq_reset = if (enq_sync) Some(Bool(INPUT))  else None
+    val deq_reset = if (deq_sync) Some(Bool(INPUT))  else None
+}
+
+// Output is 1 for one cycle after any edge of 'in' has passed through the register chain
+object AsyncHandshakePulse {
+  def apply(in: Bool, sync: Int): Bool = {
+    val syncv = RegInit(Vec.fill(sync+1){Bool(false)}) // chain of sync+1 registers, all reset to false
+    syncv.last := in // 'in' enters at the highest-index stage
+    (syncv.init zip syncv.tail).foreach { case (sink, source) => sink := source } // each stage takes the value of the next-higher stage
+    syncv(0) =/= syncv(1) // the two final stages differ for exactly one cycle per edge
+  }
+}
+
+class AsyncHandshakeSource[T <: Data](gen: T, sync: Int, clock: Clock, reset: Bool)
+    extends Module(_clock = clock, _reset = reset) {
+  val io = new Bundle {
+    // These come from the source clock domain
+    val enq  = Decoupled(gen).flip()
+    // These cross to the sink clock domain
+    val bits = gen.cloneType.asOutput
+    val push = Bool(OUTPUT)
+    val pop  = Bool(INPUT)
+  }
+  // State: one buffered item plus a toggle that flips once per accepted item.
+  val ready = RegInit(Bool(true)) // true while a new item may be accepted
+  val bits = Reg(gen) // the item currently being handed over
+  val push = RegInit(Bool(false)) // level changes once for every accepted item
+
+  io.enq.ready := ready
+  io.bits := bits
+  io.push := push
+  // Any edge on io.pop, seen through the register chain, frees the source for the next item.
+  val pop = AsyncHandshakePulse(io.pop, sync)
+  assert (!pop || !ready) // a pop must only arrive while an item is outstanding
+
+  when (pop) {
+    ready := Bool(true)
+  }
+
+  when (io.enq.fire()) { // accept: latch the item, flip push, and block until it is popped
+    ready := Bool(false)
+    bits := io.enq.bits
+    push := !push
+  }
+}
+
+class AsyncHandshakeSink[T <: Data](gen: T, sync: Int, clock: Clock, reset: Bool) 
+    extends Module(_clock = clock, _reset = reset) {
+  val io = new Bundle {
+    // These cross to the source clock domain
+    val bits = gen.cloneType.asInput
+    val push = Bool(INPUT)
+    val pop  = Bool(OUTPUT)
+    // These go to the sink clock domain
+    val deq = Decoupled(gen)
+  }
+  // State: one buffered item plus a toggle that flips once per delivered item.
+  val valid = RegInit(Bool(false)) // true while a captured item waits to be dequeued
+  val bits  = Reg(gen) // the captured item
+  val pop   = RegInit(Bool(false)) // level changes once for every dequeued item
+
+  io.deq.valid := valid
+  io.deq.bits  := bits
+  io.pop := pop
+  // Any edge on io.push, seen through the register chain, means io.bits carries a new item to capture.
+  val push = AsyncHandshakePulse(io.push, sync)
+  assert (!push || !valid) // a push must not arrive while the previous item is still held
+
+  when (push) {
+    valid := Bool(true)
+    bits  := io.bits
+  }
+
+  when (io.deq.fire()) { // deliver: drop the item and flip pop to acknowledge it
+    valid := Bool(false)
+    pop := !pop
+  }
+}
+
+class AsyncHandshake[T <: Data](gen: T, sync: Int = 2) extends Module {
+  val io = new Crossing(gen, true, true)
+  require (sync >= 2)
+
+  val source = Module(new AsyncHandshakeSource(gen, sync, io.enq_clock.get, io.enq_reset.get))
+  val sink   = Module(new AsyncHandshakeSink  (gen, sync, io.deq_clock.get, io.deq_reset.get))
+
+  source.io.enq <> io.enq
+  io.deq <> sink.io.deq
+
+  sink.io.bits := source.io.bits
+  sink.io.push := source.io.push
+  source.io.pop := sink.io.pop
+}
+
+class AsyncDecoupledTo[T <: Data](gen: T, depth: Int = 0, sync: Int = 2) extends Module {
+  val io = new Crossing(gen, false, true)
+  // The enqueue side runs on this module's own clock/reset; the dequeue side runs on io.deq_clock/io.deq_reset.
+  // TODO: depth is currently ignored and a handshake is always used; intended: depth == 0 -> Handshake, otherwise AsyncFIFO
+  val crossing = Module(new AsyncHandshake(gen, sync)).io
+  crossing.enq_clock.get := clock
+  crossing.enq_reset.get := reset
+  crossing.enq <> io.enq
+  crossing.deq_clock.get := io.deq_clock.get
+  crossing.deq_reset.get := io.deq_reset.get
+  io.deq <> crossing.deq
+}
+
+object AsyncDecoupledTo {
+  // source is in our clock domain, output is in the 'to' clock domain
+  def apply[T <: Data](to_clock: Clock, to_reset: Bool, source: DecoupledIO[T], depth: Int = 0, sync: Int = 2): DecoupledIO[T] = {
+    val to = Module(new AsyncDecoupledTo(source.bits, depth, sync))
+    to.io.deq_clock.get := to_clock
+    to.io.deq_reset.get := to_reset
+    to.io.enq <> source
+    to.io.deq
+  }
+}
+
+class AsyncDecoupledFrom[T <: Data](gen: T, depth: Int = 0, sync: Int = 2) extends Module {
+  val io = new Crossing(gen, true, false)
+  // The enqueue side runs on io.enq_clock/io.enq_reset; the dequeue side runs on this module's own clock/reset.
+  // TODO: depth is currently ignored and a handshake is always used; intended: depth == 0 -> Handshake, otherwise AsyncFIFO
+  val crossing = Module(new AsyncHandshake(gen, sync)).io
+  crossing.enq_clock.get := io.enq_clock.get
+  crossing.enq_reset.get := io.enq_reset.get
+  crossing.enq <> io.enq
+  crossing.deq_clock.get := clock
+  crossing.deq_reset.get := reset
+  io.deq <> crossing.deq
+}
+
+object AsyncDecoupledFrom {
+  // source is in the 'from' clock domain, output is in our clock domain
+  def apply[T <: Data](from_clock: Clock, from_reset: Bool, source: DecoupledIO[T], depth: Int = 0, sync: Int = 2): DecoupledIO[T] = {
+    val from = Module(new AsyncDecoupledFrom(source.bits, depth, sync))
+    from.io.enq_clock.get := from_clock
+    from.io.enq_reset.get := from_reset
+    from.io.enq <> source
+    from.io.deq
+  }
+}
--- a/junctions/src/main/scala/hasti.scala
+++ b/junctions/src/main/scala/hasti.scala
@@ -0,0 +1,549 @@
+package junctions
+
+import Chisel._
+import cde.{Parameters, Field}
+
+object HastiConstants
+{
+  // Values for htrans
+  val SZ_HTRANS     = 2
+  val HTRANS_IDLE   = UInt(0, SZ_HTRANS) // No transfer requested, not in a burst
+  val HTRANS_BUSY   = UInt(1, SZ_HTRANS) // No transfer requested, in a burst
+  val HTRANS_NONSEQ = UInt(2, SZ_HTRANS) // First (potentially only) request in a burst
+  val HTRANS_SEQ    = UInt(3, SZ_HTRANS) // Following requests in a burst
+
+  // Values for hburst
+  val SZ_HBURST     = 3
+  val HBURST_SINGLE = UInt(0, SZ_HBURST) // Single access (no burst)
+  val HBURST_INCR   = UInt(1, SZ_HBURST) // Incrementing burst of arbitrary length, not crossing 1KB
+  val HBURST_WRAP4  = UInt(2, SZ_HBURST) // 4-beat wrapping burst
+  val HBURST_INCR4  = UInt(3, SZ_HBURST) // 4-beat incrementing burst
+  val HBURST_WRAP8  = UInt(4, SZ_HBURST) // 8-beat wrapping burst
+  val HBURST_INCR8  = UInt(5, SZ_HBURST) // 8-beat incrementing burst
+  val HBURST_WRAP16 = UInt(6, SZ_HBURST) // 16-beat wrapping burst
+  val HBURST_INCR16 = UInt(7, SZ_HBURST) // 16-beat incrementing burst
+
+  // Values for hresp
+  val SZ_HRESP      = 1
+  val HRESP_OKAY    = UInt(0, SZ_HRESP)
+  val HRESP_ERROR   = UInt(1, SZ_HRESP)
+
+  // Values for hsize are identical to TileLink MT_SZ
+  // i.e. a transfer is 8*2^hsize bits wide; SZ_HSIZE is the width of the hsize field
+  val SZ_HSIZE = 3
+  
+  // Values for hprot (a bitmask)
+  val SZ_HPROT = 4
+  def HPROT_DATA       = UInt("b0001") // Data access or Opcode fetch
+  def HPROT_PRIVILEGED = UInt("b0010") // Privileged or User access
+  def HPROT_BUFFERABLE = UInt("b0100") // Bufferable or non-bufferable
+  def HPROT_CACHEABLE  = UInt("b1000") // Cacheable or non-cacheable
+
+  def dgate(valid: Bool, b: UInt) = Fill(b.getWidth, valid) & b
+}
+
+import HastiConstants._
+
+case class HastiParameters(dataBits: Int, addrBits: Int)
+case object HastiId extends Field[String]
+case class HastiKey(id: String) extends Field[HastiParameters]
+
+trait HasHastiParameters {
+  implicit val p: Parameters
+  val hastiParams = p(HastiKey(p(HastiId)))
+  val hastiAddrBits = hastiParams.addrBits
+  val hastiDataBits = hastiParams.dataBits
+  val hastiDataBytes = hastiDataBits/8
+  val hastiAlignment = log2Ceil(hastiDataBytes)
+}
+
+abstract class HastiModule(implicit val p: Parameters) extends Module
+  with HasHastiParameters
+abstract class HastiBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
+  with HasHastiParameters
+
+class HastiMasterIO(implicit p: Parameters) extends HastiBundle()(p) {
+  val htrans    = UInt(OUTPUT, SZ_HTRANS)
+  val hmastlock = Bool(OUTPUT)
+  val haddr     = UInt(OUTPUT, hastiAddrBits)
+  val hwrite    = Bool(OUTPUT)
+  val hburst    = UInt(OUTPUT, SZ_HBURST)
+  val hsize     = UInt(OUTPUT, SZ_HSIZE)
+  val hprot     = UInt(OUTPUT, SZ_HPROT)
+
+  val hwdata = Bits(OUTPUT, hastiDataBits)
+  val hrdata = Bits(INPUT,  hastiDataBits)
+
+  val hready = Bool(INPUT)
+  val hresp  = UInt(INPUT, SZ_HRESP)
+
+  def isNSeq(dummy:Int=0) = htrans === HTRANS_NONSEQ // SEQ does not start a NEW request
+  def isHold(dummy:Int=0) = htrans === HTRANS_BUSY || htrans === HTRANS_SEQ
+  def isIdle(dummy:Int=0) = htrans === HTRANS_IDLE || htrans === HTRANS_BUSY
+}
+
+class HastiSlaveIO(implicit p: Parameters) extends HastiBundle()(p) {
+  val htrans    = UInt(INPUT, SZ_HTRANS)
+  val hmastlock = Bool(INPUT)
+  val haddr     = UInt(INPUT, hastiAddrBits)
+  val hwrite    = Bool(INPUT)
+  val hburst    = UInt(INPUT, SZ_HBURST)
+  val hsize     = UInt(INPUT, SZ_HSIZE)
+  val hprot     = UInt(INPUT, SZ_HPROT)
+
+  val hwdata = Bits(INPUT,  hastiDataBits)
+  val hrdata = Bits(OUTPUT, hastiDataBits)
+
+  val hsel   = Bool(INPUT)
+  val hready = Bool(OUTPUT)
+  val hresp  = UInt(OUTPUT, SZ_HRESP)
+}
+
+/* A diverted master is told hready when his address phase goes nowhere.
+ * In this case, we buffer his address phase request and replay it later.
+ * NOTE: this must optimize to nothing when divert is constantly false.
+ */
+class MasterDiversion(implicit p: Parameters) extends HastiModule()(p) {
+  val io = new Bundle {
+    val in     = (new HastiMasterIO).flip
+    val out    = (new HastiMasterIO)
+    val divert = Bool(INPUT)
+  }
+  // full marks that buffer holds a diverted request which is being replayed on io.out.
+  val full   = Reg(init = Bool(false))
+  val buffer = Reg(new HastiMasterIO)
+  // The held request is released when io.out.hready is seen; a divert in the same cycle re-arms it because it is assigned later.
+  when (io.out.hready) {
+    full := Bool(false)
+  }
+  when (io.divert) {
+    full := Bool(true)
+    buffer := io.in
+  }
+  
+  // If the master is diverted, he must also have been told hready
+  assert (!io.divert || io.in.hready,
+    "Diverted but not ready");
+  
+  // Replay the request we diverted
+  io.out.htrans    := Mux(full, buffer.htrans,    io.in.htrans)
+  io.out.hmastlock := Mux(full, buffer.hmastlock, io.in.hmastlock)
+  io.out.haddr     := Mux(full, buffer.haddr,     io.in.haddr)
+  io.out.hwrite    := Mux(full, buffer.hwrite,    io.in.hwrite)
+  io.out.hburst    := Mux(full, buffer.hburst,    io.in.hburst)
+  io.out.hsize     := Mux(full, buffer.hsize,     io.in.hsize)
+  io.out.hprot     := Mux(full, buffer.hprot,     io.in.hprot)
+  io.out.hwdata    := Mux(full, buffer.hwdata,    io.in.hwdata)
+  
+  // Pass slave response back
+  io.in.hrdata := io.out.hrdata
+  io.in.hresp  := io.out.hresp
+  io.in.hready := io.out.hready && !full // Block master while we steal his address phase
+}
+
+/* Masters with lower index have priority over higher index masters.
+ * However, a lower priority master will retain control of a slave when EITHER:
+ *   1. a burst is in progress (switching slaves mid-burst violates AHB-lite at slave)
+ *   2. a transfer was waited (the standard forbids changing requests in this case)
+ *
+ * If a master raises hmastlock, it will be waited until no other master has inflight
+ * requests; then, it acquires exclusive control of the crossbar until hmastlock is low.
+ *
+ * To implement an AHB-lite crossbar, it is important to realize that requests and
+ * responses are coupled. Unlike modern bus protocols where the response data has flow
+ * control independent of the request data, in AHB-lite, both flow at the same time at
+ * the sole discretion of the slave via the hready signal. The address and data are
+ * delivered on two back-to-back cycles, the so-called address and data phases.
+ *
+ * Masters can only be connected to a single slave at a time. If a master had two different
+ * slave connections on the address and data phases, there would be two independent hready
+ * signals. An AHB-lite slave can assume that data flows when it asserts hready. If the data
+ * slave deasserts hready while the address slave asserts hready, the master is put in the
+ * impossible position of being in data phase on two slaves at once. For this reason, when
+ * a master issues back-to-back accesses to distinct slaves, we inject a pipeline bubble
+ * between the two requests to limit the master to just a single slave at a time.
+ *
+ * Conversely, a slave CAN have two masters attached to it. This is unproblematic, because
+ * the only signal which governs data flow is hready. Thus, both masters can be stalled
+ * safely by the single slave.
+ */
+class HastiXbar(nMasters: Int, addressMap: Seq[UInt=>Bool])(implicit p: Parameters) extends HastiModule()(p) {
+  // nMasters:   number of master ports on the crossbar
+  // addressMap: one predicate per slave port; slave s is addressed by a master
+  //             when addressMap(s)(haddr) evaluates to true
+  val io = new Bundle {
+    val masters = Vec(nMasters,        new HastiMasterIO).flip
+    val slaves  = Vec(addressMap.size, new HastiSlaveIO).flip
+  }
+  
+  val nSlaves = addressMap.size
+  
+  // Set up diversions in front of each master
+  val diversions = Seq.tabulate(nMasters) { m => Module(new MasterDiversion) }
+  (io.masters zip diversions) foreach { case (m, d) => d.io.in <> m }
+  
+  // Handy short-hand
+  val masters = diversions map (_.io.out)
+  val slaves  = io.slaves
+  
+  // Lock status of the crossbar
+  val lockedM = Reg(init = Vec.fill(nMasters)(Bool(false)))
+  val isLocked = lockedM.reduce(_ || _)
+  
+  // This matrix governs the master-slave connections in the address phase
+  // It is indexed by addressPhaseGrantSM(slave)(master)
+  // It is guaranteed to have at most one 'true' per column and per row
+  val addressPhaseGrantSM = Wire(Vec(nSlaves, Vec(nMasters, Bool())))
+  // This matrix governs the master-slave connections in the data phase
+  // It is guaranteed to have at most one 'true' per column and per row
+  val dataPhaseGrantSM    = Reg (init = Vec.fill(nSlaves)(Vec.fill(nMasters)(Bool(false))))
+  // This matrix is the union of the address and data phases.
+  // It is transposed with respect to the two previous matrices.
+  // It is guaranteed to contain at most one 'true' per master row.
+  // However, two 'true's per slave column are permitted.
+  val unionGrantMS = Vec.tabulate(nMasters) { m => Vec.tabulate(nSlaves) { s => 
+                       addressPhaseGrantSM(s)(m) || dataPhaseGrantSM(s)(m) } }
+  
+  // Confirm the guarantees made above
+  // justOnce folds over the vector, asserting at each step that the running OR
+  // and the next element are not both true; it returns the OR of all elements.
+  def justOnce(v: Vec[Bool]) = v.fold(Bool(false)) { case (p, v) =>
+    assert (!p || !v)
+    p || v
+  }
+  addressPhaseGrantSM foreach { s => justOnce(s) }
+  unionGrantMS        foreach { s => justOnce(s) }
+  
+  // Data phase follows address phase whenever the slave is ready
+  (slaves zip (dataPhaseGrantSM zip addressPhaseGrantSM)) foreach { case (s, (d, a)) =>
+    when (s.hready) { d := a }
+  }
+  
+  // Record the grant state from the previous cycle; needed in case we hold access
+  val priorAddressPhaseGrantSM = RegNext(addressPhaseGrantSM)
+  
+  // If a master says BUSY or SEQ, it is in the middle of a burst.
+  // In this case, it MUST stay attached to the same slave as before.
+  // Otherwise, it would violate the AHB-lite specification as seen by
+  // the slave, which is guaranteed a complete burst of the promised length.
+  // One case where this matters is preventing preemption of low-prio masters.
+  // NOTE: this exposes a slave to bad addresses when a master is buggy
+  val holdBurstM = Vec(masters map { _.isHold() })
+  
+  // Transform the burst hold requirement from master indexing to slave indexing
+  // We use the previous cycle's binding because the master continues the prior burst
+  val holdBurstS = Vec(priorAddressPhaseGrantSM map { m => Mux1H(m, holdBurstM) })
+  
+  // If a slave says !hready to a request, it must retain the same master next cycle.
+  // The AHB-lite specification requires that a waited transfer remain unchanged.
+  // If we preempted a waited master, the new master's request could potentially differ.
+  val holdBusyS = RegNext(Vec(slaves map { s => !s.hready && s.hsel }))
+  
+  // Combine the above two grounds to determine if the slave retains its prior master
+  val holdS = Vec((holdBurstS zip holdBusyS) map ({ case (a,b) => a||b }))
+  
+  // Determine which master addresses match which slaves
+  // Indexed as matchMS(master)(slave)
+  val matchMS = Vec(masters map { m => Vec(addressMap map { afn => afn(m.haddr) }) })
+  // Detect requests to nowhere; we need to allow progress in this case
+  val nowhereM = Vec(matchMS map { s => !s.reduce(_ || _) })
+  
+  // Detect if we need to inject a pipeline bubble between the master requests.
+  // Divert masters already granted a data phase different from next request.
+  // NOTE: if only one slave, matchMS is always true => bubble always false
+  //       => the diversion registers are optimized away as they are unread
+  // NOTE: bubble => dataPhase => have an hready signal
+  val bubbleM =
+    Vec.tabulate(nMasters) { m =>
+      Vec.tabulate(nSlaves) { s => dataPhaseGrantSM(s)(m) && !matchMS(m)(s) }
+      .reduce(_ || _) }
+  
+  // Block any request that requires bus ownership or conflicts with isLocked
+  // A master that already holds the lock (l) is never blocked.
+  val blockedM = 
+    Vec((lockedM zip masters) map { case(l, m) => !l && (isLocked || m.hmastlock) })
+  
+  // Requested access to slaves from masters (pre-arbitration)
+  // NOTE: isNSeq does NOT include SEQ; thus, masters who are midburst do not
+  // request access to a new slave. They stay tied to the old and do not get two.
+  // NOTE: if a master was waited, it must repeat the same request as last cycle;
+  // thus, it will request the same slave and not end up with two (unless buggy).
+  val NSeq = masters.map(_.isNSeq())
+  val requestSM = Vec.tabulate(nSlaves) { s => Vec.tabulate(nMasters) { m => 
+    matchMS(m)(s) && NSeq(m) && !bubbleM(m) && !blockedM(m) } }
+  
+  // Select at most one master request per slave (lowest index = highest priority)
+  val selectedRequestSM = Vec(requestSM map { m => Vec(PriorityEncoderOH(m)) })
+  
+  // Calculate new crossbar interconnect state
+  // Per slave: keep last cycle's grant when a hold applies, else take the arbitration winner
+  addressPhaseGrantSM := Vec((holdS zip (priorAddressPhaseGrantSM zip selectedRequestSM))
+                             map { case (h, (p, r)) => Mux(h, p, r) })
+
+  for (m <- 0 until nMasters) {
+    // If the master is connected to a slave, the slave determines hready.
+    // However, if no slave is connected, for progress report ready anyway, if:
+    //   bad address (swallow request) OR idle (permit stupid masters to move FSM)
+    val autoready = nowhereM(m) || masters(m).isIdle()
+    // The double XOR makes the result equal the granted slave's hready when a
+    // grant exists, and equal autoready otherwise (assuming Mux1H yields 0
+    // when no select bit is set).
+    val hready = Mux1H(unionGrantMS(m), slaves.map(_.hready ^ autoready)) ^ autoready
+    masters(m).hready := hready
+    // If we diverted a master, we need to absorb his address phase to replay later
+    diversions(m).io.divert := (bubbleM(m) || blockedM(m)) && NSeq(m) && hready
+  }
+  
+  // Master muxes (address and data phase are the same)
+  (masters zip unionGrantMS) foreach { case (m, g) => {
+    m.hrdata := Mux1H(g, slaves.map(_.hrdata))
+    m.hresp  := Mux1H(g, slaves.map(_.hresp))
+  } }
+  
+  // Slave address phase muxes
+  // hmastlock is driven from the crossbar-wide lock state rather than muxed
+  // per master; hsel is asserted whenever any master holds the address-phase grant.
+  (slaves zip addressPhaseGrantSM) foreach { case (s, g) => {
+    s.htrans    := Mux1H(g, masters.map(_.htrans))
+    s.haddr     := Mux1H(g, masters.map(_.haddr))
+    s.hmastlock := isLocked
+    s.hwrite    := Mux1H(g, masters.map(_.hwrite))
+    s.hsize     := Mux1H(g, masters.map(_.hsize))
+    s.hburst    := Mux1H(g, masters.map(_.hburst))
+    s.hprot     := Mux1H(g, masters.map(_.hprot))
+    s.hsel      := g.reduce(_ || _)
+  } }
+  
+  // Slave data phase muxes
+  (slaves zip dataPhaseGrantSM) foreach { case (s, g) => {
+    s.hwdata := Mux1H(g, masters.map(_.hwdata))
+  } }
+  
+  // When no master-slave connections are active, a master can take-over the bus
+  val canLock = !addressPhaseGrantSM.map({ v => v.reduce(_ || _) }).reduce(_ || _)
+  
+  // Lowest index highest priority for lock arbitration
+  val reqLock = masters.map(_.hmastlock)
+  val winLock = PriorityEncoderOH(reqLock)
+  
+  // Lock arbitration
+  // While locked, the owner keeps the lock only for as long as it keeps
+  // asserting hmastlock; when unlocked and idle, the arbitration winner takes it.
+  when (isLocked) {
+    lockedM := (lockedM zip reqLock) map { case (a,b) => a && b }
+  } .elsewhen (canLock) {
+    lockedM := winLock
+  }
+}
+
+/** A bus with one master port and `amap.size` slave ports.
+  *
+  * Implemented as a [[HastiXbar]] with a single master; `amap` supplies one
+  * address-match predicate per slave.
+  */
+class HastiBus(amap: Seq[UInt=>Bool])(implicit p: Parameters) extends HastiModule()(p) {
+  val io = new Bundle {
+    val master = new HastiMasterIO().flip
+    val slaves = Vec(amap.size, new HastiSlaveIO).flip
+  }
+
+  // One-master crossbar does all the work
+  val bar = Module(new HastiXbar(nMasters = 1, addressMap = amap))
+
+  // Slave side passes straight through; the lone master takes crossbar port 0
+  bar.io.slaves <> io.slaves
+  bar.io.masters.head <> io.master
+}
+
+/** Funnels `n` incoming slave-side ports into a single outgoing slave port.
+  *
+  * Built from a [[HastiXbar]] with `n` masters and one slave whose address
+  * predicate always matches.
+  */
+class HastiSlaveMux(n: Int)(implicit p: Parameters) extends HastiModule()(p) {
+  val io = new Bundle {
+    val ins = Vec(n, new HastiSlaveIO)
+    val out = new HastiSlaveIO().flip
+  }
+
+  // Single catch-all address region: every address selects the one slave
+  val amap = Seq({ (_:UInt) => Bool(true)})
+  val bar = Module(new HastiXbar(nMasters = n, addressMap = amap))
+
+  io.out <> bar.io.slaves.head
+  io.ins <> bar.io.masters
+}
+
+/** Adapts a slave-side port (`in`) to a master-side port (`out`).
+  *
+  * All signals are forwarded unchanged, except that the transfer type is
+  * forced to IDLE whenever the incoming port is not selected.
+  */
+class HastiSlaveToMaster(implicit p: Parameters) extends HastiModule()(p) {
+  val io = new Bundle {
+    val in  = new HastiSlaveIO
+    val out = new HastiMasterIO
+  }
+
+  // Responses travel back from the master-side port to the slave-side port
+  io.in.hready := io.out.hready
+  io.in.hresp  := io.out.hresp
+  io.in.hrdata := io.out.hrdata
+
+  // Requests travel forward; hsel has no master-side equivalent, so a
+  // deselected transfer is presented as IDLE
+  io.out.htrans    := Mux(io.in.hsel, io.in.htrans, HTRANS_IDLE)
+  io.out.haddr     := io.in.haddr
+  io.out.hwrite    := io.in.hwrite
+  io.out.hsize     := io.in.hsize
+  io.out.hburst    := io.in.hburst
+  io.out.hprot     := io.in.hprot
+  io.out.hmastlock := io.in.hmastlock
+  io.out.hwdata    := io.in.hwdata
+}
+
+// Converts requests arriving on a NASTI port into transfers on a HASTI master
+// port. Reads and writes are handled one burst at a time by a small FSM.
+class HastiMasterIONastiIOConverter(implicit p: Parameters) extends HastiModule()(p)
+    with HasNastiParameters {
+  val io = new Bundle {
+    val nasti = new NastiIO().flip
+    val hasti = new HastiMasterIO
+  }
+
+  // Addresses and data are passed through unmodified, so the widths must agree
+  require(hastiAddrBits == nastiXAddrBits)
+  require(hastiDataBits == nastiXDataBits)
+
+  // Buffers read data coming back from HASTI before it leaves on the NASTI R channel
+  val r_queue = Module(new Queue(new NastiReadDataChannel, 2, pipe = true))
+
+  val s_idle :: s_read :: s_write :: s_write_resp :: Nil = Enum(Bits(), 4)
+  val state = Reg(init = s_idle)
+
+  // Registered copy of the request currently being serviced
+  val addr = Reg(UInt(width = hastiAddrBits))
+  val id = Reg(UInt(width = nastiXIdBits))
+  val size = Reg(UInt(width = nastiXSizeBits))
+  val len = Reg(UInt(width = nastiXLenBits))
+  val data = Reg(UInt(width = nastiXDataBits))
+  // Set when a request is accepted and cleared after its first beat;
+  // selects NONSEQ rather than SEQ for htrans
+  val first = Reg(init = Bool(false))
+  // A read address phase (NONSEQ or SEQ) is being presented this cycle
+  val is_rtrans = (state === s_read) &&
+                  (io.hasti.htrans === HTRANS_SEQ ||
+                   io.hasti.htrans === HTRANS_NONSEQ)
+  // is_rtrans captured whenever hready is high: true while the previously
+  // accepted read is in its data phase
+  val rvalid = RegEnable(is_rtrans, Bool(false), io.hasti.hready)
+
+  // New requests are only accepted when idle; a pending write address takes
+  // priority over a read address
+  io.nasti.aw.ready := (state === s_idle)
+  io.nasti.ar.ready := (state === s_idle) && !io.nasti.aw.valid
+  io.nasti.w.ready := (state === s_write) && io.hasti.hready
+  io.nasti.b.valid := (state === s_write_resp)
+  io.nasti.b.bits := NastiWriteResponseChannel(id = id)
+  io.nasti.r <> r_queue.io.deq
+
+  // NOTE(review): `len` is decremented when a read address phase is accepted
+  // (see the last `when` below), while `last` here is evaluated when the
+  // corresponding data is enqueued one phase later -- confirm that this skew
+  // asserts `last` on the intended beat.
+  r_queue.io.enq.valid := io.hasti.hready && rvalid
+  r_queue.io.enq.bits := NastiReadDataChannel(
+    id = id,
+    data = io.hasti.hrdata,
+    last = (len === UInt(0)))
+
+  assert(!r_queue.io.enq.valid || r_queue.io.enq.ready,
+    "NASTI -> HASTI converter queue overflow")
+
+  // How many read requests have we not delivered a response for yet?
+  val pending_count = r_queue.io.count + rvalid
+
+  io.hasti.haddr := addr
+  io.hasti.hsize := size
+  io.hasti.hwrite := (state === s_write)
+  io.hasti.hburst := HBURST_INCR
+  io.hasti.hprot := UInt(0)
+  io.hasti.hwdata := data
+  io.hasti.hmastlock := Bool(false)
+  // Transfer type:
+  //  - write: NONSEQ/SEQ while write data is available, otherwise IDLE before
+  //    the first beat or BUSY mid-burst
+  //  - read:  NONSEQ for the first beat, SEQ while at most one response is
+  //    outstanding, BUSY otherwise (throttles reads so r_queue cannot overflow)
+  //  - any other state: IDLE
+  io.hasti.htrans := MuxLookup(state, HTRANS_IDLE, Seq(
+    s_write -> Mux(io.nasti.w.valid,
+      Mux(first, HTRANS_NONSEQ, HTRANS_SEQ),
+      Mux(first, HTRANS_IDLE, HTRANS_BUSY)),
+    s_read -> MuxCase(HTRANS_BUSY, Seq(
+      first -> HTRANS_NONSEQ,
+      (pending_count <= UInt(1)) -> HTRANS_SEQ))))
+
+  // Accept a write address; the burst length is not recorded because the
+  // write burst ends on w.bits.last
+  when (io.nasti.aw.fire()) {
+    first := Bool(true)
+    addr := io.nasti.aw.bits.addr
+    id := io.nasti.aw.bits.id
+    size := io.nasti.aw.bits.size
+    state := s_write
+  }
+
+  // Accept a read address
+  when (io.nasti.ar.fire()) {
+    first := Bool(true)
+    addr := io.nasti.ar.bits.addr
+    id := io.nasti.ar.bits.id
+    size := io.nasti.ar.bits.size
+    len := io.nasti.ar.bits.len
+    state := s_read
+  }
+
+  // Each accepted write beat advances the address by the transfer size and
+  // latches the data, which is then driven on hwdata from the `data` register
+  when (io.nasti.w.fire()) {
+    first := Bool(false)
+    addr := addr + (UInt(1) << size)
+    data := io.nasti.w.bits.data
+    when (io.nasti.w.bits.last) { state := s_write_resp }
+  }
+
+  when (io.nasti.b.fire()) { state := s_idle }
+
+  // Each accepted read address phase advances the address and counts down the
+  // remaining beats; the FSM returns to idle once the final address is accepted
+  when (is_rtrans && io.hasti.hready) {
+    first := Bool(false)
+    addr := addr + (UInt(1) << size)
+    len := len - UInt(1)
+    when (len === UInt(0)) { state := s_idle }
+  }
+}
+
+// Test SRAM slave that inserts pseudo-random wait states.
+// depth: number of address bits decoded by the SRAM; the memory holds
+//        2^(depth - hastiAlignment) words of hastiDataBytes bytes each.
+class HastiTestSRAM(depth: Int)(implicit p: Parameters) extends HastiModule()(p) {
+  val io = new HastiSlaveIO
+  
+  // This is a test SRAM with random delays
+  // (bit 0 of a free-running LFSR; replace with Bool(true) for no wait states)
+  val ready = LFSR16(Bool(true))(0) // Bool(true)
+  
+  // Calculate the bitmask of which bytes are being accessed
+  // mask_decode(i) is true when the transfer size is at least 2^i bytes
+  val mask_decode = Vec.tabulate(hastiAlignment+1) (UInt(_) <= io.hsize)
+  // Byte i belongs to the (unshifted) access when the size covers i+1 bytes
+  val mask_wide   = Vec.tabulate(hastiDataBytes) { i => mask_decode(log2Up(i+1)) }
+  // Shift the byte mask to the lane selected by the low address bits
+  val mask_shift  = if (hastiAlignment == 0) UInt(1) else
+                    mask_wide.toBits().asUInt() << io.haddr(hastiAlignment-1,0)
+  
+  // The request had better have been aligned! (AHB-lite requires this)
+  if (hastiAlignment >= 1) {
+    assert (io.htrans === HTRANS_IDLE || io.htrans === HTRANS_BUSY ||
+      (io.haddr & mask_decode.toBits()(hastiAlignment,1).asUInt) === UInt(0),
+      "HASTI request not aligned")
+  }
+  
+  // The mask and address during the address phase
+  val a_request   = io.hsel && (io.htrans === HTRANS_NONSEQ || io.htrans === HTRANS_SEQ)
+  val a_mask      = Wire(UInt(width = hastiDataBytes))
+  val a_address   = io.haddr(depth-1, hastiAlignment)
+  val a_write     = io.hwrite
+
+  // for backwards compatibility with chisel2, we needed a static width in definition
+  a_mask := mask_shift(hastiDataBytes-1, 0)
+  
+  // The data phase signals
+  val d_read  = RegEnable(a_request && !a_write, Bool(false), ready)
+  val d_mask  = RegEnable(a_mask, ready && a_request)
+  val d_wdata = Vec.tabulate(hastiDataBytes) { i => io.hwdata(8*(i+1)-1, 8*i) }
+  
+  // AHB writes must occur during the data phase; this poses a structural
+  // hazard with reads which must occur during the address phase. To solve
+  // this problem, we delay the writes until there is a free cycle.
+  //
+  // The idea is to record the address information from address phase and
+  // then as soon as possible flush the pending write. This cannot be done
+  // on a cycle when there is an address phase read, but on any other cycle
+  // the write will execute. In the case of reads following a write, the
+  // result must bypass data from the pending write into the read if they
+  // happen to have matching address.
+  
+  // Remove this once HoldUnless is in chisel3
+  // Passes `in` through while `enable` is high; otherwise returns the value
+  // captured the last time `enable` was high.
+  def holdUnless[T <: Data](in : T, enable: Bool): T = Mux(!enable, RegEnable(in, enable), in)
+  
+  // Pending write?
+  val p_valid     = RegInit(Bool(false))
+  val p_address   = Reg(a_address)
+  val p_mask      = Reg(a_mask)
+  // High in the cycle after a write address phase was accepted, i.e. when
+  // hwdata carries the data for the pending write
+  val p_latch_d   = RegNext(ready && a_request && a_write, Bool(false))
+  val p_wdata     = holdUnless(d_wdata, p_latch_d)
+  
+  // Use single-ported memory with byte-write enable
+  val mem = SeqMem(1 << (depth-hastiAlignment), Vec(hastiDataBytes, Bits(width = 8)))
+  
+  // Decide if the SRAM port is used for reading or (potentially) writing
+  val read = ready && a_request && !a_write
+  // In case we are stalled, we need to hold the read data
+  val d_rdata = holdUnless(mem.read(a_address, read), RegNext(read))
+  // Whenever the port is not needed for reading, execute pending writes
+  when (!read) {
+    when (p_valid) { mem.write(p_address, p_wdata, p_mask.toBools) }
+    p_valid := Bool(false)
+  }
+  
+  // Record the request for later?
+  // (placed after the flush above so that a newly accepted write re-arms p_valid)
+  when (ready && a_request && a_write) {
+    p_valid   := Bool(true)
+    p_address := a_address
+    p_mask    := a_mask
+  }
+  
+  // Does the read need to be muxed with the previous write?
+  val a_bypass = a_address === p_address && p_valid
+  val d_bypass = RegEnable(a_bypass, ready && a_request)
+  
+  // Mux in data from the pending write
+  // (per byte: take the pending write data only where its mask bit is set)
+  val muxdata = Vec((p_mask.toBools zip (p_wdata zip d_rdata))
+                    map { case (m, (p, r)) => Mux(d_bypass && m, p, r) })
+  // Wipe out any data the master should not see (for testing)
+  val outdata = Vec((d_mask.toBools zip muxdata)
+                    map { case (m, p) => Mux(d_read && ready && m, p, Bits(0)) })
+
+  // Finally, the outputs
+  io.hrdata := outdata.toBits()
+  io.hready := ready
+  io.hresp  := HRESP_OKAY
+}
--- a/junctions/src/main/scala/memserdes.scala
+++ b/junctions/src/main/scala/memserdes.scala
@@ -0,0 +1,317 @@
+// See LICENSE for license details.
+
+package junctions
+import Chisel._
+import scala.math._
+import cde.{Parameters, Field}
+
+// Configuration keys for the memory interface (MIF); each holds an Int
+// and is read back through HasMIFParameters.
+case object MIFAddrBits extends Field[Int]
+case object MIFDataBits extends Field[Int]
+case object MIFTagBits extends Field[Int]
+case object MIFDataBeats extends Field[Int]
+
+// Mixin that looks up the MIF configuration values from the implicit
+// Parameters object `p` and exposes them as plain Ints.
+trait HasMIFParameters {
+  implicit val p: Parameters
+  val mifTagBits = p(MIFTagBits)     // width of the request/response tag
+  val mifAddrBits = p(MIFAddrBits)   // width of the request address
+  val mifDataBits = p(MIFDataBits)   // width of one data beat
+  val mifDataBeats = p(MIFDataBeats) // number of data beats counted per transfer
+}
+ 
+// Base classes for modules and bundles that need the MIF parameters in scope.
+abstract class MIFModule(implicit val p: Parameters) extends Module with HasMIFParameters
+abstract class MIFBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
+  with HasMIFParameters
+
+// Mixin adding a `data` field of mifDataBits bits.
+trait HasMemData extends HasMIFParameters {
+  val data = Bits(width = mifDataBits)
+}
+
+// Mixin adding an `addr` field of mifAddrBits bits.
+trait HasMemAddr extends HasMIFParameters {
+  val addr = UInt(width = mifAddrBits)
+}
+
+// Mixin adding a `tag` field of mifTagBits bits.
+trait HasMemTag extends HasMIFParameters {
+  val tag = UInt(width = mifTagBits)
+}
+
+// Memory request command: address, tag and a read/write flag.
+class MemReqCmd(implicit p: Parameters) extends MIFBundle()(p) with HasMemAddr with HasMemTag {
+  // true for a request that is followed by write data (see MemDesser and
+  // MemPipeIOMemIOConverter, which treat rw == true as a write)
+  val rw = Bool()
+}
+
+// Concrete payload bundles assembled from the mixins above:
+// tag only, data only, and data plus tag (the response format).
+class MemTag(implicit p: Parameters) extends MIFBundle()(p) with HasMemTag
+class MemData(implicit p: Parameters) extends MIFBundle()(p) with HasMemData
+class MemResp(implicit p: Parameters) extends MIFBundle()(p) with HasMemData with HasMemTag
+
+// Memory port with separate command and write-data request channels and a
+// response channel; all three are Decoupled (ready/valid), and the response
+// flows in the opposite direction to the requests.
+class MemIO(implicit p: Parameters) extends ParameterizedBundle()(p) {
+  val req_cmd  = Decoupled(new MemReqCmd)
+  val req_data = Decoupled(new MemData)
+  val resp     = Decoupled(new MemResp).flip
+}
+
+// Same channels as MemIO, except that the response is a Valid interface,
+// i.e. it has no ready signal and cannot be back-pressured.
+class MemPipeIO(implicit p: Parameters) extends ParameterizedBundle()(p) {
+  val req_cmd  = Decoupled(new MemReqCmd)
+  val req_data = Decoupled(new MemData)
+  val resp     = Valid(new MemResp).flip
+}
+
+// Narrow, serialized memory link carrying `w` bits per transfer in each
+// direction: a Decoupled request stream and a Valid response stream.
+class MemSerializedIO(w: Int)(implicit p: Parameters) extends ParameterizedBundle()(p) {
+  val req = Decoupled(Bits(width = w))
+  val resp = Valid(Bits(width = w)).flip
+  // cloneType is overridden so that copies are built with the same width `w`
+  override def cloneType = new MemSerializedIO(w)(p).asInstanceOf[this.type]
+}
+
+// Serializes requests arriving on a wide MemIO port into w-bit words on the
+// narrow link, and reassembles w-bit response words into wide responses.
+class MemSerdes(w: Int)(implicit p: Parameters) extends MIFModule
+{
+  val io = new Bundle {
+    val wide = new MemIO().flip
+    val narrow = new MemSerializedIO(w)
+  }
+  // Flattened bit widths of the command, write-data and response bundles
+  val abits = io.wide.req_cmd.bits.toBits.getWidth
+  val dbits = io.wide.req_data.bits.toBits.getWidth
+  val rbits = io.wide.resp.bits.getWidth
+
+  // Shift registers holding the outgoing and incoming payloads
+  val out_buf = Reg(Bits())
+  val in_buf = Reg(Bits())
+
+  val s_idle :: s_read_addr :: s_write_addr :: s_write_idle :: s_write_data :: Nil = Enum(UInt(), 5)
+  val state = Reg(init=s_idle)
+  // send_cnt counts w-bit words sent for the current payload;
+  // data_send_cnt counts write-data beats sent
+  val send_cnt = Reg(init=UInt(0, log2Up((max(abits, dbits)+w-1)/w)))
+  val data_send_cnt = Reg(init=UInt(0, log2Up(mifDataBeats)))
+  // The last word of a command / data beat is being accepted
+  val adone = io.narrow.req.ready && send_cnt === UInt((abits-1)/w)
+  val ddone = io.narrow.req.ready && send_cnt === UInt((dbits-1)/w)
+
+  // Each accepted narrow word shifts the next w bits into place
+  when (io.narrow.req.valid && io.narrow.req.ready) {
+    send_cnt := send_cnt + UInt(1)
+    out_buf := out_buf >> UInt(w)
+  }
+  // Load a newly accepted command or data beat into the output buffer
+  when (io.wide.req_cmd.valid && io.wide.req_cmd.ready) {
+    out_buf := io.wide.req_cmd.bits.toBits
+  }
+  when (io.wide.req_data.valid && io.wide.req_data.ready) {
+    out_buf := io.wide.req_data.bits.toBits
+  }
+
+  io.wide.req_cmd.ready := state === s_idle
+  io.wide.req_data.ready := state === s_write_idle
+  io.narrow.req.valid := state === s_read_addr || state === s_write_addr || state === s_write_data
+  io.narrow.req.bits := out_buf
+
+  // State transitions: a command is sent first; for writes (rw set) each of
+  // the mifDataBeats data beats is then accepted and sent in turn
+  when (state === s_idle && io.wide.req_cmd.valid) {
+    state := Mux(io.wide.req_cmd.bits.rw, s_write_addr, s_read_addr)
+  }
+  when (state === s_read_addr && adone) {
+    state := s_idle
+    send_cnt := UInt(0)
+  }
+  when (state === s_write_addr && adone) {
+    state := s_write_idle
+    send_cnt := UInt(0)
+  }
+  when (state === s_write_idle && io.wide.req_data.valid) {
+    state := s_write_data
+  }
+  when (state === s_write_data && ddone) {
+    data_send_cnt := data_send_cnt + UInt(1)
+    state := Mux(data_send_cnt === UInt(mifDataBeats-1), s_idle, s_write_idle)
+    send_cnt := UInt(0)
+  }
+
+  // Receive side: recv_cnt counts the words of the current response
+  val recv_cnt = Reg(init=UInt(0, log2Up((rbits+w-1)/w)))
+  val data_recv_cnt = Reg(init=UInt(0, log2Up(mifDataBeats)))
+  val resp_val = Reg(init=Bool(false))
+
+  // resp_val pulses for one cycle after the final word of a response arrives
+  resp_val := Bool(false)
+  when (io.narrow.resp.valid) {
+    recv_cnt := recv_cnt + UInt(1)
+    when (recv_cnt === UInt((rbits-1)/w)) {
+      recv_cnt := UInt(0)
+      data_recv_cnt := data_recv_cnt + UInt(1)
+      resp_val := Bool(true)
+    }
+    // New words enter at the top of in_buf; earlier words shift down by w bits
+    in_buf := Cat(io.narrow.resp.bits, in_buf((rbits+w-1)/w*w-1,w))
+  }
+
+  io.wide.resp.valid := resp_val
+  io.wide.resp.bits := io.wide.resp.bits.fromBits(in_buf)
+}
+
+// Port bundle of MemDesser: the narrow serialized link on one side
+// (flipped) and a wide MemIO port on the other.
+class MemDesserIO(w: Int)(implicit p: Parameters) extends ParameterizedBundle()(p) {
+  val narrow = new MemSerializedIO(w).flip
+  val wide = new MemIO
+}
+
+// Counterpart of MemSerdes: reassembles w-bit request words from the narrow
+// link into wide commands / data beats and serializes wide responses back
+// onto the narrow link.
+class MemDesser(w: Int)(implicit p: Parameters) extends Module // test rig side
+{
+  val io = new MemDesserIO(w)
+  // Flattened bit widths of the command, write-data and response bundles
+  val abits = io.wide.req_cmd.bits.toBits.getWidth
+  val dbits = io.wide.req_data.bits.toBits.getWidth
+  val rbits = io.wide.resp.bits.getWidth
+  val mifDataBeats = p(MIFDataBeats)
+
+  // in_buf and recv_cnt are sized for the response, so it must be the widest payload
+  require(dbits >= abits && rbits >= dbits)
+  val recv_cnt = Reg(init=UInt(0, log2Up((rbits+w-1)/w)))
+  val data_recv_cnt = Reg(init=UInt(0, log2Up(mifDataBeats)))
+  // The final word of a command / data beat / response is being transferred
+  val adone = io.narrow.req.valid && recv_cnt === UInt((abits-1)/w)
+  val ddone = io.narrow.req.valid && recv_cnt === UInt((dbits-1)/w)
+  val rdone = io.narrow.resp.valid && recv_cnt === UInt((rbits-1)/w)
+
+  val s_cmd_recv :: s_cmd :: s_data_recv :: s_data :: s_reply :: Nil = Enum(UInt(), 5)
+  val state = Reg(init=s_cmd_recv)
+
+  // recv_cnt is shared: it counts request words received and response words sent.
+  // New request words enter at the top of in_buf; earlier words shift down by w bits.
+  val in_buf = Reg(Bits())
+  when (io.narrow.req.valid && io.narrow.req.ready || io.narrow.resp.valid) {
+    recv_cnt := recv_cnt + UInt(1)
+    in_buf := Cat(io.narrow.req.bits, in_buf((rbits+w-1)/w*w-1,w))
+  }
+  io.narrow.req.ready := state === s_cmd_recv || state === s_data_recv
+
+  when (state === s_cmd_recv && adone) {
+    state := s_cmd
+    recv_cnt := UInt(0)
+  }
+  // A command with rw set is followed by write data; otherwise wait for the reply
+  when (state === s_cmd && io.wide.req_cmd.ready) {
+    state := Mux(io.wide.req_cmd.bits.rw, s_data_recv, s_reply)
+  }
+  when (state === s_data_recv && ddone) {
+    state := s_data
+    recv_cnt := UInt(0)
+  }
+  when (state === s_data && io.wide.req_data.ready) {
+    state := s_data_recv
+    when (data_recv_cnt === UInt(mifDataBeats-1)) {
+      state := s_cmd_recv
+    }
+    data_recv_cnt := data_recv_cnt + UInt(1)
+  }
+  when (rdone) { // state === s_reply
+    when (data_recv_cnt === UInt(mifDataBeats-1)) {
+      state := s_cmd_recv
+    }
+    recv_cnt := UInt(0)
+    data_recv_cnt := data_recv_cnt + UInt(1)
+  }
+
+  // Payloads shorter than the response occupy the top words of in_buf;
+  // shift them down so that they start at bit 0
+  val req_cmd = in_buf >> UInt(((rbits+w-1)/w - (abits+w-1)/w)*w)
+  io.wide.req_cmd.valid := state === s_cmd
+  io.wide.req_cmd.bits := io.wide.req_cmd.bits.fromBits(req_cmd)
+
+  io.wide.req_data.valid := state === s_data
+  io.wide.req_data.bits.data := in_buf >> UInt(((rbits+w-1)/w - (dbits+w-1)/w)*w)
+
+  // Wide responses are queued and sent out w bits at a time; the queue entry
+  // is released while its final word is being presented
+  val dataq = Module(new Queue(new MemResp, mifDataBeats))
+  dataq.io.enq <> io.wide.resp
+  dataq.io.deq.ready := recv_cnt === UInt((rbits-1)/w)
+
+  io.narrow.resp.valid := dataq.io.deq.valid
+  io.narrow.resp.bits := dataq.io.deq.bits.toBits >> (recv_cnt * UInt(w))
+}
+
+/** Arbitrates `arbN` inner MemIO ports onto a single outer MemIO port.
+  *
+  * Commands are granted round-robin. The index of the granted port is appended
+  * to the low bits of the outgoing tag so that responses can be routed back;
+  * the original tag is restored by shifting those bits out again. Write data
+  * is taken from the ports in the order in which their write commands were
+  * granted. With a single inner port the arbiter is a plain pass-through.
+  *
+  * @param arbN number of inner ports
+  */
+class MemIOArbiter(val arbN: Int)(implicit p: Parameters) extends MIFModule {
+  val io = new Bundle {
+    val inner = Vec(arbN, new MemIO).flip
+    val outer = new MemIO
+  }
+
+  if(arbN > 1) {
+    // Number of low tag bits reserved for the index of the originating port
+    val idBits = log2Up(arbN)
+
+    val cmd_arb = Module(new RRArbiter(new MemReqCmd, arbN))
+    // Remembers which port each granted write command came from
+    val choice_q = Module(new Queue(cmd_arb.io.chosen, 4))
+    val (data_cnt, data_done) = Counter(io.outer.req_data.fire(), mifDataBeats)
+
+    io.inner.map(_.req_cmd).zipWithIndex.zip(cmd_arb.io.in).map{ case ((req, id), arb) => {
+      arb.valid := req.valid
+      arb.bits := req.bits
+      // The id literal is given an explicit width: an unsized literal is only
+      // as wide as its value needs, which would place the original tag at a
+      // different bit offset for different ports while the response path
+      // below always strips exactly idBits bits.
+      arb.bits.tag := Cat(req.bits.tag, UInt(id, idBits))
+      req.ready := arb.ready
+    }}
+    // A command is only forwarded when its port index can also be recorded
+    io.outer.req_cmd.bits := cmd_arb.io.out.bits
+    io.outer.req_cmd.valid := cmd_arb.io.out.valid && choice_q.io.enq.ready
+    cmd_arb.io.out.ready := io.outer.req_cmd.ready && choice_q.io.enq.ready
+    choice_q.io.enq.bits := cmd_arb.io.chosen
+    choice_q.io.enq.valid := cmd_arb.io.out.fire() && cmd_arb.io.out.bits.rw
+
+    // Write data comes from the port at the head of choice_q; the entry is
+    // retired once all mifDataBeats beats have been transferred
+    io.outer.req_data.bits := io.inner(choice_q.io.deq.bits).req_data.bits
+    io.outer.req_data.valid := io.inner(choice_q.io.deq.bits).req_data.valid && choice_q.io.deq.valid
+    io.inner.map(_.req_data.ready).zipWithIndex.foreach {
+      case(r, i) => r := UInt(i) === choice_q.io.deq.bits && choice_q.io.deq.valid
+    }
+    choice_q.io.deq.ready := data_done
+
+    // Route each response to the port named by the low idBits bits of its tag
+    // and hand back the tag with those bits removed
+    io.outer.resp.ready := Bool(false)
+    for (i <- 0 until arbN) {
+      io.inner(i).resp.valid := Bool(false)
+      when(io.outer.resp.bits.tag(idBits-1,0).toUInt === UInt(i)) {
+        io.inner(i).resp.valid := io.outer.resp.valid
+        io.outer.resp.ready := io.inner(i).resp.ready
+      }
+      io.inner(i).resp.bits := io.outer.resp.bits
+      io.inner(i).resp.bits.tag := io.outer.resp.bits.tag >> UInt(idBits)
+    }
+  } else { io.outer <> io.inner.head }
+}
+
+/** Wraps a [[MemPipeIO]] port as a [[MemIO]] port.
+  *
+  * The request channels are wired through one-to-one. The response channel
+  * of the returned port is always ready, because the pipe-style response
+  * has no ready signal.
+  */
+object MemIOMemPipeIOConverter {
+  def apply(in: MemPipeIO)(implicit p: Parameters): MemIO = {
+    val memIO = Wire(new MemIO())
+
+    // Command channel
+    memIO.req_cmd.bits  := in.req_cmd.bits
+    memIO.req_cmd.valid := in.req_cmd.valid
+    in.req_cmd.ready    := memIO.req_cmd.ready
+
+    // Write-data channel
+    memIO.req_data.bits  := in.req_data.bits
+    memIO.req_data.valid := in.req_data.valid
+    in.req_data.ready    := memIO.req_data.ready
+
+    // Response channel: never back-pressured
+    memIO.resp.ready := Bool(true)
+    in.resp.bits     := memIO.resp.bits
+    in.resp.valid    := memIO.resp.valid
+
+    memIO
+  }
+}
+
+// Connects a MemIO requester (`cpu`) to a MemPipeIO memory (`mem`) whose
+// responses cannot be back-pressured. Responses are buffered in queues, and
+// read commands are only issued while enough buffer space is reserved.
+// numRequests: number of full read responses the buffers can hold.
+class MemPipeIOMemIOConverter(numRequests: Int)(implicit p: Parameters) extends MIFModule {
+  val io = new Bundle {
+    val cpu = new MemIO().flip
+    val mem = new MemPipeIO
+  }
+
+  val numEntries = numRequests * mifDataBeats
+  val size = log2Down(numEntries) + 1
+
+  // count tracks unreserved response-buffer entries: it starts at numEntries,
+  // drops by mifDataBeats when a read command is issued (dec) and rises by
+  // one for every response beat handed to the cpu (inc)
+  val inc = Wire(Bool())
+  val dec = Wire(Bool())
+  val count = Reg(init=UInt(numEntries, size))
+  // Enough free entries for one more complete read response
+  val watermark = count >= UInt(mifDataBeats)
+
+  when (inc && !dec) {
+    count := count + UInt(1)
+  }
+  when (!inc && dec) {
+    count := count - UInt(mifDataBeats)
+  }
+  when (inc && dec) {
+    count := count - UInt(mifDataBeats-1)
+  }
+
+  // Commands with rw set always pass; others (reads) only when space is available
+  val cmdq_mask = io.cpu.req_cmd.bits.rw || watermark
+
+  io.mem.req_cmd.valid := io.cpu.req_cmd.valid && cmdq_mask
+  io.cpu.req_cmd.ready := io.mem.req_cmd.ready && cmdq_mask
+  io.mem.req_cmd.bits := io.cpu.req_cmd.bits
+
+  io.mem.req_data <> io.cpu.req_data
+
+  // Have separate queues to allow for different mem implementations
+  // (enq.ready is not consulted: the count/watermark logic above is what
+  // keeps the queues from overflowing)
+  val resp_data_q = Module((new HellaQueue(numEntries)) { new MemData })
+  resp_data_q.io.enq.valid := io.mem.resp.valid
+  resp_data_q.io.enq.bits.data := io.mem.resp.bits.data
+
+  val resp_tag_q = Module((new HellaQueue(numEntries)) { new MemTag })
+  resp_tag_q.io.enq.valid := io.mem.resp.valid
+  resp_tag_q.io.enq.bits.tag := io.mem.resp.bits.tag
+
+  // A response is presented only when both queues have an entry
+  io.cpu.resp.valid := resp_data_q.io.deq.valid && resp_tag_q.io.deq.valid
+  io.cpu.resp.bits.data := resp_data_q.io.deq.bits.data
+  io.cpu.resp.bits.tag := resp_tag_q.io.deq.bits.tag
+  resp_data_q.io.deq.ready := io.cpu.resp.ready
+  resp_tag_q.io.deq.ready := io.cpu.resp.ready
+
+  inc := resp_data_q.io.deq.fire() && resp_tag_q.io.deq.fire()
+  dec := io.mem.req_cmd.fire() && !io.mem.req_cmd.bits.rw
+}
--- a/junctions/src/main/scala/nasti.scala
+++ b/junctions/src/main/scala/nasti.scala
@@ -0,0 +1,737 @@
+/// See LICENSE for license details.
+
+package junctions
+import Chisel._
+import scala.math.max
+import scala.collection.mutable.ArraySeq
+import cde.{Parameters, Field}
+
+// Configuration key under which the NASTI parameters are stored.
+case object NastiKey extends Field[NastiParameters]
+
+// Externally configurable NASTI widths: data bus, address bus and transaction id.
+case class NastiParameters(dataBits: Int, addrBits: Int, idBits: Int)
+
+/** Mixin exposing the NASTI signal widths.
+  *
+  * The data, address and id widths come from the [[NastiParameters]] stored
+  * under [[NastiKey]]; all remaining widths are fixed constants.
+  */
+trait HasNastiParameters {
+  implicit val p: Parameters
+  val nastiExternal = p(NastiKey)
+
+  // Configurable widths; the write strobe has one bit per data byte
+  val nastiXDataBits = nastiExternal.dataBits
+  val nastiWStrobeBits = nastiXDataBits / 8
+  val nastiXAddrBits = nastiExternal.addrBits
+
+  // Read and write ids share one configured width
+  val nastiWIdBits = nastiExternal.idBits
+  val nastiRIdBits = nastiExternal.idBits
+  val nastiXIdBits = max(nastiWIdBits, nastiRIdBits)
+
+  // Every channel carries a user field of the same width
+  val nastiXUserBits = 1
+  val nastiAWUserBits = nastiXUserBits
+  val nastiWUserBits = nastiXUserBits
+  val nastiBUserBits = nastiXUserBits
+  val nastiARUserBits = nastiXUserBits
+  val nastiRUserBits = nastiXUserBits
+
+  // Fixed field widths
+  val nastiXLenBits = 8
+  val nastiXSizeBits = 3
+  val nastiXBurstBits = 2
+  val nastiXCacheBits = 4
+  val nastiXProtBits = 3
+  val nastiXQosBits = 4
+  val nastiXRegionBits = 4
+  val nastiXRespBits = 2
+
+  /** Encodes a byte count as a size field: 2^k bytes maps to k for k in 0..7;
+    * any other byte count maps to b111.
+    */
+  def bytesToXSize(bytes: UInt) = MuxLookup(bytes, UInt("b111"),
+    (0 until 8).map { k => UInt(1 << k) -> UInt(k) })
+}
+
+// Base classes for modules and bundles that need the NASTI parameters in scope.
+abstract class NastiModule(implicit val p: Parameters) extends Module
+  with HasNastiParameters
+abstract class NastiBundle(implicit val p: Parameters) extends ParameterizedBundle()(p)
+  with HasNastiParameters
+
+// Marker base classes distinguishing channel payloads by direction of travel.
+abstract class NastiChannel(implicit p: Parameters) extends NastiBundle()(p)
+abstract class NastiMasterToSlaveChannel(implicit p: Parameters) extends NastiChannel()(p)
+abstract class NastiSlaveToMasterChannel(implicit p: Parameters) extends NastiChannel()(p)
+
+// Fields shared by the read-address and write-address channels.
+trait HasNastiMetadata extends HasNastiParameters {
+  val addr   = UInt(width = nastiXAddrBits)
+  val len    = UInt(width = nastiXLenBits)
+  val size   = UInt(width = nastiXSizeBits)
+  val burst  = UInt(width = nastiXBurstBits)
+  val lock   = Bool()
+  val cache  = UInt(width = nastiXCacheBits)
+  val prot   = UInt(width = nastiXProtBits)
+  val qos    = UInt(width = nastiXQosBits)
+  val region = UInt(width = nastiXRegionBits)
+}
+
+// Fields shared by the read-data and write-data channels: one data beat
+// plus a flag marking the final beat of a burst.
+trait HasNastiData extends HasNastiParameters {
+  val data = UInt(width = nastiXDataBits)
+  val last = Bool()
+}
+
+// Read half of a NASTI port: address channel out, data channel back.
+class NastiReadIO(implicit val p: Parameters) extends ParameterizedBundle()(p) {
+  val ar = Decoupled(new NastiReadAddressChannel)
+  val r  = Decoupled(new NastiReadDataChannel).flip
+}
+
+// Write half of a NASTI port: address and data channels out, response channel back.
+class NastiWriteIO(implicit val p: Parameters) extends ParameterizedBundle()(p) {
+  val aw = Decoupled(new NastiWriteAddressChannel)
+  val w  = Decoupled(new NastiWriteDataChannel)
+  val b  = Decoupled(new NastiWriteResponseChannel).flip
+}
+
+// Complete NASTI port with all five channels, declared from the master's
+// point of view: aw, w and ar are driven outward, b and r are flipped.
+class NastiIO(implicit val p: Parameters) extends ParameterizedBundle()(p) {
+  val aw = Decoupled(new NastiWriteAddressChannel)
+  val w  = Decoupled(new NastiWriteDataChannel)
+  val b  = Decoupled(new NastiWriteResponseChannel).flip
+  val ar = Decoupled(new NastiReadAddressChannel)
+  val r  = Decoupled(new NastiReadDataChannel).flip
+}
+
+// Common base of the two address channels; carries the HasNastiMetadata fields.
+class NastiAddressChannel(implicit p: Parameters) extends NastiMasterToSlaveChannel()(p)
+    with HasNastiMetadata
+
+// Common base of the slave-to-master channels; carries the response code
+// (see the RESP_* values in NastiConstants).
+class NastiResponseChannel(implicit p: Parameters) extends NastiSlaveToMasterChannel()(p) {
+  val resp = UInt(width = nastiXRespBits)
+}
+
+// Write-address (aw) payload: address metadata plus write id and user bits.
+class NastiWriteAddressChannel(implicit p: Parameters) extends NastiAddressChannel()(p) {
+  val id   = UInt(width = nastiWIdBits)
+  val user = UInt(width = nastiAWUserBits)
+}
+
+// Write-data (w) payload: data/last plus write id, byte strobe and user bits.
+class NastiWriteDataChannel(implicit p: Parameters) extends NastiMasterToSlaveChannel()(p)
+    with HasNastiData {
+  val id   = UInt(width = nastiWIdBits)
+  val strb = UInt(width = nastiWStrobeBits) // one bit per data byte
+  val user = UInt(width = nastiWUserBits)
+}
+
+// Write-response (b) payload: response code plus write id and user bits.
+class NastiWriteResponseChannel(implicit p: Parameters) extends NastiResponseChannel()(p) {
+  val id   = UInt(width = nastiWIdBits)
+  val user = UInt(width = nastiBUserBits)
+}
+
+// Read-address (ar) payload: address metadata plus read id and user bits.
+class NastiReadAddressChannel(implicit p: Parameters) extends NastiAddressChannel()(p) {
+  val id   = UInt(width = nastiRIdBits)
+  val user = UInt(width = nastiARUserBits)
+}
+
+// Read-data (r) payload: response code and data/last plus read id and user bits.
+class NastiReadDataChannel(implicit p: Parameters) extends NastiResponseChannel()(p)
+    with HasNastiData {
+  val id   = UInt(width = nastiRIdBits)
+  val user = UInt(width = nastiRUserBits)
+}
+
+// Literal encodings for the 2-bit `burst` and `resp` fields.
+object NastiConstants {
+  // Burst types
+  val BURST_FIXED = UInt("b00")
+  val BURST_INCR  = UInt("b01")
+  val BURST_WRAP  = UInt("b10")
+
+  // Response codes
+  val RESP_OKAY = UInt("b00")
+  val RESP_EXOKAY = UInt("b01")
+  val RESP_SLVERR = UInt("b10")
+  val RESP_DECERR = UInt("b11")
+}
+
+import NastiConstants._
+
+/** Factory for write-address payloads.
+  *
+  * The caller supplies id, address and size (and optionally length and burst
+  * type, defaulting to a single-beat incrementing burst); all remaining
+  * fields are tied to zero / false.
+  */
+object NastiWriteAddressChannel {
+  def apply(id: UInt, addr: UInt, size: UInt,
+      len: UInt = UInt(0), burst: UInt = BURST_INCR)
+      (implicit p: Parameters): NastiWriteAddressChannel = {
+    val aw = Wire(new NastiWriteAddressChannel)
+
+    // Caller-controlled fields
+    aw.id    := id
+    aw.addr  := addr
+    aw.size  := size
+    aw.len   := len
+    aw.burst := burst
+
+    // Fields this factory does not expose
+    aw.lock := Bool(false)
+    Seq(
+      aw.cache  -> UInt("b0000"),
+      aw.prot   -> UInt("b000"),
+      aw.qos    -> UInt("b0000"),
+      aw.region -> UInt("b0000"),
+      aw.user   -> UInt(0)
+    ) foreach { case (field, tieOff) => field := tieOff }
+
+    aw
+  }
+}
+
+/** Factory for read-address payloads.
+  *
+  * The caller supplies id, address and size (and optionally length and burst
+  * type, defaulting to a single-beat incrementing burst); all remaining
+  * fields are tied to zero / false.
+  */
+object NastiReadAddressChannel {
+  def apply(id: UInt, addr: UInt, size: UInt,
+      len: UInt = UInt(0), burst: UInt = BURST_INCR)
+      (implicit p: Parameters): NastiReadAddressChannel = {
+    val ar = Wire(new NastiReadAddressChannel)
+
+    // Caller-controlled fields
+    ar.id    := id
+    ar.addr  := addr
+    ar.size  := size
+    ar.len   := len
+    ar.burst := burst
+
+    // Fields this factory does not expose
+    ar.lock := Bool(false)
+    Seq(ar.cache, ar.prot, ar.qos, ar.region, ar.user) foreach { field =>
+      field := UInt(0)
+    }
+
+    ar
+  }
+}
+
+/** Factory for write-data payloads.
+  *
+  * When no strobe is given, every byte lane is enabled. `last` defaults to
+  * true and `id` to zero; the user field is tied to zero.
+  */
+object NastiWriteDataChannel {
+  def apply(data: UInt, strb: Option[UInt] = None,
+            last: Bool = Bool(true), id: UInt = UInt(0))
+           (implicit p: Parameters): NastiWriteDataChannel = {
+    val w = Wire(new NastiWriteDataChannel)
+
+    // Default strobe: all ones across the full strobe width
+    val strobe = strb.getOrElse(Fill(w.nastiWStrobeBits, UInt(1, 1)))
+
+    w.id   := id
+    w.data := data
+    w.strb := strobe
+    w.last := last
+    w.user := UInt(0)
+    w
+  }
+}
+
+/** Factory for read-data payloads.
+  *
+  * `last` defaults to true and `resp` to zero; the user field is tied to zero.
+  */
+object NastiReadDataChannel {
+  def apply(id: UInt, data: UInt, last: Bool = Bool(true), resp: UInt = UInt(0))(
+      implicit p: Parameters): NastiReadDataChannel = {
+    val r = Wire(new NastiReadDataChannel)
+
+    // Caller-controlled fields
+    r.resp := resp
+    r.last := last
+    r.data := data
+    r.id   := id
+
+    // Not exposed by this factory
+    r.user := UInt(0)
+
+    r
+  }
+}
+
+/** Factory for write-response channel wires. */
+object NastiWriteResponseChannel {
+  /** Creates a NastiWriteResponseChannel wire.
+   *  @param id transaction id being acknowledged
+   *  @param resp response code (defaults to zero)
+   *  @return the wire, with the user field tied to zero */
+  def apply(id: UInt, resp: UInt = UInt(0))(implicit p: Parameters) = {
+    val ack = Wire(new NastiWriteResponseChannel)
+    ack.id := id
+    ack.resp := resp
+    ack.user := UInt(0)
+    ack
+  }
+}
+
+/** Adapts a Nasti port (on which this module is the slave) onto a MemIO port.
+ *  Only full-width, full-block transfers are supported; the assertions below
+ *  enforce the size, length and strobe restrictions.
+ *  @param cacheBlockOffsetBits number of low-order address bits shifted off a
+ *  Nasti address to form the MemIO request address */
+class MemIONastiIOConverter(cacheBlockOffsetBits: Int)(implicit p: Parameters) extends MIFModule
+    with HasNastiParameters {
+  val io = new Bundle {
+    val nasti = (new NastiIO).flip
+    val mem = new MemIO
+  }
+
+  require(mifDataBits == nastiXDataBits, "Data sizes between LLC and MC don't agree")
+  // Counts MemIO response beats; the wrap pulse marks the last beat of a read.
+  val (mif_cnt_out, mif_wrap_out) = Counter(io.mem.resp.fire(), mifDataBeats)
+
+  assert(!io.nasti.aw.valid || io.nasti.aw.bits.size === UInt(log2Up(mifDataBits/8)),
+    "Nasti data size does not match MemIO data size")
+  assert(!io.nasti.ar.valid || io.nasti.ar.bits.size === UInt(log2Up(mifDataBits/8)),
+    "Nasti data size does not match MemIO data size")
+  assert(!io.nasti.aw.valid || io.nasti.aw.bits.len === UInt(mifDataBeats - 1),
+    "Nasti length does not match number of MemIO beats")
+  assert(!io.nasti.ar.valid || io.nasti.ar.bits.len === UInt(mifDataBeats - 1),
+    "Nasti length does not match number of MemIO beats")
+
+  // according to the spec, we can't send b until the last transfer on w
+  // b_ok drops when a write address is accepted and rises again on the last
+  // write-data beat; the second `when` wins if both fire in the same cycle.
+  val b_ok = Reg(init = Bool(true))
+  when (io.nasti.aw.fire()) { b_ok := Bool(false) }
+  when (io.nasti.w.fire() && io.nasti.w.bits.last) { b_ok := Bool(true) }
+
+  // Holds the ids of accepted writes until their B response is taken.
+  val id_q = Module(new Queue(UInt(width = nastiWIdBits), 2))
+  id_q.io.enq.valid := io.nasti.aw.valid && io.mem.req_cmd.ready
+  id_q.io.enq.bits := io.nasti.aw.bits.id
+  id_q.io.deq.ready := io.nasti.b.ready && b_ok
+
+  // A valid write address takes priority over a read address: it selects the
+  // address and tag, and it blocks ar.ready.
+  io.mem.req_cmd.bits.addr := Mux(io.nasti.aw.valid, io.nasti.aw.bits.addr, io.nasti.ar.bits.addr) >>
+                                UInt(cacheBlockOffsetBits)
+  io.mem.req_cmd.bits.tag := Mux(io.nasti.aw.valid, io.nasti.aw.bits.id, io.nasti.ar.bits.id)
+  io.mem.req_cmd.bits.rw := io.nasti.aw.valid
+  io.mem.req_cmd.valid := (io.nasti.aw.valid && id_q.io.enq.ready) || io.nasti.ar.valid
+  io.nasti.ar.ready := io.mem.req_cmd.ready && !io.nasti.aw.valid
+  io.nasti.aw.ready := io.mem.req_cmd.ready && id_q.io.enq.ready
+
+  // Write responses: always resp = 0, id taken from the head of id_q.
+  io.nasti.b.valid := id_q.io.deq.valid && b_ok
+  io.nasti.b.bits.id := id_q.io.deq.bits
+  io.nasti.b.bits.resp := UInt(0)
+
+  // Write data passes straight through to MemIO.
+  io.nasti.w.ready := io.mem.req_data.ready
+  io.mem.req_data.valid := io.nasti.w.valid
+  io.mem.req_data.bits.data := io.nasti.w.bits.data
+  assert(!io.nasti.w.valid || io.nasti.w.bits.strb.andR, "MemIO must write full cache line")
+
+  // Read data passes straight through; the MemIO tag becomes the Nasti id.
+  io.nasti.r.valid := io.mem.resp.valid
+  io.nasti.r.bits.data := io.mem.resp.bits.data
+  io.nasti.r.bits.last := mif_wrap_out
+  io.nasti.r.bits.id := io.mem.resp.bits.tag
+  io.nasti.r.bits.resp := UInt(0)
+  io.mem.resp.ready := io.nasti.r.ready
+}
+
+/** IO bundle of NastiArbiter.
+ *  @param arbN the number of master-side ports */
+class NastiArbiterIO(arbN: Int)(implicit p: Parameters) extends Bundle {
+  val master = Vec(arbN, new NastiIO).flip // flipped: the arbiter is the slave here
+  val slave = new NastiIO
+  // Explicit cloneType so copies of the bundle keep the same arbN.
+  override def cloneType =
+    new NastiArbiterIO(arbN).asInstanceOf[this.type]
+}
+
+/** Arbitrate among arbN masters requesting to a single slave.
+ *  Each master's index is appended as the low log2Up(arbN) bits of the
+ *  transaction id on the way out and used to steer responses on the way back.
+ *  With a single master the two ports are connected directly.
+ *  @param arbN the number of masters */
+class NastiArbiter(val arbN: Int)(implicit p: Parameters) extends NastiModule {
+  val io = new NastiArbiterIO(arbN)
+
+  if (arbN > 1) {
+    val arbIdBits = log2Up(arbN)
+
+    val ar_arb = Module(new RRArbiter(new NastiReadAddressChannel, arbN))
+    val aw_arb = Module(new RRArbiter(new NastiWriteAddressChannel, arbN))
+
+    // Low id bits of a response identify the master it belongs to.
+    val slave_r_arb_id = io.slave.r.bits.id(arbIdBits - 1, 0)
+    val slave_b_arb_id = io.slave.b.bits.id(arbIdBits - 1, 0)
+
+    // The write-data channel is locked to the master whose write address was
+    // last accepted, until that master's last data beat has been transferred.
+    val w_chosen = Reg(UInt(width = arbIdBits))
+    val w_done = Reg(init = Bool(true))
+
+    when (aw_arb.io.out.fire()) {
+      w_chosen := aw_arb.io.chosen
+      w_done := Bool(false)
+    }
+
+    when (io.slave.w.fire() && io.slave.w.bits.last) {
+      w_done := Bool(true)
+    }
+
+    for (i <- 0 until arbN) {
+      val m_ar = io.master(i).ar
+      val m_aw = io.master(i).aw
+      val m_r = io.master(i).r
+      val m_b = io.master(i).b
+      val a_ar = ar_arb.io.in(i)
+      val a_aw = aw_arb.io.in(i)
+      val m_w = io.master(i).w
+
+      // Bulk-connect, then override the id with the master index appended.
+      a_ar <> m_ar
+      a_ar.bits.id := Cat(m_ar.bits.id, UInt(i, arbIdBits))
+
+      a_aw <> m_aw
+      a_aw.bits.id := Cat(m_aw.bits.id, UInt(i, arbIdBits))
+
+      // Responses are valid only for the master named in the low id bits;
+      // the id is shifted back down before it is handed to the master.
+      m_r.valid := io.slave.r.valid && slave_r_arb_id === UInt(i)
+      m_r.bits := io.slave.r.bits
+      m_r.bits.id := io.slave.r.bits.id >> UInt(arbIdBits)
+
+      m_b.valid := io.slave.b.valid && slave_b_arb_id === UInt(i)
+      m_b.bits := io.slave.b.bits
+      m_b.bits.id := io.slave.b.bits.id >> UInt(arbIdBits)
+
+      m_w.ready := io.slave.w.ready && w_chosen === UInt(i) && !w_done
+    }
+
+    io.slave.r.ready := io.master(slave_r_arb_id).r.ready
+    io.slave.b.ready := io.master(slave_b_arb_id).b.ready
+
+    io.slave.w.bits := io.master(w_chosen).w.bits
+    io.slave.w.valid := io.master(w_chosen).w.valid && !w_done
+
+    io.slave.ar <> ar_arb.io.out
+
+    // A new write address is held back until the previous write burst is done.
+    io.slave.aw.bits <> aw_arb.io.out.bits
+    io.slave.aw.valid := aw_arb.io.out.valid && w_done
+    aw_arb.io.out.ready := io.slave.aw.ready && w_done
+
+  } else { io.slave <> io.master.head }
+}
+
+/** A slave that sends a decode error for every request it receives.
+ *  Reads are answered with len + 1 beats of zero data carrying RESP_DECERR.
+ *  Writes have their data drained and are then acknowledged with RESP_DECERR.
+ *  Every accepted address is also reported with a printf. */
+class NastiErrorSlave(implicit p: Parameters) extends NastiModule {
+  val io = (new NastiIO).flip
+
+  when (io.ar.fire()) { printf("Invalid read address %x\n", io.ar.bits.addr) }
+  when (io.aw.fire()) { printf("Invalid write address %x\n", io.aw.bits.addr) }
+
+  // Read path: hold one read address at a time while its beats are returned.
+  val r_queue = Module(new Queue(new NastiReadAddressChannel, 1))
+  r_queue.io.enq <> io.ar
+
+  val responding = Reg(init = Bool(false))
+  val beats_left = Reg(init = UInt(0, nastiXLenBits))
+
+  // Start a response burst once a request is queued and none is in flight.
+  when (!responding && r_queue.io.deq.valid) {
+    responding := Bool(true)
+    beats_left := r_queue.io.deq.bits.len
+  }
+
+  io.r.valid := r_queue.io.deq.valid && responding
+  io.r.bits.id := r_queue.io.deq.bits.id
+  io.r.bits.data := UInt(0)
+  io.r.bits.resp := RESP_DECERR
+  io.r.bits.last := beats_left === UInt(0)
+
+  // The request is retired only when its last beat has been taken.
+  r_queue.io.deq.ready := io.r.fire() && io.r.bits.last
+
+  when (io.r.fire()) {
+    when (beats_left === UInt(0)) {
+      responding := Bool(false)
+    } .otherwise {
+      beats_left := beats_left - UInt(1)
+    }
+  }
+
+  // Write path: after a write address is accepted, swallow write data until
+  // the beat marked last; addresses and responses are blocked meanwhile.
+  val draining = Reg(init = Bool(false))
+  io.w.ready := draining
+
+  when (io.aw.fire()) { draining := Bool(true) }
+  when (io.w.fire() && io.w.bits.last) { draining := Bool(false) }
+
+  // Remembers the id of the accepted write for its B response.
+  val b_queue = Module(new Queue(UInt(width = nastiWIdBits), 1))
+  b_queue.io.enq.valid := io.aw.valid && !draining
+  b_queue.io.enq.bits := io.aw.bits.id
+  io.aw.ready := b_queue.io.enq.ready && !draining
+  io.b.valid := b_queue.io.deq.valid && !draining
+  io.b.bits.id := b_queue.io.deq.bits
+  // Same encoding as before ("b11"), now named like the read channel's resp.
+  io.b.bits.resp := RESP_DECERR
+  b_queue.io.deq.ready := io.b.ready && !draining
+}
+
+/** IO bundle of NastiRouter.
+ *  @param nSlaves the number of slave-side ports */
+class NastiRouterIO(nSlaves: Int)(implicit p: Parameters) extends Bundle {
+  val master = (new NastiIO).flip // flipped: the router is the slave here
+  val slave = Vec(nSlaves, new NastiIO)
+  // Explicit cloneType so copies of the bundle keep the same nSlaves.
+  override def cloneType =
+    new NastiRouterIO(nSlaves).asInstanceOf[this.type]
+}
+
+/** Take a single Nasti master and route its requests to various slaves.
+ *  Requests whose address selects no slave go to an internal NastiErrorSlave.
+ *  @param nSlaves the number of slaves
+ *  @param routeSel a function which takes an address and produces
+ *  a one-hot encoded selection of the slave to send the request to
+ *  (it is applied to both read and write addresses) */
+class NastiRouter(nSlaves: Int, routeSel: UInt => UInt)(implicit p: Parameters)
+    extends NastiModule {
+
+  val io = new NastiRouterIO(nSlaves)
+
+  val ar_route = routeSel(io.master.ar.bits.addr)
+  val aw_route = routeSel(io.master.aw.bits.addr)
+
+  // Elaboration-time accumulators: each loop iteration ORs in one slave's
+  // contribution to the master-side ready signals.
+  var ar_ready = Bool(false)
+  var aw_ready = Bool(false)
+  var w_ready = Bool(false)
+
+  io.slave.zipWithIndex.foreach { case (s, i) =>
+    s.ar.valid := io.master.ar.valid && ar_route(i)
+    s.ar.bits := io.master.ar.bits
+    ar_ready = ar_ready || (s.ar.ready && ar_route(i))
+
+    s.aw.valid := io.master.aw.valid && aw_route(i)
+    s.aw.bits := io.master.aw.bits
+    aw_ready = aw_ready || (s.aw.ready && aw_route(i))
+
+    // Per-slave flag: set when this slave accepts a write address, cleared on
+    // its last write-data beat. The set wins if both happen in one cycle.
+    val chosen = Reg(init = Bool(false))
+    when (s.w.fire() && s.w.bits.last) { chosen := Bool(false) }
+    when (s.aw.fire()) { chosen := Bool(true) }
+
+    s.w.valid := io.master.w.valid && chosen
+    s.w.bits := io.master.w.bits
+    w_ready = w_ready || (s.w.ready && chosen)
+  }
+
+  // No route bit set means the address maps to no slave.
+  val r_invalid = !ar_route.orR
+  val w_invalid = !aw_route.orR
+
+  val err_slave = Module(new NastiErrorSlave)
+  err_slave.io.ar.valid := r_invalid && io.master.ar.valid
+  err_slave.io.ar.bits := io.master.ar.bits
+  err_slave.io.aw.valid := w_invalid && io.master.aw.valid
+  err_slave.io.aw.bits := io.master.aw.bits
+  // Write data is offered to the error slave unconditionally.
+  err_slave.io.w.valid := io.master.w.valid
+  err_slave.io.w.bits := io.master.w.bits
+
+  io.master.ar.ready := ar_ready || (r_invalid && err_slave.io.ar.ready)
+  io.master.aw.ready := aw_ready || (w_invalid && err_slave.io.aw.ready)
+  io.master.w.ready := w_ready || err_slave.io.w.ready
+
+  // Responses from all slaves plus the error slave (index nSlaves) are merged.
+  val b_arb = Module(new RRArbiter(new NastiWriteResponseChannel, nSlaves + 1))
+  val r_arb = Module(new JunctionsPeekingArbiter(
+    new NastiReadDataChannel, nSlaves + 1,
+    // we can unlock if it's the last beat
+    (r: NastiReadDataChannel) => r.last))
+
+  for (i <- 0 until nSlaves) {
+    b_arb.io.in(i) <> io.slave(i).b
+    r_arb.io.in(i) <> io.slave(i).r
+  }
+
+  b_arb.io.in(nSlaves) <> err_slave.io.b
+  r_arb.io.in(nSlaves) <> err_slave.io.r
+
+  io.master.b <> b_arb.io.out
+  io.master.r <> r_arb.io.out
+}
+
+/** Crossbar between multiple Nasti masters and slaves.
+ *  Built from one NastiRouter per master and, when there is more than one
+ *  master, one NastiArbiter per slave.
+ *  @param nMasters the number of Nasti masters
+ *  @param nSlaves the number of Nasti slaves
+ *  @param routeSel a function selecting the slave to route an address to */
+class NastiCrossbar(nMasters: Int, nSlaves: Int, routeSel: UInt => UInt)
+                   (implicit p: Parameters) extends NastiModule {
+  val io = new Bundle {
+    val masters = Vec(nMasters, new NastiIO).flip
+    val slaves = Vec(nSlaves, new NastiIO)
+  }
+
+  if (nMasters == 1) {
+    // Single master: a lone router is enough, no arbitration needed.
+    val router = Module(new NastiRouter(nSlaves, routeSel))
+    router.io.master <> io.masters.head
+    io.slaves <> router.io.slave
+  } else {
+    val routers = Vec.fill(nMasters) { Module(new NastiRouter(nSlaves, routeSel)).io }
+    val arbiters = Vec.fill(nSlaves) { Module(new NastiArbiter(nMasters)).io }
+
+    for (i <- 0 until nMasters) {
+      routers(i).master <> io.masters(i)
+    }
+
+    for (i <- 0 until nSlaves) {
+      // Arbiter i gathers output i of every router.
+      arbiters(i).master <> Vec(routers.map(r => r.slave(i)))
+      io.slaves(i) <> arbiters(i).slave
+    }
+  }
+}
+
+/** IO bundle shared by the Nasti interconnects.
+ *  @param nMasters the number of master-facing ports
+ *  @param nSlaves the number of slave-facing ports */
+class NastiInterconnectIO(val nMasters: Int, val nSlaves: Int)
+                         (implicit p: Parameters) extends Bundle {
+  /* This is a bit confusing. The interconnect is a slave to the masters and
+   * a master to the slaves. Hence why the declarations seem to be backwards. */
+  val masters = Vec(nMasters, new NastiIO).flip
+  val slaves = Vec(nSlaves, new NastiIO)
+  // Explicit cloneType so copies of the bundle keep the same port counts.
+  override def cloneType =
+    new NastiInterconnectIO(nMasters, nSlaves).asInstanceOf[this.type]
+}
+
+/** Base class for Nasti interconnects; subclasses supply the port counts. */
+abstract class NastiInterconnect(implicit p: Parameters) extends NastiModule()(p) {
+  val nMasters: Int
+  val nSlaves: Int
+
+  // lazy so that the subclass's nMasters/nSlaves are initialized before use
+  lazy val io = new NastiInterconnectIO(nMasters, nSlaves)
+}
+
+/** Interconnect built recursively from an address map: one crossbar for the
+ *  top-level entries, with a nested single-master interconnect for each entry
+ *  that is itself a non-empty AddrMap.
+ *  @param nMasters the number of masters
+ *  @param addrMap the address map describing the slaves */
+class NastiRecursiveInterconnect(val nMasters: Int, addrMap: AddrMap)
+    (implicit p: Parameters) extends NastiInterconnect()(p) {
+  /** Looks up the slave port for the named address-map entry. */
+  def port(name: String) = io.slaves(addrMap.port(name))
+  val nSlaves = addrMap.numSlaves
+  // One select bit per top-level entry. The list is reversed before Cat so
+  // that entry i ends up at bit i of the result.
+  val routeSel = (addr: UInt) =>
+    Cat(addrMap.entries.map(e => addrMap(e.name).containsAddress(addr)).reverse)
+
+  val xbar = Module(new NastiCrossbar(nMasters, addrMap.length, routeSel))
+  xbar.io.masters <> io.masters
+
+  // Each crossbar output contributes zero, one or several external slave
+  // ports, flattened in entry order.
+  io.slaves <> addrMap.entries.zip(xbar.io.slaves).flatMap {
+    case (entry, xbarSlave) => {
+      entry.region match {
+        // Empty submap: terminate with an error slave, no external port.
+        case submap: AddrMap if submap.entries.isEmpty =>
+          val err_slave = Module(new NastiErrorSlave)
+          err_slave.io <> xbarSlave
+          None
+        // Non-empty submap: recurse and expose the nested slave ports.
+        case submap: AddrMap =>
+          val ic = Module(new NastiRecursiveInterconnect(1, submap))
+          ic.io.masters.head <> xbarSlave
+          ic.io.slaves
+        // Leaf range: the crossbar output is the external port.
+        case r: MemRange =>
+          Some(xbarSlave)
+      }
+    }
+  }
+}
+
+/** Address arithmetic for interleaving blocks across memory channels.
+ *  The channel-select bits sit directly above the offset within a block of
+ *  MIFDataBits * MIFDataBeats / 8 bytes.
+ *  @param nChannels the number of memory channels */
+class ChannelHelper(nChannels: Int)
+    (implicit val p: Parameters) extends HasNastiParameters {
+
+  val dataBytes = p(MIFDataBits) * p(MIFDataBeats) / 8
+  val chanSelBits = log2Ceil(nChannels)
+  val selOffset = log2Up(dataBytes)
+  val blockOffset = selOffset + chanSelBits
+
+  /** Extracts the channel-select field of an address (zero if one channel). */
+  def getSelect(addr: UInt) =
+    if (nChannels <= 1) UInt(0)
+    else addr(blockOffset - 1, selOffset)
+
+  /** Returns the address with the channel-select field removed
+   *  (the address itself if there is only one channel). */
+  def getAddr(addr: UInt) =
+    if (nChannels <= 1) addr
+    else {
+      val aboveSelect = addr(nastiXAddrBits - 1, blockOffset)
+      val belowSelect = addr(selOffset - 1, 0)
+      Cat(aboveSelect, belowSelect)
+    }
+}
+
+/** Connects nBanksPerChannel * nChannels bank-side masters to nChannels
+ *  memory channels, one NastiArbiter per channel.
+ *  @param nBanksPerChannel the number of banks sharing each channel
+ *  @param nChannels the number of memory channels */
+class NastiMemoryInterconnect(
+    nBanksPerChannel: Int, nChannels: Int)
+    (implicit p: Parameters) extends NastiInterconnect()(p) {
+
+  val nBanks = nBanksPerChannel * nChannels
+  val nMasters = nBanks
+  val nSlaves = nChannels
+
+  val chanHelper = new ChannelHelper(nChannels)
+  /** Bulk-connects inner to outer, then overrides the outgoing read and
+   *  write addresses with the channel-select bits removed. */
+  def connectChannel(outer: NastiIO, inner: NastiIO) {
+    outer <> inner
+    outer.ar.bits.addr := chanHelper.getAddr(inner.ar.bits.addr)
+    outer.aw.bits.addr := chanHelper.getAddr(inner.aw.bits.addr)
+  }
+
+  for (i <- 0 until nChannels) {
+    /* Bank assignments to channels are strided so that consecutive banks
+     * map to different channels. That way, consecutive cache lines also
+     * map to different channels */
+    val banks = (i until nBanks by nChannels).map(j => io.masters(j))
+
+    val channelArb = Module(new NastiArbiter(nBanksPerChannel))
+    channelArb.io.master <> banks
+    connectChannel(io.slaves(i), channelArb.io.slave)
+  }
+}
+
+/** Allows users to switch between various memory configurations.  Note that
+  * this is a dangerous operation: not only does switching the select input to
+  * this module violate Nasti, it also causes the memory of the machine to
+  * become garbled.  It's expected that select only changes at boot time, as
+  * part of the memory controller configuration.
+  * @param nBanks the number of master-facing (bank) ports
+  * @param maxMemChannels the number of slave-facing (channel) ports
+  * @param nConfigs the number of selectable configurations; sets the width
+  * of the select input */
+class NastiMemorySelectorIO(val nBanks: Int, val maxMemChannels: Int, nConfigs: Int)
+                           (implicit p: Parameters)
+                           extends NastiInterconnectIO(nBanks, maxMemChannels) {
+  val select  = UInt(INPUT, width = log2Up(nConfigs))
+  // Explicit cloneType so copies of the bundle keep the same parameters.
+  override def cloneType =
+    new NastiMemorySelectorIO(nMasters, nSlaves, nConfigs).asInstanceOf[this.type]
+}
+
+/** Instantiates one NastiMemoryInterconnect per entry of configs and uses the
+ *  select input to choose which one is connected to the external ports.
+ *  @param nBanks the number of master-facing (bank) ports
+ *  @param maxMemChannels the number of slave-facing (channel) ports
+ *  @param configs the channel count of each selectable configuration; each
+ *  must be positive, at most maxMemChannels, and divide nBanks evenly */
+class NastiMemorySelector(nBanks: Int, maxMemChannels: Int, configs: Seq[Int])
+                         (implicit p: Parameters)
+                         extends NastiInterconnect()(p) {
+  val nMasters = nBanks
+  val nSlaves  = maxMemChannels
+  val nConfigs = configs.size
+
+  override lazy val io = new NastiMemorySelectorIO(nBanks, maxMemChannels, nConfigs)
+
+  /** Forwards one decoupled channel from up to dn while active is true;
+   *  otherwise the previously assigned defaults remain in effect. */
+  def muxOnSelect(up: DecoupledIO[Bundle], dn: DecoupledIO[Bundle], active: Bool): Unit = {
+    when (active) { dn.bits  := up.bits  }
+    when (active) { up.ready := dn.ready }
+    when (active) { dn.valid := up.valid }
+  }
+
+  /** Forwards all five channels of a Nasti port; the response channels
+   *  (b and r) flow in the opposite direction, from dn to up. */
+  def muxOnSelect(up: NastiIO, dn: NastiIO, active: Bool): Unit = {
+    muxOnSelect(up.aw, dn.aw, active)
+    muxOnSelect(up.w,  dn.w,  active)
+    muxOnSelect(dn.b,  up.b,  active)
+    muxOnSelect(up.ar, dn.ar, active)
+    muxOnSelect(dn.r,  up.r,  active)
+  }
+
+  /** Forwards every port of up to the port of dn with the same index. */
+  def muxOnSelect(up: Vec[NastiIO], dn: Vec[NastiIO], active: Bool) : Unit = {
+    for (i <- 0 until up.size)
+      muxOnSelect(up(i), dn(i), active)
+  }
+
+  /* Disconnects a vector of Nasti ports, which involves setting them to
+   * invalid.  Due to Chisel reasons, we need to also set the bits to 0 (since
+   * there can't be any unconnected inputs). */
+  def disconnectSlave(slave: Vec[NastiIO]) = {
+    slave.foreach{ m =>
+      m.aw.valid := Bool(false)
+      m.aw.bits  := m.aw.bits.fromBits( UInt(0) )
+      m.w.valid  := Bool(false)
+      m.w.bits   := m.w.bits.fromBits( UInt(0) )
+      m.b.ready  := Bool(false)
+      m.ar.valid := Bool(false)
+      m.ar.bits  := m.ar.bits.fromBits( UInt(0) )
+      m.r.ready  := Bool(false)
+    }
+  }
+
+  /* Counterpart of disconnectSlave for the signals driven towards a master:
+   * readys are deasserted and the response channels are invalid and zeroed. */
+  def disconnectMaster(master: Vec[NastiIO]) = {
+    master.foreach{ m =>
+      m.aw.ready := Bool(false)
+      m.w.ready  := Bool(false)
+      m.b.valid  := Bool(false)
+      m.b.bits   := m.b.bits.fromBits( UInt(0) )
+      m.ar.ready := Bool(false)
+      m.r.valid  := Bool(false)
+      m.r.bits   := m.r.bits.fromBits( UInt(0) )
+    }
+  }
+
+  /* Provides default wires on all our outputs. */
+  disconnectMaster(io.masters)
+  disconnectSlave(io.slaves)
+
+  /* Constructs interconnects for each of the layouts suggested by the
+   * configuration and switches between them based on the select input. */
+  configs.zipWithIndex.foreach{ case (nChannels, select) =>
+    // Fail early with a clear message; without these checks a bad entry
+    // surfaces as a division by zero or an out-of-range Vec index below.
+    require(nChannels > 0 && nChannels <= maxMemChannels,
+      s"Memory config $select: channel count $nChannels must be in 1..$maxMemChannels")
+    require(nBanks % nChannels == 0,
+      s"Memory config $select: $nBanks banks cannot be split evenly over $nChannels channels")
+    val nBanksPerChannel = nBanks / nChannels
+    val ic = Module(new NastiMemoryInterconnect(nBanksPerChannel, nChannels))
+    disconnectMaster(ic.io.slaves)
+    disconnectSlave(ic.io.masters)
+    muxOnSelect(   io.masters, ic.io.masters, io.select === UInt(select))
+    muxOnSelect(ic.io.slaves,     io.slaves,  io.select === UInt(select))
+  }
+}
+
+/** Connects one Nasti master to exactly one of nRoutes slaves, chosen by the
+ *  select input.
+ *  @param nRoutes the number of slave ports */
+class NastiMemoryDemux(nRoutes: Int)(implicit p: Parameters) extends NastiModule()(p) {
+  val io = new Bundle {
+    val master = (new NastiIO).flip
+    val slaves = Vec(nRoutes, new NastiIO)
+    val select = UInt(INPUT, log2Up(nRoutes))
+  }
+
+  /** Request direction (in = master side, out = slave idx): the payload is
+   *  always forwarded, valid and ready only when slave idx is selected. */
+  def connectReqChannel[T <: Data](idx: Int, out: DecoupledIO[T], in: DecoupledIO[T]) {
+    out.valid := in.valid && io.select === UInt(idx)
+    out.bits := in.bits
+    when (io.select === UInt(idx)) { in.ready := out.ready }
+  }
+
+  /** Response direction (in = slave idx, out = master side): valid and
+   *  payload override the defaults only when slave idx is selected. */
+  def connectRespChannel[T <: Data](idx: Int, out: DecoupledIO[T], in: DecoupledIO[T]) {
+    when (io.select === UInt(idx)) { out.valid := in.valid }
+    when (io.select === UInt(idx)) { out.bits := in.bits }
+    in.ready := out.ready && io.select === UInt(idx)
+  }
+
+  // Defaults for the master-side signals; the conditional connections made
+  // below take precedence when their slave is selected.
+  io.master.ar.ready := Bool(false)
+  io.master.aw.ready := Bool(false)
+  io.master.w.ready := Bool(false)
+  io.master.r.valid := Bool(false)
+  io.master.r.bits := NastiReadDataChannel(id = UInt(0), data = UInt(0))
+  io.master.b.valid := Bool(false)
+  io.master.b.bits := NastiWriteResponseChannel(id = UInt(0))
+
+  io.slaves.zipWithIndex.foreach { case (slave, i) =>
+    connectReqChannel(i, slave.ar, io.master.ar)
+    connectReqChannel(i, slave.aw, io.master.aw)
+    connectReqChannel(i, slave.w, io.master.w)
+    connectRespChannel(i, io.master.r, slave.r)
+    connectRespChannel(i, io.master.b, slave.b)
+  }
+}
+
+/** Carries a Nasti port from the current clock domain into another one. */
+object AsyncNastiTo {
+  // source(master) is in our clock domain, output is in the 'to' clock domain
+  /** Crosses each of the five channels separately: the request channels
+   *  (aw, ar, w) via AsyncDecoupledTo, the response channels (b, r) via
+   *  AsyncDecoupledFrom.
+   *  @param to_clock clock of the destination domain
+   *  @param to_reset reset of the destination domain
+   *  @param source the Nasti port in the current domain
+   *  @param depth passed through to the AsyncDecoupled helpers
+   *  @param sync passed through to the AsyncDecoupled helpers
+   *  @return the Nasti port to be used in the destination domain */
+  def apply[T <: Data](to_clock: Clock, to_reset: Bool, source: NastiIO, depth: Int = 3, sync: Int = 2)(implicit p: Parameters): NastiIO = {
+    val sink = Wire(new NastiIO)
+
+    sink.aw <> AsyncDecoupledTo(to_clock, to_reset, source.aw, depth, sync)
+    sink.ar <> AsyncDecoupledTo(to_clock, to_reset, source.ar, depth, sync)
+    sink.w  <> AsyncDecoupledTo(to_clock, to_reset, source.w,  depth, sync)
+    source.b <> AsyncDecoupledFrom(to_clock, to_reset, sink.b, depth, sync)
+    source.r <> AsyncDecoupledFrom(to_clock, to_reset, sink.r, depth, sync)
+
+    sink
+  }
+}
+
+/** Carries a Nasti port from another clock domain into the current one. */
+object AsyncNastiFrom {
+  // source(master) is in the 'from' clock domain, output is in our clock domain
+  /** Crosses each of the five channels separately: the request channels
+   *  (aw, ar, w) via AsyncDecoupledFrom, the response channels (b, r) via
+   *  AsyncDecoupledTo.
+   *  @param from_clock clock of the source domain
+   *  @param from_reset reset of the source domain
+   *  @param source the Nasti port in the source domain
+   *  @param depth passed through to the AsyncDecoupled helpers
+   *  @param sync passed through to the AsyncDecoupled helpers
+   *  @return the Nasti port to be used in the current domain */
+  def apply[T <: Data](from_clock: Clock, from_reset: Bool, source: NastiIO, depth: Int = 3, sync: Int = 2)(implicit p: Parameters): NastiIO = {
+    val sink = Wire(new NastiIO)
+
+    sink.aw <> AsyncDecoupledFrom(from_clock, from_reset, source.aw, depth, sync)
+    sink.ar <> AsyncDecoupledFrom(from_clock, from_reset, source.ar, depth, sync)
+    sink.w  <> AsyncDecoupledFrom(from_clock, from_reset, source.w,  depth, sync)
+    source.b <> AsyncDecoupledTo(from_clock, from_reset, sink.b, depth, sync)
+    source.r <> AsyncDecoupledTo(from_clock, from_reset, sink.r, depth, sync)
+
+    sink
+  }
+}
--- a/junctions/src/main/scala/package.scala
+++ b/junctions/src/main/scala/package.scala
@@ -0,0 +1 @@
+// Empty package object for the junctions package.
+package object junctions
--- a/junctions/src/main/scala/poci.scala
+++ b/junctions/src/main/scala/poci.scala
@@ -0,0 +1,82 @@
+package junctions
+
+import Chisel._
+import cde.{Parameters, Field}
+
+/** Signal bundle of a Poci port, seen from the master side: address, control
+ *  and write data are outputs; read data, ready and error are inputs.
+ *  Address and data widths come from the Hasti parameters. */
+class PociIO(implicit p: Parameters) extends HastiBundle()(p)
+{
+  val paddr = UInt(OUTPUT, hastiAddrBits)
+  val pwrite = Bool(OUTPUT)
+  val psel = Bool(OUTPUT)
+  val penable = Bool(OUTPUT)
+  val pwdata = UInt(OUTPUT, hastiDataBits)
+  val prdata = UInt(INPUT, hastiDataBits)
+  val pready = Bool(INPUT)
+  val pslverr = Bool(INPUT)
+}
+
+/** Bridges a Hasti slave port onto a Poci master port with a three-state
+ *  machine (idle, setup, access). */
+class HastiToPociBridge(implicit p: Parameters) extends HastiModule()(p) {
+  val io = new Bundle {
+    val in = new HastiSlaveIO
+    val out = new PociIO
+  }
+
+  val s_idle :: s_setup :: s_access :: Nil = Enum(UInt(), 3)
+  val state = Reg(init = s_idle)
+  // A transfer is requested when this slave is selected and htrans(1) is set.
+  val transfer = io.in.hsel & io.in.htrans(1)
+
+  switch (state) {
+    is (s_idle) {
+      when (transfer) { state := s_setup }
+    }
+    is (s_setup) {
+      // setup always lasts exactly one cycle
+      state := s_access
+    }
+    is (s_access) {
+      // Stay until pready; then go back to setup if another transfer is
+      // already requested, otherwise to idle.
+      when (io.out.pready & ~transfer) { state := s_idle   }
+      when (io.out.pready & transfer)  { state := s_setup  }
+      when (~io.out.pready)            { state := s_access }
+    }
+  }
+
+  // Address and direction are captured whenever a transfer is requested.
+  val haddr_reg = Reg(UInt(width = hastiAddrBits))
+  val hwrite_reg = Reg(UInt(width = 1))
+  when (transfer) {
+    haddr_reg  := io.in.haddr
+    hwrite_reg := io.in.hwrite
+  }
+
+  io.out.paddr := haddr_reg
+  io.out.pwrite := hwrite_reg(0)
+  io.out.psel := (state =/= s_idle)
+  io.out.penable := (state === s_access)
+  io.out.pwdata := io.in.hwdata
+  io.in.hrdata := io.out.prdata
+  // Ready when idle, or in the access state once the Poci side is ready.
+  io.in.hready := ((state === s_access) & io.out.pready) | (state === s_idle)
+  io.in.hresp := io.out.pslverr
+}
+
+/** Fans a single Poci master out to several slaves.
+ *  @param amap one address predicate per slave; when several match, the
+ *  lowest-indexed slave is selected */
+class PociBus(amap: Seq[UInt=>Bool])(implicit p: Parameters) extends HastiModule()(p)
+{
+  val io = new Bundle {
+    val master = new PociIO().flip
+    val slaves = Vec(amap.size, new PociIO)
+  }
+
+  // One-hot select per slave. As a side effect of building it, address,
+  // direction and write data are broadcast to every slave.
+  val psels = PriorityEncoderOH(
+    (io.slaves zip amap) map { case (s, afn) => {
+      s.paddr := io.master.paddr
+      s.pwrite := io.master.pwrite
+      s.pwdata := io.master.pwdata
+      afn(io.master.paddr) && io.master.psel
+  }})
+
+  (io.slaves zip psels) foreach { case (s, psel) => {
+    s.psel := psel
+    s.penable := io.master.penable && psel
+  } }
+
+  // Responses come from whichever slave is selected.
+  io.master.prdata := Mux1H(psels, io.slaves.map(_.prdata))
+  io.master.pready := Mux1H(psels, io.slaves.map(_.pready))
+  io.master.pslverr := Mux1H(psels, io.slaves.map(_.pslverr))
+}
--- a/junctions/src/main/scala/slowio.scala
+++ b/junctions/src/main/scala/slowio.scala
@@ -0,0 +1,70 @@
+// See LICENSE for license details.
+
+package junctions
+import Chisel._
+
+/** Moves decoupled data between the module's own clock and a slower clock
+ *  that the module itself generates on clk_slow by counting fast cycles.
+ *  @param divisor_max a power of two between 8 and 65536; the divisor resets
+ *  to divisor_max - 1 and the hold count to divisor_max / 4 - 1
+ *  @param data by-name generator for the payload type */
+class SlowIO[T <: Data](val divisor_max: Int)(data: => T) extends Module
+{
+  val io = new Bundle {
+    val out_fast = Decoupled(data).flip
+    val out_slow = Decoupled(data)
+    val in_fast = Decoupled(data)
+    val in_slow = Decoupled(data).flip
+    val clk_slow = Bool(OUTPUT)
+    val set_divisor = Valid(Bits(width = 32)).flip
+    val divisor = Bits(OUTPUT, 32)
+  }
+
+  require(divisor_max >= 8 && divisor_max <= 65536 && isPow2(divisor_max))
+  // The live divisor and hold values are reloaded from their shadow copies
+  // only at a falling edge, so a new setting takes effect on a period boundary.
+  val divisor = Reg(init=UInt(divisor_max-1))
+  val d_shadow = Reg(init=UInt(divisor_max-1))
+  val hold = Reg(init=UInt(divisor_max/4-1))
+  val h_shadow = Reg(init=UInt(divisor_max/4-1))
+  // set_divisor carries the divisor in its low bits and the hold count
+  // starting at bit 16.
+  when (io.set_divisor.valid) {
+    d_shadow := io.set_divisor.bits(log2Up(divisor_max)-1, 0).toUInt
+    h_shadow := io.set_divisor.bits(log2Up(divisor_max)-1+16, 16).toUInt
+  }
+  io.divisor := (hold << 16) | divisor
+
+  val count = Reg{UInt(width = log2Up(divisor_max))}
+  val myclock = Reg{Bool()}
+  count := count + UInt(1)
+
+  // Event points within one slow period, measured in fast cycles.
+  val rising = count === (divisor >> 1)
+  val falling = count === divisor
+  val held = count === (divisor >> 1) + hold
+
+  when (falling) {
+    divisor := d_shadow
+    hold := h_shadow
+    count := UInt(0)
+    myclock := Bool(false)
+  }
+  when (rising) {
+    myclock := Bool(true)
+  }
+
+  // Registered copies of the slow-side handshake signals; they are updated
+  // only at the `held` point.
+  val in_slow_rdy = Reg(init=Bool(false))
+  val out_slow_val = Reg(init=Bool(false))
+  val out_slow_bits = Reg(data)
+
+  // Slow-to-fast direction: a word is captured at the rising point when the
+  // slow handshake completes, or unconditionally while reset is asserted.
+  val fromhost_q = Module(new Queue(data,1))
+  fromhost_q.io.enq.valid := rising && (io.in_slow.valid && in_slow_rdy || this.reset)
+  fromhost_q.io.enq.bits := io.in_slow.bits
+  io.in_fast <> fromhost_q.io.deq
+
+  // Fast-to-slow direction: a word is released at the rising point when the
+  // slow side is ready and the word has been presented as valid.
+  val tohost_q = Module(new Queue(data,1))
+  tohost_q.io.enq <> io.out_fast
+  tohost_q.io.deq.ready := rising && io.out_slow.ready && out_slow_val
+
+  when (held) {
+    in_slow_rdy := fromhost_q.io.enq.ready
+    out_slow_val := tohost_q.io.deq.valid
+    // While reset is asserted the slow output is fed from fromhost_q instead.
+    out_slow_bits := Mux(this.reset, fromhost_q.io.deq.bits, tohost_q.io.deq.bits)
+  }
+
+  io.in_slow.ready := in_slow_rdy
+  io.out_slow.valid := out_slow_val
+  io.out_slow.bits := out_slow_bits
+  io.clk_slow := myclock
+}
--- a/junctions/src/main/scala/smi.scala
+++ b/junctions/src/main/scala/smi.scala
@@ -0,0 +1,281 @@
+package junctions
+
+import Chisel._
+import cde.Parameters
+
+/** Request payload of the Simple Memory Interface.
+ *  @param dataWidth the width in bits of the data field
+ *  @param addrWidth the width in bits of the addr field */
+class SmiReq(val dataWidth: Int, val addrWidth: Int) extends Bundle {
+  val rw = Bool() // true for a write, false for a read (see SmiMem)
+  val addr = UInt(width = addrWidth)
+  val data = Bits(width = dataWidth)
+
+  // Explicit cloneType so copies of the bundle keep the same widths.
+  override def cloneType =
+    new SmiReq(dataWidth, addrWidth).asInstanceOf[this.type]
+}
+
+/** Simple Memory Interface IO. Used to communicate with PCR and SCR.
+ *  Declared from the requester's side: req goes out, resp comes back in.
+ *  @param dataWidth the width in bits of the data field
+ *  @param addrWidth the width in bits of the addr field */
+class SmiIO(val dataWidth: Int, val addrWidth: Int) extends Bundle {
+  val req = Decoupled(new SmiReq(dataWidth, addrWidth))
+  val resp = Decoupled(Bits(width = dataWidth)).flip
+
+  // Explicit cloneType so copies of the bundle keep the same widths.
+  override def cloneType =
+    new SmiIO(dataWidth, addrWidth).asInstanceOf[this.type]
+}
+
+/** Base class for modules that sit on the responding end of an Smi port;
+ *  subclasses supply the data and address widths. */
+abstract class SmiPeripheral extends Module {
+  val dataWidth: Int
+  val addrWidth: Int
+
+  // lazy so that the subclass's widths are initialized before the port is built
+  lazy val io = new SmiIO(dataWidth, addrWidth).flip
+}
+
+/** A simple sequential memory accessed through Smi.
+ *  Handles one request at a time: no new request is accepted while a
+ *  response is pending. Writes are answered with a response as well.
+ *  @param dataWidth the width in bits of each memory word
+ *  @param memDepth the number of words */
+class SmiMem(val dataWidth: Int, val memDepth: Int) extends SmiPeripheral {
+  // override
+  val addrWidth = log2Up(memDepth)
+
+  val mem = SeqMem(memDepth, Bits(width = dataWidth))
+
+  val ren = io.req.fire() && !io.req.bits.rw
+  val wen = io.req.fire() && io.req.bits.rw
+
+  when (wen) { mem.write(io.req.bits.addr, io.req.bits.data) }
+
+  // Set when a request is accepted, cleared when the response is taken.
+  val resp_valid = Reg(init = Bool(false))
+
+  when (io.resp.fire()) { resp_valid := Bool(false) }
+  when (io.req.fire())  { resp_valid := Bool(true) }
+
+  io.resp.valid := resp_valid
+  io.resp.bits := mem.read(io.req.bits.addr, ren)
+  io.req.ready := !resp_valid
+}
+
+/** Arbitrate among several Smi clients.
+ *  Only one request is outstanding at a time: after a request is forwarded,
+ *  no further request is issued until its response has been delivered.
+ *  @param n the number of clients
+ *  @param dataWidth Smi data width
+ *  @param addrWidth Smi address width */
+class SmiArbiter(val n: Int, val dataWidth: Int, val addrWidth: Int)
+    extends Module {
+  val io = new Bundle {
+    val in = Vec(n, new SmiIO(dataWidth, addrWidth)).flip
+    val out = new SmiIO(dataWidth, addrWidth)
+  }
+
+  val wait_resp = Reg(init = Bool(false))        // a response is outstanding
+  val choice = Reg(UInt(width = log2Up(n)))      // client that issued the request
+
+  val req_arb = Module(new RRArbiter(new SmiReq(dataWidth, addrWidth), n))
+  req_arb.io.in <> io.in.map(_.req)
+  req_arb.io.out.ready := io.out.req.ready && !wait_resp
+
+  io.out.req.bits := req_arb.io.out.bits
+  io.out.req.valid := req_arb.io.out.valid && !wait_resp
+
+  when (io.out.req.fire()) {
+    choice := req_arb.io.chosen
+    wait_resp := Bool(true)
+  }
+
+  when (io.out.resp.fire()) { wait_resp := Bool(false) }
+
+  // The response payload is broadcast; only the recorded client sees valid.
+  for ((resp, i) <- io.in.map(_.resp).zipWithIndex) {
+    resp.bits := io.out.resp.bits
+    resp.valid := io.out.resp.valid && choice === UInt(i)
+  }
+
+  io.out.resp.ready := io.in(choice).resp.ready
+}
+
+/** Serves Nasti reads by issuing Smi read requests and assembling the
+ *  returned words into Nasti data beats.
+ *  @param dataWidth Smi data width in bits
+ *  @param addrWidth Smi address width in bits */
+class SmiIONastiReadIOConverter(val dataWidth: Int, val addrWidth: Int)
+                               (implicit p: Parameters) extends NastiModule()(p) {
+  val io = new Bundle {
+    val nasti = new NastiReadIO().flip
+    val smi = new SmiIO(dataWidth, addrWidth)
+  }
+
+  private val maxWordsPerBeat = nastiXDataBits / dataWidth
+  private val wordCountBits = log2Up(maxWordsPerBeat)
+  // NOTE(review): the write converter computes this with log2Floor rather
+  // than log2Up; confirm the two agree for the data widths in use.
+  private val byteOffBits = log2Up(dataWidth / 8)
+  private val addrOffBits = addrWidth + byteOffBits
+
+  /** Number of Smi words covered by a Nasti transfer of the given size,
+   *  minus one. Only meaningful when size >= byteOffBits. */
+  private def calcWordCount(size: UInt): UInt =
+    (UInt(1) << (size - UInt(byteOffBits))) - UInt(1)
+
+  // s_idle: waiting for a read address; s_read: exchanging words with Smi;
+  // s_resp: presenting an assembled beat on the Nasti r channel.
+  val (s_idle :: s_read :: s_resp :: Nil) = Enum(Bits(), 3)
+  val state = Reg(init = s_idle)
+
+  val nWords = Reg(UInt(width = wordCountBits)) // words still to receive, minus one
+  val nBeats = Reg(UInt(width = nastiXLenBits)) // beats still to send, minus one
+  val addr = Reg(UInt(width = addrWidth))       // next Smi word address
+  val id = Reg(UInt(width = nastiRIdBits))
+
+  val byteOff = Reg(UInt(width = byteOffBits))
+  val recvInd = Reg(init = UInt(0, wordCountBits)) // buffer slot for the next word
+  val sendDone = Reg(init = Bool(false))           // stops further Smi requests
+
+  val buffer = Reg(init = Vec.fill(maxWordsPerBeat) { Bits(0, dataWidth) })
+
+  io.nasti.ar.ready := (state === s_idle)
+
+  io.smi.req.valid := (state === s_read) && !sendDone
+  io.smi.req.bits.rw := Bool(false)
+  io.smi.req.bits.addr := addr
+
+  io.smi.resp.ready := (state === s_read)
+
+  io.nasti.r.valid := (state === s_resp)
+  io.nasti.r.bits := NastiReadDataChannel(
+    id = id,
+    data = buffer.toBits,
+    last = (nBeats === UInt(0)))
+
+  when (io.nasti.ar.fire()) {
+    // A transfer narrower than one Smi word still reads one whole word.
+    when (io.nasti.ar.bits.size < UInt(byteOffBits)) {
+      nWords := UInt(0)
+    } .otherwise {
+      nWords := calcWordCount(io.nasti.ar.bits.size)
+    }
+    nBeats := io.nasti.ar.bits.len
+    addr := io.nasti.ar.bits.addr(addrOffBits - 1, byteOffBits)
+    // The first word lands in the buffer slot given by its position in the beat.
+    if (maxWordsPerBeat > 1)
+      recvInd := io.nasti.ar.bits.addr(wordCountBits + byteOffBits - 1, byteOffBits)
+    else
+      recvInd := UInt(0)
+    id := io.nasti.ar.bits.id
+    state := s_read
+  }
+
+  when (io.smi.req.fire()) {
+    addr := addr + UInt(1)
+    sendDone := (nWords === UInt(0))
+  }
+
+  when (io.smi.resp.fire()) {
+    recvInd := recvInd + UInt(1)
+    nWords := nWords - UInt(1)
+    buffer(recvInd) := io.smi.resp.bits
+    when (nWords === UInt(0)) { state := s_resp }
+  }
+
+  when (io.nasti.r.fire()) {
+    recvInd := UInt(0)
+    sendDone := Bool(false)
+    // clear all the registers in the buffer
+    buffer.foreach(_ := Bits(0))
+    nBeats := nBeats - UInt(1)
+    state := Mux(io.nasti.r.bits.last, s_idle, s_read)
+  }
+}
+
+/** Serves Nasti writes by splitting each data beat into Smi word writes,
+ *  skipping the words whose strobe bit is clear.
+ *  @param dataWidth Smi data width in bits
+ *  @param addrWidth Smi address width in bits */
+class SmiIONastiWriteIOConverter(val dataWidth: Int, val addrWidth: Int)
+                                (implicit p: Parameters) extends NastiModule()(p) {
+  val io = new Bundle {
+    val nasti = new NastiWriteIO().flip
+    val smi = new SmiIO(dataWidth, addrWidth)
+  }
+
+  private val dataBytes = dataWidth / 8
+  private val maxWordsPerBeat = nastiXDataBits / dataWidth
+  private val byteOffBits = log2Floor(dataBytes)
+  private val addrOffBits = addrWidth + byteOffBits
+  private val nastiByteOffBits = log2Ceil(nastiXDataBits / 8)
+
+  assert(!io.nasti.aw.valid || io.nasti.aw.bits.size >= UInt(byteOffBits),
+    "Nasti size must be >= Smi size")
+
+  val id = Reg(UInt(width = nastiWIdBits))
+  val addr = Reg(UInt(width = addrWidth))          // Smi address of the current word
+  val offset = Reg(UInt(width = nastiByteOffBits)) // byte offset of the write within a beat
+
+  /** Builds a per-word strobe: the byte strobe is restricted to the
+   *  (1 << size) bytes starting at offset, then one bit is kept per Smi
+   *  word, taken from that word's lowest byte. */
+  def makeStrobe(offset: UInt, size: UInt, strb: UInt) = {
+    val sizemask = (UInt(1) << (UInt(1) << size)) - UInt(1)
+    val bytemask = strb & (sizemask << offset)
+    Vec.tabulate(maxWordsPerBeat){i => bytemask(dataBytes * i)}.toBits
+  }
+
+  val size = Reg(UInt(width = nastiXSizeBits))
+  val strb = Reg(UInt(width = maxWordsPerBeat)) // remaining word strobes, bit 0 = current word
+  val data = Reg(UInt(width = nastiXDataBits))  // remaining data, low word = current word
+  val last = Reg(Bool())
+
+  // s_idle: waiting for a write address; s_data: waiting for a data beat;
+  // s_send: issuing Smi writes; s_ack: waiting for an Smi response;
+  // s_resp: presenting the Nasti write response.
+  val s_idle :: s_data :: s_send :: s_ack :: s_resp :: Nil = Enum(Bits(), 5)
+  val state = Reg(init = s_idle)
+
+  io.nasti.aw.ready := (state === s_idle)
+  io.nasti.w.ready := (state === s_data)
+  io.smi.req.valid := (state === s_send) && strb(0)
+  io.smi.req.bits.rw := Bool(true)
+  io.smi.req.bits.addr := addr
+  io.smi.req.bits.data := data(dataWidth - 1, 0)
+  io.smi.resp.ready := (state === s_ack)
+  io.nasti.b.valid := (state === s_resp)
+  io.nasti.b.bits := NastiWriteResponseChannel(id)
+
+  // Distance in words to the next word whose strobe bit is set.
+  val jump = if (maxWordsPerBeat > 1)
+    PriorityMux(strb(maxWordsPerBeat - 1, 1),
+      (1 until maxWordsPerBeat).map(UInt(_)))
+    else UInt(1)
+
+  when (io.nasti.aw.fire()) {
+    if (dataWidth == nastiXDataBits) {
+      addr := io.nasti.aw.bits.addr(addrOffBits - 1, byteOffBits)
+    } else {
+      // Start at the first Smi word of the beat that contains the address.
+      addr := Cat(io.nasti.aw.bits.addr(addrOffBits - 1, nastiByteOffBits),
+                  UInt(0, nastiByteOffBits - byteOffBits))
+    }
+    offset := io.nasti.aw.bits.addr(nastiByteOffBits - 1, 0)
+    id := io.nasti.aw.bits.id
+    size := io.nasti.aw.bits.size
+    last := Bool(false)
+    state := s_data
+  }
+
+  when (io.nasti.w.fire()) {
+    last := io.nasti.w.bits.last
+    strb := makeStrobe(offset, size, io.nasti.w.bits.strb)
+    data := io.nasti.w.bits.data
+    state := s_send
+  }
+
+  when (state === s_send) {
+    // Advance when the current word is accepted, or immediately when it is
+    // not strobed and so needs no request.
+    when (io.smi.req.ready || !strb(0)) {
+      strb := strb >> jump
+      data := data >> Cat(jump, UInt(0, log2Up(dataWidth)))
+      addr := addr + jump
+      when (strb(0)) { state := s_ack }
+    }
+  }
+
+  when (io.smi.resp.fire()) {
+    // No strobes left: the beat is done, so either finish or fetch the next beat.
+    state := Mux(strb === UInt(0),
+              Mux(last, s_resp, s_data), s_send)
+  }
+
+  when (io.nasti.b.fire()) { state := s_idle }
+}
+
+/** Convert Nasti protocol to Smi protocol.
+ *  Combines a read converter and a write converter, sharing the single Smi
+ *  port through a two-input SmiArbiter.
+ *  @param dataWidth Smi data width in bits; a power of two no larger than
+ *  the Nasti data width
+ *  @param addrWidth Smi address width in bits */
+class SmiIONastiIOConverter(val dataWidth: Int, val addrWidth: Int)
+                           (implicit p: Parameters) extends NastiModule()(p) {
+  val io = new Bundle {
+    val nasti = (new NastiIO).flip
+    val smi = new SmiIO(dataWidth, addrWidth)
+  }
+
+  require(isPow2(dataWidth), "SMI data width must be power of 2")
+  require(dataWidth <= nastiXDataBits,
+    "SMI data width must be less than or equal to NASTI data width")
+
+  val reader = Module(new SmiIONastiReadIOConverter(dataWidth, addrWidth))
+  reader.io.nasti <> io.nasti
+
+  val writer = Module(new SmiIONastiWriteIOConverter(dataWidth, addrWidth))
+  writer.io.nasti <> io.nasti
+
+  // Arbiter input 0 is the reader, input 1 the writer.
+  val arb = Module(new SmiArbiter(2, dataWidth, addrWidth))
+  arb.io.in(0) <> reader.io.smi
+  arb.io.in(1) <> writer.io.smi
+  io.smi <> arb.io.out
+}
--- a/junctions/src/main/scala/stream.scala
+++ b/junctions/src/main/scala/stream.scala
@@ -0,0 +1,187 @@
+package junctions
+
+import Chisel._
+import NastiConstants._
+import cde.Parameters
+
+/** Payload of one stream beat: `w` bits of data plus a `last` marker bit. */
+class StreamChannel(w: Int) extends Bundle {
+  val data = UInt(width = w)
+  val last = Bool()
+
+  // Rebuild the bundle with the same width so the clone keeps its parameter.
+  override def cloneType = {
+    val fresh = new StreamChannel(w)
+    fresh.asInstanceOf[this.type]
+  }
+}
+
+/** Bidirectional stream port: a decoupled `out` channel and a flipped
+ *  decoupled `in` channel, both carrying `w`-bit StreamChannel beats. */
+class StreamIO(w: Int) extends Bundle {
+  val out = Decoupled(new StreamChannel(w))
+  val in = Decoupled(new StreamChannel(w)).flip
+
+  // Rebuild the bundle with the same width so the clone keeps its parameter.
+  override def cloneType = {
+    val fresh = new StreamIO(w)
+    fresh.asInstanceOf[this.type]
+  }
+}
+
+/** Bridges a NASTI slave port onto a StreamIO.
+ *
+ *  NASTI reads pull beats from `io.stream.in`; NASTI writes push beats onto
+ *  `io.stream.out`. Addresses are not used. Simulation assertions check that
+ *  every transfer uses the full stream width, that multi-beat transfers use
+ *  BURST_FIXED, and that writes have all strobe bits set.
+ *
+ *  @param w stream data width in bits
+ */
+class NastiIOStreamIOConverter(w: Int)(implicit p: Parameters) extends Module {
+  val io = new Bundle {
+    val nasti = (new NastiIO).flip
+    val stream = new StreamIO(w)
+  }
+
+  // Expected value of the NASTI size field: log2 of the stream width in bytes.
+  val streamSize = UInt(log2Up(w / 8))
+  assert(!io.nasti.ar.valid || io.nasti.ar.bits.size === streamSize,
+         "read channel wrong size on stream")
+  assert(!io.nasti.ar.valid || io.nasti.ar.bits.len === UInt(0) ||
+         io.nasti.ar.bits.burst === BURST_FIXED,
+         "read channel wrong burst type on stream")
+  assert(!io.nasti.aw.valid || io.nasti.aw.bits.size === streamSize,
+         "write channel wrong size on stream")
+  assert(!io.nasti.aw.valid || io.nasti.aw.bits.len === UInt(0) ||
+         io.nasti.aw.bits.burst === BURST_FIXED,
+         "write channel wrong burst type on stream")
+  assert(!io.nasti.w.valid || io.nasti.w.bits.strb.andR,
+         "write channel cannot take partial writes")
+
+  // ---- Read path: one outstanding read at a time ----
+  val read_id = Reg(io.nasti.ar.bits.id)    // id of the read in flight
+  val read_cnt = Reg(io.nasti.ar.bits.len)  // beats still to return after the current one
+  val reading = Reg(init = Bool(false))
+
+  io.nasti.ar.ready := !reading
+  io.nasti.r.valid := reading && io.stream.in.valid
+  // Bulk assignment from the stream beat; resp and id are then overridden.
+  // NOTE(review): presumably this copies the like-named data/last fields --
+  // confirm against the bundle definitions.
+  io.nasti.r.bits := io.stream.in.bits
+  io.nasti.r.bits.resp := UInt(0)
+  io.nasti.r.bits.id := read_id
+  io.stream.in.ready := reading && io.nasti.r.ready
+
+  // Latch the request and start returning beats.
+  when (io.nasti.ar.fire()) {
+    read_id := io.nasti.ar.bits.id
+    read_cnt := io.nasti.ar.bits.len
+    reading := Bool(true)
+  }
+
+  // Count down returned beats; the beat sent with read_cnt == 0 ends the read.
+  when (io.nasti.r.fire()) {
+    when (read_cnt === UInt(0)) {
+      reading := Bool(false)
+    } .otherwise {
+      read_cnt := read_cnt - UInt(1)
+    }
+  }
+
+  // ---- Write path: one outstanding write at a time ----
+  val write_id = Reg(io.nasti.aw.bits.id)     // id echoed on the B channel
+  val writing = Reg(init = Bool(false))       // accepting W beats
+  val write_resp = Reg(init = Bool(false))    // B response pending
+
+  // A new write address is accepted only when no data phase or response is pending.
+  io.nasti.aw.ready := !writing && !write_resp
+  io.nasti.w.ready := writing && io.stream.out.ready
+  io.stream.out.valid := writing && io.nasti.w.valid
+  io.stream.out.bits := io.nasti.w.bits
+  io.nasti.b.valid := write_resp
+  io.nasti.b.bits.resp := UInt(0)
+  io.nasti.b.bits.id := write_id
+
+  when (io.nasti.aw.fire()) {
+    write_id := io.nasti.aw.bits.id
+    writing := Bool(true)
+  }
+
+  // The beat flagged `last` ends the data phase and raises the response.
+  when (io.nasti.w.fire() && io.nasti.w.bits.last) {
+    writing := Bool(false)
+    write_resp := Bool(true)
+  }
+
+  when (io.nasti.b.fire()) { write_resp := Bool(false) }
+}
+
+/** Splits each wide input beat into `win / wout` narrower output beats.
+ *
+ *  An input beat is captured into `buffer`, then emitted slice by slice
+ *  starting with the least-significant `wout` bits. No new input is accepted
+ *  while a captured beat is still being sent.
+ *
+ *  @param win  input data width; must be larger than, and a multiple of, `wout`
+ *  @param wout output data width
+ */
+class StreamNarrower(win: Int, wout: Int) extends Module {
+  require(win > wout, "Stream narrower input width must be larger than output width")
+  require(win % wout == 0, "Stream narrower input width must be multiple of output width")
+
+  val io = new Bundle {
+    val in = Decoupled(new StreamChannel(win)).flip
+    val out = Decoupled(new StreamChannel(wout))
+  }
+
+  val n_pieces = win / wout
+  val buffer = Reg(Bits(width = win))
+  // piece_idx advances on every output beat; pkt_done marks the final slice.
+  val (piece_idx, pkt_done) = Counter(io.out.fire(), n_pieces)
+  // Slice i covers bits [wout * (i + 1) - 1 : wout * i] of the captured beat.
+  val pieces = Vec.tabulate(n_pieces) { i => buffer(wout * (i + 1) - 1, wout * i) }
+  val last_piece = (piece_idx === UInt(n_pieces - 1))
+  val sending = Reg(init = Bool(false))
+  val in_last = Reg(Bool())
+
+  // Capture a new wide beat together with its `last` flag.
+  when (io.in.fire()) {
+    buffer := io.in.bits.data
+    in_last := io.in.bits.last
+    sending := Bool(true)
+  }
+  when (pkt_done) { sending := Bool(false) }
+
+  io.out.valid := sending
+  io.out.bits.data := pieces(piece_idx)
+  // Only the final slice of an input beat that was itself `last` is marked last.
+  io.out.bits.last := in_last && last_piece
+  io.in.ready := !sending
+}
+
+/** Gathers `wout / win` narrow input beats into one wide output beat.
+ *
+ *  Input beats are stored into `buffer` at the position given by a beat
+ *  counter. Once the counter completes, the module stops collecting and
+ *  presents the whole buffer on `out` until it is accepted.
+ *
+ *  @param win  input data width
+ *  @param wout output data width; must be larger than, and a multiple of, `win`
+ */
+class StreamExpander(win: Int, wout: Int) extends Module {
+  require(win < wout, "Stream expander input width must be smaller than output width")
+  require(wout % win == 0, "Stream expander output width must be multiple of input width")
+
+  val io = new Bundle {
+    val in = Decoupled(new StreamChannel(win)).flip
+    val out = Decoupled(new StreamChannel(wout))
+  }
+
+  val n_pieces = wout / win
+  val buffer = Reg(Vec(n_pieces, UInt(width = win)))
+  val last = Reg(Bool())
+  val collecting = Reg(init = Bool(true))
+  // piece_idx advances on every input beat; pkt_done marks the final piece.
+  val (piece_idx, pkt_done) = Counter(io.in.fire(), n_pieces)
+
+  when (io.in.fire()) { buffer(piece_idx) := io.in.bits.data }
+  // The output `last` flag is taken from the final input beat of the group.
+  when (pkt_done) { last := io.in.bits.last; collecting := Bool(false) }
+  when (io.out.fire()) { collecting := Bool(true) }
+
+  io.in.ready := collecting
+  io.out.valid := !collecting
+  io.out.bits.data := buffer.toBits
+  io.out.bits.last := last
+}
+
+/** Helpers for wiring stream interfaces together. */
+object StreamUtils {
+  /** Cross-connects two stream endpoints: each side's `out` channel feeds
+   *  the other side's `in` channel. */
+  def connectStreams(a: StreamIO, b: StreamIO): Unit = {
+    b.in <> a.out
+    a.in <> b.out
+  }
+}
+
+/** Mixin for data types that declare their own bit count, as required by the
+ *  type bound on Serializer and Deserializer. */
+trait Serializable {
+  /** Number of bits; used as the wide-side width by the (de)serializer modules. */
+  def nbits: Int
+}
+
+/** Sends a `typ` value out as a sequence of `w`-bit words.
+ *
+ *  Thin wrapper around StreamNarrower(typ.nbits, w): the input is flattened
+ *  with toBits and only the data field of the narrower's output is exposed.
+ *
+ *  @param w   output word width
+ *  @param typ prototype of the value being serialized; supplies `nbits`
+ */
+class Serializer[T <: Data with Serializable](w: Int, typ: T) extends Module {
+  val io = new Bundle {
+    val in = Decoupled(typ).flip
+    val out = Decoupled(Bits(width = w))
+  }
+
+  val narrower = Module(new StreamNarrower(typ.nbits, w))
+  narrower.io.in.bits.data := io.in.bits.toBits
+  // Every input value is its own packet; the narrower's `last` output is unused.
+  narrower.io.in.bits.last := Bool(true)
+  narrower.io.in.valid := io.in.valid
+  io.in.ready := narrower.io.in.ready
+  io.out.valid := narrower.io.out.valid
+  io.out.bits := narrower.io.out.bits.data
+  narrower.io.out.ready := io.out.ready
+}
+
+/** Rebuilds a `typ` value from a sequence of `w`-bit words.
+ *
+ *  Thin wrapper around StreamExpander(w, typ.nbits): the expander's wide
+ *  data output is reinterpreted as `typ` with fromBits.
+ *
+ *  @param w   input word width
+ *  @param typ prototype of the value being deserialized; supplies `nbits`
+ */
+class Deserializer[T <: Data with Serializable](w: Int, typ: T) extends Module {
+  val io = new Bundle {
+    val in = Decoupled(Bits(width = w)).flip
+    val out = Decoupled(typ)
+  }
+
+  val expander = Module(new StreamExpander(w, typ.nbits))
+  expander.io.in.valid := io.in.valid
+  expander.io.in.bits.data := io.in.bits
+  // `last` is tied high on the way in; the expander's `last` output is unused.
+  expander.io.in.bits.last := Bool(true)
+  io.in.ready := expander.io.in.ready
+  io.out.valid := expander.io.out.valid
+  io.out.bits := typ.cloneType.fromBits(expander.io.out.bits.data)
+  expander.io.out.ready := io.out.ready
+}
--- a/junctions/src/main/scala/util.scala
+++ b/junctions/src/main/scala/util.scala
@@ -0,0 +1,314 @@
+/// See LICENSE for license details.
+package junctions
+import Chisel._
+import cde.Parameters
+
+/** Base class for bundles whose only constructor argument is the implicit
+ *  Parameters object.
+ *
+ *  cloneType is implemented reflectively: it calls the first constructor
+ *  reported for the concrete class with `p` as the sole argument. Subclasses
+ *  whose constructor needs anything else must override cloneType themselves;
+ *  otherwise elaboration fails with the message below.
+ */
+class ParameterizedBundle(implicit p: Parameters) extends Bundle {
+  override def cloneType = {
+    try {
+      this.getClass.getConstructors.head.newInstance(p).asInstanceOf[this.type]
+    } catch {
+      // Raised by newInstance when the argument list does not fit the constructor.
+      case e: java.lang.IllegalArgumentException =>
+        throwException("Unable to use ParameterizedBundle.cloneType on " +
+                       this.getClass + ", probably because " + this.getClass +
+                       "() takes more than one argument.  Consider overriding " +
+                       "cloneType() on " + this.getClass, e)
+    }
+  }
+}
+
+/** Queue backed by a SeqMem with a flow-through path.
+ *
+ *  While the queue is empty, `enq` is presented directly on `deq`
+ *  (valid and bits); if the consumer is ready in that cycle the item passes
+ *  through without being written to the memory.
+ *
+ *  @param entries number of storage entries; must be greater than 1
+ *  @param data    generator for the element type
+ */
+class HellaFlowQueue[T <: Data](val entries: Int)(data: => T) extends Module {
+  val io = new QueueIO(data, entries)
+  require(entries > 1)
+
+  // do_flow: item bypasses storage this cycle, so neither pointer moves.
+  val do_flow = Wire(Bool())
+  val do_enq = io.enq.fire() && !do_flow
+  val do_deq = io.deq.fire() && !do_flow
+
+  // maybe_full disambiguates full from empty when the pointers are equal.
+  val maybe_full = Reg(init=Bool(false))
+  val enq_ptr = Counter(do_enq, entries)._1
+  val (deq_ptr, deq_done) = Counter(do_deq, entries)
+  when (do_enq =/= do_deq) { maybe_full := do_enq }
+
+  val ptr_match = enq_ptr === deq_ptr
+  val empty = ptr_match && !maybe_full
+  val full = ptr_match && maybe_full
+  // NOTE(review): the pointer difference wraps at the register width, not at
+  // `entries`; this looks exact only when `entries` is a power of two --
+  // confirm for other sizes.
+  val atLeastTwo = full || enq_ptr - deq_ptr >= UInt(2)
+  do_flow := empty && io.deq.ready
+
+  val ram = SeqMem(entries, data)
+  when (do_enq) { ram.write(enq_ptr, io.enq.bits) }
+
+  // Issue a read when the consumer is ready and either another stored entry
+  // follows the current one, or nothing is being presented yet while the
+  // queue is non-empty.
+  val ren = io.deq.ready && (atLeastTwo || !io.deq.valid && !empty)
+  // While an entry is being presented, read the one after it (wrapping to 0
+  // when deq_done); otherwise read the entry at deq_ptr.
+  val raddr = Mux(io.deq.valid, Mux(deq_done, UInt(0), deq_ptr + UInt(1)), deq_ptr)
+  // Read enable delayed by one cycle qualifies the memory output.
+  val ram_out_valid = Reg(next = ren)
+
+  io.deq.valid := Mux(empty, io.enq.valid, ram_out_valid)
+  io.enq.ready := !full
+  io.deq.bits := Mux(empty, io.enq.bits, ram.read(raddr, ren))
+}
+
+/** HellaFlowQueue followed by a one-entry pipelined Queue on the dequeue side.
+ *
+ *  @param entries number of entries given to the inner HellaFlowQueue
+ *  @param data    generator for the element type
+ */
+class HellaQueue[T <: Data](val entries: Int)(data: => T) extends Module {
+  val io = new QueueIO(data, entries)
+
+  val fq = Module(new HellaFlowQueue(entries)(data))
+  fq.io.enq <> io.enq
+  // Output stage: single-entry Queue with pipe = true between fq and io.deq.
+  io.deq <> Queue(fq.io.deq, 1, pipe = true)
+}
+
+/** Factory that places a HellaQueue behind an existing decoupled source. */
+object HellaQueue {
+  /** Builds a HellaQueue with `entries` entries, feeds it from `enq`, and
+   *  returns the queue's dequeue port. */
+  def apply[T <: Data](enq: DecoupledIO[T], entries: Int) = {
+    val queue = Module(new HellaQueue(entries)(enq.bits))
+    // Signals are wired one by one instead of with <> so that override is allowed.
+    queue.io.enq.bits := enq.bits
+    queue.io.enq.valid := enq.valid
+    enq.ready := queue.io.enq.ready
+    queue.io.deq
+  }
+}
+
+/** A generalized locking RR arbiter that addresses the limitations of the
+ *  version in the Chisel standard library.
+ *
+ *  This base class provides the ports, the `lockIdx` / `locked` state and
+ *  the selection logic; it never updates that state itself. Subclasses
+ *  decide when to take and release the lock.
+ *
+ *  @param typ  prototype of the arbitrated payload
+ *  @param arbN number of input ports
+ */
+abstract class JunctionsAbstractLockingArbiter[T <: Data](typ: T, arbN: Int)
+    extends Module {
+
+  val io = new Bundle {
+    val in = Vec(arbN, Decoupled(typ.cloneType)).flip
+    val out = Decoupled(typ.cloneType)
+  }
+
+  /** Rotates `norm` left by `rot`: element i of the result is
+   *  norm(i + rot), wrapping past the end of the vector. The wrap is done
+   *  by comparison and subtraction rather than a modulo. */
+  def rotateLeft[T <: Data](norm: Vec[T], rot: UInt): Vec[T] = {
+    val n = norm.size
+    Vec.tabulate(n) { i =>
+      Mux(rot < UInt(n - i), norm(UInt(i) + rot), norm(rot - UInt(n - i)))
+    }
+  }
+
+  // Index of the input that last took the lock, and whether the lock is held.
+  val lockIdx = Reg(init = UInt(0, log2Up(arbN)))
+  val locked = Reg(init = Bool(false))
+
+  // Unlocked choice: first valid input, searching from the one after lockIdx.
+  // Valid bits and input indices are rotated by the same amount so the
+  // priority mux returns the original index of the winner.
+  val choice = PriorityMux(
+    rotateLeft(Vec(io.in.map(_.valid)), lockIdx + UInt(1)),
+    rotateLeft(Vec((0 until arbN).map(UInt(_))), lockIdx + UInt(1)))
+
+  // While locked, stay on the locked input regardless of other requests.
+  val chosen = Mux(locked, lockIdx, choice)
+
+  for (i <- 0 until arbN) {
+    io.in(i).ready := io.out.ready && chosen === UInt(i)
+  }
+
+  io.out.valid := io.in(chosen).valid
+  io.out.bits := io.in(chosen).bits
+}
+
+/** This locking arbiter determines when it is safe to unlock
+ *  by peeking at the data.
+ *
+ *  @param typ       prototype of the arbitrated payload
+ *  @param arbN      number of input ports
+ *  @param canUnlock predicate on the outgoing payload; when true on a fired
+ *                   beat the lock is released
+ *  @param needsLock optional predicate on the outgoing payload deciding
+ *                   whether a fired beat takes the lock; when absent every
+ *                   fired beat does
+ */
+class JunctionsPeekingArbiter[T <: Data](
+    typ: T, arbN: Int,
+    canUnlock: T => Bool,
+    needsLock: Option[T => Bool] = None)
+    extends JunctionsAbstractLockingArbiter(typ, arbN) {
+
+  /** Applies `needsLock` if one was supplied, otherwise always true. */
+  def realNeedsLock(data: T): Bool =
+    needsLock.map(_(data)).getOrElse(Bool(true))
+
+  when (io.out.fire()) {
+    // Take the lock on the input that was just granted.
+    when (!locked && realNeedsLock(io.out.bits)) {
+      lockIdx := choice
+      locked := Bool(true)
+    }
+    // the unlock statement takes precedence: it is written last, so a beat
+    // that both needs the lock and can unlock leaves the arbiter unlocked
+    when (canUnlock(io.out.bits)) {
+      locked := Bool(false)
+    }
+  }
+}
+
+/** This arbiter determines when it is safe to unlock by counting transactions.
+ *
+ *  @param typ       prototype of the arbitrated payload
+ *  @param arbN      number of input ports
+ *  @param count     size of the lock counter; must be greater than 1
+ *  @param needsLock optional predicate on the outgoing payload deciding
+ *                   whether a fired beat takes the lock; when absent every
+ *                   fired beat does
+ */
+class JunctionsCountingArbiter[T <: Data](
+    typ: T, arbN: Int, count: Int,
+    val needsLock: Option[T => Bool] = None)
+    extends JunctionsAbstractLockingArbiter(typ, arbN) {
+
+  /** Applies `needsLock` if one was supplied, otherwise always true. */
+  def realNeedsLock(data: T): Bool =
+    needsLock.map(_(data)).getOrElse(Bool(true))
+
+  // if count is 1, you should use a non-locking arbiter
+  require(count > 1, "CountingArbiter cannot have count <= 1")
+
+  val lock_ctr = Counter(count)
+
+  when (io.out.fire()) {
+    // First beat of a locking transaction: take the lock and count the beat.
+    when (!locked && realNeedsLock(io.out.bits)) {
+      lockIdx := choice
+      locked := Bool(true)
+      lock_ctr.inc()
+    }
+
+    // Later beats: keep counting, and release the lock when inc() reports true.
+    // NOTE(review): presumably inc() signals the counter wrapping after
+    // `count` increments -- confirm against the Chisel Counter definition.
+    when (locked) {
+      when (lock_ctr.inc()) { locked := Bool(false) }
+    }
+  }
+}
+
+/** Enqueue payload for a ReorderQueue: a data item together with its tag. */
+class ReorderQueueWrite[T <: Data](dType: T, tagWidth: Int) extends Bundle {
+  val data = dType.cloneType
+  val tag = UInt(width = tagWidth)
+
+  // Rebuild from the same constructor arguments so the clone keeps them.
+  override def cloneType = {
+    val fresh = new ReorderQueueWrite(dType, tagWidth)
+    fresh.asInstanceOf[this.type]
+  }
+}
+
+/** Decoupled port carrying ReorderQueueWrite payloads. */
+class ReorderEnqueueIO[T <: Data](dType: T, tagWidth: Int)
+  extends DecoupledIO(new ReorderQueueWrite(dType, tagWidth)) {
+
+  // Rebuild from the same constructor arguments so the clone keeps them.
+  override def cloneType = {
+    val fresh = new ReorderEnqueueIO(dType, tagWidth)
+    fresh.asInstanceOf[this.type]
+  }
+}
+
+/** Lookup port of a ReorderQueue: the requester drives `valid` and `tag`;
+ *  the queue answers with `data` and a `matches` flag. */
+class ReorderDequeueIO[T <: Data](dType: T, tagWidth: Int) extends Bundle {
+  val valid = Bool(INPUT)
+  val tag = UInt(INPUT, tagWidth)
+  val data = dType.cloneType.asOutput
+  val matches = Bool(OUTPUT)
+
+  // Rebuild from the same constructor arguments so the clone keeps them.
+  override def cloneType = {
+    val fresh = new ReorderDequeueIO(dType, tagWidth)
+    fresh.asInstanceOf[this.type]
+  }
+}
+
+/** Buffer that stores tagged data and lets a consumer look entries up by tag.
+ *
+ *  Two implementations are selected at elaboration time:
+ *   - fewer entries than tag values: a CAM of `actualSize` slots, each
+ *     holding data, tag and a free bit;
+ *   - otherwise: a memory indexed directly by tag, with one free bit per tag.
+ *
+ *  `io.deq.data` / `io.deq.matches` are combinational functions of
+ *  `io.deq.tag`; asserting `io.deq.valid` frees the looked-up entry.
+ *
+ *  @param dType    prototype of the stored data
+ *  @param tagWidth width of the tag in bits
+ *  @param size     number of entries; defaults to the full tag space
+ */
+class ReorderQueue[T <: Data](dType: T, tagWidth: Int, size: Option[Int] = None)
+    extends Module {
+  val io = new Bundle {
+    val enq = new ReorderEnqueueIO(dType, tagWidth).flip
+    val deq = new ReorderDequeueIO(dType, tagWidth)
+  }
+
+  val tagSpaceSize = 1 << tagWidth
+  val actualSize = size.getOrElse(tagSpaceSize)
+
+  if (tagSpaceSize > actualSize) {
+    val roq_data = Reg(Vec(actualSize, dType))
+    val roq_tags = Reg(Vec(actualSize, UInt(width = tagWidth)))
+    val roq_free = Reg(init = Vec.fill(actualSize)(Bool(true)))
+
+    // New entries go into the lowest-numbered free slot.
+    val roq_enq_addr = PriorityEncoder(roq_free)
+    // A slot matches when it is occupied and holds the requested tag.
+    val roq_matches = roq_tags.zip(roq_free)
+      .map { case (tag, free) => tag === io.deq.tag && !free }
+    val roq_deq_onehot = PriorityEncoderOH(roq_matches)
+    val roq_has_match = roq_matches.reduce(_ || _)
+
+    io.enq.ready := roq_free.reduce(_ || _)
+    io.deq.data := Mux1H(roq_deq_onehot, roq_data)
+    io.deq.matches := roq_has_match
+
+    when (io.enq.valid && io.enq.ready) {
+      roq_data(roq_enq_addr) := io.enq.bits.data
+      roq_tags(roq_enq_addr) := io.enq.bits.tag
+      roq_free(roq_enq_addr) := Bool(false)
+    }
+
+    // Free a slot only when the tag actually matched one. Without a match
+    // the one-hot vector is all zeros and OHToUInt yields 0, which would
+    // wrongly free slot 0 (or cancel an enqueue into it in the same cycle).
+    when (io.deq.valid && roq_has_match) {
+      roq_free(OHToUInt(roq_deq_onehot)) := Bool(true)
+    }
+
+    println(s"Warning - using a CAM for ReorderQueue, tagBits: ${tagWidth} size: ${actualSize}")
+  } else {
+    val roq_data = Mem(tagSpaceSize, dType)
+    val roq_free = Reg(init = Vec.fill(tagSpaceSize)(Bool(true)))
+
+    // Direct-mapped: the tag is the address, so a tag can be enqueued only
+    // while its own slot is free.
+    io.enq.ready := roq_free(io.enq.bits.tag)
+    io.deq.data := roq_data(io.deq.tag)
+    io.deq.matches := !roq_free(io.deq.tag)
+
+    when (io.enq.valid && io.enq.ready) {
+      roq_data(io.enq.bits.tag) := io.enq.bits.data
+      roq_free(io.enq.bits.tag) := Bool(false)
+    }
+
+    // Written after the enqueue block, so on a same-tag collision the free wins.
+    when (io.deq.valid) {
+      roq_free(io.deq.tag) := Bool(true)
+    }
+  }
+}
+
+/** Factory: `DecoupledHelper(a, b, c)` wraps the given ready/valid terms. */
+object DecoupledHelper {
+  def apply(rvs: Bool*) = new DecoupledHelper(rvs)
+}
+
+/** Holds a set of ready/valid terms and builds fire conditions from them.
+ *
+ *  @param rvs the terms that are ANDed together
+ */
+class DecoupledHelper(val rvs: Seq[Bool]) {
+  /** ANDs every term in `rvs` except `exclude` (compared by reference),
+   *  together with any extra `includes` terms. */
+  def fire(exclude: Bool, includes: Bool*) = {
+    val remaining = rvs.filterNot(_ eq exclude)
+    (remaining ++ includes).reduce((acc, term) => acc && term)
+  }
+}
+
+/** FIFO of bit vectors whose input and output widths may differ by an
+ *  integer factor.
+ *
+ *  Three implementations are selected at elaboration time:
+ *   - equal widths: a plain Queue of `n` entries;
+ *   - wide in, narrow out: each input word is stored whole and read back as
+ *     `inW / outW` output words, least-significant slice first;
+ *   - narrow in, wide out: input words are stored one by one and each group
+ *     of `outW / inW` of them is read back as one output word.
+ *
+ *  NOTE(review): in the two mixed-width cases head and tail are never wrapped
+ *  explicitly; they rely on overflow of their log2Up-sized registers, which
+ *  only lines up with the storage sizes when those are powers of two --
+ *  confirm for other values of `n`.
+ *
+ *  @param inW  input word width
+ *  @param outW output word width
+ *  @param n    capacity, counted in output words
+ */
+class MultiWidthFifo(inW: Int, outW: Int, n: Int) extends Module {
+  val io = new Bundle {
+    val in = Decoupled(Bits(width = inW)).flip
+    val out = Decoupled(Bits(width = outW))
+    val count = UInt(OUTPUT, log2Up(n + 1))
+  }
+
+  if (inW == outW) {
+    val q = Module(new Queue(Bits(width = inW), n))
+    q.io.enq <> io.in
+    io.out <> q.io.deq
+    io.count := q.io.count
+  } else if (inW > outW) {
+    val nBeats = inW / outW
+
+    require(inW % outW == 0, s"MultiWidthFifo: in: $inW not divisible by out: $outW")
+    require(n % nBeats == 0, s"Cannot store $n output words when output beats is $nBeats")
+
+    // Storage is organised in input words; rdata is the same storage viewed
+    // as a flat vector of output-width slices.
+    val wdata = Reg(Vec(n / nBeats, Bits(width = inW)))
+    val rdata = Vec(wdata.flatMap { indat =>
+      (0 until nBeats).map(i => indat(outW * (i + 1) - 1, outW * i)) })
+
+    val head = Reg(init = UInt(0, log2Up(n / nBeats)))  // write index, in input words
+    val tail = Reg(init = UInt(0, log2Up(n)))           // read index, in output words
+    val size = Reg(init = UInt(0, log2Up(n + 1)))       // occupancy, in output words
+
+    when (io.in.fire()) {
+      wdata(head) := io.in.bits
+      head := head + UInt(1)
+    }
+
+    when (io.out.fire()) { tail := tail + UInt(1) }
+
+    // One input adds nBeats output words; one output removes a single word.
+    size := MuxCase(size, Seq(
+      (io.in.fire() && io.out.fire()) -> (size + UInt(nBeats - 1)),
+      io.in.fire() -> (size + UInt(nBeats)),
+      io.out.fire() -> (size - UInt(1))))
+
+    io.out.valid := size > UInt(0)
+    io.out.bits := rdata(tail)
+    // Accept an input word only when all nBeats output words it expands to
+    // fit. Testing `size < n` would accept a word while an entry is only
+    // partly drained, overwriting unread slices and pushing size past n.
+    io.in.ready := size <= UInt(n - nBeats)
+    io.count := size
+  } else {
+    val nBeats = outW / inW
+
+    require(outW % inW == 0, s"MultiWidthFifo: out: $outW not divisible by in: $inW")
+
+    // Storage is organised in input words; rdata groups nBeats consecutive
+    // input words into one output word, the earliest in the low bits.
+    val wdata = Reg(Vec(n * nBeats, Bits(width = inW)))
+    val rdata = Vec.tabulate(n) { i =>
+      Cat(wdata.slice(i * nBeats, (i + 1) * nBeats).reverse)}
+
+    val head = Reg(init = UInt(0, log2Up(n * nBeats)))      // write index, in input words
+    val tail = Reg(init = UInt(0, log2Up(n)))               // read index, in output words
+    val size = Reg(init = UInt(0, log2Up(n * nBeats + 1)))  // occupancy, in input words
+
+    when (io.in.fire()) {
+      wdata(head) := io.in.bits
+      head := head + UInt(1)
+    }
+
+    when (io.out.fire()) { tail := tail + UInt(1) }
+
+    // One input adds a single word; one output removes nBeats input words.
+    size := MuxCase(size, Seq(
+      (io.in.fire() && io.out.fire()) -> (size - UInt(nBeats - 1)),
+      io.in.fire() -> (size + UInt(1)),
+      io.out.fire() -> (size - UInt(nBeats))))
+
+    // Number of complete output words available.
+    // NOTE(review): the shift equals a division by nBeats only when nBeats
+    // is a power of two -- confirm.
+    io.count := size >> UInt(log2Up(nBeats))
+    io.out.valid := io.count > UInt(0)
+    io.out.bits := rdata(tail)
+    io.in.ready := size < UInt(n * nBeats)
+  }
+}