From f49172b5bc052622202ce5254d2ae38227a5773a Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sat, 29 Apr 2017 16:47:49 -0700 Subject: [PATCH 1/5] ScratchpadSlavePort doesn't support byte/halfword atomics --- src/main/scala/rocket/ScratchpadSlavePort.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/rocket/ScratchpadSlavePort.scala b/src/main/scala/rocket/ScratchpadSlavePort.scala index a96fd248..dc456557 100644 --- a/src/main/scala/rocket/ScratchpadSlavePort.scala +++ b/src/main/scala/rocket/ScratchpadSlavePort.scala @@ -22,8 +22,8 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L resources = device.reg, regionType = RegionType.UNCACHED, executable = true, - supportsArithmetic = if (usingAtomics) TransferSizes(1, coreDataBytes) else TransferSizes.none, - supportsLogical = if (usingAtomics) TransferSizes(1, coreDataBytes) else TransferSizes.none, + supportsArithmetic = if (usingAtomics) TransferSizes(4, coreDataBytes) else TransferSizes.none, + supportsLogical = if (usingAtomics) TransferSizes(4, coreDataBytes) else TransferSizes.none, supportsPutPartial = TransferSizes.none, // Can't support PutPartial supportsPutFull = TransferSizes(1, coreDataBytes), supportsGet = TransferSizes(1, coreDataBytes), From 044b6ed3f92439ddee20fb4cd78eae0e2eda3f5b Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sun, 30 Apr 2017 02:22:19 -0700 Subject: [PATCH 2/5] Improve logical ops in AMOALU As with integer ALU, shave off some muxing. --- src/main/scala/uncore/util/AmoAlu.scala | 31 +++++++++++++++++-------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/main/scala/uncore/util/AmoAlu.scala b/src/main/scala/uncore/util/AmoAlu.scala index c51f4ab3..b36c4446 100644 --- a/src/main/scala/uncore/util/AmoAlu.scala +++ b/src/main/scala/uncore/util/AmoAlu.scala @@ -70,9 +70,11 @@ class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parame else new StoreGen(io.typ, io.addr, io.rhs, operandBits/8) val rhs = storegen.wordData - val sgned = io.cmd === M_XA_MIN || io.cmd === M_XA_MAX val max = io.cmd === M_XA_MAX || io.cmd === M_XA_MAXU val min = io.cmd === M_XA_MIN || io.cmd === M_XA_MINU + val add = io.cmd === M_XA_ADD + val logic_and = io.cmd === M_XA_OR || io.cmd === M_XA_AND + val logic_xor = io.cmd === M_XA_XOR || io.cmd === M_XA_OR val adder_out = if (operandBits == 32) io.lhs + rhs @@ -81,9 +83,15 @@ class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parame (io.lhs & mask) + (rhs & mask) } - val less = - if (operandBits == 32) Mux(io.lhs(31) === rhs(31), io.lhs < rhs, Mux(sgned, io.lhs(31), io.rhs(31))) - else { + val less = { + val sgned = { + val mask = M_XA_MIN ^ M_XA_MINU + (io.cmd & mask) === (M_XA_MIN & mask) + } + + if (operandBits == 32) { + Mux(io.lhs(31) === rhs(31), io.lhs < rhs, Mux(sgned, io.lhs(31), io.rhs(31))) + } else { val word = !io.typ(0) val cmp_lhs = Mux(word && !io.addr(2), io.lhs(31), io.lhs(63)) val cmp_rhs = Mux(word && !io.addr(2), rhs(31), rhs(63)) @@ -93,13 +101,16 @@ class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parame val lt = Mux(word, Mux(io.addr(2), lt_hi, lt_lo), lt_hi || eq_hi && lt_lo) Mux(cmp_lhs === cmp_rhs, lt, Mux(sgned, cmp_lhs, cmp_rhs)) } + } - val out = Mux(io.cmd === M_XA_ADD, adder_out, - Mux(io.cmd === M_XA_AND, io.lhs & rhs, - Mux(io.cmd === M_XA_OR, io.lhs | rhs, - Mux(io.cmd === M_XA_XOR, io.lhs ^ rhs, - Mux(Mux(less, min, max), io.lhs, - storegen.data))))) + val minmax = Mux(Mux(less, min, 
max), io.lhs, storegen.data) + val logic = + Mux(logic_and, io.lhs & rhs, 0.U) | + Mux(logic_xor, io.lhs ^ rhs, 0.U) + val out = + Mux(add, adder_out, + Mux(logic_and || logic_xor, logic, + minmax)) val wmask = FillInterleaved(8, storegen.mask) io.out := wmask & out | ~wmask & io.lhs From f8151ce786d63947c80eb13ceed29be1acd8decd Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Mon, 1 May 2017 17:36:39 -0700 Subject: [PATCH 3/5] Remove subword load muxing in ScratchpadSlavePort --- src/main/scala/rocket/DCache.scala | 1 + src/main/scala/rocket/HellaCache.scala | 1 + src/main/scala/rocket/NBDcache.scala | 1 + src/main/scala/rocket/ScratchpadSlavePort.scala | 10 ++-------- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index 67163a6f..5562d769 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -517,6 +517,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val loadgen = new LoadGen(s2_req.typ, mtSigned(s2_req.typ), s2_req.addr, s2_data_word, s2_sc, wordBytes) io.cpu.resp.bits.data := loadgen.data | s2_sc_fail io.cpu.resp.bits.data_word_bypass := loadgen.wordData + io.cpu.resp.bits.data_raw := s2_data_word io.cpu.resp.bits.store_data := pstore1_data // AMOs diff --git a/src/main/scala/rocket/HellaCache.scala b/src/main/scala/rocket/HellaCache.scala index 70072321..f50bfa0b 100644 --- a/src/main/scala/rocket/HellaCache.scala +++ b/src/main/scala/rocket/HellaCache.scala @@ -103,6 +103,7 @@ class HellaCacheResp(implicit p: Parameters) extends CoreBundle()(p) val replay = Bool() val has_data = Bool() val data_word_bypass = Bits(width = coreDataBits) + val data_raw = Bits(width = coreDataBits) val store_data = Bits(width = coreDataBits) } diff --git a/src/main/scala/rocket/NBDcache.scala b/src/main/scala/rocket/NBDcache.scala index ebb287f2..5ca57d76 100644 --- a/src/main/scala/rocket/NBDcache.scala +++ b/src/main/scala/rocket/NBDcache.scala @@ -972,6 +972,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule io.cpu.s2_nack := s2_valid && s2_nack io.cpu.resp := Mux(mshrs.io.resp.ready, uncache_resp, cache_resp) io.cpu.resp.bits.data_word_bypass := loadgen.wordData + io.cpu.resp.bits.data_raw := s2_data_word io.cpu.ordered := mshrs.io.fence_rdy && !s1_valid && !s2_valid io.cpu.replay_next := (s1_replay && s1_read) || mshrs.io.replay_next diff --git a/src/main/scala/rocket/ScratchpadSlavePort.scala b/src/main/scala/rocket/ScratchpadSlavePort.scala index dc456557..76b8ea10 100644 --- a/src/main/scala/rocket/ScratchpadSlavePort.scala +++ b/src/main/scala/rocket/ScratchpadSlavePort.scala @@ -48,7 +48,7 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L when (io.dmem.req.fire()) { state := s_wait } val acq = Reg(tl_in.a.bits) - when (io.dmem.resp.valid) { acq.data := io.dmem.resp.bits.data } + when (io.dmem.resp.valid) { acq.data := io.dmem.resp.bits.data_raw } when (tl_in.a.fire()) { acq := tl_in.a.bits } def formCacheReq(a: TLBundleA) = { @@ -85,17 +85,11 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L io.dmem.s1_kill := false io.dmem.invalidate_lr := false - // place AMO data in correct word lane - val minAMOBytes = 4 - val grantData = Mux(io.dmem.resp.valid, io.dmem.resp.bits.data, acq.data) - val alignedGrantData = - Mux(edge.hasData(acq) && (acq.size <= log2Ceil(minAMOBytes)), Fill(coreDataBytes/minAMOBytes, grantData(8*minAMOBytes-1, 0)), 
grantData) - tl_in.d.valid := io.dmem.resp.valid || state === s_grant tl_in.d.bits := Mux(acq.opcode === TLMessages.PutFullData, edge.AccessAck(acq, UInt(0)), edge.AccessAck(acq, UInt(0), UInt(0))) - tl_in.d.bits.data := alignedGrantData + tl_in.d.bits.data := Mux(io.dmem.resp.valid, io.dmem.resp.bits.data_raw, acq.data) // Tie off unused channels tl_in.b.valid := Bool(false) From 938b089543c3b27d50bd13e41be82363b4e423ae Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Tue, 2 May 2017 01:59:47 -0700 Subject: [PATCH 4/5] Remove legacy devices that use AMOALU I'm going to change the AMOALU API, and so I'm removing dependent dead code. --- src/main/scala/uncore/converters/Ahb.scala | 423 ------------------ .../scala/uncore/converters/Tilelink.scala | 181 -------- src/main/scala/uncore/devices/Bram.scala | 187 -------- 3 files changed, 791 deletions(-) delete mode 100644 src/main/scala/uncore/converters/Ahb.scala delete mode 100644 src/main/scala/uncore/devices/Bram.scala diff --git a/src/main/scala/uncore/converters/Ahb.scala b/src/main/scala/uncore/converters/Ahb.scala deleted file mode 100644 index 6d98af6d..00000000 --- a/src/main/scala/uncore/converters/Ahb.scala +++ /dev/null @@ -1,423 +0,0 @@ -// See LICENSE.SiFive for license details. - -package uncore.converters - -import Chisel._ -import junctions._ -import uncore.tilelink._ -import uncore.util._ -import uncore.constants._ -import config._ -import HastiConstants._ - -/* We need to translate TileLink requests into operations we can actually execute on AHB. - * The general plan of attack is: - * get => one AHB=>TL read - * put => [multiple AHB write fragments=>nill], one AHB write=>TL - * getBlock => AHB burst reads =>TL - * putBlock => AHB burst writes=>TL - * getPrefetch => noop=>TL - * putPrefetch => noop=>TL - * putAtomic => one AHB=>TL read, one idle, one AHB atom_write=>nill, one idle - * - * This requires that we support a pipeline of optional AHB requests with optional TL responses - */ -class AHBRequestIO(implicit p: Parameters) extends HastiMasterIO - with HasGrantType - with HasClientTransactionId - with HasTileLinkBeatId { - val executeAHB = Bool() - val respondTL = Bool() - val latchAtom = Bool() - val firstBurst = Bool() - val finalBurst = Bool() - val cmd = Bits(width = M_SZ) // atomic op -} - -// AHB stage1: translate TileLink Acquires into AHBRequests -class AHBTileLinkIn(supportAtomics: Boolean = false)(implicit val p: Parameters) extends Module - with HasHastiParameters - with HasTileLinkParameters { - val io = new Bundle { - val acquire = new DecoupledIO(new Acquire).flip // NOTE: acquire must be either a Queue or a Pipe - val request = new DecoupledIO(new AHBRequestIO) - } - - // Match the AHB burst with a TileLink {Put,Get}Block - val burstSize = tlDataBeats match { - case 1 => HBURST_SINGLE - // case 2 not supported by AHB - case 4 => HBURST_WRAP4 - case 8 => HBURST_WRAP8 - case 16 => HBURST_WRAP16 - case _ => throw new java.lang.AssertionError("TileLink beats unsupported by AHB") - } - - // Bursts start at 0 and wrap-around back to 0 - val finalBurst = UInt(tlDataBeats-1, width = log2Up(tlDataBeats)).asUInt - val firstBurst = UInt(0, width = log2Up(tlDataBeats)) - val next_wmask = Wire(UInt(width = tlDataBytes)) // calculated below - - // State variables for processing more complicated TileLink Acquires - val s_atom_r :: s_atom_idle1 :: s_atom_w :: s_atom_idle2 :: Nil = Enum(UInt(), 4) - val atom_state = Reg(init = s_atom_r) // never changes if !supportAtomics - val done_wmask = Reg(init = UInt(0, width = 
tlDataBytes)) - val burst = Reg(init = firstBurst) - - // Grab some view of the TileLink acquire - val acq_wmask = io.acquire.bits.wmask() - val isReadBurst = io.acquire.bits.is(Acquire.getBlockType) - val isWriteBurst = io.acquire.bits.is(Acquire.putBlockType) - val isBurst = isWriteBurst || isReadBurst - val isAtomic = io.acquire.bits.is(Acquire.putAtomicType) && Bool(supportAtomics) - val isPut = io.acquire.bits.is(Acquire.putType) - - // Final states? - val last_wmask = next_wmask === acq_wmask - val last_atom = atom_state === s_atom_idle2 - val last_burst = burst === finalBurst - - // Block the incoming request until we've fully consumed it - // NOTE: the outgoing grant.valid may happen while acquire.ready is still false; - // for this reason it is essential to have a Queue or a Pipe infront of acquire - io.acquire.ready := io.request.ready && MuxLookup(io.acquire.bits.a_type, Bool(true), Array( - Acquire.getType -> Bool(true), - Acquire.getBlockType -> last_burst, // hold it until the last beat is burst - Acquire.putType -> last_wmask, // only accept the put if we can fully consume its wmask - Acquire.putBlockType -> Bool(true), - Acquire.putAtomicType -> last_atom, // atomic operation stages complete - Acquire.getPrefetchType -> Bool(true), - Acquire.putPrefetchType -> Bool(true))) - - // Advance the fragment state - when (io.request.ready && io.acquire.valid && isPut) { - when (last_wmask) { // if this was the last fragment, restart FSM - done_wmask := UInt(0) - } .otherwise { - done_wmask := next_wmask - } - } - - // Advance the burst state - // We assume here that TileLink gives us all putBlock beats with nothing between them - when (io.request.ready && io.acquire.valid && isBurst) { - when (last_burst) { - burst := UInt(0) - } .otherwise { - burst := burst + UInt(1) - } - } - - // Advance the atomic state machine - when (io.request.ready && io.acquire.valid && isAtomic) { - switch (atom_state) { - is (s_atom_r) { atom_state := s_atom_idle1 } - is (s_atom_idle1) { atom_state := s_atom_w } // idle1 => AMOALU runs on a different clock than AHB slave read - is (s_atom_w) { atom_state := s_atom_idle2 } - is (s_atom_idle2) { atom_state := s_atom_r } // idle2 state is required by AHB after hmastlock is lowered - } - } - - // Returns (range=0, range=-1, aligned_wmask, size) - def mask_helper(in_0 : Bool, range : UInt): (Bool, Bool, UInt, UInt) = { - val len = range.getWidth - if (len == 1) { - (range === UInt(0), range === UInt(1), in_0.asUInt() & range, UInt(0)) - } else { - val mid = len / 2 - val lo = range(mid-1, 0) - val hi = range(len-1, mid) - val (lo_0, lo_1, lo_m, lo_s) = mask_helper(in_0, lo) - val (hi_0, hi_1, hi_m, hi_s) = mask_helper(in_0 && lo_0, hi) - val out_0 = lo_0 && hi_0 - val out_1 = lo_1 && hi_1 - val out_m = Cat(hi_m, lo_m) | Fill(len, (in_0 && out_1).asUInt()) - val out_s = Mux(out_1, UInt(log2Up(len)), Mux(lo_0, hi_s, lo_s)) - (out_0, out_1, out_m, out_s) - } - } - - val pending_wmask = acq_wmask & ~done_wmask - val put_addr = PriorityEncoder(pending_wmask) - val (wmask_0, _, exec_wmask, put_size) = mask_helper(Bool(true), pending_wmask) - next_wmask := done_wmask | exec_wmask - - // Calculate the address, with consideration to put fragments and bursts - val addr_block = io.acquire.bits.addr_block - val addr_beatin= io.acquire.bits.addr_beat - val addr_burst = Mux(isReadBurst, addr_beatin + burst, addr_beatin) - val addr_byte = Mux(isPut, put_addr, io.acquire.bits.addr_byte()) - val addr_beat = Mux(isWriteBurst, UInt(0), addr_burst) - val ahbAddr = 
Cat(addr_block, addr_burst, addr_byte) - val ahbSize = Mux(isPut, put_size, Mux(isBurst, UInt(log2Ceil(tlDataBytes)), io.acquire.bits.op_size())) - - val ahbBurst = MuxLookup(io.acquire.bits.a_type, HBURST_SINGLE, Array( - Acquire.getType -> HBURST_SINGLE, - Acquire.getBlockType -> burstSize, - Acquire.putType -> HBURST_SINGLE, - Acquire.putBlockType -> burstSize, - Acquire.putAtomicType -> HBURST_SINGLE, - Acquire.getPrefetchType -> HBURST_SINGLE, - Acquire.putPrefetchType -> HBURST_SINGLE)) - - val ahbWrite = MuxLookup(io.acquire.bits.a_type, Bool(false), Array( - Acquire.getType -> Bool(false), - Acquire.getBlockType -> Bool(false), - Acquire.putType -> Bool(true), - Acquire.putBlockType -> Bool(true), - Acquire.putAtomicType -> MuxLookup(atom_state, Bool(false), Array( - s_atom_r -> Bool(false), - s_atom_idle1 -> Bool(false), // don't care - s_atom_w -> Bool(true), - s_atom_idle2 -> Bool(true))), // don't care - Acquire.getPrefetchType -> Bool(false), // don't care - Acquire.putPrefetchType -> Bool(true))) // don't care - - val ahbExecute = MuxLookup(io.acquire.bits.a_type, Bool(false), Array( - Acquire.getType -> Bool(true), - Acquire.getBlockType -> Bool(true), - Acquire.putType -> !wmask_0, // handle the case of a Put with no bytes! - Acquire.putBlockType -> Bool(true), - Acquire.putAtomicType -> MuxLookup(atom_state, Bool(false), Array( - s_atom_r -> Bool(true), - s_atom_idle1 -> Bool(false), - s_atom_w -> Bool(true), - s_atom_idle2 -> Bool(false))), - Acquire.getPrefetchType -> Bool(false), - Acquire.putPrefetchType -> Bool(false))) - - val respondTL = MuxLookup(io.acquire.bits.a_type, Bool(false), Array( - Acquire.getType -> Bool(true), - Acquire.getBlockType -> Bool(true), - Acquire.putType -> last_wmask, - Acquire.putBlockType -> last_burst, - Acquire.putAtomicType -> MuxLookup(atom_state, Bool(false), Array( - s_atom_r -> Bool(true), // they want the old data - s_atom_idle1 -> Bool(false), - s_atom_w -> Bool(false), - s_atom_idle2 -> Bool(false))), - Acquire.getPrefetchType -> Bool(true), - Acquire.putPrefetchType -> Bool(true))) - - io.request.valid := io.acquire.valid - io.request.bits.htrans := HTRANS_IDLE // unused/ignored - io.request.bits.haddr := ahbAddr - io.request.bits.hmastlock := isAtomic && atom_state =/= s_atom_idle2 - io.request.bits.hwrite := ahbWrite - io.request.bits.hburst := ahbBurst - io.request.bits.hsize := ahbSize - io.request.bits.hprot := HPROT_DATA | HPROT_PRIVILEGED - io.request.bits.hwdata := io.acquire.bits.data - io.request.bits.executeAHB := ahbExecute - io.request.bits.respondTL := respondTL - io.request.bits.latchAtom := isAtomic && atom_state === s_atom_r - io.request.bits.firstBurst := burst === firstBurst - io.request.bits.finalBurst := burst === finalBurst || !isBurst - io.request.bits.cmd := io.acquire.bits.op_code() - io.request.bits.is_builtin_type := Bool(true) - io.request.bits.g_type := io.acquire.bits.getBuiltInGrantType() - io.request.bits.client_xact_id := io.acquire.bits.client_xact_id - io.request.bits.addr_beat := addr_beat - - val debugBurst = Reg(UInt()) - when (io.request.valid) { - debugBurst := addr_burst - burst - } - - // We only support built-in TileLink requests - assert(!io.acquire.valid || io.acquire.bits.is_builtin_type, "AHB bridge only supports builtin TileLink types") - // Ensure alignment of address to size - assert(!io.acquire.valid || (ahbAddr & ((UInt(1) << ahbSize) - UInt(1))) === UInt(0), "TileLink operation misaligned") - // If this is a putBlock, make sure it moves properly - assert(!io.acquire.valid || 
!isBurst || burst === firstBurst || debugBurst === addr_burst - burst, "TileLink putBlock beats not sequential") - // We better not get an incomplete TileLink acquire - assert(!io.acquire.valid || isBurst || burst === firstBurst, "TileLink never completed a putBlock") - // If we disabled atomic support, we better not see a request - assert(!io.acquire.bits.is(Acquire.putAtomicType) || Bool(supportAtomics)) -} - -// AHB stage2: execute AHBRequests -class AHBBusMaster(supportAtomics: Boolean = false)(implicit val p: Parameters) extends Module - with HasHastiParameters - with HasTileLinkParameters { - val io = new Bundle { - val request = new DecoupledIO(new AHBRequestIO).flip - val grant = new DecoupledIO(new Grant) - val ahb = new HastiMasterIO() - } - - // All AHB outputs are registered (they might be IOs) - val midBurst = Reg(init = Bool(false)) - val htrans = Reg(init = HTRANS_IDLE) - val haddr = Reg(UInt()) - val hmastlock = Reg(init = Bool(false)) - val hwrite = Reg(Bool()) - val hburst = Reg(UInt()) - val hsize = Reg(init = UInt(0, width = SZ_HSIZE)) - val hprot = Reg(UInt()) - val hwdata0 = Reg(Bits()) - val hwdata1 = Reg(Bits()) - val hrdata = Reg(Bits()) - - io.ahb.htrans := htrans - io.ahb.haddr := haddr - io.ahb.hmastlock := hmastlock - io.ahb.hwrite := hwrite - io.ahb.hburst := hburst - io.ahb.hsize := hsize - io.ahb.hprot := hprot - io.ahb.hwdata := hwdata1 // one cycle after the address phase - - // TileLink response data needed in data phase - val respondTL0 = Reg(init = Bool(false)) - val respondTL1 = Reg(init = Bool(false)) - val latchAtom0 = Reg(init = Bool(false)) - val latchAtom1 = Reg(init = Bool(false)) - val executeAHB0 = Reg(init = Bool(false)) - val executeAHB1 = Reg(init = Bool(false)) - val bubble = Reg(init = Bool(true)) // nothing useful in address phase - val cmd = Reg(Bits()) - val g_type0 = Reg(UInt()) - val g_type1 = Reg(UInt()) - val client_xact_id0 = Reg(Bits()) - val client_xact_id1 = Reg(Bits()) - val addr_beat0 = Reg(UInt()) - val addr_beat1 = Reg(UInt()) - val grant1 = Reg(new Grant) - - // It is allowed to progress from Idle/Busy during a wait state - val addrReady = io.ahb.hready || bubble || (!executeAHB1 && !executeAHB0) - val dataReady = io.ahb.hready || !executeAHB1 - - // Only accept a new AHBRequest if we have enough buffer space in the pad - // to accomodate a persistent drop in TileLink's grant.ready - io.request.ready := addrReady && io.grant.ready - - // htrans must be updated even if no request is valid - when (addrReady) { - when (io.request.fire() && io.request.bits.executeAHB) { - midBurst := !io.request.bits.finalBurst - when (io.request.bits.firstBurst) { - htrans := HTRANS_NONSEQ - } .otherwise { - htrans := HTRANS_SEQ - } - } .otherwise { - when (midBurst) { - htrans := HTRANS_BUSY - } .otherwise { - htrans := HTRANS_IDLE - } - } - } - - // Address phase, clear repondTL when we have nothing to do - when (addrReady) { - when (io.request.fire()) { - respondTL0 := io.request.bits.respondTL - latchAtom0 := io.request.bits.latchAtom - executeAHB0:= io.request.bits.executeAHB - bubble := Bool(false) - } .otherwise { - respondTL0 := Bool(false) - latchAtom0 := Bool(false) - executeAHB0:= Bool(false) - bubble := Bool(true) // an atom-injected Idle is not a bubble! 
- } - } - - // Transfer bulk address phase - when (io.request.fire()) { - haddr := io.request.bits.haddr - hmastlock := io.request.bits.hmastlock - hwrite := io.request.bits.hwrite - hburst := io.request.bits.hburst - hsize := io.request.bits.hsize - hprot := io.request.bits.hprot - hwdata0 := io.request.bits.hwdata - cmd := io.request.bits.cmd - g_type0 := io.request.bits.g_type - client_xact_id0 := io.request.bits.client_xact_id - addr_beat0 := io.request.bits.addr_beat - } - - // Execute Atomic ops; unused and optimized away if !supportAtomics - val amo_p = p.alterPartial({ - case CacheBlockOffsetBits => hastiAddrBits - }) - val alu = Module(new AMOALU(hastiDataBits, rhsIsAligned = true)(amo_p)) - alu.io.addr := haddr - alu.io.cmd := cmd - alu.io.typ := hsize - alu.io.rhs := hwdata0 - alu.io.lhs := hrdata - - // Transfer bulk data phase - when (dataReady) { - when (addrReady) { - respondTL1 := respondTL0 - latchAtom1 := latchAtom0 - executeAHB1 := executeAHB0 - } .otherwise { - respondTL1 := Bool(false) - latchAtom1 := Bool(false) - executeAHB1 := Bool(false) - } - hwdata1 := Mux(Bool(supportAtomics), alu.io.out, hwdata0) - g_type1 := g_type0 - client_xact_id1 := client_xact_id0 - addr_beat1 := addr_beat0 - } - - // Latch the read result for an atomic operation - when (dataReady && latchAtom1) { - hrdata := io.ahb.hrdata - } - - // Only issue TL grant when the slave has provided data - io.grant.valid := dataReady && respondTL1 - io.grant.bits := Grant( - is_builtin_type = Bool(true), - g_type = g_type1, - client_xact_id = client_xact_id1, - manager_xact_id = UInt(0), - addr_beat = addr_beat1, - data = io.ahb.hrdata) - - // We cannot support errors from AHB to TileLink - assert(!io.ahb.hresp, "AHB hresp error detected and cannot be reported via TileLink") -} - -class AHBBridge(supportAtomics: Boolean = true)(implicit val p: Parameters) extends Module - with HasHastiParameters - with HasTileLinkParameters { - val io = new Bundle { - val tl = new ClientUncachedTileLinkIO().flip - val ahb = new HastiMasterIO() - } - - // Hasti and TileLink widths must agree at this point in the topology - require (tlDataBits == hastiDataBits) - require (p(rocket.PAddrBits) == hastiAddrBits) - - // AHB does not permit bursts to cross a 1KB boundary - require (tlDataBits * tlDataBeats <= 1024*8) - // tlDataBytes must be a power of 2 - require (1 << log2Ceil(tlDataBytes) == tlDataBytes) - - // Create the sub-blocks - val fsm = Module(new AHBTileLinkIn(supportAtomics)) - val bus = Module(new AHBBusMaster(supportAtomics)) - val pad = Module(new Queue(new Grant, 4)) - - fsm.io.acquire <> Queue(io.tl.acquire, 2) // Pipe is also acceptable - bus.io.request <> fsm.io.request - io.ahb <> bus.io.ahb - io.tl.grant <> pad.io.deq - - // The pad is needed to absorb AHB progress while !grant.ready - // We are only 'ready' if the pad has at least 3 cycles of space - bus.io.grant.ready := pad.io.count <= UInt(1) - pad.io.enq.bits := bus.io.grant.bits - pad.io.enq.valid := bus.io.grant.valid -} diff --git a/src/main/scala/uncore/converters/Tilelink.scala b/src/main/scala/uncore/converters/Tilelink.scala index 3aa0ec16..a5572c91 100644 --- a/src/main/scala/uncore/converters/Tilelink.scala +++ b/src/main/scala/uncore/converters/Tilelink.scala @@ -9,7 +9,6 @@ import rocket.PAddrBits import uncore.tilelink._ import uncore.util._ import uncore.constants._ -import uncore.devices.TileLinkTestRAM import unittest.UnitTest import config._ @@ -604,183 +603,3 @@ class TileLinkIONarrower(innerTLId: String, outerTLId: String) sending_get 
:= Bool(false) } } - -class TileLinkWidthAdapterTest(implicit p: Parameters) extends UnitTest { - val narrowConfig = p(TLKey(p(TLId))) - val wideConfig = narrowConfig.copy( - dataBeats = narrowConfig.dataBeats / 2) - val adapterParams = p.alterPartial({ case TLKey("WIDE") => wideConfig }) - - val depth = 2 * narrowConfig.dataBeats - val ram = Module(new TileLinkTestRAM(depth)) - val driver = Module(new DriverSet( - (driverParams: Parameters) => { - implicit val p = driverParams - Seq( - Module(new PutSweepDriver(depth)), - Module(new PutMaskDriver), - Module(new PutAtomicDriver), - Module(new PutBlockSweepDriver(depth / narrowConfig.dataBeats)), - Module(new PrefetchDriver), - Module(new GetMultiWidthDriver)) - })) - val widener = Module(new TileLinkIOWidener(p(TLId), "WIDE")(adapterParams)) - val narrower = Module(new TileLinkIONarrower("WIDE", p(TLId))(adapterParams)) - - widener.io.in <> driver.io.mem - narrower.io.in <> widener.io.out - ram.io <> narrower.io.out - driver.io.start := io.start - io.finished := driver.io.finished -} - -class TileLinkFragmenterSource(implicit p: Parameters) extends TLModule()(p) { - val io = new Bundle { - val in = Decoupled(new Acquire).flip - val out = Decoupled(new Acquire) - val que = Decoupled(UInt(width = tlBeatAddrBits)) - } - - // Pipeline stage with acquire data; needed to ensure in.bits stay fixed when !in.ready - val acq_valid = RegInit(Bool(false)) - val acq_bits = Reg(new Acquire) - // The last beat of generate acquire to send - val acq_last_beat = Reg(UInt(width = tlBeatAddrBits)) - val acq_last = acq_bits.addr_beat === acq_last_beat - - // 'in' has the first beat? - val in_multi_put = io.in.bits.isBuiltInType(Acquire.putBlockType) - val in_multi_get = io.in.bits.isBuiltInType(Acquire.getBlockType) - val in_first_beat = !in_multi_put || io.in.bits.addr_beat === UInt(0) - - // Move stuff from acq to out whenever out is ready - io.out.valid := acq_valid - // When can acq accept a request? - val acq_ready = !acq_valid || (acq_last && io.out.ready) - // Move the first beat from in to acq only when both acq and que are ready - io.in.ready := (!in_first_beat || io.que.ready) && acq_ready - io.que.valid := (in_first_beat && io.in.valid) && acq_ready - - // in.fire moves data from in to acq and (optionally) que - // out.fire moves data from acq to out - - // Desired flow control results: - assert (!io.que.fire() || io.in.fire()) // 1. que.fire => in.fire - assert (!(io.in.fire() && in_first_beat) || io.que.fire()) // 2. in.fire && in_first_beat => que.fire - assert (!io.out.fire() || acq_valid) // 3. out.fire => acq_valid - assert (!io.in.fire() || (!acq_valid || (io.out.fire() && acq_last))) // 4. in.fire => !acq_valid || (out.fire && acq_last) - // Proofs: - // 1. que.fire => que.ready && in.valid && acq_ready => in.ready && in.valid - // 2. in.fire && in_first_beat => in.valid && acq_ready && [(!in_first_beat || que.ready) && in_first_beat] => - // in.valid && acq_ready && que.ready && in_first_beat => que.valid && que.ready - // 3. out.fire => out.valid => acq_valid - // 4. 
in.fire => acq_ready => !acq_valid || (acq_last && out.ready) => - // !acq_valid || (acq_valid && acq_last && out.ready) => !acq_valid || (acq_last && out.fire) - - val multi_size = SInt(-1, width = tlBeatAddrBits).asUInt // TL2: use in.bits.size()/beatBits-1 - val in_sizeMinus1 = Mux(in_multi_get || in_multi_put, multi_size, UInt(0)) - val in_insertSizeMinus1 = Mux(in_multi_get, multi_size, UInt(0)) - - when (io.in.fire()) { - // Theorem 4 makes this safe; we overwrite garbage, or replace the final acq - acq_valid := Bool(true) - acq_bits := io.in.bits - acq_last_beat := io.in.bits.addr_beat + in_insertSizeMinus1 - // Replace this with size truncation in TL2: - acq_bits.a_type := Mux(in_multi_put, Acquire.putType, Mux(in_multi_get, Acquire.getType, io.in.bits.a_type)) - } .elsewhen (io.out.fire()) { - acq_valid := !acq_last // false => !in.valid || (!que.ready && in_first_beat) - acq_bits.addr_beat := acq_bits.addr_beat + UInt(1) - // acq_last && out.fire => acq_last && out.ready && acq_valid => acq_ready - // Suppose in.valid, then !in.fire => !in.ready => !(!in_first_beat || que.ready) => !que.ready && in_first_beat - } - - // Safe by theorem 3 - io.out.bits := acq_bits - // Safe by theorem 1 - io.que.bits := in_sizeMinus1 -} - -class TileLinkFragmenterSink(implicit p: Parameters) extends TLModule()(p) { - val io = new Bundle { - val in = Decoupled(new Grant).flip - val out = Decoupled(new Grant) - val que = Decoupled(UInt(width = tlBeatAddrBits)).flip - } - - val count_valid = RegInit(Bool(false)) - val multi_op = Reg(Bool()) - val count_bits = Reg(UInt(width = tlBeatAddrBits)) - val last = count_bits === UInt(0) - - val in_put = io.in.bits.isBuiltInType(Grant.putAckType) - val in_get = io.in.bits.isBuiltInType(Grant.getDataBeatType) - val deliver = last || in_get - - // Accept the input, discarding the non-final put grant - io.in.ready := count_valid && (io.out.ready || !deliver) - // Output the grant whenever we want delivery - io.out.valid := count_valid && io.in.valid && deliver - // Take a new number whenever we deliver the last beat - io.que.ready := !count_valid || (io.in.valid && io.out.ready && last) - - // Desired flow control results: - assert (!io.out.fire() || (count_valid && io.in.fire())) // 1. out.fire => in.fire && count_valid - assert (!(io.in.fire() && deliver) || io.out.fire()) // 2. in.fire && deliver => out.fire - assert (!(io.out.fire() && last) || io.que.ready) // 3. out.fire && last => que.ready - assert (!io.que.fire() || (!count_valid || io.out.fire())) // 4. que.fire => !count_valid || (out.fire && last) - // Proofs: - // 1. out.fire => out.ready && (count_valid && in.valid && deliver) => (count_valid && out.ready) && in.valid => in.fire - // 2. in.fire && deliver => in.valid && count_valid && [(out.ready || !deliver) && deliver] => - // in.valid && count_valid && deliver && out.ready => out.fire - // 3. out.fire && last => out.valid && out.ready && last => in.valid && out.ready && last => que.ready - // 4. 
que.fire => que.valid && (!count_valid || (in.valid && out.ready && last)) - // => !count_valid || (count_valid && in.valid && out.ready && [last => deliver]) - // => !count_valid || (out.valid && out.ready && last) - - when (io.que.fire()) { - // Theorem 4 makes this safe; we overwrite garbage or last output - count_valid := Bool(true) - count_bits := io.que.bits - multi_op := io.que.bits =/= UInt(0) - } .elsewhen (io.in.fire()) { - count_valid := !last // false => !que.valid - count_bits := count_bits - UInt(1) - // Proof: in.fire && [last => deliver] =2=> out.fire && last =3=> que.ready - // !que.fire && que.ready => !que.valid - } - - // Safe by Theorem 1 - io.out.bits := io.in.bits - io.out.bits.g_type := Mux(multi_op, Mux(in_get, Grant.getDataBlockType, Grant.putAckType), io.in.bits.g_type) -} - -class TileLinkFragmenter(depth: Int = 1)(implicit p: Parameters) extends TLModule()(p) { - val io = new Bundle { - val in = new ClientUncachedTileLinkIO().flip - val out = new ClientUncachedTileLinkIO - } - - // TL2: - // supportsAcquire = false - // modify all outward managers to supportsMultibeat = true - // assert: all managers must behaveFIFO (not inspect duplicated id field) - - val source = Module(new TileLinkFragmenterSource) - val sink = Module(new TileLinkFragmenterSink) - sink.io.que <> Queue(source.io.que, depth) - - source.io.in <> io.in.acquire - io.out.acquire <> source.io.out - sink.io.in <> io.out.grant - io.in.grant <> sink.io.out -} - -object TileLinkFragmenter { - // Pass the source/client to fragment - def apply(source: ClientUncachedTileLinkIO, depth: Int = 1): ClientUncachedTileLinkIO = { - val fragmenter = Module(new TileLinkFragmenter(depth)(source.p)) - fragmenter.io.in <> source - fragmenter.io.out - } -} diff --git a/src/main/scala/uncore/devices/Bram.scala b/src/main/scala/uncore/devices/Bram.scala deleted file mode 100644 index dfaee663..00000000 --- a/src/main/scala/uncore/devices/Bram.scala +++ /dev/null @@ -1,187 +0,0 @@ -// See LICENSE.SiFive for license details. -// See LICENSE.Berkeley for license details. 
- -package uncore.devices - -import Chisel._ -import config._ -import unittest.UnitTest -import junctions._ -import uncore.tilelink._ -import uncore.util._ -import util._ -import HastiConstants._ - -class BRAMSlave(depth: Int)(implicit val p: Parameters) extends Module - with HasTileLinkParameters { - val io = new ClientUncachedTileLinkIO().flip - - // For TL2: - // supportsAcquire = false - // supportsMultibeat = false - // supportsHint = false - // supportsAtomic = false - - // Timing-wise, we assume the input is coming out of registers - // since you probably needed a TileLinkFragmenter infront of us - - // Thus, only one pipeline stage: the grant result - val g_valid = RegInit(Bool(false)) - val g_bits = Reg(new Grant) - - // Just pass the pipeline straight through - io.grant.valid := g_valid - io.grant.bits := g_bits - io.acquire.ready := !g_valid || io.grant.ready - - val acq_get = io.acquire.bits.isBuiltInType(Acquire.getType) - val acq_put = io.acquire.bits.isBuiltInType(Acquire.putType) - val acq_addr = Cat(io.acquire.bits.addr_block, io.acquire.bits.addr_beat) - - val bram = Mem(depth, Bits(width = tlDataBits)) - - val ren = acq_get && io.acquire.fire() - val wen = acq_put && io.acquire.fire() - - when (io.grant.fire()) { - g_valid := Bool(false) - } - - when (io.acquire.fire()) { - g_valid := Bool(true) - g_bits := Grant( - is_builtin_type = Bool(true), - g_type = io.acquire.bits.getBuiltInGrantType(), - client_xact_id = io.acquire.bits.client_xact_id, - manager_xact_id = UInt(0), - addr_beat = io.acquire.bits.addr_beat, - data = UInt(0)) - } - - when (wen) { - bram.write(acq_addr, io.acquire.bits.data) - assert(io.acquire.bits.wmask().andR, "BRAMSlave: partial write masks not supported") - } - io.grant.bits.data := RegEnable(bram.read(acq_addr), ren) -} - -class HastiRAM(depth: Int)(implicit p: Parameters) extends HastiModule()(p) { - val io = new HastiSlaveIO - - val wdata = Vec.tabulate(hastiDataBytes)(i => io.hwdata(8*(i+1)-1,8*i)) - val waddr = Reg(UInt(width = hastiAddrBits)) - val wvalid = Reg(init = Bool(false)) - val wsize = Reg(UInt(width = SZ_HSIZE)) - val ram = SeqMem(depth, Vec(hastiDataBytes, Bits(width = 8))) - - val max_size = log2Ceil(hastiDataBytes) - val wmask_lut = MuxLookup(wsize, SInt(-1, hastiDataBytes).asUInt, - (0 until max_size).map(sz => (UInt(sz) -> UInt((1 << (1 << sz)) - 1)))) - val wmask = (wmask_lut << waddr(max_size - 1, 0))(hastiDataBytes - 1, 0) - - val is_trans = io.hsel && io.htrans.isOneOf(HTRANS_NONSEQ, HTRANS_SEQ) - val raddr = io.haddr >> UInt(max_size) - val ren = is_trans && !io.hwrite - val bypass = Reg(init = Bool(false)) - - when (is_trans && io.hwrite) { - waddr := io.haddr - wsize := io.hsize - wvalid := Bool(true) - } .otherwise { wvalid := Bool(false) } - - when (ren) { bypass := wvalid && (waddr >> UInt(max_size)) === raddr } - - when (wvalid) { - ram.write(waddr >> UInt(max_size), wdata, wmask.toBools) - } - - val rdata = ram.read(raddr, ren) - io.hrdata := Cat(rdata.zip(wmask.toBools).zip(wdata).map { - case ((rbyte, wsel), wbyte) => Mux(wsel && bypass, wbyte, rbyte) - }.reverse) - - io.hready := Bool(true) - io.hresp := HRESP_OKAY -} - -/** - * This RAM is not meant to be particularly performant. - * It just supports the entire range of uncached TileLink operations in the - * simplest way possible. 
- */ -class TileLinkTestRAM(depth: Int)(implicit val p: Parameters) extends Module - with HasTileLinkParameters { - val io = new ClientUncachedTileLinkIO().flip - - val ram = Mem(depth, UInt(width = tlDataBits)) - - val responding = Reg(init = Bool(false)) - val acq = io.acquire.bits - val r_acq = Reg(io.acquire.bits) - val acq_addr = Cat(acq.addr_block, acq.addr_beat) - val r_acq_addr = Cat(r_acq.addr_block, r_acq.addr_beat) - - when (io.acquire.fire() && io.acquire.bits.last()) { - r_acq := io.acquire.bits - responding := Bool(true) - } - - when (io.grant.fire()) { - val is_getblk = r_acq.isBuiltInType(Acquire.getBlockType) - val last_beat = r_acq.addr_beat === UInt(tlDataBeats - 1) - when (is_getblk && !last_beat) { - r_acq.addr_beat := r_acq.addr_beat + UInt(1) - } .otherwise { responding := Bool(false) } - } - - val old_data = ram(acq_addr) - val new_data = acq.data - val r_old_data = RegEnable(old_data, io.acquire.fire()) - - io.acquire.ready := !responding - io.grant.valid := responding - io.grant.bits := Grant( - is_builtin_type = Bool(true), - g_type = r_acq.getBuiltInGrantType(), - client_xact_id = r_acq.client_xact_id, - manager_xact_id = UInt(0), - addr_beat = r_acq.addr_beat, - data = Mux(r_acq.isAtomic(), r_old_data, ram(r_acq_addr))) - - val amo_shift_bits = acq.amo_shift_bytes() << UInt(3) - val amoalu = Module(new AMOALU(amoAluOperandBits, rhsIsAligned = true)) - amoalu.io.addr := Cat(acq.addr_block, acq.addr_beat, acq.addr_byte()) - amoalu.io.cmd := acq.op_code() - amoalu.io.typ := acq.op_size() - amoalu.io.lhs := old_data >> amo_shift_bits - amoalu.io.rhs := new_data >> amo_shift_bits - - val result = Mux(acq.isAtomic(), amoalu.io.out << amo_shift_bits, new_data) - val wmask = FillInterleaved(8, acq.wmask()) - - when (io.acquire.fire() && acq.hasData()) { - ram(acq_addr) := (old_data & ~wmask) | (result & wmask) - } -} - -class TileLinkRAMTest(implicit val p: Parameters) - extends UnitTest with HasTileLinkParameters { - - val depth = 2 * tlDataBeats - val ram = Module(new TileLinkTestRAM(depth)) - val driver = Module(new DriverSet( - (driverParams: Parameters) => { - implicit val p = driverParams - Seq( - Module(new PutSweepDriver(depth)), - Module(new PutMaskDriver), - Module(new PutAtomicDriver), - Module(new PutBlockSweepDriver(depth / tlDataBeats)), - Module(new PrefetchDriver), - Module(new GetMultiWidthDriver)) - })) - ram.io <> driver.io.mem - driver.io.start := io.start - io.finished := driver.io.finished -} From 3a1a37d41b874ad265f1403e8bcbb7dd87c30299 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Tue, 2 May 2017 03:04:41 -0700 Subject: [PATCH 5/5] Support PutPartial in ScratchpadSlavePort --- src/main/scala/rocket/DCache.scala | 25 +++++------ src/main/scala/rocket/HellaCache.scala | 7 ++- src/main/scala/rocket/IDecode.scala | 2 +- src/main/scala/rocket/NBDcache.scala | 24 +++++----- src/main/scala/rocket/Rocket.scala | 5 ++- .../scala/rocket/ScratchpadSlavePort.scala | 15 +++---- .../scala/rocket/SimpleHellaCacheIF.scala | 2 +- src/main/scala/tile/FPU.scala | 2 +- src/main/scala/uncore/Consts.scala | 5 ++- src/main/scala/uncore/util/AmoAlu.scala | 45 ++++++++----------- 10 files changed, 64 insertions(+), 68 deletions(-) diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index 5562d769..8e08b8d6 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -121,7 +121,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { tlb.io.req.bits.sfence.valid := s1_sfence 
tlb.io.req.bits.sfence.bits.rs1 := s1_req.typ(0) tlb.io.req.bits.sfence.bits.rs2 := s1_req.typ(1) - tlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data + tlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data.data tlb.io.req.bits.passthrough := s1_req.phys tlb.io.req.bits.vaddr := s1_req.addr tlb.io.req.bits.instruction := false @@ -155,6 +155,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { } val s1_data_way = Mux(inWriteback, releaseWay, s1_hit_way) val s1_data = Mux1H(s1_data_way, data.io.resp) // retime into s2 if critical + val s1_mask = Mux(s1_req.cmd === M_PWR, io.cpu.s1_data.mask, new StoreGen(s1_req.typ, s1_req.addr, UInt(0), wordBytes).mask) val s2_valid = Reg(next=s1_valid_masked && !s1_sfence, init=Bool(false)) && !io.cpu.s2_xcpt.asUInt.orR val s2_probe = Reg(next=s1_probe, init=Bool(false)) @@ -229,10 +230,10 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val pstore1_cmd = RegEnable(s1_req.cmd, s1_valid_not_nacked && s1_write) val pstore1_typ = RegEnable(s1_req.typ, s1_valid_not_nacked && s1_write) val pstore1_addr = RegEnable(s1_paddr, s1_valid_not_nacked && s1_write) - val pstore1_data = RegEnable(io.cpu.s1_data, s1_valid_not_nacked && s1_write) + val pstore1_data = RegEnable(io.cpu.s1_data.data, s1_valid_not_nacked && s1_write) val pstore1_way = RegEnable(s1_hit_way, s1_valid_not_nacked && s1_write) - val pstore1_storegen = new StoreGen(pstore1_typ, pstore1_addr, pstore1_data, wordBytes) - val pstore1_storegen_data = Wire(init = pstore1_storegen.data) + val pstore1_mask = RegEnable(s1_mask, s1_valid_not_nacked && s1_write) + val pstore1_storegen_data = Wire(init = pstore1_data) val pstore1_amo = Bool(usingAtomics) && isRead(pstore1_cmd) val pstore_drain_structural = pstore1_valid && pstore2_valid && ((s1_valid && s1_write) || pstore1_amo) val pstore_drain_opportunistic = !(io.cpu.req.valid && isRead(io.cpu.req.bits.cmd)) @@ -252,21 +253,20 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val pstore2_addr = RegEnable(pstore1_addr, advance_pstore1) val pstore2_way = RegEnable(pstore1_way, advance_pstore1) val pstore2_storegen_data = RegEnable(pstore1_storegen_data, advance_pstore1) - val pstore2_storegen_mask = RegEnable(pstore1_storegen.mask, advance_pstore1) + val pstore2_storegen_mask = RegEnable(pstore1_mask, advance_pstore1) dataArb.io.in(0).valid := pstore_drain dataArb.io.in(0).bits.write := true dataArb.io.in(0).bits.addr := Mux(pstore2_valid, pstore2_addr, pstore1_addr) dataArb.io.in(0).bits.way_en := Mux(pstore2_valid, pstore2_way, pstore1_way) dataArb.io.in(0).bits.wdata := Fill(rowWords, Mux(pstore2_valid, pstore2_storegen_data, pstore1_storegen_data)) val pstore_mask_shift = Mux(pstore2_valid, pstore2_addr, pstore1_addr).extract(rowOffBits-1,offsetlsb) << wordOffBits - dataArb.io.in(0).bits.wmask := Mux(pstore2_valid, pstore2_storegen_mask, pstore1_storegen.mask) << pstore_mask_shift + dataArb.io.in(0).bits.wmask := Mux(pstore2_valid, pstore2_storegen_mask, pstore1_mask) << pstore_mask_shift // store->load RAW hazard detection - val s1_storegen = new StoreGen(s1_req.typ, s1_req.addr, UInt(0), wordBytes) val s1_idx = s1_req.addr(idxMSB, wordOffBits) val s1_raw_hazard = s1_read && - ((pstore1_valid && pstore1_addr(idxMSB, wordOffBits) === s1_idx && (pstore1_storegen.mask & s1_storegen.mask).orR) || - (pstore2_valid && pstore2_addr(idxMSB, wordOffBits) === s1_idx && (pstore2_storegen_mask & s1_storegen.mask).orR)) + ((pstore1_valid && pstore1_addr(idxMSB, wordOffBits) === s1_idx && (pstore1_mask & 
s1_mask).orR) || + (pstore2_valid && pstore2_addr(idxMSB, wordOffBits) === s1_idx && (pstore2_storegen_mask & s1_mask).orR)) when (s1_valid && s1_raw_hazard) { s1_nack := true } metaWriteArb.io.in(0).valid := (s2_valid_hit && s2_update_meta) || (s2_victimize && !s2_victim_dirty) @@ -279,8 +279,8 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val a_source = PriorityEncoder(~uncachedInFlight.asUInt << mmioOffset) // skip the MSHR val acquire_address = s2_req_block_addr val access_address = s2_req.addr - val a_size = s2_req.typ(MT_SZ-2, 0) - val a_data = Fill(beatWords, pstore1_storegen.data) + val a_size = mtSize(s2_req.typ) + val a_data = Fill(beatWords, pstore1_data) val acquire = if (edge.manager.anySupportAcquireB) { edge.Acquire(UInt(0), acquire_address, lgCacheBlockBytes, s2_grow_param)._2 // Cacheability checked by tlb } else { @@ -523,9 +523,8 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { // AMOs if (usingAtomics) { val amoalu = Module(new AMOALU(xLen)) - amoalu.io.addr := pstore1_addr + amoalu.io.mask := pstore1_mask amoalu.io.cmd := pstore1_cmd - amoalu.io.typ := pstore1_typ amoalu.io.lhs := s2_data_word amoalu.io.rhs := pstore1_data pstore1_storegen_data := amoalu.io.out diff --git a/src/main/scala/rocket/HellaCache.scala b/src/main/scala/rocket/HellaCache.scala index f50bfa0b..0624fe5d 100644 --- a/src/main/scala/rocket/HellaCache.scala +++ b/src/main/scala/rocket/HellaCache.scala @@ -118,11 +118,16 @@ class HellaCacheExceptions extends Bundle { val ae = new AlignmentExceptions } +class HellaCacheWriteData(implicit p: Parameters) extends CoreBundle()(p) { + val data = UInt(width = coreDataBits) + val mask = UInt(width = coreDataBytes) +} + // interface between D$ and processor/DTLB class HellaCacheIO(implicit p: Parameters) extends CoreBundle()(p) { val req = Decoupled(new HellaCacheReq) val s1_kill = Bool(OUTPUT) // kill previous cycle's req - val s1_data = Bits(OUTPUT, coreDataBits) // data for previous cycle's req + val s1_data = new HellaCacheWriteData().asOutput // data for previous cycle's req val s2_nack = Bool(INPUT) // req from two cycles ago is rejected // performance events diff --git a/src/main/scala/rocket/IDecode.scala b/src/main/scala/rocket/IDecode.scala index 86c0a03a..ca8ae4ac 100644 --- a/src/main/scala/rocket/IDecode.scala +++ b/src/main/scala/rocket/IDecode.scala @@ -129,7 +129,7 @@ class IDecode(implicit val p: Parameters) extends DecodeConstants class SDecode(implicit val p: Parameters) extends DecodeConstants { val table: Array[(BitPat, List[BitPat])] = Array( - SFENCE_VMA->List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_SFENCE, MT_X, N,N,N,N,N,N,CSR.N,N,N,N,N), + SFENCE_VMA->List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_SFENCE, MT_W, N,N,N,N,N,N,CSR.N,N,N,N,N), SRET-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N,N)) } diff --git a/src/main/scala/rocket/NBDcache.scala b/src/main/scala/rocket/NBDcache.scala index 5ca57d76..b03361b2 100644 --- a/src/main/scala/rocket/NBDcache.scala +++ b/src/main/scala/rocket/NBDcache.scala @@ -69,20 +69,18 @@ class IOMSHR(id: Int)(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCa } val req = Reg(new HellaCacheReq) - val req_cmd_sc = req.cmd === M_XSC val grant_word = Reg(UInt(width = wordBits)) val s_idle :: s_mem_access :: s_mem_ack :: s_resp :: Nil = Enum(Bits(), 4) val state = Reg(init = s_idle) io.req.ready := (state === s_idle) - val storegen = new StoreGen(req.typ, req.addr, req.data, 
wordBytes) - val loadgen = new LoadGen(req.typ, mtSigned(req.typ), req.addr, grant_word, req_cmd_sc, wordBytes) + val loadgen = new LoadGen(req.typ, mtSigned(req.typ), req.addr, grant_word, false.B, wordBytes) val a_source = UInt(id) val a_address = req.addr - val a_size = storegen.size - val a_data = Fill(beatWords, storegen.data) + val a_size = mtSize(req.typ) + val a_data = Fill(beatWords, req.data) val get = edge.Get(a_source, a_address, a_size)._2 val put = edge.Put(a_source, a_address, a_size, a_data)._2 @@ -99,9 +97,10 @@ class IOMSHR(id: Int)(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCa M_XA_MAXU -> edge.Arithmetic(a_source, a_address, a_size, a_data, TLAtomics.MAXU)._2)) } else { // If no managers support atomics, assert fail if processor asks for them - assert (!isAMO(req.cmd)) + assert(state === s_idle || !isAMO(req.cmd)) Wire(new TLBundleA(edge.bundle)) } + assert(state === s_idle || req.cmd =/= M_XSC) io.mem_access.valid := (state === s_mem_access) io.mem_access.bits := Mux(isAMO(req.cmd), atomics, Mux(isRead(req.cmd), get, put)) @@ -110,7 +109,7 @@ class IOMSHR(id: Int)(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCa io.resp.valid := (state === s_resp) io.resp.bits := req io.resp.bits.has_data := isRead(req.cmd) - io.resp.bits.data := loadgen.data | req_cmd_sc + io.resp.bits.data := loadgen.data io.resp.bits.store_data := req.data io.resp.bits.replay := Bool(true) @@ -696,6 +695,8 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule val s1_read = isRead(s1_req.cmd) val s1_write = isWrite(s1_req.cmd) val s1_readwrite = s1_read || s1_write || isPrefetch(s1_req.cmd) + // check for unsupported operations + assert(!s1_valid || !s1_req.cmd.isOneOf(M_PWR)) val dtlb = Module(new TLB(log2Ceil(coreDataBytes), nTLBEntries)) io.ptw <> dtlb.io.ptw @@ -703,7 +704,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule dtlb.io.req.bits.sfence.valid := s1_sfence dtlb.io.req.bits.sfence.bits.rs1 := s1_req.typ(0) dtlb.io.req.bits.sfence.bits.rs2 := s1_req.typ(1) - dtlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data + dtlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data.data dtlb.io.req.bits.passthrough := s1_req.phys dtlb.io.req.bits.vaddr := s1_req.addr dtlb.io.req.bits.instruction := Bool(false) @@ -736,7 +737,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule s2_req.phys := s1_req.phys s2_req.addr := s1_addr when (s1_write) { - s2_req.data := Mux(s1_replay, mshrs.io.replay.bits.data, io.cpu.s1_data) + s2_req.data := Mux(s1_replay, mshrs.io.replay.bits.data, io.cpu.s1_data.data) } when (s1_recycled) { s2_req.data := s1_req.data } s2_req.tag := s1_req.tag @@ -927,10 +928,9 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule val s2_data_word_prebypass = s2_data_uncorrected >> Cat(s2_word_idx, Bits(0,log2Up(coreDataBits))) val s2_data_word = Mux(s2_store_bypass, s2_store_bypass_data, s2_data_word_prebypass) val loadgen = new LoadGen(s2_req.typ, mtSigned(s2_req.typ), s2_req.addr, s2_data_word, s2_sc, wordBytes) - - amoalu.io.addr := s2_req.addr + + amoalu.io.mask := new StoreGen(s2_req.typ, s2_req.addr, 0.U, xLen/8).mask amoalu.io.cmd := s2_req.cmd - amoalu.io.typ := s2_req.typ amoalu.io.lhs := s2_data_word amoalu.io.rhs := s2_req.data diff --git a/src/main/scala/rocket/Rocket.scala b/src/main/scala/rocket/Rocket.scala index 112b4d6c..27999f14 100644 --- a/src/main/scala/rocket/Rocket.scala +++ b/src/main/scala/rocket/Rocket.scala @@ -394,7 
+394,8 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) mem_reg_pc := ex_reg_pc mem_reg_wdata := alu.io.out when (ex_ctrl.rxs2 && (ex_ctrl.mem || ex_ctrl.rocc)) { - mem_reg_rs2 := ex_rs(1) + val typ = Mux(ex_ctrl.rocc, log2Ceil(xLen/8).U, ex_ctrl.mem_type) + mem_reg_rs2 := new uncore.util.StoreGen(typ, 0.U, ex_rs(1), coreDataBytes).data } } @@ -625,7 +626,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) io.dmem.req.bits.phys := Bool(false) io.dmem.req.bits.addr := encodeVirtualAddress(ex_rs(0), alu.io.adder_out) io.dmem.invalidate_lr := wb_xcpt - io.dmem.s1_data := Mux(mem_ctrl.fp, io.fpu.store_data, mem_reg_rs2) + io.dmem.s1_data.data := Mux(mem_ctrl.fp, io.fpu.store_data, mem_reg_rs2) io.dmem.s1_kill := killm_common || mem_breakpoint io.rocc.cmd.valid := wb_reg_valid && wb_ctrl.rocc && !replay_wb_common diff --git a/src/main/scala/rocket/ScratchpadSlavePort.scala b/src/main/scala/rocket/ScratchpadSlavePort.scala index 76b8ea10..de8702ac 100644 --- a/src/main/scala/rocket/ScratchpadSlavePort.scala +++ b/src/main/scala/rocket/ScratchpadSlavePort.scala @@ -24,7 +24,7 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L executable = true, supportsArithmetic = if (usingAtomics) TransferSizes(4, coreDataBytes) else TransferSizes.none, supportsLogical = if (usingAtomics) TransferSizes(4, coreDataBytes) else TransferSizes.none, - supportsPutPartial = TransferSizes.none, // Can't support PutPartial + supportsPutPartial = TransferSizes(1, coreDataBytes), supportsPutFull = TransferSizes(1, coreDataBytes), supportsGet = TransferSizes(1, coreDataBytes), fifoId = Some(0))), // requests handled in FIFO order @@ -55,6 +55,7 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L val req = Wire(new HellaCacheReq) req.cmd := MuxLookup(a.opcode, Wire(M_XRD), Array( TLMessages.PutFullData -> M_XWR, + TLMessages.PutPartialData -> M_PWR, TLMessages.ArithmeticData -> MuxLookup(a.param, Wire(M_XRD), Array( TLAtomics.MIN -> M_XA_MIN, TLAtomics.MAX -> M_XA_MAX, @@ -67,9 +68,8 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L TLAtomics.AND -> M_XA_AND, TLAtomics.SWAP -> M_XA_SWAP)), TLMessages.Get -> M_XRD)) - // treat all loads as full words, so bytes appear in correct lane - req.typ := Mux(edge.hasData(a), a.size, log2Ceil(coreDataBytes)) - req.addr := Mux(edge.hasData(a), a.address, ~(~a.address | (coreDataBytes-1))) + req.typ := a.size + req.addr := a.address req.tag := UInt(0) req.phys := true req @@ -79,14 +79,13 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L io.dmem.req.valid := (tl_in.a.valid && ready) || state === s_replay tl_in.a.ready := io.dmem.req.ready && ready io.dmem.req.bits := formCacheReq(Mux(state === s_replay, acq, tl_in.a.bits)) - // the TL data is already in the correct byte lane, but the D$ - // expects right-justified store data, so that it can steer the bytes. 
- io.dmem.s1_data := new LoadGen(acq.size, Bool(false), acq.address(log2Ceil(coreDataBytes)-1,0), acq.data, Bool(false), coreDataBytes).data + io.dmem.s1_data.data := acq.data + io.dmem.s1_data.mask := acq.mask io.dmem.s1_kill := false io.dmem.invalidate_lr := false tl_in.d.valid := io.dmem.resp.valid || state === s_grant - tl_in.d.bits := Mux(acq.opcode === TLMessages.PutFullData, + tl_in.d.bits := Mux(acq.opcode.isOneOf(TLMessages.PutFullData, TLMessages.PutPartialData), edge.AccessAck(acq, UInt(0)), edge.AccessAck(acq, UInt(0), UInt(0))) tl_in.d.bits.data := Mux(io.dmem.resp.valid, io.dmem.resp.bits.data_raw, acq.data) diff --git a/src/main/scala/rocket/SimpleHellaCacheIF.scala b/src/main/scala/rocket/SimpleHellaCacheIF.scala index db8a34e5..1f821f99 100644 --- a/src/main/scala/rocket/SimpleHellaCacheIF.scala +++ b/src/main/scala/rocket/SimpleHellaCacheIF.scala @@ -123,7 +123,7 @@ class SimpleHellaCacheIF(implicit p: Parameters) extends Module io.cache.invalidate_lr := io.requestor.invalidate_lr io.cache.req <> req_arb.io.out io.cache.s1_kill := io.cache.s2_nack - io.cache.s1_data := RegEnable(req_arb.io.out.bits.data, s0_req_fire) + io.cache.s1_data.data := RegEnable(req_arb.io.out.bits.data, s0_req_fire) replayq.io.nack.valid := (io.cache.s2_nack || s2_kill) && s2_req_fire replayq.io.nack.bits := s2_req_tag diff --git a/src/main/scala/tile/FPU.scala b/src/main/scala/tile/FPU.scala index 4fcc088f..80355e4d 100644 --- a/src/main/scala/tile/FPU.scala +++ b/src/main/scala/tile/FPU.scala @@ -392,7 +392,7 @@ class FPToInt(implicit p: Parameters) extends FPUModule()(p) { val store = ieee(in.in1) val toint = Mux(in.rm(0), classify_out, store) - io.out.bits.store := store + io.out.bits.store := Mux(in.singleOut, Fill(xLen/32, store(31, 0)), store) io.out.bits.toint := Mux(in.singleOut, toint(31, 0).sextTo(xLen), toint) io.out.bits.exc := Bits(0) diff --git a/src/main/scala/uncore/Consts.scala b/src/main/scala/uncore/Consts.scala index 4f0c7297..d1990625 100644 --- a/src/main/scala/uncore/Consts.scala +++ b/src/main/scala/uncore/Consts.scala @@ -28,7 +28,8 @@ trait MemoryOpConstants { def M_XA_MINU = UInt("b01110"); def M_XA_MAXU = UInt("b01111"); def M_FLUSH = UInt("b10000") // write back dirty data and cede R/W permissions - def M_PRODUCE = UInt("b10001") // write back dirty data and cede W permissions + def M_PWR = UInt("b10001") // partial (masked) store + def M_PRODUCE = UInt("b10010") // write back dirty data and cede W permissions def M_CLEAN = UInt("b10011") // write back dirty data and retain R/W permissions def M_SFENCE = UInt("b10100") // flush TLB @@ -37,7 +38,7 @@ trait MemoryOpConstants { def isAMO(cmd: UInt) = isAMOLogical(cmd) || isAMOArithmetic(cmd) def isPrefetch(cmd: UInt) = cmd === M_PFR || cmd === M_PFW def isRead(cmd: UInt) = cmd === M_XRD || cmd === M_XLR || cmd === M_XSC || isAMO(cmd) - def isWrite(cmd: UInt) = cmd === M_XWR || cmd === M_XSC || isAMO(cmd) + def isWrite(cmd: UInt) = cmd === M_XWR || cmd === M_PWR || cmd === M_XSC || isAMO(cmd) def isWriteIntent(cmd: UInt) = isWrite(cmd) || cmd === M_PFW || cmd === M_XLR } diff --git a/src/main/scala/uncore/util/AmoAlu.scala b/src/main/scala/uncore/util/AmoAlu.scala index b36c4446..703d059b 100644 --- a/src/main/scala/uncore/util/AmoAlu.scala +++ b/src/main/scala/uncore/util/AmoAlu.scala @@ -31,10 +31,6 @@ class StoreGen(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) { def wordData = genData(2) } -class StoreGenAligned(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) extends StoreGen(typ, addr, dat, maxSize) { - override 
def genData(i: Int) = dat -} - class LoadGen(typ: UInt, signed: Bool, addr: UInt, dat: UInt, zero: Bool, maxSize: Int) { private val size = new StoreGen(typ, addr, dat, maxSize).size @@ -54,22 +50,16 @@ class LoadGen(typ: UInt, signed: Bool, addr: UInt, dat: UInt, zero: Bool, maxSiz def data = genData(0) } -class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parameters) extends Module { +class AMOALU(operandBits: Int)(implicit p: Parameters) extends Module { require(operandBits == 32 || operandBits == 64) val io = new Bundle { - val addr = Bits(INPUT, log2Ceil(operandBits/8)) + val mask = UInt(INPUT, operandBits/8) val cmd = Bits(INPUT, M_SZ) - val typ = Bits(INPUT, log2Ceil(log2Ceil(operandBits/8) + 1)) val lhs = Bits(INPUT, operandBits) val rhs = Bits(INPUT, operandBits) val out = Bits(OUTPUT, operandBits) } - val storegen = - if(rhsIsAligned) new StoreGenAligned(io.typ, io.addr, io.rhs, operandBits/8) - else new StoreGen(io.typ, io.addr, io.rhs, operandBits/8) - val rhs = storegen.wordData - val max = io.cmd === M_XA_MAX || io.cmd === M_XA_MAXU val min = io.cmd === M_XA_MIN || io.cmd === M_XA_MINU val add = io.cmd === M_XA_ADD @@ -77,10 +67,10 @@ class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parame val logic_xor = io.cmd === M_XA_XOR || io.cmd === M_XA_OR val adder_out = - if (operandBits == 32) io.lhs + rhs + if (operandBits == 32) io.lhs + io.rhs else { - val mask = ~UInt(0,64) ^ (io.addr(2) << 31) - (io.lhs & mask) + (rhs & mask) + val mask = ~UInt(0,64) ^ (!io.mask(3) << 31) + (io.lhs & mask) + (io.rhs & mask) } val less = { @@ -90,28 +80,29 @@ class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parame } if (operandBits == 32) { - Mux(io.lhs(31) === rhs(31), io.lhs < rhs, Mux(sgned, io.lhs(31), io.rhs(31))) + Mux(io.lhs(31) === io.rhs(31), io.lhs < io.rhs, Mux(sgned, io.lhs(31), io.rhs(31))) } else { - val word = !io.typ(0) - val cmp_lhs = Mux(word && !io.addr(2), io.lhs(31), io.lhs(63)) - val cmp_rhs = Mux(word && !io.addr(2), rhs(31), rhs(63)) - val lt_lo = io.lhs(31,0) < rhs(31,0) - val lt_hi = io.lhs(63,32) < rhs(63,32) - val eq_hi = io.lhs(63,32) === rhs(63,32) - val lt = Mux(word, Mux(io.addr(2), lt_hi, lt_lo), lt_hi || eq_hi && lt_lo) + val cmp_lhs = Mux(!io.mask(4), io.lhs(31), io.lhs(63)) + val cmp_rhs = Mux(!io.mask(4), io.rhs(31), io.rhs(63)) + val lt_lo = io.lhs(31,0) < io.rhs(31,0) + val lt_hi = io.lhs(63,32) < io.rhs(63,32) + val eq_hi = io.lhs(63,32) === io.rhs(63,32) + val lt = + Mux(io.mask(4) && io.mask(3), lt_hi || eq_hi && lt_lo, + Mux(io.mask(4), lt_hi, lt_lo)) Mux(cmp_lhs === cmp_rhs, lt, Mux(sgned, cmp_lhs, cmp_rhs)) } } - val minmax = Mux(Mux(less, min, max), io.lhs, storegen.data) + val minmax = Mux(Mux(less, min, max), io.lhs, io.rhs) val logic = - Mux(logic_and, io.lhs & rhs, 0.U) | - Mux(logic_xor, io.lhs ^ rhs, 0.U) + Mux(logic_and, io.lhs & io.rhs, 0.U) | + Mux(logic_xor, io.lhs ^ io.rhs, 0.U) val out = Mux(add, adder_out, Mux(logic_and || logic_xor, logic, minmax)) - val wmask = FillInterleaved(8, storegen.mask) + val wmask = FillInterleaved(8, io.mask) io.out := wmask & out | ~wmask & io.lhs }
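
Notes on the series (plain-Scala sketches of the bit tricks the patches rely on; object names and anything not quoted from the diffs above are illustrative assumptions, not part of the patches).

Patch 2 folds M_XA_OR into the AND and XOR datapaths using the identity (x & y) | (x ^ y) == (x | y): where both bits are set the AND term supplies the 1, where exactly one is set the XOR term does. That is why logic_and is asserted for both M_XA_AND and M_XA_OR, and logic_xor for both M_XA_XOR and M_XA_OR, so a single OR of the two masked terms covers all three operations with less muxing. A minimal, self-contained model (plain Longs standing in for Chisel UInts):

  object AmoLogicSketch {
    // Illustrative stand-ins for the AMO command decode.
    sealed trait Cmd
    case object AmoAnd extends Cmd
    case object AmoOr  extends Cmd
    case object AmoXor extends Cmd

    def logicOp(cmd: Cmd, lhs: Long, rhs: Long): Long = {
      // Shared decode, as in the patch: OR enables both partial terms.
      val logicAnd = cmd == AmoOr || cmd == AmoAnd
      val logicXor = cmd == AmoXor || cmd == AmoOr
      // (lhs & rhs) | (lhs ^ rhs) == (lhs | rhs), so OR falls out for free.
      (if (logicAnd) lhs & rhs else 0L) | (if (logicXor) lhs ^ rhs else 0L)
    }

    def main(args: Array[String]): Unit = {
      val (a, b) = (0xDEADBEEFL, 0x0F0F0F0FL)
      assert(logicOp(AmoAnd, a, b) == (a & b))
      assert(logicOp(AmoOr,  a, b) == (a | b))
      assert(logicOp(AmoXor, a, b) == (a ^ b))
      println("logical-op sharing identity holds")
    }
  }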
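
The same patch decodes signedness of min/max with one AND and one compare instead of four equality checks: M_XA_MIN and M_XA_MINU (likewise MAX/MAXU) differ only in the bit isolated by M_XA_MIN ^ M_XA_MINU, so (cmd & mask) === (M_XA_MIN & mask) is true exactly for the signed pair. The sketch below assumes the upstream Consts.scala encodings (MINU = b01110 and MAXU = b01111 appear in the patch context; MIN = b01100 and MAX = b01101 are the upstream values):

  object SignedDecodeSketch {
    val M_XA_MIN  = 0x0C // b01100
    val M_XA_MAX  = 0x0D // b01101
    val M_XA_MINU = 0x0E // b01110
    val M_XA_MAXU = 0x0F // b01111

    // One AND and one compare in place of a four-way decode.
    def sgned(cmd: Int): Boolean = {
      val mask = M_XA_MIN ^ M_XA_MINU          // isolates the "unsigned" bit
      (cmd & mask) == (M_XA_MIN & mask)
    }

    def main(args: Array[String]): Unit = {
      assert(sgned(M_XA_MIN) && sgned(M_XA_MAX))
      assert(!sgned(M_XA_MINU) && !sgned(M_XA_MAXU))
      println("one-bit signedness decode matches the four-way compare")
    }
  }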
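
Patch 5 introduces M_PWR, a masked (partial) store: for TileLink PutPartialData the byte mask arrives from the requester through the new HellaCacheWriteData bundle, while every other store derives its mask from (typ, addr) via StoreGen, as in the s1_mask mux added to DCache.scala. A sketch of that selection; the closed-form mask here is a simplification of StoreGen that assumes naturally aligned stores (which the D$ requires), and all names are hypothetical:

  object PwrMaskSketch {
    // Simplified StoreGen mask for a naturally aligned store of
    // 2^sizeLog2 bytes within a maxBytes-wide word.
    def storeGenMask(sizeLog2: Int, addr: Int, maxBytes: Int): Int = {
      val bytes = 1 << sizeLog2
      ((1 << bytes) - 1) << (addr & (maxBytes - 1) & ~(bytes - 1))
    }

    def s1Mask(cmdIsPwr: Boolean, reqMask: Int,
               sizeLog2: Int, addr: Int, maxBytes: Int): Int =
      if (cmdIsPwr) reqMask                       // caller-supplied mask (M_PWR)
      else storeGenMask(sizeLog2, addr, maxBytes) // derived, naturally aligned

    def main(args: Array[String]): Unit = {
      assert(s1Mask(false, 0, 2, 4, 8) == 0xf0)   // word store at offset 4
      assert(s1Mask(true, 0x36, 0, 0, 8) == 0x36) // PutPartial, sparse mask
      println("M_PWR carries the mask through; other stores derive it")
    }
  }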
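
Patch 5 can also drop AMOALU's addr and typ inputs because, once patch 1 restricts atomics to word size and up, the byte mask carries the same information on the 64-bit datapath: 0x0f is a low-half word op (mask(4)=0, mask(3)=1), 0xf0 a high-half word op (mask(4)=1, mask(3)=0), and 0xff a doubleword op (both set). mask(4) therefore selects which sign bit to compare, and !mask(3) flags the one case where the adder must suppress the carry out of bit 31 (a high-half op whose low bits are garbage). A sketch of that decode, assuming only these three mask shapes reach the ALU:

  object AmoMaskDecodeSketch {
    def bit(x: Int, i: Int): Boolean = ((x >> i) & 1) != 0

    // Which bit carries the sign of the active operand half.
    def signBitIndex(mask: Int): Int = if (!bit(mask, 4)) 31 else 63

    // Whether the adder must suppress the carry out of bit 31
    // (only for a high-half word op, where the low 32 bits are garbage).
    def suppressCarry31(mask: Int): Boolean = !bit(mask, 3)

    def main(args: Array[String]): Unit = {
      assert(signBitIndex(0x0f) == 31 && !suppressCarry31(0x0f))
      assert(signBitIndex(0xf0) == 63 && suppressCarry31(0xf0))
      assert(signBitIndex(0xff) == 63 && !suppressCarry31(0xff))
      println("mask-driven decode matches the old addr/typ-driven one")
    }
  }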