Merge branch 'master' into async_queue_option

2017-04-25 11:23:22 -07:00
parent 0aa8f7d61d f3ab23d068
commit 9bb0d92381
13 changed files with 234 additions and 107 deletions
--- a/src/main/scala/rocket/BTB.scala
+++ b/src/main/scala/rocket/BTB.scala
@@ -86,6 +86,15 @@ class BHT(nbht: Int)(implicit val p: Parameters) extends HasCoreParameters {
  val history = Reg(UInt(width = nbhtbits))
 }
 // Encoding of the control-flow-instruction type carried by the cfiType
 // fields of the BTB bundles. Four codes are defined, fitting in SZ bits.
 object CFIType {
  // width, in bits, of the encoding
  def SZ = 2
  // hardware type used to declare a cfiType field or register
  def apply() = UInt(width = SZ)
  // the four codes (0 through 3)
  def branch = 0.U
  def jump = 1.U
  def call = 2.U
  def ret = 3.U
 }
 // BTB update occurs during branch resolution (and only on a mispredict).
 //  - "pc" is what future fetch PCs will tag match against.
 //  - "br_pc" is the PC of the branch instruction.
@@ -95,9 +104,8 @@ class BTBUpdate(implicit p: Parameters) extends BtbBundle()(p) {
  val target = UInt(width = vaddrBits)
  val taken = Bool()
  val isValid = Bool()
  val isJump = Bool()
  val isReturn = Bool()
  val br_pc = UInt(width = vaddrBits)
  val cfiType = CFIType()
 }
 // BHT update occurs during branch resolution on all conditional branches.
@@ -110,8 +118,7 @@ class BHTUpdate(implicit p: Parameters) extends BtbBundle()(p) {
 }
 class RASUpdate(implicit p: Parameters) extends BtbBundle()(p) {
-  val isCall = Bool()
+  val cfiType = CFIType()
  val isReturn = Bool()
  val returnAddr = UInt(width = vaddrBits)
  val prediction = Valid(new BTBResp)
 }
@@ -121,6 +128,7 @@ class RASUpdate(implicit p: Parameters) extends BtbBundle()(p) {
 //  - "mask" provides a mask of valid instructions (instructions are
 //     masked off by the predicted taken branch from the BTB).
 class BTBResp(implicit p: Parameters) extends BtbBundle()(p) {
  val cfiType = CFIType()
  val taken = Bool()
  val mask = Bits(width = fetchWidth)
  val bridx = Bits(width = log2Up(fetchWidth))
@@ -154,8 +162,7 @@ class BTB(implicit p: Parameters) extends BtbModule {
  val pageValid = Reg(init = UInt(0, nPages))
  val isValid = Reg(init = UInt(0, entries))
-  val isReturn = Reg(UInt(width = entries))
+  val cfiType = Reg(Vec(entries, CFIType()))
  val isJump = Reg(UInt(width = entries))
  val brIdx = Reg(Vec(entries, UInt(width=log2Up(fetchWidth))))
  private def page(addr: UInt) = addr >> matchBits
@@ -210,9 +217,8 @@ class BTB(implicit p: Parameters) extends BtbModule {
    tgts(waddr) := update_target(matchBits-1, log2Up(coreInstBytes))
    idxPages(waddr) := idxPageUpdate +& 1 // the +1 corresponds to the <<1 on io.resp.valid
    tgtPages(waddr) := tgtPageUpdate
    cfiType(waddr) := r_btb_update.bits.cfiType
    isValid := Mux(r_btb_update.bits.isValid, isValid | mask, isValid & ~mask)
    isReturn := Mux(r_btb_update.bits.isReturn, isReturn | mask, isReturn & ~mask)
    isJump := Mux(r_btb_update.bits.isJump, isJump | mask, isJump & ~mask)
    if (fetchWidth > 1)
      brIdx(waddr) := r_btb_update.bits.br_pc >> log2Up(coreInstBytes)
@@ -236,6 +242,7 @@ class BTB(implicit p: Parameters) extends BtbModule {
  io.resp.bits.entry := OHToUInt(idxHit)
  io.resp.bits.bridx := (if (fetchWidth > 1) Mux1H(idxHit, brIdx) else UInt(0))
  io.resp.bits.mask := Cat((UInt(1) << ~Mux(io.resp.bits.taken, ~io.resp.bits.bridx, UInt(0)))-1, UInt(1))
  io.resp.bits.cfiType := Mux1H(idxHit, cfiType)
  // if multiple entries for same PC land in BTB, zap them
  when (PopCountAtLeast(idxHit, 2)) {
@@ -244,7 +251,7 @@ class BTB(implicit p: Parameters) extends BtbModule {
  if (nBHT > 0) {
    val bht = new BHT(nBHT)
-    val isBranch = !(idxHit & isJump).orR
+    val isBranch = (idxHit & cfiType.map(_ === CFIType.branch).asUInt).orR
    val res = bht.get(io.req.bits.addr, io.req.valid && io.resp.valid && isBranch)
    val update_btb_hit = io.bht_update.bits.prediction.valid
    when (io.bht_update.valid && update_btb_hit) {
@@ -256,17 +263,14 @@ class BTB(implicit p: Parameters) extends BtbModule {
  if (nRAS > 0) {
    val ras = new RAS(nRAS)
-    val doPeek = (idxHit & isReturn).orR
+    val doPeek = (idxHit & cfiType.map(_ === CFIType.ret).asUInt).orR
    when (!ras.isEmpty && doPeek) {
      io.resp.bits.target := ras.peek
    }
    when (io.ras_update.valid) {
-      when (io.ras_update.bits.isCall) {
+      when (io.ras_update.bits.cfiType === CFIType.call) {
        ras.push(io.ras_update.bits.returnAddr)
-        when (doPeek) {
+      }.elsewhen (io.ras_update.bits.cfiType === CFIType.ret && io.ras_update.bits.prediction.valid) {
          io.resp.bits.target := io.ras_update.bits.returnAddr
        }
      }.elsewhen (io.ras_update.bits.isReturn && io.ras_update.bits.prediction.valid) {
        ras.pop()
      }
    }
--- a/src/main/scala/rocket/CSR.scala
+++ b/src/main/scala/rocket/CSR.scala
@@ -300,10 +300,10 @@ class CSRFile(perfEventSets: EventSets = new EventSets(Seq()))(implicit p: Param
  val pending_interrupts = read_mip & reg_mie
  val m_interrupts = Mux(reg_mstatus.prv <= PRV.S || (reg_mstatus.prv === PRV.M && reg_mstatus.mie), pending_interrupts & ~reg_mideleg, UInt(0))
  val s_interrupts = Mux(m_interrupts === 0 && (reg_mstatus.prv < PRV.S || (reg_mstatus.prv === PRV.S && reg_mstatus.sie)), pending_interrupts & reg_mideleg, UInt(0))
-  val all_interrupts = m_interrupts | s_interrupts
+  val (anyInterrupt, whichInterrupt) = chooseInterrupt(Seq(s_interrupts, m_interrupts))
  val interruptMSB = BigInt(1) << (xLen-1)
-  val interruptCause = UInt(interruptMSB) + PriorityEncoder(all_interrupts)
+  val interruptCause = UInt(interruptMSB) + whichInterrupt
-  io.interrupt := all_interrupts.orR && !reg_debug && !io.singleStep || reg_singleStepped
+  io.interrupt := anyInterrupt && !reg_debug && !io.singleStep || reg_singleStepped
  io.interrupt_cause := interruptCause
  io.bp := reg_bp take nBreakpoints
  io.pmp := reg_pmp.map(PMP(_))
@@ -758,6 +758,14 @@ class CSRFile(perfEventSets: EventSets = new EventSets(Seq()))(implicit p: Param
    }
  }
  def chooseInterrupt(masks: Seq[UInt]) = {
    // Taking the highest set bit of a mask directly would give the wrong
    // priority, because the timer interrupt bits sit in the wrong place in
    // mip.  Each mask is therefore zero-extended to xLen bits and widened to
    // 2*xLen bits: the bits selected by timerMask go in the low half and all
    // remaining bits in the high half, so the latter win the search below.
    val timerMask = UInt(0xF0, xLen)
    val otherMask = ~timerMask
    val reordered = masks.map { m =>
      val wide = m.padTo(xLen)
      Cat(wide & otherMask, wide & timerMask)
    }
    // true when any bit of any mask is set
    val anyPending = masks.map(_.orR).reduce(_||_)
    // Keeping only the low log2Ceil(xLen) bits of the highest-set-bit index
    // discards which mask/half it came from and leaves the bit position
    // within one xLen-wide mask (assumes xLen is a power of two).
    val highest = Log2(reordered.asUInt)(log2Ceil(xLen)-1, 0)
    (anyPending, highest)
  }
  // Computes the new CSR value for a read-modify-write command.  For CSR.S
  // and CSR.C the old contents (rdata) form the base, otherwise the base is
  // zero; wdata is OR-ed in, and for CSR.C the bits set in wdata are cleared.
  def readModifyWriteCSR(cmd: UInt, rdata: UInt, wdata: UInt) = {
    val base = Mux(cmd.isOneOf(CSR.S, CSR.C), rdata, UInt(0))
    val toClear = Mux(cmd === CSR.C, wdata, UInt(0))
    (base | wdata) & ~toClear
  }
--- a/src/main/scala/rocket/DCache.scala
+++ b/src/main/scala/rocket/DCache.scala
@@ -11,6 +11,7 @@ import uncore.tilelink2._
 import uncore.util._
 import util._
 import TLMessages._
 import scala.math.min
 class DCacheDataReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) {
  val addr = Bits(width = untagBits)
@@ -57,6 +58,19 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
  data.io.req <> dataArb.io.out
  dataArb.io.out.ready := true
  val rational = p(coreplex.RocketCrossing) match {
    case coreplex.RationalCrossing(_) => true
    case _ => false
  }
  val tl_out_a = Wire(tl_out.a)
  val q_depth = if (rational) min(2, maxUncachedInFlight-1) else 0
  if (q_depth <= 0) {
    tl_out.a <> tl_out_a
  } else {
    tl_out.a <> Queue(tl_out_a, q_depth, flow = true, pipe = true)
  }
  val s1_valid = Reg(next=io.cpu.req.fire(), init=Bool(false))
  val s1_probe = Reg(next=tl_out.b.fire(), init=Bool(false))
  val probe_bits = RegEnable(tl_out.b.bits, tl_out.b.fire()) // TODO has data now :(
@@ -176,7 +190,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
  val (s2_prb_ack_data, s2_report_param, probeNewCoh)= s2_probe_state.onProbe(probe_bits.param)
  val (s2_victim_dirty, s2_shrink_param, voluntaryNewCoh) = s2_victim_state.onCacheControl(M_FLUSH)
  val s2_update_meta = s2_hit_state =/= s2_new_hit_state
-  io.cpu.s2_nack := s2_valid && !s2_valid_hit && !(s2_valid_uncached && tl_out.a.ready && !uncachedInFlight.asUInt.andR)
+  io.cpu.s2_nack := s2_valid && !s2_valid_hit && !(s2_valid_uncached && tl_out_a.ready && !uncachedInFlight.asUInt.andR)
  when (io.cpu.s2_nack || (s2_valid_hit && s2_update_meta)) { s1_nack := true }
  val s3_valid = Reg(next = s2_valid, init=Bool(false))
@@ -285,17 +299,17 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
      M_XA_MAXU -> edge.Arithmetic(a_source, access_address, a_size, a_data, TLAtomics.MAXU)._2))
  } else {
    // If no managers support atomics, assert fail if processor asks for them
-    assert (!(tl_out.a.valid && pstore1_amo && s2_write && s2_uncached))
+    assert (!(tl_out_a.valid && pstore1_amo && s2_write && s2_uncached))
    Wire(new TLBundleA(edge.bundle))
  }
-  tl_out.a.valid := (s2_valid_cached_miss && !s2_victim_dirty) ||
+  tl_out_a.valid := (s2_valid_cached_miss && !s2_victim_dirty) ||
                    (s2_valid_uncached && !uncachedInFlight.asUInt.andR)
-  tl_out.a.bits := Mux(!s2_uncached, acquire, Mux(!s2_write, get, Mux(!pstore1_amo, put, atomics)))
+  tl_out_a.bits := Mux(!s2_uncached, acquire, Mux(!s2_write, get, Mux(!pstore1_amo, put, atomics)))
  // Set pending bits for outstanding TileLink transaction
  val a_sel = UIntToOH(a_source, maxUncachedInFlight+mmioOffset) >> mmioOffset
-  when (tl_out.a.fire()) {
+  when (tl_out_a.fire()) {
    when (s2_uncached) {
      (a_sel.toBools zip (uncachedInFlight zip uncachedReqs)) foreach { case (s, (f, r)) =>
        when (s) {
@@ -518,7 +532,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
  val flushed = Reg(init=Bool(true))
  val flushing = Reg(init=Bool(false))
  val flushCounter = Counter(nSets * nWays)
-  when (tl_out.a.fire() && !s2_uncached) { flushed := false }
+  when (tl_out_a.fire() && !s2_uncached) { flushed := false }
  when (s2_valid_masked && s2_req.cmd === M_FLUSH_ALL) {
    io.cpu.s2_nack := !flushed
    when (!flushed) {
@@ -542,6 +556,6 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
  }
  // performance events
-  io.cpu.acquire := edge.done(tl_out.a)
+  io.cpu.acquire := edge.done(tl_out_a)
  io.cpu.release := edge.done(tl_out.c)
 }
--- a/src/main/scala/rocket/Frontend.scala
+++ b/src/main/scala/rocket/Frontend.scala
@@ -5,6 +5,7 @@ package rocket
 import Chisel._
 import Chisel.ImplicitConversions._
 import chisel3.core.withReset
 import config._
 import coreplex._
 import diplomacy._
@@ -64,11 +65,12 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
  val icache = outer.icache.module
  val tlb = Module(new TLB(log2Ceil(coreInstBytes*fetchWidth), nTLBEntries))
  val fq = withReset(reset || io.cpu.req.valid) { Module(new ShiftQueue(new FrontendResp, 3, flow = true)) }
  val s0_valid = io.cpu.req.valid || fq.io.enq.ready
  val s1_pc_ = Reg(UInt(width=vaddrBitsExtended))
  val s1_pc = ~(~s1_pc_ | (coreInstBytes-1)) // discard PC LSBS (this propagates down the pipeline)
  val s1_speculative = Reg(Bool())
  val s1_same_block = Reg(Bool())
  val s2_valid = Reg(init=Bool(true))
  val s2_pc = Reg(init=io.resetVector)
  val s2_btb_resp_valid = Reg(init=Bool(false))
@@ -82,26 +84,27 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
  val s2_speculative = Reg(init=Bool(false))
  val s2_cacheable = Reg(init=Bool(false))
-  val ntpc = ~(~s1_pc | (coreInstBytes*fetchWidth-1)) + UInt(coreInstBytes*fetchWidth)
+  val fetchBytes = coreInstBytes * fetchWidth
-  val ntpc_same_block = (ntpc & rowBytes) === (s1_pc & rowBytes)
+  val s1_base_pc = ~(~s1_pc | (fetchBytes - 1))
  val ntpc = s1_base_pc + fetchBytes.U
  val predicted_npc = Wire(init = ntpc)
  val predicted_taken = Wire(init = Bool(false))
  val icmiss = s2_valid && !icache.io.resp.valid
  val npc = Mux(icmiss, s2_pc, predicted_npc)
  val s0_same_block = !predicted_taken && !icmiss && !io.cpu.req.valid && ntpc_same_block
-  val stall = io.cpu.resp.valid && !io.cpu.resp.ready
+  val s2_replay = Wire(Bool())
-  when (!stall) {
+  s2_replay := (s2_valid && !fq.io.enq.fire()) || RegNext(s2_replay && !s0_valid)
-    s1_same_block := s0_same_block && !tlb.io.resp.miss
+  val npc = Mux(s2_replay, s2_pc, predicted_npc)
  s1_pc_ := io.cpu.npc
  // consider RVC fetches across blocks to be non-speculative if the first
  // part was non-speculative
  val s0_speculative =
    if (usingCompressed) s1_speculative || s2_valid && !s2_speculative || predicted_taken
    else Bool(true)
-    s1_speculative := Mux(icmiss, s2_speculative, s0_speculative)
+  s1_speculative := Mux(io.cpu.req.valid, io.cpu.req.bits.speculative, Mux(s2_replay, s2_speculative, s0_speculative))
-    s2_valid := !icmiss
+
-    when (!icmiss) {
+  s2_valid := false
  when (!s2_replay && !io.cpu.req.valid) {
    s2_valid := true
    s2_pc := s1_pc
    s2_speculative := s1_speculative
    s2_cacheable := tlb.io.resp.cacheable
@@ -109,13 +112,6 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
    s2_maybe_ae := tlb.io.resp.ae.inst
    s2_tlb_miss := tlb.io.resp.miss
  }
  }
  when (io.cpu.req.valid) {
    s1_same_block := Bool(false)
    s1_pc_ := io.cpu.npc
    s1_speculative := io.cpu.req.bits.speculative
    s2_valid := Bool(false)
  }
  if (usingBTB) {
    val btb = Module(new BTB)
@@ -124,7 +120,7 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
    btb.io.btb_update := io.cpu.btb_update
    btb.io.bht_update := io.cpu.bht_update
    btb.io.ras_update := io.cpu.ras_update
-    when (!stall && !icmiss) {
+    when (!s2_replay) {
      btb.io.req.valid := true
      s2_btb_resp_valid := btb.io.resp.valid
      s2_btb_resp_bits := btb.io.resp.bits
@@ -133,10 +129,18 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
      predicted_npc := btb.io.resp.bits.target.sextTo(vaddrBitsExtended)
      predicted_taken := Bool(true)
    }
    // push RAS speculatively
    btb.io.ras_update.valid := btb.io.req.valid && btb.io.resp.valid && btb.io.resp.bits.cfiType.isOneOf(CFIType.call, CFIType.ret)
    val returnAddrLSBs = btb.io.resp.bits.bridx +& 1
    btb.io.ras_update.bits.returnAddr :=
      Mux(returnAddrLSBs(log2Ceil(fetchWidth)), ntpc, s1_base_pc | ((returnAddrLSBs << log2Ceil(coreInstBytes)) & (fetchBytes - 1)))
    btb.io.ras_update.bits.cfiType := btb.io.resp.bits.cfiType
    btb.io.ras_update.bits.prediction.valid := true
  }
  io.ptw <> tlb.io.ptw
-  tlb.io.req.valid := !stall && !icmiss
+  tlb.io.req.valid := !s2_replay
  tlb.io.req.bits.vaddr := s1_pc
  tlb.io.req.bits.passthrough := Bool(false)
  tlb.io.req.bits.instruction := Bool(true)
@@ -144,27 +148,26 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
  tlb.io.req.bits.sfence := io.cpu.sfence
  tlb.io.req.bits.size := log2Ceil(coreInstBytes*fetchWidth)
-  icache.io.req.valid := !stall && !s0_same_block
+  icache.io.req.valid := s0_valid
  icache.io.req.bits.addr := io.cpu.npc
  icache.io.invalidate := io.cpu.flush_icache
  icache.io.s1_paddr := tlb.io.resp.paddr
-  icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || icmiss || s1_speculative && !tlb.io.resp.cacheable || tlb.io.resp.pf.inst || tlb.io.resp.ae.inst
+  icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || s2_replay
-  icache.io.s2_kill := false
+  icache.io.s2_kill := s2_speculative && !s2_cacheable || s2_xcpt
  icache.io.resp.ready := !stall && !s1_same_block
-  val s2_kill = s2_speculative && !s2_cacheable || s2_xcpt
+  fq.io.enq.valid := s2_valid && (icache.io.resp.valid || icache.io.s2_kill)
-  io.cpu.resp.valid := s2_valid && (icache.io.resp.valid || s2_kill)
+  fq.io.enq.bits.pc := s2_pc
  io.cpu.resp.bits.pc := s2_pc
  io.cpu.npc := Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc)
-  require(fetchWidth * coreInstBytes <= rowBytes && isPow2(fetchWidth))
+  fq.io.enq.bits.data := icache.io.resp.bits
-  io.cpu.resp.bits.data := icache.io.resp.bits.datablock >> (s2_pc.extract(log2Ceil(rowBytes)-1,log2Ceil(fetchWidth*coreInstBytes)) << log2Ceil(fetchWidth*coreInstBits))
+  fq.io.enq.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes))
-  io.cpu.resp.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Ceil(fetchWidth)+log2Ceil(coreInstBytes)-1, log2Ceil(coreInstBytes))
+  fq.io.enq.bits.pf := s2_pf
-  io.cpu.resp.bits.pf := s2_pf
+  fq.io.enq.bits.ae := s2_ae
-  io.cpu.resp.bits.ae := s2_ae
+  fq.io.enq.bits.replay := icache.io.s2_kill && !icache.io.resp.valid && !s2_xcpt
-  io.cpu.resp.bits.replay := s2_kill && !icache.io.resp.valid && !s2_xcpt
+  fq.io.enq.bits.btb.valid := s2_btb_resp_valid
-  io.cpu.resp.bits.btb.valid := s2_btb_resp_valid
+  fq.io.enq.bits.btb.bits := s2_btb_resp_bits
-  io.cpu.resp.bits.btb.bits := s2_btb_resp_bits
+
  io.cpu.resp <> fq.io.deq
  // performance events
  io.cpu.acquire := edge.done(icache.io.mem(0).a)
--- a/src/main/scala/rocket/IBuf.scala
+++ b/src/main/scala/rocket/IBuf.scala
@@ -84,7 +84,7 @@ class IBuf(implicit p: Parameters) extends CoreModule {
  val ae = valid & (Mux(buf.ae, bufMask, UInt(0)) | Mux(io.imem.bits.ae, ~bufMask, UInt(0)))
  val ic_replay = valid & (Mux(buf.replay, bufMask, UInt(0)) | Mux(io.imem.bits.replay, ~bufMask, UInt(0)))
  val ibufBTBHitMask = Mux(ibufBTBHit, UIntToOH(ibufBTBResp.bridx), UInt(0))
-  assert(!io.imem.bits.btb.valid || io.imem.bits.btb.bits.bridx >= pcWordBits)
+  assert(!io.imem.valid || !io.imem.bits.btb.valid || io.imem.bits.btb.bits.bridx >= pcWordBits)
  val icBTBHitMask = Mux(io.imem.bits.btb.valid, UIntToOH(io.imem.bits.btb.bits.bridx +& nBufValid - pcWordBits), UInt(0))
  val btbHitMask = ibufBTBHitMask & bufMask | icBTBHitMask & ~bufMask
--- a/src/main/scala/rocket/ICache.scala
+++ b/src/main/scala/rocket/ICache.scala
@@ -47,7 +47,7 @@ class ICacheBundle(outer: ICache) extends CoreBundle()(outer.p) {
  val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req
  val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission
-  val resp = Decoupled(new ICacheResp)
+  val resp = Valid(UInt(width = coreInstBits * fetchWidth))
  val invalidate = Bool(INPUT)
  val mem = outer.node.bundleOut
 }
@@ -65,7 +65,6 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
  val s_ready :: s_request :: s_refill :: Nil = Enum(UInt(), 3)
  val state = Reg(init=s_ready)
  val invalidated = Reg(Bool())
  val stall = !io.resp.ready
  val refill_addr = Reg(UInt(width = paddrBits))
  val s1_tag_hit = Wire(Vec(nWays, Bool()))
@@ -78,10 +77,10 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
  val s1_hit = out_valid && s1_any_tag_hit
  val s1_miss = s1_valid && state === s_ready && !s1_any_tag_hit
-  val s0_valid = io.req.valid && state === s_ready && !(s1_valid && stall)
+  val s0_valid = io.req.valid && state === s_ready
  val s0_vaddr = io.req.bits.addr
-  s1_valid := s0_valid || out_valid && stall
+  s1_valid := s0_valid
  when (s1_miss) { refill_addr := io.s1_paddr }
  val refill_tag = refill_addr(tagBits+untagBits-1,untagBits)
@@ -110,16 +109,34 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
  }
  val s1_tag_disparity = Wire(Vec(nWays, Bool()))
-  val s1_dout = Wire(Vec(nWays, UInt(width = code.width(rowBits))))
+  val wordBits = coreInstBits * fetchWidth
  val s1_dout = Wire(Vec(nWays, UInt(width = code.width(wordBits))))
  val s1_dout_valid = RegNext(s0_valid)
  for (i <- 0 until nWays) {
    val s1_vb = vb_array(Cat(UInt(i), io.s1_paddr(untagBits-1,blockOffBits))).toBool
-    s1_tag_disparity(i) := (code.decode(tag_rdata(i)).error holdUnless s1_dout_valid)
+    s1_tag_disparity(i) := code.decode(tag_rdata(i)).error
-    s1_tag_hit(i) := s1_vb && ((code.decode(tag_rdata(i)).uncorrected === s1_tag) holdUnless s1_dout_valid)
+    s1_tag_hit(i) := s1_vb && code.decode(tag_rdata(i)).uncorrected === s1_tag
  }
-  val data_arrays = Seq.fill(nWays) { SeqMem(nSets * refillCycles, Bits(width = code.width(rowBits))) }
+  require(rowBits % wordBits == 0)
  val data_arrays = Seq.fill(rowBits / wordBits) { SeqMem(nSets * refillCycles, Vec(nWays, UInt(width = code.width(wordBits)))) }
  for ((data_array, i) <- data_arrays zipWithIndex) {
    val wen = tl_out.d.valid
    when (wen) {
      val idx = (refill_idx << log2Ceil(refillCycles)) | refill_cnt
      val data = tl_out.d.bits.data(wordBits*(i+1)-1, wordBits*i)
      data_array.write(idx, Vec.fill(nWays)(code.encode(data)), (0 until nWays).map(repl_way === _))
    }
    def wordMatch(addr: UInt) = addr.extract(log2Ceil(rowBytes)-1, log2Ceil(wordBits/8)) === i
    val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles))
    val dout = data_array.read(s0_raddr, !wen && (s0_valid && wordMatch(s0_vaddr)))
    when (wordMatch(io.s1_paddr)) {
      s1_dout := dout
    }
  }
 /*
  for ((data_array, i) <- data_arrays zipWithIndex) {
    val wen = tl_out.d.valid && repl_way === UInt(i)
    when (wen) {
@@ -127,28 +144,29 @@ class ICacheModule(outer: ICache) extends LazyModuleImp(outer)
      data_array.write((refill_idx << log2Ceil(refillCycles)) | refill_cnt, e_d)
    }
    val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles))
-    s1_dout(i) := data_array.read(s0_raddr, !wen && s0_valid) holdUnless s1_dout_valid
+    s1_dout(i) := data_array.read(s0_raddr, !wen && s0_valid)
  }
 */
  // output signals
  outer.latency match {
    case 1 =>
      require(code.width(rowBits) == rowBits) // no ECC
-      io.resp.bits.datablock := Mux1H(s1_tag_hit, s1_dout)
+      io.resp.bits := Mux1H(s1_tag_hit, s1_dout)
      io.resp.valid := s1_hit
    case 2 =>
-      val s2_valid = RegEnable(out_valid, Bool(false), !stall)
+      val s2_valid = RegNext(out_valid, Bool(false))
-      val s2_hit = RegEnable(s1_hit, Bool(false), !stall)
+      val s2_hit = RegNext(s1_hit, Bool(false))
-      val s2_tag_hit = RegEnable(s1_tag_hit, !stall)
+      val s2_tag_hit = RegEnable(s1_tag_hit, s1_valid)
-      val s2_dout = RegEnable(s1_dout, !stall)
+      val s2_dout = RegEnable(s1_dout, s1_valid)
      val s2_way_mux = Mux1H(s2_tag_hit, s2_dout)
-      val s2_tag_disparity = RegEnable(s1_tag_disparity, !stall).asUInt.orR
+      val s2_tag_disparity = RegEnable(s1_tag_disparity, s1_valid).asUInt.orR
      val s2_data_disparity = code.decode(s2_way_mux).error
      val s2_disparity = s2_tag_disparity || s2_data_disparity
      when (s2_valid && s2_disparity) { invalidate := true }
-      io.resp.bits.datablock := code.decode(s2_way_mux).uncorrected
+      io.resp.bits := code.decode(s2_way_mux).uncorrected
      io.resp.valid := s2_hit && !s2_disparity
  }
  tl_out.a.valid := state === s_request && !io.s2_kill
--- a/src/main/scala/rocket/Rocket.scala
+++ b/src/main/scala/rocket/Rocket.scala
@@ -173,7 +173,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
  val ibuf = Module(new IBuf)
  val id_expanded_inst = ibuf.io.inst.map(_.bits.inst)
  val id_inst = id_expanded_inst.map(_.bits)
-  ibuf.io.imem <> (if (usingCompressed) withReset(reset || take_pc) { Queue(io.imem.resp, 1, flow = true) } else io.imem.resp)
+  ibuf.io.imem <> io.imem.resp
  ibuf.io.kill := take_pc
  require(decodeWidth == 1 /* TODO */ && retireWidth == decodeWidth)
@@ -520,7 +520,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
  val sboard = new Scoreboard(32, true)
  sboard.clear(ll_wen, ll_waddr)
-  val id_sboard_hazard = checkHazards(hazard_targets, sboard.read _)
+  val id_sboard_hazard = checkHazards(hazard_targets, rd => sboard.read(rd) && !(ll_wen && ll_waddr === rd))
  sboard.set(wb_set_sboard && wb_wen, wb_waddr)
  // stall for RAW/WAW hazards on CSRs, loads, AMOs, and mul/div in execute stage.
@@ -587,8 +587,11 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
  io.imem.btb_update.valid := (mem_reg_replay && mem_reg_btb_hit) || (mem_reg_valid && !take_pc_wb && (((mem_cfi_taken || !mem_cfi) && mem_wrong_npc) || (Bool(fastJAL) && mem_ctrl.jal && !mem_reg_btb_hit)))
  io.imem.btb_update.bits.isValid := !mem_reg_replay && mem_cfi
-  io.imem.btb_update.bits.isJump := mem_ctrl.jal || mem_ctrl.jalr
+  io.imem.btb_update.bits.cfiType :=
-  io.imem.btb_update.bits.isReturn := mem_ctrl.jalr && mem_reg_inst(19,15) === BitPat("b00?01")
+    Mux((mem_ctrl.jal || mem_ctrl.jalr) && mem_waddr(0), CFIType.call,
    Mux(mem_ctrl.jalr && mem_reg_inst(19,15) === BitPat("b00?01"), CFIType.ret,
    Mux(mem_ctrl.jal || mem_ctrl.jalr, CFIType.jump,
    CFIType.branch)))
  io.imem.btb_update.bits.target := io.imem.req.bits.pc
  io.imem.btb_update.bits.br_pc := (if (usingCompressed) mem_reg_pc + Mux(mem_reg_rvc, UInt(0), UInt(2)) else mem_reg_pc)
  io.imem.btb_update.bits.pc := ~(~io.imem.btb_update.bits.br_pc | (coreInstBytes*fetchWidth-1))
@@ -601,12 +604,6 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
  io.imem.bht_update.bits.mispredict := mem_wrong_npc
  io.imem.bht_update.bits.prediction := io.imem.btb_update.bits.prediction
  io.imem.ras_update.valid := mem_reg_valid && !take_pc_wb
  io.imem.ras_update.bits.returnAddr := mem_int_wdata
  io.imem.ras_update.bits.isCall := io.imem.btb_update.bits.isJump && mem_waddr(0)
  io.imem.ras_update.bits.isReturn := io.imem.btb_update.bits.isReturn
  io.imem.ras_update.bits.prediction := io.imem.btb_update.bits.prediction
  io.fpu.valid := !ctrl_killd && id_ctrl.fp
  io.fpu.killx := ctrl_killx
  io.fpu.killm := killm_common
--- a/src/main/scala/rocketchip/RocketTestSuite.scala
+++ b/src/main/scala/rocketchip/RocketTestSuite.scala
@@ -4,7 +4,7 @@
 package rocketchip
 import Chisel._
-import scala.collection.mutable.{LinkedHashSet, ArrayBuffer}
+import scala.collection.mutable.LinkedHashSet
 abstract class RocketTestSuite {
  val dir: String
@@ -56,9 +56,9 @@ class RegressionTestSuite(val names: LinkedHashSet[String]) extends RocketTestSu
 }
 object TestGeneration {
-  private val suites = ArrayBuffer[RocketTestSuite]()
+  private val suites = collection.mutable.ListMap[String, RocketTestSuite]()
-  def addSuite(s: RocketTestSuite) { suites += s }
+  def addSuite(s: RocketTestSuite) { suites += (s.makeTargetName -> s) }
  def addSuites(s: Seq[RocketTestSuite]) { s.foreach(addSuite) }
@@ -93,7 +93,7 @@ run-$kind-tests-fast: $$(addprefix $$(output_dir)/, $$(addsuffix .run, $targets)
      } else { "\n" }
    }
-    suites.groupBy(_.kind).map { case (kind, s) => gen(kind, s) }.mkString("\n")
+    suites.values.toSeq.groupBy(_.kind).map { case (kind, s) => gen(kind, s) }.mkString("\n")
  }
 }
--- a/src/main/scala/tile/FPU.scala
+++ b/src/main/scala/tile/FPU.scala
@@ -450,6 +450,7 @@ class IntToFP(val latency: Int)(implicit p: Parameters) extends FPUModule()(p) {
    l2s.io.signedIn := ~in.bits.typ(0)
    l2s.io.in := intValue
    l2s.io.roundingMode := in.bits.rm
    l2s.io.detectTininess := hardfloat.consts.tininess_afterRounding
    mux.data := sanitizeNaN(l2s.io.out, FType.S)
    mux.exc := l2s.io.exceptionFlags
@@ -460,6 +461,7 @@ class IntToFP(val latency: Int)(implicit p: Parameters) extends FPUModule()(p) {
        l2d.io.signedIn := ~in.bits.typ(0)
        l2d.io.in := intValue
        l2d.io.roundingMode := in.bits.rm
        l2d.io.detectTininess := hardfloat.consts.tininess_afterRounding
        mux.data := Cat(l2d.io.out >> l2s.io.out.getWidth, l2s.io.out)
        when (!in.bits.singleIn) {
          mux.data := sanitizeNaN(l2d.io.out, FType.D)
@@ -511,11 +513,13 @@ class FPToFP(val latency: Int)(implicit p: Parameters) extends FPUModule()(p) {
        val d2s = Module(new hardfloat.RecFNToRecFN(dExpWidth, dSigWidth, sExpWidth, sSigWidth))
        d2s.io.in := in.bits.in1
        d2s.io.roundingMode := in.bits.rm
        d2s.io.detectTininess := hardfloat.consts.tininess_afterRounding
        val d2sOut = sanitizeNaN(d2s.io.out, FType.S)
        val s2d = Module(new hardfloat.RecFNToRecFN(sExpWidth, sSigWidth, dExpWidth, dSigWidth))
        s2d.io.in := maxType.unsafeConvert(in.bits.in1, FType.S)
        s2d.io.roundingMode := in.bits.rm
        s2d.io.detectTininess := hardfloat.consts.tininess_afterRounding
        val s2dOut = sanitizeNaN(s2d.io.out, FType.D)
        when (in.bits.singleOut) {
@@ -554,6 +558,7 @@ class FPUFMAPipe(val latency: Int, t: FType)(implicit p: Parameters) extends FPU
  val fma = Module(new hardfloat.MulAddRecFN(t.exp, t.sig))
  fma.io.op := in.fmaCmd
  fma.io.roundingMode := in.rm
  fma.io.detectTininess := hardfloat.consts.tininess_afterRounding
  fma.io.a := in.in1
  fma.io.b := in.in2
  fma.io.c := in.in3
@@ -775,6 +780,7 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) {
    divSqrt.io.a := fpiu.io.out.bits.in.in1
    divSqrt.io.b := fpiu.io.out.bits.in.in2
    divSqrt.io.roundingMode := fpiu.io.out.bits.in.rm
    divSqrt.io.detectTininess := hardfloat.consts.tininess_afterRounding
    when (divSqrt.io.inValid && divSqrt_inReady) {
      divSqrt_in_flight := true
@@ -794,6 +800,7 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) {
    val divSqrt_toSingle = Module(new hardfloat.RecFNToRecFN(11, 53, 8, 24))
    divSqrt_toSingle.io.in := divSqrt_wdata_double
    divSqrt_toSingle.io.roundingMode := divSqrt_rm
    divSqrt_toSingle.io.detectTininess := hardfloat.consts.tininess_afterRounding
    divSqrt_wdata := Mux(divSqrt_single, Cat(divSqrt_wdata_double >> divSqrt_toSingle.io.out.getWidth, sanitizeNaN(divSqrt_toSingle.io.out, FType.S)), divSqrt_wdata_double)
    divSqrt_flags := divSqrt_flags_double | Mux(divSqrt_single, divSqrt_toSingle.io.exceptionFlags, Bits(0))
  } else {
--- a/src/main/scala/tile/Interrupts.scala
+++ b/src/main/scala/tile/Interrupts.scala
@@ -30,7 +30,8 @@ trait HasExternalInterrupts extends HasTileParameters {
  // debug, msip, mtip, meip, seip, lip offsets in CSRs
  def csrIntMap: List[Int] = {
    val nlips = tileParams.core.nLocalInterrupts
-    List(65535, 3, 7, 11, 9) ++ List.tabulate(nlips)(_ + 16)
+    val seip = if (usingVM) Seq(9) else Nil
    List(65535, 3, 7, 11) ++ seip ++ List.tabulate(nlips)(_ + 16)
  }
 }
--- a/src/main/scala/uncore/tilelink2/WidthWidget.scala
+++ b/src/main/scala/uncore/tilelink2/WidthWidget.scala
@@ -132,7 +132,13 @@ class TLWidthWidget(innerBeatBytes: Int)(implicit p: Parameters) extends LazyMod
      } else if (edgeIn.manager.beatBytes > edgeOut.manager.beatBytes) {
        // split input to output
        val repeat = Wire(Bool())
-        repeat := split(edgeIn, Repeater(in, repeat), edgeOut, out)
+        val repeated = Repeater(in, repeat)
        val cated = Wire(repeated)
        cated <> repeated
        edgeIn.data(cated.bits) := Cat(
          edgeIn.data(repeated.bits)(edgeIn.manager.beatBytes*8-1, edgeOut.manager.beatBytes*8),
          edgeIn.data(in.bits)(edgeOut.manager.beatBytes*8-1, 0))
        repeat := split(edgeIn, cated, edgeOut, out)
      } else {
        // merge input to output
        merge(edgeIn, in, edgeOut, out)
--- a/src/main/scala/util/Package.scala
+++ b/src/main/scala/util/Package.scala
@@ -38,9 +38,17 @@ package object util {
  implicit def wcToUInt(c: WideCounter): UInt = c.value
  implicit class UIntToAugmentedUInt(val x: UInt) extends AnyVal {
-    def sextTo(n: Int): UInt =
+    def sextTo(n: Int): UInt = {
      require(x.getWidth <= n)
      if (x.getWidth == n) x
      else Cat(Fill(n - x.getWidth, x(x.getWidth-1)), x)
    }
    def padTo(n: Int): UInt = {
      require(x.getWidth <= n)
      if (x.getWidth == n) x
      else Cat(UInt(0, n - x.getWidth), x)
    }
    def extract(hi: Int, lo: Int): UInt = {
      if (hi == lo-1) UInt(0)
--- a/src/main/scala/util/ShiftQueue.scala
+++ b/src/main/scala/util/ShiftQueue.scala
@@ -0,0 +1,61 @@
 // See LICENSE.SiFive for license details.
 package util
 import Chisel._
 /** Implements the same interface as chisel3.util.Queue, but uses a shift
  * register internally.  It is less energy efficient whenever the queue
  * has more than one entry populated, but is faster on the dequeue side.
  * It is efficient for usually-empty flow-through queues.
  *
  * Occupied slots are kept packed towards index 0, so the head of the queue
  * is always `elts(0)`.  In addition to the QueueIO signals, `mask` exposes
  * the per-slot valid bits.
  *
  * @param gen     element type
  * @param entries number of slots
  * @param pipe    when true, `enq.ready` is also asserted whenever
  *                `deq.ready` is
  * @param flow    when true, `deq.valid` is also asserted whenever
  *                `enq.valid` is and, while the queue is empty, `enq.bits`
  *                is presented directly on `deq.bits`
  */
 class ShiftQueue[T <: Data](gen: T,
                            val entries: Int,
                            pipe: Boolean = false,
                            flow: Boolean = false)
    extends Module {
  val io = IO(new QueueIO(gen, entries) {
    val mask = UInt(OUTPUT, entries)
  })
  // valid(i) is set when elts(i) holds an element; it resets to all zeros.
  private val valid = RegInit(UInt(0, entries))
  private val elts = Reg(Vec(entries, gen))
  // Default to the handshake results; the flow logic below may override them.
  private val do_enq = Wire(init=io.enq.fire())
  private val do_deq = Wire(init=io.deq.fire())
  // Dequeue: every occupied slot moves one position towards the head.
  when (do_deq) {
    when (!do_enq) { valid := (valid >> 1) }
    for (i <- 1 until entries)
      when (valid(i)) { elts(i-1) := elts(i) }
  }
  // Simultaneous enqueue and dequeue: occupancy is unchanged, so the new
  // element lands in the highest occupied slot, which the shift vacates.
  when (do_enq && do_deq) {
    for (i <- 0 until entries)
      when (valid(i) && (if (i == entries-1) true.B else !valid(i+1))) { elts(i) := io.enq.bits }
  }
  // Enqueue only: the new element lands in the lowest free slot.
  when (do_enq && !do_deq) {
    valid := (valid << 1) | UInt(1)
    for (i <- 0 until entries)
      when (!valid(i) && (if (i == 0) true.B else valid(i-1))) { elts(i) := io.enq.bits }
  }
  io.enq.ready := !valid(entries-1)
  io.deq.valid := valid(0)
  io.deq.bits := elts.head
  if (flow) {
    when (io.enq.valid) { io.deq.valid := true.B }
    // Empty queue: bypass enq to deq.  Nothing stored is dequeued, and the
    // incoming element is only stored when the consumer is not ready for it.
    when (!valid(0)) {
      io.deq.bits := io.enq.bits
      do_deq := false.B
      when (io.deq.ready) { do_enq := false.B }
    }
  }
  if (pipe) {
    when (io.deq.ready) { io.enq.ready := true.B }
  }
  io.count := PopCount(valid)
  io.mask := valid
 }