diff --git a/rocket/.gitignore b/rocket/.gitignore new file mode 100644 index 00000000..eb5a316c --- /dev/null +++ b/rocket/.gitignore @@ -0,0 +1 @@ +target diff --git a/rocket/LICENSE b/rocket/LICENSE new file mode 100644 index 00000000..60e19fad --- /dev/null +++ b/rocket/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2011-2014, The Regents of the University of California +(Regents). All Rights Reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. Neither the name of the Regents nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, +SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING +OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS +BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED +HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE +MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. diff --git a/rocket/README.md b/rocket/README.md new file mode 100644 index 00000000..72707989 --- /dev/null +++ b/rocket/README.md @@ -0,0 +1,29 @@ +Rocket Core +=========== + +Rocket is a 6-stage single-issue in-order pipeline that executes the 64-bit +scalar RISC-V ISA. Rocket implements an MMU that supports page-based virtual +memory and is able to boot modern operating systems such as Linux. Rocket +also has an optional IEEE 754-2008-compliant FPU, which implements both +single- and double-precision floating-point operations, including fused +multiply-add. + +This repository is not intended to be self-running. To +instantiate a Rocket core, please use the Rocket chip generator found in the +rocket-chip git repository. + +The following table compares a 32-bit ARM Cortex-A5 core to a 64-bit RISC-V +Rocket core built in the same TSMC process (40GPLUS). The fourth column is the +ratio of RISC-V Rocket to ARM Cortex-A5. Both use single-instruction-issue, +in-order pipelines, yet the RISC-V core is faster, smaller, and uses less +power.
+ +ISA/Implementation | ARM Cortex-A5 | RISC-V Rocket | R/A +--- | --- | --- | --- +ISA Register Width | 32 bits | 64 bits | 2 +Frequency | >1 GHz | >1 GHz | 1 +Dhrystone Performance | 1.57 DMIPS/MHz | 1.72 DMIPS/MHz | 1.1 +Area excluding caches | 0.27 mm2 | 0.14 mm2 | 0.5 +Area with 16KB caches | 0.53 mm2 | 0.39 mm2 | 0.7 +Area Efficiency | 2.96 DMIPS/MHz/mm2 | 4.41 DMIPS/MHz/mm2 | 1.5 +Dynamic Power | <0.08 mW/MHz | 0.034 mW/MHz | >= 0.4 diff --git a/rocket/build.sbt b/rocket/build.sbt new file mode 100644 index 00000000..97c51700 --- /dev/null +++ b/rocket/build.sbt @@ -0,0 +1,10 @@ +organization := "edu.berkeley.cs" + +version := "1.2" + +name := "rocket" + +scalaVersion := "2.11.6" + +libraryDependencies ++= (Seq("chisel", "hardfloat", "uncore", "junctions", "cde").map { + dep: String => sys.props.get(dep + "Version") map { "edu.berkeley.cs" %% dep % _ }}).flatten diff --git a/rocket/src/main/scala/arbiter.scala b/rocket/src/main/scala/arbiter.scala new file mode 100644 index 00000000..1a686d5b --- /dev/null +++ b/rocket/src/main/scala/arbiter.scala @@ -0,0 +1,113 @@ +// See LICENSE for license details. + +package rocket + +import Chisel._ +import cde.{Parameters, Field} +import junctions.{ParameterizedBundle, DecoupledHelper} + +class HellaCacheArbiter(n: Int)(implicit p: Parameters) extends Module +{ + val io = new Bundle { + val requestor = Vec(n, new HellaCacheIO).flip + val mem = new HellaCacheIO + } + + if (n == 1) { + io.mem <> io.requestor.head + } else { + val s1_id = Reg(UInt()) + val s2_id = Reg(next=s1_id) + + io.mem.invalidate_lr := io.requestor.map(_.invalidate_lr).reduce(_||_) + io.mem.req.valid := io.requestor.map(_.req.valid).reduce(_||_) + io.requestor(0).req.ready := io.mem.req.ready + for (i <- 1 until n) + io.requestor(i).req.ready := io.requestor(i-1).req.ready && !io.requestor(i-1).req.valid + + for (i <- n-1 to 0 by -1) { + val req = io.requestor(i).req + def connect_s0() = { + io.mem.req.bits.cmd := req.bits.cmd + io.mem.req.bits.typ := req.bits.typ + io.mem.req.bits.addr := req.bits.addr + io.mem.req.bits.phys := req.bits.phys + io.mem.req.bits.tag := Cat(req.bits.tag, UInt(i, log2Up(n))) + s1_id := UInt(i) + } + def connect_s1() = { + io.mem.s1_kill := io.requestor(i).s1_kill + io.mem.s1_data := io.requestor(i).s1_data + } + + if (i == n-1) { + connect_s0() + connect_s1() + } else { + when (req.valid) { connect_s0() } + when (s1_id === UInt(i)) { connect_s1() } + } + } + + for (i <- 0 until n) { + val resp = io.requestor(i).resp + val tag_hit = io.mem.resp.bits.tag(log2Up(n)-1,0) === UInt(i) + resp.valid := io.mem.resp.valid && tag_hit + io.requestor(i).xcpt := io.mem.xcpt + io.requestor(i).ordered := io.mem.ordered + io.requestor(i).s2_nack := io.mem.s2_nack && s2_id === UInt(i) + resp.bits := io.mem.resp.bits + resp.bits.tag := io.mem.resp.bits.tag >> log2Up(n) + + io.requestor(i).replay_next := io.mem.replay_next + } + } +} + +class InOrderArbiter[T <: Data, U <: Data](reqTyp: T, respTyp: U, n: Int) + (implicit p: Parameters) extends Module { + val io = new Bundle { + val in_req = Vec(n, Decoupled(reqTyp)).flip + val in_resp = Vec(n, Decoupled(respTyp)) + val out_req = Decoupled(reqTyp) + val out_resp = Decoupled(respTyp).flip + } + + if (n > 1) { + val route_q = Module(new Queue(UInt(width = log2Up(n)), 2)) + val req_arb = Module(new RRArbiter(reqTyp, n)) + req_arb.io.in <> io.in_req + + val req_helper = DecoupledHelper( + req_arb.io.out.valid, + route_q.io.enq.ready, + io.out_req.ready) + + io.out_req.bits := req_arb.io.out.bits + io.out_req.valid := 
req_helper.fire(io.out_req.ready) + + route_q.io.enq.bits := req_arb.io.chosen + route_q.io.enq.valid := req_helper.fire(route_q.io.enq.ready) + + req_arb.io.out.ready := req_helper.fire(req_arb.io.out.valid) + + val resp_sel = route_q.io.deq.bits + val resp_ready = io.in_resp(resp_sel).ready + val resp_helper = DecoupledHelper( + resp_ready, + route_q.io.deq.valid, + io.out_resp.valid) + + val resp_valid = resp_helper.fire(resp_ready) + for (i <- 0 until n) { + io.in_resp(i).bits := io.out_resp.bits + io.in_resp(i).valid := resp_valid && resp_sel === UInt(i) + } + + route_q.io.deq.ready := resp_helper.fire(route_q.io.deq.valid) + io.out_resp.ready := resp_helper.fire(io.out_resp.valid) + } else { + io.out_req <> io.in_req.head + io.in_resp.head <> io.out_resp + } +} diff --git a/rocket/src/main/scala/breakpoint.scala b/rocket/src/main/scala/breakpoint.scala new file mode 100644 index 00000000..ee484c28 --- /dev/null +++ b/rocket/src/main/scala/breakpoint.scala @@ -0,0 +1,82 @@ +// See LICENSE for license details. + +package rocket + +import Chisel._ +import Util._ +import cde.Parameters + +class TDRSelect(implicit p: Parameters) extends CoreBundle()(p) { + val tdrmode = Bool() + val reserved = UInt(width = xLen - 1 - log2Up(nTDR)) + val tdrindex = UInt(width = log2Up(nTDR)) + + def nTDR = p(NBreakpoints) +} + +class BPControl(implicit p: Parameters) extends CoreBundle()(p) { + val tdrtype = UInt(width = 4) + val bpamaskmax = UInt(width = 5) + val reserved = UInt(width = xLen-28) + val bpaction = UInt(width = 8) + val bpmatch = UInt(width = 4) + val m = Bool() + val h = Bool() + val s = Bool() + val u = Bool() + val r = Bool() + val w = Bool() + val x = Bool() + + def tdrType = 1 + def bpaMaskMax = 4 + def enabled(mstatus: MStatus) = Cat(m, h, s, u)(mstatus.prv) +} + +class BP(implicit p: Parameters) extends CoreBundle()(p) { + val control = new BPControl + val address = UInt(width = vaddrBits) + + def mask(dummy: Int = 0) = { + var mask: UInt = control.bpmatch(1) + for (i <- 1 until control.bpaMaskMax) + mask = Cat(mask(i-1) && address(i-1), mask) + mask + } + + def pow2AddressMatch(x: UInt) = + (~x | mask()) === (~address | mask()) +} + +class BreakpointUnit(implicit p: Parameters) extends CoreModule()(p) { + val io = new Bundle { + val status = new MStatus().asInput + val bp = Vec(p(NBreakpoints), new BP).asInput + val pc = UInt(INPUT, vaddrBits) + val ea = UInt(INPUT, vaddrBits) + val xcpt_if = Bool(OUTPUT) + val xcpt_ld = Bool(OUTPUT) + val xcpt_st = Bool(OUTPUT) + } + + io.xcpt_if := false + io.xcpt_ld := false + io.xcpt_st := false + + for (bp <- io.bp) { + when (bp.control.enabled(io.status)) { + when (bp.pow2AddressMatch(io.pc) && bp.control.x) { io.xcpt_if := true } + when (bp.pow2AddressMatch(io.ea) && bp.control.r) { io.xcpt_ld := true } + when (bp.pow2AddressMatch(io.ea) && bp.control.w) { io.xcpt_st := true } + } + } + + if (!io.bp.isEmpty) for ((bpl, bph) <- io.bp zip io.bp.tail) { + def matches(x: UInt) = !(x < bpl.address) && x < bph.address + when (bph.control.enabled(io.status) && bph.control.bpmatch === 1) { + when (matches(io.pc) && bph.control.x) { io.xcpt_if := true } + when (matches(io.ea) && bph.control.r) { io.xcpt_ld := true } + when (matches(io.ea) && bph.control.w) { io.xcpt_st := true } + } + } +} diff --git a/rocket/src/main/scala/btb.scala b/rocket/src/main/scala/btb.scala new file mode 100644 index 00000000..d16c4725 --- /dev/null +++ b/rocket/src/main/scala/btb.scala @@ -0,0 +1,272 @@ +// See LICENSE for license details. 
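+// The BTB below is a fully-associative predictor queried on every fetch and
+// updated at branch resolution; it is paired with an optional BHT and
+// return-address stack (RAS) later in this file. A minimal wiring sketch,
+// assuming hypothetical fetch-stage signals (fetchValid, fetchPC, nextPC)
+// and update bundles driven from the execute stage:
+//
+//   val btb = Module(new BTB)
+//   btb.io.req.valid := fetchValid
+//   btb.io.req.bits.addr := fetchPC
+//   when (btb.io.resp.valid && btb.io.resp.bits.taken) {
+//     nextPC := btb.io.resp.bits.target  // speculatively redirect fetch
+//   }
+//   btb.io.btb_update := exBtbUpdate  // on mispredicted branches and jumps
+//   btb.io.bht_update := exBhtUpdate  // on every resolved conditional branch
+//   btb.io.ras_update := exRasUpdate  // on calls and returns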
+ +package rocket + +import Chisel._ +import junctions._ +import cde.{Parameters, Field} +import Util._ + +case object BtbKey extends Field[BtbParameters] + +case class BtbParameters( + nEntries: Int = 62, + nRAS: Int = 2, + updatesOutOfOrder: Boolean = false) + +abstract trait HasBtbParameters extends HasCoreParameters { + val matchBits = pgIdxBits + val entries = p(BtbKey).nEntries + val nRAS = p(BtbKey).nRAS + val updatesOutOfOrder = p(BtbKey).updatesOutOfOrder + val nPages = ((1 max(log2Up(entries)))+1)/2*2 // control logic assumes 2 divides pages + val opaqueBits = log2Up(entries) + val nBHT = 1 << log2Up(entries*2) +} + +abstract class BtbModule(implicit val p: Parameters) extends Module with HasBtbParameters +abstract class BtbBundle(implicit val p: Parameters) extends ParameterizedBundle()(p) + with HasBtbParameters + +class RAS(nras: Int) { + def push(addr: UInt): Unit = { + when (count < nras) { count := count + 1 } + val nextPos = Mux(Bool(isPow2(nras)) || pos < nras-1, pos+1, UInt(0)) + stack(nextPos) := addr + pos := nextPos + } + def peek: UInt = stack(pos) + def pop(): Unit = when (!isEmpty) { + count := count - 1 + pos := Mux(Bool(isPow2(nras)) || pos > 0, pos-1, UInt(nras-1)) + } + def clear(): Unit = count := UInt(0) + def isEmpty: Bool = count === UInt(0) + + private val count = Reg(UInt(width = log2Up(nras+1))) + private val pos = Reg(UInt(width = log2Up(nras))) + private val stack = Reg(Vec(nras, UInt())) +} + +class BHTResp(implicit p: Parameters) extends BtbBundle()(p) { + val history = UInt(width = log2Up(nBHT).max(1)) + val value = UInt(width = 2) +} + +// BHT contains table of 2-bit counters and a global history register. +// The BHT only predicts and updates when there is a BTB hit. +// The global history: +// - updated speculatively in fetch (if there's a BTB hit). +// - on a mispredict, the history register is reset (again, only if BTB hit). +// The counter table: +// - each counter corresponds with the address of the fetch packet ("fetch pc"). +// - updated when a branch resolves (and BTB was a hit for that branch). +// The updating branch must provide its "fetch pc". +class BHT(nbht: Int)(implicit p: Parameters) { + val nbhtbits = log2Up(nbht) + def get(addr: UInt, update: Bool): BHTResp = { + val res = Wire(new BHTResp) + val index = addr(nbhtbits+1,2) ^ history + res.value := table(index) + res.history := history + val taken = res.value(0) + when (update) { history := Cat(taken, history(nbhtbits-1,1)) } + res + } + def update(addr: UInt, d: BHTResp, taken: Bool, mispredict: Bool): Unit = { + val index = addr(nbhtbits+1,2) ^ d.history + table(index) := Cat(taken, (d.value(1) & d.value(0)) | ((d.value(1) | d.value(0)) & taken)) + when (mispredict) { history := Cat(taken, d.history(nbhtbits-1,1)) } + } + + private val table = Mem(nbht, UInt(width = 2)) + val history = Reg(UInt(width = nbhtbits)) +} + +// BTB update occurs during branch resolution (and only on a mispredict). +// - "pc" is what future fetch PCs will tag match against. +// - "br_pc" is the PC of the branch instruction. +class BTBUpdate(implicit p: Parameters) extends BtbBundle()(p) { + val prediction = Valid(new BTBResp) + val pc = UInt(width = vaddrBits) + val target = UInt(width = vaddrBits) + val taken = Bool() + val isJump = Bool() + val isReturn = Bool() + val br_pc = UInt(width = vaddrBits) +} + +// BHT update occurs during branch resolution on all conditional branches. +// - "pc" is what future fetch PCs will tag match against. 
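+// - "taken" is the resolved direction of the conditional branch.
+// - "mispredict" indicates the prediction was wrong, which also repairs the
+//   speculatively updated global history (see BHT.update above).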
+class BHTUpdate(implicit p: Parameters) extends BtbBundle()(p) { + val prediction = Valid(new BTBResp) + val pc = UInt(width = vaddrBits) + val taken = Bool() + val mispredict = Bool() +} + +class RASUpdate(implicit p: Parameters) extends BtbBundle()(p) { + val isCall = Bool() + val isReturn = Bool() + val returnAddr = UInt(width = vaddrBits) + val prediction = Valid(new BTBResp) +} + +// - "bridx" is the low-order PC bits of the predicted branch (after +// shifting off the lowest log(inst_bytes) bits off). +// - "mask" provides a mask of valid instructions (instructions are +// masked off by the predicted taken branch from the BTB). +class BTBResp(implicit p: Parameters) extends BtbBundle()(p) { + val taken = Bool() + val mask = Bits(width = fetchWidth) + val bridx = Bits(width = log2Up(fetchWidth)) + val target = UInt(width = vaddrBits) + val entry = UInt(width = opaqueBits) + val bht = new BHTResp +} + +class BTBReq(implicit p: Parameters) extends BtbBundle()(p) { + val addr = UInt(width = vaddrBits) +} + +// fully-associative branch target buffer +// Higher-performance processors may cause BTB updates to occur out-of-order, +// which requires an extra CAM port for updates (to ensure no duplicates get +// placed in BTB). +class BTB(implicit p: Parameters) extends BtbModule { + val io = new Bundle { + val req = Valid(new BTBReq).flip + val resp = Valid(new BTBResp) + val btb_update = Valid(new BTBUpdate).flip + val bht_update = Valid(new BHTUpdate).flip + val ras_update = Valid(new RASUpdate).flip + } + + val idxs = Reg(Vec(entries, UInt(width=matchBits - log2Up(coreInstBytes)))) + val idxPages = Reg(Vec(entries, UInt(width=log2Up(nPages)))) + val tgts = Reg(Vec(entries, UInt(width=matchBits - log2Up(coreInstBytes)))) + val tgtPages = Reg(Vec(entries, UInt(width=log2Up(nPages)))) + val pages = Reg(Vec(nPages, UInt(width=vaddrBits - matchBits))) + val pageValid = Reg(init = UInt(0, nPages)) + val idxPagesOH = idxPages.map(UIntToOH(_)(nPages-1,0)) + val tgtPagesOH = tgtPages.map(UIntToOH(_)(nPages-1,0)) + + val useRAS = Reg(UInt(width = entries)) + val isJump = Reg(UInt(width = entries)) + val brIdx = Reg(Vec(entries, UInt(width=log2Up(fetchWidth)))) + + private def page(addr: UInt) = addr >> matchBits + private def pageMatch(addr: UInt) = { + val p = page(addr) + pageValid & pages.map(_ === p).toBits + } + private def tagMatch(addr: UInt, pgMatch: UInt) = { + val idxMatch = idxs.map(_ === addr(matchBits-1, log2Up(coreInstBytes))).toBits + val idxPageMatch = idxPagesOH.map(_ & pgMatch).map(_.orR).toBits + idxMatch & idxPageMatch + } + + val r_btb_update = Pipe(io.btb_update) + val update_target = io.req.bits.addr + + val pageHit = pageMatch(io.req.bits.addr) + val hitsVec = tagMatch(io.req.bits.addr, pageHit) + val hits = hitsVec.toBits + val updatePageHit = pageMatch(r_btb_update.bits.pc) + + val updateHits = tagMatch(r_btb_update.bits.pc, updatePageHit) + val updateHit = if (updatesOutOfOrder) updateHits.orR else r_btb_update.bits.prediction.valid + val updateHitAddr = if (updatesOutOfOrder) OHToUInt(updateHits) else r_btb_update.bits.prediction.bits.entry + + // guarantee one-hotness of idx after reset + val resetting = Reg(init = Bool(true)) + val (nextRepl, wrap) = Counter(resetting || (r_btb_update.valid && !updateHit), entries) + when (wrap) { resetting := false } + + val useUpdatePageHit = updatePageHit.orR + val usePageHit = pageHit.orR + val doIdxPageRepl = !useUpdatePageHit + val nextPageRepl = Reg(UInt(width = log2Ceil(nPages))) + val idxPageRepl = Mux(usePageHit, 
Cat(pageHit(nPages-2,0), pageHit(nPages-1)), UIntToOH(nextPageRepl)) + val idxPageUpdateOH = Mux(useUpdatePageHit, updatePageHit, idxPageRepl) + val idxPageUpdate = OHToUInt(idxPageUpdateOH) + val idxPageReplEn = Mux(doIdxPageRepl, idxPageRepl, UInt(0)) + + val samePage = page(r_btb_update.bits.pc) === page(update_target) + val doTgtPageRepl = !samePage && !usePageHit + val tgtPageRepl = Mux(samePage, idxPageUpdateOH, Cat(idxPageUpdateOH(nPages-2,0), idxPageUpdateOH(nPages-1))) + val tgtPageUpdate = OHToUInt(Mux(usePageHit, pageHit, tgtPageRepl)) + val tgtPageReplEn = Mux(doTgtPageRepl, tgtPageRepl, UInt(0)) + + when (r_btb_update.valid && (doIdxPageRepl || doTgtPageRepl)) { + val both = doIdxPageRepl && doTgtPageRepl + val next = nextPageRepl + Mux[UInt](both, 2, 1) + nextPageRepl := Mux(next >= nPages, next(0), next) + } + + when (r_btb_update.valid || resetting) { + assert(resetting || io.req.bits.addr === r_btb_update.bits.target, "BTB request != I$ target") + + val waddr = Mux(updateHit && !resetting, updateHitAddr, nextRepl) + val mask = UIntToOH(waddr) + val newIdx = r_btb_update.bits.pc(matchBits-1, log2Up(coreInstBytes)) + idxs(waddr) := Mux(resetting, Cat(newIdx >> log2Ceil(entries), nextRepl), newIdx) + tgts(waddr) := update_target(matchBits-1, log2Up(coreInstBytes)) + idxPages(waddr) := idxPageUpdate + tgtPages(waddr) := tgtPageUpdate + useRAS := Mux(r_btb_update.bits.isReturn, useRAS | mask, useRAS & ~mask) + isJump := Mux(r_btb_update.bits.isJump, isJump | mask, isJump & ~mask) + if (fetchWidth > 1) + brIdx(waddr) := r_btb_update.bits.br_pc >> log2Up(coreInstBytes) + + require(nPages % 2 == 0) + val idxWritesEven = !idxPageUpdate(0) + + def writeBank(i: Int, mod: Int, en: UInt, data: UInt) = + for (i <- i until nPages by mod) + when (en(i)) { pages(i) := data } + + writeBank(0, 2, Mux(idxWritesEven, idxPageReplEn, tgtPageReplEn), + Mux(idxWritesEven, page(r_btb_update.bits.pc), page(update_target))) + writeBank(1, 2, Mux(idxWritesEven, tgtPageReplEn, idxPageReplEn), + Mux(idxWritesEven, page(update_target), page(r_btb_update.bits.pc))) + pageValid := pageValid | tgtPageReplEn | idxPageReplEn + } + + io.resp.valid := hits.orR + io.resp.bits.taken := io.resp.valid + io.resp.bits.target := Cat(Mux1H(Mux1H(hitsVec, tgtPagesOH), pages), Mux1H(hitsVec, tgts) << log2Up(coreInstBytes)) + io.resp.bits.entry := OHToUInt(hits) + io.resp.bits.bridx := Mux1H(hitsVec, brIdx) + io.resp.bits.mask := Cat((UInt(1) << ~Mux(io.resp.bits.taken, ~io.resp.bits.bridx, UInt(0)))-1, UInt(1)) + + if (nBHT > 0) { + val bht = new BHT(nBHT) + val isBranch = !(hits & isJump).orR + val res = bht.get(io.req.bits.addr, io.req.valid && io.resp.valid && isBranch) + val update_btb_hit = io.bht_update.bits.prediction.valid + when (io.bht_update.valid && update_btb_hit) { + bht.update(io.bht_update.bits.pc, io.bht_update.bits.prediction.bits.bht, io.bht_update.bits.taken, io.bht_update.bits.mispredict) + } + when (!res.value(0) && isBranch) { io.resp.bits.taken := false } + io.resp.bits.bht := res + } + + if (nRAS > 0) { + val ras = new RAS(nRAS) + val doPeek = (hits & useRAS).orR + when (!ras.isEmpty && doPeek) { + io.resp.bits.target := ras.peek + } + when (io.ras_update.valid) { + when (io.ras_update.bits.isCall) { + ras.push(io.ras_update.bits.returnAddr) + when (doPeek) { + io.resp.bits.target := io.ras_update.bits.returnAddr + } + }.elsewhen (io.ras_update.bits.isReturn && io.ras_update.bits.prediction.valid) { + ras.pop() + } + } + } +} diff --git a/rocket/src/main/scala/consts.scala 
b/rocket/src/main/scala/consts.scala new file mode 100644 index 00000000..74386c12 --- /dev/null +++ b/rocket/src/main/scala/consts.scala @@ -0,0 +1,49 @@ +// See LICENSE for license details. + +package rocket +package constants + +import Chisel._ +import scala.math._ + +trait ScalarOpConstants { + val SZ_BR = 3 + val BR_X = BitPat("b???") + val BR_EQ = UInt(0, 3) + val BR_NE = UInt(1, 3) + val BR_J = UInt(2, 3) + val BR_N = UInt(3, 3) + val BR_LT = UInt(4, 3) + val BR_GE = UInt(5, 3) + val BR_LTU = UInt(6, 3) + val BR_GEU = UInt(7, 3) + + val A1_X = BitPat("b??") + val A1_ZERO = UInt(0, 2) + val A1_RS1 = UInt(1, 2) + val A1_PC = UInt(2, 2) + + val IMM_X = BitPat("b???") + val IMM_S = UInt(0, 3) + val IMM_SB = UInt(1, 3) + val IMM_U = UInt(2, 3) + val IMM_UJ = UInt(3, 3) + val IMM_I = UInt(4, 3) + val IMM_Z = UInt(5, 3) + + val A2_X = BitPat("b??") + val A2_ZERO = UInt(0, 2) + val A2_FOUR = UInt(1, 2) + val A2_RS2 = UInt(2, 2) + val A2_IMM = UInt(3, 2) + + val X = BitPat("b?") + val N = BitPat("b0") + val Y = BitPat("b1") + + val SZ_DW = 1 + val DW_X = X + val DW_32 = N + val DW_64 = Y + val DW_XPR = Y +} diff --git a/rocket/src/main/scala/csr.scala b/rocket/src/main/scala/csr.scala new file mode 100644 index 00000000..86090c2f --- /dev/null +++ b/rocket/src/main/scala/csr.scala @@ -0,0 +1,589 @@ +// See LICENSE for license details. + +package rocket + +import Chisel._ +import Util._ +import Instructions._ +import cde.{Parameters, Field} +import uncore.devices._ +import junctions.AddrMap + +class MStatus extends Bundle { + val debug = Bool() // not truly part of mstatus, but convenient + val prv = UInt(width = PRV.SZ) // not truly part of mstatus, but convenient + val sd = Bool() + val zero3 = UInt(width = 31) + val sd_rv32 = Bool() + val zero2 = UInt(width = 2) + val vm = UInt(width = 5) + val zero1 = UInt(width = 4) + val mxr = Bool() + val pum = Bool() + val mprv = Bool() + val xs = UInt(width = 2) + val fs = UInt(width = 2) + val mpp = UInt(width = 2) + val hpp = UInt(width = 2) + val spp = UInt(width = 1) + val mpie = Bool() + val hpie = Bool() + val spie = Bool() + val upie = Bool() + val mie = Bool() + val hie = Bool() + val sie = Bool() + val uie = Bool() +} + +class DCSR extends Bundle { + val xdebugver = UInt(width = 2) + val ndreset = Bool() + val fullreset = Bool() + val hwbpcount = UInt(width = 12) + val ebreakm = Bool() + val ebreakh = Bool() + val ebreaks = Bool() + val ebreaku = Bool() + val zero2 = Bool() + val stopcycle = Bool() + val stoptime = Bool() + val cause = UInt(width = 3) + val debugint = Bool() + val zero1 = Bool() + val halt = Bool() + val step = Bool() + val prv = UInt(width = PRV.SZ) +} + +class MIP extends Bundle { + val rocc = Bool() + val meip = Bool() + val heip = Bool() + val seip = Bool() + val ueip = Bool() + val mtip = Bool() + val htip = Bool() + val stip = Bool() + val utip = Bool() + val msip = Bool() + val hsip = Bool() + val ssip = Bool() + val usip = Bool() +} + +class PTBR(implicit p: Parameters) extends CoreBundle()(p) { + require(maxPAddrBits - pgIdxBits + asIdBits <= xLen) + val asid = UInt(width = asIdBits) + val ppn = UInt(width = maxPAddrBits - pgIdxBits) +} + +object PRV +{ + val SZ = 2 + val U = 0 + val S = 1 + val H = 2 + val M = 3 +} + +object CSR +{ + // commands + val SZ = 3 + val X = BitPat.DC(SZ) + val N = UInt(0,SZ) + val W = UInt(1,SZ) + val S = UInt(2,SZ) + val C = UInt(3,SZ) + val I = UInt(4,SZ) + val R = UInt(5,SZ) + + val ADDRSZ = 12 +} + +class CSRFileIO(implicit p: Parameters) extends CoreBundle { + val prci = new 
PRCITileIO().flip + val rw = new Bundle { + val addr = UInt(INPUT, CSR.ADDRSZ) + val cmd = Bits(INPUT, CSR.SZ) + val rdata = Bits(OUTPUT, xLen) + val wdata = Bits(INPUT, xLen) + } + + val csr_stall = Bool(OUTPUT) + val csr_xcpt = Bool(OUTPUT) + val eret = Bool(OUTPUT) + val singleStep = Bool(OUTPUT) + + val status = new MStatus().asOutput + val ptbr = new PTBR().asOutput + val evec = UInt(OUTPUT, vaddrBitsExtended) + val exception = Bool(INPUT) + val retire = UInt(INPUT, log2Up(1+retireWidth)) + val custom_mrw_csrs = Vec(nCustomMrwCsrs, UInt(INPUT, xLen)) + val cause = UInt(INPUT, xLen) + val pc = UInt(INPUT, vaddrBitsExtended) + val badaddr = UInt(INPUT, vaddrBitsExtended) + val fatc = Bool(OUTPUT) + val time = UInt(OUTPUT, xLen) + val fcsr_rm = Bits(OUTPUT, FPConstants.RM_SZ) + val fcsr_flags = Valid(Bits(width = FPConstants.FLAGS_SZ)).flip + val rocc = new RoCCInterface().flip + val interrupt = Bool(OUTPUT) + val interrupt_cause = UInt(OUTPUT, xLen) + val bp = Vec(p(NBreakpoints), new BP).asOutput +} + +class CSRFile(implicit p: Parameters) extends CoreModule()(p) +{ + val io = new CSRFileIO + + val reset_mstatus = Wire(init=new MStatus().fromBits(0)) + reset_mstatus.mpp := PRV.M + reset_mstatus.prv := PRV.M + val reg_mstatus = Reg(init=reset_mstatus) + + val reset_dcsr = Wire(init=new DCSR().fromBits(0)) + reset_dcsr.xdebugver := 1 + reset_dcsr.prv := PRV.M + val reg_dcsr = Reg(init=reset_dcsr) + + val (supported_interrupts, delegable_interrupts) = { + val sup = Wire(init=new MIP().fromBits(0)) + sup.ssip := Bool(p(UseVM)) + sup.msip := true + sup.stip := Bool(p(UseVM)) + sup.mtip := true + sup.meip := true + sup.seip := Bool(p(UseVM)) + sup.rocc := usingRoCC + + val del = Wire(init=sup) + del.msip := false + del.mtip := false + del.meip := false + + (sup.toBits, del.toBits) + } + val delegable_exceptions = UInt(Seq( + Causes.misaligned_fetch, + Causes.fault_fetch, + Causes.breakpoint, + Causes.fault_load, + Causes.fault_store, + Causes.user_ecall).map(1 << _).sum) + + val exception = io.exception || io.csr_xcpt + val reg_debug = Reg(init=Bool(false)) + val reg_dpc = Reg(UInt(width = vaddrBitsExtended)) + val reg_dscratch = Reg(UInt(width = xLen)) + + val reg_singleStepped = Reg(Bool()) + when (io.retire(0) || exception) { reg_singleStepped := true } + when (!io.singleStep) { reg_singleStepped := false } + assert(!io.singleStep || io.retire <= UInt(1)) + assert(!reg_singleStepped || io.retire === UInt(0)) + + val reg_tdrselect = Reg(new TDRSelect) + val reg_bp = Reg(Vec(1 << log2Up(p(NBreakpoints)), new BP)) + + val reg_mie = Reg(init=UInt(0, xLen)) + val reg_mideleg = Reg(init=UInt(0, xLen)) + val reg_medeleg = Reg(init=UInt(0, xLen)) + val reg_mip = Reg(new MIP) + val reg_mepc = Reg(UInt(width = vaddrBitsExtended)) + val reg_mcause = Reg(Bits(width = xLen)) + val reg_mbadaddr = Reg(UInt(width = vaddrBitsExtended)) + val reg_mscratch = Reg(Bits(width = xLen)) + val reg_mtvec = Reg(init=UInt(p(MtvecInit), paddrBits min xLen)) + + val reg_sepc = Reg(UInt(width = vaddrBitsExtended)) + val reg_scause = Reg(Bits(width = xLen)) + val reg_sbadaddr = Reg(UInt(width = vaddrBitsExtended)) + val reg_sscratch = Reg(Bits(width = xLen)) + val reg_stvec = Reg(UInt(width = vaddrBits)) + val reg_sptbr = Reg(new PTBR) + val reg_wfi = Reg(init=Bool(false)) + + val reg_fflags = Reg(UInt(width = 5)) + val reg_frm = Reg(UInt(width = 3)) + + val reg_instret = WideCounter(64, io.retire) + val reg_cycle: UInt = if (enableCommitLog) { reg_instret } else { WideCounter(64) } + + val mip = Wire(init=reg_mip) + 
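// The architectural mip register is first overlaid with externally driven
+ // interrupt sources (here, the RoCC accelerator interrupt); the result is
+ // then masked by mie and the delegation registers to form the pending
+ // interrupt vector used for interrupt taking below.
+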
mip.rocc := io.rocc.interrupt + val read_mip = mip.toBits & supported_interrupts + + val pending_interrupts = read_mip & reg_mie + val m_interrupts = Mux(!reg_debug && (reg_mstatus.prv < PRV.M || (reg_mstatus.prv === PRV.M && reg_mstatus.mie)), pending_interrupts & ~reg_mideleg, UInt(0)) + val s_interrupts = Mux(!reg_debug && (reg_mstatus.prv < PRV.S || (reg_mstatus.prv === PRV.S && reg_mstatus.sie)), pending_interrupts & reg_mideleg, UInt(0)) + val all_interrupts = m_interrupts | s_interrupts + val interruptMSB = BigInt(1) << (xLen-1) + val interruptCause = interruptMSB + PriorityEncoder(all_interrupts) + io.interrupt := all_interrupts.orR && !io.singleStep || reg_singleStepped + io.interrupt_cause := interruptCause + io.bp := reg_bp take p(NBreakpoints) + + val debugIntCause = reg_mip.getWidth + // debug interrupts are only masked by being in debug mode + when (Bool(usingDebug) && reg_dcsr.debugint && !reg_debug) { + io.interrupt := true + io.interrupt_cause := interruptMSB + debugIntCause + } + + val system_insn = io.rw.cmd === CSR.I + val cpu_ren = io.rw.cmd =/= CSR.N && !system_insn + + val isa_string = "IM" + + (if (usingVM) "S" else "") + + (if (usingUser) "U" else "") + + (if (usingAtomics) "A" else "") + + (if (usingFPU) "FD" else "") + + (if (usingRoCC) "X" else "") + val isa = (BigInt(log2Ceil(xLen) - 4) << (xLen-2)) | + isa_string.map(x => 1 << (x - 'A')).reduce(_|_) + val read_mstatus = io.status.toBits()(xLen-1,0) + + val read_mapping = collection.mutable.LinkedHashMap[Int,Bits]( + CSRs.tdrselect -> reg_tdrselect.toBits, + CSRs.tdrdata1 -> reg_bp(reg_tdrselect.tdrindex).control.toBits, + CSRs.tdrdata2 -> reg_bp(reg_tdrselect.tdrindex).address, + CSRs.mimpid -> UInt(0), + CSRs.marchid -> UInt(0), + CSRs.mvendorid -> UInt(0), + CSRs.mcycle -> reg_cycle, + CSRs.minstret -> reg_instret, + CSRs.mucounteren -> UInt(0), + CSRs.mutime_delta -> UInt(0), + CSRs.mucycle_delta -> UInt(0), + CSRs.muinstret_delta -> UInt(0), + CSRs.misa -> UInt(isa), + CSRs.mstatus -> read_mstatus, + CSRs.mtvec -> reg_mtvec, + CSRs.mip -> read_mip, + CSRs.mie -> reg_mie, + CSRs.mideleg -> reg_mideleg, + CSRs.medeleg -> reg_medeleg, + CSRs.mscratch -> reg_mscratch, + CSRs.mepc -> reg_mepc.sextTo(xLen), + CSRs.mbadaddr -> reg_mbadaddr.sextTo(xLen), + CSRs.mcause -> reg_mcause, + CSRs.mhartid -> io.prci.id) + + if (usingDebug) { + read_mapping += CSRs.dcsr -> reg_dcsr.toBits + read_mapping += CSRs.dpc -> reg_dpc.toBits + read_mapping += CSRs.dscratch -> reg_dscratch.toBits + } + + if (usingFPU) { + read_mapping += CSRs.fflags -> reg_fflags + read_mapping += CSRs.frm -> reg_frm + read_mapping += CSRs.fcsr -> Cat(reg_frm, reg_fflags) + } + + if (usingVM) { + val read_sie = reg_mie & reg_mideleg + val read_sip = read_mip & reg_mideleg + val read_sstatus = Wire(init=io.status) + read_sstatus.vm := 0 + read_sstatus.mprv := 0 + read_sstatus.mpp := 0 + read_sstatus.hpp := 0 + read_sstatus.mpie := 0 + read_sstatus.hpie := 0 + read_sstatus.mie := 0 + read_sstatus.hie := 0 + + read_mapping += CSRs.sstatus -> (read_sstatus.toBits())(xLen-1,0) + read_mapping += CSRs.sip -> read_sip.toBits + read_mapping += CSRs.sie -> read_sie.toBits + read_mapping += CSRs.sscratch -> reg_sscratch + read_mapping += CSRs.scause -> reg_scause + read_mapping += CSRs.sbadaddr -> reg_sbadaddr.sextTo(xLen) + read_mapping += CSRs.sptbr -> reg_sptbr.toBits + read_mapping += CSRs.sepc -> reg_sepc.sextTo(xLen) + read_mapping += CSRs.stvec -> reg_stvec.sextTo(xLen) + read_mapping += CSRs.mscounteren -> UInt(0) + read_mapping += CSRs.mstime_delta -> 
UInt(0) + read_mapping += CSRs.mscycle_delta -> UInt(0) + read_mapping += CSRs.msinstret_delta -> UInt(0) + } + + if (xLen == 32) { + read_mapping += CSRs.mcycleh -> (reg_cycle >> 32) + read_mapping += CSRs.minstreth -> (reg_instret >> 32) + read_mapping += CSRs.mutime_deltah -> UInt(0) + read_mapping += CSRs.mucycle_deltah -> UInt(0) + read_mapping += CSRs.muinstret_deltah -> UInt(0) + if (usingVM) { + read_mapping += CSRs.mstime_deltah -> UInt(0) + read_mapping += CSRs.mscycle_deltah -> UInt(0) + read_mapping += CSRs.msinstret_deltah -> UInt(0) + } + } + + for (i <- 0 until nCustomMrwCsrs) { + val addr = 0xff0 + i + require(addr < (1 << CSR.ADDRSZ)) + require(!read_mapping.contains(addr), "custom MRW CSR address " + i + " is already in use") + read_mapping += addr -> io.custom_mrw_csrs(i) + } + + for ((addr, i) <- roccCsrs.zipWithIndex) { + require(!read_mapping.contains(addr), "RoCC: CSR address " + addr + " is already in use") + read_mapping += addr -> io.rocc.csr.rdata(i) + } + + val decoded_addr = read_mapping map { case (k, v) => k -> (io.rw.addr === k) } + + val addr_valid = decoded_addr.values.reduce(_||_) + val fp_csr = + if (usingFPU) decoded_addr(CSRs.fflags) || decoded_addr(CSRs.frm) || decoded_addr(CSRs.fcsr) + else Bool(false) + val csr_debug = Bool(usingDebug) && io.rw.addr(5) + val csr_addr_priv = Cat(io.rw.addr(6,5).andR, io.rw.addr(9,8)) + val priv_sufficient = Cat(reg_debug, reg_mstatus.prv) >= csr_addr_priv + val read_only = io.rw.addr(11,10).andR + val cpu_wen = cpu_ren && io.rw.cmd =/= CSR.R && priv_sufficient + val wen = cpu_wen && !read_only + + val wdata = (Mux((io.rw.cmd === CSR.S || io.rw.cmd === CSR.C), io.rw.rdata, UInt(0)) | + Mux(io.rw.cmd =/= CSR.C, io.rw.wdata, UInt(0))) & + ~Mux(io.rw.cmd === CSR.C, io.rw.wdata, UInt(0)) + + val do_system_insn = priv_sufficient && system_insn + val opcode = UInt(1) << io.rw.addr(2,0) + val insn_call = do_system_insn && opcode(0) + val insn_break = do_system_insn && opcode(1) + val insn_ret = do_system_insn && opcode(2) + val insn_sfence_vm = do_system_insn && opcode(4) + val insn_wfi = do_system_insn && opcode(5) + + io.csr_xcpt := (cpu_wen && read_only) || + (cpu_ren && (!priv_sufficient || !addr_valid || fp_csr && !io.status.fs.orR)) || + (system_insn && !priv_sufficient) || + insn_call || insn_break + + when (insn_wfi) { reg_wfi := true } + when (pending_interrupts.orR) { reg_wfi := false } + + val cause = + Mux(!io.csr_xcpt, io.cause, + Mux(insn_call, reg_mstatus.prv + Causes.user_ecall, + Mux[UInt](insn_break, Causes.breakpoint, Causes.illegal_instruction))) + val cause_lsbs = cause(log2Up(xLen)-1,0) + val causeIsDebugInt = cause(xLen-1) && cause_lsbs === debugIntCause + val causeIsDebugBreak = cause === Causes.breakpoint && Cat(reg_dcsr.ebreakm, reg_dcsr.ebreakh, reg_dcsr.ebreaks, reg_dcsr.ebreaku)(reg_mstatus.prv) + val trapToDebug = Bool(usingDebug) && (reg_singleStepped || causeIsDebugInt || causeIsDebugBreak || reg_debug) + val delegate = Bool(p(UseVM)) && reg_mstatus.prv < PRV.M && Mux(cause(xLen-1), reg_mideleg(cause_lsbs), reg_medeleg(cause_lsbs)) + val debugTVec = Mux(reg_debug, UInt(0x808), UInt(0x800)) + val tvec = Mux(trapToDebug, debugTVec, Mux(delegate, reg_stvec.sextTo(vaddrBitsExtended), reg_mtvec)) + val epc = Mux(csr_debug, reg_dpc, Mux(Bool(p(UseVM)) && !csr_addr_priv(1), reg_sepc, reg_mepc)) + io.fatc := insn_sfence_vm + io.evec := Mux(exception, tvec, epc) + io.ptbr := reg_sptbr + io.eret := insn_ret + io.singleStep := reg_dcsr.step && !reg_debug + io.status := reg_mstatus + io.status.sd := 
io.status.fs.andR || io.status.xs.andR + io.status.debug := reg_debug + if (xLen == 32) + io.status.sd_rv32 := io.status.sd + + when (exception) { + val epc = ~(~io.pc | (coreInstBytes-1)) + val pie = read_mstatus(reg_mstatus.prv) + + when (trapToDebug) { + reg_debug := true + reg_dpc := epc + reg_dcsr.cause := Mux(reg_singleStepped, UInt(4), Mux(causeIsDebugInt, UInt(3), UInt(1))) + reg_dcsr.prv := reg_mstatus.prv + }.elsewhen (delegate) { + reg_sepc := epc + reg_scause := cause + reg_sbadaddr := io.badaddr + reg_mstatus.spie := pie + reg_mstatus.spp := reg_mstatus.prv + reg_mstatus.sie := false + reg_mstatus.prv := PRV.S + }.otherwise { + reg_mepc := epc + reg_mcause := cause + reg_mbadaddr := io.badaddr + reg_mstatus.mpie := pie + reg_mstatus.mpp := reg_mstatus.prv + reg_mstatus.mie := false + reg_mstatus.prv := PRV.M + } + } + + when (insn_ret) { + when (Bool(p(UseVM)) && !csr_addr_priv(1)) { + when (reg_mstatus.spp.toBool) { reg_mstatus.sie := reg_mstatus.spie } + reg_mstatus.spie := false + reg_mstatus.spp := PRV.U + reg_mstatus.prv := reg_mstatus.spp + }.elsewhen (csr_debug) { + reg_mstatus.prv := reg_dcsr.prv + reg_debug := false + }.otherwise { + when (reg_mstatus.mpp(1)) { reg_mstatus.mie := reg_mstatus.mpie } + .elsewhen (Bool(usingVM) && reg_mstatus.mpp(0)) { reg_mstatus.sie := reg_mstatus.mpie } + reg_mstatus.mpie := false + reg_mstatus.mpp := PRV.U + reg_mstatus.prv := reg_mstatus.mpp + } + } + + assert(PopCount(insn_ret :: io.exception :: io.csr_xcpt :: Nil) <= 1, "these conditions must be mutually exclusive") + + io.time := reg_cycle + io.csr_stall := reg_wfi + + io.rw.rdata := Mux1H(for ((k, v) <- read_mapping) yield decoded_addr(k) -> v) + + io.fcsr_rm := reg_frm + when (io.fcsr_flags.valid) { + reg_fflags := reg_fflags | io.fcsr_flags.bits + } + + val supportedModes = Vec((PRV.M +: (if (usingUser) Some(PRV.U) else None) ++: (if (usingVM) Seq(PRV.S) else Nil)).map(UInt(_))) + + when (wen) { + when (decoded_addr(CSRs.mstatus)) { + val new_mstatus = new MStatus().fromBits(wdata) + reg_mstatus.mie := new_mstatus.mie + reg_mstatus.mpie := new_mstatus.mpie + + if (supportedModes.size > 1) { + reg_mstatus.mprv := new_mstatus.mprv + when (supportedModes contains new_mstatus.mpp) { reg_mstatus.mpp := new_mstatus.mpp } + if (supportedModes.size > 2) { + reg_mstatus.mxr := new_mstatus.mxr + reg_mstatus.pum := new_mstatus.pum + reg_mstatus.spp := new_mstatus.spp + reg_mstatus.spie := new_mstatus.spie + reg_mstatus.sie := new_mstatus.sie + } + } + + if (usingVM) { + require(if (xLen == 32) pgLevels == 2 else pgLevels > 2 && pgLevels < 6) + val vm_on = 6 + pgLevels // TODO Sv48 support should imply Sv39 support + when (new_mstatus.vm === 0) { reg_mstatus.vm := 0 } + when (new_mstatus.vm === vm_on) { reg_mstatus.vm := vm_on } + } + if (usingVM || usingFPU) reg_mstatus.fs := Fill(2, new_mstatus.fs.orR) + if (usingRoCC) reg_mstatus.xs := Fill(2, new_mstatus.xs.orR) + } + when (decoded_addr(CSRs.mip)) { + val new_mip = new MIP().fromBits(wdata) + if (usingVM) { + reg_mip.ssip := new_mip.ssip + reg_mip.stip := new_mip.stip + } + } + when (decoded_addr(CSRs.mie)) { reg_mie := wdata & supported_interrupts } + when (decoded_addr(CSRs.mepc)) { reg_mepc := ~(~wdata | (coreInstBytes-1)) } + when (decoded_addr(CSRs.mscratch)) { reg_mscratch := wdata } + if (p(MtvecWritable)) + when (decoded_addr(CSRs.mtvec)) { reg_mtvec := wdata >> 2 << 2 } + when (decoded_addr(CSRs.mcause)) { reg_mcause := wdata & UInt((BigInt(1) << (xLen-1)) + 31) /* only implement 5 LSBs and MSB */ } + when 
(decoded_addr(CSRs.mbadaddr)) { reg_mbadaddr := wdata(vaddrBitsExtended-1,0) } + if (usingFPU) { + when (decoded_addr(CSRs.fflags)) { reg_fflags := wdata } + when (decoded_addr(CSRs.frm)) { reg_frm := wdata } + when (decoded_addr(CSRs.fcsr)) { reg_fflags := wdata; reg_frm := wdata >> reg_fflags.getWidth } + } + if (usingDebug) { + when (decoded_addr(CSRs.dcsr)) { + val new_dcsr = new DCSR().fromBits(wdata) + reg_dcsr.halt := new_dcsr.halt + reg_dcsr.step := new_dcsr.step + reg_dcsr.ebreakm := new_dcsr.ebreakm + if (usingVM) reg_dcsr.ebreaks := new_dcsr.ebreaks + if (usingUser) reg_dcsr.ebreaku := new_dcsr.ebreaku + if (supportedModes.size > 1) reg_dcsr.prv := new_dcsr.prv + } + when (decoded_addr(CSRs.dpc)) { reg_dpc := ~(~wdata | (coreInstBytes-1)) } + when (decoded_addr(CSRs.dscratch)) { reg_dscratch := wdata } + } + if (usingVM) { + when (decoded_addr(CSRs.sstatus)) { + val new_sstatus = new MStatus().fromBits(wdata) + reg_mstatus.sie := new_sstatus.sie + reg_mstatus.spie := new_sstatus.spie + reg_mstatus.spp := new_sstatus.spp + reg_mstatus.pum := new_sstatus.pum + reg_mstatus.fs := Fill(2, new_sstatus.fs.orR) // even without an FPU + if (usingRoCC) reg_mstatus.xs := Fill(2, new_sstatus.xs.orR) + } + when (decoded_addr(CSRs.sip)) { + val new_sip = new MIP().fromBits(wdata) + reg_mip.ssip := new_sip.ssip + } + when (decoded_addr(CSRs.sie)) { reg_mie := (reg_mie & ~reg_mideleg) | (wdata & reg_mideleg) } + when (decoded_addr(CSRs.sscratch)) { reg_sscratch := wdata } + when (decoded_addr(CSRs.sptbr)) { reg_sptbr.ppn := wdata(ppnBits-1,0) } + when (decoded_addr(CSRs.sepc)) { reg_sepc := ~(~wdata | (coreInstBytes-1)) } + when (decoded_addr(CSRs.stvec)) { reg_stvec := wdata >> 2 << 2 } + when (decoded_addr(CSRs.scause)) { reg_scause := wdata & UInt((BigInt(1) << (xLen-1)) + 31) /* only implement 5 LSBs and MSB */ } + when (decoded_addr(CSRs.sbadaddr)) { reg_sbadaddr := wdata(vaddrBitsExtended-1,0) } + when (decoded_addr(CSRs.mideleg)) { reg_mideleg := wdata & delegable_interrupts } + when (decoded_addr(CSRs.medeleg)) { reg_medeleg := wdata & delegable_exceptions } + } + if (p(NBreakpoints) > 0) { + val newTDR = new TDRSelect().fromBits(wdata) + when (decoded_addr(CSRs.tdrselect)) { reg_tdrselect.tdrindex := newTDR.tdrindex } + + when (reg_tdrselect.tdrmode || reg_debug) { + when (decoded_addr(CSRs.tdrdata1)) { + val newBPC = new BPControl().fromBits(wdata) + reg_bp(reg_tdrselect.tdrindex).control := newBPC + reg_bp(reg_tdrselect.tdrindex).control.bpmatch := newBPC.bpmatch & 2 /* exact/NAPOT only */ + } + when (decoded_addr(CSRs.tdrdata2)) { reg_bp(reg_tdrselect.tdrindex).address := wdata } + } + } + } + + reg_mip := io.prci.interrupts + reg_dcsr.debugint := io.prci.interrupts.debug + reg_dcsr.hwbpcount := UInt(p(NBreakpoints)) + + io.rocc.csr.waddr := io.rw.addr + io.rocc.csr.wdata := wdata + io.rocc.csr.wen := wen + + if (!usingUser) { + reg_mstatus.mpp := PRV.M + reg_mstatus.prv := PRV.M + reg_mstatus.mprv := false + } + + reg_sptbr.asid := 0 + reg_tdrselect.reserved := 0 + reg_tdrselect.tdrmode := true // TODO support D-mode breakpoint theft + if (reg_bp.isEmpty) reg_tdrselect.tdrindex := 0 + for (bpc <- reg_bp map {_.control}) { + bpc.tdrtype := bpc.tdrType + bpc.bpamaskmax := bpc.bpaMaskMax + bpc.reserved := 0 + bpc.bpaction := 0 + bpc.h := false + if (!usingVM) bpc.s := false + if (!usingUser) bpc.u := false + if (!usingVM && !usingUser) bpc.m := true + when (reset) { + bpc.r := false + bpc.w := false + bpc.x := false + } + } + for (bp <- reg_bp drop p(NBreakpoints)) + bp := new 
BP().fromBits(0) +} diff --git a/rocket/src/main/scala/dcache.scala b/rocket/src/main/scala/dcache.scala new file mode 100644 index 00000000..82de400e --- /dev/null +++ b/rocket/src/main/scala/dcache.scala @@ -0,0 +1,447 @@ +// See LICENSE for license details. + +package rocket + +import Chisel._ +import junctions._ +import uncore.tilelink._ +import uncore.agents._ +import uncore.coherence._ +import uncore.util._ +import uncore.constants._ +import cde.{Parameters, Field} +import Util._ + +class DCacheDataReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) { + val addr = Bits(width = untagBits) + val write = Bool() + val wdata = Bits(width = rowBits) + val wmask = Bits(width = rowBytes) + val way_en = Bits(width = nWays) +} + +class DCacheDataArray(implicit p: Parameters) extends L1HellaCacheModule()(p) { + val io = new Bundle { + val req = Valid(new DCacheDataReq).flip + val resp = Vec(nWays, Bits(OUTPUT, rowBits)) + } + + val addr = io.req.bits.addr >> rowOffBits + for (w <- 0 until nWays) { + val array = SeqMem(nSets*refillCycles, Vec(rowBytes, Bits(width=8))) + val valid = io.req.valid && (Bool(nWays == 1) || io.req.bits.way_en(w)) + when (valid && io.req.bits.write) { + val data = Vec.tabulate(rowBytes)(i => io.req.bits.wdata(8*(i+1)-1, 8*i)) + array.write(addr, data, io.req.bits.wmask.toBools) + } + io.resp(w) := array.read(addr, valid && !io.req.bits.write).toBits + } +} + +class DCache(implicit p: Parameters) extends L1HellaCacheModule()(p) { + val io = new Bundle { + val cpu = (new HellaCacheIO).flip + val ptw = new TLBPTWIO() + val mem = new ClientTileLinkIO + } + + val fq = Module(new FinishQueue(1)) + + require(rowBits == encRowBits) // no ECC + require(refillCyclesPerBeat == 1) + require(rowBits >= coreDataBits) + + // tags + val replacer = p(Replacer)() + def onReset = L1Metadata(UInt(0), ClientMetadata.onReset) + val meta = Module(new MetadataArray(onReset _)) + val metaReadArb = Module(new Arbiter(new MetaReadReq, 3)) + val metaWriteArb = Module(new Arbiter(new L1MetaWriteReq, 3)) + meta.io.read <> metaReadArb.io.out + meta.io.write <> metaWriteArb.io.out + + // data + val data = Module(new DCacheDataArray) + val dataArb = Module(new Arbiter(new DCacheDataReq, 4)) + data.io.req <> dataArb.io.out + dataArb.io.out.ready := true + + val s1_valid = Reg(next=io.cpu.req.fire(), init=Bool(false)) + val s1_probe = Reg(next=io.mem.probe.fire(), init=Bool(false)) + val probe_bits = RegEnable(io.mem.probe.bits, io.mem.probe.fire()) + val s1_nack = Wire(init=Bool(false)) + val s1_valid_masked = s1_valid && !io.cpu.s1_kill + val s1_valid_not_nacked = s1_valid_masked && !s1_nack + val s1_req = Reg(io.cpu.req.bits) + when (metaReadArb.io.out.valid) { + s1_req := io.cpu.req.bits + s1_req.addr := Cat(io.cpu.req.bits.addr >> untagBits, metaReadArb.io.out.bits.idx, io.cpu.req.bits.addr(blockOffBits-1,0)) + } + val s1_read = isRead(s1_req.cmd) + val s1_write = isWrite(s1_req.cmd) + val s1_readwrite = s1_read || s1_write + val s1_flush_valid = Reg(Bool()) + + val s_ready :: s_grant_wait :: s_voluntary_writeback :: s_probe_rep_dirty :: s_probe_rep_clean :: s_probe_rep_miss :: s_voluntary_write_meta :: s_probe_write_meta :: Nil = Enum(UInt(), 8) + val grant_wait = Reg(init=Bool(false)) + val release_ack_wait = Reg(init=Bool(false)) + val release_state = Reg(init=s_ready) + val pstore1_valid = Wire(Bool()) + val pstore2_valid = Reg(Bool()) + val inWriteback = release_state === s_voluntary_writeback || release_state === s_probe_rep_dirty + val releaseWay = Wire(UInt()) + io.cpu.req.ready := 
(release_state === s_ready) && !grant_wait && !s1_nack + + // hit initiation path + dataArb.io.in(3).valid := io.cpu.req.valid && isRead(io.cpu.req.bits.cmd) + dataArb.io.in(3).bits.write := false + dataArb.io.in(3).bits.addr := io.cpu.req.bits.addr + dataArb.io.in(3).bits.way_en := ~UInt(0, nWays) + when (!dataArb.io.in(3).ready && isRead(io.cpu.req.bits.cmd)) { io.cpu.req.ready := false } + metaReadArb.io.in(2).valid := io.cpu.req.valid + metaReadArb.io.in(2).bits.idx := io.cpu.req.bits.addr(idxMSB, idxLSB) + metaReadArb.io.in(2).bits.way_en := ~UInt(0, nWays) + when (!metaReadArb.io.in(2).ready) { io.cpu.req.ready := false } + + // address translation + val tlb = Module(new TLB) + io.ptw <> tlb.io.ptw + tlb.io.req.valid := s1_valid_masked && s1_readwrite + tlb.io.req.bits.passthrough := s1_req.phys + tlb.io.req.bits.vpn := s1_req.addr >> pgIdxBits + tlb.io.req.bits.instruction := false + tlb.io.req.bits.store := s1_write + when (!tlb.io.req.ready && !io.cpu.req.bits.phys) { io.cpu.req.ready := false } + when (s1_valid && s1_readwrite && tlb.io.resp.miss) { s1_nack := true } + + val s1_paddr = Cat(tlb.io.resp.ppn, s1_req.addr(pgIdxBits-1,0)) + val s1_tag = Mux(s1_probe, probe_bits.addr_block >> idxBits, s1_paddr(paddrBits-1, untagBits)) + val s1_hit_way = meta.io.resp.map(r => r.coh.isValid() && r.tag === s1_tag).toBits + val s1_hit_state = ClientMetadata.onReset.fromBits( + meta.io.resp.map(r => Mux(r.tag === s1_tag, r.coh.toBits, UInt(0))) + .reduce (_|_)) + val s1_data_way = Mux(inWriteback, releaseWay, s1_hit_way) + val s1_data = Mux1H(s1_data_way, data.io.resp) // retime into s2 if critical + val s1_victim_way = Wire(init = replacer.way) + + val s2_valid = Reg(next=s1_valid_masked, init=Bool(false)) + val s2_probe = Reg(next=s1_probe, init=Bool(false)) + val releaseInFlight = s1_probe || s2_probe || release_state =/= s_ready + val s2_valid_masked = s2_valid && Reg(next = !s1_nack) + val s2_req = Reg(io.cpu.req.bits) + val s2_uncached = Reg(Bool()) + when (s1_valid_not_nacked || s1_flush_valid) { + s2_req := s1_req + s2_req.addr := s1_paddr + s2_uncached := !tlb.io.resp.cacheable + } + val s2_read = isRead(s2_req.cmd) + val s2_write = isWrite(s2_req.cmd) + val s2_readwrite = s2_read || s2_write + val s2_flush_valid = RegNext(s1_flush_valid) + val s2_data = RegEnable(s1_data, s1_valid || inWriteback) + val s2_probe_way = RegEnable(s1_hit_way, s1_probe) + val s2_probe_state = RegEnable(s1_hit_state, s1_probe) + val s2_hit_way = RegEnable(s1_hit_way, s1_valid_not_nacked) + val s2_hit_state = RegEnable(s1_hit_state, s1_valid_not_nacked) + val s2_hit = s2_hit_state.isHit(s2_req.cmd) + val s2_valid_hit = s2_valid_masked && s2_readwrite && s2_hit + val s2_valid_miss = s2_valid_masked && s2_readwrite && !s2_hit && !(pstore1_valid || pstore2_valid) && !release_ack_wait + val s2_valid_cached_miss = s2_valid_miss && !s2_uncached + val s2_victimize = s2_valid_cached_miss || s2_flush_valid + val s2_valid_uncached = s2_valid_miss && s2_uncached + val s2_victim_way = Mux(s2_hit_state.isValid() && !s2_flush_valid, s2_hit_way, UIntToOH(RegEnable(s1_victim_way, s1_valid_not_nacked || s1_flush_valid))) + val s2_victim_tag = RegEnable(meta.io.resp(s1_victim_way).tag, s1_valid_not_nacked || s1_flush_valid) + val s2_victim_state = Mux(s2_hit_state.isValid() && !s2_flush_valid, s2_hit_state, RegEnable(meta.io.resp(s1_victim_way).coh, s1_valid_not_nacked || s1_flush_valid)) + val s2_victim_valid = s2_victim_state.isValid() + val s2_victim_dirty = s2_victim_state.requiresVoluntaryWriteback() + io.cpu.s2_nack 
:= s2_valid && !s2_valid_hit && !(s2_valid_uncached && io.mem.acquire.ready) + when (s2_valid && !s2_valid_hit) { s1_nack := true } + + // exceptions + val misaligned = new StoreGen(s1_req.typ, s1_req.addr, UInt(0), wordBytes).misaligned + io.cpu.xcpt.ma.ld := s1_read && misaligned + io.cpu.xcpt.ma.st := s1_write && misaligned + io.cpu.xcpt.pf.ld := s1_read && tlb.io.resp.xcpt_ld + io.cpu.xcpt.pf.st := s1_write && tlb.io.resp.xcpt_st + assert(!(Reg(next= + (io.cpu.xcpt.ma.ld || io.cpu.xcpt.ma.st || io.cpu.xcpt.pf.ld || io.cpu.xcpt.pf.st)) && + s2_valid_masked), + "DCache exception occurred - cache response not killed.") + + // load reservations + val s2_lr = Bool(usingAtomics) && s2_req.cmd === M_XLR + val s2_sc = Bool(usingAtomics) && s2_req.cmd === M_XSC + val lrscCount = Reg(init=UInt(0)) + val lrscValid = lrscCount > 0 + val lrscAddr = Reg(UInt()) + val s2_sc_fail = s2_sc && !(lrscValid && lrscAddr === (s2_req.addr >> blockOffBits)) + when (s2_valid_hit && s2_lr) { + lrscCount := lrscCycles - 1 + lrscAddr := s2_req.addr >> blockOffBits + } + when (lrscValid) { lrscCount := lrscCount - 1 } + when ((s2_valid_hit && s2_sc) || io.cpu.invalidate_lr) { lrscCount := 0 } + + // pending store buffer + val pstore1_cmd = RegEnable(s1_req.cmd, s1_valid_not_nacked && s1_write) + val pstore1_typ = RegEnable(s1_req.typ, s1_valid_not_nacked && s1_write) + val pstore1_addr = RegEnable(s1_paddr, s1_valid_not_nacked && s1_write) + val pstore1_data = RegEnable(io.cpu.s1_data, s1_valid_not_nacked && s1_write) + val pstore1_way = RegEnable(s1_hit_way, s1_valid_not_nacked && s1_write) + val pstore1_storegen = new StoreGen(pstore1_typ, pstore1_addr, pstore1_data, wordBytes) + val pstore1_storegen_data = Wire(init = pstore1_storegen.data) + val pstore1_amo = Bool(usingAtomics) && isRead(pstore1_cmd) + val pstore_drain_structural = pstore1_valid && pstore2_valid && ((s1_valid && s1_write) || pstore1_amo) + val pstore_drain_opportunistic = !(io.cpu.req.valid && isRead(io.cpu.req.bits.cmd)) + val pstore_drain_on_miss = releaseInFlight || io.cpu.s2_nack + val pstore_drain = + Bool(usingAtomics) && pstore_drain_structural || + (((pstore1_valid && !pstore1_amo) || pstore2_valid) && (pstore_drain_opportunistic || pstore_drain_on_miss)) + pstore1_valid := { + val s2_store_valid = s2_valid_hit && s2_write && !s2_sc_fail + val pstore1_held = Reg(Bool()) + assert(!s2_store_valid || !pstore1_held) + pstore1_held := (s2_store_valid || pstore1_held) && pstore2_valid && !pstore_drain + s2_store_valid || pstore1_held + } + val advance_pstore1 = pstore1_valid && (pstore2_valid === pstore_drain) + pstore2_valid := pstore2_valid && !pstore_drain || advance_pstore1 + val pstore2_addr = RegEnable(pstore1_addr, advance_pstore1) + val pstore2_way = RegEnable(pstore1_way, advance_pstore1) + val pstore2_storegen_data = RegEnable(pstore1_storegen_data, advance_pstore1) + val pstore2_storegen_mask = RegEnable(pstore1_storegen.mask, advance_pstore1) + dataArb.io.in(0).valid := pstore_drain + dataArb.io.in(0).bits.write := true + dataArb.io.in(0).bits.addr := Mux(pstore2_valid, pstore2_addr, pstore1_addr) + dataArb.io.in(0).bits.way_en := Mux(pstore2_valid, pstore2_way, pstore1_way) + dataArb.io.in(0).bits.wdata := Fill(rowWords, Mux(pstore2_valid, pstore2_storegen_data, pstore1_storegen_data)) + val pstore_mask_shift = Mux(pstore2_valid, pstore2_addr, pstore1_addr).extract(rowOffBits-1,offsetlsb) << wordOffBits + dataArb.io.in(0).bits.wmask := Mux(pstore2_valid, pstore2_storegen_mask, pstore1_storegen.mask) << pstore_mask_shift + + // 
store->load RAW hazard detection + val s1_idx = s1_req.addr(idxMSB, wordOffBits) + val s1_raw_hazard = s1_read && + ((pstore1_valid && pstore1_addr(idxMSB, wordOffBits) === s1_idx) || + (pstore2_valid && pstore2_addr(idxMSB, wordOffBits) === s1_idx)) + when (s1_valid && s1_raw_hazard) { s1_nack := true } + + val s2_new_hit_state = s2_hit_state.onHit(s2_req.cmd) + metaWriteArb.io.in(0).valid := (s2_valid_hit && s2_hit_state =/= s2_new_hit_state) || (s2_victimize && !s2_victim_dirty) + metaWriteArb.io.in(0).bits.way_en := s2_victim_way + metaWriteArb.io.in(0).bits.idx := s2_req.addr(idxMSB, idxLSB) + metaWriteArb.io.in(0).bits.data.coh := Mux(s2_hit, s2_new_hit_state, ClientMetadata.onReset) + metaWriteArb.io.in(0).bits.data.tag := s2_req.addr(paddrBits-1, untagBits) + + // acquire + val cachedGetMessage = s2_hit_state.makeAcquire( + client_xact_id = UInt(0), + addr_block = s2_req.addr(paddrBits-1, blockOffBits), + op_code = s2_req.cmd) + val uncachedGetMessage = Get( + client_xact_id = UInt(0), + addr_block = s2_req.addr(paddrBits-1, blockOffBits), + addr_beat = s2_req.addr(blockOffBits-1, beatOffBits), + addr_byte = s2_req.addr(beatOffBits-1, 0), + operand_size = s2_req.typ, + alloc = Bool(false)) + val uncachedPutOffset = s2_req.addr.extract(beatOffBits-1, wordOffBits) + val uncachedPutMessage = Put( + client_xact_id = UInt(0), + addr_block = s2_req.addr(paddrBits-1, blockOffBits), + addr_beat = s2_req.addr(blockOffBits-1, beatOffBits), + data = Fill(beatWords, pstore1_storegen.data), + wmask = Some(pstore1_storegen.mask << (uncachedPutOffset << wordOffBits)), + alloc = Bool(false)) + val uncachedPutAtomicMessage = PutAtomic( + client_xact_id = UInt(0), + addr_block = s2_req.addr(paddrBits-1, blockOffBits), + addr_beat = s2_req.addr(blockOffBits-1, beatOffBits), + addr_byte = s2_req.addr(beatOffBits-1, 0), + atomic_opcode = s2_req.cmd, + operand_size = s2_req.typ, + data = Fill(beatWords, pstore1_storegen.data)) + io.mem.acquire.valid := ((s2_valid_cached_miss && !s2_victim_dirty) || s2_valid_uncached) && fq.io.enq.ready + io.mem.acquire.bits := cachedGetMessage + when (s2_uncached) { + assert(!s2_valid_masked || !s2_hit_state.isValid(), "cache hit on uncached access") + io.mem.acquire.bits := uncachedGetMessage + when (s2_write) { + io.mem.acquire.bits := uncachedPutMessage + when (pstore1_amo) { + io.mem.acquire.bits := uncachedPutAtomicMessage + } + } + } + when (io.mem.acquire.fire()) { grant_wait := true } + + // grant + val grantIsRefill = io.mem.grant.bits.hasMultibeatData() + val grantIsVoluntary = io.mem.grant.bits.isVoluntary() + val grantIsUncached = !grantIsRefill && !grantIsVoluntary + when (io.mem.grant.valid) { + assert(grant_wait || grantIsVoluntary && release_ack_wait, "unexpected grant") + when (grantIsUncached) { s2_data := io.mem.grant.bits.data } + when (grantIsVoluntary) { release_ack_wait := false } + } + val (refillCount, refillDone) = Counter(io.mem.grant.fire() && grantIsRefill, refillCycles) + val grantDone = refillDone || grantIsUncached + when (io.mem.grant.fire() && grantDone) { grant_wait := false } + + // data refill + dataArb.io.in(1).valid := grantIsRefill && io.mem.grant.valid + io.mem.grant.ready := true + assert(dataArb.io.in(1).ready || !dataArb.io.in(1).valid) + dataArb.io.in(1).bits.write := true + dataArb.io.in(1).bits.addr := Cat(s2_req.addr(paddrBits-1, blockOffBits), io.mem.grant.bits.addr_beat) << beatOffBits + dataArb.io.in(1).bits.way_en := s2_victim_way + dataArb.io.in(1).bits.wdata := io.mem.grant.bits.data + dataArb.io.in(1).bits.wmask := 
~UInt(0, rowBytes) + // tag updates on refill + metaWriteArb.io.in(1).valid := refillDone + assert(!metaWriteArb.io.in(1).valid || metaWriteArb.io.in(1).ready) + metaWriteArb.io.in(1).bits.way_en := s2_victim_way + metaWriteArb.io.in(1).bits.idx := s2_req.addr(idxMSB, idxLSB) + metaWriteArb.io.in(1).bits.data.coh := s2_hit_state.onGrant(io.mem.grant.bits, s2_req.cmd) + metaWriteArb.io.in(1).bits.data.tag := s2_req.addr(paddrBits-1, untagBits) + + // finish + fq.io.enq.valid := io.mem.grant.fire() && io.mem.grant.bits.requiresAck() && (!grantIsRefill || refillDone) + fq.io.enq.bits := io.mem.grant.bits.makeFinish() + io.mem.finish <> fq.io.deq + when (fq.io.enq.valid) { assert(fq.io.enq.ready) } + when (refillDone) { replacer.miss } + + // probe + val block_probe = releaseInFlight || lrscValid || (s2_valid_hit && s2_lr) + metaReadArb.io.in(1).valid := io.mem.probe.valid && !block_probe + io.mem.probe.ready := metaReadArb.io.in(1).ready && !block_probe && !s1_valid && (!s2_valid || s2_valid_hit) + metaReadArb.io.in(1).bits.idx := io.mem.probe.bits.addr_block + metaReadArb.io.in(1).bits.way_en := ~UInt(0, nWays) + + // release + val (writebackCount, writebackDone) = Counter(io.mem.release.fire() && inWriteback, refillCycles) + val releaseDone = writebackDone || (io.mem.release.fire() && !inWriteback) + val releaseRejected = io.mem.release.valid && !io.mem.release.ready + val s1_release_data_valid = Reg(next = dataArb.io.in(2).fire()) + val s2_release_data_valid = Reg(next = s1_release_data_valid && !releaseRejected) + val releaseDataBeat = Cat(UInt(0), writebackCount) + Mux(releaseRejected, UInt(0), s1_release_data_valid + Cat(UInt(0), s2_release_data_valid)) + io.mem.release.valid := s2_release_data_valid + io.mem.release.bits := ClientMetadata.onReset.makeRelease(probe_bits) + val voluntaryReleaseMessage = s2_victim_state.makeVoluntaryWriteback(UInt(0), UInt(0)) + val voluntaryNewCoh = s2_victim_state.onCacheControl(M_FLUSH) + val probeResponseMessage = s2_probe_state.makeRelease(probe_bits) + val probeNewCoh = s2_probe_state.onProbe(probe_bits) + val newCoh = Wire(init = probeNewCoh) + releaseWay := s2_probe_way + when (s2_victimize && s2_victim_dirty) { + assert(!s2_hit_state.isValid()) + release_state := s_voluntary_writeback + probe_bits.addr_block := Cat(s2_victim_tag, s2_req.addr(idxMSB, idxLSB)) + } + when (s2_probe) { + when (s2_probe_state.requiresVoluntaryWriteback()) { release_state := s_probe_rep_dirty } + .elsewhen (s2_probe_state.isValid()) { release_state := s_probe_rep_clean } + .otherwise { + io.mem.release.valid := true + release_state := s_probe_rep_miss + } + } + when (releaseDone) { release_state := s_ready } + when (release_state === s_probe_rep_miss || release_state === s_probe_rep_clean) { + io.mem.release.valid := true + } + when (release_state === s_probe_rep_clean || release_state === s_probe_rep_dirty) { + io.mem.release.bits := probeResponseMessage + when (releaseDone) { release_state := s_probe_write_meta } + } + when (release_state === s_voluntary_writeback || release_state === s_voluntary_write_meta) { + io.mem.release.bits := voluntaryReleaseMessage + newCoh := voluntaryNewCoh + releaseWay := s2_victim_way + when (releaseDone) { + release_state := s_voluntary_write_meta + release_ack_wait := true + } + } + when (s2_probe && !io.mem.release.fire()) { s1_nack := true } + io.mem.release.bits.addr_block := probe_bits.addr_block + io.mem.release.bits.addr_beat := writebackCount + io.mem.release.bits.data := s2_data + + dataArb.io.in(2).valid := inWriteback && 
releaseDataBeat < refillCycles + dataArb.io.in(2).bits.write := false + dataArb.io.in(2).bits.addr := Cat(io.mem.release.bits.addr_block, releaseDataBeat(log2Up(refillCycles)-1,0)) << rowOffBits + dataArb.io.in(2).bits.way_en := ~UInt(0, nWays) + + metaWriteArb.io.in(2).valid := (release_state === s_voluntary_write_meta || release_state === s_probe_write_meta) + metaWriteArb.io.in(2).bits.way_en := releaseWay + metaWriteArb.io.in(2).bits.idx := io.mem.release.bits.full_addr()(idxMSB, idxLSB) + metaWriteArb.io.in(2).bits.data.coh := newCoh + metaWriteArb.io.in(2).bits.data.tag := io.mem.release.bits.full_addr()(paddrBits-1, untagBits) + when (metaWriteArb.io.in(2).fire()) { release_state := s_ready } + + // cached response + io.cpu.resp.valid := s2_valid_hit + io.cpu.resp.bits := s2_req + io.cpu.resp.bits.has_data := s2_read + io.cpu.resp.bits.replay := false + io.cpu.ordered := !(s1_valid || s2_valid || grant_wait) + + // uncached response + io.cpu.replay_next := io.mem.grant.valid && grantIsUncached + val doUncachedResp = Reg(next = io.cpu.replay_next) + when (doUncachedResp) { + assert(!s2_valid_hit) + io.cpu.resp.valid := true + io.cpu.resp.bits.replay := true + } + + // load data subword mux/sign extension + val s2_word_idx = s2_req.addr.extract(log2Up(rowBits/8)-1, log2Up(wordBytes)) + val s2_data_word = s2_data >> Cat(s2_word_idx, UInt(0, log2Up(coreDataBits))) + val loadgen = new LoadGen(s2_req.typ, s2_req.addr, s2_data_word, s2_sc, wordBytes) + io.cpu.resp.bits.data := loadgen.data | s2_sc_fail + io.cpu.resp.bits.data_word_bypass := loadgen.wordData + io.cpu.resp.bits.store_data := pstore1_data + + // AMOs + if (usingAtomics) { + val amoalu = Module(new AMOALU) + amoalu.io.addr := pstore1_addr + amoalu.io.cmd := pstore1_cmd + amoalu.io.typ := pstore1_typ + amoalu.io.lhs := s2_data_word + amoalu.io.rhs := pstore1_data + pstore1_storegen_data := amoalu.io.out + } else { + assert(!(s1_valid_masked && s1_read && s1_write), "unsupported D$ operation") + } + + // flushes + val flushed = Reg(init=Bool(true)) + val flushing = Reg(init=Bool(false)) + val flushCounter = Counter(nSets * nWays) + when (io.mem.acquire.fire()) { flushed := false } + when (s2_valid_masked && s2_req.cmd === M_FLUSH_ALL) { + io.cpu.s2_nack := !flushed + when (!flushed) { + flushing := !release_ack_wait + } + } + s1_flush_valid := metaReadArb.io.in(0).fire() && !s1_flush_valid && !s2_flush_valid && release_state === s_ready && !release_ack_wait + metaReadArb.io.in(0).valid := flushing + metaReadArb.io.in(0).bits.idx := flushCounter.value + metaReadArb.io.in(0).bits.way_en := ~UInt(0, nWays) + when (flushing) { + s1_victim_way := flushCounter.value >> log2Up(nSets) + when (s2_flush_valid) { + when (flushCounter.inc()) { + flushed := true + } + } + when (flushed && release_state === s_ready && !release_ack_wait) { + flushing := false + } + } +} diff --git a/rocket/src/main/scala/decode.scala b/rocket/src/main/scala/decode.scala new file mode 100644 index 00000000..07cdc1d6 --- /dev/null +++ b/rocket/src/main/scala/decode.scala @@ -0,0 +1,203 @@ +// See LICENSE for license details. 
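+// DecodeLogic synthesizes each decode table into two-level logic: every
+// (BitPat key -> BitPat value) pair becomes a Term(value, mask), and the
+// Simplify/SimplifyDC objects below run a Quine-McCluskey-style minimization
+// (prime-implicant generation followed by essential-prime selection).
+// Illustrative sketch, not part of the original source: minimizing the
+// single-bit truth table {01 -> 1, 11 -> 1} over two input bits.
+//   val a = new Term(1)   // matches input 01
+//   val b = new Term(3)   // matches input 11
+//   b similar a           // true: equal masks, values differ by one bit
+//   val m = b merge a     // Term(value=1, mask=2), i.e. the pattern "x1"
+// Simplify(Seq(a, b), Seq(), 2) returns Seq(m): the cover is the single
+// implicant "x1", so the output reduces to bit 0 of the input.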
+ +package rocket + +import Chisel._ + +object DecodeLogic +{ + def term(lit: BitPat) = + new Term(lit.value, BigInt(2).pow(lit.getWidth)-(lit.mask+1)) + def logic(addr: UInt, addrWidth: Int, cache: scala.collection.mutable.Map[Term,Bool], terms: Seq[Term]) = { + terms.map { t => + cache.getOrElseUpdate(t, (if (t.mask == 0) addr else addr & Bits(BigInt(2).pow(addrWidth)-(t.mask+1), addrWidth)) === Bits(t.value, addrWidth)) + }.foldLeft(Bool(false))(_||_) + } + def apply(addr: UInt, default: BitPat, mapping: Iterable[(BitPat, BitPat)]): UInt = { + val cache = caches.getOrElseUpdate(addr, collection.mutable.Map[Term,Bool]()) + val dterm = term(default) + val (keys, values) = mapping.unzip + val addrWidth = keys.map(_.getWidth).max + val terms = keys.toList.map(k => term(k)) + val termvalues = terms zip values.toList.map(term(_)) + + for (t <- keys.zip(terms).tails; if !t.isEmpty) + for (u <- t.tail) + assert(!t.head._2.intersects(u._2), "DecodeLogic: keys " + t.head + " and " + u + " overlap") + + (0 until default.getWidth.max(values.map(_.getWidth).max)).map({ case (i: Int) => + val mint = termvalues.filter { case (k,t) => ((t.mask >> i) & 1) == 0 && ((t.value >> i) & 1) == 1 }.map(_._1) + val maxt = termvalues.filter { case (k,t) => ((t.mask >> i) & 1) == 0 && ((t.value >> i) & 1) == 0 }.map(_._1) + val dc = termvalues.filter { case (k,t) => ((t.mask >> i) & 1) == 1 }.map(_._1) + + if (((dterm.mask >> i) & 1) != 0) { + logic(addr, addrWidth, cache, SimplifyDC(mint, maxt, addrWidth)).toBits + } else { + val defbit = (dterm.value.toInt >> i) & 1 + val t = if (defbit == 0) mint else maxt + val bit = logic(addr, addrWidth, cache, Simplify(t, dc, addrWidth)).toBits + if (defbit == 0) bit else ~bit + } + }).reverse.reduceRight(Cat(_,_)) + } + def apply(addr: UInt, default: Seq[BitPat], mappingIn: Iterable[(BitPat, Seq[BitPat])]): Seq[UInt] = { + val mapping = collection.mutable.ArrayBuffer.fill(default.size)(collection.mutable.ArrayBuffer[(BitPat, BitPat)]()) + for ((key, values) <- mappingIn) + for ((value, i) <- values zipWithIndex) + mapping(i) += key -> value + for ((thisDefault, thisMapping) <- default zip mapping) + yield apply(addr, thisDefault, thisMapping) + } + def apply(addr: UInt, default: Seq[BitPat], mappingIn: List[(UInt, Seq[BitPat])]): Seq[UInt] = + apply(addr, default, mappingIn.map(m => (BitPat(m._1), m._2)).asInstanceOf[Iterable[(BitPat, Seq[BitPat])]]) + def apply(addr: UInt, trues: Iterable[UInt], falses: Iterable[UInt]): Bool = + apply(addr, BitPat.DC(1), trues.map(BitPat(_) -> BitPat("b1")) ++ falses.map(BitPat(_) -> BitPat("b0"))).toBool + private val caches = collection.mutable.Map[UInt,collection.mutable.Map[Term,Bool]]() +} + +class Term(val value: BigInt, val mask: BigInt = 0) +{ + var prime = true + + def covers(x: Term) = ((value ^ x.value) &~ mask | x.mask &~ mask) == 0 + def intersects(x: Term) = ((value ^ x.value) &~ mask &~ x.mask) == 0 + override def equals(that: Any) = that match { + case x: Term => x.value == value && x.mask == mask + case _ => false + } + override def hashCode = value.toInt + def < (that: Term) = value < that.value || value == that.value && mask < that.mask + def similar(x: Term) = { + val diff = value - x.value + mask == x.mask && value > x.value && (diff & diff-1) == 0 + } + def merge(x: Term) = { + prime = false + x.prime = false + val bit = value - x.value + new Term(value &~ bit, mask | bit) + } + + override def toString = value.toString(16) + "-" + mask.toString(16) + (if (prime) "p" else "") +} + +object Simplify +{ + def 
getPrimeImplicants(implicants: Seq[Term], bits: Int) = { + var prime = List[Term]() + implicants.foreach(_.prime = true) + val cols = (0 to bits).map(b => implicants.filter(b == _.mask.bitCount)) + val table = cols.map(c => (0 to bits).map(b => collection.mutable.Set(c.filter(b == _.value.bitCount):_*))) + for (i <- 0 to bits) { + for (j <- 0 until bits-i) + table(i)(j).foreach(a => table(i+1)(j) ++= table(i)(j+1).filter(_.similar(a)).map(_.merge(a))) + for (r <- table(i)) + for (p <- r; if p.prime) + prime = p :: prime + } + prime.sortWith(_<_) + } + def getEssentialPrimeImplicants(prime: Seq[Term], minterms: Seq[Term]): (Seq[Term],Seq[Term],Seq[Term]) = { + for (i <- 0 until prime.size) { + val icover = minterms.filter(prime(i) covers _) + for (j <- 0 until prime.size) { + val jcover = minterms.filter(prime(j) covers _) + if (icover.size > jcover.size && jcover.forall(prime(i) covers _)) + return getEssentialPrimeImplicants(prime.filter(_ != prime(j)), minterms) + } + } + + val essentiallyCovered = minterms.filter(t => prime.count(_ covers t) == 1) + val essential = prime.filter(p => essentiallyCovered.exists(p covers _)) + val nonessential = prime.filterNot(essential contains _) + val uncovered = minterms.filterNot(t => essential.exists(_ covers t)) + if (essential.isEmpty || uncovered.isEmpty) + (essential, nonessential, uncovered) + else { + val (a, b, c) = getEssentialPrimeImplicants(nonessential, uncovered) + (essential ++ a, b, c) + } + } + def getCost(cover: Seq[Term], bits: Int) = cover.map(bits - _.mask.bitCount).sum + def cheaper(a: List[Term], b: List[Term], bits: Int) = { + val ca = getCost(a, bits) + val cb = getCost(b, bits) + def listLess(a: List[Term], b: List[Term]): Boolean = !b.isEmpty && (a.isEmpty || a.head < b.head || a.head == b.head && listLess(a.tail, b.tail)) + ca < cb || ca == cb && listLess(a.sortWith(_<_), b.sortWith(_<_)) + } + def getCover(implicants: Seq[Term], minterms: Seq[Term], bits: Int) = { + if (minterms.nonEmpty) { + val cover = minterms.map(m => implicants.filter(_.covers(m)).map(i => collection.mutable.Set(i))) + val all = cover.reduceLeft((c0, c1) => c0.map(a => c1.map(_ ++ a)).reduceLeft(_++_)) + all.map(_.toList).reduceLeft((a, b) => if (cheaper(a, b, bits)) a else b) + } else + Seq[Term]() + } + def stringify(s: Seq[Term], bits: Int) = s.map(t => (0 until bits).map(i => if ((t.mask & (1 << i)) != 0) "x" else ((t.value >> i) & 1).toString).reduceLeft(_+_).reverse).reduceLeft(_+" + "+_) + + def apply(minterms: Seq[Term], dontcares: Seq[Term], bits: Int) = { + val prime = getPrimeImplicants(minterms ++ dontcares, bits) + minterms.foreach(t => assert(prime.exists(_.covers(t)))) + val (eprime, prime2, uncovered) = getEssentialPrimeImplicants(prime, minterms) + val cover = eprime ++ getCover(prime2, uncovered, bits) + minterms.foreach(t => assert(cover.exists(_.covers(t)))) // sanity check + cover + } +} + +object SimplifyDC +{ + def getImplicitDC(maxterms: Seq[Term], term: Term, bits: Int, above: Boolean): Term = { + for (i <- 0 until bits) { + var t: Term = null + if (above && ((term.value | term.mask) & (BigInt(1) << i)) == 0) + t = new Term(term.value | (BigInt(1) << i), term.mask) + else if (!above && (term.value & (BigInt(1) << i)) != 0) + t = new Term(term.value & ~(BigInt(1) << i), term.mask) + if (t != null && !maxterms.exists(_.intersects(t))) + return t + } + null + } + def getPrimeImplicants(minterms: Seq[Term], maxterms: Seq[Term], bits: Int) = { + var prime = List[Term]() + minterms.foreach(_.prime = true) + var mint = minterms.map(t 
=> new Term(t.value, t.mask)) + val cols = (0 to bits).map(b => mint.filter(b == _.mask.bitCount)) + val table = cols.map(c => (0 to bits).map(b => collection.mutable.Set(c.filter(b == _.value.bitCount):_*))) + + for (i <- 0 to bits) { + for (j <- 0 until bits-i) { + table(i)(j).foreach(a => table(i+1)(j) ++= table(i)(j+1).filter(_ similar a).map(_ merge a)) + } + for (j <- 0 until bits-i) { + for (a <- table(i)(j).filter(_.prime)) { + val dc = getImplicitDC(maxterms, a, bits, true) + if (dc != null) + table(i+1)(j) += dc merge a + } + for (a <- table(i)(j+1).filter(_.prime)) { + val dc = getImplicitDC(maxterms, a, bits, false) + if (dc != null) + table(i+1)(j) += a merge dc + } + } + for (r <- table(i)) + for (p <- r; if p.prime) + prime = p :: prime + } + prime.sortWith(_<_) + } + + def verify(cover: Seq[Term], minterms: Seq[Term], maxterms: Seq[Term]) = { + assert(minterms.forall(t => cover.exists(_ covers t))) + assert(maxterms.forall(t => !cover.exists(_ intersects t))) + } + def apply(minterms: Seq[Term], maxterms: Seq[Term], bits: Int) = { + val prime = getPrimeImplicants(minterms, maxterms, bits) + val (eprime, prime2, uncovered) = Simplify.getEssentialPrimeImplicants(prime, minterms) + val cover = eprime ++ Simplify.getCover(prime2, uncovered, bits) + verify(cover, minterms, maxterms) + cover + } +} diff --git a/rocket/src/main/scala/dma.scala b/rocket/src/main/scala/dma.scala new file mode 100644 index 00000000..ce1af0ed --- /dev/null +++ b/rocket/src/main/scala/dma.scala @@ -0,0 +1,400 @@ +package rocket + +import Chisel._ +import uncore.tilelink._ +import uncore.devices._ +import uncore.devices.DmaRequest._ +import uncore.agents._ +import uncore.util._ +import junctions.{ParameterizedBundle, AddrMap} +import cde.Parameters + +trait HasClientDmaParameters extends HasCoreParameters with HasDmaParameters { + val dmaAddrBits = coreMaxAddrBits + val dmaSegmentSizeBits = coreMaxAddrBits + val dmaSegmentBits = 24 +} + +abstract class ClientDmaBundle(implicit val p: Parameters) + extends ParameterizedBundle()(p) with HasClientDmaParameters +abstract class ClientDmaModule(implicit val p: Parameters) + extends Module with HasClientDmaParameters + +class ClientDmaRequest(implicit p: Parameters) extends ClientDmaBundle()(p) { + val cmd = UInt(width = DMA_CMD_SZ) + val src_start = UInt(width = dmaAddrBits) + val dst_start = UInt(width = dmaAddrBits) + val src_stride = UInt(width = dmaSegmentSizeBits) + val dst_stride = UInt(width = dmaSegmentSizeBits) + val segment_size = UInt(width = dmaSegmentSizeBits) + val nsegments = UInt(width = dmaSegmentBits) + val word_size = UInt(width = dmaWordSizeBits) +} + +object ClientDmaRequest { + def apply(cmd: UInt, + src_start: UInt, + dst_start: UInt, + segment_size: UInt, + nsegments: UInt = UInt(1), + src_stride: UInt = UInt(0), + dst_stride: UInt = UInt(0), + word_size: UInt = UInt(0)) + (implicit p: Parameters) = { + val req = Wire(new ClientDmaRequest) + req.cmd := cmd + req.src_start := src_start + req.dst_start := dst_start + req.src_stride := src_stride + req.dst_stride := dst_stride + req.segment_size := segment_size + req.nsegments := nsegments + req.word_size := word_size + req + } +} + +object ClientDmaResponse { + val pagefault = UInt("b01") + val invalid_region = UInt("b10") + + def apply(status: UInt = UInt(0))(implicit p: Parameters) = { + val resp = Wire(new ClientDmaResponse) + resp.status := status + resp + } +} + +class ClientDmaResponse(implicit p: Parameters) extends ClientDmaBundle { + val status = UInt(width = dmaStatusBits) +} + 
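+// Illustrative usage, not part of the original patch: a RoCC client could
+// enqueue a two-segment strided copy through a ClientDmaIO port `dma`
+// roughly as follows (DMA_CMD_COPY is assumed to be one of the opcodes
+// defined by uncore.devices.DmaRequest, alongside DMA_CMD_SIN/DMA_CMD_SOUT):
+//   dma.req.valid := start
+//   dma.req.bits := ClientDmaRequest(
+//     cmd = DMA_CMD_COPY,
+//     src_start = srcBase, dst_start = dstBase,
+//     segment_size = UInt(64), nsegments = UInt(2),
+//     src_stride = UInt(128), dst_stride = UInt(128))
+// Completion and any pagefault/invalid_region status return on dma.resp.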
+class ClientDmaIO(implicit p: Parameters) extends ParameterizedBundle()(p) { + val req = Decoupled(new ClientDmaRequest) + val resp = Valid(new ClientDmaResponse).flip +} + +class DmaFrontend(implicit p: Parameters) extends CoreModule()(p) + with HasClientDmaParameters with HasTileLinkParameters { + val io = new Bundle { + val cpu = (new ClientDmaIO).flip + val mem = new ClientUncachedTileLinkIO + val ptw = new TLBPTWIO + val busy = Bool(OUTPUT) + val incr_outstanding = Bool(OUTPUT) + val host_id = UInt(INPUT, log2Up(nCores)) + } + + val tlb = Module(new DecoupledTLB()(p.alterPartial({ + case CacheName => "L1D" + }))) + io.ptw <> tlb.io.ptw + + private val pgSize = 1 << pgIdxBits + + val cmd = Reg(UInt(width = DMA_CMD_SZ)) + val adv_ptr = MuxLookup(cmd, UInt("b11"), Seq( + DMA_CMD_PFR -> UInt("b10"), + DMA_CMD_PFW -> UInt("b10"), + DMA_CMD_SIN -> UInt("b10"), + DMA_CMD_SOUT -> UInt("b01"))) + + val segment_size = Reg(UInt(width = dmaSegmentSizeBits)) + val bytes_left = Reg(UInt(width = dmaSegmentSizeBits)) + val segments_left = Reg(UInt(width = dmaSegmentBits)) + val word_size = Reg(UInt(width = dmaWordSizeBits)) + + val src_vaddr = Reg(UInt(width = dmaAddrBits)) + val dst_vaddr = Reg(UInt(width = dmaAddrBits)) + val src_vpn = src_vaddr(dmaAddrBits - 1, pgIdxBits) + val dst_vpn = dst_vaddr(dmaAddrBits - 1, pgIdxBits) + val src_idx = src_vaddr(pgIdxBits - 1, 0) + val dst_idx = dst_vaddr(pgIdxBits - 1, 0) + val src_pglen = UInt(pgSize) - src_idx + val dst_pglen = UInt(pgSize) - dst_idx + + val src_stride = Reg(UInt(width = dmaSegmentSizeBits)) + val dst_stride = Reg(UInt(width = dmaSegmentSizeBits)) + + val src_ppn = Reg(UInt(width = ppnBits)) + val dst_ppn = Reg(UInt(width = ppnBits)) + + val src_paddr = Cat(src_ppn, src_idx) + val dst_paddr = Cat(dst_ppn, dst_idx) + + val last_src_vpn = Reg(UInt(width = vpnBits)) + val last_dst_vpn = Reg(UInt(width = vpnBits)) + + val tx_len = Util.minUInt(src_pglen, dst_pglen, bytes_left) + + val dma_busy = Reg(init = UInt(0, tlMaxClientXacts)) + val dma_xact_id = PriorityEncoder(~dma_busy) + val (dma_req_beat, dma_req_done) = Counter(io.mem.acquire.fire(), tlDataBeats) + + val (s_idle :: s_translate :: s_dma_req :: s_dma_update :: + s_prepare :: s_finish :: Nil) = Enum(Bits(), 6) + val state = Reg(init = s_idle) + + // lower bit is for src, higher bit is for dst + val to_translate = Reg(init = UInt(0, 2)) + val tlb_sent = Reg(init = UInt(0, 2)) + val tlb_to_send = to_translate & ~tlb_sent + val resp_status = Reg(UInt(width = dmaStatusBits)) + + def make_acquire( + addr_beat: UInt, client_xact_id: UInt, client_id: UInt, + cmd: UInt, source: UInt, dest: UInt, + length: UInt, size: UInt): Acquire = { + + val data_blob = Wire(UInt(width = tlDataBeats * tlDataBits)) + data_blob := DmaRequest( + xact_id = UInt(0), + client_id = client_id, + cmd = cmd, + source = source, + dest = dest, + length = length, + size = size).toBits + val data_beats = Vec(tlDataBeats, UInt(width = tlDataBits)).fromBits(data_blob) + val base_addr = addrMap("devices:dma").start + val addr_block = UInt(base_addr >> (tlBeatAddrBits + tlByteAddrBits)) + + PutBlock( + client_xact_id = client_xact_id, + addr_block = addr_block, + addr_beat = addr_beat, + data = data_beats(addr_beat), + alloc = Bool(false)) + } + + def check_region(cmd: UInt, src: UInt, dst: UInt): Bool = { + val src_cacheable = addrMap.isCacheable(src) + val dst_cacheable = addrMap.isCacheable(dst) + val dst_ok = Mux(cmd === DMA_CMD_SOUT, !dst_cacheable, dst_cacheable) + val src_ok = Mux(cmd === DMA_CMD_SIN, !src_cacheable, 
Bool(true)) + dst_ok && src_ok + } + + tlb.io.req.valid := tlb_to_send.orR + tlb.io.req.bits.vpn := Mux(tlb_to_send(0), src_vpn, dst_vpn) + tlb.io.req.bits.passthrough := Bool(false) + tlb.io.req.bits.instruction := Bool(false) + tlb.io.req.bits.store := !tlb_to_send(0) + tlb.io.resp.ready := tlb_sent.orR + + when (tlb.io.req.fire()) { + tlb_sent := tlb_sent | PriorityEncoderOH(tlb_to_send) + } + + when (tlb.io.resp.fire()) { + val recv_choice = PriorityEncoderOH(to_translate) + val error = Mux(recv_choice(0), + tlb.io.resp.bits.xcpt_ld, tlb.io.resp.bits.xcpt_st) + + when (error) { + resp_status := ClientDmaResponse.pagefault + state := s_finish + } + + // getting the src translation + when (recv_choice(0)) { + src_ppn := tlb.io.resp.bits.ppn + } .otherwise { + dst_ppn := tlb.io.resp.bits.ppn + } + + to_translate := to_translate & ~recv_choice + } + + io.cpu.req.ready := state === s_idle + io.cpu.resp.valid := state === s_finish + io.cpu.resp.bits := ClientDmaResponse(resp_status) + + io.mem.acquire.valid := (state === s_dma_req) && !dma_busy.andR + io.mem.acquire.bits := make_acquire( + addr_beat = dma_req_beat, + client_id = io.host_id, + client_xact_id = dma_xact_id, + cmd = cmd, source = src_paddr, dest = dst_paddr, + length = tx_len, size = word_size) + + io.mem.grant.ready := (state =/= s_dma_req) + + when (io.cpu.req.fire()) { + val req = io.cpu.req.bits + val is_prefetch = req.cmd(2, 1) === UInt("b01") + cmd := req.cmd + src_vaddr := req.src_start + dst_vaddr := req.dst_start + src_stride := req.src_stride + dst_stride := req.dst_stride + segment_size := req.segment_size + segments_left := req.nsegments - UInt(1) + bytes_left := req.segment_size + word_size := req.word_size + to_translate := Mux(is_prefetch, UInt("b10"), UInt("b11")) + tlb_sent := UInt(0) + state := s_translate + } + + when (state === s_translate && !to_translate.orR) { + when (check_region(cmd, src_paddr, dst_paddr)) { + state := s_dma_req + } .otherwise { + resp_status := ClientDmaResponse.invalid_region + state := s_finish + } + } + + def setBusy(set: Bool, xact_id: UInt): UInt = + Mux(set, UIntToOH(xact_id), UInt(0)) + + dma_busy := (dma_busy | + setBusy(dma_req_done, dma_xact_id)) & + ~setBusy(io.mem.grant.fire(), io.mem.grant.bits.client_xact_id) + + + when (dma_req_done) { + src_vaddr := src_vaddr + Mux(adv_ptr(0), tx_len, UInt(0)) + dst_vaddr := dst_vaddr + Mux(adv_ptr(1), tx_len, UInt(0)) + bytes_left := bytes_left - tx_len + state := s_dma_update + } + + when (state === s_dma_update) { + when (bytes_left === UInt(0)) { + when (segments_left === UInt(0)) { + resp_status := UInt(0) + state := s_finish + } .otherwise { + last_src_vpn := src_vpn + last_dst_vpn := dst_vpn + src_vaddr := src_vaddr + src_stride + dst_vaddr := dst_vaddr + dst_stride + bytes_left := segment_size + segments_left := segments_left - UInt(1) + state := s_prepare + } + } .otherwise { + to_translate := adv_ptr & Cat(dst_idx === UInt(0), src_idx === UInt(0)) + tlb_sent := UInt(0) + state := s_translate + } + } + + when (state === s_prepare) { + to_translate := adv_ptr & Cat( + dst_vpn =/= last_dst_vpn, + src_vpn =/= last_src_vpn) + tlb_sent := UInt(0) + state := s_translate + } + + when (state === s_finish) { state := s_idle } + + io.busy := (state =/= s_idle) || dma_busy.orR + io.incr_outstanding := dma_req_done +} + +object DmaCtrlRegNumbers { + val SRC_STRIDE = 0 + val DST_STRIDE = 1 + val SEGMENT_SIZE = 2 + val NSEGMENTS = 3 + val WORD_SIZE = 4 + val RESP_STATUS = 5 + val OUTSTANDING = 6 + val NCSRS = 7 + val CSR_BASE = 0x800 + val 
CSR_END = CSR_BASE + NCSRS +} +import DmaCtrlRegNumbers._ + +class DmaCtrlRegFile(implicit val p: Parameters) extends Module + with HasClientDmaParameters with HasTileLinkParameters { + + private val nWriteRegs = 5 + private val nRegs = nWriteRegs + 2 + + val io = new Bundle { + val wen = Bool(INPUT) + val waddr = UInt(INPUT, log2Up(nRegs)) + val wdata = UInt(INPUT, dmaSegmentSizeBits) + + val src_stride = UInt(OUTPUT, dmaSegmentSizeBits) + val dst_stride = UInt(OUTPUT, dmaSegmentSizeBits) + val segment_size = UInt(OUTPUT, dmaSegmentSizeBits) + val nsegments = UInt(OUTPUT, dmaSegmentBits) + val word_size = UInt(OUTPUT, dmaWordSizeBits) + + val incr_outstanding = Bool(INPUT) + val xact_outstanding = Bool(OUTPUT) + } + + val regs = Reg(Vec(nWriteRegs, UInt(width = dmaSegmentSizeBits))) + val waddr = io.waddr(log2Up(NCSRS) - 1, 0) + + io.src_stride := regs(SRC_STRIDE) + io.dst_stride := regs(DST_STRIDE) + io.segment_size := regs(SEGMENT_SIZE) + io.nsegments := regs(NSEGMENTS) + io.word_size := regs(WORD_SIZE) + + when (io.wen && waddr < UInt(nWriteRegs)) { + regs.write(waddr, io.wdata) + } + + val outstanding_cnt = TwoWayCounter( + io.incr_outstanding, + io.wen && io.waddr === UInt(OUTSTANDING), + tlMaxClientXacts) + + io.xact_outstanding := outstanding_cnt > UInt(0) +} + +class DmaController(implicit p: Parameters) extends RoCC()(p) + with HasClientDmaParameters { + io.mem.req.valid := Bool(false) + io.resp.valid := Bool(false) + io.interrupt := Bool(false) + + val cmd = Queue(io.cmd) + val inst = cmd.bits.inst + val is_transfer = inst.funct < UInt(8) + + val reg_status = Reg(UInt(width = dmaStatusBits)) + val crfile = Module(new DmaCtrlRegFile) + crfile.io.waddr := io.csr.waddr + crfile.io.wdata := io.csr.wdata + crfile.io.wen := io.csr.wen + + io.csr.rdata(SRC_STRIDE) := crfile.io.src_stride + io.csr.rdata(DST_STRIDE) := crfile.io.dst_stride + io.csr.rdata(SEGMENT_SIZE) := crfile.io.segment_size + io.csr.rdata(NSEGMENTS) := crfile.io.nsegments + io.csr.rdata(WORD_SIZE) := crfile.io.word_size + io.csr.rdata(RESP_STATUS) := reg_status + + val frontend = Module(new DmaFrontend) + io.ptw(0) <> frontend.io.ptw + io.autl <> frontend.io.mem + crfile.io.incr_outstanding := frontend.io.incr_outstanding + frontend.io.host_id := io.host_id + frontend.io.cpu.req.valid := cmd.valid && is_transfer + frontend.io.cpu.req.bits := ClientDmaRequest( + cmd = cmd.bits.inst.funct, + src_start = cmd.bits.rs2, + dst_start = cmd.bits.rs1, + src_stride = crfile.io.src_stride, + dst_stride = crfile.io.dst_stride, + segment_size = crfile.io.segment_size, + nsegments = crfile.io.nsegments, + word_size = crfile.io.word_size) + cmd.ready := is_transfer && frontend.io.cpu.req.ready + + when (frontend.io.cpu.resp.valid) { + reg_status := frontend.io.cpu.resp.bits.status + } + + io.busy := cmd.valid || frontend.io.busy || crfile.io.xact_outstanding +} diff --git a/rocket/src/main/scala/dpath_alu.scala b/rocket/src/main/scala/dpath_alu.scala new file mode 100644 index 00000000..841f0ec0 --- /dev/null +++ b/rocket/src/main/scala/dpath_alu.scala @@ -0,0 +1,96 @@ +// See LICENSE for license details. 
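+// The ALU function encoding is chosen so individual bits steer the datapath:
+// fn(3) selects subtraction (isSub), fn(1) selects an unsigned comparison
+// (cmpUnsigned), and fn(0) inverts the comparison result (cmpInverted).
+// Worked example, not from the original source:
+//   FN_SLTU = 14 = b1110: isSub => compute in1 - in2; unsigned compare;
+//                         not inverted => cmp_out = (in1 <u in2)
+//   FN_SGEU = 15 = b1111: same datapath, inverted => cmp_out = !(in1 <u in2)
+// SGE/SGEU therefore cost no hardware beyond SLT/SLTU.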
+ +package rocket + +import Chisel._ +import cde.{Parameters, Field} +import Instructions._ + +object ALU +{ + val SZ_ALU_FN = 4 + val FN_X = BitPat("b????") + val FN_ADD = UInt(0) + val FN_SL = UInt(1) + val FN_SEQ = UInt(2) + val FN_SNE = UInt(3) + val FN_XOR = UInt(4) + val FN_SR = UInt(5) + val FN_OR = UInt(6) + val FN_AND = UInt(7) + val FN_SUB = UInt(10) + val FN_SRA = UInt(11) + val FN_SLT = UInt(12) + val FN_SGE = UInt(13) + val FN_SLTU = UInt(14) + val FN_SGEU = UInt(15) + + val FN_DIV = FN_XOR + val FN_DIVU = FN_SR + val FN_REM = FN_OR + val FN_REMU = FN_AND + + val FN_MUL = FN_ADD + val FN_MULH = FN_SL + val FN_MULHSU = FN_SLT + val FN_MULHU = FN_SLTU + + def isMulFN(fn: UInt, cmp: UInt) = fn(1,0) === cmp(1,0) + def isSub(cmd: UInt) = cmd(3) + def isCmp(cmd: UInt) = cmd === FN_SEQ || cmd === FN_SNE || cmd >= FN_SLT + def cmpUnsigned(cmd: UInt) = cmd(1) + def cmpInverted(cmd: UInt) = cmd(0) + def cmpEq(cmd: UInt) = !cmd(3) +} +import ALU._ + +class ALU(implicit p: Parameters) extends CoreModule()(p) { + val io = new Bundle { + val dw = Bits(INPUT, SZ_DW) + val fn = Bits(INPUT, SZ_ALU_FN) + val in2 = UInt(INPUT, xLen) + val in1 = UInt(INPUT, xLen) + val out = UInt(OUTPUT, xLen) + val adder_out = UInt(OUTPUT, xLen) + val cmp_out = Bool(OUTPUT) + } + + // ADD, SUB + val in2_inv = Mux(isSub(io.fn), ~io.in2, io.in2) + val in1_xor_in2 = io.in1 ^ in2_inv + io.adder_out := io.in1 + in2_inv + isSub(io.fn) + + // SLT, SLTU + io.cmp_out := cmpInverted(io.fn) ^ + Mux(cmpEq(io.fn), in1_xor_in2 === UInt(0), + Mux(io.in1(xLen-1) === io.in2(xLen-1), io.adder_out(xLen-1), + Mux(cmpUnsigned(io.fn), io.in2(xLen-1), io.in1(xLen-1)))) + + // SLL, SRL, SRA + val (shamt, shin_r) = + if (xLen == 32) (io.in2(4,0), io.in1) + else { + require(xLen == 64) + val shin_hi_32 = Fill(32, isSub(io.fn) && io.in1(31)) + val shin_hi = Mux(io.dw === DW_64, io.in1(63,32), shin_hi_32) + val shamt = Cat(io.in2(5) & (io.dw === DW_64), io.in2(4,0)) + (shamt, Cat(shin_hi, io.in1(31,0))) + } + val shin = Mux(io.fn === FN_SR || io.fn === FN_SRA, shin_r, Reverse(shin_r)) + val shout_r = (Cat(isSub(io.fn) & shin(xLen-1), shin).toSInt >> shamt)(xLen-1,0) + val shout_l = Reverse(shout_r) + val shout = Mux(io.fn === FN_SR || io.fn === FN_SRA, shout_r, UInt(0)) | + Mux(io.fn === FN_SL, shout_l, UInt(0)) + + // AND, OR, XOR + val logic = Mux(io.fn === FN_XOR || io.fn === FN_OR, in1_xor_in2, UInt(0)) | + Mux(io.fn === FN_OR || io.fn === FN_AND, io.in1 & io.in2, UInt(0)) + val shift_logic = (isCmp(io.fn) && io.cmp_out) | logic | shout + val out = Mux(io.fn === FN_ADD || io.fn === FN_SUB, io.adder_out, shift_logic) + + io.out := out + if (xLen > 32) { + require(xLen == 64) + when (io.dw === DW_32) { io.out := Cat(Fill(32, out(31)), out(31,0)) } + } +} diff --git a/rocket/src/main/scala/fpu.scala b/rocket/src/main/scala/fpu.scala new file mode 100644 index 00000000..75153249 --- /dev/null +++ b/rocket/src/main/scala/fpu.scala @@ -0,0 +1,641 @@ +// See LICENSE for license details. 
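+// Operands inside the FPU travel in Berkeley hardfloat's recoded format,
+// which carries one extra exponent bit: 33 bits for single precision
+// (1 sign + 9 exponent + 23 significand) and 65 bits for double
+// (1 + 12 + 52), hence the 65-bit register file and datapaths below;
+// singles are stored with the upper 32 bits set to all ones. Sketch of the
+// IEEE <-> recoded round trip used throughout this file (the conversion
+// helpers are hardfloat's, the vals are illustrative):
+//   val rec  = hardfloat.recFNFromFN(11, 53, ieeeDouble) // 64 -> 65 bits
+//   val ieee = hardfloat.fNFromRecFN(11, 53, rec)        // 65 -> 64 bits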
+ +package rocket + +import Chisel._ +import Instructions._ +import Util._ +import FPConstants._ +import uncore.constants.MemoryOpConstants._ +import cde.{Parameters, Field} + +case object SFMALatency extends Field[Int] +case object DFMALatency extends Field[Int] + +object FPConstants +{ + val FCMD_ADD = BitPat("b0??00") + val FCMD_SUB = BitPat("b0??01") + val FCMD_MUL = BitPat("b0??10") + val FCMD_MADD = BitPat("b1??00") + val FCMD_MSUB = BitPat("b1??01") + val FCMD_NMSUB = BitPat("b1??10") + val FCMD_NMADD = BitPat("b1??11") + val FCMD_DIV = BitPat("b?0011") + val FCMD_SQRT = BitPat("b?1011") + val FCMD_SGNJ = BitPat("b??1?0") + val FCMD_MINMAX = BitPat("b?01?1") + val FCMD_CVT_FF = BitPat("b??0??") + val FCMD_CVT_IF = BitPat("b?10??") + val FCMD_CMP = BitPat("b?01??") + val FCMD_MV_XF = BitPat("b?11??") + val FCMD_CVT_FI = BitPat("b??0??") + val FCMD_MV_FX = BitPat("b??1??") + val FCMD_X = BitPat("b?????") + val FCMD_WIDTH = 5 + + val RM_SZ = 3 + val FLAGS_SZ = 5 +} + +class FPUCtrlSigs extends Bundle +{ + val cmd = Bits(width = FCMD_WIDTH) + val ldst = Bool() + val wen = Bool() + val ren1 = Bool() + val ren2 = Bool() + val ren3 = Bool() + val swap12 = Bool() + val swap23 = Bool() + val single = Bool() + val fromint = Bool() + val toint = Bool() + val fastpipe = Bool() + val fma = Bool() + val div = Bool() + val sqrt = Bool() + val round = Bool() + val wflags = Bool() +} + +class FPUDecoder extends Module +{ + val io = new Bundle { + val inst = Bits(INPUT, 32) + val sigs = new FPUCtrlSigs().asOutput + } + + val decoder = DecodeLogic(io.inst, + List (FCMD_X, X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X), + Array(FLW -> List(FCMD_X, Y,Y,N,N,N,X,X,Y,N,N,N,N,N,N,N,N), + FLD -> List(FCMD_X, Y,Y,N,N,N,X,X,N,N,N,N,N,N,N,N,N), + FSW -> List(FCMD_MV_XF, Y,N,N,Y,N,Y,X,Y,N,Y,N,N,N,N,N,N), + FSD -> List(FCMD_MV_XF, Y,N,N,Y,N,Y,X,N,N,Y,N,N,N,N,N,N), + FMV_S_X -> List(FCMD_MV_FX, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,N), + FMV_D_X -> List(FCMD_MV_FX, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,N), + FCVT_S_W -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y), + FCVT_S_WU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y), + FCVT_S_L -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y), + FCVT_S_LU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,Y,Y,N,N,N,N,N,Y,Y), + FCVT_D_W -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y), + FCVT_D_WU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y), + FCVT_D_L -> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y), + FCVT_D_LU-> List(FCMD_CVT_FI, N,Y,N,N,N,X,X,N,Y,N,N,N,N,N,Y,Y), + FMV_X_S -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,N), + FMV_X_D -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,N), + FCLASS_S -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,N), + FCLASS_D -> List(FCMD_MV_XF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,N), + FCVT_W_S -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y), + FCVT_WU_S-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y), + FCVT_L_S -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y), + FCVT_LU_S-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,Y,N,Y,N,N,N,N,Y,Y), + FCVT_W_D -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y), + FCVT_WU_D-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y), + FCVT_L_D -> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y), + FCVT_LU_D-> List(FCMD_CVT_IF, N,N,Y,N,N,N,X,N,N,Y,N,N,N,N,Y,Y), + FCVT_S_D -> List(FCMD_CVT_FF, N,Y,Y,N,N,N,X,Y,N,N,Y,N,N,N,Y,Y), + FCVT_D_S -> List(FCMD_CVT_FF, N,Y,Y,N,N,N,X,N,N,N,Y,N,N,N,Y,Y), + FEQ_S -> List(FCMD_CMP, N,N,Y,Y,N,N,N,Y,N,Y,N,N,N,N,N,Y), + FLT_S -> List(FCMD_CMP, N,N,Y,Y,N,N,N,Y,N,Y,N,N,N,N,N,Y), + 
FLE_S -> List(FCMD_CMP, N,N,Y,Y,N,N,N,Y,N,Y,N,N,N,N,N,Y), + FEQ_D -> List(FCMD_CMP, N,N,Y,Y,N,N,N,N,N,Y,N,N,N,N,N,Y), + FLT_D -> List(FCMD_CMP, N,N,Y,Y,N,N,N,N,N,Y,N,N,N,N,N,Y), + FLE_D -> List(FCMD_CMP, N,N,Y,Y,N,N,N,N,N,Y,N,N,N,N,N,Y), + FSGNJ_S -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,N), + FSGNJN_S -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,N), + FSGNJX_S -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,N), + FSGNJ_D -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,N), + FSGNJN_D -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,N), + FSGNJX_D -> List(FCMD_SGNJ, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,N), + FMIN_S -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,Y), + FMAX_S -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,Y,N,N,Y,N,N,N,N,Y), + FMIN_D -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,Y), + FMAX_D -> List(FCMD_MINMAX, N,Y,Y,Y,N,N,N,N,N,N,Y,N,N,N,N,Y), + FADD_S -> List(FCMD_ADD, N,Y,Y,Y,N,N,Y,Y,N,N,N,Y,N,N,Y,Y), + FSUB_S -> List(FCMD_SUB, N,Y,Y,Y,N,N,Y,Y,N,N,N,Y,N,N,Y,Y), + FMUL_S -> List(FCMD_MUL, N,Y,Y,Y,N,N,N,Y,N,N,N,Y,N,N,Y,Y), + FADD_D -> List(FCMD_ADD, N,Y,Y,Y,N,N,Y,N,N,N,N,Y,N,N,Y,Y), + FSUB_D -> List(FCMD_SUB, N,Y,Y,Y,N,N,Y,N,N,N,N,Y,N,N,Y,Y), + FMUL_D -> List(FCMD_MUL, N,Y,Y,Y,N,N,N,N,N,N,N,Y,N,N,Y,Y), + FMADD_S -> List(FCMD_MADD, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y), + FMSUB_S -> List(FCMD_MSUB, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y), + FNMADD_S -> List(FCMD_NMADD, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y), + FNMSUB_S -> List(FCMD_NMSUB, N,Y,Y,Y,Y,N,N,Y,N,N,N,Y,N,N,Y,Y), + FMADD_D -> List(FCMD_MADD, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y), + FMSUB_D -> List(FCMD_MSUB, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y), + FNMADD_D -> List(FCMD_NMADD, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y), + FNMSUB_D -> List(FCMD_NMSUB, N,Y,Y,Y,Y,N,N,N,N,N,N,Y,N,N,Y,Y), + FDIV_S -> List(FCMD_DIV, N,Y,Y,Y,N,N,N,Y,N,N,N,N,Y,N,Y,Y), + FSQRT_S -> List(FCMD_SQRT, N,Y,Y,N,N,Y,X,Y,N,N,N,N,N,Y,Y,Y), + FDIV_D -> List(FCMD_DIV, N,Y,Y,Y,N,N,N,N,N,N,N,N,Y,N,Y,Y), + FSQRT_D -> List(FCMD_SQRT, N,Y,Y,N,N,Y,X,N,N,N,N,N,N,Y,Y,Y) + )) + val s = io.sigs + val sigs = Seq(s.cmd, s.ldst, s.wen, s.ren1, s.ren2, s.ren3, s.swap12, + s.swap23, s.single, s.fromint, s.toint, s.fastpipe, s.fma, + s.div, s.sqrt, s.round, s.wflags) + sigs zip decoder map {case(s,d) => s := d} +} + +class FPUIO(implicit p: Parameters) extends CoreBundle { + val inst = Bits(INPUT, 32) + val fromint_data = Bits(INPUT, xLen) + + val fcsr_rm = Bits(INPUT, FPConstants.RM_SZ) + val fcsr_flags = Valid(Bits(width = FPConstants.FLAGS_SZ)) + + val store_data = Bits(OUTPUT, 64) + val toint_data = Bits(OUTPUT, xLen) + + val dmem_resp_val = Bool(INPUT) + val dmem_resp_type = Bits(INPUT, 3) + val dmem_resp_tag = UInt(INPUT, 5) + val dmem_resp_data = Bits(INPUT, 64) + + val valid = Bool(INPUT) + val fcsr_rdy = Bool(OUTPUT) + val nack_mem = Bool(OUTPUT) + val illegal_rm = Bool(OUTPUT) + val killx = Bool(INPUT) + val killm = Bool(INPUT) + val dec = new FPUCtrlSigs().asOutput + val sboard_set = Bool(OUTPUT) + val sboard_clr = Bool(OUTPUT) + val sboard_clra = UInt(OUTPUT, 5) + + val cp_req = Decoupled(new FPInput()).flip //cp doesn't pay attn to kill sigs + val cp_resp = Decoupled(new FPResult()) +} + +class FPResult extends Bundle +{ + val data = Bits(width = 65) + val exc = Bits(width = 5) +} + +class FPInput extends FPUCtrlSigs { + val rm = Bits(width = 3) + val typ = Bits(width = 2) + val in1 = Bits(width = 65) + val in2 = Bits(width = 65) + val in3 = Bits(width = 65) +} + +object ClassifyRecFN { + def apply(expWidth: Int, sigWidth: Int, in: UInt) = { + val sign = in(sigWidth + expWidth) + val exp 
= in(sigWidth + expWidth - 1, sigWidth - 1) + val sig = in(sigWidth - 2, 0) + + val code = exp(expWidth,expWidth-2) + val codeHi = code(2, 1) + val isSpecial = codeHi === UInt(3) + + val isHighSubnormalIn = exp(expWidth-2, 0) < UInt(2) + val isSubnormal = code === UInt(1) || codeHi === UInt(1) && isHighSubnormalIn + val isNormal = codeHi === UInt(1) && !isHighSubnormalIn || codeHi === UInt(2) + val isZero = code === UInt(0) + val isInf = isSpecial && !exp(expWidth-2) + val isNaN = code.andR + val isSNaN = isNaN && !sig(sigWidth-2) + val isQNaN = isNaN && sig(sigWidth-2) + + Cat(isQNaN, isSNaN, isInf && !sign, isNormal && !sign, + isSubnormal && !sign, isZero && !sign, isZero && sign, + isSubnormal && sign, isNormal && sign, isInf && sign) + } +} + +class FPToInt extends Module +{ + val io = new Bundle { + val in = Valid(new FPInput).flip + val as_double = new FPInput().asOutput + val out = Valid(new Bundle { + val lt = Bool() + val store = Bits(width = 64) + val toint = Bits(width = 64) + val exc = Bits(width = 5) + }) + } + + val in = Reg(new FPInput) + val valid = Reg(next=io.in.valid) + + def upconvert(x: UInt) = { + val s2d = Module(new hardfloat.RecFNToRecFN(8, 24, 11, 53)) + s2d.io.in := x + s2d.io.roundingMode := UInt(0) + s2d.io.out + } + + val in1_upconvert = upconvert(io.in.bits.in1) + val in2_upconvert = upconvert(io.in.bits.in2) + + when (io.in.valid) { + in := io.in.bits + when (io.in.bits.single && !io.in.bits.ldst && io.in.bits.cmd =/= FCMD_MV_XF) { + in.in1 := in1_upconvert + in.in2 := in2_upconvert + } + } + + val unrec_s = hardfloat.fNFromRecFN(8, 24, in.in1) + val unrec_d = hardfloat.fNFromRecFN(11, 53, in.in1) + val unrec_out = Mux(in.single, Cat(Fill(32, unrec_s(31)), unrec_s), unrec_d) + + val classify_s = ClassifyRecFN(8, 24, in.in1) + val classify_d = ClassifyRecFN(11, 53, in.in1) + val classify_out = Mux(in.single, classify_s, classify_d) + + val dcmp = Module(new hardfloat.CompareRecFN(11, 53)) + dcmp.io.a := in.in1 + dcmp.io.b := in.in2 + dcmp.io.signaling := Bool(true) + val dcmp_out = (~in.rm & Cat(dcmp.io.lt, dcmp.io.eq)).orR + val dcmp_exc = dcmp.io.exceptionFlags + + val d2l = Module(new hardfloat.RecFNToIN(11, 53, 64)) + val d2w = Module(new hardfloat.RecFNToIN(11, 53, 32)) + d2l.io.in := in.in1 + d2l.io.roundingMode := in.rm + d2l.io.signedOut := ~in.typ(0) + d2w.io.in := in.in1 + d2w.io.roundingMode := in.rm + d2w.io.signedOut := ~in.typ(0) + + io.out.bits.toint := Mux(in.rm(0), classify_out, unrec_out) + io.out.bits.store := unrec_out + io.out.bits.exc := Bits(0) + + when (in.cmd === FCMD_CMP) { + io.out.bits.toint := dcmp_out + io.out.bits.exc := dcmp_exc + } + when (in.cmd === FCMD_CVT_IF) { + io.out.bits.toint := Mux(in.typ(1), d2l.io.out.toSInt, d2w.io.out.toSInt).toUInt + val dflags = Mux(in.typ(1), d2l.io.intExceptionFlags, d2w.io.intExceptionFlags) + io.out.bits.exc := Cat(dflags(2, 1).orR, UInt(0, 3), dflags(0)) + } + + io.out.valid := valid + io.out.bits.lt := dcmp.io.lt + io.as_double := in +} + +class IntToFP(val latency: Int) extends Module +{ + val io = new Bundle { + val in = Valid(new FPInput).flip + val out = Valid(new FPResult) + } + + val in = Pipe(io.in) + + val mux = Wire(new FPResult) + mux.exc := Bits(0) + mux.data := hardfloat.recFNFromFN(11, 53, in.bits.in1) + when (in.bits.single) { + mux.data := Cat(SInt(-1, 32), hardfloat.recFNFromFN(8, 24, in.bits.in1)) + } + + val longValue = + Mux(in.bits.typ(1), in.bits.in1.toSInt, + Mux(in.bits.typ(0), in.bits.in1(31,0).zext, in.bits.in1(31,0).toSInt)) + val l2s = Module(new 
hardfloat.INToRecFN(64, 8, 24)) + l2s.io.signedIn := ~in.bits.typ(0) + l2s.io.in := longValue.toUInt + l2s.io.roundingMode := in.bits.rm + + val l2d = Module(new hardfloat.INToRecFN(64, 11, 53)) + l2d.io.signedIn := ~in.bits.typ(0) + l2d.io.in := longValue.toUInt + l2d.io.roundingMode := in.bits.rm + + when (in.bits.cmd === FCMD_CVT_FI) { + when (in.bits.single) { + mux.data := Cat(SInt(-1, 32), l2s.io.out) + mux.exc := l2s.io.exceptionFlags + }.otherwise { + mux.data := l2d.io.out + mux.exc := l2d.io.exceptionFlags + } + } + + io.out <> Pipe(in.valid, mux, latency-1) +} + +class FPToFP(val latency: Int) extends Module +{ + val io = new Bundle { + val in = Valid(new FPInput).flip + val out = Valid(new FPResult) + val lt = Bool(INPUT) // from FPToInt + } + + val in = Pipe(io.in) + + // fp->fp units + val isSgnj = in.bits.cmd === FCMD_SGNJ + def fsgnjSign(in1: Bits, in2: Bits, pos: Int, en: Bool, rm: Bits) = + Mux(rm(1) || !en, in1(pos), rm(0)) ^ (en && in2(pos)) + val sign_s = fsgnjSign(in.bits.in1, in.bits.in2, 32, in.bits.single && isSgnj, in.bits.rm) + val sign_d = fsgnjSign(in.bits.in1, in.bits.in2, 64, !in.bits.single && isSgnj, in.bits.rm) + val fsgnj = Cat(sign_d, in.bits.in1(63,33), sign_s, in.bits.in1(31,0)) + + val s2d = Module(new hardfloat.RecFNToRecFN(8, 24, 11, 53)) + val d2s = Module(new hardfloat.RecFNToRecFN(11, 53, 8, 24)) + s2d.io.in := in.bits.in1 + s2d.io.roundingMode := in.bits.rm + d2s.io.in := in.bits.in1 + d2s.io.roundingMode := in.bits.rm + + val isnan1 = Mux(in.bits.single, in.bits.in1(31,29).andR, in.bits.in1(63,61).andR) + val isnan2 = Mux(in.bits.single, in.bits.in2(31,29).andR, in.bits.in2(63,61).andR) + val issnan1 = isnan1 && ~Mux(in.bits.single, in.bits.in1(22), in.bits.in1(51)) + val issnan2 = isnan2 && ~Mux(in.bits.single, in.bits.in2(22), in.bits.in2(51)) + val minmax_exc = Cat(issnan1 || issnan2, Bits(0,4)) + val isMax = in.bits.rm(0) + val isLHS = isnan2 || isMax =/= io.lt && !isnan1 + + val mux = Wire(new FPResult) + mux.exc := minmax_exc + mux.data := in.bits.in2 + + when (isSgnj) { mux.exc := UInt(0) } + when (isSgnj || isLHS) { mux.data := fsgnj } + when (in.bits.cmd === FCMD_CVT_FF) { + when (in.bits.single) { + mux.data := Cat(SInt(-1, 32), d2s.io.out) + mux.exc := d2s.io.exceptionFlags + }.otherwise { + mux.data := s2d.io.out + mux.exc := s2d.io.exceptionFlags + } + } + + io.out <> Pipe(in.valid, mux, latency-1) +} + +class FPUFMAPipe(val latency: Int, expWidth: Int, sigWidth: Int) extends Module +{ + val io = new Bundle { + val in = Valid(new FPInput).flip + val out = Valid(new FPResult) + } + + val width = sigWidth + expWidth + val one = UInt(1) << (width-1) + val zero = (io.in.bits.in1(width) ^ io.in.bits.in2(width)) << width + + val valid = Reg(next=io.in.valid) + val in = Reg(new FPInput) + when (io.in.valid) { + in := io.in.bits + val cmd_fma = io.in.bits.ren3 + val cmd_addsub = io.in.bits.swap23 + in.cmd := Cat(io.in.bits.cmd(1) & (cmd_fma || cmd_addsub), io.in.bits.cmd(0)) + when (cmd_addsub) { in.in2 := one } + unless (cmd_fma || cmd_addsub) { in.in3 := zero } + } + + val fma = Module(new hardfloat.MulAddRecFN(expWidth, sigWidth)) + fma.io.op := in.cmd + fma.io.roundingMode := in.rm + fma.io.a := in.in1 + fma.io.b := in.in2 + fma.io.c := in.in3 + + val res = Wire(new FPResult) + res.data := Cat(SInt(-1, 32), fma.io.out) + res.exc := fma.io.exceptionFlags + io.out := Pipe(valid, res, latency-1) +} + +class FPU(implicit p: Parameters) extends CoreModule()(p) { + require(xLen == 64, "RV32 Rocket FP support missing") + val io = new FPUIO + 
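+  // The FPU shadows the core's pipeline: ex/mem/wb valid bits advance in
+  // lockstep with the integer stages (killx/killm squash instructions at
+  // the same points the core does), and co-processor cp_req operations
+  // borrow the identical datapath whenever no core FP instruction occupies
+  // EX (ex_cp_valid below).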
+ val ex_reg_valid = Reg(next=io.valid, init=Bool(false)) + val req_valid = ex_reg_valid || io.cp_req.valid + val ex_reg_inst = RegEnable(io.inst, io.valid) + val ex_cp_valid = io.cp_req.valid && !ex_reg_valid + val mem_reg_valid = Reg(next=ex_reg_valid && !io.killx || ex_cp_valid, init=Bool(false)) + val mem_reg_inst = RegEnable(ex_reg_inst, ex_reg_valid) + val mem_cp_valid = Reg(next=ex_cp_valid, init=Bool(false)) + val killm = (io.killm || io.nack_mem) && !mem_cp_valid + val wb_reg_valid = Reg(next=mem_reg_valid && (!killm || mem_cp_valid), init=Bool(false)) + val wb_cp_valid = Reg(next=mem_cp_valid, init=Bool(false)) + + val fp_decoder = Module(new FPUDecoder) + fp_decoder.io.inst := io.inst + + val cp_ctrl = Wire(new FPUCtrlSigs) + cp_ctrl <> io.cp_req.bits + io.cp_resp.valid := Bool(false) + io.cp_resp.bits.data := UInt(0) + + val id_ctrl = fp_decoder.io.sigs + val ex_ctrl = Mux(ex_reg_valid, RegEnable(id_ctrl, io.valid), cp_ctrl) + val mem_ctrl = RegEnable(ex_ctrl, req_valid) + val wb_ctrl = RegEnable(mem_ctrl, mem_reg_valid) + + // load response + val load_wb = Reg(next=io.dmem_resp_val) + val load_wb_single = RegEnable(io.dmem_resp_type === MT_W || io.dmem_resp_type === MT_WU, io.dmem_resp_val) + val load_wb_data = RegEnable(io.dmem_resp_data, io.dmem_resp_val) + val load_wb_tag = RegEnable(io.dmem_resp_tag, io.dmem_resp_val) + val rec_s = hardfloat.recFNFromFN(8, 24, load_wb_data) + val rec_d = hardfloat.recFNFromFN(11, 53, load_wb_data) + val load_wb_data_recoded = Mux(load_wb_single, Cat(SInt(-1, 32), rec_s), rec_d) + + // regfile + val regfile = Mem(32, Bits(width = 65)) + when (load_wb) { + regfile(load_wb_tag) := load_wb_data_recoded + if (enableCommitLog) { + printf ("f%d p%d 0x%x\n", load_wb_tag, load_wb_tag + UInt(32), + Mux(load_wb_single, load_wb_data(31,0), load_wb_data)) + } + } + + val ex_ra1::ex_ra2::ex_ra3::Nil = List.fill(3)(Reg(UInt())) + when (io.valid) { + when (id_ctrl.ren1) { + when (!id_ctrl.swap12) { ex_ra1 := io.inst(19,15) } + when (id_ctrl.swap12) { ex_ra2 := io.inst(19,15) } + } + when (id_ctrl.ren2) { + when (id_ctrl.swap12) { ex_ra1 := io.inst(24,20) } + when (id_ctrl.swap23) { ex_ra3 := io.inst(24,20) } + when (!id_ctrl.swap12 && !id_ctrl.swap23) { ex_ra2 := io.inst(24,20) } + } + when (id_ctrl.ren3) { ex_ra3 := io.inst(31,27) } + } + val ex_rs1::ex_rs2::ex_rs3::Nil = Seq(ex_ra1, ex_ra2, ex_ra3).map(regfile(_)) + val ex_rm = Mux(ex_reg_inst(14,12) === Bits(7), io.fcsr_rm, ex_reg_inst(14,12)) + + val cp_rs1 = io.cp_req.bits.in1 + val cp_rs2 = Mux(io.cp_req.bits.swap23, io.cp_req.bits.in3, io.cp_req.bits.in2) + val cp_rs3 = Mux(io.cp_req.bits.swap23, io.cp_req.bits.in2, io.cp_req.bits.in3) + + val req = Wire(new FPInput) + req := ex_ctrl + req.rm := Mux(ex_reg_valid, ex_rm, io.cp_req.bits.rm) + req.in1 := Mux(ex_reg_valid, ex_rs1, cp_rs1) + req.in2 := Mux(ex_reg_valid, ex_rs2, cp_rs2) + req.in3 := Mux(ex_reg_valid, ex_rs3, cp_rs3) + req.typ := Mux(ex_reg_valid, ex_reg_inst(21,20), io.cp_req.bits.typ) + + val sfma = Module(new FPUFMAPipe(p(SFMALatency), 8, 24)) + sfma.io.in.valid := req_valid && ex_ctrl.fma && ex_ctrl.single + sfma.io.in.bits := req + + val dfma = Module(new FPUFMAPipe(p(DFMALatency), 11, 53)) + dfma.io.in.valid := req_valid && ex_ctrl.fma && !ex_ctrl.single + dfma.io.in.bits := req + + val fpiu = Module(new FPToInt) + fpiu.io.in.valid := req_valid && (ex_ctrl.toint || ex_ctrl.div || ex_ctrl.sqrt || ex_ctrl.cmd === FCMD_MINMAX) + fpiu.io.in.bits := req + io.store_data := fpiu.io.out.bits.store + io.toint_data := fpiu.io.out.bits.toint + 
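+  // The writeback arbitration below encodes each unit's latency as a one-hot
+  // countdown: latencyMask() sets bit (latency - 2) of the `wen` shift
+  // register when an op leaves MEM, `wen` shifts right each cycle, and the
+  // register file is written when bit 0 pops out. Worked example, not from
+  // the source: with fpmu.latency = 2 an FSGNJ sets wen(0) and writes back
+  // the next cycle, while a latency-4 FMA sets wen(2) and needs two more
+  // shifts; write_port_busy nacks a younger op whose countdown would land
+  // on an already-claimed cycle.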
when(fpiu.io.out.valid && mem_cp_valid && mem_ctrl.toint){ + io.cp_resp.bits.data := fpiu.io.out.bits.toint + io.cp_resp.valid := Bool(true) + } + + val ifpu = Module(new IntToFP(3)) + ifpu.io.in.valid := req_valid && ex_ctrl.fromint + ifpu.io.in.bits := req + ifpu.io.in.bits.in1 := Mux(ex_reg_valid, io.fromint_data, cp_rs1) + + val fpmu = Module(new FPToFP(2)) + fpmu.io.in.valid := req_valid && ex_ctrl.fastpipe + fpmu.io.in.bits := req + fpmu.io.lt := fpiu.io.out.bits.lt + + val divSqrt_wen = Reg(next=Bool(false)) + val divSqrt_inReady = Wire(init=Bool(false)) + val divSqrt_waddr = Reg(Bits()) + val divSqrt_wdata = Wire(Bits()) + val divSqrt_flags = Wire(Bits()) + val divSqrt_in_flight = Reg(init=Bool(false)) + val divSqrt_killed = Reg(Bool()) + + // writeback arbitration + case class Pipe(p: Module, lat: Int, cond: (FPUCtrlSigs) => Bool, res: FPResult) + val pipes = List( + Pipe(fpmu, fpmu.latency, (c: FPUCtrlSigs) => c.fastpipe, fpmu.io.out.bits), + Pipe(ifpu, ifpu.latency, (c: FPUCtrlSigs) => c.fromint, ifpu.io.out.bits), + Pipe(sfma, sfma.latency, (c: FPUCtrlSigs) => c.fma && c.single, sfma.io.out.bits), + Pipe(dfma, dfma.latency, (c: FPUCtrlSigs) => c.fma && !c.single, dfma.io.out.bits)) + def latencyMask(c: FPUCtrlSigs, offset: Int) = { + require(pipes.forall(_.lat >= offset)) + pipes.map(p => Mux(p.cond(c), UInt(1 << p.lat-offset), UInt(0))).reduce(_|_) + } + def pipeid(c: FPUCtrlSigs) = pipes.zipWithIndex.map(p => Mux(p._1.cond(c), UInt(p._2), UInt(0))).reduce(_|_) + val maxLatency = pipes.map(_.lat).max + val memLatencyMask = latencyMask(mem_ctrl, 2) + + val wen = Reg(init=Bits(0, maxLatency-1)) + val winfo = Reg(Vec(maxLatency-1, Bits())) + val mem_wen = mem_reg_valid && (mem_ctrl.fma || mem_ctrl.fastpipe || mem_ctrl.fromint) + val write_port_busy = RegEnable(mem_wen && (memLatencyMask & latencyMask(ex_ctrl, 1)).orR || (wen & latencyMask(ex_ctrl, 0)).orR, req_valid) + val mem_winfo = Cat(mem_cp_valid, pipeid(mem_ctrl), mem_ctrl.single, mem_reg_inst(11,7)) //single only used for debugging + + for (i <- 0 until maxLatency-2) { + when (wen(i+1)) { winfo(i) := winfo(i+1) } + } + wen := wen >> 1 + when (mem_wen) { + when (!killm) { + wen := wen >> 1 | memLatencyMask + } + for (i <- 0 until maxLatency-1) { + when (!write_port_busy && memLatencyMask(i)) { + winfo(i) := mem_winfo + } + } + } + + val waddr = Mux(divSqrt_wen, divSqrt_waddr, winfo(0)(4,0).toUInt) + val wsrc = (winfo(0) >> 6)(log2Up(pipes.size) - 1,0) + val wcp = winfo(0)(6+log2Up(pipes.size)) + val wdata = Mux(divSqrt_wen, divSqrt_wdata, (pipes.map(_.res.data): Seq[UInt])(wsrc)) + val wexc = (pipes.map(_.res.exc): Seq[UInt])(wsrc) + when ((!wcp && wen(0)) || divSqrt_wen) { + regfile(waddr) := wdata + if (enableCommitLog) { + val wdata_unrec_s = hardfloat.fNFromRecFN(8, 24, wdata(64,0)) + val wdata_unrec_d = hardfloat.fNFromRecFN(11, 53, wdata(64,0)) + val wb_single = (winfo(0) >> 5)(0) + printf ("f%d p%d 0x%x\n", waddr, waddr+ UInt(32), + Mux(wb_single, Cat(UInt(0,32), wdata_unrec_s), wdata_unrec_d)) + } + } + when (wcp && wen(0)) { + io.cp_resp.bits.data := wdata + io.cp_resp.valid := Bool(true) + } + io.cp_req.ready := !ex_reg_valid + + val wb_toint_valid = wb_reg_valid && wb_ctrl.toint + val wb_toint_exc = RegEnable(fpiu.io.out.bits.exc, mem_ctrl.toint) + io.fcsr_flags.valid := wb_toint_valid || divSqrt_wen || wen(0) + io.fcsr_flags.bits := + Mux(wb_toint_valid, wb_toint_exc, UInt(0)) | + Mux(divSqrt_wen, divSqrt_flags, UInt(0)) | + Mux(wen(0), wexc, UInt(0)) + + val units_busy = mem_reg_valid && (mem_ctrl.div || 
mem_ctrl.sqrt) && (!divSqrt_inReady || wen.orR) // || mem_reg_valid && mem_ctrl.fma && Reg(next=Mux(ex_ctrl.single, io.sfma.valid, io.dfma.valid)) + io.fcsr_rdy := !(ex_reg_valid && ex_ctrl.wflags || mem_reg_valid && mem_ctrl.wflags || wb_reg_valid && wb_ctrl.toint || wen.orR || divSqrt_in_flight) + io.nack_mem := units_busy || write_port_busy || divSqrt_in_flight + io.dec <> fp_decoder.io.sigs + def useScoreboard(f: ((Pipe, Int)) => Bool) = pipes.zipWithIndex.filter(_._1.lat > 3).map(x => f(x)).fold(Bool(false))(_||_) + io.sboard_set := wb_reg_valid && !wb_cp_valid && Reg(next=useScoreboard(_._1.cond(mem_ctrl)) || mem_ctrl.div || mem_ctrl.sqrt) + io.sboard_clr := !wb_cp_valid && (divSqrt_wen || (wen(0) && useScoreboard(x => wsrc === UInt(x._2)))) + io.sboard_clra := waddr + // we don't currently support round-max-magnitude (rm=4) + io.illegal_rm := ex_rm(2) && ex_ctrl.round + + divSqrt_wdata := 0 + divSqrt_flags := 0 + if (p(FDivSqrt)) { + val divSqrt_single = Reg(Bool()) + val divSqrt_rm = Reg(Bits()) + val divSqrt_flags_double = Reg(Bits()) + val divSqrt_wdata_double = Reg(Bits()) + + val divSqrt = Module(new hardfloat.DivSqrtRecF64) + divSqrt_inReady := Mux(divSqrt.io.sqrtOp, divSqrt.io.inReady_sqrt, divSqrt.io.inReady_div) + val divSqrt_outValid = divSqrt.io.outValid_div || divSqrt.io.outValid_sqrt + divSqrt.io.inValid := mem_reg_valid && (mem_ctrl.div || mem_ctrl.sqrt) && !divSqrt_in_flight + divSqrt.io.sqrtOp := mem_ctrl.sqrt + divSqrt.io.a := fpiu.io.as_double.in1 + divSqrt.io.b := fpiu.io.as_double.in2 + divSqrt.io.roundingMode := fpiu.io.as_double.rm + + when (divSqrt.io.inValid && divSqrt_inReady) { + divSqrt_in_flight := true + divSqrt_killed := killm + divSqrt_single := mem_ctrl.single + divSqrt_waddr := mem_reg_inst(11,7) + divSqrt_rm := divSqrt.io.roundingMode + } + + when (divSqrt_outValid) { + divSqrt_wen := !divSqrt_killed + divSqrt_wdata_double := divSqrt.io.out + divSqrt_in_flight := false + divSqrt_flags_double := divSqrt.io.exceptionFlags + } + + val divSqrt_toSingle = Module(new hardfloat.RecFNToRecFN(11, 53, 8, 24)) + divSqrt_toSingle.io.in := divSqrt_wdata_double + divSqrt_toSingle.io.roundingMode := divSqrt_rm + divSqrt_wdata := Mux(divSqrt_single, divSqrt_toSingle.io.out, divSqrt_wdata_double) + divSqrt_flags := divSqrt_flags_double | Mux(divSqrt_single, divSqrt_toSingle.io.exceptionFlags, Bits(0)) + } +} diff --git a/rocket/src/main/scala/frontend.scala b/rocket/src/main/scala/frontend.scala new file mode 100644 index 00000000..77e8f6e6 --- /dev/null +++ b/rocket/src/main/scala/frontend.scala @@ -0,0 +1,130 @@ +package rocket + +import Chisel._ +import uncore.tilelink._ +import Util._ +import cde.{Parameters, Field} + +class FrontendReq(implicit p: Parameters) extends CoreBundle()(p) { + val pc = UInt(width = vaddrBitsExtended) + val speculative = Bool() +} + +class FrontendResp(implicit p: Parameters) extends CoreBundle()(p) { + val pc = UInt(width = vaddrBitsExtended) // ID stage PC + val data = Vec(fetchWidth, Bits(width = coreInstBits)) + val mask = Bits(width = fetchWidth) + val xcpt_if = Bool() + val replay = Bool() +} + +class FrontendIO(implicit p: Parameters) extends CoreBundle()(p) { + val req = Valid(new FrontendReq) + val resp = Decoupled(new FrontendResp).flip + val btb_resp = Valid(new BTBResp).flip + val btb_update = Valid(new BTBUpdate) + val bht_update = Valid(new BHTUpdate) + val ras_update = Valid(new RASUpdate) + val flush_icache = Bool(OUTPUT) + val flush_tlb = Bool(OUTPUT) + val npc = UInt(INPUT, width = vaddrBitsExtended) +} + +class 
Frontend(implicit p: Parameters) extends CoreModule()(p) with HasL1CacheParameters { + val io = new Bundle { + val cpu = new FrontendIO().flip + val ptw = new TLBPTWIO() + val mem = new ClientUncachedTileLinkIO + } + + val icache = Module(new ICache(latency = 2)) + val tlb = Module(new TLB) + + val s1_pc_ = Reg(UInt(width=vaddrBitsExtended)) + val s1_pc = ~(~s1_pc_ | (coreInstBytes-1)) // discard PC LSBS (this propagates down the pipeline) + val s1_speculative = Reg(Bool()) + val s1_same_block = Reg(Bool()) + val s2_valid = Reg(init=Bool(true)) + val s2_pc = Reg(init=UInt(p(ResetVector))) + val s2_btb_resp_valid = Reg(init=Bool(false)) + val s2_btb_resp_bits = Reg(new BTBResp) + val s2_xcpt_if = Reg(init=Bool(false)) + val s2_speculative = Reg(init=Bool(false)) + + val ntpc = ~(~s1_pc | (coreInstBytes*fetchWidth-1)) + UInt(coreInstBytes*fetchWidth) + val predicted_npc = Wire(init = ntpc) + val icmiss = s2_valid && !icache.io.resp.valid + val npc = Mux(icmiss, s2_pc, predicted_npc).toUInt + val s0_same_block = Wire(init = !icmiss && !io.cpu.req.valid && ((ntpc & rowBytes) === (s1_pc & rowBytes))) + + val stall = io.cpu.resp.valid && !io.cpu.resp.ready + when (!stall) { + s1_same_block := s0_same_block && !tlb.io.resp.miss + s1_pc_ := npc + s1_speculative := Mux(icmiss, s2_speculative, true) + s2_valid := !icmiss + when (!icmiss) { + s2_pc := s1_pc + s2_speculative := s1_speculative && !tlb.io.resp.cacheable + s2_xcpt_if := tlb.io.resp.xcpt_if + } + } + when (io.cpu.req.valid) { + s1_same_block := Bool(false) + s1_pc_ := io.cpu.req.bits.pc + s1_speculative := io.cpu.req.bits.speculative + s2_valid := Bool(false) + } + + if (p(BtbKey).nEntries > 0) { + val btb = Module(new BTB) + btb.io.req.valid := false + btb.io.req.bits.addr := s1_pc + btb.io.btb_update := io.cpu.btb_update + btb.io.bht_update := io.cpu.bht_update + btb.io.ras_update := io.cpu.ras_update + when (!stall && !icmiss) { + btb.io.req.valid := true + s2_btb_resp_valid := btb.io.resp.valid + s2_btb_resp_bits := btb.io.resp.bits + } + when (btb.io.resp.bits.taken) { + predicted_npc := btb.io.resp.bits.target.sextTo(vaddrBitsExtended) + s0_same_block := Bool(false) + } + } + + io.ptw <> tlb.io.ptw + tlb.io.req.valid := !stall && !icmiss + tlb.io.req.bits.vpn := s1_pc >> pgIdxBits + tlb.io.req.bits.passthrough := Bool(false) + tlb.io.req.bits.instruction := Bool(true) + tlb.io.req.bits.store := Bool(false) + + io.mem <> icache.io.mem + icache.io.req.valid := !stall && !s0_same_block + icache.io.req.bits.addr := io.cpu.npc + icache.io.invalidate := io.cpu.flush_icache + icache.io.s1_ppn := tlb.io.resp.ppn + icache.io.s1_kill := io.cpu.req.valid || tlb.io.resp.miss || tlb.io.resp.xcpt_if || icmiss || io.cpu.flush_tlb + icache.io.s2_kill := s2_speculative + icache.io.resp.ready := !stall && !s1_same_block + + io.cpu.resp.valid := s2_valid && (icache.io.resp.valid || s2_speculative || s2_xcpt_if) + io.cpu.resp.bits.pc := s2_pc + io.cpu.npc := Mux(io.cpu.req.valid, io.cpu.req.bits.pc, npc) + + require(fetchWidth * coreInstBytes <= rowBytes) + val fetch_data = icache.io.resp.bits.datablock >> (s2_pc.extract(log2Up(rowBytes)-1,log2Up(fetchWidth*coreInstBytes)) << log2Up(fetchWidth*coreInstBits)) + + for (i <- 0 until fetchWidth) { + io.cpu.resp.bits.data(i) := fetch_data(i*coreInstBits+coreInstBits-1, i*coreInstBits) + } + + io.cpu.resp.bits.mask := UInt((1 << fetchWidth)-1) << s2_pc.extract(log2Up(fetchWidth)+log2Up(coreInstBytes)-1, log2Up(coreInstBytes)) + io.cpu.resp.bits.xcpt_if := s2_xcpt_if + io.cpu.resp.bits.replay := 
s2_speculative && !icache.io.resp.valid && !s2_xcpt_if + + io.cpu.btb_resp.valid := s2_btb_resp_valid + io.cpu.btb_resp.bits := s2_btb_resp_bits +} diff --git a/rocket/src/main/scala/icache.scala b/rocket/src/main/scala/icache.scala new file mode 100644 index 00000000..107b332c --- /dev/null +++ b/rocket/src/main/scala/icache.scala @@ -0,0 +1,157 @@ +package rocket + +import Chisel._ +import uncore.agents._ +import uncore.tilelink._ +import uncore.util._ +import Util._ +import cde.{Parameters, Field} + +trait HasL1CacheParameters extends HasCacheParameters with HasCoreParameters { + val outerDataBeats = p(TLKey(p(TLId))).dataBeats + val outerDataBits = p(TLKey(p(TLId))).dataBitsPerBeat + val refillCyclesPerBeat = outerDataBits/rowBits + val refillCycles = refillCyclesPerBeat*outerDataBeats +} + +class ICacheReq(implicit p: Parameters) extends CoreBundle()(p) with HasL1CacheParameters { + val addr = UInt(width = vaddrBits) +} + +class ICacheResp(implicit p: Parameters) extends CoreBundle()(p) with HasL1CacheParameters { + val data = Bits(width = coreInstBits) + val datablock = Bits(width = rowBits) +} + +class ICache(latency: Int)(implicit p: Parameters) extends CoreModule()(p) with HasL1CacheParameters { + val io = new Bundle { + val req = Valid(new ICacheReq).flip + val s1_ppn = UInt(INPUT, ppnBits) // delayed one cycle w.r.t. req + val s1_kill = Bool(INPUT) // delayed one cycle w.r.t. req + val s2_kill = Bool(INPUT) // delayed two cycles; prevents I$ miss emission + + val resp = Decoupled(new ICacheResp) + val invalidate = Bool(INPUT) + val mem = new ClientUncachedTileLinkIO + } + require(isPow2(nSets) && isPow2(nWays)) + require(isPow2(coreInstBytes)) + require(!usingVM || pgIdxBits >= untagBits) + + val s_ready :: s_request :: s_refill_wait :: s_refill :: Nil = Enum(UInt(), 4) + val state = Reg(init=s_ready) + val invalidated = Reg(Bool()) + val stall = !io.resp.ready + val rdy = Wire(Bool()) + + val refill_addr = Reg(UInt(width = paddrBits)) + val s1_any_tag_hit = Wire(Bool()) + + val s1_valid = Reg(init=Bool(false)) + val s1_vaddr = Reg(UInt()) + val s1_paddr = Cat(io.s1_ppn, s1_vaddr(pgIdxBits-1,0)).toUInt + val s1_tag = s1_paddr(tagBits+untagBits-1,untagBits) + + val s0_valid = io.req.valid || s1_valid && stall + val s0_vaddr = Mux(s1_valid && stall, s1_vaddr, io.req.bits.addr) + + s1_valid := io.req.valid && rdy || s1_valid && stall && !io.s1_kill + when (io.req.valid && rdy) { + s1_vaddr := io.req.bits.addr + } + + val out_valid = s1_valid && !io.s1_kill && state === s_ready + val s1_idx = s1_vaddr(untagBits-1,blockOffBits) + val s1_hit = out_valid && s1_any_tag_hit + val s1_miss = out_valid && !s1_any_tag_hit + rdy := state === s_ready && !s1_miss + + when (s1_miss && state === s_ready) { + refill_addr := s1_paddr + } + val refill_tag = refill_addr(tagBits+untagBits-1,untagBits) + + val narrow_grant = FlowThroughSerializer(io.mem.grant, refillCyclesPerBeat) + val (refill_cnt, refill_wrap) = Counter(narrow_grant.fire(), refillCycles) + val refill_done = state === s_refill && refill_wrap + narrow_grant.ready := Bool(true) + + val repl_way = if (isDM) UInt(0) else LFSR16(s1_miss)(log2Up(nWays)-1,0) + val entagbits = code.width(tagBits) + val tag_array = SeqMem(nSets, Vec(nWays, Bits(width = entagbits))) + val tag_rdata = tag_array.read(s0_vaddr(untagBits-1,blockOffBits), !refill_done && s0_valid) + when (refill_done) { + val tag = code.encode(refill_tag).toUInt + tag_array.write(s1_idx, Vec.fill(nWays)(tag), Vec.tabulate(nWays)(repl_way === _)) + } + + val vb_array = 
Reg(init=Bits(0, nSets*nWays)) + when (refill_done && !invalidated) { + vb_array := vb_array.bitSet(Cat(repl_way, s1_idx), Bool(true)) + } + when (io.invalidate) { + vb_array := Bits(0) + invalidated := Bool(true) + } + val s1_disparity = Wire(Vec(nWays, Bool())) + for (i <- 0 until nWays) + when (s1_valid && s1_disparity(i)) { vb_array := vb_array.bitSet(Cat(UInt(i), s1_idx), Bool(false)) } + + val s1_tag_match = Wire(Vec(nWays, Bool())) + val s1_tag_hit = Wire(Vec(nWays, Bool())) + val s1_dout = Wire(Vec(nWays, Bits(width = rowBits))) + + for (i <- 0 until nWays) { + val s1_vb = !io.invalidate && vb_array(Cat(UInt(i), s1_vaddr(untagBits-1,blockOffBits))).toBool + val tag_out = tag_rdata(i) + val s1_tag_disparity = code.decode(tag_out).error + s1_tag_match(i) := tag_out(tagBits-1,0) === s1_tag + s1_tag_hit(i) := s1_vb && s1_tag_match(i) + s1_disparity(i) := s1_vb && (s1_tag_disparity || code.decode(s1_dout(i)).error) + } + s1_any_tag_hit := s1_tag_hit.reduceLeft(_||_) && !s1_disparity.reduceLeft(_||_) + + for (i <- 0 until nWays) { + val data_array = SeqMem(nSets * refillCycles, Bits(width = code.width(rowBits))) + val wen = narrow_grant.valid && repl_way === UInt(i) + when (wen) { + val e_d = code.encode(narrow_grant.bits.data).toUInt + data_array.write((s1_idx << log2Ceil(refillCycles)) | refill_cnt, e_d) + } + val s0_raddr = s0_vaddr(untagBits-1,blockOffBits-log2Ceil(refillCycles)) + s1_dout(i) := data_array.read(s0_raddr, !wen && s0_valid) + } + + // output signals + latency match { + case 1 => + io.resp.bits.datablock := Mux1H(s1_tag_hit, s1_dout) + io.resp.valid := s1_hit + case 2 => + val s2_hit = RegEnable(s1_hit, !stall) + val s2_tag_hit = RegEnable(s1_tag_hit, !stall) + val s2_dout = RegEnable(s1_dout, !stall) + io.resp.bits.datablock := Mux1H(s2_tag_hit, s2_dout) + io.resp.valid := s2_hit + } + io.mem.acquire.valid := state === s_request && !io.s2_kill + io.mem.acquire.bits := GetBlock(addr_block = refill_addr >> blockOffBits) + + // control state machine + switch (state) { + is (s_ready) { + when (s1_miss) { state := s_request } + invalidated := Bool(false) + } + is (s_request) { + when (io.mem.acquire.ready) { state := s_refill_wait } + when (io.s2_kill) { state := s_ready } + } + is (s_refill_wait) { + when (io.mem.grant.valid) { state := s_refill } + } + is (s_refill) { + when (refill_done) { state := s_ready } + } + } +} diff --git a/rocket/src/main/scala/idecode.scala b/rocket/src/main/scala/idecode.scala new file mode 100644 index 00000000..2168922c --- /dev/null +++ b/rocket/src/main/scala/idecode.scala @@ -0,0 +1,319 @@ +// See LICENSE for license details + +package rocket + +import Chisel._ +import Instructions._ +import uncore.constants.MemoryOpConstants._ +import ALU._ +import cde.Parameters +import Util._ + +abstract trait DecodeConstants extends HasCoreParameters +{ + val table: Array[(BitPat, List[BitPat])] +} + +class IntCtrlSigs extends Bundle { + val legal = Bool() + val fp = Bool() + val rocc = Bool() + val branch = Bool() + val jal = Bool() + val jalr = Bool() + val rxs2 = Bool() + val rxs1 = Bool() + val sel_alu2 = Bits(width = A2_X.getWidth) + val sel_alu1 = Bits(width = A1_X.getWidth) + val sel_imm = Bits(width = IMM_X.getWidth) + val alu_dw = Bool() + val alu_fn = Bits(width = FN_X.getWidth) + val mem = Bool() + val mem_cmd = Bits(width = M_SZ) + val mem_type = Bits(width = MT_SZ) + val rfs1 = Bool() + val rfs2 = Bool() + val rfs3 = Bool() + val wfd = Bool() + val div = Bool() + val wxd = Bool() + val csr = Bits(width = CSR.SZ) + val fence_i = Bool() + 
val fence = Bool() + val amo = Bool() + + def default: List[BitPat] = + // jal renf1 fence.i + // val | jalr | renf2 | + // | fp_val| | renx2 | | renf3 | + // | | rocc| | | renx1 s_alu1 mem_val | | | wfd | + // | | | br| | | | s_alu2 | imm dw alu | mem_cmd mem_type| | | | div | + // | | | | | | | | | | | | | | | | | | | | | wxd | fence + // | | | | | | | | | | | | | | | | | | | | | | csr | | amo + // | | | | | | | | | | | | | | | | | | | | | | | | | | + List(N,X,X,X,X,X,X,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, X,X,X,X,X,X,CSR.X,X,X,X) + + def decode(inst: UInt, table: Iterable[(BitPat, List[BitPat])]) = { + val decoder = DecodeLogic(inst, default, table) + val sigs = Seq(legal, fp, rocc, branch, jal, jalr, rxs2, rxs1, sel_alu2, + sel_alu1, sel_imm, alu_dw, alu_fn, mem, mem_cmd, mem_type, + rfs1, rfs2, rfs3, wfd, div, wxd, csr, fence_i, fence, amo) + sigs zip decoder map {case(s,d) => s := d} + this + } +} + +class IDecode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + BNE-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SNE, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + BEQ-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SEQ, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + BLT-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SLT, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + BLTU-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SLTU, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + BGE-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SGE, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + BGEU-> List(Y,N,N,Y,N,N,Y,Y,A2_RS2, A1_RS1, IMM_SB,DW_X, FN_SGEU, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + + JAL-> List(Y,N,N,N,Y,N,N,N,A2_FOUR,A1_PC, IMM_UJ,DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + JALR-> List(Y,N,N,N,N,Y,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + AUIPC-> List(Y,N,N,N,N,N,N,N,A2_IMM, A1_PC, IMM_U, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + + LB-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_B, N,N,N,N,N,Y,CSR.N,N,N,N), + LH-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_H, N,N,N,N,N,Y,CSR.N,N,N,N), + LW-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_W, N,N,N,N,N,Y,CSR.N,N,N,N), + LBU-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_BU,N,N,N,N,N,Y,CSR.N,N,N,N), + LHU-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_HU,N,N,N,N,N,Y,CSR.N,N,N,N), + SB-> List(Y,N,N,N,N,N,Y,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_B, N,N,N,N,N,N,CSR.N,N,N,N), + SH-> List(Y,N,N,N,N,N,Y,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_H, N,N,N,N,N,N,CSR.N,N,N,N), + SW-> List(Y,N,N,N,N,N,Y,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_W, N,N,N,N,N,N,CSR.N,N,N,N), + + LUI-> List(Y,N,N,N,N,N,N,N,A2_IMM, A1_ZERO,IMM_U, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + ADDI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SLTI -> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_SLT, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SLTIU-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_SLTU, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + ANDI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_AND, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + ORI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_OR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + XORI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, 
IMM_I, DW_XPR,FN_XOR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SLLI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_SL, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SRLI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_SR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SRAI-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_SRA, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + ADD-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SUB-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SUB, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SLT-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SLT, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SLTU-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SLTU, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + AND-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_AND, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + OR-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_OR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + XOR-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_XOR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SLL-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SL, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SRL-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SRA-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_SRA, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + + FENCE-> List(Y,N,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,Y,N), + FENCE_I-> List(Y,N,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, Y,M_FLUSH_ALL,MT_X, N,N,N,N,N,N,CSR.N,Y,N,N), + + SCALL-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N), + SBREAK-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N), + MRET-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N), + WFI-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N), + CSRRW-> List(Y,N,N,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.W,N,N,N), + CSRRS-> List(Y,N,N,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.S,N,N,N), + CSRRC-> List(Y,N,N,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.C,N,N,N), + CSRRWI-> List(Y,N,N,N,N,N,N,N,A2_IMM, A1_ZERO,IMM_Z, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.W,N,N,N), + CSRRSI-> List(Y,N,N,N,N,N,N,N,A2_IMM, A1_ZERO,IMM_Z, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.S,N,N,N), + CSRRCI-> List(Y,N,N,N,N,N,N,N,A2_IMM, A1_ZERO,IMM_Z, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.C,N,N,N)) +} + +class SDecode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + SFENCE_VM-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N), + SRET-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N)) +} + +class DebugDecode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + DRET-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N)) +} + +class I64Decode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + LD-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_D, N,N,N,N,N,Y,CSR.N,N,N,N), + LWU-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, 
IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_WU,N,N,N,N,N,Y,CSR.N,N,N,N), + SD-> List(Y,N,N,N,N,N,Y,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_D, N,N,N,N,N,N,CSR.N,N,N,N), + + ADDIW-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_32,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SLLIW-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_32,FN_SL, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SRLIW-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_32,FN_SR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SRAIW-> List(Y,N,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_32,FN_SRA, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + ADDW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SUBW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32,FN_SUB, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SLLW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32,FN_SL, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SRLW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32,FN_SR, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + SRAW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32,FN_SRA, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N)) +} + +class MDecode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + MUL-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_MUL, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N), + MULH-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_MULH, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N), + MULHU-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_MULHU, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N), + MULHSU-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_MULHSU,N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N), + + DIV-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_DIV, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N), + DIVU-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_DIVU, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N), + REM-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_REM, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N), + REMU-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_XPR,FN_REMU, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N)) +} + +class M64Decode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + MULW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32, FN_MUL, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N), + + DIVW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32, FN_DIV, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N), + DIVUW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32, FN_DIVU, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N), + REMW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32, FN_REM, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N), + REMUW-> List(Y,N,N,N,N,N,Y,Y,A2_RS2, A1_RS1, IMM_X, DW_32, FN_REMU, N,M_X, MT_X, N,N,N,N,Y,Y,CSR.N,N,N,N)) +} + +class ADecode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + AMOADD_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_ADD, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOXOR_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_XOR, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOSWAP_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_SWAP, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOAND_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_AND, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOOR_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_OR, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOMIN_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, 
DW_XPR,FN_ADD, Y,M_XA_MIN, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOMINU_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MINU, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOMAX_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MAX, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOMAXU_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MAXU, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y), + + LR_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XLR, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y), + SC_W-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XSC, MT_W, N,N,N,N,N,Y,CSR.N,N,N,Y)) +} + +class A64Decode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + AMOADD_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_ADD, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOSWAP_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_SWAP, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOXOR_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_XOR, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOAND_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_AND, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOOR_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_OR, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOMIN_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MIN, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOMINU_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MINU, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOMAX_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MAX, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y), + AMOMAXU_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XA_MAXU, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y), + + LR_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XLR, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y), + SC_D-> List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_XSC, MT_D, N,N,N,N,N,Y,CSR.N,N,N,Y)) +} + +class FDecode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + FCVT_S_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,Y,N,N,CSR.N,N,N,N), + FCVT_D_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,Y,N,N,CSR.N,N,N,N), + FSGNJ_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FSGNJ_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FSGNJX_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FSGNJX_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FSGNJN_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FSGNJN_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FMIN_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FMIN_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FMAX_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FMAX_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FADD_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FADD_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, 
FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FSUB_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FSUB_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FMUL_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FMUL_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FMADD_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N), + FMADD_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N), + FMSUB_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N), + FMSUB_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N), + FNMADD_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N), + FNMADD_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N), + FNMSUB_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N), + FNMSUB_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,Y,Y,N,N,CSR.N,N,N,N), + FCLASS_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FCLASS_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FMV_X_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FCVT_W_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FCVT_W_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FCVT_WU_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FCVT_WU_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FEQ_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N), + FEQ_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N), + FLT_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N), + FLT_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N), + FLE_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N), + FLE_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,N,N,Y,CSR.N,N,N,N), + FMV_S_X-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N), + FCVT_S_W-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N), + FCVT_D_W-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N), + FCVT_S_WU-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N), + FCVT_D_WU-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N), + FLW-> List(Y,Y,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_W, N,N,N,Y,N,N,CSR.N,N,N,N), + FLD-> List(Y,Y,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_I, DW_XPR,FN_ADD, Y,M_XRD, MT_D, N,N,N,Y,N,N,CSR.N,N,N,N), + FSW-> List(Y,Y,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_W, N,Y,N,N,N,N,CSR.N,N,N,N), + FSD-> List(Y,Y,N,N,N,N,N,Y,A2_IMM, A1_RS1, IMM_S, DW_XPR,FN_ADD, Y,M_XWR, MT_D, 
N,Y,N,N,N,N,CSR.N,N,N,N)) +} + +class F64Decode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + FMV_X_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FCVT_L_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FCVT_L_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FCVT_LU_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FCVT_LU_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,N,N,N,N,Y,CSR.N,N,N,N), + FMV_D_X-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N), + FCVT_S_L-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N), + FCVT_D_L-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N), + FCVT_S_LU-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N), + FCVT_D_LU-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,Y,N,N,CSR.N,N,N,N)) +} + +class FDivSqrtDecode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + FDIV_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FDIV_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FSQRT_S-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N), + FSQRT_D-> List(Y,Y,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, Y,Y,N,Y,N,N,CSR.N,N,N,N)) +} + +class RoCCDecode(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + CUSTOM0-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM0_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM0_RS1_RS2-> List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM0_RD-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + CUSTOM0_RD_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + CUSTOM0_RD_RS1_RS2->List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + CUSTOM1-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM1_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM1_RS1_RS2-> List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM1_RD-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + CUSTOM1_RD_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + CUSTOM1_RD_RS1_RS2->List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + CUSTOM2-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM2_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM2_RS1_RS2-> List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, 
DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM2_RD-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + CUSTOM2_RD_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + CUSTOM2_RD_RS1_RS2->List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + CUSTOM3-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM3_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM3_RS1_RS2-> List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,N,CSR.N,N,N,N), + CUSTOM3_RD-> List(Y,N,Y,N,N,N,N,N,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + CUSTOM3_RD_RS1-> List(Y,N,Y,N,N,N,N,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N), + CUSTOM3_RD_RS1_RS2->List(Y,N,Y,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, N,M_X, MT_X, N,N,N,N,N,Y,CSR.N,N,N,N)) +} diff --git a/rocket/src/main/scala/instructions.scala b/rocket/src/main/scala/instructions.scala new file mode 100644 index 00000000..339ac42f --- /dev/null +++ b/rocket/src/main/scala/instructions.scala @@ -0,0 +1,383 @@ +// See LICENSE for license details. + +package rocket + +import Chisel._ + +/* Automatically generated by parse-opcodes */ +object Instructions { + def BEQ = BitPat("b?????????????????000?????1100011") + def BNE = BitPat("b?????????????????001?????1100011") + def BLT = BitPat("b?????????????????100?????1100011") + def BGE = BitPat("b?????????????????101?????1100011") + def BLTU = BitPat("b?????????????????110?????1100011") + def BGEU = BitPat("b?????????????????111?????1100011") + def JALR = BitPat("b?????????????????000?????1100111") + def JAL = BitPat("b?????????????????????????1101111") + def LUI = BitPat("b?????????????????????????0110111") + def AUIPC = BitPat("b?????????????????????????0010111") + def ADDI = BitPat("b?????????????????000?????0010011") + def SLLI = BitPat("b000000???????????001?????0010011") + def SLTI = BitPat("b?????????????????010?????0010011") + def SLTIU = BitPat("b?????????????????011?????0010011") + def XORI = BitPat("b?????????????????100?????0010011") + def SRLI = BitPat("b000000???????????101?????0010011") + def SRAI = BitPat("b010000???????????101?????0010011") + def ORI = BitPat("b?????????????????110?????0010011") + def ANDI = BitPat("b?????????????????111?????0010011") + def ADD = BitPat("b0000000??????????000?????0110011") + def SUB = BitPat("b0100000??????????000?????0110011") + def SLL = BitPat("b0000000??????????001?????0110011") + def SLT = BitPat("b0000000??????????010?????0110011") + def SLTU = BitPat("b0000000??????????011?????0110011") + def XOR = BitPat("b0000000??????????100?????0110011") + def SRL = BitPat("b0000000??????????101?????0110011") + def SRA = BitPat("b0100000??????????101?????0110011") + def OR = BitPat("b0000000??????????110?????0110011") + def AND = BitPat("b0000000??????????111?????0110011") + def ADDIW = BitPat("b?????????????????000?????0011011") + def SLLIW = BitPat("b0000000??????????001?????0011011") + def SRLIW = BitPat("b0000000??????????101?????0011011") + def SRAIW = BitPat("b0100000??????????101?????0011011") + def ADDW = BitPat("b0000000??????????000?????0111011") + def SUBW = BitPat("b0100000??????????000?????0111011") + def SLLW = BitPat("b0000000??????????001?????0111011") + def SRLW = 
BitPat("b0000000??????????101?????0111011") + def SRAW = BitPat("b0100000??????????101?????0111011") + def LB = BitPat("b?????????????????000?????0000011") + def LH = BitPat("b?????????????????001?????0000011") + def LW = BitPat("b?????????????????010?????0000011") + def LD = BitPat("b?????????????????011?????0000011") + def LBU = BitPat("b?????????????????100?????0000011") + def LHU = BitPat("b?????????????????101?????0000011") + def LWU = BitPat("b?????????????????110?????0000011") + def SB = BitPat("b?????????????????000?????0100011") + def SH = BitPat("b?????????????????001?????0100011") + def SW = BitPat("b?????????????????010?????0100011") + def SD = BitPat("b?????????????????011?????0100011") + def FENCE = BitPat("b?????????????????000?????0001111") + def FENCE_I = BitPat("b?????????????????001?????0001111") + def MUL = BitPat("b0000001??????????000?????0110011") + def MULH = BitPat("b0000001??????????001?????0110011") + def MULHSU = BitPat("b0000001??????????010?????0110011") + def MULHU = BitPat("b0000001??????????011?????0110011") + def DIV = BitPat("b0000001??????????100?????0110011") + def DIVU = BitPat("b0000001??????????101?????0110011") + def REM = BitPat("b0000001??????????110?????0110011") + def REMU = BitPat("b0000001??????????111?????0110011") + def MULW = BitPat("b0000001??????????000?????0111011") + def DIVW = BitPat("b0000001??????????100?????0111011") + def DIVUW = BitPat("b0000001??????????101?????0111011") + def REMW = BitPat("b0000001??????????110?????0111011") + def REMUW = BitPat("b0000001??????????111?????0111011") + def AMOADD_W = BitPat("b00000????????????010?????0101111") + def AMOXOR_W = BitPat("b00100????????????010?????0101111") + def AMOOR_W = BitPat("b01000????????????010?????0101111") + def AMOAND_W = BitPat("b01100????????????010?????0101111") + def AMOMIN_W = BitPat("b10000????????????010?????0101111") + def AMOMAX_W = BitPat("b10100????????????010?????0101111") + def AMOMINU_W = BitPat("b11000????????????010?????0101111") + def AMOMAXU_W = BitPat("b11100????????????010?????0101111") + def AMOSWAP_W = BitPat("b00001????????????010?????0101111") + def LR_W = BitPat("b00010??00000?????010?????0101111") + def SC_W = BitPat("b00011????????????010?????0101111") + def AMOADD_D = BitPat("b00000????????????011?????0101111") + def AMOXOR_D = BitPat("b00100????????????011?????0101111") + def AMOOR_D = BitPat("b01000????????????011?????0101111") + def AMOAND_D = BitPat("b01100????????????011?????0101111") + def AMOMIN_D = BitPat("b10000????????????011?????0101111") + def AMOMAX_D = BitPat("b10100????????????011?????0101111") + def AMOMINU_D = BitPat("b11000????????????011?????0101111") + def AMOMAXU_D = BitPat("b11100????????????011?????0101111") + def AMOSWAP_D = BitPat("b00001????????????011?????0101111") + def LR_D = BitPat("b00010??00000?????011?????0101111") + def SC_D = BitPat("b00011????????????011?????0101111") + def ECALL = BitPat("b00000000000000000000000001110011") + def EBREAK = BitPat("b00000000000100000000000001110011") + def URET = BitPat("b00000000001000000000000001110011") + def SRET = BitPat("b00010000001000000000000001110011") + def HRET = BitPat("b00100000001000000000000001110011") + def MRET = BitPat("b00110000001000000000000001110011") + def DRET = BitPat("b01111011001000000000000001110011") + def SFENCE_VM = BitPat("b000100000100?????000000001110011") + def WFI = BitPat("b00010000010100000000000001110011") + def CSRRW = BitPat("b?????????????????001?????1110011") + def CSRRS = BitPat("b?????????????????010?????1110011") + def CSRRC = 
BitPat("b?????????????????011?????1110011") + def CSRRWI = BitPat("b?????????????????101?????1110011") + def CSRRSI = BitPat("b?????????????????110?????1110011") + def CSRRCI = BitPat("b?????????????????111?????1110011") + def FADD_S = BitPat("b0000000??????????????????1010011") + def FSUB_S = BitPat("b0000100??????????????????1010011") + def FMUL_S = BitPat("b0001000??????????????????1010011") + def FDIV_S = BitPat("b0001100??????????????????1010011") + def FSGNJ_S = BitPat("b0010000??????????000?????1010011") + def FSGNJN_S = BitPat("b0010000??????????001?????1010011") + def FSGNJX_S = BitPat("b0010000??????????010?????1010011") + def FMIN_S = BitPat("b0010100??????????000?????1010011") + def FMAX_S = BitPat("b0010100??????????001?????1010011") + def FSQRT_S = BitPat("b010110000000?????????????1010011") + def FADD_D = BitPat("b0000001??????????????????1010011") + def FSUB_D = BitPat("b0000101??????????????????1010011") + def FMUL_D = BitPat("b0001001??????????????????1010011") + def FDIV_D = BitPat("b0001101??????????????????1010011") + def FSGNJ_D = BitPat("b0010001??????????000?????1010011") + def FSGNJN_D = BitPat("b0010001??????????001?????1010011") + def FSGNJX_D = BitPat("b0010001??????????010?????1010011") + def FMIN_D = BitPat("b0010101??????????000?????1010011") + def FMAX_D = BitPat("b0010101??????????001?????1010011") + def FCVT_S_D = BitPat("b010000000001?????????????1010011") + def FCVT_D_S = BitPat("b010000100000?????????????1010011") + def FSQRT_D = BitPat("b010110100000?????????????1010011") + def FLE_S = BitPat("b1010000??????????000?????1010011") + def FLT_S = BitPat("b1010000??????????001?????1010011") + def FEQ_S = BitPat("b1010000??????????010?????1010011") + def FLE_D = BitPat("b1010001??????????000?????1010011") + def FLT_D = BitPat("b1010001??????????001?????1010011") + def FEQ_D = BitPat("b1010001??????????010?????1010011") + def FCVT_W_S = BitPat("b110000000000?????????????1010011") + def FCVT_WU_S = BitPat("b110000000001?????????????1010011") + def FCVT_L_S = BitPat("b110000000010?????????????1010011") + def FCVT_LU_S = BitPat("b110000000011?????????????1010011") + def FMV_X_S = BitPat("b111000000000?????000?????1010011") + def FCLASS_S = BitPat("b111000000000?????001?????1010011") + def FCVT_W_D = BitPat("b110000100000?????????????1010011") + def FCVT_WU_D = BitPat("b110000100001?????????????1010011") + def FCVT_L_D = BitPat("b110000100010?????????????1010011") + def FCVT_LU_D = BitPat("b110000100011?????????????1010011") + def FMV_X_D = BitPat("b111000100000?????000?????1010011") + def FCLASS_D = BitPat("b111000100000?????001?????1010011") + def FCVT_S_W = BitPat("b110100000000?????????????1010011") + def FCVT_S_WU = BitPat("b110100000001?????????????1010011") + def FCVT_S_L = BitPat("b110100000010?????????????1010011") + def FCVT_S_LU = BitPat("b110100000011?????????????1010011") + def FMV_S_X = BitPat("b111100000000?????000?????1010011") + def FCVT_D_W = BitPat("b110100100000?????????????1010011") + def FCVT_D_WU = BitPat("b110100100001?????????????1010011") + def FCVT_D_L = BitPat("b110100100010?????????????1010011") + def FCVT_D_LU = BitPat("b110100100011?????????????1010011") + def FMV_D_X = BitPat("b111100100000?????000?????1010011") + def FLW = BitPat("b?????????????????010?????0000111") + def FLD = BitPat("b?????????????????011?????0000111") + def FSW = BitPat("b?????????????????010?????0100111") + def FSD = BitPat("b?????????????????011?????0100111") + def FMADD_S = BitPat("b?????00??????????????????1000011") + def FMSUB_S = 
BitPat("b?????00??????????????????1000111") + def FNMSUB_S = BitPat("b?????00??????????????????1001011") + def FNMADD_S = BitPat("b?????00??????????????????1001111") + def FMADD_D = BitPat("b?????01??????????????????1000011") + def FMSUB_D = BitPat("b?????01??????????????????1000111") + def FNMSUB_D = BitPat("b?????01??????????????????1001011") + def FNMADD_D = BitPat("b?????01??????????????????1001111") + def CUSTOM0 = BitPat("b?????????????????000?????0001011") + def CUSTOM0_RS1 = BitPat("b?????????????????010?????0001011") + def CUSTOM0_RS1_RS2 = BitPat("b?????????????????011?????0001011") + def CUSTOM0_RD = BitPat("b?????????????????100?????0001011") + def CUSTOM0_RD_RS1 = BitPat("b?????????????????110?????0001011") + def CUSTOM0_RD_RS1_RS2 = BitPat("b?????????????????111?????0001011") + def CUSTOM1 = BitPat("b?????????????????000?????0101011") + def CUSTOM1_RS1 = BitPat("b?????????????????010?????0101011") + def CUSTOM1_RS1_RS2 = BitPat("b?????????????????011?????0101011") + def CUSTOM1_RD = BitPat("b?????????????????100?????0101011") + def CUSTOM1_RD_RS1 = BitPat("b?????????????????110?????0101011") + def CUSTOM1_RD_RS1_RS2 = BitPat("b?????????????????111?????0101011") + def CUSTOM2 = BitPat("b?????????????????000?????1011011") + def CUSTOM2_RS1 = BitPat("b?????????????????010?????1011011") + def CUSTOM2_RS1_RS2 = BitPat("b?????????????????011?????1011011") + def CUSTOM2_RD = BitPat("b?????????????????100?????1011011") + def CUSTOM2_RD_RS1 = BitPat("b?????????????????110?????1011011") + def CUSTOM2_RD_RS1_RS2 = BitPat("b?????????????????111?????1011011") + def CUSTOM3 = BitPat("b?????????????????000?????1111011") + def CUSTOM3_RS1 = BitPat("b?????????????????010?????1111011") + def CUSTOM3_RS1_RS2 = BitPat("b?????????????????011?????1111011") + def CUSTOM3_RD = BitPat("b?????????????????100?????1111011") + def CUSTOM3_RD_RS1 = BitPat("b?????????????????110?????1111011") + def CUSTOM3_RD_RS1_RS2 = BitPat("b?????????????????111?????1111011") + def SLLI_RV32 = BitPat("b0000000??????????001?????0010011") + def SRLI_RV32 = BitPat("b0000000??????????101?????0010011") + def SRAI_RV32 = BitPat("b0100000??????????101?????0010011") + def FRFLAGS = BitPat("b00000000000100000010?????1110011") + def FSFLAGS = BitPat("b000000000001?????001?????1110011") + def FSFLAGSI = BitPat("b000000000001?????101?????1110011") + def FRRM = BitPat("b00000000001000000010?????1110011") + def FSRM = BitPat("b000000000010?????001?????1110011") + def FSRMI = BitPat("b000000000010?????101?????1110011") + def FSCSR = BitPat("b000000000011?????001?????1110011") + def FRCSR = BitPat("b00000000001100000010?????1110011") + def RDCYCLE = BitPat("b11000000000000000010?????1110011") + def RDTIME = BitPat("b11000000000100000010?????1110011") + def RDINSTRET = BitPat("b11000000001000000010?????1110011") + def RDCYCLEH = BitPat("b11001000000000000010?????1110011") + def RDTIMEH = BitPat("b11001000000100000010?????1110011") + def RDINSTRETH = BitPat("b11001000001000000010?????1110011") + def SCALL = BitPat("b00000000000000000000000001110011") + def SBREAK = BitPat("b00000000000100000000000001110011") +} +object Causes { + val misaligned_fetch = 0x0 + val fault_fetch = 0x1 + val illegal_instruction = 0x2 + val breakpoint = 0x3 + val misaligned_load = 0x4 + val fault_load = 0x5 + val misaligned_store = 0x6 + val fault_store = 0x7 + val user_ecall = 0x8 + val supervisor_ecall = 0x9 + val hypervisor_ecall = 0xa + val machine_ecall = 0xb + val all = { + val res = collection.mutable.ArrayBuffer[Int]() + res += misaligned_fetch + res += 
fault_fetch + res += illegal_instruction + res += breakpoint + res += misaligned_load + res += fault_load + res += misaligned_store + res += fault_store + res += user_ecall + res += supervisor_ecall + res += hypervisor_ecall + res += machine_ecall + res.toArray + } +} +object CSRs { + val fflags = 0x1 + val frm = 0x2 + val fcsr = 0x3 + val cycle = 0xc00 + val time = 0xc01 + val instret = 0xc02 + val sstatus = 0x100 + val sie = 0x104 + val stvec = 0x105 + val sscratch = 0x140 + val sepc = 0x141 + val scause = 0x142 + val sbadaddr = 0x143 + val sip = 0x144 + val sptbr = 0x180 + val scycle = 0xd00 + val stime = 0xd01 + val sinstret = 0xd02 + val mstatus = 0x300 + val medeleg = 0x302 + val mideleg = 0x303 + val mie = 0x304 + val mtvec = 0x305 + val mscratch = 0x340 + val mepc = 0x341 + val mcause = 0x342 + val mbadaddr = 0x343 + val mip = 0x344 + val mucounteren = 0x310 + val mscounteren = 0x311 + val mucycle_delta = 0x700 + val mutime_delta = 0x701 + val muinstret_delta = 0x702 + val mscycle_delta = 0x704 + val mstime_delta = 0x705 + val msinstret_delta = 0x706 + val tdrselect = 0x7a0 + val tdrdata1 = 0x7a1 + val tdrdata2 = 0x7a2 + val tdrdata3 = 0x7a3 + val dcsr = 0x7b0 + val dpc = 0x7b1 + val dscratch = 0x7b2 + val mcycle = 0xf00 + val mtime = 0xf01 + val minstret = 0xf02 + val misa = 0xf10 + val mvendorid = 0xf11 + val marchid = 0xf12 + val mimpid = 0xf13 + val mhartid = 0xf14 + val mreset = 0x7c2 + val cycleh = 0xc80 + val timeh = 0xc81 + val instreth = 0xc82 + val mucycle_deltah = 0x780 + val mutime_deltah = 0x781 + val muinstret_deltah = 0x782 + val mscycle_deltah = 0x784 + val mstime_deltah = 0x785 + val msinstret_deltah = 0x786 + val mcycleh = 0xf80 + val mtimeh = 0xf81 + val minstreth = 0xf82 + val all = { + val res = collection.mutable.ArrayBuffer[Int]() + res += fflags + res += frm + res += fcsr + res += cycle + res += time + res += instret + res += sstatus + res += sie + res += stvec + res += sscratch + res += sepc + res += scause + res += sbadaddr + res += sip + res += sptbr + res += scycle + res += stime + res += sinstret + res += mstatus + res += medeleg + res += mideleg + res += mie + res += mtvec + res += mscratch + res += mepc + res += mcause + res += mbadaddr + res += mip + res += mucounteren + res += mscounteren + res += mucycle_delta + res += mutime_delta + res += muinstret_delta + res += mscycle_delta + res += mstime_delta + res += msinstret_delta + res += tdrselect + res += tdrdata1 + res += tdrdata2 + res += tdrdata3 + res += dcsr + res += dpc + res += dscratch + res += mcycle + res += mtime + res += minstret + res += misa + res += mvendorid + res += marchid + res += mimpid + res += mhartid + res += mreset + res.toArray + } + val all32 = { + val res = collection.mutable.ArrayBuffer(all:_*) + res += cycleh + res += timeh + res += instreth + res += mucycle_deltah + res += mutime_deltah + res += muinstret_deltah + res += mscycle_deltah + res += mstime_deltah + res += msinstret_deltah + res += mcycleh + res += mtimeh + res += minstreth + res.toArray + } +} diff --git a/rocket/src/main/scala/multiplier.scala b/rocket/src/main/scala/multiplier.scala new file mode 100644 index 00000000..9770d632 --- /dev/null +++ b/rocket/src/main/scala/multiplier.scala @@ -0,0 +1,152 @@ +// See LICENSE for license details. 
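+// MulDiv computes multiplies and divides iteratively in a shared datapath: +// multiplies retire `unroll` bits of the multiplier per cycle, divides +// produce one quotient bit per cycle, and `earlyOut` (when set) lets +// operations on small operands complete early.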
+ +package rocket + +import Chisel._ +import ALU._ +import Util._ + +class MultiplierReq(dataBits: Int, tagBits: Int) extends Bundle { + val fn = Bits(width = SZ_ALU_FN) + val dw = Bits(width = SZ_DW) + val in1 = Bits(width = dataBits) + val in2 = Bits(width = dataBits) + val tag = UInt(width = tagBits) + override def cloneType = new MultiplierReq(dataBits, tagBits).asInstanceOf[this.type] +} + +class MultiplierResp(dataBits: Int, tagBits: Int) extends Bundle { + val data = Bits(width = dataBits) + val tag = UInt(width = tagBits) + override def cloneType = new MultiplierResp(dataBits, tagBits).asInstanceOf[this.type] +} + +class MultiplierIO(dataBits: Int, tagBits: Int) extends Bundle { + val req = Decoupled(new MultiplierReq(dataBits, tagBits)).flip + val kill = Bool(INPUT) + val resp = Decoupled(new MultiplierResp(dataBits, tagBits)) +} + +class MulDiv( + width: Int, + nXpr: Int = 32, + unroll: Int = 1, + earlyOut: Boolean = false) extends Module { + val io = new MultiplierIO(width, log2Up(nXpr)) + val w = io.req.bits.in1.getWidth + val mulw = (w+unroll-1)/unroll*unroll + + val s_ready :: s_neg_inputs :: s_busy :: s_move_rem :: s_neg_output :: s_done :: Nil = Enum(UInt(), 6) + val state = Reg(init=s_ready) + + val req = Reg(io.req.bits) + val count = Reg(UInt(width = log2Up(w+1))) + val neg_out = Reg(Bool()) + val isMul = Reg(Bool()) + val isHi = Reg(Bool()) + val divisor = Reg(Bits(width = w+1)) // div only needs w bits + val remainder = Reg(Bits(width = 2*mulw+2)) // div only needs 2*w+1 bits + + val cmdMul :: cmdHi :: lhsSigned :: rhsSigned :: Nil = + DecodeLogic(io.req.bits.fn, List(X, X, X, X), List( + FN_DIV -> List(N, N, Y, Y), + FN_REM -> List(N, Y, Y, Y), + FN_DIVU -> List(N, N, N, N), + FN_REMU -> List(N, Y, N, N), + FN_MUL -> List(Y, N, X, X), + FN_MULH -> List(Y, Y, Y, Y), + FN_MULHU -> List(Y, Y, N, N), + FN_MULHSU -> List(Y, Y, Y, N))).map(_ toBool) + + require(w == 32 || w == 64) + def halfWidth(req: MultiplierReq) = Bool(w > 32) && req.dw === DW_32 + + def sext(x: Bits, halfW: Bool, signed: Bool) = { + val sign = signed && Mux(halfW, x(w/2-1), x(w-1)) + val hi = Mux(halfW, Fill(w/2, sign), x(w-1,w/2)) + (Cat(hi, x(w/2-1,0)), sign) + } + val (lhs_in, lhs_sign) = sext(io.req.bits.in1, halfWidth(io.req.bits), lhsSigned) + val (rhs_in, rhs_sign) = sext(io.req.bits.in2, halfWidth(io.req.bits), rhsSigned) + + val subtractor = remainder(2*w,w) - divisor(w,0) + val less = subtractor(w) + val negated_remainder = -remainder(w-1,0) + + when (state === s_neg_inputs) { + when (remainder(w-1) || isMul) { + remainder := negated_remainder + } + when (divisor(w-1) || isMul) { + divisor := subtractor + } + state := s_busy + } + + when (state === s_neg_output) { + remainder := negated_remainder + state := s_done + } + when (state === s_move_rem) { + remainder := remainder(2*w, w+1) + state := Mux(neg_out, s_neg_output, s_done) + } + when (state === s_busy && isMul) { + val mulReg = Cat(remainder(2*mulw+1,w+1),remainder(w-1,0)) + val mplier = mulReg(mulw-1,0) + val accum = mulReg(2*mulw,mulw).toSInt + val mpcand = divisor.toSInt + val prod = mplier(unroll-1,0) * mpcand + accum + val nextMulReg = Cat(prod, mplier(mulw-1,unroll)).toUInt + + val eOutMask = (SInt(BigInt(-1) << mulw) >> (count * unroll)(log2Up(mulw)-1,0))(mulw-1,0) + val eOut = Bool(earlyOut) && count =/= mulw/unroll-1 && count =/= 0 && + !isHi && (mplier & ~eOutMask) === UInt(0) + val eOutRes = (mulReg >> (mulw - count * unroll)(log2Up(mulw)-1,0)) + val nextMulReg1 = Cat(nextMulReg(2*mulw,mulw), Mux(eOut, eOutRes, 
nextMulReg)(mulw-1,0)) + remainder := Cat(nextMulReg1 >> w, Bool(false), nextMulReg1(w-1,0)) + + count := count + 1 + when (eOut || count === mulw/unroll-1) { + state := Mux(isHi, s_move_rem, s_done) + } + } + when (state === s_busy && !isMul) { + when (count === w) { + state := Mux(isHi, s_move_rem, Mux(neg_out, s_neg_output, s_done)) + } + count := count + 1 + + remainder := Cat(Mux(less, remainder(2*w-1,w), subtractor(w-1,0)), remainder(w-1,0), !less) + + val divisorMSB = Log2(divisor(w-1,0), w) + val dividendMSB = Log2(remainder(w-1,0), w) + val eOutPos = UInt(w-1) + divisorMSB - dividendMSB + val eOutZero = divisorMSB > dividendMSB + val eOut = count === 0 && less /* not divby0 */ && (eOutPos > 0 || eOutZero) + when (Bool(earlyOut) && eOut) { + val shift = Mux(eOutZero, UInt(w-1), eOutPos(log2Up(w)-1,0)) + remainder := remainder(w-1,0) << shift + count := shift + } + when (count === 0 && !less /* divby0 */ && !isHi) { neg_out := false } + } + when (io.resp.fire() || io.kill) { + state := s_ready + } + when (io.req.fire()) { + state := Mux(lhs_sign || rhs_sign && !cmdMul, s_neg_inputs, s_busy) + isMul := cmdMul + isHi := cmdHi + count := 0 + neg_out := !cmdMul && Mux(cmdHi, lhs_sign, lhs_sign =/= rhs_sign) + divisor := Cat(rhs_sign, rhs_in) + remainder := lhs_in + req := io.req.bits + } + + io.resp.bits := req + io.resp.bits.data := Mux(halfWidth(req), Cat(Fill(w/2, remainder(w/2-1)), remainder(w/2-1,0)), remainder(w-1,0)) + io.resp.valid := state === s_done + io.req.ready := state === s_ready +} diff --git a/rocket/src/main/scala/nbdcache.scala b/rocket/src/main/scala/nbdcache.scala new file mode 100644 index 00000000..c3b783c7 --- /dev/null +++ b/rocket/src/main/scala/nbdcache.scala @@ -0,0 +1,1247 @@ +// See LICENSE for license details. + +package rocket + +import Chisel._ +import junctions._ +import uncore.tilelink._ +import uncore.coherence._ +import uncore.agents._ +import uncore.util._ +import uncore.constants._ +import cde.{Parameters, Field} +import Util._ + +case object WordBits extends Field[Int] +case object StoreDataQueueDepth extends Field[Int] +case object ReplayQueueDepth extends Field[Int] +case object NMSHRs extends Field[Int] +case object LRSCCycles extends Field[Int] + +trait HasL1HellaCacheParameters extends HasL1CacheParameters { + val wordBits = p(WordBits) + val wordBytes = wordBits/8 + val wordOffBits = log2Up(wordBytes) + val beatBytes = p(CacheBlockBytes) / outerDataBeats + val beatWords = beatBytes / wordBytes + val beatOffBits = log2Up(beatBytes) + val idxMSB = untagBits-1 + val idxLSB = blockOffBits + val offsetmsb = idxLSB-1 + val offsetlsb = wordOffBits + val rowWords = rowBits/wordBits + val doNarrowRead = coreDataBits * nWays % rowBits == 0 + val encDataBits = code.width(coreDataBits) + val encRowBits = encDataBits*rowWords + val sdqDepth = p(StoreDataQueueDepth) + val nMSHRs = p(NMSHRs) + val nIOMSHRs = 1 + val lrscCycles = p(LRSCCycles) + + require(lrscCycles >= 32) // ISA requires 16-insn LRSC sequences to succeed + require(isPow2(nSets)) + require(rowBits <= outerDataBits) + require(!usingVM || untagBits <= pgIdxBits) +} + +abstract class L1HellaCacheModule(implicit val p: Parameters) extends Module + with HasL1HellaCacheParameters +abstract class L1HellaCacheBundle(implicit val p: Parameters) extends junctions.ParameterizedBundle()(p) + with HasL1HellaCacheParameters + +trait HasCoreMemOp extends HasCoreParameters { + val addr = UInt(width = coreMaxAddrBits) + val tag = Bits(width = coreDCacheReqTagBits) + val cmd = Bits(width = M_SZ) + val typ = 
Bits(width = MT_SZ) +} + +trait HasCoreData extends HasCoreParameters { + val data = Bits(width = coreDataBits) +} + +trait HasSDQId extends HasL1HellaCacheParameters { + val sdq_id = UInt(width = log2Up(sdqDepth)) +} + +trait HasMissInfo extends HasL1HellaCacheParameters { + val tag_match = Bool() + val old_meta = new L1Metadata + val way_en = Bits(width = nWays) +} + +class HellaCacheReqInternal(implicit p: Parameters) extends L1HellaCacheBundle()(p) + with HasCoreMemOp { + val phys = Bool() +} + +class HellaCacheReq(implicit p: Parameters) extends HellaCacheReqInternal()(p) with HasCoreData + +class HellaCacheResp(implicit p: Parameters) extends L1HellaCacheBundle()(p) + with HasCoreMemOp + with HasCoreData { + val replay = Bool() + val has_data = Bool() + val data_word_bypass = Bits(width = coreDataBits) + val store_data = Bits(width = coreDataBits) +} + +class AlignmentExceptions extends Bundle { + val ld = Bool() + val st = Bool() +} + +class HellaCacheExceptions extends Bundle { + val ma = new AlignmentExceptions + val pf = new AlignmentExceptions +} + +// interface between D$ and processor/DTLB +class HellaCacheIO(implicit p: Parameters) extends CoreBundle()(p) { + val req = Decoupled(new HellaCacheReq) + val s1_kill = Bool(OUTPUT) // kill previous cycle's req + val s1_data = Bits(OUTPUT, coreDataBits) // data for previous cycle's req + val s2_nack = Bool(INPUT) // req from two cycles ago is rejected + + val resp = Valid(new HellaCacheResp).flip + val replay_next = Bool(INPUT) + val xcpt = (new HellaCacheExceptions).asInput + val invalidate_lr = Bool(OUTPUT) + val ordered = Bool(INPUT) +} + +class L1DataReadReq(implicit p: Parameters) extends L1HellaCacheBundle()(p) { + val way_en = Bits(width = nWays) + val addr = Bits(width = untagBits) +} + +class L1DataWriteReq(implicit p: Parameters) extends L1DataReadReq()(p) { + val wmask = Bits(width = rowWords) + val data = Bits(width = encRowBits) +} + +class L1RefillReq(implicit p: Parameters) extends L1DataReadReq()(p) + +class L1MetaReadReq(implicit p: Parameters) extends MetaReadReq { + val tag = Bits(width = tagBits) + override def cloneType = new L1MetaReadReq()(p).asInstanceOf[this.type] //TODO remove +} + +class L1MetaWriteReq(implicit p: Parameters) extends + MetaWriteReq[L1Metadata](new L1Metadata) + +object L1Metadata { + def apply(tag: Bits, coh: ClientMetadata)(implicit p: Parameters) = { + val meta = Wire(new L1Metadata) + meta.tag := tag + meta.coh := coh + meta + } +} +class L1Metadata(implicit p: Parameters) extends Metadata()(p) with HasL1HellaCacheParameters { + val coh = new ClientMetadata +} + +class Replay(implicit p: Parameters) extends HellaCacheReqInternal()(p) with HasCoreData +class ReplayInternal(implicit p: Parameters) extends HellaCacheReqInternal()(p) with HasSDQId + +class MSHRReq(implicit p: Parameters) extends Replay()(p) with HasMissInfo +class MSHRReqInternal(implicit p: Parameters) extends ReplayInternal()(p) with HasMissInfo + +class ProbeInternal(implicit p: Parameters) extends Probe()(p) with HasClientTransactionId + +class WritebackReq(implicit p: Parameters) extends Release()(p) with HasCacheParameters { + val way_en = Bits(width = nWays) +} + +class IOMSHR(id: Int)(implicit p: Parameters) extends L1HellaCacheModule()(p) { + val io = new Bundle { + val req = Decoupled(new HellaCacheReq).flip + val acquire = Decoupled(new Acquire) + val grant = Valid(new GrantFromSrc).flip + val finish = Decoupled(new FinishToDst) + val resp = Decoupled(new HellaCacheResp) + val replay_next = Bool(OUTPUT) + } + + 
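// The two helpers below extract the requested word from a TileLink data + // beat: beatOffset gives the word-within-beat index of a byte address, and + // wordFromBeat shifts the beat right so the requested word lands at bit 0. + // For example, with 8-byte words and 16-byte beats (wordOffBits = 3, + // beatOffBits = 4), beatOffset(addr) is addr(3) and the beat is shifted + // right by 64 * addr(3) bits. + 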
def beatOffset(addr: UInt) = addr.extract(beatOffBits - 1, wordOffBits) + + def wordFromBeat(addr: UInt, dat: UInt) = { + val shift = Cat(beatOffset(addr), UInt(0, wordOffBits + log2Up(wordBytes))) + (dat >> shift)(wordBits - 1, 0) + } + + val req = Reg(new HellaCacheReq) + val req_cmd_sc = req.cmd === M_XSC + val grant_word = Reg(UInt(width = wordBits)) + val fq = Module(new FinishQueue(1)) + + val s_idle :: s_acquire :: s_grant :: s_resp :: s_finish :: Nil = Enum(Bits(), 5) + val state = Reg(init = s_idle) + io.req.ready := (state === s_idle) + + fq.io.enq.valid := io.grant.valid && io.grant.bits.requiresAck() + fq.io.enq.bits := io.grant.bits.makeFinish() + io.finish.valid := fq.io.deq.valid && (state === s_finish) + io.finish.bits := fq.io.deq.bits + fq.io.deq.ready := io.finish.ready && (state === s_finish) + + val storegen = new StoreGen(req.typ, req.addr, req.data, wordBytes) + val loadgen = new LoadGen(req.typ, req.addr, grant_word, req_cmd_sc, wordBytes) + + val beat_mask = (storegen.mask << Cat(beatOffset(req.addr), UInt(0, wordOffBits))) + val beat_data = Fill(beatWords, storegen.data) + + val addr_block = req.addr(paddrBits - 1, blockOffBits) + val addr_beat = req.addr(blockOffBits - 1, beatOffBits) + val addr_byte = req.addr(beatOffBits - 1, 0) + + val get_acquire = Get( + client_xact_id = UInt(id), + addr_block = addr_block, + addr_beat = addr_beat, + addr_byte = addr_byte, + operand_size = req.typ, + alloc = Bool(false)) + + val put_acquire = Put( + client_xact_id = UInt(id), + addr_block = addr_block, + addr_beat = addr_beat, + data = beat_data, + wmask = Some(beat_mask), + alloc = Bool(false)) + + val putAtomic_acquire = PutAtomic( + client_xact_id = UInt(id), + addr_block = addr_block, + addr_beat = addr_beat, + addr_byte = addr_byte, + atomic_opcode = req.cmd, + operand_size = req.typ, + data = beat_data) + + io.acquire.valid := (state === s_acquire) + io.acquire.bits := Mux(isAMO(req.cmd), putAtomic_acquire, Mux(isRead(req.cmd), get_acquire, put_acquire)) + + io.replay_next := (state === s_grant) || io.resp.valid && !io.resp.ready + io.resp.valid := (state === s_resp) + io.resp.bits := req + io.resp.bits.has_data := isRead(req.cmd) + io.resp.bits.data := loadgen.data | req_cmd_sc + io.resp.bits.store_data := req.data + io.resp.bits.replay := Bool(true) + + when (io.req.fire()) { + req := io.req.bits + state := s_acquire + } + + when (io.acquire.fire()) { + state := s_grant + } + + when (state === s_grant && io.grant.valid) { + state := s_resp + when (isRead(req.cmd)) { + grant_word := wordFromBeat(req.addr, io.grant.bits.data) + } + } + + when (io.resp.fire()) { + state := s_finish + } + + when (io.finish.fire()) { + state := s_idle + } +} + +class MSHR(id: Int)(implicit p: Parameters) extends L1HellaCacheModule()(p) { + val io = new Bundle { + val req_pri_val = Bool(INPUT) + val req_pri_rdy = Bool(OUTPUT) + val req_sec_val = Bool(INPUT) + val req_sec_rdy = Bool(OUTPUT) + val req_bits = new MSHRReqInternal().asInput + + val idx_match = Bool(OUTPUT) + val tag = Bits(OUTPUT, tagBits) + + val mem_req = Decoupled(new Acquire) + val refill = new L1RefillReq().asOutput // Data is bypassed + val meta_read = Decoupled(new L1MetaReadReq) + val meta_write = Decoupled(new L1MetaWriteReq) + val replay = Decoupled(new ReplayInternal) + val mem_grant = Valid(new GrantFromSrc).flip + val mem_finish = Decoupled(new FinishToDst) + val wb_req = Decoupled(new WritebackReq) + val probe_rdy = Bool(OUTPUT) + } + + val s_invalid :: s_wb_req :: s_wb_resp :: s_meta_clear :: s_refill_req :: 
s_refill_resp :: s_meta_write_req :: s_meta_write_resp :: s_drain_rpq :: Nil = Enum(UInt(), 9) + val state = Reg(init=s_invalid) + + def stateIsOneOf(check_states: Seq[UInt]): Bool = + check_states.map(state === _).reduce(_ || _) + + def stateIsOneOf(st1: UInt, st2: UInt*): Bool = + stateIsOneOf(st1 +: st2) + + val new_coh_state = Reg(init=ClientMetadata.onReset) + val req = Reg(new MSHRReqInternal()) + val req_idx = req.addr(untagBits-1,blockOffBits) + val idx_match = req_idx === io.req_bits.addr(untagBits-1,blockOffBits) + // We only accept secondary misses if we haven't yet sent an Acquire to outer memory + // or if the Acquire that was sent will obtain a Grant with sufficient permissions + // to let us replay this new request. I.e. we don't handle multiple outstanding + // Acquires on the same block for now. + val cmd_requires_second_acquire = + req.old_meta.coh.requiresAcquireOnSecondaryMiss(req.cmd, io.req_bits.cmd) + // Track whether or not a secondary acquire will cause the coherence state + // to go from clean to dirty. + val dirties_coh = Reg(Bool()) + val states_before_refill = Seq(s_wb_req, s_wb_resp, s_meta_clear) + val gnt_multi_data = io.mem_grant.bits.hasMultibeatData() + val (refill_cnt, refill_count_done) = Counter(io.mem_grant.valid && gnt_multi_data, refillCycles) + val refill_done = io.mem_grant.valid && (!gnt_multi_data || refill_count_done) + val sec_rdy = idx_match && + (stateIsOneOf(states_before_refill) || + (stateIsOneOf(s_refill_req, s_refill_resp) && + !cmd_requires_second_acquire && !refill_done)) + + val rpq = Module(new Queue(new ReplayInternal, p(ReplayQueueDepth))) + rpq.io.enq.valid := (io.req_pri_val && io.req_pri_rdy || io.req_sec_val && sec_rdy) && !isPrefetch(io.req_bits.cmd) + rpq.io.enq.bits := io.req_bits + rpq.io.deq.ready := io.replay.ready && state === s_drain_rpq || state === s_invalid + + val coh_on_grant = req.old_meta.coh.onGrant( + incoming = io.mem_grant.bits, + pending = Mux(dirties_coh, M_XWR, req.cmd)) + val coh_on_hit = io.req_bits.old_meta.coh.onHit(io.req_bits.cmd) + + when (state === s_drain_rpq && !rpq.io.deq.valid) { + state := s_invalid + } + when (state === s_meta_write_resp) { + // this wait state allows us to catch RAW hazards on the tags via nack_victim + state := s_drain_rpq + } + when (state === s_meta_write_req && io.meta_write.ready) { + state := s_meta_write_resp + } + when (state === s_refill_resp && refill_done) { + state := s_meta_write_req + new_coh_state := coh_on_grant + } + when (io.mem_req.fire()) { // s_refill_req + state := s_refill_resp + } + when (state === s_meta_clear && io.meta_write.ready) { + state := s_refill_req + } + when (state === s_wb_resp && io.mem_grant.valid) { + state := s_meta_clear + } + when (io.wb_req.fire()) { // s_wb_req + state := Mux(io.wb_req.bits.requiresAck(), s_wb_resp, s_meta_clear) + } + when (io.req_sec_val && io.req_sec_rdy) { // s_wb_req, s_wb_resp, s_refill_req + //If we get a secondary miss that needs more permissions before we've sent + // out the primary miss's Acquire, we can upgrade the permissions we're + // going to ask for in s_refill_req + when(cmd_requires_second_acquire) { + req.cmd := io.req_bits.cmd + } + dirties_coh := dirties_coh || isWrite(io.req_bits.cmd) + } + when (io.req_pri_val && io.req_pri_rdy) { + val coh = io.req_bits.old_meta.coh + req := io.req_bits + dirties_coh := isWrite(io.req_bits.cmd) + when (io.req_bits.tag_match) { + when(coh.isHit(io.req_bits.cmd)) { // set dirty bit + state := s_meta_write_req + new_coh_state := coh_on_hit + }.otherwise { 
// upgrade permissions + state := s_refill_req + } + }.otherwise { // writeback if necessary and refill + state := Mux(coh.requiresVoluntaryWriteback(), s_wb_req, s_meta_clear) + } + } + + val fq = Module(new FinishQueue(1)) + val g = io.mem_grant.bits + val can_finish = state === s_invalid || state === s_refill_req + fq.io.enq.valid := io.mem_grant.valid && g.requiresAck() && refill_done + fq.io.enq.bits := g.makeFinish() + io.mem_finish.valid := fq.io.deq.valid && can_finish + fq.io.deq.ready := io.mem_finish.ready && can_finish + io.mem_finish.bits := fq.io.deq.bits + + io.idx_match := (state =/= s_invalid) && idx_match + io.refill.way_en := req.way_en + io.refill.addr := ((req_idx << log2Ceil(refillCycles)) | refill_cnt) << rowOffBits + io.tag := req.addr >> untagBits + io.req_pri_rdy := state === s_invalid + io.req_sec_rdy := sec_rdy && rpq.io.enq.ready + + val meta_hazard = Reg(init=UInt(0,2)) + when (meta_hazard =/= UInt(0)) { meta_hazard := meta_hazard + 1 } + when (io.meta_write.fire()) { meta_hazard := 1 } + io.probe_rdy := !idx_match || (!stateIsOneOf(states_before_refill) && meta_hazard === 0) + + io.meta_write.valid := state === s_meta_write_req || state === s_meta_clear + io.meta_write.bits.idx := req_idx + io.meta_write.bits.data.coh := Mux(state === s_meta_clear, + req.old_meta.coh.onCacheControl(M_FLUSH), + new_coh_state) + io.meta_write.bits.data.tag := io.tag + io.meta_write.bits.way_en := req.way_en + + io.wb_req.valid := state === s_wb_req + io.wb_req.bits := req.old_meta.coh.makeVoluntaryWriteback( + client_xact_id = UInt(id), + addr_block = Cat(req.old_meta.tag, req_idx)) + io.wb_req.bits.way_en := req.way_en + + io.mem_req.valid := state === s_refill_req && fq.io.enq.ready + io.mem_req.bits := req.old_meta.coh.makeAcquire( + addr_block = Cat(io.tag, req_idx).toUInt, + client_xact_id = Bits(id), + op_code = req.cmd) + + io.meta_read.valid := state === s_drain_rpq + io.meta_read.bits.idx := req_idx + io.meta_read.bits.tag := io.tag + + io.replay.valid := state === s_drain_rpq && rpq.io.deq.valid + io.replay.bits := rpq.io.deq.bits + io.replay.bits.phys := Bool(true) + io.replay.bits.addr := Cat(io.tag, req_idx, rpq.io.deq.bits.addr(blockOffBits-1,0)).toUInt + + when (!io.meta_read.ready) { + rpq.io.deq.ready := Bool(false) + io.replay.bits.cmd := M_FLUSH_ALL /* nop */ + } +} + +class MSHRFile(implicit p: Parameters) extends L1HellaCacheModule()(p) { + val io = new Bundle { + val req = Decoupled(new MSHRReq).flip + val resp = Decoupled(new HellaCacheResp) + val secondary_miss = Bool(OUTPUT) + + val mem_req = Decoupled(new Acquire) + val refill = new L1RefillReq().asOutput + val meta_read = Decoupled(new L1MetaReadReq) + val meta_write = Decoupled(new L1MetaWriteReq) + val replay = Decoupled(new Replay) + val mem_grant = Valid(new GrantFromSrc).flip + val mem_finish = Decoupled(new FinishToDst) + val wb_req = Decoupled(new WritebackReq) + + val probe_rdy = Bool(OUTPUT) + val fence_rdy = Bool(OUTPUT) + val replay_next = Bool(OUTPUT) + } + + // determine if the request is cacheable or not + val cacheable = addrMap.isCacheable(io.req.bits.addr) + + val sdq_val = Reg(init=Bits(0, sdqDepth)) + val sdq_alloc_id = PriorityEncoder(~sdq_val(sdqDepth-1,0)) + val sdq_rdy = !sdq_val.andR + val sdq_enq = io.req.valid && io.req.ready && cacheable && isWrite(io.req.bits.cmd) + val sdq = Mem(sdqDepth, io.req.bits.data) + when (sdq_enq) { sdq(sdq_alloc_id) := io.req.bits.data } + + val idxMatch = Wire(Vec(nMSHRs, Bool())) + val tagList = Wire(Vec(nMSHRs, Bits(width = tagBits))) + val
tag_match = Mux1H(idxMatch, tagList) === io.req.bits.addr >> untagBits + + val wbTagList = Wire(Vec(nMSHRs, Bits())) + val refillMux = Wire(Vec(nMSHRs, new L1RefillReq)) + val meta_read_arb = Module(new Arbiter(new L1MetaReadReq, nMSHRs)) + val meta_write_arb = Module(new Arbiter(new L1MetaWriteReq, nMSHRs)) + val mem_req_arb = Module(new LockingArbiter( + new Acquire, + nMSHRs + nIOMSHRs, + outerDataBeats, + Some((a: Acquire) => a.hasMultibeatData()))) + val mem_finish_arb = Module(new Arbiter(new FinishToDst, nMSHRs + nIOMSHRs)) + val wb_req_arb = Module(new Arbiter(new WritebackReq, nMSHRs)) + val replay_arb = Module(new Arbiter(new ReplayInternal, nMSHRs)) + val alloc_arb = Module(new Arbiter(Bool(), nMSHRs)) + + var idx_match = Bool(false) + var pri_rdy = Bool(false) + var sec_rdy = Bool(false) + + io.fence_rdy := true + io.probe_rdy := true + + for (i <- 0 until nMSHRs) { + val mshr = Module(new MSHR(i)) + + idxMatch(i) := mshr.io.idx_match + tagList(i) := mshr.io.tag + wbTagList(i) := mshr.io.wb_req.bits.addr_block >> idxBits + + alloc_arb.io.in(i).valid := mshr.io.req_pri_rdy + mshr.io.req_pri_val := alloc_arb.io.in(i).ready + + mshr.io.req_sec_val := io.req.valid && sdq_rdy && tag_match + mshr.io.req_bits := io.req.bits + mshr.io.req_bits.sdq_id := sdq_alloc_id + + meta_read_arb.io.in(i) <> mshr.io.meta_read + meta_write_arb.io.in(i) <> mshr.io.meta_write + mem_req_arb.io.in(i) <> mshr.io.mem_req + mem_finish_arb.io.in(i) <> mshr.io.mem_finish + wb_req_arb.io.in(i) <> mshr.io.wb_req + replay_arb.io.in(i) <> mshr.io.replay + + mshr.io.mem_grant.valid := io.mem_grant.valid && + io.mem_grant.bits.client_xact_id === UInt(i) + mshr.io.mem_grant.bits := io.mem_grant.bits + refillMux(i) := mshr.io.refill + + pri_rdy = pri_rdy || mshr.io.req_pri_rdy + sec_rdy = sec_rdy || mshr.io.req_sec_rdy + idx_match = idx_match || mshr.io.idx_match + + when (!mshr.io.req_pri_rdy) { io.fence_rdy := false } + when (!mshr.io.probe_rdy) { io.probe_rdy := false } + } + + alloc_arb.io.out.ready := io.req.valid && sdq_rdy && cacheable && !idx_match + + io.meta_read <> meta_read_arb.io.out + io.meta_write <> meta_write_arb.io.out + io.mem_req <> mem_req_arb.io.out + io.mem_finish <> mem_finish_arb.io.out + io.wb_req <> wb_req_arb.io.out + + val mmio_alloc_arb = Module(new Arbiter(Bool(), nIOMSHRs)) + val resp_arb = Module(new Arbiter(new HellaCacheResp, nIOMSHRs)) + + var mmio_rdy = Bool(false) + io.replay_next := Bool(false) + + for (i <- 0 until nIOMSHRs) { + val id = nMSHRs + i + val mshr = Module(new IOMSHR(id)) + + mmio_alloc_arb.io.in(i).valid := mshr.io.req.ready + mshr.io.req.valid := mmio_alloc_arb.io.in(i).ready + mshr.io.req.bits := io.req.bits + + mmio_rdy = mmio_rdy || mshr.io.req.ready + + mem_req_arb.io.in(id) <> mshr.io.acquire + mem_finish_arb.io.in(id) <> mshr.io.finish + + mshr.io.grant.bits := io.mem_grant.bits + mshr.io.grant.valid := io.mem_grant.valid && + io.mem_grant.bits.client_xact_id === UInt(id) + + resp_arb.io.in(i) <> mshr.io.resp + + when (!mshr.io.req.ready) { io.fence_rdy := Bool(false) } + when (mshr.io.replay_next) { io.replay_next := Bool(true) } + } + + mmio_alloc_arb.io.out.ready := io.req.valid && !cacheable + + io.resp <> resp_arb.io.out + io.req.ready := Mux(!cacheable, mmio_rdy, + Mux(idx_match, tag_match && sec_rdy, pri_rdy) && sdq_rdy) + io.secondary_miss := idx_match + io.refill := refillMux(io.mem_grant.bits.client_xact_id) + + val free_sdq = io.replay.fire() && isWrite(io.replay.bits.cmd) + io.replay.bits.data := sdq(RegEnable(replay_arb.io.out.bits.sdq_id, 
free_sdq)) + io.replay <> replay_arb.io.out + + when (io.replay.valid || sdq_enq) { + sdq_val := sdq_val & ~(UIntToOH(replay_arb.io.out.bits.sdq_id) & Fill(sdqDepth, free_sdq)) | + PriorityEncoderOH(~sdq_val(sdqDepth-1,0)) & Fill(sdqDepth, sdq_enq) + } +} + +class WritebackUnit(implicit p: Parameters) extends L1HellaCacheModule()(p) { + val io = new Bundle { + val req = Decoupled(new WritebackReq).flip + val meta_read = Decoupled(new L1MetaReadReq) + val data_req = Decoupled(new L1DataReadReq) + val data_resp = Bits(INPUT, encRowBits) + val release = Decoupled(new Release) + } + + val active = Reg(init=Bool(false)) + val r1_data_req_fired = Reg(init=Bool(false)) + val r2_data_req_fired = Reg(init=Bool(false)) + val data_req_cnt = Reg(init = UInt(0, width = log2Up(refillCycles+1))) //TODO Zero width + val buf_v = (if(refillCyclesPerBeat > 1) Reg(init=Bits(0, width = refillCyclesPerBeat-1)) else Bits(1)) + val beat_done = buf_v.andR + val (beat_cnt, all_beats_done) = Counter(io.release.fire(), outerDataBeats) + val req = Reg(new WritebackReq) + + io.release.valid := false + when (active) { + r1_data_req_fired := false + r2_data_req_fired := r1_data_req_fired + when (io.data_req.fire() && io.meta_read.fire()) { + r1_data_req_fired := true + data_req_cnt := data_req_cnt + 1 + } + when (r2_data_req_fired) { + io.release.valid := beat_done + when(beat_done) { + when(!io.release.ready) { + r1_data_req_fired := false + r2_data_req_fired := false + data_req_cnt := data_req_cnt - Mux[UInt](Bool(refillCycles > 1) && r1_data_req_fired, 2, 1) + } .otherwise { if(refillCyclesPerBeat > 1) buf_v := 0 } + } + when(!r1_data_req_fired) { + // We're done if this is the final data request and the Release can be sent + active := data_req_cnt < UInt(refillCycles) || !io.release.ready + } + } + } + when (io.req.fire()) { + active := true + data_req_cnt := 0 + if(refillCyclesPerBeat > 1) buf_v := 0 + req := io.req.bits + } + + io.req.ready := !active + + val req_idx = req.addr_block(idxBits-1, 0) + val fire = active && data_req_cnt < UInt(refillCycles) + + // We reissue the meta read as it sets up the mux ctrl for s2_data_muxed + io.meta_read.valid := fire + io.meta_read.bits.idx := req_idx + io.meta_read.bits.tag := req.addr_block >> idxBits + + io.data_req.valid := fire + io.data_req.bits.way_en := req.way_en + io.data_req.bits.addr := (if(refillCycles > 1) + Cat(req_idx, data_req_cnt(log2Up(refillCycles)-1,0)) + else req_idx) << rowOffBits + + io.release.bits := req + io.release.bits.addr_beat := beat_cnt + io.release.bits.data := (if(refillCyclesPerBeat > 1) { + // If the cache rows are narrower than a TLDataBeat, + // then buffer enough data_resps to make a whole beat + val data_buf = Reg(Bits()) + when(active && r2_data_req_fired && !beat_done) { + data_buf := Cat(io.data_resp, data_buf((refillCyclesPerBeat)*encRowBits-1, encRowBits)) + buf_v := (if(refillCyclesPerBeat > 2) + Cat(UInt(1), buf_v(refillCyclesPerBeat-2,1)) + else UInt(1)) + } + Cat(io.data_resp, data_buf) + } else { io.data_resp }) +} + +class ProbeUnit(implicit p: Parameters) extends L1HellaCacheModule()(p) { + val io = new Bundle { + val req = Decoupled(new ProbeInternal).flip + val rep = Decoupled(new Release) + val meta_read = Decoupled(new L1MetaReadReq) + val meta_write = Decoupled(new L1MetaWriteReq) + val wb_req = Decoupled(new WritebackReq) + val way_en = Bits(INPUT, nWays) + val mshr_rdy = Bool(INPUT) + val block_state = new ClientMetadata().asInput + } + + val (s_invalid :: s_meta_read :: s_meta_resp :: s_mshr_req :: + s_mshr_resp :: 
s_release :: s_writeback_req :: s_writeback_resp :: + s_meta_write :: Nil) = Enum(UInt(), 9) + val state = Reg(init=s_invalid) + val old_coh = Reg(new ClientMetadata) + val way_en = Reg(Bits()) + val req = Reg(new ProbeInternal) + val tag_matches = way_en.orR + + val miss_coh = ClientMetadata.onReset + val reply_coh = Mux(tag_matches, old_coh, miss_coh) + val reply = reply_coh.makeRelease(req) + io.req.ready := state === s_invalid + io.rep.valid := state === s_release + io.rep.bits := reply + + assert(!io.rep.valid || !io.rep.bits.hasData(), + "ProbeUnit should not send releases with data") + + io.meta_read.valid := state === s_meta_read + io.meta_read.bits.idx := req.addr_block + io.meta_read.bits.tag := req.addr_block >> idxBits + + io.meta_write.valid := state === s_meta_write + io.meta_write.bits.way_en := way_en + io.meta_write.bits.idx := req.addr_block + io.meta_write.bits.data.tag := req.addr_block >> idxBits + io.meta_write.bits.data.coh := old_coh.onProbe(req) + + io.wb_req.valid := state === s_writeback_req + io.wb_req.bits := reply + io.wb_req.bits.way_en := way_en + + // state === s_invalid + when (io.req.fire()) { + state := s_meta_read + req := io.req.bits + } + + // state === s_meta_read + when (io.meta_read.fire()) { + state := s_meta_resp + } + + // we need to wait one cycle for the metadata to be read from the array + when (state === s_meta_resp) { + state := s_mshr_req + } + + when (state === s_mshr_req) { + state := s_mshr_resp + old_coh := io.block_state + way_en := io.way_en + // if the read didn't go through, we need to retry + when (!io.mshr_rdy) { state := s_meta_read } + } + + when (state === s_mshr_resp) { + val needs_writeback = tag_matches && old_coh.requiresVoluntaryWriteback() + state := Mux(needs_writeback, s_writeback_req, s_release) + } + + when (state === s_release && io.rep.ready) { + state := Mux(tag_matches, s_meta_write, s_invalid) + } + + // state === s_writeback_req + when (io.wb_req.fire()) { + state := s_writeback_resp + } + + // wait for the writeback request to finish before updating the metadata + when (state === s_writeback_resp && io.wb_req.ready) { + state := s_meta_write + } + + when (io.meta_write.fire()) { + state := s_invalid + } +} + +class DataArray(implicit p: Parameters) extends L1HellaCacheModule()(p) { + val io = new Bundle { + val read = Decoupled(new L1DataReadReq).flip + val write = Decoupled(new L1DataWriteReq).flip + val resp = Vec(nWays, Bits(OUTPUT, encRowBits)) + } + + val waddr = io.write.bits.addr >> rowOffBits + val raddr = io.read.bits.addr >> rowOffBits + + if (doNarrowRead) { + for (w <- 0 until nWays by rowWords) { + val wway_en = io.write.bits.way_en(w+rowWords-1,w) + val rway_en = io.read.bits.way_en(w+rowWords-1,w) + val resp = Wire(Vec(rowWords, Bits(width = encRowBits))) + val r_raddr = RegEnable(io.read.bits.addr, io.read.valid) + for (p <- 0 until resp.size) { + val array = SeqMem(nSets*refillCycles, Vec(rowWords, Bits(width=encDataBits))) + when (wway_en.orR && io.write.valid && io.write.bits.wmask(p)) { + val data = Vec.fill(rowWords)(io.write.bits.data(encDataBits*(p+1)-1,encDataBits*p)) + array.write(waddr, data, wway_en.toBools) + } + resp(p) := array.read(raddr, rway_en.orR && io.read.valid).toBits + } + for (dw <- 0 until rowWords) { + val r = Vec(resp.map(_(encDataBits*(dw+1)-1,encDataBits*dw))) + val resp_mux = + if (r.size == 1) r + else Vec(r(r_raddr(rowOffBits-1,wordOffBits)), r.tail:_*) + io.resp(w+dw) := resp_mux.toBits + } + } + } else { + for (w <- 0 until nWays) { + val array = 
SeqMem(nSets*refillCycles, Vec(rowWords, Bits(width=encDataBits))) + when (io.write.bits.way_en(w) && io.write.valid) { + val data = Vec.tabulate(rowWords)(i => io.write.bits.data(encDataBits*(i+1)-1,encDataBits*i)) + array.write(waddr, data, io.write.bits.wmask.toBools) + } + io.resp(w) := array.read(raddr, io.read.bits.way_en(w) && io.read.valid).toBits + } + } + + io.read.ready := Bool(true) + io.write.ready := Bool(true) +} + +class HellaCache(implicit p: Parameters) extends L1HellaCacheModule()(p) { + val io = new Bundle { + val cpu = (new HellaCacheIO).flip + val ptw = new TLBPTWIO() + val mem = new ClientTileLinkIO + } + + require(isPow2(nWays)) // TODO: relax this + + val wb = Module(new WritebackUnit) + val prober = Module(new ProbeUnit) + val mshrs = Module(new MSHRFile) + + io.cpu.req.ready := Bool(true) + val s1_valid = Reg(next=io.cpu.req.fire(), init=Bool(false)) + val s1_req = Reg(io.cpu.req.bits) + val s1_valid_masked = s1_valid && !io.cpu.s1_kill + val s1_replay = Reg(init=Bool(false)) + val s1_clk_en = Reg(Bool()) + + val s2_valid = Reg(next=s1_valid_masked, init=Bool(false)) + val s2_req = Reg(io.cpu.req.bits) + val s2_replay = Reg(next=s1_replay, init=Bool(false)) && s2_req.cmd =/= M_FLUSH_ALL + val s2_recycle = Wire(Bool()) + val s2_valid_masked = Wire(Bool()) + + val s3_valid = Reg(init=Bool(false)) + val s3_req = Reg(io.cpu.req.bits) + val s3_way = Reg(Bits()) + + val s1_recycled = RegEnable(s2_recycle, Bool(false), s1_clk_en) + val s1_read = isRead(s1_req.cmd) + val s1_write = isWrite(s1_req.cmd) + val s1_readwrite = s1_read || s1_write || isPrefetch(s1_req.cmd) + + val dtlb = Module(new TLB) + io.ptw <> dtlb.io.ptw + dtlb.io.req.valid := s1_valid_masked && s1_readwrite + dtlb.io.req.bits.passthrough := s1_req.phys + dtlb.io.req.bits.vpn := s1_req.addr >> pgIdxBits + dtlb.io.req.bits.instruction := Bool(false) + dtlb.io.req.bits.store := s1_write + when (!dtlb.io.req.ready && !io.cpu.req.bits.phys) { io.cpu.req.ready := Bool(false) } + + when (io.cpu.req.valid) { + s1_req := io.cpu.req.bits + } + when (wb.io.meta_read.valid) { + s1_req.addr := Cat(wb.io.meta_read.bits.tag, wb.io.meta_read.bits.idx) << blockOffBits + s1_req.phys := Bool(true) + } + when (prober.io.meta_read.valid) { + s1_req.addr := Cat(prober.io.meta_read.bits.tag, prober.io.meta_read.bits.idx) << blockOffBits + s1_req.phys := Bool(true) + } + when (mshrs.io.replay.valid) { + s1_req := mshrs.io.replay.bits + } + when (s2_recycle) { + s1_req := s2_req + } + val s1_addr = Cat(dtlb.io.resp.ppn, s1_req.addr(pgIdxBits-1,0)) + + when (s1_clk_en) { + s2_req.typ := s1_req.typ + s2_req.phys := s1_req.phys + s2_req.addr := s1_addr + when (s1_write) { + s2_req.data := Mux(s1_replay, mshrs.io.replay.bits.data, io.cpu.s1_data) + } + when (s1_recycled) { s2_req.data := s1_req.data } + s2_req.tag := s1_req.tag + s2_req.cmd := s1_req.cmd + } + + val misaligned = new StoreGen(s1_req.typ, s1_req.addr, UInt(0), wordBytes).misaligned + io.cpu.xcpt.ma.ld := s1_read && misaligned + io.cpu.xcpt.ma.st := s1_write && misaligned + io.cpu.xcpt.pf.ld := s1_read && dtlb.io.resp.xcpt_ld + io.cpu.xcpt.pf.st := s1_write && dtlb.io.resp.xcpt_st + + assert (!(Reg(next= + (io.cpu.xcpt.ma.ld || io.cpu.xcpt.ma.st || io.cpu.xcpt.pf.ld || io.cpu.xcpt.pf.st)) && + s2_valid_masked), + "DCache exception occurred - cache response not killed.") + + // tags + def onReset = L1Metadata(UInt(0), ClientMetadata.onReset) + val meta = Module(new MetadataArray(onReset _)) + val metaReadArb = Module(new Arbiter(new MetaReadReq, 5)) + val metaWriteArb 
= Module(new Arbiter(new L1MetaWriteReq, 2)) + meta.io.read <> metaReadArb.io.out + meta.io.write <> metaWriteArb.io.out + + // data + val data = Module(new DataArray) + val readArb = Module(new Arbiter(new L1DataReadReq, 4)) + val writeArb = Module(new Arbiter(new L1DataWriteReq, 2)) + data.io.write.valid := writeArb.io.out.valid + writeArb.io.out.ready := data.io.write.ready + data.io.write.bits := writeArb.io.out.bits + val wdata_encoded = (0 until rowWords).map(i => code.encode(writeArb.io.out.bits.data(coreDataBits*(i+1)-1,coreDataBits*i))) + data.io.write.bits.data := wdata_encoded.toBits + + // tag read for new requests + metaReadArb.io.in(4).valid := io.cpu.req.valid + metaReadArb.io.in(4).bits.idx := io.cpu.req.bits.addr >> blockOffBits + when (!metaReadArb.io.in(4).ready) { io.cpu.req.ready := Bool(false) } + + // data read for new requests + readArb.io.in(3).valid := io.cpu.req.valid + readArb.io.in(3).bits.addr := io.cpu.req.bits.addr + readArb.io.in(3).bits.way_en := ~UInt(0, nWays) + when (!readArb.io.in(3).ready) { io.cpu.req.ready := Bool(false) } + + // recycled requests + metaReadArb.io.in(0).valid := s2_recycle + metaReadArb.io.in(0).bits.idx := s2_req.addr >> blockOffBits + readArb.io.in(0).valid := s2_recycle + readArb.io.in(0).bits.addr := s2_req.addr + readArb.io.in(0).bits.way_en := ~UInt(0, nWays) + + // tag check and way muxing + def wayMap[T <: Data](f: Int => T) = Vec((0 until nWays).map(f)) + val s1_tag_eq_way = wayMap((w: Int) => meta.io.resp(w).tag === (s1_addr >> untagBits)).toBits + val s1_tag_match_way = wayMap((w: Int) => s1_tag_eq_way(w) && meta.io.resp(w).coh.isValid()).toBits + s1_clk_en := metaReadArb.io.out.valid //TODO: should be metaReadArb.io.out.fire(), but triggers Verilog backend bug + val s1_writeback = s1_clk_en && !s1_valid && !s1_replay + val s2_tag_match_way = RegEnable(s1_tag_match_way, s1_clk_en) + val s2_tag_match = s2_tag_match_way.orR + val s2_hit_state = Mux1H(s2_tag_match_way, wayMap((w: Int) => RegEnable(meta.io.resp(w).coh, s1_clk_en))) + val s2_hit = s2_tag_match && + s2_hit_state.isHit(s2_req.cmd) && + s2_hit_state === s2_hit_state.onHit(s2_req.cmd) + + // load-reserved/store-conditional + val lrsc_count = Reg(init=UInt(0)) + val lrsc_valid = lrsc_count.orR + val lrsc_addr = Reg(UInt()) + val (s2_lr, s2_sc) = (s2_req.cmd === M_XLR, s2_req.cmd === M_XSC) + val s2_lrsc_addr_match = lrsc_valid && lrsc_addr === (s2_req.addr >> blockOffBits) + val s2_sc_fail = s2_sc && !s2_lrsc_addr_match + when (lrsc_valid) { lrsc_count := lrsc_count - 1 } + when (s2_valid_masked && s2_hit || s2_replay) { + when (s2_lr) { + when (!lrsc_valid) { lrsc_count := lrscCycles-1 } + lrsc_addr := s2_req.addr >> blockOffBits + } + when (s2_sc) { + lrsc_count := 0 + } + } + when (io.cpu.invalidate_lr) { lrsc_count := 0 } + + val s2_data = Wire(Vec(nWays, Bits(width=encRowBits))) + for (w <- 0 until nWays) { + val regs = Reg(Vec(rowWords, Bits(width = encDataBits))) + val en1 = s1_clk_en && s1_tag_eq_way(w) + for (i <- 0 until regs.size) { + val en = en1 && ((Bool(i == 0) || !Bool(doNarrowRead)) || s1_writeback) + when (en) { regs(i) := data.io.resp(w) >> encDataBits*i } + } + s2_data(w) := regs.toBits + } + val s2_data_muxed = Mux1H(s2_tag_match_way, s2_data) + val s2_data_decoded = (0 until rowWords).map(i => code.decode(s2_data_muxed(encDataBits*(i+1)-1,encDataBits*i))) + val s2_data_corrected = s2_data_decoded.map(_.corrected).toBits + val s2_data_uncorrected = s2_data_decoded.map(_.uncorrected).toBits + val s2_word_idx = if(doNarrowRead) UInt(0) else 
s2_req.addr(log2Up(rowWords*coreDataBytes)-1,log2Up(wordBytes)) + val s2_data_correctable = s2_data_decoded.map(_.correctable).toBits()(s2_word_idx) + + // store/amo hits + s3_valid := (s2_valid_masked && s2_hit || s2_replay) && !s2_sc_fail && isWrite(s2_req.cmd) + val amoalu = Module(new AMOALU) + when ((s2_valid || s2_replay) && (isWrite(s2_req.cmd) || s2_data_correctable)) { + s3_req := s2_req + s3_req.data := Mux(s2_data_correctable, s2_data_corrected, amoalu.io.out) + s3_way := s2_tag_match_way + } + + writeArb.io.in(0).bits.addr := s3_req.addr + writeArb.io.in(0).bits.wmask := UIntToOH(s3_req.addr.extract(rowOffBits-1,offsetlsb)) + writeArb.io.in(0).bits.data := Fill(rowWords, s3_req.data) + writeArb.io.in(0).valid := s3_valid + writeArb.io.in(0).bits.way_en := s3_way + + // replacement policy + val replacer = p(Replacer)() + val s1_replaced_way_en = UIntToOH(replacer.way) + val s2_replaced_way_en = UIntToOH(RegEnable(replacer.way, s1_clk_en)) + val s2_repl_meta = Mux1H(s2_replaced_way_en, wayMap((w: Int) => RegEnable(meta.io.resp(w), s1_clk_en && s1_replaced_way_en(w))).toSeq) + + // miss handling + mshrs.io.req.valid := s2_valid_masked && !s2_hit && (isPrefetch(s2_req.cmd) || isRead(s2_req.cmd) || isWrite(s2_req.cmd)) + mshrs.io.req.bits := s2_req + mshrs.io.req.bits.tag_match := s2_tag_match + mshrs.io.req.bits.old_meta := Mux(s2_tag_match, L1Metadata(s2_repl_meta.tag, s2_hit_state), s2_repl_meta) + mshrs.io.req.bits.way_en := Mux(s2_tag_match, s2_tag_match_way, s2_replaced_way_en) + mshrs.io.req.bits.data := s2_req.data + when (mshrs.io.req.fire()) { replacer.miss } + io.mem.acquire <> mshrs.io.mem_req + + // replays + readArb.io.in(1).valid := mshrs.io.replay.valid + readArb.io.in(1).bits := mshrs.io.replay.bits + readArb.io.in(1).bits.way_en := ~UInt(0, nWays) + mshrs.io.replay.ready := readArb.io.in(1).ready + s1_replay := mshrs.io.replay.valid && readArb.io.in(1).ready + metaReadArb.io.in(1) <> mshrs.io.meta_read + metaWriteArb.io.in(0) <> mshrs.io.meta_write + + // probes and releases + val releaseArb = Module(new LockingArbiter( + new Release, 2, outerDataBeats, + Some((r: Release) => r.hasMultibeatData()))) + io.mem.release <> releaseArb.io.out + + prober.io.req.valid := io.mem.probe.valid && !lrsc_valid + io.mem.probe.ready := prober.io.req.ready && !lrsc_valid + prober.io.req.bits := io.mem.probe.bits + releaseArb.io.in(1) <> prober.io.rep + prober.io.way_en := s2_tag_match_way + prober.io.block_state := s2_hit_state + metaReadArb.io.in(2) <> prober.io.meta_read + metaWriteArb.io.in(1) <> prober.io.meta_write + prober.io.mshr_rdy := mshrs.io.probe_rdy + + // refills + val narrow_grant = FlowThroughSerializer(io.mem.grant, refillCyclesPerBeat) + mshrs.io.mem_grant.valid := narrow_grant.fire() + mshrs.io.mem_grant.bits := narrow_grant.bits + narrow_grant.ready := writeArb.io.in(1).ready || !narrow_grant.bits.hasData() + /* The last clause here is necessary in order to prevent the responses for + * the IOMSHRs from being written into the data array. It works because the + * IOMSHR ids start right after the ones for the regular MSHRs.
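+ * (Editorial illustration; the concrete sizes are assumptions, not values
+ * fixed by this commit.) With nMSHRs = 4 and nIOMSHRs = 1, the regular
+ * MSHRs own client_xact_ids 0..3 and the IOMSHR owns id 4, so the
+ * client_xact_id < UInt(nMSHRs) test below accepts refill data only for
+ * ids 0..3 and filters out grants destined for the IOMSHR.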
*/ + writeArb.io.in(1).valid := narrow_grant.valid && narrow_grant.bits.hasData() && + narrow_grant.bits.client_xact_id < UInt(nMSHRs) + writeArb.io.in(1).bits.addr := mshrs.io.refill.addr + writeArb.io.in(1).bits.way_en := mshrs.io.refill.way_en + writeArb.io.in(1).bits.wmask := ~UInt(0, rowWords) + writeArb.io.in(1).bits.data := narrow_grant.bits.data(encRowBits-1,0) + data.io.read <> readArb.io.out + readArb.io.out.ready := !narrow_grant.valid || narrow_grant.ready // insert bubble if refill gets blocked + io.mem.finish <> mshrs.io.mem_finish + + // writebacks + val wbArb = Module(new Arbiter(new WritebackReq, 2)) + wbArb.io.in(0) <> prober.io.wb_req + wbArb.io.in(1) <> mshrs.io.wb_req + wb.io.req <> wbArb.io.out + metaReadArb.io.in(3) <> wb.io.meta_read + readArb.io.in(2) <> wb.io.data_req + wb.io.data_resp := s2_data_corrected + releaseArb.io.in(0) <> wb.io.release + + // store->load bypassing + val s4_valid = Reg(next=s3_valid, init=Bool(false)) + val s4_req = RegEnable(s3_req, s3_valid && metaReadArb.io.out.valid) + val bypasses = List( + ((s2_valid_masked || s2_replay) && !s2_sc_fail, s2_req, amoalu.io.out), + (s3_valid, s3_req, s3_req.data), + (s4_valid, s4_req, s4_req.data) + ).map(r => (r._1 && (s1_addr >> wordOffBits === r._2.addr >> wordOffBits) && isWrite(r._2.cmd), r._3)) + val s2_store_bypass_data = Reg(Bits(width = coreDataBits)) + val s2_store_bypass = Reg(Bool()) + when (s1_clk_en) { + s2_store_bypass := false + when (bypasses.map(_._1).reduce(_||_)) { + s2_store_bypass_data := PriorityMux(bypasses) + s2_store_bypass := true + } + } + + // load data subword mux/sign extension + val s2_data_word_prebypass = s2_data_uncorrected >> Cat(s2_word_idx, Bits(0,log2Up(coreDataBits))) + val s2_data_word = Mux(s2_store_bypass, s2_store_bypass_data, s2_data_word_prebypass) + val loadgen = new LoadGen(s2_req.typ, s2_req.addr, s2_data_word, s2_sc, wordBytes) + + amoalu.io.addr := s2_req.addr + amoalu.io.cmd := s2_req.cmd + amoalu.io.typ := s2_req.typ + amoalu.io.lhs := s2_data_word + amoalu.io.rhs := s2_req.data + + // nack it like it's hot + val s1_nack = dtlb.io.req.valid && dtlb.io.resp.miss || + s1_req.addr(idxMSB,idxLSB) === prober.io.meta_write.bits.idx && !prober.io.req.ready + val s2_nack_hit = RegEnable(s1_nack, s1_valid || s1_replay) + when (s2_nack_hit) { mshrs.io.req.valid := Bool(false) } + val s2_nack_victim = s2_hit && mshrs.io.secondary_miss + val s2_nack_miss = !s2_hit && !mshrs.io.req.ready + val s2_nack = s2_nack_hit || s2_nack_victim || s2_nack_miss + s2_valid_masked := s2_valid && !s2_nack + + val s2_recycle_ecc = (s2_valid || s2_replay) && s2_hit && s2_data_correctable + val s2_recycle_next = Reg(init=Bool(false)) + when (s1_valid || s1_replay) { s2_recycle_next := s2_recycle_ecc } + s2_recycle := s2_recycle_ecc || s2_recycle_next + + // after a nack, block until nack condition resolves to save energy + val block_miss = Reg(init=Bool(false)) + block_miss := (s2_valid || block_miss) && s2_nack_miss + when (block_miss) { + io.cpu.req.ready := Bool(false) + } + + val cache_resp = Wire(Valid(new HellaCacheResp)) + cache_resp.valid := (s2_replay || s2_valid_masked && s2_hit) && !s2_data_correctable + cache_resp.bits := s2_req + cache_resp.bits.has_data := isRead(s2_req.cmd) + cache_resp.bits.data := loadgen.data | s2_sc_fail + cache_resp.bits.store_data := s2_req.data + cache_resp.bits.replay := s2_replay + + val uncache_resp = Wire(Valid(new HellaCacheResp)) + uncache_resp.bits := mshrs.io.resp.bits + uncache_resp.valid := mshrs.io.resp.valid + mshrs.io.resp.ready := 
Reg(next= !(s1_valid || s1_replay)) + + io.cpu.s2_nack := s2_valid && s2_nack + io.cpu.resp := Mux(mshrs.io.resp.ready, uncache_resp, cache_resp) + io.cpu.resp.bits.data_word_bypass := loadgen.wordData + io.cpu.ordered := mshrs.io.fence_rdy && !s1_valid && !s2_valid + io.cpu.replay_next := (s1_replay && s1_read) || mshrs.io.replay_next +} + +/** + * This module buffers requests made by the SimpleHellaCacheIF in case they + * are nacked. Nacked requests must be replayed in order, and no other requests + * may go through until the replayed requests are successfully + * completed. + */ +class SimpleHellaCacheIFReplayQueue(depth: Int) + (implicit val p: Parameters) extends Module + with HasL1HellaCacheParameters { + val io = new Bundle { + val req = Decoupled(new HellaCacheReq).flip + val nack = Valid(Bits(width = coreDCacheReqTagBits)).flip + val resp = Valid(new HellaCacheResp).flip + val replay = Decoupled(new HellaCacheReq) + } + + // Registers to store the sent requests. + // When a request is sent the first time, + // it is stored in one of the reqs registers + // and the corresponding inflight bit is set. + // The reqs entry will be deallocated once the request is + // successfully completed. + val inflight = Reg(init = UInt(0, depth)) + val reqs = Reg(Vec(depth, new HellaCacheReq)) + + // The nack queue stores the index of nacked requests (in the reqs vector) + // in the order that they were nacked. A request is enqueued onto nackq + // when it is newly nacked (i.e. not a nack for a previous replay). + // The head of the nack queue will be replayed until it is + // successfully completed, at which time the request is dequeued. + // No new requests will be made or other replays attempted until the head + // of the nackq is successfully completed. + val nackq = Module(new Queue(UInt(width = log2Up(depth)), depth)) + val replaying = Reg(init = Bool(false)) + + val next_inflight_onehot = PriorityEncoderOH(~inflight) + val next_inflight = OHToUInt(next_inflight_onehot) + + val next_replay = nackq.io.deq.bits + val next_replay_onehot = UIntToOH(next_replay) + val next_replay_req = reqs(next_replay) + + // Keep sending the head of the nack queue until it succeeds + io.replay.valid := nackq.io.deq.valid && !replaying + io.replay.bits := next_replay_req + // Don't allow new requests if there are replays waiting + // or something being nacked.
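+ // (Editorial sketch, not part of the original commit.) The inflight
+ // update below is the standard one-hot bitmap allocator: take the lowest
+ // clear bit on allocation, clear the completing entry's bit on response.
+ // A minimal standalone form of the same idiom, with hypothetical names:
+ //
+ //   val busy  = Reg(init = UInt(0, depth))   // one bit per slot
+ //   val alloc = PriorityEncoderOH(~busy)     // lowest free slot, one-hot
+ //   val free  = Mux(done, UIntToOH(done_idx), UInt(0))
+ //   busy := (busy | Mux(do_alloc, alloc, UInt(0))) & ~free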
+ io.req.ready := !inflight.andR && !nackq.io.deq.valid && !io.nack.valid + + // Match on the tags to determine the index of nacks or responses + val nack_onehot = Cat(reqs.map(_.tag === io.nack.bits).reverse) & inflight + val resp_onehot = Cat(reqs.map(_.tag === io.resp.bits.tag).reverse) & inflight + + val replay_complete = io.resp.valid && replaying && io.resp.bits.tag === next_replay_req.tag + val nack_head = io.nack.valid && nackq.io.deq.valid && io.nack.bits === next_replay_req.tag + + // Enqueue to the nack queue if there is a nack that is not in response to + // the previous replay + nackq.io.enq.valid := io.nack.valid && !nack_head + nackq.io.enq.bits := OHToUInt(nack_onehot) + assert(!nackq.io.enq.valid || nackq.io.enq.ready, + "SimpleHellaCacheIF: ReplayQueue nack queue overflow") + + // Dequeue from the nack queue if the last replay was successfully completed + nackq.io.deq.ready := replay_complete + assert(!nackq.io.deq.ready || nackq.io.deq.valid, + "SimpleHellaCacheIF: ReplayQueue nack queue underflow") + + // Set inflight bit when a request is made + // Clear it when it is successfully completed + inflight := (inflight | Mux(io.req.fire(), next_inflight_onehot, UInt(0))) & + ~Mux(io.resp.valid, resp_onehot, UInt(0)) + + when (io.req.fire()) { + reqs(next_inflight) := io.req.bits + } + + // Only one replay outstanding at a time + when (io.replay.fire()) { replaying := Bool(true) } + when (nack_head || replay_complete) { replaying := Bool(false) } +} + +// exposes a sane decoupled request interface +class SimpleHellaCacheIF(implicit p: Parameters) extends Module +{ + val io = new Bundle { + val requestor = new HellaCacheIO().flip + val cache = new HellaCacheIO + } + + val replayq = Module(new SimpleHellaCacheIFReplayQueue(2)) + val req_arb = Module(new Arbiter(new HellaCacheReq, 2)) + + val req_helper = DecoupledHelper( + req_arb.io.in(1).ready, + replayq.io.req.ready, + io.requestor.req.valid) + + req_arb.io.in(0) <> replayq.io.replay + req_arb.io.in(1).valid := req_helper.fire(req_arb.io.in(1).ready) + req_arb.io.in(1).bits := io.requestor.req.bits + io.requestor.req.ready := req_helper.fire(io.requestor.req.valid) + replayq.io.req.valid := req_helper.fire(replayq.io.req.ready) + replayq.io.req.bits := io.requestor.req.bits + + val s0_req_fire = io.cache.req.fire() + val s1_req_fire = Reg(next = s0_req_fire) + val s2_req_fire = Reg(next = s1_req_fire) + val s1_req_tag = Reg(next = io.cache.req.bits.tag) + val s2_req_tag = Reg(next = s1_req_tag) + val s2_kill = Reg(next = io.cache.s1_kill) + + io.cache.invalidate_lr := io.requestor.invalidate_lr + io.cache.req <> req_arb.io.out + io.cache.req.bits.phys := Bool(true) + io.cache.s1_kill := io.cache.s2_nack + io.cache.s1_data := RegEnable(req_arb.io.out.bits.data, s0_req_fire) + + replayq.io.nack.valid := (io.cache.s2_nack || s2_kill) && s2_req_fire + replayq.io.nack.bits := s2_req_tag + replayq.io.resp := io.cache.resp + io.requestor.resp := io.cache.resp + + assert(!Reg(next = io.cache.req.fire()) || + !(io.cache.xcpt.ma.ld || io.cache.xcpt.ma.st || + io.cache.xcpt.pf.ld || io.cache.xcpt.pf.st), + "SimpleHellaCacheIF exception") +} diff --git a/rocket/src/main/scala/package.scala b/rocket/src/main/scala/package.scala new file mode 100644 index 00000000..30368040 --- /dev/null +++ b/rocket/src/main/scala/package.scala @@ -0,0 +1,4 @@ +// See LICENSE for license details. 
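+ // (Editorial note on the SimpleHellaCacheIF above; DecoupledHelper comes
+ // from junctions, and this reading of it is inferred from its use in this
+ // diff.) helper.fire(x) is the AND of every condition passed to the helper
+ // except x, which lets each ready/valid be driven without a combinational
+ // loop through itself:
+ //
+ //   val helper = DecoupledHelper(a.ready, b.ready, v.valid)
+ //   a.valid := helper.fire(a.ready)  // b.ready && v.valid
+ //   b.valid := helper.fire(b.ready)  // a.ready && v.valid
+ //   v.ready := helper.fire(v.valid)  // a.ready && b.ready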
+ +package object rocket extends + rocket.constants.ScalarOpConstants diff --git a/rocket/src/main/scala/ptw.scala b/rocket/src/main/scala/ptw.scala new file mode 100644 index 00000000..c5a64764 --- /dev/null +++ b/rocket/src/main/scala/ptw.scala @@ -0,0 +1,203 @@ +// See LICENSE for license details. + +package rocket + +import Chisel._ +import uncore.agents._ +import uncore.constants._ +import Util._ +import cde.{Parameters, Field} + +class PTWReq(implicit p: Parameters) extends CoreBundle()(p) { + val prv = Bits(width = 2) + val pum = Bool() + val mxr = Bool() + val addr = UInt(width = vpnBits) + val store = Bool() + val fetch = Bool() +} + +class PTWResp(implicit p: Parameters) extends CoreBundle()(p) { + val pte = new PTE +} + +class TLBPTWIO(implicit p: Parameters) extends CoreBundle()(p) { + val req = Decoupled(new PTWReq) + val resp = Valid(new PTWResp).flip + val ptbr = new PTBR().asInput + val invalidate = Bool(INPUT) + val status = new MStatus().asInput +} + +class DatapathPTWIO(implicit p: Parameters) extends CoreBundle()(p) { + val ptbr = new PTBR().asInput + val invalidate = Bool(INPUT) + val status = new MStatus().asInput +} + +class PTE(implicit p: Parameters) extends CoreBundle()(p) { + val reserved_for_hardware = Bits(width = 16) + val ppn = UInt(width = 38) + val reserved_for_software = Bits(width = 2) + val d = Bool() + val a = Bool() + val g = Bool() + val u = Bool() + val x = Bool() + val w = Bool() + val r = Bool() + val v = Bool() + + def table(dummy: Int = 0) = v && !r && !w && !x + def leaf(dummy: Int = 0) = v && (r || (x && !w)) + def ur(dummy: Int = 0) = sr() && u + def uw(dummy: Int = 0) = sw() && u + def ux(dummy: Int = 0) = sx() && u + def sr(dummy: Int = 0) = leaf() && r + def sw(dummy: Int = 0) = leaf() && w + def sx(dummy: Int = 0) = leaf() && x + + def access_ok(req: PTWReq) = { + val perm_ok = Mux(req.fetch, x, Mux(req.store, w, r || (x && req.mxr))) + val priv_ok = Mux(u, !req.pum, req.prv(0)) + leaf() && priv_ok && perm_ok + } +} + +class PTW(n: Int)(implicit p: Parameters) extends CoreModule()(p) { + val io = new Bundle { + val requestor = Vec(n, new TLBPTWIO).flip + val mem = new HellaCacheIO + val dpath = new DatapathPTWIO + } + + require(usingAtomics, "PTW requires atomic memory operations") + + val s_ready :: s_req :: s_wait :: s_set_dirty :: s_wait_dirty :: s_done :: Nil = Enum(UInt(), 6) + val state = Reg(init=s_ready) + val count = Reg(UInt(width = log2Up(pgLevels))) + + val r_req = Reg(new PTWReq) + val r_req_dest = Reg(Bits()) + val r_pte = Reg(new PTE) + + val vpn_idxs = (0 until pgLevels).map(i => (r_req.addr >> (pgLevels-i-1)*pgLevelBits)(pgLevelBits-1,0)) + val vpn_idx = vpn_idxs(count) + + val arb = Module(new RRArbiter(new PTWReq, n)) + arb.io.in <> io.requestor.map(_.req) + arb.io.out.ready := state === s_ready + + val pte = new PTE().fromBits(io.mem.resp.bits.data) + val pte_addr = Cat(r_pte.ppn, vpn_idx).toUInt << log2Up(xLen/8) + + when (arb.io.out.fire()) { + r_req := arb.io.out.bits + r_req_dest := arb.io.chosen + r_pte.ppn := io.dpath.ptbr.ppn + } + + val (pte_cache_hit, pte_cache_data) = { + val size = 1 << log2Up(pgLevels * 2) + val plru = new PseudoLRU(size) + val valid = Reg(init = UInt(0, size)) + val tags = Reg(Vec(size, UInt(width = paddrBits))) + val data = Reg(Vec(size, UInt(width = ppnBits))) + + val hits = tags.map(_ === pte_addr).toBits & valid + val hit = hits.orR + when (io.mem.resp.valid && pte.table() && !hit) { + val r = Mux(valid.andR, plru.replace, PriorityEncoder(~valid)) + valid := valid | UIntToOH(r) + 
tags(r) := pte_addr + data(r) := pte.ppn + } + when (hit && state === s_req) { plru.access(OHToUInt(hits)) } + when (io.dpath.invalidate) { valid := 0 } + + (hit, Mux1H(hits, data)) + } + + val set_dirty_bit = pte.access_ok(r_req) && (!pte.a || (r_req.store && !pte.d)) + when (io.mem.resp.valid && state === s_wait && !set_dirty_bit) { + r_pte := pte + } + + val pte_wdata = Wire(init=new PTE().fromBits(0)) + pte_wdata.a := true + pte_wdata.d := r_req.store + + io.mem.req.valid := state === s_req || state === s_set_dirty + io.mem.req.bits.phys := Bool(true) + io.mem.req.bits.cmd := Mux(state === s_set_dirty, M_XA_OR, M_XRD) + io.mem.req.bits.typ := MT_D + io.mem.req.bits.addr := pte_addr + io.mem.s1_data := pte_wdata.toBits + io.mem.s1_kill := Bool(false) + io.mem.invalidate_lr := Bool(false) + + val r_resp_ppn = io.mem.req.bits.addr >> pgIdxBits + val resp_ppns = (0 until pgLevels-1).map(i => Cat(r_resp_ppn >> pgLevelBits*(pgLevels-i-1), r_req.addr(pgLevelBits*(pgLevels-i-1)-1,0))) :+ r_resp_ppn + val resp_ppn = resp_ppns(count) + val resp_val = state === s_done + + for (i <- 0 until io.requestor.size) { + io.requestor(i).resp.valid := resp_val && (r_req_dest === i) + io.requestor(i).resp.bits.pte := r_pte + io.requestor(i).resp.bits.pte.ppn := resp_ppn + io.requestor(i).ptbr := io.dpath.ptbr + io.requestor(i).invalidate := io.dpath.invalidate + io.requestor(i).status := io.dpath.status + } + + // control state machine + switch (state) { + is (s_ready) { + when (arb.io.out.valid) { + state := s_req + } + count := UInt(0) + } + is (s_req) { + when (pte_cache_hit && count < pgLevels-1) { + io.mem.req.valid := false + state := s_req + count := count + 1 + r_pte.ppn := pte_cache_data + }.elsewhen (io.mem.req.ready) { + state := s_wait + } + } + is (s_wait) { + when (io.mem.s2_nack) { + state := s_req + } + when (io.mem.resp.valid) { + state := s_done + when (set_dirty_bit) { + state := s_set_dirty + } + when (pte.table() && count < pgLevels-1) { + state := s_req + count := count + 1 + } + } + } + is (s_set_dirty) { + when (io.mem.req.ready) { + state := s_wait_dirty + } + } + is (s_wait_dirty) { + when (io.mem.s2_nack) { + state := s_set_dirty + } + when (io.mem.resp.valid) { + state := s_req + } + } + is (s_done) { + state := s_ready + } + } +} diff --git a/rocket/src/main/scala/rocc.scala b/rocket/src/main/scala/rocc.scala new file mode 100644 index 00000000..ce0fcfbe --- /dev/null +++ b/rocket/src/main/scala/rocc.scala @@ -0,0 +1,303 @@ +// See LICENSE for license details. 
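+ // (Editorial note on the PTW in ptw.scala above; the Sv39 figures are an
+ // illustrative assumption.) Each walk step peels one pgLevelBits-wide
+ // index off the virtual page number and appends it to the current
+ // table's PPN:
+ //
+ //   pte_addr = Cat(r_pte.ppn, vpn_idx) << log2Up(xLen/8)
+ //
+ // With pgLevels = 3, pgLevelBits = 9, and xLen = 64, a 27-bit VPN is
+ // consumed 9 bits per level, and the shift by log2Up(64/8) = 3 keeps
+ // every PTE access 8-byte aligned.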
+ +package rocket + +import Chisel._ +import uncore.tilelink._ +import uncore.constants._ +import uncore.agents.CacheName +import Util._ +import cde.{Parameters, Field} + +case object RoccMaxTaggedMemXacts extends Field[Int] +case object RoccNMemChannels extends Field[Int] +case object RoccNPTWPorts extends Field[Int] +case object RoccNCSRs extends Field[Int] + +class RoCCCSRs(implicit p: Parameters) extends CoreBundle()(p) { + val rdata = Vec(nRoccCsrs, UInt(INPUT, xLen)) + val waddr = UInt(OUTPUT, CSR.ADDRSZ) + val wdata = UInt(OUTPUT, xLen) + val wen = Bool(OUTPUT) +} + +class RoCCInstruction extends Bundle +{ + val funct = Bits(width = 7) + val rs2 = Bits(width = 5) + val rs1 = Bits(width = 5) + val xd = Bool() + val xs1 = Bool() + val xs2 = Bool() + val rd = Bits(width = 5) + val opcode = Bits(width = 7) +} + +class RoCCCommand(implicit p: Parameters) extends CoreBundle()(p) { + val inst = new RoCCInstruction + val rs1 = Bits(width = xLen) + val rs2 = Bits(width = xLen) + val status = new MStatus +} + +class RoCCResponse(implicit p: Parameters) extends CoreBundle()(p) { + val rd = Bits(width = 5) + val data = Bits(width = xLen) +} + +class RoCCInterface(implicit p: Parameters) extends CoreBundle()(p) { + val cmd = Decoupled(new RoCCCommand).flip + val resp = Decoupled(new RoCCResponse) + val mem = new HellaCacheIO()(p.alterPartial({ case CacheName => "L1D" })) + val busy = Bool(OUTPUT) + val interrupt = Bool(OUTPUT) + + // These should be handled differently, eventually + val autl = new ClientUncachedTileLinkIO + val utl = Vec(p(RoccNMemChannels), new ClientUncachedTileLinkIO) + val ptw = Vec(p(RoccNPTWPorts), new TLBPTWIO) + val fpu_req = Decoupled(new FPInput) + val fpu_resp = Decoupled(new FPResult).flip + val exception = Bool(INPUT) + val csr = (new RoCCCSRs).flip + val host_id = UInt(INPUT, log2Up(nCores)) + + override def cloneType = new RoCCInterface().asInstanceOf[this.type] +} + +abstract class RoCC(implicit p: Parameters) extends CoreModule()(p) { + val io = new RoCCInterface + io.mem.req.bits.phys := Bool(true) // don't perform address translation +} + +class AccumulatorExample(n: Int = 4)(implicit p: Parameters) extends RoCC()(p) { + val regfile = Mem(n, UInt(width = xLen)) + val busy = Reg(init = Vec.fill(n){Bool(false)}) + + val cmd = Queue(io.cmd) + val funct = cmd.bits.inst.funct + val addr = cmd.bits.rs2(log2Up(n)-1,0) + val doWrite = funct === UInt(0) + val doRead = funct === UInt(1) + val doLoad = funct === UInt(2) + val doAccum = funct === UInt(3) + val memRespTag = io.mem.resp.bits.tag(log2Up(n)-1,0) + + // datapath + val addend = cmd.bits.rs1 + val accum = regfile(addr) + val wdata = Mux(doWrite, addend, accum + addend) + + when (cmd.fire() && (doWrite || doAccum)) { + regfile(addr) := wdata + } + + when (io.mem.resp.valid) { + regfile(memRespTag) := io.mem.resp.bits.data + } + + // control + when (io.mem.req.fire()) { + busy(addr) := Bool(true) + } + + when (io.mem.resp.valid) { + busy(memRespTag) := Bool(false) + } + + val doResp = cmd.bits.inst.xd + val stallReg = busy(addr) + val stallLoad = doLoad && !io.mem.req.ready + val stallResp = doResp && !io.resp.ready + + cmd.ready := !stallReg && !stallLoad && !stallResp + // command resolved if no stalls AND not issuing a load that will need a request + + // PROC RESPONSE INTERFACE + io.resp.valid := cmd.valid && doResp && !stallReg && !stallLoad + // valid response if valid command, need a response, and no stalls + io.resp.bits.rd := cmd.bits.inst.rd + // Must respond with the appropriate tag or undefined 
behavior + io.resp.bits.data := accum + // Semantics is to always send out prior accumulator register value + + io.busy := cmd.valid || busy.reduce(_||_) + // Be busy when have pending memory requests or committed possibility of pending requests + io.interrupt := Bool(false) + // Set this true to trigger an interrupt on the processor (please refer to supervisor documentation) + + // MEMORY REQUEST INTERFACE + io.mem.req.valid := cmd.valid && doLoad && !stallReg && !stallResp + io.mem.req.bits.addr := addend + io.mem.req.bits.tag := addr + io.mem.req.bits.cmd := M_XRD // perform a load (M_XWR for stores) + io.mem.req.bits.typ := MT_D // D = 8 bytes, W = 4, H = 2, B = 1 + io.mem.req.bits.data := Bits(0) // we're not performing any stores... + io.mem.invalidate_lr := false + + io.autl.acquire.valid := false + io.autl.grant.ready := false +} + +class TranslatorExample(implicit p: Parameters) extends RoCC()(p) { + val req_addr = Reg(UInt(width = coreMaxAddrBits)) + val req_rd = Reg(io.resp.bits.rd) + val req_offset = req_addr(pgIdxBits - 1, 0) + val req_vpn = req_addr(coreMaxAddrBits - 1, pgIdxBits) + val pte = Reg(new PTE) + + val s_idle :: s_ptw_req :: s_ptw_resp :: s_resp :: Nil = Enum(Bits(), 4) + val state = Reg(init = s_idle) + + io.cmd.ready := (state === s_idle) + + when (io.cmd.fire()) { + req_rd := io.cmd.bits.inst.rd + req_addr := io.cmd.bits.rs1 + state := s_ptw_req + } + + private val ptw = io.ptw(0) + + when (ptw.req.fire()) { state := s_ptw_resp } + + when (state === s_ptw_resp && ptw.resp.valid) { + pte := ptw.resp.bits.pte + state := s_resp + } + + when (io.resp.fire()) { state := s_idle } + + ptw.req.valid := (state === s_ptw_req) + ptw.req.bits.addr := req_vpn + ptw.req.bits.store := Bool(false) + ptw.req.bits.fetch := Bool(false) + + io.resp.valid := (state === s_resp) + io.resp.bits.rd := req_rd + io.resp.bits.data := Mux(pte.leaf(), Cat(pte.ppn, req_offset), ~UInt(0, xLen)) + + io.busy := (state =/= s_idle) + io.interrupt := Bool(false) + io.mem.req.valid := Bool(false) + io.mem.invalidate_lr := Bool(false) + io.autl.acquire.valid := Bool(false) + io.autl.grant.ready := Bool(false) +} + +class CharacterCountExample(implicit p: Parameters) extends RoCC()(p) + with HasTileLinkParameters { + + private val blockOffset = tlBeatAddrBits + tlByteAddrBits + + val needle = Reg(UInt(width = 8)) + val addr = Reg(UInt(width = coreMaxAddrBits)) + val count = Reg(UInt(width = xLen)) + val resp_rd = Reg(io.resp.bits.rd) + + val addr_block = addr(coreMaxAddrBits - 1, blockOffset) + val offset = addr(blockOffset - 1, 0) + val next_addr = (addr_block + UInt(1)) << UInt(blockOffset) + + val s_idle :: s_acq :: s_gnt :: s_check :: s_resp :: Nil = Enum(Bits(), 5) + val state = Reg(init = s_idle) + + val gnt = io.autl.grant.bits + val recv_data = Reg(UInt(width = tlDataBits)) + val recv_beat = Reg(UInt(width = tlBeatAddrBits)) + + val data_bytes = Vec.tabulate(tlDataBytes) { i => recv_data(8 * (i + 1) - 1, 8 * i) } + val zero_match = data_bytes.map(_ === UInt(0)) + val needle_match = data_bytes.map(_ === needle) + val first_zero = PriorityEncoder(zero_match) + + val chars_found = PopCount(needle_match.zipWithIndex.map { + case (matches, i) => + val idx = Cat(recv_beat, UInt(i, tlByteAddrBits)) + matches && idx >= offset && UInt(i) <= first_zero + }) + val zero_found = zero_match.reduce(_ || _) + val finished = Reg(Bool()) + + io.cmd.ready := (state === s_idle) + io.resp.valid := (state === s_resp) + io.resp.bits.rd := resp_rd + io.resp.bits.data := count + io.autl.acquire.valid := (state === 
s_acq) + io.autl.acquire.bits := GetBlock(addr_block = addr_block) + io.autl.grant.ready := (state === s_gnt) + + when (io.cmd.fire()) { + addr := io.cmd.bits.rs1 + needle := io.cmd.bits.rs2 + resp_rd := io.cmd.bits.inst.rd + count := UInt(0) + finished := Bool(false) + state := s_acq + } + + when (io.autl.acquire.fire()) { state := s_gnt } + + when (io.autl.grant.fire()) { + recv_beat := gnt.addr_beat + recv_data := gnt.data + state := s_check + } + + when (state === s_check) { + when (!finished) { + count := count + chars_found + } + when (zero_found) { finished := Bool(true) } + when (recv_beat === UInt(tlDataBeats - 1)) { + addr := next_addr + state := Mux(zero_found || finished, s_resp, s_acq) + } .otherwise { + state := s_gnt + } + } + + when (io.resp.fire()) { state := s_idle } + + io.busy := (state =/= s_idle) + io.interrupt := Bool(false) + io.mem.req.valid := Bool(false) + io.mem.invalidate_lr := Bool(false) +} + +class OpcodeSet(val opcodes: Seq[UInt]) { + def |(set: OpcodeSet) = + new OpcodeSet(this.opcodes ++ set.opcodes) + + def matches(oc: UInt) = opcodes.map(_ === oc).reduce(_ || _) +} + +object OpcodeSet { + val custom0 = new OpcodeSet(Seq(Bits("b0001011"))) + val custom1 = new OpcodeSet(Seq(Bits("b0101011"))) + val custom2 = new OpcodeSet(Seq(Bits("b1011011"))) + val custom3 = new OpcodeSet(Seq(Bits("b1111011"))) + val all = custom0 | custom1 | custom2 | custom3 +} + +class RoccCommandRouter(opcodes: Seq[OpcodeSet])(implicit p: Parameters) + extends CoreModule()(p) { + val io = new Bundle { + val in = Decoupled(new RoCCCommand).flip + val out = Vec(opcodes.size, Decoupled(new RoCCCommand)) + val busy = Bool(OUTPUT) + } + + val cmd = Queue(io.in) + val cmdReadys = io.out.zip(opcodes).map { case (out, opcode) => + val me = opcode.matches(cmd.bits.inst.opcode) + out.valid := cmd.valid && me + out.bits := cmd.bits + out.ready && me + } + cmd.ready := cmdReadys.reduce(_ || _) + io.busy := cmd.valid + + assert(PopCount(cmdReadys) <= UInt(1), + "Custom opcode matched for more than one accelerator") +} diff --git a/rocket/src/main/scala/rocket.scala b/rocket/src/main/scala/rocket.scala new file mode 100644 index 00000000..7756ab18 --- /dev/null +++ b/rocket/src/main/scala/rocket.scala @@ -0,0 +1,679 @@ +// See LICENSE for license details. 
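+ // (Editorial sketch of the RoccCommandRouter above; the accelerator
+ // assignment is hypothetical.) OpcodeSets compose with |, and the router
+ // steers each command to the single output whose set matches its major
+ // opcode:
+ //
+ //   val router = Module(new RoccCommandRouter(Seq(
+ //     OpcodeSet.custom0,                     // accelerator 0
+ //     OpcodeSet.custom1 | OpcodeSet.custom2  // accelerator 1
+ //   )))
+ //   // router.io.out(0) carries custom0 commands;
+ //   // router.io.out(1) carries custom1 and custom2 commands.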
+ +package rocket + +import Chisel._ +import junctions._ +import uncore.devices._ +import uncore.agents.CacheName +import uncore.constants._ +import Util._ +import cde.{Parameters, Field} + +case object UseFPU extends Field[Boolean] +case object FDivSqrt extends Field[Boolean] +case object XLen extends Field[Int] +case object FetchWidth extends Field[Int] +case object RetireWidth extends Field[Int] +case object UseVM extends Field[Boolean] +case object UseUser extends Field[Boolean] +case object UseDebug extends Field[Boolean] +case object UseAtomics extends Field[Boolean] +case object UsePerfCounters extends Field[Boolean] +case object FastLoadWord extends Field[Boolean] +case object FastLoadByte extends Field[Boolean] +case object MulUnroll extends Field[Int] +case object DivEarlyOut extends Field[Boolean] +case object CoreInstBits extends Field[Int] +case object CoreDataBits extends Field[Int] +case object CoreDCacheReqTagBits extends Field[Int] +case object NCustomMRWCSRs extends Field[Int] +case object MtvecWritable extends Field[Boolean] +case object MtvecInit extends Field[BigInt] +case object ResetVector extends Field[BigInt] +case object NBreakpoints extends Field[Int] + +trait HasCoreParameters extends HasAddrMapParameters { + implicit val p: Parameters + val xLen = p(XLen) + + val usingVM = p(UseVM) + val usingUser = p(UseUser) + val usingDebug = p(UseDebug) + val usingFPU = p(UseFPU) + val usingAtomics = p(UseAtomics) + val usingFDivSqrt = p(FDivSqrt) + val usingRoCC = !p(BuildRoCC).isEmpty + val mulUnroll = p(MulUnroll) + val divEarlyOut = p(DivEarlyOut) + val fastLoadWord = p(FastLoadWord) + val fastLoadByte = p(FastLoadByte) + + val retireWidth = p(RetireWidth) + val fetchWidth = p(FetchWidth) + val coreInstBits = p(CoreInstBits) + val coreInstBytes = coreInstBits/8 + val coreDataBits = xLen + val coreDataBytes = coreDataBits/8 + val coreDCacheReqTagBits = 7 + (2 + (if(!usingRoCC) 0 else 1)) + val vpnBitsExtended = vpnBits + (vaddrBits < xLen).toInt + val vaddrBitsExtended = vpnBitsExtended + pgIdxBits + val coreMaxAddrBits = paddrBits max vaddrBitsExtended + val nCustomMrwCsrs = p(NCustomMRWCSRs) + val roccCsrs = if (p(BuildRoCC).isEmpty) Nil + else p(BuildRoCC).flatMap(_.csrs) + val nRoccCsrs = p(RoccNCSRs) + val nCores = p(NTiles) + + // Print out log of committed instructions and their writeback values. + // Requires post-processing due to out-of-order writebacks. 
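+ // (Editorial note.) enableCommitLog below is an elaboration-time Scala
+ // constant: setting it to true changes what gets generated, so it is a
+ // rebuild-time switch, not something software can toggle at run time.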
+ val enableCommitLog = false + val usingPerfCounters = p(UsePerfCounters) + + val maxPAddrBits = xLen match { + case 32 => 34 + case 64 => 50 + } + + require(paddrBits < maxPAddrBits) + require(!fastLoadByte || fastLoadWord) +} + +abstract class CoreModule(implicit val p: Parameters) extends Module + with HasCoreParameters +abstract class CoreBundle(implicit val p: Parameters) extends ParameterizedBundle()(p) + with HasCoreParameters + +class RegFile(n: Int, w: Int, zero: Boolean = false) { + private val rf = Mem(n, UInt(width = w)) + private def access(addr: UInt) = rf(~addr(log2Up(n)-1,0)) + private val reads = collection.mutable.ArrayBuffer[(UInt,UInt)]() + private var canRead = true + def read(addr: UInt) = { + require(canRead) + reads += addr -> Wire(UInt()) + reads.last._2 := Mux(Bool(zero) && addr === UInt(0), UInt(0), access(addr)) + reads.last._2 + } + def write(addr: UInt, data: UInt) = { + canRead = false + when (addr =/= UInt(0)) { + access(addr) := data + for ((raddr, rdata) <- reads) + when (addr === raddr) { rdata := data } + } + } +} + +object ImmGen { + def apply(sel: UInt, inst: UInt) = { + val sign = Mux(sel === IMM_Z, SInt(0), inst(31).toSInt) + val b30_20 = Mux(sel === IMM_U, inst(30,20).toSInt, sign) + val b19_12 = Mux(sel =/= IMM_U && sel =/= IMM_UJ, sign, inst(19,12).toSInt) + val b11 = Mux(sel === IMM_U || sel === IMM_Z, SInt(0), + Mux(sel === IMM_UJ, inst(20).toSInt, + Mux(sel === IMM_SB, inst(7).toSInt, sign))) + val b10_5 = Mux(sel === IMM_U || sel === IMM_Z, Bits(0), inst(30,25)) + val b4_1 = Mux(sel === IMM_U, Bits(0), + Mux(sel === IMM_S || sel === IMM_SB, inst(11,8), + Mux(sel === IMM_Z, inst(19,16), inst(24,21)))) + val b0 = Mux(sel === IMM_S, inst(7), + Mux(sel === IMM_I, inst(20), + Mux(sel === IMM_Z, inst(15), Bits(0)))) + + Cat(sign, b30_20, b19_12, b11, b10_5, b4_1, b0).toSInt + } +} + +class Rocket(implicit p: Parameters) extends CoreModule()(p) { + val io = new Bundle { + val prci = new PRCITileIO().flip + val imem = new FrontendIO()(p.alterPartial({case CacheName => "L1I" })) + val dmem = new HellaCacheIO()(p.alterPartial({ case CacheName => "L1D" })) + val ptw = new DatapathPTWIO().flip + val fpu = new FPUIO().flip + val rocc = new RoCCInterface().flip + } + + val decode_table = { + (if (true) new MDecode +: (if (xLen > 32) Seq(new M64Decode) else Nil) else Nil) ++: + (if (usingAtomics) new ADecode +: (if (xLen > 32) Seq(new A64Decode) else Nil) else Nil) ++: + (if (usingFPU) new FDecode +: (if (xLen > 32) Seq(new F64Decode) else Nil) else Nil) ++: + (if (usingFPU && usingFDivSqrt) Some(new FDivSqrtDecode) else None) ++: + (if (usingRoCC) Some(new RoCCDecode) else None) ++: + (if (xLen > 32) Some(new I64Decode) else None) ++: + (if (usingVM) Some(new SDecode) else None) ++: + (if (usingDebug) Some(new DebugDecode) else None) ++: + Seq(new IDecode) + } flatMap(_.table) + + val ex_ctrl = Reg(new IntCtrlSigs) + val mem_ctrl = Reg(new IntCtrlSigs) + val wb_ctrl = Reg(new IntCtrlSigs) + + val ex_reg_xcpt_interrupt = Reg(Bool()) + val ex_reg_valid = Reg(Bool()) + val ex_reg_btb_hit = Reg(Bool()) + val ex_reg_btb_resp = Reg(io.imem.btb_resp.bits) + val ex_reg_xcpt = Reg(Bool()) + val ex_reg_flush_pipe = Reg(Bool()) + val ex_reg_load_use = Reg(Bool()) + val ex_reg_cause = Reg(UInt()) + val ex_reg_replay = Reg(Bool()) + val ex_reg_pc = Reg(UInt()) + val ex_reg_inst = Reg(Bits()) + + val mem_reg_xcpt_interrupt = Reg(Bool()) + val mem_reg_valid = Reg(Bool()) + val mem_reg_btb_hit = Reg(Bool()) + val mem_reg_btb_resp = Reg(io.imem.btb_resp.bits) + val 
mem_reg_xcpt = Reg(Bool()) + val mem_reg_replay = Reg(Bool()) + val mem_reg_flush_pipe = Reg(Bool()) + val mem_reg_cause = Reg(UInt()) + val mem_reg_slow_bypass = Reg(Bool()) + val mem_reg_load = Reg(Bool()) + val mem_reg_store = Reg(Bool()) + val mem_reg_pc = Reg(UInt()) + val mem_reg_inst = Reg(Bits()) + val mem_reg_wdata = Reg(Bits()) + val mem_reg_rs2 = Reg(Bits()) + val take_pc_mem = Wire(Bool()) + + val wb_reg_valid = Reg(Bool()) + val wb_reg_xcpt = Reg(Bool()) + val wb_reg_mem_xcpt = Reg(Bool()) + val wb_reg_replay = Reg(Bool()) + val wb_reg_cause = Reg(UInt()) + val wb_reg_pc = Reg(UInt()) + val wb_reg_inst = Reg(Bits()) + val wb_reg_wdata = Reg(Bits()) + val wb_reg_rs2 = Reg(Bits()) + val take_pc_wb = Wire(Bool()) + + val take_pc_mem_wb = take_pc_wb || take_pc_mem + val take_pc = take_pc_mem_wb + + // decode stage + val id_pc = io.imem.resp.bits.pc + val id_inst = io.imem.resp.bits.data(0).toBits; require(fetchWidth == 1) + val id_ctrl = Wire(new IntCtrlSigs()).decode(id_inst, decode_table) + val id_raddr3 = id_inst(31,27) + val id_raddr2 = id_inst(24,20) + val id_raddr1 = id_inst(19,15) + val id_waddr = id_inst(11,7) + val id_load_use = Wire(Bool()) + val id_reg_fence = Reg(init=Bool(false)) + val id_ren = IndexedSeq(id_ctrl.rxs1, id_ctrl.rxs2) + val id_raddr = IndexedSeq(id_raddr1, id_raddr2) + val rf = new RegFile(31, xLen) + val id_rs = id_raddr.map(rf.read _) + val ctrl_killd = Wire(Bool()) + + val csr = Module(new CSRFile) + val id_csr_en = id_ctrl.csr =/= CSR.N + val id_system_insn = id_ctrl.csr === CSR.I + val id_csr_ren = (id_ctrl.csr === CSR.S || id_ctrl.csr === CSR.C) && id_raddr1 === UInt(0) + val id_csr = Mux(id_csr_ren, CSR.R, id_ctrl.csr) + val id_csr_addr = id_inst(31,20) + // this is overly conservative + val safe_csrs = CSRs.sscratch :: CSRs.sepc :: CSRs.mscratch :: CSRs.mepc :: CSRs.mcause :: CSRs.mbadaddr :: Nil + val legal_csrs = collection.mutable.LinkedHashSet(CSRs.all:_*) + val id_csr_flush = id_system_insn || (id_csr_en && !id_csr_ren && !DecodeLogic(id_csr_addr, safe_csrs.map(UInt(_)), (legal_csrs -- safe_csrs).toList.map(UInt(_)))) + + val id_illegal_insn = !id_ctrl.legal || + id_ctrl.fp && !csr.io.status.fs.orR || + id_ctrl.rocc && !csr.io.status.xs.orR + // stall decode for fences (now, for AMO.aq; later, for AMO.rl and FENCE) + val id_amo_aq = id_inst(26) + val id_amo_rl = id_inst(25) + val id_fence_next = id_ctrl.fence || id_ctrl.amo && id_amo_rl + val id_mem_busy = !io.dmem.ordered || io.dmem.req.valid + val id_rocc_busy = Bool(usingRoCC) && + (io.rocc.busy || ex_reg_valid && ex_ctrl.rocc || + mem_reg_valid && mem_ctrl.rocc || wb_reg_valid && wb_ctrl.rocc) + id_reg_fence := id_fence_next || id_reg_fence && id_mem_busy + val id_do_fence = id_rocc_busy && id_ctrl.fence || + id_mem_busy && (id_ctrl.amo && id_amo_aq || id_ctrl.fence_i || id_reg_fence && (id_ctrl.mem || id_ctrl.rocc) || id_csr_en) + + val bpu = Module(new BreakpointUnit) + bpu.io.status := csr.io.status + bpu.io.bp := csr.io.bp + bpu.io.pc := id_pc + bpu.io.ea := mem_reg_wdata + + val (id_xcpt, id_cause) = checkExceptions(List( + (csr.io.interrupt, csr.io.interrupt_cause), + (bpu.io.xcpt_if, UInt(Causes.breakpoint)), + (io.imem.resp.bits.xcpt_if, UInt(Causes.fault_fetch)), + (id_illegal_insn, UInt(Causes.illegal_instruction)))) + + val dcache_bypass_data = + if (fastLoadByte) io.dmem.resp.bits.data + else if (fastLoadWord) io.dmem.resp.bits.data_word_bypass + else wb_reg_wdata + + // detect bypass opportunities + val ex_waddr = ex_reg_inst(11,7) + val mem_waddr = mem_reg_inst(11,7) + val 
wb_waddr = wb_reg_inst(11,7) + val bypass_sources = IndexedSeq( + (Bool(true), UInt(0), UInt(0)), // treat reading x0 as a bypass + (ex_reg_valid && ex_ctrl.wxd, ex_waddr, mem_reg_wdata), + (mem_reg_valid && mem_ctrl.wxd && !mem_ctrl.mem, mem_waddr, wb_reg_wdata), + (mem_reg_valid && mem_ctrl.wxd, mem_waddr, dcache_bypass_data)) + val id_bypass_src = id_raddr.map(raddr => bypass_sources.map(s => s._1 && s._2 === raddr)) + + // execute stage + val bypass_mux = Vec(bypass_sources.map(_._3)) + val ex_reg_rs_bypass = Reg(Vec(id_raddr.size, Bool())) + val ex_reg_rs_lsb = Reg(Vec(id_raddr.size, UInt())) + val ex_reg_rs_msb = Reg(Vec(id_raddr.size, UInt())) + val ex_rs = for (i <- 0 until id_raddr.size) + yield Mux(ex_reg_rs_bypass(i), bypass_mux(ex_reg_rs_lsb(i)), Cat(ex_reg_rs_msb(i), ex_reg_rs_lsb(i))) + val ex_imm = ImmGen(ex_ctrl.sel_imm, ex_reg_inst) + val ex_op1 = MuxLookup(ex_ctrl.sel_alu1, SInt(0), Seq( + A1_RS1 -> ex_rs(0).toSInt, + A1_PC -> ex_reg_pc.toSInt)) + val ex_op2 = MuxLookup(ex_ctrl.sel_alu2, SInt(0), Seq( + A2_RS2 -> ex_rs(1).toSInt, + A2_IMM -> ex_imm, + A2_FOUR -> SInt(4))) + + val alu = Module(new ALU) + alu.io.dw := ex_ctrl.alu_dw + alu.io.fn := ex_ctrl.alu_fn + alu.io.in2 := ex_op2.toUInt + alu.io.in1 := ex_op1.toUInt + + // multiplier and divider + val div = Module(new MulDiv(width = xLen, + unroll = mulUnroll, + earlyOut = divEarlyOut)) + + div.io.req.valid := ex_reg_valid && ex_ctrl.div + div.io.req.bits.dw := ex_ctrl.alu_dw + div.io.req.bits.fn := ex_ctrl.alu_fn + div.io.req.bits.in1 := ex_rs(0) + div.io.req.bits.in2 := ex_rs(1) + div.io.req.bits.tag := ex_waddr + + ex_reg_valid := !ctrl_killd + ex_reg_replay := !take_pc && io.imem.resp.valid && io.imem.resp.bits.replay + ex_reg_xcpt := !ctrl_killd && id_xcpt + ex_reg_xcpt_interrupt := !take_pc && io.imem.resp.valid && csr.io.interrupt + when (id_xcpt) { ex_reg_cause := id_cause } + + when (!ctrl_killd) { + ex_ctrl := id_ctrl + ex_ctrl.csr := id_csr + ex_reg_btb_hit := io.imem.btb_resp.valid + when (io.imem.btb_resp.valid) { ex_reg_btb_resp := io.imem.btb_resp.bits } + ex_reg_flush_pipe := id_ctrl.fence_i || id_csr_flush || csr.io.singleStep + ex_reg_load_use := id_load_use + + when (id_ctrl.jalr && csr.io.status.debug) { + ex_reg_flush_pipe := true + ex_ctrl.fence_i := true + } + + for (i <- 0 until id_raddr.size) { + val do_bypass = id_bypass_src(i).reduce(_||_) + val bypass_src = PriorityEncoder(id_bypass_src(i)) + ex_reg_rs_bypass(i) := do_bypass + ex_reg_rs_lsb(i) := bypass_src + when (id_ren(i) && !do_bypass) { + ex_reg_rs_lsb(i) := id_rs(i)(bypass_src.getWidth-1,0) + ex_reg_rs_msb(i) := id_rs(i) >> bypass_src.getWidth + } + } + } + when (!ctrl_killd || csr.io.interrupt || io.imem.resp.bits.replay) { + ex_reg_inst := id_inst + ex_reg_pc := id_pc + } + + // replay inst in ex stage? 
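+ // A replay re-fetches and re-executes this instruction rather than raising
+ // an exception. ex_reg_replay carries forward a replay requested by the
+ // front end; the ex-stage causes are a structural hazard (the D$ or the
+ // divider cannot accept a request this cycle) or a load-use dependence on
+ // a load that missed in the D$, observed in writeback as wb_dcache_miss.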
+ val ex_pc_valid = ex_reg_valid || ex_reg_replay || ex_reg_xcpt_interrupt + val wb_dcache_miss = wb_ctrl.mem && !io.dmem.resp.valid + val replay_ex_structural = ex_ctrl.mem && !io.dmem.req.ready || + ex_ctrl.div && !div.io.req.ready + val replay_ex_load_use = wb_dcache_miss && ex_reg_load_use + val replay_ex = ex_reg_replay || (ex_reg_valid && (replay_ex_structural || replay_ex_load_use)) + val ctrl_killx = take_pc_mem_wb || replay_ex || !ex_reg_valid + // detect 2-cycle load-use delay for LB/LH/SC + val ex_slow_bypass = ex_ctrl.mem_cmd === M_XSC || Vec(MT_B, MT_BU, MT_H, MT_HU).contains(ex_ctrl.mem_type) + + val (ex_xcpt, ex_cause) = checkExceptions(List( + (ex_reg_xcpt_interrupt || ex_reg_xcpt, ex_reg_cause), + (ex_ctrl.fp && io.fpu.illegal_rm, UInt(Causes.illegal_instruction)))) + + // memory stage + val mem_br_taken = mem_reg_wdata(0) + val mem_br_target = mem_reg_pc.toSInt + + Mux(mem_ctrl.branch && mem_br_taken, ImmGen(IMM_SB, mem_reg_inst), + Mux(mem_ctrl.jal, ImmGen(IMM_UJ, mem_reg_inst), SInt(4))) + val mem_int_wdata = Mux(mem_ctrl.jalr, mem_br_target, mem_reg_wdata.toSInt).toUInt + val mem_npc = (Mux(mem_ctrl.jalr, encodeVirtualAddress(mem_reg_wdata, mem_reg_wdata).toSInt, mem_br_target) & SInt(-2)).toUInt + val mem_wrong_npc = Mux(ex_pc_valid, mem_npc =/= ex_reg_pc, Mux(io.imem.resp.valid, mem_npc =/= id_pc, Bool(true))) + val mem_npc_misaligned = mem_npc(1) + val mem_cfi = mem_ctrl.branch || mem_ctrl.jalr || mem_ctrl.jal + val mem_cfi_taken = (mem_ctrl.branch && mem_br_taken) || mem_ctrl.jalr || mem_ctrl.jal + val mem_misprediction = + if (p(BtbKey).nEntries == 0) mem_cfi_taken + else mem_wrong_npc + val want_take_pc_mem = mem_reg_valid && (mem_misprediction || mem_reg_flush_pipe) + take_pc_mem := want_take_pc_mem && !mem_npc_misaligned + + mem_reg_valid := !ctrl_killx + mem_reg_replay := !take_pc_mem_wb && replay_ex + mem_reg_xcpt := !ctrl_killx && ex_xcpt + mem_reg_xcpt_interrupt := !take_pc_mem_wb && ex_reg_xcpt_interrupt + when (ex_xcpt) { mem_reg_cause := ex_cause } + + when (ex_pc_valid) { + mem_ctrl := ex_ctrl + mem_reg_load := ex_ctrl.mem && isRead(ex_ctrl.mem_cmd) + mem_reg_store := ex_ctrl.mem && isWrite(ex_ctrl.mem_cmd) + mem_reg_btb_hit := ex_reg_btb_hit + when (ex_reg_btb_hit) { mem_reg_btb_resp := ex_reg_btb_resp } + mem_reg_flush_pipe := ex_reg_flush_pipe + mem_reg_slow_bypass := ex_slow_bypass + + mem_reg_inst := ex_reg_inst + mem_reg_pc := ex_reg_pc + mem_reg_wdata := alu.io.out + when (ex_ctrl.rxs2 && (ex_ctrl.mem || ex_ctrl.rocc)) { + mem_reg_rs2 := ex_rs(1) + } + } + + val (mem_new_xcpt, mem_new_cause) = checkExceptions(List( + (mem_reg_load && bpu.io.xcpt_ld, UInt(Causes.breakpoint)), + (mem_reg_store && bpu.io.xcpt_st, UInt(Causes.breakpoint)), + (want_take_pc_mem && mem_npc_misaligned, UInt(Causes.misaligned_fetch)), + (mem_ctrl.mem && io.dmem.xcpt.ma.st, UInt(Causes.misaligned_store)), + (mem_ctrl.mem && io.dmem.xcpt.ma.ld, UInt(Causes.misaligned_load)), + (mem_ctrl.mem && io.dmem.xcpt.pf.st, UInt(Causes.fault_store)), + (mem_ctrl.mem && io.dmem.xcpt.pf.ld, UInt(Causes.fault_load)))) + + val (mem_xcpt, mem_cause) = checkExceptions(List( + (mem_reg_xcpt_interrupt || mem_reg_xcpt, mem_reg_cause), + (mem_reg_valid && mem_new_xcpt, mem_new_cause))) + + val dcache_kill_mem = mem_reg_valid && mem_ctrl.wxd && io.dmem.replay_next // structural hazard on writeback port + val fpu_kill_mem = mem_reg_valid && mem_ctrl.fp && io.fpu.nack_mem + val replay_mem = dcache_kill_mem || mem_reg_replay || fpu_kill_mem + val killm_common = dcache_kill_mem || take_pc_wb || 
mem_reg_xcpt || !mem_reg_valid + div.io.kill := killm_common && Reg(next = div.io.req.fire()) + val ctrl_killm = killm_common || mem_xcpt || fpu_kill_mem + + // writeback stage + wb_reg_valid := !ctrl_killm + wb_reg_replay := replay_mem && !take_pc_wb + wb_reg_xcpt := mem_xcpt && !take_pc_wb + wb_reg_mem_xcpt := mem_reg_valid && mem_new_xcpt && !(mem_reg_xcpt_interrupt || mem_reg_xcpt) + when (mem_xcpt) { wb_reg_cause := mem_cause } + when (mem_reg_valid || mem_reg_replay || mem_reg_xcpt_interrupt) { + wb_ctrl := mem_ctrl + wb_reg_wdata := Mux(mem_ctrl.fp && mem_ctrl.wxd, io.fpu.toint_data, mem_int_wdata) + when (mem_ctrl.rocc) { + wb_reg_rs2 := mem_reg_rs2 + } + wb_reg_inst := mem_reg_inst + wb_reg_pc := mem_reg_pc + } + + val wb_set_sboard = wb_ctrl.div || wb_dcache_miss || wb_ctrl.rocc + val replay_wb_common = io.dmem.s2_nack || wb_reg_replay + val replay_wb_rocc = wb_reg_valid && wb_ctrl.rocc && !io.rocc.cmd.ready + val replay_wb = replay_wb_common || replay_wb_rocc + val wb_xcpt = wb_reg_xcpt || csr.io.csr_xcpt + take_pc_wb := replay_wb || wb_xcpt || csr.io.eret + + // writeback arbitration + val dmem_resp_xpu = !io.dmem.resp.bits.tag(0).toBool + val dmem_resp_fpu = io.dmem.resp.bits.tag(0).toBool + val dmem_resp_waddr = io.dmem.resp.bits.tag >> 1 + val dmem_resp_valid = io.dmem.resp.valid && io.dmem.resp.bits.has_data + val dmem_resp_replay = dmem_resp_valid && io.dmem.resp.bits.replay + + div.io.resp.ready := !(wb_reg_valid && wb_ctrl.wxd) + val ll_wdata = Wire(init = div.io.resp.bits.data) + val ll_waddr = Wire(init = div.io.resp.bits.tag) + val ll_wen = Wire(init = div.io.resp.fire()) + if (usingRoCC) { + io.rocc.resp.ready := !(wb_reg_valid && wb_ctrl.wxd) + when (io.rocc.resp.fire()) { + div.io.resp.ready := Bool(false) + ll_wdata := io.rocc.resp.bits.data + ll_waddr := io.rocc.resp.bits.rd + ll_wen := Bool(true) + } + } + when (dmem_resp_replay && dmem_resp_xpu) { + div.io.resp.ready := Bool(false) + if (usingRoCC) + io.rocc.resp.ready := Bool(false) + ll_waddr := dmem_resp_waddr + ll_wen := Bool(true) + } + + val wb_valid = wb_reg_valid && !replay_wb && !wb_xcpt + val wb_wen = wb_valid && wb_ctrl.wxd + val rf_wen = wb_wen || ll_wen + val rf_waddr = Mux(ll_wen, ll_waddr, wb_waddr) + val rf_wdata = Mux(dmem_resp_valid && dmem_resp_xpu, io.dmem.resp.bits.data, + Mux(ll_wen, ll_wdata, + Mux(wb_ctrl.csr =/= CSR.N, csr.io.rw.rdata, + wb_reg_wdata))) + when (rf_wen) { rf.write(rf_waddr, rf_wdata) } + + // hook up control/status regfile + csr.io.exception := wb_reg_xcpt + csr.io.cause := wb_reg_cause + csr.io.retire := wb_valid + csr.io.prci <> io.prci + io.fpu.fcsr_rm := csr.io.fcsr_rm + csr.io.fcsr_flags := io.fpu.fcsr_flags + io.rocc.csr <> csr.io.rocc.csr + csr.io.rocc.interrupt <> io.rocc.interrupt + csr.io.pc := wb_reg_pc + csr.io.badaddr := Mux(wb_reg_mem_xcpt, encodeVirtualAddress(wb_reg_wdata, wb_reg_wdata), wb_reg_pc) + io.ptw.ptbr := csr.io.ptbr + io.ptw.invalidate := csr.io.fatc + io.ptw.status := csr.io.status + csr.io.rw.addr := wb_reg_inst(31,20) + csr.io.rw.cmd := Mux(wb_reg_valid, wb_ctrl.csr, CSR.N) + csr.io.rw.wdata := wb_reg_wdata + + val hazard_targets = Seq((id_ctrl.rxs1 && id_raddr1 =/= UInt(0), id_raddr1), + (id_ctrl.rxs2 && id_raddr2 =/= UInt(0), id_raddr2), + (id_ctrl.wxd && id_waddr =/= UInt(0), id_waddr)) + val fp_hazard_targets = Seq((io.fpu.dec.ren1, id_raddr1), + (io.fpu.dec.ren2, id_raddr2), + (io.fpu.dec.ren3, id_raddr3), + (io.fpu.dec.wen, id_waddr)) + + val sboard = new Scoreboard(32) + sboard.clear(ll_wen, ll_waddr) + val id_sboard_hazard = 
checkHazards(hazard_targets, sboard.read _) + sboard.set(wb_set_sboard && wb_wen, wb_waddr) + + // stall for RAW/WAW hazards on CSRs, loads, AMOs, and mul/div in execute stage. + val ex_cannot_bypass = ex_ctrl.csr =/= CSR.N || ex_ctrl.jalr || ex_ctrl.mem || ex_ctrl.div || ex_ctrl.fp || ex_ctrl.rocc + val data_hazard_ex = ex_ctrl.wxd && checkHazards(hazard_targets, _ === ex_waddr) + val fp_data_hazard_ex = ex_ctrl.wfd && checkHazards(fp_hazard_targets, _ === ex_waddr) + val id_ex_hazard = ex_reg_valid && (data_hazard_ex && ex_cannot_bypass || fp_data_hazard_ex) + + // stall for RAW/WAW hazards on CSRs, LB/LH, and mul/div in memory stage. + val mem_mem_cmd_bh = + if (fastLoadWord) Bool(!fastLoadByte) && mem_reg_slow_bypass + else Bool(true) + val mem_cannot_bypass = mem_ctrl.csr =/= CSR.N || mem_ctrl.mem && mem_mem_cmd_bh || mem_ctrl.div || mem_ctrl.fp || mem_ctrl.rocc + val data_hazard_mem = mem_ctrl.wxd && checkHazards(hazard_targets, _ === mem_waddr) + val fp_data_hazard_mem = mem_ctrl.wfd && checkHazards(fp_hazard_targets, _ === mem_waddr) + val id_mem_hazard = mem_reg_valid && (data_hazard_mem && mem_cannot_bypass || fp_data_hazard_mem) + id_load_use := mem_reg_valid && data_hazard_mem && mem_ctrl.mem + + // stall for RAW/WAW hazards on load/AMO misses and mul/div in writeback. + val data_hazard_wb = wb_ctrl.wxd && checkHazards(hazard_targets, _ === wb_waddr) + val fp_data_hazard_wb = wb_ctrl.wfd && checkHazards(fp_hazard_targets, _ === wb_waddr) + val id_wb_hazard = wb_reg_valid && (data_hazard_wb && wb_set_sboard || fp_data_hazard_wb) + + val id_stall_fpu = if (usingFPU) { + val fp_sboard = new Scoreboard(32) + fp_sboard.set((wb_dcache_miss && wb_ctrl.wfd || io.fpu.sboard_set) && wb_valid, wb_waddr) + fp_sboard.clear(dmem_resp_replay && dmem_resp_fpu, dmem_resp_waddr) + fp_sboard.clear(io.fpu.sboard_clr, io.fpu.sboard_clra) + + id_csr_en && !io.fpu.fcsr_rdy || checkHazards(fp_hazard_targets, fp_sboard.read _) + } else Bool(false) + + val dcache_blocked = Reg(Bool()) + dcache_blocked := !io.dmem.req.ready && (io.dmem.req.valid || dcache_blocked) + val rocc_blocked = Reg(Bool()) + rocc_blocked := !wb_reg_xcpt && !io.rocc.cmd.ready && (io.rocc.cmd.valid || rocc_blocked) + + val ctrl_stalld = + id_ex_hazard || id_mem_hazard || id_wb_hazard || id_sboard_hazard || + id_ctrl.fp && id_stall_fpu || + id_ctrl.mem && dcache_blocked || // reduce activity during D$ misses + id_ctrl.rocc && rocc_blocked || // reduce activity while RoCC is busy + id_do_fence || + csr.io.csr_stall + ctrl_killd := !io.imem.resp.valid || io.imem.resp.bits.replay || take_pc || ctrl_stalld || csr.io.interrupt + + io.imem.req.valid := take_pc + io.imem.req.bits.speculative := !take_pc_wb + io.imem.req.bits.pc := + Mux(wb_xcpt || csr.io.eret, csr.io.evec, // exception or [m|s]ret + Mux(replay_wb, wb_reg_pc, // replay + mem_npc)).toUInt // mispredicted branch + io.imem.flush_icache := wb_reg_valid && wb_ctrl.fence_i && !io.dmem.s2_nack + io.imem.flush_tlb := csr.io.fatc + io.imem.resp.ready := !ctrl_stalld || csr.io.interrupt || take_pc_mem + + io.imem.btb_update.valid := mem_reg_valid && !mem_npc_misaligned && mem_wrong_npc && mem_cfi_taken && !take_pc_wb + io.imem.btb_update.bits.isJump := mem_ctrl.jal || mem_ctrl.jalr + io.imem.btb_update.bits.isReturn := mem_ctrl.jalr && mem_reg_inst(19,15) === BitPat("b00??1") + io.imem.btb_update.bits.pc := mem_reg_pc + io.imem.btb_update.bits.target := io.imem.req.bits.pc + io.imem.btb_update.bits.br_pc := mem_reg_pc + io.imem.btb_update.bits.prediction.valid := mem_reg_btb_hit + 
io.imem.btb_update.bits.prediction.bits := mem_reg_btb_resp + + io.imem.bht_update.valid := mem_reg_valid && mem_ctrl.branch && !take_pc_wb + io.imem.bht_update.bits.pc := mem_reg_pc + io.imem.bht_update.bits.taken := mem_br_taken + io.imem.bht_update.bits.mispredict := mem_wrong_npc + io.imem.bht_update.bits.prediction := io.imem.btb_update.bits.prediction + + io.imem.ras_update.valid := mem_reg_valid && io.imem.btb_update.bits.isJump && !mem_npc_misaligned && !take_pc_wb + io.imem.ras_update.bits.returnAddr := mem_int_wdata + io.imem.ras_update.bits.isCall := mem_ctrl.wxd && mem_waddr(0) + io.imem.ras_update.bits.isReturn := io.imem.btb_update.bits.isReturn + io.imem.ras_update.bits.prediction := io.imem.btb_update.bits.prediction + + io.fpu.valid := !ctrl_killd && id_ctrl.fp + io.fpu.killx := ctrl_killx + io.fpu.killm := killm_common + io.fpu.inst := id_inst + io.fpu.fromint_data := ex_rs(0) + io.fpu.dmem_resp_val := dmem_resp_valid && dmem_resp_fpu + io.fpu.dmem_resp_data := io.dmem.resp.bits.data_word_bypass + io.fpu.dmem_resp_type := io.dmem.resp.bits.typ + io.fpu.dmem_resp_tag := dmem_resp_waddr + + io.dmem.req.valid := ex_reg_valid && ex_ctrl.mem + val ex_dcache_tag = Cat(ex_waddr, ex_ctrl.fp) + require(coreDCacheReqTagBits >= ex_dcache_tag.getWidth) + io.dmem.req.bits.tag := ex_dcache_tag + io.dmem.req.bits.cmd := ex_ctrl.mem_cmd + io.dmem.req.bits.typ := ex_ctrl.mem_type + io.dmem.req.bits.phys := Bool(false) + io.dmem.req.bits.addr := encodeVirtualAddress(ex_rs(0), alu.io.adder_out) + io.dmem.s1_kill := killm_common || mem_xcpt + io.dmem.s1_data := Mux(mem_ctrl.fp, io.fpu.store_data, mem_reg_rs2) + io.dmem.invalidate_lr := wb_xcpt + + io.rocc.cmd.valid := wb_reg_valid && wb_ctrl.rocc && !replay_wb_common + io.rocc.exception := wb_xcpt && csr.io.status.xs.orR + io.rocc.cmd.bits.status := csr.io.status + io.rocc.cmd.bits.inst := new RoCCInstruction().fromBits(wb_reg_inst) + io.rocc.cmd.bits.rs1 := wb_reg_wdata + io.rocc.cmd.bits.rs2 := wb_reg_rs2 + + if (enableCommitLog) { + val pc = Wire(SInt(width=xLen)) + pc := wb_reg_pc + val inst = wb_reg_inst + val rd = RegNext(RegNext(RegNext(id_waddr))) + val wfd = wb_ctrl.wfd + val wxd = wb_ctrl.wxd + val has_data = wb_wen && !wb_set_sboard + val priv = csr.io.status.prv + + when (wb_valid) { + when (wfd) { + printf ("%d 0x%x (0x%x) f%d p%d 0xXXXXXXXXXXXXXXXX\n", priv, pc, inst, rd, rd+UInt(32)) + } + .elsewhen (wxd && rd =/= UInt(0) && has_data) { + printf ("%d 0x%x (0x%x) x%d 0x%x\n", priv, pc, inst, rd, rf_wdata) + } + .elsewhen (wxd && rd =/= UInt(0) && !has_data) { + printf ("%d 0x%x (0x%x) x%d p%d 0xXXXXXXXXXXXXXXXX\n", priv, pc, inst, rd, rd) + } + .otherwise { + printf ("%d 0x%x (0x%x)\n", priv, pc, inst) + } + } + + when (ll_wen && rf_waddr =/= UInt(0)) { + printf ("x%d p%d 0x%x\n", rf_waddr, rf_waddr, rf_wdata) + } + } + else { + printf("C%d: %d [%d] pc=[%x] W[r%d=%x][%d] R[r%d=%x] R[r%d=%x] inst=[%x] DASM(%x)\n", + io.prci.id, csr.io.time(31,0), wb_valid, wb_reg_pc, + Mux(rf_wen, rf_waddr, UInt(0)), rf_wdata, rf_wen, + wb_reg_inst(19,15), Reg(next=Reg(next=ex_rs(0))), + wb_reg_inst(24,20), Reg(next=Reg(next=ex_rs(1))), + wb_reg_inst, wb_reg_inst) + } + + def checkExceptions(x: Seq[(Bool, UInt)]) = + (x.map(_._1).reduce(_||_), PriorityMux(x)) + + def checkHazards(targets: Seq[(Bool, UInt)], cond: UInt => Bool) = + targets.map(h => h._1 && cond(h._2)).reduce(_||_) + + def encodeVirtualAddress(a0: UInt, ea: UInt) = if (vaddrBitsExtended == vaddrBits) ea else { + // efficient means to compress 64-bit VA into vaddrBits+1 bits + // 
(VA is bad if VA(vaddrBits) != VA(vaddrBits-1)) + val a = a0 >> vaddrBits-1 + val e = ea(vaddrBits,vaddrBits-1).toSInt + val msb = + Mux(a === UInt(0) || a === UInt(1), e =/= SInt(0), + Mux(a.toSInt === SInt(-1) || a.toSInt === SInt(-2), e === SInt(-1), e(0))) + Cat(msb, ea(vaddrBits-1,0)) + } + + class Scoreboard(n: Int) + { + def set(en: Bool, addr: UInt): Unit = update(en, _next | mask(en, addr)) + def clear(en: Bool, addr: UInt): Unit = update(en, _next & ~mask(en, addr)) + def read(addr: UInt): Bool = r(addr) + def readBypassed(addr: UInt): Bool = _next(addr) + + private val r = Reg(init=Bits(0, n)) + private var _next = r + private var ens = Bool(false) + private def mask(en: Bool, addr: UInt) = Mux(en, UInt(1) << addr, UInt(0)) + private def update(en: Bool, update: UInt) = { + _next = update + ens = ens || en + when (ens) { r := _next } + } + } +} diff --git a/rocket/src/main/scala/tile.scala b/rocket/src/main/scala/tile.scala new file mode 100644 index 00000000..66b16553 --- /dev/null +++ b/rocket/src/main/scala/tile.scala @@ -0,0 +1,151 @@ +// See LICENSE for license details. + +package rocket + +import Chisel._ +import uncore.tilelink._ +import uncore.agents._ +import uncore.devices._ +import Util._ +import cde.{Parameters, Field} + +case object CoreName extends Field[String] +case object BuildRoCC extends Field[Seq[RoccParameters]] +case object NCachedTileLinkPorts extends Field[Int] +case object NUncachedTileLinkPorts extends Field[Int] + +case class RoccParameters( + opcodes: OpcodeSet, + generator: Parameters => RoCC, + nMemChannels: Int = 0, + nPTWPorts : Int = 0, + csrs: Seq[Int] = Nil, + useFPU: Boolean = false) + +abstract class Tile(clockSignal: Clock = null, resetSignal: Bool = null) + (implicit p: Parameters) extends Module(Option(clockSignal), Option(resetSignal)) { + val nCachedTileLinkPorts = p(NCachedTileLinkPorts) + val nUncachedTileLinkPorts = p(NUncachedTileLinkPorts) + val dcacheParams = p.alterPartial({ case CacheName => "L1D" }) + + val io = new Bundle { + val cached = Vec(nCachedTileLinkPorts, new ClientTileLinkIO) + val uncached = Vec(nUncachedTileLinkPorts, new ClientUncachedTileLinkIO) + val prci = new PRCITileIO().flip + } +} + +class RocketTile(clockSignal: Clock = null, resetSignal: Bool = null) + (implicit p: Parameters) extends Tile(clockSignal, resetSignal)(p) { + val buildRocc = p(BuildRoCC) + val usingRocc = !buildRocc.isEmpty + val nRocc = buildRocc.size + val nFPUPorts = buildRocc.filter(_.useFPU).size + + val core = Module(new Rocket()(p.alterPartial({ case CoreName => "Rocket" }))) + val icache = Module(new Frontend()(p.alterPartial({ + case CacheName => "L1I" + case CoreName => "Rocket" }))) + val dcache = + if (p(NMSHRs) == 0) Module(new DCache()(dcacheParams)).io + else Module(new HellaCache()(dcacheParams)).io + + val ptwPorts = collection.mutable.ArrayBuffer(icache.io.ptw, dcache.ptw) + val dcPorts = collection.mutable.ArrayBuffer(core.io.dmem) + val uncachedArbPorts = collection.mutable.ArrayBuffer(icache.io.mem) + val uncachedPorts = collection.mutable.ArrayBuffer[ClientUncachedTileLinkIO]() + val cachedPorts = collection.mutable.ArrayBuffer(dcache.mem) + core.io.prci <> io.prci + icache.io.cpu <> core.io.imem + + val fpuOpt = if (p(UseFPU)) Some(Module(new FPU)) else None + fpuOpt.foreach(fpu => core.io.fpu <> fpu.io) + + if (usingRocc) { + val respArb = Module(new RRArbiter(new RoCCResponse, nRocc)) + core.io.rocc.resp <> respArb.io.out + + val roccOpcodes = buildRocc.map(_.opcodes) + val cmdRouter = Module(new 
RoccCommandRouter(roccOpcodes)) + cmdRouter.io.in <> core.io.rocc.cmd + + val roccs = buildRocc.zipWithIndex.map { case (accelParams, i) => + val rocc = accelParams.generator(p.alterPartial({ + case RoccNMemChannels => accelParams.nMemChannels + case RoccNPTWPorts => accelParams.nPTWPorts + case RoccNCSRs => accelParams.csrs.size + })) + val dcIF = Module(new SimpleHellaCacheIF()(dcacheParams)) + rocc.io.cmd <> cmdRouter.io.out(i) + rocc.io.exception := core.io.rocc.exception + rocc.io.host_id := io.prci.id + dcIF.io.requestor <> rocc.io.mem + dcPorts += dcIF.io.cache + uncachedArbPorts += rocc.io.autl + rocc + } + + if (nFPUPorts > 0) { + fpuOpt.foreach { fpu => + val fpArb = Module(new InOrderArbiter(new FPInput, new FPResult, nFPUPorts)) + val fp_roccs = roccs.zip(buildRocc) + .filter { case (_, params) => params.useFPU } + .map { case (rocc, _) => rocc.io } + fpArb.io.in_req <> fp_roccs.map(_.fpu_req) + fp_roccs.zip(fpArb.io.in_resp).foreach { + case (rocc, fpu_resp) => rocc.fpu_resp <> fpu_resp + } + fpu.io.cp_req <> fpArb.io.out_req + fpArb.io.out_resp <> fpu.io.cp_resp + } + } + + core.io.rocc.busy := cmdRouter.io.busy || roccs.map(_.io.busy).reduce(_ || _) + core.io.rocc.interrupt := roccs.map(_.io.interrupt).reduce(_ || _) + respArb.io.in <> roccs.map(rocc => Queue(rocc.io.resp)) + + if (p(RoccNCSRs) > 0) { + core.io.rocc.csr.rdata <> roccs.flatMap(_.io.csr.rdata) + for ((rocc, accelParams) <- roccs.zip(buildRocc)) { + rocc.io.csr.waddr := core.io.rocc.csr.waddr + rocc.io.csr.wdata := core.io.rocc.csr.wdata + rocc.io.csr.wen := core.io.rocc.csr.wen && + accelParams.csrs + .map(core.io.rocc.csr.waddr === UInt(_)) + .reduce((a, b) => a || b) + } + } + + ptwPorts ++= roccs.flatMap(_.io.ptw) + uncachedPorts ++= roccs.flatMap(_.io.utl) + } + + val uncachedArb = Module(new ClientUncachedTileLinkIOArbiter(uncachedArbPorts.size)) + uncachedArb.io.in <> uncachedArbPorts + uncachedArb.io.out +=: uncachedPorts + + // Connect the caches and RoCC to the outer memory system + io.uncached <> uncachedPorts + io.cached <> cachedPorts + // TODO remove nCached/nUncachedTileLinkPorts parameters and these assertions + require(uncachedPorts.size == nUncachedTileLinkPorts) + require(cachedPorts.size == nCachedTileLinkPorts) + + if (p(UseVM)) { + val ptw = Module(new PTW(ptwPorts.size)(dcacheParams)) + ptw.io.requestor <> ptwPorts + ptw.io.mem +=: dcPorts + core.io.ptw <> ptw.io.dpath + } + + val dcArb = Module(new HellaCacheArbiter(dcPorts.size)(dcacheParams)) + dcArb.io.requestor <> dcPorts + dcache.cpu <> dcArb.io.mem + + if (!usingRocc || nFPUPorts == 0) { + fpuOpt.foreach { fpu => + fpu.io.cp_req.valid := Bool(false) + fpu.io.cp_resp.ready := Bool(false) + } + } +} diff --git a/rocket/src/main/scala/tlb.scala b/rocket/src/main/scala/tlb.scala new file mode 100644 index 00000000..aca9aed2 --- /dev/null +++ b/rocket/src/main/scala/tlb.scala @@ -0,0 +1,176 @@ +// See LICENSE for license details. 
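+// A fully-associative TLB: the (ASID, VPN) lookup tag is compared against
+// every entry in parallel; refills pick an invalid entry when one exists,
+// otherwise a pseudo-LRU victim, and misses are resolved by a small state
+// machine that issues requests to the page-table walker.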
+ +package rocket + +import Chisel._ +import Util._ +import junctions._ +import scala.math._ +import cde.{Parameters, Field} +import uncore.agents.PseudoLRU +import uncore.coherence._ + +case object NTLBEntries extends Field[Int] + +trait HasTLBParameters extends HasCoreParameters { + val entries = p(NTLBEntries) + val camAddrBits = log2Ceil(entries) + val camTagBits = asIdBits + vpnBits +} + +class TLBReq(implicit p: Parameters) extends CoreBundle()(p) { + val vpn = UInt(width = vpnBitsExtended) + val passthrough = Bool() + val instruction = Bool() + val store = Bool() +} + +class TLBResp(implicit p: Parameters) extends CoreBundle()(p) { + // lookup responses + val miss = Bool(OUTPUT) + val ppn = UInt(OUTPUT, ppnBits) + val xcpt_ld = Bool(OUTPUT) + val xcpt_st = Bool(OUTPUT) + val xcpt_if = Bool(OUTPUT) + val cacheable = Bool(OUTPUT) +} + +class TLB(implicit val p: Parameters) extends Module with HasTLBParameters { + val io = new Bundle { + val req = Decoupled(new TLBReq).flip + val resp = new TLBResp + val ptw = new TLBPTWIO + } + + val valid = Reg(init = UInt(0, entries)) + val ppns = Reg(Vec(entries, io.ptw.resp.bits.pte.ppn)) + val tags = Reg(Vec(entries, UInt(width = asIdBits + vpnBits))) + + val s_ready :: s_request :: s_wait :: s_wait_invalidate :: Nil = Enum(UInt(), 4) + val state = Reg(init=s_ready) + val r_refill_tag = Reg(UInt(width = asIdBits + vpnBits)) + val r_refill_waddr = Reg(UInt(width = log2Ceil(entries))) + val r_req = Reg(new TLBReq) + + val lookup_tag = Cat(io.ptw.ptbr.asid, io.req.bits.vpn(vpnBits-1,0)).toUInt + val hitsVec = (0 until entries).map(i => valid(i) && tags(i) === lookup_tag) + val hits = hitsVec.toBits + + // permission bit arrays + val pte_array = Reg(new PTE) + val u_array = Reg(UInt(width = entries)) // user permission + val sw_array = Reg(UInt(width = entries)) // write permission + val sx_array = Reg(UInt(width = entries)) // execute permission + val sr_array = Reg(UInt(width = entries)) // read permission + val dirty_array = Reg(UInt(width = entries)) // PTE dirty bit + when (io.ptw.resp.valid) { + val pte = io.ptw.resp.bits.pte + ppns(r_refill_waddr) := pte.ppn + tags(r_refill_waddr) := r_refill_tag + + val mask = UIntToOH(r_refill_waddr) + valid := valid | mask + u_array := Mux(pte.u, u_array | mask, u_array & ~mask) + sr_array := Mux(pte.sr(), sr_array | mask, sr_array & ~mask) + sw_array := Mux(pte.sw(), sw_array | mask, sw_array & ~mask) + sx_array := Mux(pte.sx(), sx_array | mask, sx_array & ~mask) + dirty_array := Mux(pte.d, dirty_array | mask, dirty_array & ~mask) + } + + // high if there are any unused (invalid) entries in the TLB + val plru = new PseudoLRU(entries) + val repl_waddr = Mux(!valid.andR, PriorityEncoder(~valid), plru.replace) + + val do_mprv = io.ptw.status.mprv && !io.req.bits.instruction + val priv = Mux(do_mprv, io.ptw.status.mpp, io.ptw.status.prv) + val priv_s = priv === PRV.S + val priv_uses_vm = priv <= PRV.S && !io.ptw.status.debug + + val priv_ok = Mux(priv_s, ~Mux(io.ptw.status.pum, u_array, UInt(0)), u_array) + val w_array = priv_ok & sw_array + val x_array = priv_ok & sx_array + val r_array = priv_ok & (sr_array | Mux(io.ptw.status.mxr, x_array, UInt(0))) + + val vm_enabled = Bool(usingVM) && io.ptw.status.vm(3) && priv_uses_vm && !io.req.bits.passthrough + val bad_va = + if (vpnBits == vpnBitsExtended) Bool(false) + else io.req.bits.vpn(vpnBits) =/= io.req.bits.vpn(vpnBits-1) + // it's only a store hit if the dirty bit is set + val tag_hits = hits & (dirty_array | ~Mux(io.req.bits.store, w_array, UInt(0))) + val 
tag_hit = tag_hits.orR + val tlb_hit = vm_enabled && tag_hit + val tlb_miss = vm_enabled && !tag_hit && !bad_va + + when (io.req.valid && tlb_hit) { + plru.access(OHToUInt(hits)) + } + + val paddr = Cat(io.resp.ppn, UInt(0, pgIdxBits)) + val addr_prot = addrMap.getProt(paddr) + + io.req.ready := state === s_ready + io.resp.xcpt_ld := bad_va || (!tlb_miss && !addr_prot.r) || (tlb_hit && !(r_array & hits).orR) + io.resp.xcpt_st := bad_va || (!tlb_miss && !addr_prot.w) || (tlb_hit && !(w_array & hits).orR) + io.resp.xcpt_if := bad_va || (!tlb_miss && !addr_prot.x) || (tlb_hit && !(x_array & hits).orR) + io.resp.cacheable := addrMap.isCacheable(paddr) + io.resp.miss := tlb_miss + io.resp.ppn := Mux(vm_enabled, Mux1H(hitsVec, ppns), io.req.bits.vpn(ppnBits-1,0)) + + io.ptw.req.valid := state === s_request + io.ptw.req.bits := io.ptw.status + io.ptw.req.bits.addr := r_refill_tag + io.ptw.req.bits.store := r_req.store + io.ptw.req.bits.fetch := r_req.instruction + + if (usingVM) { + when (io.req.fire() && tlb_miss) { + state := s_request + r_refill_tag := lookup_tag + r_refill_waddr := repl_waddr + r_req := io.req.bits + } + when (state === s_request) { + when (io.ptw.invalidate) { + state := s_ready + } + when (io.ptw.req.ready) { + state := s_wait + when (io.ptw.invalidate) { state := s_wait_invalidate } + } + } + when (state === s_wait && io.ptw.invalidate) { + state := s_wait_invalidate + } + when (io.ptw.resp.valid) { + state := s_ready + } + + when (io.ptw.invalidate) { + valid := 0 + } + } +} + +class DecoupledTLB(implicit p: Parameters) extends Module { + val io = new Bundle { + val req = Decoupled(new TLBReq).flip + val resp = Decoupled(new TLBResp) + val ptw = new TLBPTWIO + } + + val reqq = Queue(io.req) + val tlb = Module(new TLB) + + val resp_helper = DecoupledHelper( + reqq.valid, tlb.io.req.ready, io.resp.ready) + val tlb_miss = tlb.io.resp.miss + + tlb.io.req.valid := resp_helper.fire(tlb.io.req.ready) + tlb.io.req.bits := reqq.bits + reqq.ready := resp_helper.fire(reqq.valid, !tlb_miss) + + io.resp.valid := resp_helper.fire(io.resp.ready, !tlb_miss) + io.resp.bits := tlb.io.resp + + io.ptw <> tlb.io.ptw +} diff --git a/rocket/src/main/scala/util.scala b/rocket/src/main/scala/util.scala new file mode 100644 index 00000000..40a3c4a9 --- /dev/null +++ b/rocket/src/main/scala/util.scala @@ -0,0 +1,179 @@ +// See LICENSE for license details. 
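+// Assorted utilities: implicit Scala-to-Chisel conversions (object Util),
+// string formatting for simulation printfs (object Str), multi-way bit
+// slicing (object Split), a counter that clock-gates its upper bits
+// (WideCounter), and random selection over non-power-of-2 ranges (Random).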
+ + package rocket + + import Chisel._ + import uncore._ + import scala.math._ + import cde.{Parameters, Field} + + object Util { + implicit def uintToBitPat(x: UInt): BitPat = BitPat(x) + implicit def intToUInt(x: Int): UInt = UInt(x) + implicit def bigIntToUInt(x: BigInt): UInt = UInt(x) + implicit def booleanToBool(x: Boolean): Bits = Bool(x) + implicit def intSeqToUIntSeq(x: Seq[Int]): Seq[UInt] = x.map(UInt(_)) + implicit def wcToUInt(c: WideCounter): UInt = c.value + + implicit class UIntToAugmentedUInt(val x: UInt) extends AnyVal { + def sextTo(n: Int): UInt = + if (x.getWidth == n) x + else Cat(Fill(n - x.getWidth, x(x.getWidth-1)), x) + + def extract(hi: Int, lo: Int): UInt = { + if (hi == lo-1) UInt(0) + else x(hi, lo) + } + } + + implicit def booleanToIntConv(x: Boolean) = new AnyRef { + def toInt: Int = if (x) 1 else 0 + } + + implicit class SeqToAugmentedSeq[T <: Data](val x: Seq[T]) extends AnyVal { + def apply(idx: UInt): T = { + if (x.size == 1) { + x.head + } else { + val half = 1 << (log2Ceil(x.size) - 1) + val newIdx = idx & (half - 1) + Mux(idx >= UInt(half), x.drop(half)(newIdx), x.take(half)(newIdx)) + } + } + + def toBits(): UInt = Cat(x.map(_.toBits).reverse) + } + + def minUInt(values: Seq[UInt]): UInt = + values.reduce((a, b) => Mux(a < b, a, b)) + + def minUInt(first: UInt, rest: UInt*): UInt = + minUInt(first +: rest.toSeq) +} + + import Util._ + + object Str + { + def apply(s: String): UInt = { + var i = BigInt(0) + require(s.forall(validChar _)) + for (c <- s) + i = (i << 8) | c + UInt(i, s.length*8) + } + def apply(x: Char): UInt = { + require(validChar(x)) + UInt(x.toInt, 8) + } + def apply(x: UInt): UInt = apply(x, 10) + def apply(x: UInt, radix: Int): UInt = { + val rad = UInt(radix) + val w = x.getWidth + require(w > 0) + + var q = x + var s = digit(q % rad) + for (i <- 1 until ceil(log(2)/log(radix)*w).toInt) { + q = q / rad + s = Cat(Mux(Bool(radix == 10) && q === UInt(0), Str(' '), digit(q % rad)), s) + } + s + } + def apply(x: SInt): UInt = apply(x, 10) + def apply(x: SInt, radix: Int): UInt = { + val neg = x < SInt(0) + val abs = x.abs + if (radix != 10) { + Cat(Mux(neg, Str('-'), Str(' ')), Str(abs, radix)) + } else { + val rad = UInt(radix) + val w = abs.getWidth + require(w > 0) + + var q = abs + var s = digit(q % rad) + var needSign = neg + for (i <- 1 until ceil(log(2)/log(radix)*w).toInt) { + q = q / rad + val placeSpace = q === UInt(0) + val space = Mux(needSign, Str('-'), Str(' ')) + needSign = needSign && !placeSpace + s = Cat(Mux(placeSpace, space, digit(q % rad)), s) + } + Cat(Mux(needSign, Str('-'), Str(' ')), s) + } + } + + private def digit(d: UInt): UInt = Mux(d < UInt(10), Str('0')+d, Str(('a'-10).toChar)+d)(7,0) + private def validChar(x: Char) = x == (x & 0xFF) +} + + object Split + { + // is there a better way to do this? 
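+ // Split(x, n) returns (x(w-1,n), x(n-1,0)); the overloads below cut at
+ // several boundaries at once, so e.g. Split(paddr, 12, 6) would yield a
+ // (tag, index, offset) triple. checkWidth requires the cut points to be
+ // non-increasing and no wider than x itself.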
+ def apply(x: Bits, n0: Int) = { + val w = checkWidth(x, n0) + (x(w-1,n0), x(n0-1,0)) + } + def apply(x: Bits, n1: Int, n0: Int) = { + val w = checkWidth(x, n1, n0) + (x(w-1,n1), x(n1-1,n0), x(n0-1,0)) + } + def apply(x: Bits, n2: Int, n1: Int, n0: Int) = { + val w = checkWidth(x, n2, n1, n0) + (x(w-1,n2), x(n2-1,n1), x(n1-1,n0), x(n0-1,0)) + } + + private def checkWidth(x: Bits, n: Int*) = { + val w = x.getWidth + def decreasing(x: Seq[Int]): Boolean = + if (x.tail.isEmpty) true + else x.head >= x.tail.head && decreasing(x.tail) + require(decreasing(w :: n.toList)) + w + } +} + +// a counter that clock gates most of its MSBs using the LSB carry-out +case class WideCounter(width: Int, inc: UInt = UInt(1)) +{ + private val isWide = width > 2*inc.getWidth + private val smallWidth = if (isWide) inc.getWidth max log2Up(width) else width + private val small = Reg(init=UInt(0, smallWidth)) + private val nextSmall = small +& inc + small := nextSmall + + private val large = if (isWide) { + val r = Reg(init=UInt(0, width - smallWidth)) + when (nextSmall(smallWidth)) { r := r + UInt(1) } + r + } else null + + val value = if (isWide) Cat(large, small) else small + + def := (x: UInt) = { + small := x + if (isWide) large := x >> smallWidth + } +} + +object Random +{ + def apply(mod: Int, random: UInt): UInt = { + if (isPow2(mod)) random(log2Up(mod)-1,0) + else PriorityEncoder(partition(apply(1 << log2Up(mod*8), random), mod)) + } + def apply(mod: Int): UInt = apply(mod, randomizer) + def oneHot(mod: Int, random: UInt): UInt = { + if (isPow2(mod)) UIntToOH(random(log2Up(mod)-1,0)) + else PriorityEncoderOH(partition(apply(1 << log2Up(mod*8), random), mod)).toBits + } + def oneHot(mod: Int): UInt = oneHot(mod, randomizer) + + private def randomizer = LFSR16() + private def round(x: Double): Int = + if (x.toInt.toDouble == x) x.toInt else (x.toInt + 1) & -2 + private def partition(value: UInt, slices: Int) = + Seq.tabulate(slices)(i => value < round((i << value.getWidth).toDouble / slices)) +}
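+
+// Usage sketch (illustrative only; do_replace and nWays are stand-ins for
+// user logic): a free-running cycle counter whose upper bits advance only
+// on the small counter's carry-out, and a random way index for a cache
+// whose associativity need not be a power of two:
+//
+//   val cycles = WideCounter(64)   // cycles.value reads the full 64 bits
+//   val victim = Random(nWays)     // nWays: Int
+//   when (do_replace) { printf("cycle %d: evict way %d\n", cycles.value, victim) }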