Performance-counter rework for the Rocket core.

Summary of what this patch does:
  * CSRFileIO: the `events` Bool vector is replaced by `counters`, one
    PerfCounterIO (eventSel out, inc in) per performance counter.
  * Events.scala (new): EventSet/EventSets select and gate events from an
    mhpmevent value; Rocket.scala instantiates three event sets.
  * Caches and frontend export `acquire`/`release` signals as events.
  * On xLen == 32, mhpmcounterNh/hpmcounterNh read `c >> 32`, i.e. the
    counter bits above bit 31, rather than repeating the low word.
  * EventSets.maskEventSelector accepts an empty event-set list (the
    CSRFile default argument) and then masks the selector to zero.

diff --git a/src/main/scala/rocket/Arbiter.scala b/src/main/scala/rocket/Arbiter.scala
index 747130df..4fe8b838 100644
--- a/src/main/scala/rocket/Arbiter.scala
+++ b/src/main/scala/rocket/Arbiter.scala
@@ -56,6 +56,8 @@ class HellaCacheArbiter(n: Int)(implicit p: Parameters) extends Module
       resp.valid := io.mem.resp.valid && tag_hit
       io.requestor(i).xcpt := io.mem.xcpt
       io.requestor(i).ordered := io.mem.ordered
+      io.requestor(i).acquire := io.mem.acquire
+      io.requestor(i).release := io.mem.release
       io.requestor(i).s2_nack := io.mem.s2_nack && s2_id === UInt(i)
       resp.bits := io.mem.resp.bits
       resp.bits.tag := io.mem.resp.bits.tag >> log2Up(n)
diff --git a/src/main/scala/rocket/CSR.scala b/src/main/scala/rocket/CSR.scala
index bc3bbae3..298ac974 100644
--- a/src/main/scala/rocket/CSR.scala
+++ b/src/main/scala/rocket/CSR.scala
@@ -124,14 +124,23 @@ object CSR
   }
 
   val firstCtr = CSRs.cycle
+  val firstCtrH = CSRs.cycleh
   val firstHPC = CSRs.hpmcounter3
+  val firstHPCH = CSRs.hpmcounter3h
   val firstHPE = CSRs.mhpmevent3
   val firstMHPC = CSRs.mhpmcounter3
+  val firstMHPCH = CSRs.mhpmcounter3h
   val firstHPM = 3
   val nCtr = 32
   val nHPM = nCtr - firstHPM
 }
 
+class PerfCounterIO(implicit p: Parameters) extends CoreBundle
+    with HasRocketCoreParameters {
+  val eventSel = UInt(OUTPUT, xLen)
+  val inc = UInt(INPUT, log2Ceil(1+retireWidth))
+}
+
 class CSRFileIO(implicit p: Parameters) extends CoreBundle
     with HasRocketCoreParameters {
   val interrupts = new TileInterrupts().asInput
@@ -174,10 +183,10 @@ class CSRFileIO(implicit p: Parameters) extends CoreBundle
   val interrupt = Bool(OUTPUT)
   val interrupt_cause = UInt(OUTPUT, xLen)
   val bp = Vec(nBreakpoints, new BP).asOutput
-  val events = Vec(nPerfEvents, Bool()).asInput
+  val counters = Vec(nPerfCounters, new PerfCounterIO)
 }
 
-class CSRFile(implicit p: Parameters) extends CoreModule()(p)
+class CSRFile(perfEventSets: EventSets = new EventSets(Seq()))(implicit p: Parameters) extends CoreModule()(p)
     with HasRocketCoreParameters {
   val io = new CSRFileIO
 
@@ -258,8 +267,9 @@ class CSRFile(implicit p: Parameters) extends CoreModule()(p)
 
   val reg_instret = WideCounter(64, io.retire)
   val reg_cycle = if (enableCommitLog) reg_instret else WideCounter(64)
-  val reg_hpmevent = Seq.fill(nPerfCounters)(if (nPerfEvents > 1) Reg(UInt(width = log2Ceil(nPerfEvents))) else UInt(0))
-  val reg_hpmcounter = reg_hpmevent.map(e => WideCounter(64, ((UInt(0) +: io.events): Seq[UInt])(e)))
+  val reg_hpmevent = io.counters.map(c => Reg(init = UInt(0, xLen)))
+  (io.counters zip reg_hpmevent) foreach { case (c, e) => c.eventSel := e }
+  val reg_hpmcounter = io.counters.map(c => WideCounter(40, c.inc, reset = false))
   val hpm_mask = reg_mcounteren & Mux((!usingVM).B || reg_mstatus.prv === PRV.S, delegable_counters.U, reg_scounteren)
 
   val mip = Wire(init=reg_mip)
@@ -339,6 +349,10 @@ class CSRFile(implicit p: Parameters) extends CoreModule()(p)
     read_mapping += (i + CSR.firstHPE) -> e // mhpmeventN
     read_mapping += (i + CSR.firstMHPC) -> c // mhpmcounterN
     if (usingUser) read_mapping += (i + CSR.firstHPC) -> c // hpmcounterN
+    if (xLen == 32) {
+      read_mapping += (i + CSR.firstMHPCH) -> (c >> 32) // mhpmcounterNh
+      if (usingUser) read_mapping += (i + CSR.firstHPCH) -> (c >> 32) // hpmcounterNh
+    }
   }
 
   if (usingVM) {
@@ -407,7 +421,7 @@ class CSRFile(implicit p: Parameters) extends CoreModule()(p)
   io.decode.read_illegal := effective_prv < io.decode.csr(9,8) ||
     !read_mapping.keys.map(io.decode.csr === _).reduce(_||_) ||
     io.decode.csr === CSRs.sptbr && !allow_sfence_vma ||
-    io.decode.csr >= CSR.firstCtr && io.decode.csr < CSR.firstCtr + CSR.nCtr && effective_prv <= PRV.S && hpm_mask(io.decode.csr(log2Ceil(CSR.firstCtr)-1,0)) ||
+    (io.decode.csr.inRange(CSR.firstCtr, CSR.firstCtr + CSR.nCtr) || io.decode.csr.inRange(CSR.firstCtrH, CSR.firstCtrH + CSR.nCtr)) && effective_prv <= PRV.S && hpm_mask(io.decode.csr(log2Ceil(CSR.firstCtr)-1,0)) ||
     Bool(usingDebug) && !reg_debug && debug_csrs.keys.map(io.decode.csr === _).reduce(_||_) ||
     Bool(usingFPU) && fp_csrs.keys.map(io.decode.csr === _).reduce(_||_) && io.decode.fp_illegal
   io.decode.write_illegal := io.decode.csr(11,10).andR
@@ -561,8 +575,7 @@ class CSRFile(implicit p: Parameters) extends CoreModule()(p)
 
     for (((e, c), i) <- (reg_hpmevent zip reg_hpmcounter) zipWithIndex) {
       writeCounter(i + CSR.firstMHPC, c, wdata)
-      if (nPerfEvents > 1)
-        when (decoded_addr(i + CSR.firstHPE)) { e := wdata }
+      when (decoded_addr(i + CSR.firstHPE)) { e := perfEventSets.maskEventSelector(wdata) }
     }
     writeCounter(CSRs.mcycle, reg_cycle, wdata)
     writeCounter(CSRs.minstret, reg_instret, wdata)
@@ -688,10 +701,10 @@ class CSRFile(implicit p: Parameters) extends CoreModule()(p)
   def writeCounter(lo: Int, ctr: WideCounter, wdata: UInt) = {
     if (xLen == 32) {
       val hi = lo + CSRs.mcycleh - CSRs.mcycle
-      when (decoded_addr(lo)) { ctr := Cat(ctr(63, 32), wdata) }
-      when (decoded_addr(hi)) { ctr := Cat(wdata, ctr(31, 0)) }
+      when (decoded_addr(lo)) { ctr := Cat(ctr(ctr.getWidth-1, 32), wdata) }
+      when (decoded_addr(hi)) { ctr := Cat(wdata(ctr.getWidth-33, 0), ctr(31, 0)) }
     } else {
-      when (decoded_addr(lo)) { ctr := wdata }
+      when (decoded_addr(lo)) { ctr := wdata(ctr.getWidth-1, 0) }
     }
   }
   def formEPC(x: UInt) = ~(~x | Cat(!reg_misa('c'-'a'), UInt(1)))
diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala
index b62a927a..575f167f 100644
--- a/src/main/scala/rocket/DCache.scala
+++ b/src/main/scala/rocket/DCache.scala
@@ -503,4 +503,8 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
       flushing := false
     }
   }
+
+  // performance events
+  io.cpu.acquire := tl_out.a.fire()
+  io.cpu.release := tl_out.c.fire()
 }
diff --git a/src/main/scala/rocket/Events.scala b/src/main/scala/rocket/Events.scala
new file mode 100644
index 00000000..d4aeef76
--- /dev/null
+++ b/src/main/scala/rocket/Events.scala
@@ -0,0 +1,42 @@
+// See LICENSE.Berkeley for license details.
+// See LICENSE.SiFive for license details.
+
+package rocket
+
+import util._
+import Chisel._
+
+/** A named list of event conditions plus the `gate` function that combines
+  * a selector mask with the events' hit bits into a single Bool. */
+class EventSet(gate: (UInt, UInt) => Bool, events: Seq[(String, () => Bool)]) {
+  def size = events.size
+  def hits = events.map(_._2()).asUInt
+  def check(mask: UInt) = gate(mask, hits)
+}
+
+/** Decodes an event selector into (event-set index, event mask) and
+  * evaluates the chosen set; the mask starts at bit `eventSetIdBits`. */
+class EventSets(eventSets: Seq[EventSet]) {
+  /** Clears the selector bits that do not map to a set index or an event. */
+  def maskEventSelector(eventSel: UInt): UInt = {
+    // allow full associativity between counters and event sets (for now?)
+    // `max 1` and the prepended 0 keep an empty event-set list (the CSRFile
+    // default) from failing in log2Ceil/max; the result is then all zeros.
+    val setMask = (BigInt(1) << log2Ceil(eventSets.size max 1)) - 1
+    val maskMask = ((BigInt(1) << (0 +: eventSets.map(_.size)).max) - 1) << eventSetIdBits
+    eventSel & (setMask | maskMask).U
+  }
+
+  private def decode(counter: UInt): (UInt, UInt) = {
+    require(eventSets.size <= (1 << eventSetIdBits))
+    (counter(log2Ceil(eventSets.size)-1, 0), counter >> eventSetIdBits)
+  }
+
+  def evaluate(eventSel: UInt): Bool = {
+    val (set, mask) = decode(eventSel)
+    val sets = eventSets map (_ check mask)
+    sets(set)
+  }
+
+  private def eventSetIdBits = 8
+}
diff --git a/src/main/scala/rocket/Frontend.scala b/src/main/scala/rocket/Frontend.scala
index 39815cd1..b6328f34 100644
--- a/src/main/scala/rocket/Frontend.scala
+++ b/src/main/scala/rocket/Frontend.scala
@@ -35,6 +35,9 @@ class FrontendIO(implicit p: Parameters) extends CoreBundle()(p) {
   val flush_icache = Bool(OUTPUT)
   val flush_tlb = Bool(OUTPUT)
   val npc = UInt(INPUT, width = vaddrBitsExtended)
+
+  // performance events
+  val acquire = Bool(INPUT)
 }
 
 class Frontend(implicit p: Parameters) extends LazyModule {
@@ -150,6 +153,9 @@ class FrontendModule(outer: Frontend) extends LazyModuleImp(outer)
   io.cpu.resp.bits.replay := icache.io.s2_kill && !icache.io.resp.valid && !s2_xcpt_if
   io.cpu.resp.bits.btb.valid := s2_btb_resp_valid
   io.cpu.resp.bits.btb.bits := s2_btb_resp_bits
+
+  // performance events
+  io.cpu.acquire := icache.io.mem(0).a.fire()
 }
 
 /** Mix-ins for constructing tiles that have an ICache-based pipeline frontend */
diff --git a/src/main/scala/rocket/HellaCache.scala b/src/main/scala/rocket/HellaCache.scala
index 4917770f..f0b23107 100644
--- a/src/main/scala/rocket/HellaCache.scala
+++ b/src/main/scala/rocket/HellaCache.scala
@@ -121,6 +121,10 @@ class HellaCacheIO(implicit p: Parameters) extends CoreBundle()(p) {
   val s1_data = Bits(OUTPUT, coreDataBits) // data for previous cycle's req
   val s2_nack = Bool(INPUT) // req from two cycles ago is rejected
 
+  // performance events
+  val acquire = Bool(INPUT)
+  val release = Bool(INPUT)
+
   val resp = Valid(new HellaCacheResp).flip
   val replay_next = Bool(INPUT)
   val xcpt = (new HellaCacheExceptions).asInput
diff --git a/src/main/scala/rocket/NBDcache.scala b/src/main/scala/rocket/NBDcache.scala
index 4ffef4f1..5adf56c3 100644
--- a/src/main/scala/rocket/NBDcache.scala
+++ b/src/main/scala/rocket/NBDcache.scala
@@ -973,4 +973,8 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule
   io.cpu.resp.bits.data_word_bypass := loadgen.wordData
   io.cpu.ordered := mshrs.io.fence_rdy && !s1_valid && !s2_valid
   io.cpu.replay_next := (s1_replay && s1_read) || mshrs.io.replay_next
+
+  // performance events
+  io.cpu.acquire := tl_out.a.fire()
+  io.cpu.release := tl_out.c.fire()
 }
diff --git a/src/main/scala/rocket/Rocket.scala b/src/main/scala/rocket/Rocket.scala
index 2ac26f9c..c8c93c19 100644
--- a/src/main/scala/rocket/Rocket.scala
+++ b/src/main/scala/rocket/Rocket.scala
@@ -19,7 +19,6 @@ case class RocketCoreParams(
   useCompressed: Boolean = true,
   nBreakpoints: Int = 1,
   nPerfCounters: Int = 0,
-  nPerfEvents: Int = 0,
   nCustomMRWCSRs: Int = 0,
   mtvecInit: Option[BigInt] = Some(BigInt(0)),
   mtvecWritable: Boolean = true,
@@ -44,7 +43,6 @@ trait HasRocketCoreParameters extends HasCoreParameters {
   val fastJAL = rocketParams.fastJAL
   val nBreakpoints = rocketParams.nBreakpoints
   val nPerfCounters = rocketParams.nPerfCounters
-  val nPerfEvents = rocketParams.nPerfEvents
   val nCustomMrwCsrs = rocketParams.nCustomMRWCSRs
   val mtvecInit = rocketParams.mtvecInit
   val mtvecWritable = rocketParams.mtvecWritable
@@ -58,6 +56,50 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
     with HasRocketCoreParameters
     with HasCoreIO {
 
+  // performance counters
+  def pipelineIDToWB[T <: Data](x: T): T =
+    RegEnable(RegEnable(RegEnable(x, !ctrl_killd), ex_pc_valid), mem_pc_valid)
+  val perfEvents = new EventSets(Seq(
+    new EventSet((mask, hits) => Mux(mask(0), wb_xcpt, wb_valid && pipelineIDToWB((mask & hits).orR)), Seq(
+      ("exception", () => false.B),
+      ("load", () => id_ctrl.mem && id_ctrl.mem_cmd === M_XRD && !id_ctrl.fp),
+      ("store", () => id_ctrl.mem && id_ctrl.mem_cmd === M_XWR && !id_ctrl.fp),
+      ("amo", () => Bool(usingAtomics) && id_ctrl.mem && (isAMO(id_ctrl.mem_cmd) || id_ctrl.mem_cmd.isOneOf(M_XLR, M_XSC))),
+      ("system", () => id_ctrl.csr =/= CSR.N),
+      ("arith", () => id_ctrl.wxd && !(id_ctrl.jal || id_ctrl.jalr || id_ctrl.mem || id_ctrl.fp || id_ctrl.div || id_ctrl.csr =/= CSR.N)),
+      ("branch", () => id_ctrl.branch),
+      ("jal", () => id_ctrl.jal),
+      ("jalr", () => id_ctrl.jalr))
+      ++ (if (!usingMulDiv) Seq() else Seq(
+        ("mul", () => id_ctrl.div && (id_ctrl.alu_fn & ALU.FN_DIV) =/= ALU.FN_DIV),
+        ("div", () => id_ctrl.div && (id_ctrl.alu_fn & ALU.FN_DIV) === ALU.FN_DIV)))
+      ++ (if (!usingFPU) Seq() else Seq(
+        ("fp load", () => id_ctrl.fp && io.fpu.dec.ldst && io.fpu.dec.wen),
+        ("fp store", () => id_ctrl.fp && io.fpu.dec.ldst && !io.fpu.dec.wen),
+        ("fp add", () => id_ctrl.fp && io.fpu.dec.fma && io.fpu.dec.swap23),
+        ("fp mul", () => id_ctrl.fp && io.fpu.dec.fma && !io.fpu.dec.swap23 && !io.fpu.dec.ren3),
+        ("fp mul-add", () => id_ctrl.fp && io.fpu.dec.fma && io.fpu.dec.ren3),
+        ("fp div/sqrt", () => id_ctrl.fp && (io.fpu.dec.div || io.fpu.dec.sqrt)),
+        ("fp other", () => id_ctrl.fp && !(io.fpu.dec.ldst || io.fpu.dec.fma || io.fpu.dec.div || io.fpu.dec.sqrt))))),
+    new EventSet((mask, hits) => (mask & hits).orR, Seq(
+      ("load-use interlock", () => id_ex_hazard && ex_ctrl.mem || id_mem_hazard && mem_ctrl.mem || id_wb_hazard && wb_ctrl.mem),
+      ("long-latency interlock", () => id_sboard_hazard),
+      ("csr interlock", () => id_ex_hazard && ex_ctrl.csr =/= CSR.N || id_mem_hazard && mem_ctrl.csr =/= CSR.N || id_wb_hazard && wb_ctrl.csr =/= CSR.N),
+      ("I$ blocked", () => !(ibuf.io.inst(0).valid || Reg(next = take_pc))),
+      ("D$ blocked", () => id_ctrl.mem && dcache_blocked),
+      ("branch misprediction", () => take_pc_mem && mem_direction_misprediction),
+      ("control-flow target misprediction", () => take_pc_mem && mem_misprediction && !mem_direction_misprediction),
+      ("flush", () => take_pc_mem && mem_reg_flush_pipe),
+      ("replay", () => replay_wb))
+      ++ (if (!usingMulDiv) Seq() else Seq(
+        ("mul/div interlock", () => id_ex_hazard && ex_ctrl.div || id_mem_hazard && mem_ctrl.div || id_wb_hazard && wb_ctrl.div)))
+      ++ (if (!usingFPU) Seq() else Seq(
+        ("fp interlock", () => id_ex_hazard && ex_ctrl.fp || id_mem_hazard && mem_ctrl.fp || id_wb_hazard && wb_ctrl.fp || id_ctrl.fp && id_stall_fpu)))),
+    new EventSet((mask, hits) => (mask & hits).orR, Seq(
+      ("I$ miss", () => io.imem.acquire),
+      ("D$ miss", () => io.dmem.acquire),
+      ("D$ release", () => io.dmem.release)))))
+
   val decode_table = {
     (if (usingMulDiv) new MDecode +: (xLen > 32).option(new M64Decode).toSeq else Nil) ++:
     (if (usingAtomics) new ADecode +: (xLen > 32).option(new A64Decode).toSeq else Nil) ++:
@@ -142,7 +184,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
   val id_npc = (ibuf.io.pc.asSInt + ImmGen(IMM_UJ, id_inst(0))).asUInt
   take_pc_id := Bool(fastJAL) && !ctrl_killd && id_ctrl.jal
 
-  val csr = Module(new CSRFile)
+  val csr = Module(new CSRFile(perfEvents))
   val id_csr_en = id_ctrl.csr.isOneOf(CSR.S, CSR.C, CSR.W)
   val id_system_insn = id_ctrl.csr >= CSR.I
   val id_csr_ren = id_ctrl.csr.isOneOf(CSR.S, CSR.C) && id_raddr1 === UInt(0)
@@ -292,6 +334,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
     (ex_reg_xcpt_interrupt || ex_reg_xcpt, ex_reg_cause)))
 
   // memory stage
+  val mem_pc_valid = mem_reg_valid || mem_reg_replay || mem_reg_xcpt_interrupt
   val mem_br_taken = mem_reg_wdata(0)
   val mem_br_target = mem_reg_pc.asSInt +
     Mux(mem_ctrl.branch && mem_br_taken, ImmGen(IMM_SB, mem_reg_inst),
@@ -303,6 +346,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
   val mem_int_wdata = Mux(!mem_reg_xcpt && (mem_ctrl.jalr ^ mem_npc_misaligned), mem_br_target, mem_reg_wdata.asSInt).asUInt
   val mem_cfi = mem_ctrl.branch || mem_ctrl.jalr || mem_ctrl.jal
   val mem_cfi_taken = (mem_ctrl.branch && mem_br_taken) || mem_ctrl.jalr || (Bool(!fastJAL) && mem_ctrl.jal)
+  val mem_direction_misprediction = mem_reg_btb_hit && mem_ctrl.branch && mem_br_taken =/= mem_reg_btb_resp.taken
   val mem_misprediction = if (usingBTB) mem_wrong_npc else mem_cfi_taken
   take_pc_mem := mem_reg_valid && (mem_misprediction || mem_reg_flush_pipe)
 
@@ -357,7 +401,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
   wb_reg_replay := replay_mem && !take_pc_wb
   wb_reg_xcpt := mem_xcpt && !take_pc_wb
   when (mem_xcpt) { wb_reg_cause := mem_cause }
-  when (mem_reg_valid || mem_reg_replay || mem_reg_xcpt_interrupt) {
+  when (mem_pc_valid) {
     wb_ctrl := mem_ctrl
     wb_reg_wdata := Mux(!mem_reg_xcpt && mem_ctrl.fp && mem_ctrl.wxd, io.fpu.toint_data, mem_int_wdata)
     when (mem_ctrl.rocc) {
@@ -556,6 +600,9 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
   io.rocc.cmd.bits.rs1 := wb_reg_wdata
   io.rocc.cmd.bits.rs2 := wb_reg_rs2
 
+  // evaluate performance counters
+  csr.io.counters foreach { c => c.inc := RegNext(perfEvents.evaluate(c.eventSel)) }
+
   if (enableCommitLog) {
     val pc = Wire(SInt(width=xLen))
     pc := wb_reg_pc
diff --git a/src/main/scala/util/Counters.scala b/src/main/scala/util/Counters.scala
index 2d12a13f..eb555f7a 100644
--- a/src/main/scala/util/Counters.scala
+++ b/src/main/scala/util/Counters.scala
@@ -50,7 +50,7 @@ case class WideCounter(width: Int, inc: UInt = UInt(1), reset: Boolean = true)
 
   private val large = if (isWide) {
     val r = if (reset) Reg(init=UInt(0, width - smallWidth)) else Reg(UInt(width = width - smallWidth))
-    when (nextSmall(smallWidth)) { r := r +& UInt(1) }
+    when (nextSmall(smallWidth)) { r := r + UInt(1) }
     r
   } else null
 
diff --git a/src/main/scala/util/Package.scala b/src/main/scala/util/Package.scala
index 5440fcf4..74b0aab2 100644
--- a/src/main/scala/util/Package.scala
+++ b/src/main/scala/util/Package.scala
@@ -43,6 +43,8 @@ package object util {
       if (hi == lo-1) UInt(0)
       else x(hi, lo)
     }
+
+    def inRange(base: UInt, bounds: UInt) = x >= base && x < bounds
   }
 
   implicit class BooleanToAugmentedBoolean(val x: Boolean) extends AnyVal {