1
0

Merge pull request #902 from freechipsproject/perf-improvements

Perf improvements
This commit is contained in:
Yunsup Lee 2017-07-28 20:12:10 -07:00 committed by GitHub
commit 140086e2c5
2 changed files with 44 additions and 20 deletions

View File

@ -104,11 +104,12 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
s1_req := io.cpu.req.bits s1_req := io.cpu.req.bits
s1_req.addr := Cat(io.cpu.req.bits.addr >> untagBits, metaArb.io.out.bits.idx, io.cpu.req.bits.addr(blockOffBits-1,0)) s1_req.addr := Cat(io.cpu.req.bits.addr >> untagBits, metaArb.io.out.bits.idx, io.cpu.req.bits.addr(blockOffBits-1,0))
} }
val s1_read = needsRead(s1_req) val s1_read = isRead(s1_req.cmd)
val s1_write = isWrite(s1_req.cmd) val s1_write = isWrite(s1_req.cmd)
val s1_readwrite = s1_read || s1_write val s1_readwrite = s1_read || s1_write
val s1_sfence = s1_req.cmd === M_SFENCE val s1_sfence = s1_req.cmd === M_SFENCE
val s1_flush_valid = Reg(Bool()) val s1_flush_valid = Reg(Bool())
val s1_waw_hazard = Wire(Bool())
val s_ready :: s_voluntary_writeback :: s_probe_rep_dirty :: s_probe_rep_clean :: s_probe_retry :: s_probe_rep_miss :: s_voluntary_write_meta :: s_probe_write_meta :: Nil = Enum(UInt(), 8) val s_ready :: s_voluntary_writeback :: s_probe_rep_dirty :: s_probe_rep_clean :: s_probe_retry :: s_probe_rep_miss :: s_voluntary_write_meta :: s_probe_write_meta :: Nil = Enum(UInt(), 8)
val cached_grant_wait = Reg(init=Bool(false)) val cached_grant_wait = Reg(init=Bool(false))
@ -125,13 +126,15 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
val uncachedReqs = Seq.fill(maxUncachedInFlight) { Reg(new HellaCacheReq) } val uncachedReqs = Seq.fill(maxUncachedInFlight) { Reg(new HellaCacheReq) }
// hit initiation path // hit initiation path
val s0_read = needsRead(io.cpu.req.bits) val s0_needsRead = needsRead(io.cpu.req.bits)
dataArb.io.in(3).valid := io.cpu.req.valid && s0_read val s0_read = isRead(io.cpu.req.bits.cmd)
dataArb.io.in(3).valid := io.cpu.req.valid && s0_needsRead
dataArb.io.in(3).bits.write := false dataArb.io.in(3).bits.write := false
dataArb.io.in(3).bits.addr := io.cpu.req.bits.addr dataArb.io.in(3).bits.addr := io.cpu.req.bits.addr
dataArb.io.in(3).bits.wordMask := UIntToOH(io.cpu.req.bits.addr.extract(rowOffBits-1,offsetlsb)) dataArb.io.in(3).bits.wordMask := UIntToOH(io.cpu.req.bits.addr.extract(rowOffBits-1,offsetlsb))
dataArb.io.in(3).bits.way_en := ~UInt(0, nWays) dataArb.io.in(3).bits.way_en := ~UInt(0, nWays)
when (!dataArb.io.in(3).ready && s0_read) { io.cpu.req.ready := false } when (!dataArb.io.in(3).ready && s0_read) { io.cpu.req.ready := false }
val s1_didntRead = RegEnable(s0_needsRead && !dataArb.io.in(3).ready, metaArb.io.out.valid && !metaArb.io.out.bits.write)
metaArb.io.in(7).valid := io.cpu.req.valid metaArb.io.in(7).valid := io.cpu.req.valid
metaArb.io.in(7).bits.write := false metaArb.io.in(7).bits.write := false
metaArb.io.in(7).bits.idx := io.cpu.req.bits.addr(idxMSB, idxLSB) metaArb.io.in(7).bits.idx := io.cpu.req.bits.addr(idxMSB, idxLSB)
@ -153,7 +156,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
tlb.io.req.bits.instruction := false tlb.io.req.bits.instruction := false
tlb.io.req.bits.size := s1_req.typ tlb.io.req.bits.size := s1_req.typ
tlb.io.req.bits.cmd := s1_req.cmd tlb.io.req.bits.cmd := s1_req.cmd
when (!tlb.io.req.ready && !io.cpu.req.bits.phys) { io.cpu.req.ready := false } when (!tlb.io.req.ready && !tlb.io.ptw.resp.valid && !io.cpu.req.bits.phys) { io.cpu.req.ready := false }
when (s1_valid && s1_readwrite && tlb.io.resp.miss) { s1_nack := true } when (s1_valid && s1_readwrite && tlb.io.resp.miss) { s1_nack := true }
val s1_paddr = tlb.io.resp.paddr val s1_paddr = tlb.io.resp.paddr
@ -212,6 +215,8 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
val s2_probe_state = RegEnable(s1_hit_state, s1_probe) val s2_probe_state = RegEnable(s1_hit_state, s1_probe)
val s2_hit_way = RegEnable(s1_hit_way, s1_valid_not_nacked) val s2_hit_way = RegEnable(s1_hit_way, s1_valid_not_nacked)
val s2_hit_state = RegEnable(s1_hit_state, s1_valid_not_nacked) val s2_hit_state = RegEnable(s1_hit_state, s1_valid_not_nacked)
val s2_waw_hazard = RegEnable(s1_waw_hazard, s1_valid_not_nacked)
val s2_store_merge = Wire(Bool())
val s2_hit_valid = s2_hit_state.isValid() val s2_hit_valid = s2_hit_state.isValid()
val (s2_hit, s2_grow_param, s2_new_hit_state) = s2_hit_state.onAccess(s2_req.cmd) val (s2_hit, s2_grow_param, s2_new_hit_state) = s2_hit_state.onAccess(s2_req.cmd)
val s2_data_decoded = decodeData(s2_data) val s2_data_decoded = decodeData(s2_data)
@ -221,8 +226,8 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
val s2_data_uncorrected = (s2_data_decoded.map(_.uncorrected): Seq[UInt]).asUInt val s2_data_uncorrected = (s2_data_decoded.map(_.uncorrected): Seq[UInt]).asUInt
val s2_valid_hit_pre_data_ecc = s2_valid_masked && s2_readwrite && !s2_meta_error && s2_hit val s2_valid_hit_pre_data_ecc = s2_valid_masked && s2_readwrite && !s2_meta_error && s2_hit
val s2_valid_data_error = s2_valid_hit_pre_data_ecc && s2_data_error val s2_valid_data_error = s2_valid_hit_pre_data_ecc && s2_data_error
val s2_valid_hit = s2_valid_hit_pre_data_ecc && !s2_data_error val s2_valid_hit = s2_valid_hit_pre_data_ecc && !s2_data_error && (!s2_waw_hazard || s2_store_merge)
val s2_valid_miss = s2_valid_masked && s2_readwrite && !s2_meta_error && !s2_hit && !any_pstore_valid && !release_ack_wait val s2_valid_miss = s2_valid_masked && s2_readwrite && !s2_meta_error && !s2_hit && !release_ack_wait
val s2_valid_cached_miss = s2_valid_miss && !s2_uncached && !uncachedInFlight.asUInt.orR val s2_valid_cached_miss = s2_valid_miss && !s2_uncached && !uncachedInFlight.asUInt.orR
val s2_victimize = Bool(!usingDataScratchpad) && (s2_valid_cached_miss || s2_valid_data_error || s2_flush_valid) val s2_victimize = Bool(!usingDataScratchpad) && (s2_valid_cached_miss || s2_valid_data_error || s2_flush_valid)
val s2_valid_uncached = s2_valid_miss && s2_uncached val s2_valid_uncached = s2_valid_miss && s2_uncached
@ -274,29 +279,45 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
val pstore1_way = RegEnable(s1_hit_way, s1_valid_not_nacked && s1_write) val pstore1_way = RegEnable(s1_hit_way, s1_valid_not_nacked && s1_write)
val pstore1_mask = RegEnable(s1_mask, s1_valid_not_nacked && s1_write) val pstore1_mask = RegEnable(s1_mask, s1_valid_not_nacked && s1_write)
val pstore1_storegen_data = Wire(init = pstore1_data) val pstore1_storegen_data = Wire(init = pstore1_data)
val pstore1_rmw = Bool(usingRMW) && RegEnable(s1_read, s1_valid_not_nacked && s1_write) val pstore1_rmw = Bool(usingRMW) && RegEnable(needsRead(s1_req), s1_valid_not_nacked && s1_write)
val pstore1_valid = Wire(Bool()) val pstore1_valid = Wire(Bool())
val pstore1_merge = pstore1_valid && s2_store_merge
val pstore2_valid = Reg(Bool()) val pstore2_valid = Reg(Bool())
any_pstore_valid := pstore1_valid || pstore2_valid any_pstore_valid := pstore1_valid || pstore2_valid
val pstore_drain_structural = pstore1_valid && pstore2_valid && ((s1_valid && s1_write) || pstore1_rmw) val pstore_drain_structural = pstore1_valid && pstore2_valid && ((s1_valid && s1_write) || pstore1_rmw)
val pstore_drain_opportunistic = !(io.cpu.req.valid && s0_read) val pstore_drain_opportunistic = !(io.cpu.req.valid && s0_needsRead)
val pstore_drain_on_miss = releaseInFlight || io.cpu.s2_nack val pstore_drain_on_miss = releaseInFlight || io.cpu.s2_nack
val pstore_drain = val pstore_drain = !pstore1_merge &&
Bool(usingRMW) && pstore_drain_structural || (Bool(usingRMW) && pstore_drain_structural ||
(((pstore1_valid && !pstore1_rmw) || pstore2_valid) && (pstore_drain_opportunistic || pstore_drain_on_miss)) (((pstore1_valid && !pstore1_rmw) || pstore2_valid) && (pstore_drain_opportunistic || pstore_drain_on_miss)))
pstore1_valid := { pstore1_valid := {
val s2_store_valid = s2_valid_hit && s2_write && !s2_sc_fail val s2_store_valid = s2_valid_hit && s2_write && !s2_sc_fail
val pstore1_held = Reg(Bool()) val pstore1_held = Reg(Bool())
assert(!s2_store_valid || !pstore1_held) assert(!s2_store_valid || !pstore1_held)
pstore1_held := (s2_store_valid || pstore1_held) && pstore2_valid && !pstore_drain pstore1_held := (s2_store_valid && !s2_store_merge || pstore1_held) && pstore2_valid && !pstore_drain
s2_store_valid || pstore1_held s2_store_valid || pstore1_held
} }
val advance_pstore1 = (pstore1_valid || s2_valid_correct) && (pstore2_valid === pstore_drain) val advance_pstore1 = (pstore1_valid || s2_valid_correct) && (pstore2_valid === pstore_drain)
pstore2_valid := pstore2_valid && !pstore_drain || advance_pstore1 pstore2_valid := pstore2_valid && !pstore_drain || advance_pstore1
val pstore2_addr = RegEnable(Mux(s2_correct, s2_req.addr, pstore1_addr), advance_pstore1) val pstore2_addr = RegEnable(Mux(s2_correct, s2_req.addr, pstore1_addr), advance_pstore1)
val pstore2_way = RegEnable(Mux(s2_correct, s2_hit_way, pstore1_way), advance_pstore1) val pstore2_way = RegEnable(Mux(s2_correct, s2_hit_way, pstore1_way), advance_pstore1)
val pstore2_storegen_data = RegEnable(pstore1_storegen_data, advance_pstore1) s2_store_merge := {
val pstore2_storegen_mask = RegEnable(~Mux(s2_correct, 0.U, ~pstore1_mask), advance_pstore1) val idxMatch = s2_req.addr(untagBits-1, log2Ceil(wordBytes)) === pstore2_addr(untagBits-1, log2Ceil(wordBytes))
val tagMatch = (s2_hit_way & pstore2_way).orR
Bool(eccBytes > 1) && pstore2_valid && idxMatch && tagMatch
}
val pstore2_storegen_data = {
for (i <- 0 until wordBytes)
yield RegEnable(pstore1_storegen_data(8*(i+1)-1, 8*i), advance_pstore1 || pstore1_merge && pstore1_mask(i))
}.asUInt
val pstore2_storegen_mask = {
val mask = Reg(UInt(width = wordBytes))
when (advance_pstore1 || pstore1_merge) {
val mergedMask = pstore1_mask | Mux(pstore1_merge, mask, 0.U)
mask := ~Mux(s2_correct, 0.U, ~mergedMask)
}
mask
}
dataArb.io.in(0).valid := pstore_drain dataArb.io.in(0).valid := pstore_drain
dataArb.io.in(0).bits.write := true dataArb.io.in(0).bits.write := true
dataArb.io.in(0).bits.addr := Mux(pstore2_valid, pstore2_addr, pstore1_addr) dataArb.io.in(0).bits.addr := Mux(pstore2_valid, pstore2_addr, pstore1_addr)
@ -309,9 +330,11 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
def s1Depends(addr: UInt, mask: UInt) = def s1Depends(addr: UInt, mask: UInt) =
addr(idxMSB, wordOffBits) === s1_req.addr(idxMSB, wordOffBits) && addr(idxMSB, wordOffBits) === s1_req.addr(idxMSB, wordOffBits) &&
Mux(s1_write, (eccByteMask(mask) & eccByteMask(s1_mask)).orR, (mask & s1_mask).orR) Mux(s1_write, (eccByteMask(mask) & eccByteMask(s1_mask)).orR, (mask & s1_mask).orR)
val s1_raw_hazard = s1_read && val s1_hazard =
((pstore1_valid && s1Depends(pstore1_addr, pstore1_mask)) || (pstore1_valid && s1Depends(pstore1_addr, pstore1_mask)) ||
(pstore2_valid && s1Depends(pstore2_addr, pstore2_storegen_mask))) (pstore2_valid && s1Depends(pstore2_addr, pstore2_storegen_mask))
val s1_raw_hazard = s1_read && s1_hazard
s1_waw_hazard := Bool(eccBytes > 1) && s1_write && (s1_hazard || s1_didntRead)
when (s1_valid && s1_raw_hazard) { s1_nack := true } when (s1_valid && s1_raw_hazard) { s1_nack := true }
// Prepare a TileLink request message that initiates a transaction // Prepare a TileLink request message that initiates a transaction

View File

@ -90,10 +90,10 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
("load-use interlock", () => id_ex_hazard && ex_ctrl.mem || id_mem_hazard && mem_ctrl.mem || id_wb_hazard && wb_ctrl.mem), ("load-use interlock", () => id_ex_hazard && ex_ctrl.mem || id_mem_hazard && mem_ctrl.mem || id_wb_hazard && wb_ctrl.mem),
("long-latency interlock", () => id_sboard_hazard), ("long-latency interlock", () => id_sboard_hazard),
("csr interlock", () => id_ex_hazard && ex_ctrl.csr =/= CSR.N || id_mem_hazard && mem_ctrl.csr =/= CSR.N || id_wb_hazard && wb_ctrl.csr =/= CSR.N), ("csr interlock", () => id_ex_hazard && ex_ctrl.csr =/= CSR.N || id_mem_hazard && mem_ctrl.csr =/= CSR.N || id_wb_hazard && wb_ctrl.csr =/= CSR.N),
("I$ blocked", () => !(ibuf.io.inst(0).valid || Reg(next = take_pc))), ("I$ blocked", () => icache_blocked),
("D$ blocked", () => id_ctrl.mem && dcache_blocked), ("D$ blocked", () => id_ctrl.mem && dcache_blocked),
("branch misprediction", () => take_pc_mem && mem_direction_misprediction), ("branch misprediction", () => take_pc_mem && mem_direction_misprediction),
("control-flow target misprediction", () => take_pc_mem && mem_misprediction && !mem_direction_misprediction), ("control-flow target misprediction", () => take_pc_mem && mem_misprediction && mem_cfi && !mem_direction_misprediction && !icache_blocked),
("flush", () => wb_reg_flush_pipe), ("flush", () => wb_reg_flush_pipe),
("replay", () => replay_wb)) ("replay", () => replay_wb))
++ (if (!usingMulDiv) Seq() else Seq( ++ (if (!usingMulDiv) Seq() else Seq(
@ -593,7 +593,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
ibuf.io.inst(0).ready := !ctrl_stalld || csr.io.interrupt ibuf.io.inst(0).ready := !ctrl_stalld || csr.io.interrupt
io.imem.btb_update.valid := (mem_reg_replay && mem_reg_btb_hit) || (mem_reg_valid && !take_pc_wb && mem_misprediction) io.imem.btb_update.valid := (mem_reg_replay && mem_reg_btb_hit) || (mem_reg_valid && !take_pc_wb && mem_wrong_npc && (!mem_cfi || mem_cfi_taken))
io.imem.btb_update.bits.isValid := !mem_reg_replay && mem_cfi io.imem.btb_update.bits.isValid := !mem_reg_replay && mem_cfi
io.imem.btb_update.bits.cfiType := io.imem.btb_update.bits.cfiType :=
Mux((mem_ctrl.jal || mem_ctrl.jalr) && mem_waddr(0), CFIType.call, Mux((mem_ctrl.jal || mem_ctrl.jalr) && mem_waddr(0), CFIType.call,
@ -642,6 +642,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p)
io.rocc.cmd.bits.rs2 := wb_reg_rs2 io.rocc.cmd.bits.rs2 := wb_reg_rs2
// evaluate performance counters // evaluate performance counters
val icache_blocked = !(io.imem.resp.valid || RegNext(io.imem.resp.valid))
csr.io.counters foreach { c => c.inc := RegNext(perfEvents.evaluate(c.eventSel)) } csr.io.counters foreach { c => c.inc := RegNext(perfEvents.evaluate(c.eventSel)) }
if (enableCommitLog) { if (enableCommitLog) {