From f49172b5bc052622202ce5254d2ae38227a5773a Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sat, 29 Apr 2017 16:47:49 -0700 Subject: [PATCH 1/5] ScratchpadSlavePort doesn't support byte/halfword atomics --- src/main/scala/rocket/ScratchpadSlavePort.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/rocket/ScratchpadSlavePort.scala b/src/main/scala/rocket/ScratchpadSlavePort.scala index a96fd248..dc456557 100644 --- a/src/main/scala/rocket/ScratchpadSlavePort.scala +++ b/src/main/scala/rocket/ScratchpadSlavePort.scala @@ -22,8 +22,8 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L resources = device.reg, regionType = RegionType.UNCACHED, executable = true, - supportsArithmetic = if (usingAtomics) TransferSizes(1, coreDataBytes) else TransferSizes.none, - supportsLogical = if (usingAtomics) TransferSizes(1, coreDataBytes) else TransferSizes.none, + supportsArithmetic = if (usingAtomics) TransferSizes(4, coreDataBytes) else TransferSizes.none, + supportsLogical = if (usingAtomics) TransferSizes(4, coreDataBytes) else TransferSizes.none, supportsPutPartial = TransferSizes.none, // Can't support PutPartial supportsPutFull = TransferSizes(1, coreDataBytes), supportsGet = TransferSizes(1, coreDataBytes), From 044b6ed3f92439ddee20fb4cd78eae0e2eda3f5b Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Sun, 30 Apr 2017 02:22:19 -0700 Subject: [PATCH 2/5] Improve logical ops in AMOALU As with integer ALU, shave off some muxing. --- src/main/scala/uncore/util/AmoAlu.scala | 31 +++++++++++++++++-------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/main/scala/uncore/util/AmoAlu.scala b/src/main/scala/uncore/util/AmoAlu.scala index c51f4ab3..b36c4446 100644 --- a/src/main/scala/uncore/util/AmoAlu.scala +++ b/src/main/scala/uncore/util/AmoAlu.scala @@ -70,9 +70,11 @@ class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parame else new StoreGen(io.typ, io.addr, io.rhs, operandBits/8) val rhs = storegen.wordData - val sgned = io.cmd === M_XA_MIN || io.cmd === M_XA_MAX val max = io.cmd === M_XA_MAX || io.cmd === M_XA_MAXU val min = io.cmd === M_XA_MIN || io.cmd === M_XA_MINU + val add = io.cmd === M_XA_ADD + val logic_and = io.cmd === M_XA_OR || io.cmd === M_XA_AND + val logic_xor = io.cmd === M_XA_XOR || io.cmd === M_XA_OR val adder_out = if (operandBits == 32) io.lhs + rhs @@ -81,9 +83,15 @@ class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parame (io.lhs & mask) + (rhs & mask) } - val less = - if (operandBits == 32) Mux(io.lhs(31) === rhs(31), io.lhs < rhs, Mux(sgned, io.lhs(31), io.rhs(31))) - else { + val less = { + val sgned = { + val mask = M_XA_MIN ^ M_XA_MINU + (io.cmd & mask) === (M_XA_MIN & mask) + } + + if (operandBits == 32) { + Mux(io.lhs(31) === rhs(31), io.lhs < rhs, Mux(sgned, io.lhs(31), io.rhs(31))) + } else { val word = !io.typ(0) val cmp_lhs = Mux(word && !io.addr(2), io.lhs(31), io.lhs(63)) val cmp_rhs = Mux(word && !io.addr(2), rhs(31), rhs(63)) @@ -93,13 +101,16 @@ class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parame val lt = Mux(word, Mux(io.addr(2), lt_hi, lt_lo), lt_hi || eq_hi && lt_lo) Mux(cmp_lhs === cmp_rhs, lt, Mux(sgned, cmp_lhs, cmp_rhs)) } + } - val out = Mux(io.cmd === M_XA_ADD, adder_out, - Mux(io.cmd === M_XA_AND, io.lhs & rhs, - Mux(io.cmd === M_XA_OR, io.lhs | rhs, - Mux(io.cmd === M_XA_XOR, io.lhs ^ rhs, - Mux(Mux(less, min, max), io.lhs, - storegen.data))))) + val minmax = Mux(Mux(less, min, 
max), io.lhs, storegen.data) + val logic = + Mux(logic_and, io.lhs & rhs, 0.U) | + Mux(logic_xor, io.lhs ^ rhs, 0.U) + val out = + Mux(add, adder_out, + Mux(logic_and || logic_xor, logic, + minmax)) val wmask = FillInterleaved(8, storegen.mask) io.out := wmask & out | ~wmask & io.lhs From f8151ce786d63947c80eb13ceed29be1acd8decd Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Mon, 1 May 2017 17:36:39 -0700 Subject: [PATCH 3/5] Remove subword load muxing in ScratchpadSlavePort --- src/main/scala/rocket/DCache.scala | 1 + src/main/scala/rocket/HellaCache.scala | 1 + src/main/scala/rocket/NBDcache.scala | 1 + src/main/scala/rocket/ScratchpadSlavePort.scala | 10 ++-------- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index 67163a6f..5562d769 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -517,6 +517,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val loadgen = new LoadGen(s2_req.typ, mtSigned(s2_req.typ), s2_req.addr, s2_data_word, s2_sc, wordBytes) io.cpu.resp.bits.data := loadgen.data | s2_sc_fail io.cpu.resp.bits.data_word_bypass := loadgen.wordData + io.cpu.resp.bits.data_raw := s2_data_word io.cpu.resp.bits.store_data := pstore1_data // AMOs diff --git a/src/main/scala/rocket/HellaCache.scala b/src/main/scala/rocket/HellaCache.scala index 70072321..f50bfa0b 100644 --- a/src/main/scala/rocket/HellaCache.scala +++ b/src/main/scala/rocket/HellaCache.scala @@ -103,6 +103,7 @@ class HellaCacheResp(implicit p: Parameters) extends CoreBundle()(p) val replay = Bool() val has_data = Bool() val data_word_bypass = Bits(width = coreDataBits) + val data_raw = Bits(width = coreDataBits) val store_data = Bits(width = coreDataBits) } diff --git a/src/main/scala/rocket/NBDcache.scala b/src/main/scala/rocket/NBDcache.scala index ebb287f2..5ca57d76 100644 --- a/src/main/scala/rocket/NBDcache.scala +++ b/src/main/scala/rocket/NBDcache.scala @@ -972,6 +972,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule io.cpu.s2_nack := s2_valid && s2_nack io.cpu.resp := Mux(mshrs.io.resp.ready, uncache_resp, cache_resp) io.cpu.resp.bits.data_word_bypass := loadgen.wordData + io.cpu.resp.bits.data_raw := s2_data_word io.cpu.ordered := mshrs.io.fence_rdy && !s1_valid && !s2_valid io.cpu.replay_next := (s1_replay && s1_read) || mshrs.io.replay_next diff --git a/src/main/scala/rocket/ScratchpadSlavePort.scala b/src/main/scala/rocket/ScratchpadSlavePort.scala index dc456557..76b8ea10 100644 --- a/src/main/scala/rocket/ScratchpadSlavePort.scala +++ b/src/main/scala/rocket/ScratchpadSlavePort.scala @@ -48,7 +48,7 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L when (io.dmem.req.fire()) { state := s_wait } val acq = Reg(tl_in.a.bits) - when (io.dmem.resp.valid) { acq.data := io.dmem.resp.bits.data } + when (io.dmem.resp.valid) { acq.data := io.dmem.resp.bits.data_raw } when (tl_in.a.fire()) { acq := tl_in.a.bits } def formCacheReq(a: TLBundleA) = { @@ -85,17 +85,11 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L io.dmem.s1_kill := false io.dmem.invalidate_lr := false - // place AMO data in correct word lane - val minAMOBytes = 4 - val grantData = Mux(io.dmem.resp.valid, io.dmem.resp.bits.data, acq.data) - val alignedGrantData = - Mux(edge.hasData(acq) && (acq.size <= log2Ceil(minAMOBytes)), Fill(coreDataBytes/minAMOBytes, grantData(8*minAMOBytes-1, 0)), 
grantData) - tl_in.d.valid := io.dmem.resp.valid || state === s_grant tl_in.d.bits := Mux(acq.opcode === TLMessages.PutFullData, edge.AccessAck(acq, UInt(0)), edge.AccessAck(acq, UInt(0), UInt(0))) - tl_in.d.bits.data := alignedGrantData + tl_in.d.bits.data := Mux(io.dmem.resp.valid, io.dmem.resp.bits.data_raw, acq.data) // Tie off unused channels tl_in.b.valid := Bool(false) From 938b089543c3b27d50bd13e41be82363b4e423ae Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Tue, 2 May 2017 01:59:47 -0700 Subject: [PATCH 4/5] Remove legacy devices that use AMOALU I'm going to change the AMOALU API, and so I'm removing dependent dead code. --- src/main/scala/uncore/converters/Ahb.scala | 423 ------------------ .../scala/uncore/converters/Tilelink.scala | 181 -------- src/main/scala/uncore/devices/Bram.scala | 187 -------- 3 files changed, 791 deletions(-) delete mode 100644 src/main/scala/uncore/converters/Ahb.scala delete mode 100644 src/main/scala/uncore/devices/Bram.scala diff --git a/src/main/scala/uncore/converters/Ahb.scala b/src/main/scala/uncore/converters/Ahb.scala deleted file mode 100644 index 6d98af6d..00000000 --- a/src/main/scala/uncore/converters/Ahb.scala +++ /dev/null @@ -1,423 +0,0 @@ -// See LICENSE.SiFive for license details. - -package uncore.converters - -import Chisel._ -import junctions._ -import uncore.tilelink._ -import uncore.util._ -import uncore.constants._ -import config._ -import HastiConstants._ - -/* We need to translate TileLink requests into operations we can actually execute on AHB. - * The general plan of attack is: - * get => one AHB=>TL read - * put => [multiple AHB write fragments=>nill], one AHB write=>TL - * getBlock => AHB burst reads =>TL - * putBlock => AHB burst writes=>TL - * getPrefetch => noop=>TL - * putPrefetch => noop=>TL - * putAtomic => one AHB=>TL read, one idle, one AHB atom_write=>nill, one idle - * - * This requires that we support a pipeline of optional AHB requests with optional TL responses - */ -class AHBRequestIO(implicit p: Parameters) extends HastiMasterIO - with HasGrantType - with HasClientTransactionId - with HasTileLinkBeatId { - val executeAHB = Bool() - val respondTL = Bool() - val latchAtom = Bool() - val firstBurst = Bool() - val finalBurst = Bool() - val cmd = Bits(width = M_SZ) // atomic op -} - -// AHB stage1: translate TileLink Acquires into AHBRequests -class AHBTileLinkIn(supportAtomics: Boolean = false)(implicit val p: Parameters) extends Module - with HasHastiParameters - with HasTileLinkParameters { - val io = new Bundle { - val acquire = new DecoupledIO(new Acquire).flip // NOTE: acquire must be either a Queue or a Pipe - val request = new DecoupledIO(new AHBRequestIO) - } - - // Match the AHB burst with a TileLink {Put,Get}Block - val burstSize = tlDataBeats match { - case 1 => HBURST_SINGLE - // case 2 not supported by AHB - case 4 => HBURST_WRAP4 - case 8 => HBURST_WRAP8 - case 16 => HBURST_WRAP16 - case _ => throw new java.lang.AssertionError("TileLink beats unsupported by AHB") - } - - // Bursts start at 0 and wrap-around back to 0 - val finalBurst = UInt(tlDataBeats-1, width = log2Up(tlDataBeats)).asUInt - val firstBurst = UInt(0, width = log2Up(tlDataBeats)) - val next_wmask = Wire(UInt(width = tlDataBytes)) // calculated below - - // State variables for processing more complicated TileLink Acquires - val s_atom_r :: s_atom_idle1 :: s_atom_w :: s_atom_idle2 :: Nil = Enum(UInt(), 4) - val atom_state = Reg(init = s_atom_r) // never changes if !supportAtomics - val done_wmask = Reg(init = UInt(0, width = 
tlDataBytes)) - val burst = Reg(init = firstBurst) - - // Grab some view of the TileLink acquire - val acq_wmask = io.acquire.bits.wmask() - val isReadBurst = io.acquire.bits.is(Acquire.getBlockType) - val isWriteBurst = io.acquire.bits.is(Acquire.putBlockType) - val isBurst = isWriteBurst || isReadBurst - val isAtomic = io.acquire.bits.is(Acquire.putAtomicType) && Bool(supportAtomics) - val isPut = io.acquire.bits.is(Acquire.putType) - - // Final states? - val last_wmask = next_wmask === acq_wmask - val last_atom = atom_state === s_atom_idle2 - val last_burst = burst === finalBurst - - // Block the incoming request until we've fully consumed it - // NOTE: the outgoing grant.valid may happen while acquire.ready is still false; - // for this reason it is essential to have a Queue or a Pipe infront of acquire - io.acquire.ready := io.request.ready && MuxLookup(io.acquire.bits.a_type, Bool(true), Array( - Acquire.getType -> Bool(true), - Acquire.getBlockType -> last_burst, // hold it until the last beat is burst - Acquire.putType -> last_wmask, // only accept the put if we can fully consume its wmask - Acquire.putBlockType -> Bool(true), - Acquire.putAtomicType -> last_atom, // atomic operation stages complete - Acquire.getPrefetchType -> Bool(true), - Acquire.putPrefetchType -> Bool(true))) - - // Advance the fragment state - when (io.request.ready && io.acquire.valid && isPut) { - when (last_wmask) { // if this was the last fragment, restart FSM - done_wmask := UInt(0) - } .otherwise { - done_wmask := next_wmask - } - } - - // Advance the burst state - // We assume here that TileLink gives us all putBlock beats with nothing between them - when (io.request.ready && io.acquire.valid && isBurst) { - when (last_burst) { - burst := UInt(0) - } .otherwise { - burst := burst + UInt(1) - } - } - - // Advance the atomic state machine - when (io.request.ready && io.acquire.valid && isAtomic) { - switch (atom_state) { - is (s_atom_r) { atom_state := s_atom_idle1 } - is (s_atom_idle1) { atom_state := s_atom_w } // idle1 => AMOALU runs on a different clock than AHB slave read - is (s_atom_w) { atom_state := s_atom_idle2 } - is (s_atom_idle2) { atom_state := s_atom_r } // idle2 state is required by AHB after hmastlock is lowered - } - } - - // Returns (range=0, range=-1, aligned_wmask, size) - def mask_helper(in_0 : Bool, range : UInt): (Bool, Bool, UInt, UInt) = { - val len = range.getWidth - if (len == 1) { - (range === UInt(0), range === UInt(1), in_0.asUInt() & range, UInt(0)) - } else { - val mid = len / 2 - val lo = range(mid-1, 0) - val hi = range(len-1, mid) - val (lo_0, lo_1, lo_m, lo_s) = mask_helper(in_0, lo) - val (hi_0, hi_1, hi_m, hi_s) = mask_helper(in_0 && lo_0, hi) - val out_0 = lo_0 && hi_0 - val out_1 = lo_1 && hi_1 - val out_m = Cat(hi_m, lo_m) | Fill(len, (in_0 && out_1).asUInt()) - val out_s = Mux(out_1, UInt(log2Up(len)), Mux(lo_0, hi_s, lo_s)) - (out_0, out_1, out_m, out_s) - } - } - - val pending_wmask = acq_wmask & ~done_wmask - val put_addr = PriorityEncoder(pending_wmask) - val (wmask_0, _, exec_wmask, put_size) = mask_helper(Bool(true), pending_wmask) - next_wmask := done_wmask | exec_wmask - - // Calculate the address, with consideration to put fragments and bursts - val addr_block = io.acquire.bits.addr_block - val addr_beatin= io.acquire.bits.addr_beat - val addr_burst = Mux(isReadBurst, addr_beatin + burst, addr_beatin) - val addr_byte = Mux(isPut, put_addr, io.acquire.bits.addr_byte()) - val addr_beat = Mux(isWriteBurst, UInt(0), addr_burst) - val ahbAddr = 
Cat(addr_block, addr_burst, addr_byte) - val ahbSize = Mux(isPut, put_size, Mux(isBurst, UInt(log2Ceil(tlDataBytes)), io.acquire.bits.op_size())) - - val ahbBurst = MuxLookup(io.acquire.bits.a_type, HBURST_SINGLE, Array( - Acquire.getType -> HBURST_SINGLE, - Acquire.getBlockType -> burstSize, - Acquire.putType -> HBURST_SINGLE, - Acquire.putBlockType -> burstSize, - Acquire.putAtomicType -> HBURST_SINGLE, - Acquire.getPrefetchType -> HBURST_SINGLE, - Acquire.putPrefetchType -> HBURST_SINGLE)) - - val ahbWrite = MuxLookup(io.acquire.bits.a_type, Bool(false), Array( - Acquire.getType -> Bool(false), - Acquire.getBlockType -> Bool(false), - Acquire.putType -> Bool(true), - Acquire.putBlockType -> Bool(true), - Acquire.putAtomicType -> MuxLookup(atom_state, Bool(false), Array( - s_atom_r -> Bool(false), - s_atom_idle1 -> Bool(false), // don't care - s_atom_w -> Bool(true), - s_atom_idle2 -> Bool(true))), // don't care - Acquire.getPrefetchType -> Bool(false), // don't care - Acquire.putPrefetchType -> Bool(true))) // don't care - - val ahbExecute = MuxLookup(io.acquire.bits.a_type, Bool(false), Array( - Acquire.getType -> Bool(true), - Acquire.getBlockType -> Bool(true), - Acquire.putType -> !wmask_0, // handle the case of a Put with no bytes! - Acquire.putBlockType -> Bool(true), - Acquire.putAtomicType -> MuxLookup(atom_state, Bool(false), Array( - s_atom_r -> Bool(true), - s_atom_idle1 -> Bool(false), - s_atom_w -> Bool(true), - s_atom_idle2 -> Bool(false))), - Acquire.getPrefetchType -> Bool(false), - Acquire.putPrefetchType -> Bool(false))) - - val respondTL = MuxLookup(io.acquire.bits.a_type, Bool(false), Array( - Acquire.getType -> Bool(true), - Acquire.getBlockType -> Bool(true), - Acquire.putType -> last_wmask, - Acquire.putBlockType -> last_burst, - Acquire.putAtomicType -> MuxLookup(atom_state, Bool(false), Array( - s_atom_r -> Bool(true), // they want the old data - s_atom_idle1 -> Bool(false), - s_atom_w -> Bool(false), - s_atom_idle2 -> Bool(false))), - Acquire.getPrefetchType -> Bool(true), - Acquire.putPrefetchType -> Bool(true))) - - io.request.valid := io.acquire.valid - io.request.bits.htrans := HTRANS_IDLE // unused/ignored - io.request.bits.haddr := ahbAddr - io.request.bits.hmastlock := isAtomic && atom_state =/= s_atom_idle2 - io.request.bits.hwrite := ahbWrite - io.request.bits.hburst := ahbBurst - io.request.bits.hsize := ahbSize - io.request.bits.hprot := HPROT_DATA | HPROT_PRIVILEGED - io.request.bits.hwdata := io.acquire.bits.data - io.request.bits.executeAHB := ahbExecute - io.request.bits.respondTL := respondTL - io.request.bits.latchAtom := isAtomic && atom_state === s_atom_r - io.request.bits.firstBurst := burst === firstBurst - io.request.bits.finalBurst := burst === finalBurst || !isBurst - io.request.bits.cmd := io.acquire.bits.op_code() - io.request.bits.is_builtin_type := Bool(true) - io.request.bits.g_type := io.acquire.bits.getBuiltInGrantType() - io.request.bits.client_xact_id := io.acquire.bits.client_xact_id - io.request.bits.addr_beat := addr_beat - - val debugBurst = Reg(UInt()) - when (io.request.valid) { - debugBurst := addr_burst - burst - } - - // We only support built-in TileLink requests - assert(!io.acquire.valid || io.acquire.bits.is_builtin_type, "AHB bridge only supports builtin TileLink types") - // Ensure alignment of address to size - assert(!io.acquire.valid || (ahbAddr & ((UInt(1) << ahbSize) - UInt(1))) === UInt(0), "TileLink operation misaligned") - // If this is a putBlock, make sure it moves properly - assert(!io.acquire.valid || 
!isBurst || burst === firstBurst || debugBurst === addr_burst - burst, "TileLink putBlock beats not sequential") - // We better not get an incomplete TileLink acquire - assert(!io.acquire.valid || isBurst || burst === firstBurst, "TileLink never completed a putBlock") - // If we disabled atomic support, we better not see a request - assert(!io.acquire.bits.is(Acquire.putAtomicType) || Bool(supportAtomics)) -} - -// AHB stage2: execute AHBRequests -class AHBBusMaster(supportAtomics: Boolean = false)(implicit val p: Parameters) extends Module - with HasHastiParameters - with HasTileLinkParameters { - val io = new Bundle { - val request = new DecoupledIO(new AHBRequestIO).flip - val grant = new DecoupledIO(new Grant) - val ahb = new HastiMasterIO() - } - - // All AHB outputs are registered (they might be IOs) - val midBurst = Reg(init = Bool(false)) - val htrans = Reg(init = HTRANS_IDLE) - val haddr = Reg(UInt()) - val hmastlock = Reg(init = Bool(false)) - val hwrite = Reg(Bool()) - val hburst = Reg(UInt()) - val hsize = Reg(init = UInt(0, width = SZ_HSIZE)) - val hprot = Reg(UInt()) - val hwdata0 = Reg(Bits()) - val hwdata1 = Reg(Bits()) - val hrdata = Reg(Bits()) - - io.ahb.htrans := htrans - io.ahb.haddr := haddr - io.ahb.hmastlock := hmastlock - io.ahb.hwrite := hwrite - io.ahb.hburst := hburst - io.ahb.hsize := hsize - io.ahb.hprot := hprot - io.ahb.hwdata := hwdata1 // one cycle after the address phase - - // TileLink response data needed in data phase - val respondTL0 = Reg(init = Bool(false)) - val respondTL1 = Reg(init = Bool(false)) - val latchAtom0 = Reg(init = Bool(false)) - val latchAtom1 = Reg(init = Bool(false)) - val executeAHB0 = Reg(init = Bool(false)) - val executeAHB1 = Reg(init = Bool(false)) - val bubble = Reg(init = Bool(true)) // nothing useful in address phase - val cmd = Reg(Bits()) - val g_type0 = Reg(UInt()) - val g_type1 = Reg(UInt()) - val client_xact_id0 = Reg(Bits()) - val client_xact_id1 = Reg(Bits()) - val addr_beat0 = Reg(UInt()) - val addr_beat1 = Reg(UInt()) - val grant1 = Reg(new Grant) - - // It is allowed to progress from Idle/Busy during a wait state - val addrReady = io.ahb.hready || bubble || (!executeAHB1 && !executeAHB0) - val dataReady = io.ahb.hready || !executeAHB1 - - // Only accept a new AHBRequest if we have enough buffer space in the pad - // to accomodate a persistent drop in TileLink's grant.ready - io.request.ready := addrReady && io.grant.ready - - // htrans must be updated even if no request is valid - when (addrReady) { - when (io.request.fire() && io.request.bits.executeAHB) { - midBurst := !io.request.bits.finalBurst - when (io.request.bits.firstBurst) { - htrans := HTRANS_NONSEQ - } .otherwise { - htrans := HTRANS_SEQ - } - } .otherwise { - when (midBurst) { - htrans := HTRANS_BUSY - } .otherwise { - htrans := HTRANS_IDLE - } - } - } - - // Address phase, clear repondTL when we have nothing to do - when (addrReady) { - when (io.request.fire()) { - respondTL0 := io.request.bits.respondTL - latchAtom0 := io.request.bits.latchAtom - executeAHB0:= io.request.bits.executeAHB - bubble := Bool(false) - } .otherwise { - respondTL0 := Bool(false) - latchAtom0 := Bool(false) - executeAHB0:= Bool(false) - bubble := Bool(true) // an atom-injected Idle is not a bubble! 
- } - } - - // Transfer bulk address phase - when (io.request.fire()) { - haddr := io.request.bits.haddr - hmastlock := io.request.bits.hmastlock - hwrite := io.request.bits.hwrite - hburst := io.request.bits.hburst - hsize := io.request.bits.hsize - hprot := io.request.bits.hprot - hwdata0 := io.request.bits.hwdata - cmd := io.request.bits.cmd - g_type0 := io.request.bits.g_type - client_xact_id0 := io.request.bits.client_xact_id - addr_beat0 := io.request.bits.addr_beat - } - - // Execute Atomic ops; unused and optimized away if !supportAtomics - val amo_p = p.alterPartial({ - case CacheBlockOffsetBits => hastiAddrBits - }) - val alu = Module(new AMOALU(hastiDataBits, rhsIsAligned = true)(amo_p)) - alu.io.addr := haddr - alu.io.cmd := cmd - alu.io.typ := hsize - alu.io.rhs := hwdata0 - alu.io.lhs := hrdata - - // Transfer bulk data phase - when (dataReady) { - when (addrReady) { - respondTL1 := respondTL0 - latchAtom1 := latchAtom0 - executeAHB1 := executeAHB0 - } .otherwise { - respondTL1 := Bool(false) - latchAtom1 := Bool(false) - executeAHB1 := Bool(false) - } - hwdata1 := Mux(Bool(supportAtomics), alu.io.out, hwdata0) - g_type1 := g_type0 - client_xact_id1 := client_xact_id0 - addr_beat1 := addr_beat0 - } - - // Latch the read result for an atomic operation - when (dataReady && latchAtom1) { - hrdata := io.ahb.hrdata - } - - // Only issue TL grant when the slave has provided data - io.grant.valid := dataReady && respondTL1 - io.grant.bits := Grant( - is_builtin_type = Bool(true), - g_type = g_type1, - client_xact_id = client_xact_id1, - manager_xact_id = UInt(0), - addr_beat = addr_beat1, - data = io.ahb.hrdata) - - // We cannot support errors from AHB to TileLink - assert(!io.ahb.hresp, "AHB hresp error detected and cannot be reported via TileLink") -} - -class AHBBridge(supportAtomics: Boolean = true)(implicit val p: Parameters) extends Module - with HasHastiParameters - with HasTileLinkParameters { - val io = new Bundle { - val tl = new ClientUncachedTileLinkIO().flip - val ahb = new HastiMasterIO() - } - - // Hasti and TileLink widths must agree at this point in the topology - require (tlDataBits == hastiDataBits) - require (p(rocket.PAddrBits) == hastiAddrBits) - - // AHB does not permit bursts to cross a 1KB boundary - require (tlDataBits * tlDataBeats <= 1024*8) - // tlDataBytes must be a power of 2 - require (1 << log2Ceil(tlDataBytes) == tlDataBytes) - - // Create the sub-blocks - val fsm = Module(new AHBTileLinkIn(supportAtomics)) - val bus = Module(new AHBBusMaster(supportAtomics)) - val pad = Module(new Queue(new Grant, 4)) - - fsm.io.acquire <> Queue(io.tl.acquire, 2) // Pipe is also acceptable - bus.io.request <> fsm.io.request - io.ahb <> bus.io.ahb - io.tl.grant <> pad.io.deq - - // The pad is needed to absorb AHB progress while !grant.ready - // We are only 'ready' if the pad has at least 3 cycles of space - bus.io.grant.ready := pad.io.count <= UInt(1) - pad.io.enq.bits := bus.io.grant.bits - pad.io.enq.valid := bus.io.grant.valid -} diff --git a/src/main/scala/uncore/converters/Tilelink.scala b/src/main/scala/uncore/converters/Tilelink.scala index 3aa0ec16..a5572c91 100644 --- a/src/main/scala/uncore/converters/Tilelink.scala +++ b/src/main/scala/uncore/converters/Tilelink.scala @@ -9,7 +9,6 @@ import rocket.PAddrBits import uncore.tilelink._ import uncore.util._ import uncore.constants._ -import uncore.devices.TileLinkTestRAM import unittest.UnitTest import config._ @@ -604,183 +603,3 @@ class TileLinkIONarrower(innerTLId: String, outerTLId: String) sending_get 
:= Bool(false) } } - -class TileLinkWidthAdapterTest(implicit p: Parameters) extends UnitTest { - val narrowConfig = p(TLKey(p(TLId))) - val wideConfig = narrowConfig.copy( - dataBeats = narrowConfig.dataBeats / 2) - val adapterParams = p.alterPartial({ case TLKey("WIDE") => wideConfig }) - - val depth = 2 * narrowConfig.dataBeats - val ram = Module(new TileLinkTestRAM(depth)) - val driver = Module(new DriverSet( - (driverParams: Parameters) => { - implicit val p = driverParams - Seq( - Module(new PutSweepDriver(depth)), - Module(new PutMaskDriver), - Module(new PutAtomicDriver), - Module(new PutBlockSweepDriver(depth / narrowConfig.dataBeats)), - Module(new PrefetchDriver), - Module(new GetMultiWidthDriver)) - })) - val widener = Module(new TileLinkIOWidener(p(TLId), "WIDE")(adapterParams)) - val narrower = Module(new TileLinkIONarrower("WIDE", p(TLId))(adapterParams)) - - widener.io.in <> driver.io.mem - narrower.io.in <> widener.io.out - ram.io <> narrower.io.out - driver.io.start := io.start - io.finished := driver.io.finished -} - -class TileLinkFragmenterSource(implicit p: Parameters) extends TLModule()(p) { - val io = new Bundle { - val in = Decoupled(new Acquire).flip - val out = Decoupled(new Acquire) - val que = Decoupled(UInt(width = tlBeatAddrBits)) - } - - // Pipeline stage with acquire data; needed to ensure in.bits stay fixed when !in.ready - val acq_valid = RegInit(Bool(false)) - val acq_bits = Reg(new Acquire) - // The last beat of generate acquire to send - val acq_last_beat = Reg(UInt(width = tlBeatAddrBits)) - val acq_last = acq_bits.addr_beat === acq_last_beat - - // 'in' has the first beat? - val in_multi_put = io.in.bits.isBuiltInType(Acquire.putBlockType) - val in_multi_get = io.in.bits.isBuiltInType(Acquire.getBlockType) - val in_first_beat = !in_multi_put || io.in.bits.addr_beat === UInt(0) - - // Move stuff from acq to out whenever out is ready - io.out.valid := acq_valid - // When can acq accept a request? - val acq_ready = !acq_valid || (acq_last && io.out.ready) - // Move the first beat from in to acq only when both acq and que are ready - io.in.ready := (!in_first_beat || io.que.ready) && acq_ready - io.que.valid := (in_first_beat && io.in.valid) && acq_ready - - // in.fire moves data from in to acq and (optionally) que - // out.fire moves data from acq to out - - // Desired flow control results: - assert (!io.que.fire() || io.in.fire()) // 1. que.fire => in.fire - assert (!(io.in.fire() && in_first_beat) || io.que.fire()) // 2. in.fire && in_first_beat => que.fire - assert (!io.out.fire() || acq_valid) // 3. out.fire => acq_valid - assert (!io.in.fire() || (!acq_valid || (io.out.fire() && acq_last))) // 4. in.fire => !acq_valid || (out.fire && acq_last) - // Proofs: - // 1. que.fire => que.ready && in.valid && acq_ready => in.ready && in.valid - // 2. in.fire && in_first_beat => in.valid && acq_ready && [(!in_first_beat || que.ready) && in_first_beat] => - // in.valid && acq_ready && que.ready && in_first_beat => que.valid && que.ready - // 3. out.fire => out.valid => acq_valid - // 4. 
in.fire => acq_ready => !acq_valid || (acq_last && out.ready) => - // !acq_valid || (acq_valid && acq_last && out.ready) => !acq_valid || (acq_last && out.fire) - - val multi_size = SInt(-1, width = tlBeatAddrBits).asUInt // TL2: use in.bits.size()/beatBits-1 - val in_sizeMinus1 = Mux(in_multi_get || in_multi_put, multi_size, UInt(0)) - val in_insertSizeMinus1 = Mux(in_multi_get, multi_size, UInt(0)) - - when (io.in.fire()) { - // Theorem 4 makes this safe; we overwrite garbage, or replace the final acq - acq_valid := Bool(true) - acq_bits := io.in.bits - acq_last_beat := io.in.bits.addr_beat + in_insertSizeMinus1 - // Replace this with size truncation in TL2: - acq_bits.a_type := Mux(in_multi_put, Acquire.putType, Mux(in_multi_get, Acquire.getType, io.in.bits.a_type)) - } .elsewhen (io.out.fire()) { - acq_valid := !acq_last // false => !in.valid || (!que.ready && in_first_beat) - acq_bits.addr_beat := acq_bits.addr_beat + UInt(1) - // acq_last && out.fire => acq_last && out.ready && acq_valid => acq_ready - // Suppose in.valid, then !in.fire => !in.ready => !(!in_first_beat || que.ready) => !que.ready && in_first_beat - } - - // Safe by theorem 3 - io.out.bits := acq_bits - // Safe by theorem 1 - io.que.bits := in_sizeMinus1 -} - -class TileLinkFragmenterSink(implicit p: Parameters) extends TLModule()(p) { - val io = new Bundle { - val in = Decoupled(new Grant).flip - val out = Decoupled(new Grant) - val que = Decoupled(UInt(width = tlBeatAddrBits)).flip - } - - val count_valid = RegInit(Bool(false)) - val multi_op = Reg(Bool()) - val count_bits = Reg(UInt(width = tlBeatAddrBits)) - val last = count_bits === UInt(0) - - val in_put = io.in.bits.isBuiltInType(Grant.putAckType) - val in_get = io.in.bits.isBuiltInType(Grant.getDataBeatType) - val deliver = last || in_get - - // Accept the input, discarding the non-final put grant - io.in.ready := count_valid && (io.out.ready || !deliver) - // Output the grant whenever we want delivery - io.out.valid := count_valid && io.in.valid && deliver - // Take a new number whenever we deliver the last beat - io.que.ready := !count_valid || (io.in.valid && io.out.ready && last) - - // Desired flow control results: - assert (!io.out.fire() || (count_valid && io.in.fire())) // 1. out.fire => in.fire && count_valid - assert (!(io.in.fire() && deliver) || io.out.fire()) // 2. in.fire && deliver => out.fire - assert (!(io.out.fire() && last) || io.que.ready) // 3. out.fire && last => que.ready - assert (!io.que.fire() || (!count_valid || io.out.fire())) // 4. que.fire => !count_valid || (out.fire && last) - // Proofs: - // 1. out.fire => out.ready && (count_valid && in.valid && deliver) => (count_valid && out.ready) && in.valid => in.fire - // 2. in.fire && deliver => in.valid && count_valid && [(out.ready || !deliver) && deliver] => - // in.valid && count_valid && deliver && out.ready => out.fire - // 3. out.fire && last => out.valid && out.ready && last => in.valid && out.ready && last => que.ready - // 4. 
que.fire => que.valid && (!count_valid || (in.valid && out.ready && last)) - // => !count_valid || (count_valid && in.valid && out.ready && [last => deliver]) - // => !count_valid || (out.valid && out.ready && last) - - when (io.que.fire()) { - // Theorem 4 makes this safe; we overwrite garbage or last output - count_valid := Bool(true) - count_bits := io.que.bits - multi_op := io.que.bits =/= UInt(0) - } .elsewhen (io.in.fire()) { - count_valid := !last // false => !que.valid - count_bits := count_bits - UInt(1) - // Proof: in.fire && [last => deliver] =2=> out.fire && last =3=> que.ready - // !que.fire && que.ready => !que.valid - } - - // Safe by Theorem 1 - io.out.bits := io.in.bits - io.out.bits.g_type := Mux(multi_op, Mux(in_get, Grant.getDataBlockType, Grant.putAckType), io.in.bits.g_type) -} - -class TileLinkFragmenter(depth: Int = 1)(implicit p: Parameters) extends TLModule()(p) { - val io = new Bundle { - val in = new ClientUncachedTileLinkIO().flip - val out = new ClientUncachedTileLinkIO - } - - // TL2: - // supportsAcquire = false - // modify all outward managers to supportsMultibeat = true - // assert: all managers must behaveFIFO (not inspect duplicated id field) - - val source = Module(new TileLinkFragmenterSource) - val sink = Module(new TileLinkFragmenterSink) - sink.io.que <> Queue(source.io.que, depth) - - source.io.in <> io.in.acquire - io.out.acquire <> source.io.out - sink.io.in <> io.out.grant - io.in.grant <> sink.io.out -} - -object TileLinkFragmenter { - // Pass the source/client to fragment - def apply(source: ClientUncachedTileLinkIO, depth: Int = 1): ClientUncachedTileLinkIO = { - val fragmenter = Module(new TileLinkFragmenter(depth)(source.p)) - fragmenter.io.in <> source - fragmenter.io.out - } -} diff --git a/src/main/scala/uncore/devices/Bram.scala b/src/main/scala/uncore/devices/Bram.scala deleted file mode 100644 index dfaee663..00000000 --- a/src/main/scala/uncore/devices/Bram.scala +++ /dev/null @@ -1,187 +0,0 @@ -// See LICENSE.SiFive for license details. -// See LICENSE.Berkeley for license details. 
- -package uncore.devices - -import Chisel._ -import config._ -import unittest.UnitTest -import junctions._ -import uncore.tilelink._ -import uncore.util._ -import util._ -import HastiConstants._ - -class BRAMSlave(depth: Int)(implicit val p: Parameters) extends Module - with HasTileLinkParameters { - val io = new ClientUncachedTileLinkIO().flip - - // For TL2: - // supportsAcquire = false - // supportsMultibeat = false - // supportsHint = false - // supportsAtomic = false - - // Timing-wise, we assume the input is coming out of registers - // since you probably needed a TileLinkFragmenter infront of us - - // Thus, only one pipeline stage: the grant result - val g_valid = RegInit(Bool(false)) - val g_bits = Reg(new Grant) - - // Just pass the pipeline straight through - io.grant.valid := g_valid - io.grant.bits := g_bits - io.acquire.ready := !g_valid || io.grant.ready - - val acq_get = io.acquire.bits.isBuiltInType(Acquire.getType) - val acq_put = io.acquire.bits.isBuiltInType(Acquire.putType) - val acq_addr = Cat(io.acquire.bits.addr_block, io.acquire.bits.addr_beat) - - val bram = Mem(depth, Bits(width = tlDataBits)) - - val ren = acq_get && io.acquire.fire() - val wen = acq_put && io.acquire.fire() - - when (io.grant.fire()) { - g_valid := Bool(false) - } - - when (io.acquire.fire()) { - g_valid := Bool(true) - g_bits := Grant( - is_builtin_type = Bool(true), - g_type = io.acquire.bits.getBuiltInGrantType(), - client_xact_id = io.acquire.bits.client_xact_id, - manager_xact_id = UInt(0), - addr_beat = io.acquire.bits.addr_beat, - data = UInt(0)) - } - - when (wen) { - bram.write(acq_addr, io.acquire.bits.data) - assert(io.acquire.bits.wmask().andR, "BRAMSlave: partial write masks not supported") - } - io.grant.bits.data := RegEnable(bram.read(acq_addr), ren) -} - -class HastiRAM(depth: Int)(implicit p: Parameters) extends HastiModule()(p) { - val io = new HastiSlaveIO - - val wdata = Vec.tabulate(hastiDataBytes)(i => io.hwdata(8*(i+1)-1,8*i)) - val waddr = Reg(UInt(width = hastiAddrBits)) - val wvalid = Reg(init = Bool(false)) - val wsize = Reg(UInt(width = SZ_HSIZE)) - val ram = SeqMem(depth, Vec(hastiDataBytes, Bits(width = 8))) - - val max_size = log2Ceil(hastiDataBytes) - val wmask_lut = MuxLookup(wsize, SInt(-1, hastiDataBytes).asUInt, - (0 until max_size).map(sz => (UInt(sz) -> UInt((1 << (1 << sz)) - 1)))) - val wmask = (wmask_lut << waddr(max_size - 1, 0))(hastiDataBytes - 1, 0) - - val is_trans = io.hsel && io.htrans.isOneOf(HTRANS_NONSEQ, HTRANS_SEQ) - val raddr = io.haddr >> UInt(max_size) - val ren = is_trans && !io.hwrite - val bypass = Reg(init = Bool(false)) - - when (is_trans && io.hwrite) { - waddr := io.haddr - wsize := io.hsize - wvalid := Bool(true) - } .otherwise { wvalid := Bool(false) } - - when (ren) { bypass := wvalid && (waddr >> UInt(max_size)) === raddr } - - when (wvalid) { - ram.write(waddr >> UInt(max_size), wdata, wmask.toBools) - } - - val rdata = ram.read(raddr, ren) - io.hrdata := Cat(rdata.zip(wmask.toBools).zip(wdata).map { - case ((rbyte, wsel), wbyte) => Mux(wsel && bypass, wbyte, rbyte) - }.reverse) - - io.hready := Bool(true) - io.hresp := HRESP_OKAY -} - -/** - * This RAM is not meant to be particularly performant. - * It just supports the entire range of uncached TileLink operations in the - * simplest way possible. 
- */ -class TileLinkTestRAM(depth: Int)(implicit val p: Parameters) extends Module - with HasTileLinkParameters { - val io = new ClientUncachedTileLinkIO().flip - - val ram = Mem(depth, UInt(width = tlDataBits)) - - val responding = Reg(init = Bool(false)) - val acq = io.acquire.bits - val r_acq = Reg(io.acquire.bits) - val acq_addr = Cat(acq.addr_block, acq.addr_beat) - val r_acq_addr = Cat(r_acq.addr_block, r_acq.addr_beat) - - when (io.acquire.fire() && io.acquire.bits.last()) { - r_acq := io.acquire.bits - responding := Bool(true) - } - - when (io.grant.fire()) { - val is_getblk = r_acq.isBuiltInType(Acquire.getBlockType) - val last_beat = r_acq.addr_beat === UInt(tlDataBeats - 1) - when (is_getblk && !last_beat) { - r_acq.addr_beat := r_acq.addr_beat + UInt(1) - } .otherwise { responding := Bool(false) } - } - - val old_data = ram(acq_addr) - val new_data = acq.data - val r_old_data = RegEnable(old_data, io.acquire.fire()) - - io.acquire.ready := !responding - io.grant.valid := responding - io.grant.bits := Grant( - is_builtin_type = Bool(true), - g_type = r_acq.getBuiltInGrantType(), - client_xact_id = r_acq.client_xact_id, - manager_xact_id = UInt(0), - addr_beat = r_acq.addr_beat, - data = Mux(r_acq.isAtomic(), r_old_data, ram(r_acq_addr))) - - val amo_shift_bits = acq.amo_shift_bytes() << UInt(3) - val amoalu = Module(new AMOALU(amoAluOperandBits, rhsIsAligned = true)) - amoalu.io.addr := Cat(acq.addr_block, acq.addr_beat, acq.addr_byte()) - amoalu.io.cmd := acq.op_code() - amoalu.io.typ := acq.op_size() - amoalu.io.lhs := old_data >> amo_shift_bits - amoalu.io.rhs := new_data >> amo_shift_bits - - val result = Mux(acq.isAtomic(), amoalu.io.out << amo_shift_bits, new_data) - val wmask = FillInterleaved(8, acq.wmask()) - - when (io.acquire.fire() && acq.hasData()) { - ram(acq_addr) := (old_data & ~wmask) | (result & wmask) - } -} - -class TileLinkRAMTest(implicit val p: Parameters) - extends UnitTest with HasTileLinkParameters { - - val depth = 2 * tlDataBeats - val ram = Module(new TileLinkTestRAM(depth)) - val driver = Module(new DriverSet( - (driverParams: Parameters) => { - implicit val p = driverParams - Seq( - Module(new PutSweepDriver(depth)), - Module(new PutMaskDriver), - Module(new PutAtomicDriver), - Module(new PutBlockSweepDriver(depth / tlDataBeats)), - Module(new PrefetchDriver), - Module(new GetMultiWidthDriver)) - })) - ram.io <> driver.io.mem - driver.io.start := io.start - io.finished := driver.io.finished -} From 3a1a37d41b874ad265f1403e8bcbb7dd87c30299 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Tue, 2 May 2017 03:04:41 -0700 Subject: [PATCH 5/5] Support PutPartial in ScratchpadSlavePort --- src/main/scala/rocket/DCache.scala | 25 +++++------ src/main/scala/rocket/HellaCache.scala | 7 ++- src/main/scala/rocket/IDecode.scala | 2 +- src/main/scala/rocket/NBDcache.scala | 24 +++++----- src/main/scala/rocket/Rocket.scala | 5 ++- .../scala/rocket/ScratchpadSlavePort.scala | 15 +++---- .../scala/rocket/SimpleHellaCacheIF.scala | 2 +- src/main/scala/tile/FPU.scala | 2 +- src/main/scala/uncore/Consts.scala | 5 ++- src/main/scala/uncore/util/AmoAlu.scala | 45 ++++++++----------- 10 files changed, 64 insertions(+), 68 deletions(-) diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index 5562d769..8e08b8d6 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -121,7 +121,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { tlb.io.req.bits.sfence.valid := s1_sfence 
tlb.io.req.bits.sfence.bits.rs1 := s1_req.typ(0) tlb.io.req.bits.sfence.bits.rs2 := s1_req.typ(1) - tlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data + tlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data.data tlb.io.req.bits.passthrough := s1_req.phys tlb.io.req.bits.vaddr := s1_req.addr tlb.io.req.bits.instruction := false @@ -155,6 +155,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { } val s1_data_way = Mux(inWriteback, releaseWay, s1_hit_way) val s1_data = Mux1H(s1_data_way, data.io.resp) // retime into s2 if critical + val s1_mask = Mux(s1_req.cmd === M_PWR, io.cpu.s1_data.mask, new StoreGen(s1_req.typ, s1_req.addr, UInt(0), wordBytes).mask) val s2_valid = Reg(next=s1_valid_masked && !s1_sfence, init=Bool(false)) && !io.cpu.s2_xcpt.asUInt.orR val s2_probe = Reg(next=s1_probe, init=Bool(false)) @@ -229,10 +230,10 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val pstore1_cmd = RegEnable(s1_req.cmd, s1_valid_not_nacked && s1_write) val pstore1_typ = RegEnable(s1_req.typ, s1_valid_not_nacked && s1_write) val pstore1_addr = RegEnable(s1_paddr, s1_valid_not_nacked && s1_write) - val pstore1_data = RegEnable(io.cpu.s1_data, s1_valid_not_nacked && s1_write) + val pstore1_data = RegEnable(io.cpu.s1_data.data, s1_valid_not_nacked && s1_write) val pstore1_way = RegEnable(s1_hit_way, s1_valid_not_nacked && s1_write) - val pstore1_storegen = new StoreGen(pstore1_typ, pstore1_addr, pstore1_data, wordBytes) - val pstore1_storegen_data = Wire(init = pstore1_storegen.data) + val pstore1_mask = RegEnable(s1_mask, s1_valid_not_nacked && s1_write) + val pstore1_storegen_data = Wire(init = pstore1_data) val pstore1_amo = Bool(usingAtomics) && isRead(pstore1_cmd) val pstore_drain_structural = pstore1_valid && pstore2_valid && ((s1_valid && s1_write) || pstore1_amo) val pstore_drain_opportunistic = !(io.cpu.req.valid && isRead(io.cpu.req.bits.cmd)) @@ -252,21 +253,20 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val pstore2_addr = RegEnable(pstore1_addr, advance_pstore1) val pstore2_way = RegEnable(pstore1_way, advance_pstore1) val pstore2_storegen_data = RegEnable(pstore1_storegen_data, advance_pstore1) - val pstore2_storegen_mask = RegEnable(pstore1_storegen.mask, advance_pstore1) + val pstore2_storegen_mask = RegEnable(pstore1_mask, advance_pstore1) dataArb.io.in(0).valid := pstore_drain dataArb.io.in(0).bits.write := true dataArb.io.in(0).bits.addr := Mux(pstore2_valid, pstore2_addr, pstore1_addr) dataArb.io.in(0).bits.way_en := Mux(pstore2_valid, pstore2_way, pstore1_way) dataArb.io.in(0).bits.wdata := Fill(rowWords, Mux(pstore2_valid, pstore2_storegen_data, pstore1_storegen_data)) val pstore_mask_shift = Mux(pstore2_valid, pstore2_addr, pstore1_addr).extract(rowOffBits-1,offsetlsb) << wordOffBits - dataArb.io.in(0).bits.wmask := Mux(pstore2_valid, pstore2_storegen_mask, pstore1_storegen.mask) << pstore_mask_shift + dataArb.io.in(0).bits.wmask := Mux(pstore2_valid, pstore2_storegen_mask, pstore1_mask) << pstore_mask_shift // store->load RAW hazard detection - val s1_storegen = new StoreGen(s1_req.typ, s1_req.addr, UInt(0), wordBytes) val s1_idx = s1_req.addr(idxMSB, wordOffBits) val s1_raw_hazard = s1_read && - ((pstore1_valid && pstore1_addr(idxMSB, wordOffBits) === s1_idx && (pstore1_storegen.mask & s1_storegen.mask).orR) || - (pstore2_valid && pstore2_addr(idxMSB, wordOffBits) === s1_idx && (pstore2_storegen_mask & s1_storegen.mask).orR)) + ((pstore1_valid && pstore1_addr(idxMSB, wordOffBits) === s1_idx && (pstore1_mask & 
s1_mask).orR) || + (pstore2_valid && pstore2_addr(idxMSB, wordOffBits) === s1_idx && (pstore2_storegen_mask & s1_mask).orR)) when (s1_valid && s1_raw_hazard) { s1_nack := true } metaWriteArb.io.in(0).valid := (s2_valid_hit && s2_update_meta) || (s2_victimize && !s2_victim_dirty) @@ -279,8 +279,8 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val a_source = PriorityEncoder(~uncachedInFlight.asUInt << mmioOffset) // skip the MSHR val acquire_address = s2_req_block_addr val access_address = s2_req.addr - val a_size = s2_req.typ(MT_SZ-2, 0) - val a_data = Fill(beatWords, pstore1_storegen.data) + val a_size = mtSize(s2_req.typ) + val a_data = Fill(beatWords, pstore1_data) val acquire = if (edge.manager.anySupportAcquireB) { edge.Acquire(UInt(0), acquire_address, lgCacheBlockBytes, s2_grow_param)._2 // Cacheability checked by tlb } else { @@ -523,9 +523,8 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { // AMOs if (usingAtomics) { val amoalu = Module(new AMOALU(xLen)) - amoalu.io.addr := pstore1_addr + amoalu.io.mask := pstore1_mask amoalu.io.cmd := pstore1_cmd - amoalu.io.typ := pstore1_typ amoalu.io.lhs := s2_data_word amoalu.io.rhs := pstore1_data pstore1_storegen_data := amoalu.io.out diff --git a/src/main/scala/rocket/HellaCache.scala b/src/main/scala/rocket/HellaCache.scala index f50bfa0b..0624fe5d 100644 --- a/src/main/scala/rocket/HellaCache.scala +++ b/src/main/scala/rocket/HellaCache.scala @@ -118,11 +118,16 @@ class HellaCacheExceptions extends Bundle { val ae = new AlignmentExceptions } +class HellaCacheWriteData(implicit p: Parameters) extends CoreBundle()(p) { + val data = UInt(width = coreDataBits) + val mask = UInt(width = coreDataBytes) +} + // interface between D$ and processor/DTLB class HellaCacheIO(implicit p: Parameters) extends CoreBundle()(p) { val req = Decoupled(new HellaCacheReq) val s1_kill = Bool(OUTPUT) // kill previous cycle's req - val s1_data = Bits(OUTPUT, coreDataBits) // data for previous cycle's req + val s1_data = new HellaCacheWriteData().asOutput // data for previous cycle's req val s2_nack = Bool(INPUT) // req from two cycles ago is rejected // performance events diff --git a/src/main/scala/rocket/IDecode.scala b/src/main/scala/rocket/IDecode.scala index 86c0a03a..ca8ae4ac 100644 --- a/src/main/scala/rocket/IDecode.scala +++ b/src/main/scala/rocket/IDecode.scala @@ -129,7 +129,7 @@ class IDecode(implicit val p: Parameters) extends DecodeConstants class SDecode(implicit val p: Parameters) extends DecodeConstants { val table: Array[(BitPat, List[BitPat])] = Array( - SFENCE_VMA->List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_SFENCE, MT_X, N,N,N,N,N,N,CSR.N,N,N,N,N), + SFENCE_VMA->List(Y,N,N,N,N,N,Y,Y,A2_ZERO,A1_RS1, IMM_X, DW_XPR,FN_ADD, Y,M_SFENCE, MT_W, N,N,N,N,N,N,CSR.N,N,N,N,N), SRET-> List(Y,N,N,N,N,N,N,X,A2_X, A1_X, IMM_X, DW_X, FN_X, N,M_X, MT_X, N,N,N,N,N,N,CSR.I,N,N,N,N)) } diff --git a/src/main/scala/rocket/NBDcache.scala b/src/main/scala/rocket/NBDcache.scala index 5ca57d76..b03361b2 100644 --- a/src/main/scala/rocket/NBDcache.scala +++ b/src/main/scala/rocket/NBDcache.scala @@ -69,20 +69,18 @@ class IOMSHR(id: Int)(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCa } val req = Reg(new HellaCacheReq) - val req_cmd_sc = req.cmd === M_XSC val grant_word = Reg(UInt(width = wordBits)) val s_idle :: s_mem_access :: s_mem_ack :: s_resp :: Nil = Enum(Bits(), 4) val state = Reg(init = s_idle) io.req.ready := (state === s_idle) - val storegen = new StoreGen(req.typ, req.addr, req.data, 
wordBytes) - val loadgen = new LoadGen(req.typ, mtSigned(req.typ), req.addr, grant_word, req_cmd_sc, wordBytes) + val loadgen = new LoadGen(req.typ, mtSigned(req.typ), req.addr, grant_word, false.B, wordBytes) val a_source = UInt(id) val a_address = req.addr - val a_size = storegen.size - val a_data = Fill(beatWords, storegen.data) + val a_size = mtSize(req.typ) + val a_data = Fill(beatWords, req.data) val get = edge.Get(a_source, a_address, a_size)._2 val put = edge.Put(a_source, a_address, a_size, a_data)._2 @@ -99,9 +97,10 @@ class IOMSHR(id: Int)(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCa M_XA_MAXU -> edge.Arithmetic(a_source, a_address, a_size, a_data, TLAtomics.MAXU)._2)) } else { // If no managers support atomics, assert fail if processor asks for them - assert (!isAMO(req.cmd)) + assert(state === s_idle || !isAMO(req.cmd)) Wire(new TLBundleA(edge.bundle)) } + assert(state === s_idle || req.cmd =/= M_XSC) io.mem_access.valid := (state === s_mem_access) io.mem_access.bits := Mux(isAMO(req.cmd), atomics, Mux(isRead(req.cmd), get, put)) @@ -110,7 +109,7 @@ class IOMSHR(id: Int)(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCa io.resp.valid := (state === s_resp) io.resp.bits := req io.resp.bits.has_data := isRead(req.cmd) - io.resp.bits.data := loadgen.data | req_cmd_sc + io.resp.bits.data := loadgen.data io.resp.bits.store_data := req.data io.resp.bits.replay := Bool(true) @@ -696,6 +695,8 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule val s1_read = isRead(s1_req.cmd) val s1_write = isWrite(s1_req.cmd) val s1_readwrite = s1_read || s1_write || isPrefetch(s1_req.cmd) + // check for unsupported operations + assert(!s1_valid || !s1_req.cmd.isOneOf(M_PWR)) val dtlb = Module(new TLB(log2Ceil(coreDataBytes), nTLBEntries)) io.ptw <> dtlb.io.ptw @@ -703,7 +704,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule dtlb.io.req.bits.sfence.valid := s1_sfence dtlb.io.req.bits.sfence.bits.rs1 := s1_req.typ(0) dtlb.io.req.bits.sfence.bits.rs2 := s1_req.typ(1) - dtlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data + dtlb.io.req.bits.sfence.bits.asid := io.cpu.s1_data.data dtlb.io.req.bits.passthrough := s1_req.phys dtlb.io.req.bits.vaddr := s1_req.addr dtlb.io.req.bits.instruction := Bool(false) @@ -736,7 +737,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule s2_req.phys := s1_req.phys s2_req.addr := s1_addr when (s1_write) { - s2_req.data := Mux(s1_replay, mshrs.io.replay.bits.data, io.cpu.s1_data) + s2_req.data := Mux(s1_replay, mshrs.io.replay.bits.data, io.cpu.s1_data.data) } when (s1_recycled) { s2_req.data := s1_req.data } s2_req.tag := s1_req.tag @@ -927,10 +928,9 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule val s2_data_word_prebypass = s2_data_uncorrected >> Cat(s2_word_idx, Bits(0,log2Up(coreDataBits))) val s2_data_word = Mux(s2_store_bypass, s2_store_bypass_data, s2_data_word_prebypass) val loadgen = new LoadGen(s2_req.typ, mtSigned(s2_req.typ), s2_req.addr, s2_data_word, s2_sc, wordBytes) - - amoalu.io.addr := s2_req.addr + + amoalu.io.mask := new StoreGen(s2_req.typ, s2_req.addr, 0.U, xLen/8).mask amoalu.io.cmd := s2_req.cmd - amoalu.io.typ := s2_req.typ amoalu.io.lhs := s2_data_word amoalu.io.rhs := s2_req.data diff --git a/src/main/scala/rocket/Rocket.scala b/src/main/scala/rocket/Rocket.scala index 112b4d6c..27999f14 100644 --- a/src/main/scala/rocket/Rocket.scala +++ b/src/main/scala/rocket/Rocket.scala @@ -394,7 
+394,8 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) mem_reg_pc := ex_reg_pc mem_reg_wdata := alu.io.out when (ex_ctrl.rxs2 && (ex_ctrl.mem || ex_ctrl.rocc)) { - mem_reg_rs2 := ex_rs(1) + val typ = Mux(ex_ctrl.rocc, log2Ceil(xLen/8).U, ex_ctrl.mem_type) + mem_reg_rs2 := new uncore.util.StoreGen(typ, 0.U, ex_rs(1), coreDataBytes).data } } @@ -625,7 +626,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) io.dmem.req.bits.phys := Bool(false) io.dmem.req.bits.addr := encodeVirtualAddress(ex_rs(0), alu.io.adder_out) io.dmem.invalidate_lr := wb_xcpt - io.dmem.s1_data := Mux(mem_ctrl.fp, io.fpu.store_data, mem_reg_rs2) + io.dmem.s1_data.data := Mux(mem_ctrl.fp, io.fpu.store_data, mem_reg_rs2) io.dmem.s1_kill := killm_common || mem_breakpoint io.rocc.cmd.valid := wb_reg_valid && wb_ctrl.rocc && !replay_wb_common diff --git a/src/main/scala/rocket/ScratchpadSlavePort.scala b/src/main/scala/rocket/ScratchpadSlavePort.scala index 76b8ea10..de8702ac 100644 --- a/src/main/scala/rocket/ScratchpadSlavePort.scala +++ b/src/main/scala/rocket/ScratchpadSlavePort.scala @@ -24,7 +24,7 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L executable = true, supportsArithmetic = if (usingAtomics) TransferSizes(4, coreDataBytes) else TransferSizes.none, supportsLogical = if (usingAtomics) TransferSizes(4, coreDataBytes) else TransferSizes.none, - supportsPutPartial = TransferSizes.none, // Can't support PutPartial + supportsPutPartial = TransferSizes(1, coreDataBytes), supportsPutFull = TransferSizes(1, coreDataBytes), supportsGet = TransferSizes(1, coreDataBytes), fifoId = Some(0))), // requests handled in FIFO order @@ -55,6 +55,7 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L val req = Wire(new HellaCacheReq) req.cmd := MuxLookup(a.opcode, Wire(M_XRD), Array( TLMessages.PutFullData -> M_XWR, + TLMessages.PutPartialData -> M_PWR, TLMessages.ArithmeticData -> MuxLookup(a.param, Wire(M_XRD), Array( TLAtomics.MIN -> M_XA_MIN, TLAtomics.MAX -> M_XA_MAX, @@ -67,9 +68,8 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L TLAtomics.AND -> M_XA_AND, TLAtomics.SWAP -> M_XA_SWAP)), TLMessages.Get -> M_XRD)) - // treat all loads as full words, so bytes appear in correct lane - req.typ := Mux(edge.hasData(a), a.size, log2Ceil(coreDataBytes)) - req.addr := Mux(edge.hasData(a), a.address, ~(~a.address | (coreDataBytes-1))) + req.typ := a.size + req.addr := a.address req.tag := UInt(0) req.phys := true req @@ -79,14 +79,13 @@ class ScratchpadSlavePort(address: AddressSet)(implicit p: Parameters) extends L io.dmem.req.valid := (tl_in.a.valid && ready) || state === s_replay tl_in.a.ready := io.dmem.req.ready && ready io.dmem.req.bits := formCacheReq(Mux(state === s_replay, acq, tl_in.a.bits)) - // the TL data is already in the correct byte lane, but the D$ - // expects right-justified store data, so that it can steer the bytes. 
- io.dmem.s1_data := new LoadGen(acq.size, Bool(false), acq.address(log2Ceil(coreDataBytes)-1,0), acq.data, Bool(false), coreDataBytes).data + io.dmem.s1_data.data := acq.data + io.dmem.s1_data.mask := acq.mask io.dmem.s1_kill := false io.dmem.invalidate_lr := false tl_in.d.valid := io.dmem.resp.valid || state === s_grant - tl_in.d.bits := Mux(acq.opcode === TLMessages.PutFullData, + tl_in.d.bits := Mux(acq.opcode.isOneOf(TLMessages.PutFullData, TLMessages.PutPartialData), edge.AccessAck(acq, UInt(0)), edge.AccessAck(acq, UInt(0), UInt(0))) tl_in.d.bits.data := Mux(io.dmem.resp.valid, io.dmem.resp.bits.data_raw, acq.data) diff --git a/src/main/scala/rocket/SimpleHellaCacheIF.scala b/src/main/scala/rocket/SimpleHellaCacheIF.scala index db8a34e5..1f821f99 100644 --- a/src/main/scala/rocket/SimpleHellaCacheIF.scala +++ b/src/main/scala/rocket/SimpleHellaCacheIF.scala @@ -123,7 +123,7 @@ class SimpleHellaCacheIF(implicit p: Parameters) extends Module io.cache.invalidate_lr := io.requestor.invalidate_lr io.cache.req <> req_arb.io.out io.cache.s1_kill := io.cache.s2_nack - io.cache.s1_data := RegEnable(req_arb.io.out.bits.data, s0_req_fire) + io.cache.s1_data.data := RegEnable(req_arb.io.out.bits.data, s0_req_fire) replayq.io.nack.valid := (io.cache.s2_nack || s2_kill) && s2_req_fire replayq.io.nack.bits := s2_req_tag diff --git a/src/main/scala/tile/FPU.scala b/src/main/scala/tile/FPU.scala index 4fcc088f..80355e4d 100644 --- a/src/main/scala/tile/FPU.scala +++ b/src/main/scala/tile/FPU.scala @@ -392,7 +392,7 @@ class FPToInt(implicit p: Parameters) extends FPUModule()(p) { val store = ieee(in.in1) val toint = Mux(in.rm(0), classify_out, store) - io.out.bits.store := store + io.out.bits.store := Mux(in.singleOut, Fill(xLen/32, store(31, 0)), store) io.out.bits.toint := Mux(in.singleOut, toint(31, 0).sextTo(xLen), toint) io.out.bits.exc := Bits(0) diff --git a/src/main/scala/uncore/Consts.scala b/src/main/scala/uncore/Consts.scala index 4f0c7297..d1990625 100644 --- a/src/main/scala/uncore/Consts.scala +++ b/src/main/scala/uncore/Consts.scala @@ -28,7 +28,8 @@ trait MemoryOpConstants { def M_XA_MINU = UInt("b01110"); def M_XA_MAXU = UInt("b01111"); def M_FLUSH = UInt("b10000") // write back dirty data and cede R/W permissions - def M_PRODUCE = UInt("b10001") // write back dirty data and cede W permissions + def M_PWR = UInt("b10001") // partial (masked) store + def M_PRODUCE = UInt("b10010") // write back dirty data and cede W permissions def M_CLEAN = UInt("b10011") // write back dirty data and retain R/W permissions def M_SFENCE = UInt("b10100") // flush TLB @@ -37,7 +38,7 @@ trait MemoryOpConstants { def isAMO(cmd: UInt) = isAMOLogical(cmd) || isAMOArithmetic(cmd) def isPrefetch(cmd: UInt) = cmd === M_PFR || cmd === M_PFW def isRead(cmd: UInt) = cmd === M_XRD || cmd === M_XLR || cmd === M_XSC || isAMO(cmd) - def isWrite(cmd: UInt) = cmd === M_XWR || cmd === M_XSC || isAMO(cmd) + def isWrite(cmd: UInt) = cmd === M_XWR || cmd === M_PWR || cmd === M_XSC || isAMO(cmd) def isWriteIntent(cmd: UInt) = isWrite(cmd) || cmd === M_PFW || cmd === M_XLR } diff --git a/src/main/scala/uncore/util/AmoAlu.scala b/src/main/scala/uncore/util/AmoAlu.scala index b36c4446..703d059b 100644 --- a/src/main/scala/uncore/util/AmoAlu.scala +++ b/src/main/scala/uncore/util/AmoAlu.scala @@ -31,10 +31,6 @@ class StoreGen(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) { def wordData = genData(2) } -class StoreGenAligned(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) extends StoreGen(typ, addr, dat, maxSize) { - override 
def genData(i: Int) = dat -} - class LoadGen(typ: UInt, signed: Bool, addr: UInt, dat: UInt, zero: Bool, maxSize: Int) { private val size = new StoreGen(typ, addr, dat, maxSize).size @@ -54,22 +50,16 @@ class LoadGen(typ: UInt, signed: Bool, addr: UInt, dat: UInt, zero: Bool, maxSiz def data = genData(0) } -class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parameters) extends Module { +class AMOALU(operandBits: Int)(implicit p: Parameters) extends Module { require(operandBits == 32 || operandBits == 64) val io = new Bundle { - val addr = Bits(INPUT, log2Ceil(operandBits/8)) + val mask = UInt(INPUT, operandBits/8) val cmd = Bits(INPUT, M_SZ) - val typ = Bits(INPUT, log2Ceil(log2Ceil(operandBits/8) + 1)) val lhs = Bits(INPUT, operandBits) val rhs = Bits(INPUT, operandBits) val out = Bits(OUTPUT, operandBits) } - val storegen = - if(rhsIsAligned) new StoreGenAligned(io.typ, io.addr, io.rhs, operandBits/8) - else new StoreGen(io.typ, io.addr, io.rhs, operandBits/8) - val rhs = storegen.wordData - val max = io.cmd === M_XA_MAX || io.cmd === M_XA_MAXU val min = io.cmd === M_XA_MIN || io.cmd === M_XA_MINU val add = io.cmd === M_XA_ADD @@ -77,10 +67,10 @@ class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parame val logic_xor = io.cmd === M_XA_XOR || io.cmd === M_XA_OR val adder_out = - if (operandBits == 32) io.lhs + rhs + if (operandBits == 32) io.lhs + io.rhs else { - val mask = ~UInt(0,64) ^ (io.addr(2) << 31) - (io.lhs & mask) + (rhs & mask) + val mask = ~UInt(0,64) ^ (!io.mask(3) << 31) + (io.lhs & mask) + (io.rhs & mask) } val less = { @@ -90,28 +80,29 @@ class AMOALU(operandBits: Int, rhsIsAligned: Boolean = false)(implicit p: Parame } if (operandBits == 32) { - Mux(io.lhs(31) === rhs(31), io.lhs < rhs, Mux(sgned, io.lhs(31), io.rhs(31))) + Mux(io.lhs(31) === io.rhs(31), io.lhs < io.rhs, Mux(sgned, io.lhs(31), io.rhs(31))) } else { - val word = !io.typ(0) - val cmp_lhs = Mux(word && !io.addr(2), io.lhs(31), io.lhs(63)) - val cmp_rhs = Mux(word && !io.addr(2), rhs(31), rhs(63)) - val lt_lo = io.lhs(31,0) < rhs(31,0) - val lt_hi = io.lhs(63,32) < rhs(63,32) - val eq_hi = io.lhs(63,32) === rhs(63,32) - val lt = Mux(word, Mux(io.addr(2), lt_hi, lt_lo), lt_hi || eq_hi && lt_lo) + val cmp_lhs = Mux(!io.mask(4), io.lhs(31), io.lhs(63)) + val cmp_rhs = Mux(!io.mask(4), io.rhs(31), io.rhs(63)) + val lt_lo = io.lhs(31,0) < io.rhs(31,0) + val lt_hi = io.lhs(63,32) < io.rhs(63,32) + val eq_hi = io.lhs(63,32) === io.rhs(63,32) + val lt = + Mux(io.mask(4) && io.mask(3), lt_hi || eq_hi && lt_lo, + Mux(io.mask(4), lt_hi, lt_lo)) Mux(cmp_lhs === cmp_rhs, lt, Mux(sgned, cmp_lhs, cmp_rhs)) } } - val minmax = Mux(Mux(less, min, max), io.lhs, storegen.data) + val minmax = Mux(Mux(less, min, max), io.lhs, io.rhs) val logic = - Mux(logic_and, io.lhs & rhs, 0.U) | - Mux(logic_xor, io.lhs ^ rhs, 0.U) + Mux(logic_and, io.lhs & io.rhs, 0.U) | + Mux(logic_xor, io.lhs ^ io.rhs, 0.U) val out = Mux(add, adder_out, Mux(logic_and || logic_xor, logic, minmax)) - val wmask = FillInterleaved(8, storegen.mask) + val wmask = FillInterleaved(8, io.mask) io.out := wmask & out | ~wmask & io.lhs }
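
Notes on the series (plain-Scala sketches of the bit tricks the patches rely on; object names and anything not quoted from the diffs above are illustrative assumptions, not part of the patches).

Patch 2 folds M_XA_OR into the AND and XOR datapaths using the identity (x & y) | (x ^ y) == (x | y): where both bits are set the AND term supplies the 1, where exactly one is set the XOR term does. That is why logic_and is asserted for both M_XA_AND and M_XA_OR, and logic_xor for both M_XA_XOR and M_XA_OR, so a single OR of the two masked terms covers all three operations with less muxing. A minimal, self-contained model (plain Longs standing in for Chisel UInts):

  object AmoLogicSketch {
    // Illustrative stand-ins for the AMO command decode.
    sealed trait Cmd
    case object AmoAnd extends Cmd
    case object AmoOr  extends Cmd
    case object AmoXor extends Cmd

    def logicOp(cmd: Cmd, lhs: Long, rhs: Long): Long = {
      // Shared decode, as in the patch: OR enables both partial terms.
      val logicAnd = cmd == AmoOr || cmd == AmoAnd
      val logicXor = cmd == AmoXor || cmd == AmoOr
      // (lhs & rhs) | (lhs ^ rhs) == (lhs | rhs), so OR falls out for free.
      (if (logicAnd) lhs & rhs else 0L) | (if (logicXor) lhs ^ rhs else 0L)
    }

    def main(args: Array[String]): Unit = {
      val (a, b) = (0xDEADBEEFL, 0x0F0F0F0FL)
      assert(logicOp(AmoAnd, a, b) == (a & b))
      assert(logicOp(AmoOr,  a, b) == (a | b))
      assert(logicOp(AmoXor, a, b) == (a ^ b))
      println("logical-op sharing identity holds")
    }
  }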
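
The same patch decodes signedness of min/max with one AND and one compare instead of four equality checks: M_XA_MIN and M_XA_MINU (likewise MAX/MAXU) differ only in the bit isolated by M_XA_MIN ^ M_XA_MINU, so (cmd & mask) === (M_XA_MIN & mask) is true exactly for the signed pair. The sketch below assumes the upstream Consts.scala encodings (MINU = b01110 and MAXU = b01111 appear in the patch context; MIN = b01100 and MAX = b01101 are the upstream values):

  object SignedDecodeSketch {
    val M_XA_MIN  = 0x0C // b01100
    val M_XA_MAX  = 0x0D // b01101
    val M_XA_MINU = 0x0E // b01110
    val M_XA_MAXU = 0x0F // b01111

    // One AND and one compare in place of a four-way decode.
    def sgned(cmd: Int): Boolean = {
      val mask = M_XA_MIN ^ M_XA_MINU          // isolates the "unsigned" bit
      (cmd & mask) == (M_XA_MIN & mask)
    }

    def main(args: Array[String]): Unit = {
      assert(sgned(M_XA_MIN) && sgned(M_XA_MAX))
      assert(!sgned(M_XA_MINU) && !sgned(M_XA_MAXU))
      println("one-bit signedness decode matches the four-way compare")
    }
  }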
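
Patch 5 introduces M_PWR, a masked (partial) store: for TileLink PutPartialData the byte mask arrives from the requester through the new HellaCacheWriteData bundle, while every other store derives its mask from (typ, addr) via StoreGen, as in the s1_mask mux added to DCache.scala. A sketch of that selection; the closed-form mask here is a simplification of StoreGen that assumes naturally aligned stores (which the D$ requires), and all names are hypothetical:

  object PwrMaskSketch {
    // Simplified StoreGen mask for a naturally aligned store of
    // 2^sizeLog2 bytes within a maxBytes-wide word.
    def storeGenMask(sizeLog2: Int, addr: Int, maxBytes: Int): Int = {
      val bytes = 1 << sizeLog2
      ((1 << bytes) - 1) << (addr & (maxBytes - 1) & ~(bytes - 1))
    }

    def s1Mask(cmdIsPwr: Boolean, reqMask: Int,
               sizeLog2: Int, addr: Int, maxBytes: Int): Int =
      if (cmdIsPwr) reqMask                       // caller-supplied mask (M_PWR)
      else storeGenMask(sizeLog2, addr, maxBytes) // derived, naturally aligned

    def main(args: Array[String]): Unit = {
      assert(s1Mask(false, 0, 2, 4, 8) == 0xf0)   // word store at offset 4
      assert(s1Mask(true, 0x36, 0, 0, 8) == 0x36) // PutPartial, sparse mask
      println("M_PWR carries the mask through; other stores derive it")
    }
  }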
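
Patch 5 can also drop AMOALU's addr and typ inputs because, once patch 1 restricts atomics to word size and up, the byte mask carries the same information on the 64-bit datapath: 0x0f is a low-half word op (mask(4)=0, mask(3)=1), 0xf0 a high-half word op (mask(4)=1, mask(3)=0), and 0xff a doubleword op (both set). mask(4) therefore selects which sign bit to compare, and !mask(3) flags the one case where the adder must suppress the carry out of bit 31 (a high-half op whose low bits are garbage). A sketch of that decode, assuming only these three mask shapes reach the ALU:

  object AmoMaskDecodeSketch {
    def bit(x: Int, i: Int): Boolean = ((x >> i) & 1) != 0

    // Which bit carries the sign of the active operand half.
    def signBitIndex(mask: Int): Int = if (!bit(mask, 4)) 31 else 63

    // Whether the adder must suppress the carry out of bit 31
    // (only for a high-half word op, where the low 32 bits are garbage).
    def suppressCarry31(mask: Int): Boolean = !bit(mask, 3)

    def main(args: Array[String]): Unit = {
      assert(signBitIndex(0x0f) == 31 && !suppressCarry31(0x0f))
      assert(signBitIndex(0xf0) == 63 && suppressCarry31(0xf0))
      assert(signBitIndex(0xff) == 63 && !suppressCarry31(0xff))
      println("mask-driven decode matches the old addr/typ-driven one")
    }
  }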