diff --git a/uncore/src/main/scala/broadcast.scala b/uncore/src/main/scala/broadcast.scala
index 9f6aba6e..a396de24 100644
--- a/uncore/src/main/scala/broadcast.scala
+++ b/uncore/src/main/scala/broadcast.scala
@@ -30,12 +30,18 @@ class L2BroadcastHub extends ManagerCoherenceAgent
   val internalDataBits = new DataQueueLocation().getWidth
   val inStoreQueue :: inVolWBQueue :: inClientReleaseQueue :: Nil = Enum(UInt(), nDataQueueLocations)
 
+  val trackerTLParams = params.alterPartial({
+    case TLDataBits => internalDataBits
+    case TLWriteMaskBits => innerWriteMaskBits
+  })
+
   // Create SHRs for outstanding transactions
-  val trackerList = (0 until nReleaseTransactors).map(id =>
-    Module(new BroadcastVoluntaryReleaseTracker(id), {case TLDataBits => internalDataBits})) ++
-    (nReleaseTransactors until nTransactors).map(id =>
-      Module(new BroadcastAcquireTracker(id), {case TLDataBits => internalDataBits}))
-
+  val trackerList =
+    (0 until nReleaseTransactors).map(id =>
+      Module(new BroadcastVoluntaryReleaseTracker(id))(trackerTLParams)) ++
+    (nReleaseTransactors until nTransactors).map(id =>
+      Module(new BroadcastAcquireTracker(id))(trackerTLParams))
+
   // Propagate incoherence flags
   trackerList.map(_.io.incoherent := io.incoherent)
 
@@ -100,7 +106,8 @@ class L2BroadcastHub extends ManagerCoherenceAgent
   // Create an arbiter for the one memory port
   val outer_arb = Module(new ClientUncachedTileLinkIOArbiter(trackerList.size),
                          { case TLId => params(OuterTLId)
-                           case TLDataBits => internalDataBits })
+                           case TLDataBits => internalDataBits
+                           case TLWriteMaskBits => innerWriteMaskBits })
   outer_arb.io.in <> trackerList.map(_.io.outer)
   // Get the pending data out of the store data queue
   val outer_data_ptr = new DataQueueLocation().fromBits(outer_arb.io.out.acquire.bits.data)
@@ -112,8 +119,6 @@ class L2BroadcastHub extends ManagerCoherenceAgent
   io.outer.acquire.bits.data := MuxLookup(outer_data_ptr.loc, io.irel().data, Array(
                                           inStoreQueue -> sdq(outer_data_ptr.idx),
                                           inVolWBQueue -> vwbdq(outer_data_ptr.idx)))
-  io.outer.acquire.bits.union := Cat(Fill(io.outer.acquire.bits.tlWriteMaskBits, outer_arb.io.out.acquire.bits.union(1)),
-                                     outer_arb.io.out.acquire.bits.union(0))
 
   // Update SDQ valid bits
   when (io.outer.acquire.valid || sdq_enq) {
@@ -209,14 +214,17 @@ class BroadcastAcquireTracker(trackerId: Int) extends BroadcastXactTracker {
   val s_idle :: s_probe :: s_mem_read :: s_mem_write :: s_make_grant :: s_mem_resp :: s_ack :: Nil = Enum(UInt(), 7)
   val state = Reg(init=s_idle)
 
-  val xact = Reg(Bundle(new AcquireFromSrc, { case TLId => params(InnerTLId); case TLDataBits => 0 }))
+  val xact = Reg(Bundle(new AcquireFromSrc, {
+    case TLId => params(InnerTLId)
+    case TLDataBits => 0
+    case TLWriteMaskBits => innerWriteMaskBits
+  }))
   val data_buffer = Reg(Vec(io.iacq().data, innerDataBeats))
   val coh = ManagerMetadata.onReset
 
   assert(!(state != s_idle && xact.isBuiltInType() &&
-      Vec(Acquire.getType, Acquire.putType, Acquire.putAtomicType,
-        Acquire.prefetchType).contains(xact.a_type)),
-    "Broadcast Hub does not support PutAtomics, subblock Gets/Puts, or prefetches") // TODO
+      Vec(Acquire.putAtomicType, Acquire.prefetchType).contains(xact.a_type)),
+    "Broadcast Hub does not support PutAtomics or prefetches") // TODO
 
   val release_count = Reg(init=UInt(0, width = log2Up(io.inner.tlNCachingClients+1)))
   val pending_probes = Reg(init=Bits(0, width = io.inner.tlNCachingClients))
@@ -236,6 +244,7 @@ class BroadcastAcquireTracker(trackerId: Int) extends BroadcastXactTracker {
   val pending_outer_write_ = io.iacq().hasData()
   val pending_outer_read = io.ignt().hasData()
   val pending_outer_read_ = coh.makeGrant(io.iacq(), UInt(trackerId)).hasData()
+  val subblock_type = xact.isSubBlockType()
 
   io.has_acquire_conflict := xact.conflicts(io.iacq()) &&
                                (state != s_idle) &&
@@ -246,22 +255,32 @@ class BroadcastAcquireTracker(trackerId: Int) extends BroadcastXactTracker {
                                !io.irel().isVoluntary() &&
                                (state === s_probe)
 
-  val outer_write_acq = Bundle(PutBlock(
-    client_xact_id = UInt(trackerId),
-    addr_block = xact.addr_block,
-    addr_beat = oacq_data_cnt,
-    data = data_buffer(oacq_data_cnt)))(outerTLParams)
-  val outer_write_rel = Bundle(PutBlock(
-    client_xact_id = UInt(trackerId),
-    addr_block = xact.addr_block,
-    addr_beat = io.irel().addr_beat,
-    data = io.irel().data))(outerTLParams)
-  val outer_read = Bundle(GetBlock(
-    client_xact_id = UInt(trackerId),
-    addr_block = xact.addr_block))(outerTLParams)
+  val oacq_type = MuxLookup(state, Acquire.getBlockType, Seq(
+    (s_probe, Acquire.putBlockType),
+    (s_mem_write, Mux(subblock_type, Acquire.putType, Acquire.putBlockType)),
+    (s_mem_read, Mux(subblock_type, Acquire.getType, Acquire.getBlockType))))
+  val oacq_beat = MuxLookup(state, UInt(0), Seq(
+    (s_probe, io.irel().addr_beat),
+    (s_mem_write, Mux(subblock_type, xact.addr_beat, oacq_data_cnt)),
+    (s_mem_read, Mux(subblock_type, xact.addr_beat, UInt(0)))))
+  val oacq_data = MuxLookup(state, Bits(0), Seq(
+    (s_probe, io.irel().data),
+    (s_mem_write, Mux(subblock_type,
+      data_buffer(0), data_buffer(oacq_data_cnt)))))
+  val oacq_union = MuxLookup(state, Bits(0), Seq(
+    (s_probe, Acquire.fullWriteMask),
+    (s_mem_write, xact.wmask()),
+    (s_mem_read, Cat(xact.addr_byte(), xact.op_size(), M_XRD))))
 
   io.outer.acquire.valid := Bool(false)
-  io.outer.acquire.bits := outer_read //default
+  io.outer.acquire.bits := Bundle(Acquire(
+    is_builtin_type = Bool(true),
+    a_type = oacq_type,
+    client_xact_id = UInt(trackerId),
+    addr_block = xact.addr_block,
+    addr_beat = oacq_beat,
+    data = oacq_data,
+    union = Cat(oacq_union, Bool(true))))(outerTLParams)
   io.outer.grant.ready := Bool(false)
 
   io.inner.probe.valid := Bool(false)
@@ -331,7 +350,6 @@ class BroadcastAcquireTracker(trackerId: Int) extends BroadcastXactTracker {
       when(io.inner.release.valid) {
         when(io.irel().hasData()) {
           io.outer.acquire.valid := Bool(true)
-          io.outer.acquire.bits := outer_write_rel
           when(io.outer.acquire.ready) {
             when(oacq_data_done) {
               pending_ognt_ack := Bool(true)
@@ -353,7 +371,6 @@ class BroadcastAcquireTracker(trackerId: Int) extends BroadcastXactTracker {
     }
     is(s_mem_write) { // Write data to outer memory
       io.outer.acquire.valid := !pending_ognt_ack || !collect_iacq_data || iacq_data_valid(oacq_data_cnt)
-      io.outer.acquire.bits := outer_write_acq
       when(oacq_data_done) {
         pending_ognt_ack := Bool(true)
         state := Mux(pending_outer_read, s_mem_read, s_mem_resp)
@@ -361,7 +378,6 @@ class BroadcastAcquireTracker(trackerId: Int) extends BroadcastXactTracker {
     }
     is(s_mem_read) { // Read data from outer memory (possibly what was just written)
       io.outer.acquire.valid := !pending_ognt_ack
-      io.outer.acquire.bits := outer_read
       when(io.outer.acquire.fire()) { state := s_mem_resp }
     }
     is(s_mem_resp) { // Wait to forward grants from outer memory
diff --git a/uncore/src/main/scala/tilelink.scala b/uncore/src/main/scala/tilelink.scala
index 6c4cf264..b78dcafd 100644
--- a/uncore/src/main/scala/tilelink.scala
+++ b/uncore/src/main/scala/tilelink.scala
@@ -34,6 +34,8 @@ case object TLDataBits extends Field[Int]
 case object TLDataBeats extends Field[Int]
 /** Whether the underlying physical network preserved point-to-point ordering of messages */
 case object TLNetworkIsOrderedP2P extends Field[Boolean]
+/** Number of bits in write mask (usually one per byte in beat) */
+case object TLWriteMaskBits extends Field[Int]
 
 /** Utility trait for building Modules and Bundles that use TileLink parameters */
 trait TileLinkParameters extends UsesParameters {
@@ -53,7 +55,7 @@ trait TileLinkParameters extends UsesParameters {
   val tlDataBits = params(TLDataBits)
   val tlDataBytes = tlDataBits/8
   val tlDataBeats = params(TLDataBeats)
-  val tlWriteMaskBits = if(tlDataBits/8 < 1) 1 else tlDataBits/8
+  val tlWriteMaskBits = params(TLWriteMaskBits)
   val tlBeatAddrBits = log2Up(tlDataBeats)
   val tlByteAddrBits = log2Up(tlWriteMaskBits)
   val tlMemoryOpcodeBits = M_SZ
@@ -1274,11 +1276,38 @@ class NASTIMasterIOTileLinkIOConverter extends TLModule with NASTIParameters {
   val addr_out = Reg(UInt(width = nastiXAddrBits))
   val has_data = Reg(init=Bool(false))
   val data_from_rel = Reg(init=Bool(false))
+  val is_subblock = io.tl.acquire.bits.isSubBlockType()
   val (tl_cnt_out, tl_wrap_out) =
-    Counter((io.tl.acquire.fire() && acq_has_data) ||
+    Counter((io.tl.acquire.fire() && io.tl.acquire.bits.hasMultibeatData()) ||
               (io.tl.release.fire() && rel_has_data), tlDataBeats)
   val tl_done_out = Reg(init=Bool(false))
 
+  val roq_size = 4
+  val roq_data = Reg(Vec(UInt(width = tlByteAddrBits), roq_size))
+  val roq_tags = Reg(Vec(UInt(width = nastiRIdBits), roq_size))
+  val roq_free = Reg(init = Fill(roq_size, Bits(1, 1)))
+  val roq_full = !roq_free.orR
+
+  val roq_enq_addr = PriorityEncoder(roq_free)
+  val roq_enq_valid = io.tl.acquire.fire() && !acq_has_data && is_subblock
+  val roq_enq_data = io.tl.acquire.bits.addr_byte()
+  val roq_enq_tag = io.nasti.ar.bits.id
+
+  val roq_deq_tag = io.nasti.r.bits.id
+  val roq_deq_addr = PriorityEncoder(roq_tags.map(_ === roq_deq_tag))
+  val roq_deq_data = roq_data(roq_deq_addr)
+  val roq_deq_valid = io.nasti.r.fire() && !io.nasti.r.bits.id(0)
+
+  when (roq_enq_valid) {
+    roq_data(roq_enq_addr) := roq_enq_data
+    roq_tags(roq_enq_addr) := roq_enq_tag
+    roq_free(roq_enq_addr) := Bool(false)
+  }
+
+  when (roq_deq_valid) {
+    roq_free(roq_deq_addr) := Bool(true)
+  }
+
   io.nasti.ar.bits.id := tag_out
   io.nasti.ar.bits.addr := addr_out
   io.nasti.ar.bits.len := Mux(has_data, UInt(tlDataBeats-1), UInt(0))
@@ -1293,7 +1322,7 @@ class NASTIMasterIOTileLinkIOConverter extends TLModule with NASTIParameters {
   io.nasti.aw.bits := io.nasti.ar.bits
   io.nasti.w.bits.strb := Mux(data_from_rel, SInt(-1), io.tl.acquire.bits.wmask())
   io.nasti.w.bits.data := Mux(data_from_rel, io.tl.release.bits.data, io.tl.acquire.bits.data)
-  io.nasti.w.bits.last := tl_wrap_out
+  io.nasti.w.bits.last := tl_wrap_out || (io.tl.acquire.fire() && is_subblock)
 
   when(!active_out){
     io.tl.release.ready := io.nasti.w.ready
@@ -1307,7 +1336,6 @@ class NASTIMasterIOTileLinkIOConverter extends TLModule with NASTIParameters {
       io.nasti.aw.valid := is_write
       io.nasti.ar.valid := !is_write
       cmd_sent_out := (!is_write && io.nasti.ar.ready) || (is_write && io.nasti.aw.ready)
-      tl_done_out := tl_wrap_out
       when(io.tl.release.valid) {
         data_from_rel := Bool(true)
         io.nasti.w.bits.data := io.tl.release.bits.data
@@ -1319,34 +1347,35 @@ class NASTIMasterIOTileLinkIOConverter extends TLModule with NASTIParameters {
         io.nasti.aw.bits.id := tag
         io.nasti.aw.bits.addr := addr
         io.nasti.aw.bits.len := UInt(tlDataBeats-1)
-        io.nasti.aw.bits.size := MT_Q
         tag_out := tag
         addr_out := addr
         has_data := rel_has_data
+        tl_done_out := tl_wrap_out
       } .elsewhen(io.tl.acquire.valid) {
         data_from_rel := Bool(false)
         io.nasti.w.bits.data := io.tl.acquire.bits.data
         io.nasti.w.bits.strb := io.tl.acquire.bits.wmask()
+        // The last bit indicates to the Grant logic what g_type to send back
+        // For read, true = getDataBlockType, false = getDataBeatType
+        // For write, it should always be false, so that putAckType is sent
         val tag = Cat(io.tl.acquire.bits.client_id,
                       io.tl.acquire.bits.client_xact_id,
-                      io.tl.acquire.bits.isBuiltInType())
+                      !is_write && !is_subblock)
         val addr = io.tl.acquire.bits.full_addr()
         when(is_write) {
           io.nasti.aw.bits.id := tag
           io.nasti.aw.bits.addr := addr
-          io.nasti.aw.bits.len := Mux(io.tl.acquire.bits.isBuiltInType(Acquire.putBlockType),
-                                      UInt(tlDataBeats-1), UInt(0))
-          io.nasti.aw.bits.size := bytesToXSize(PopCount(io.tl.acquire.bits.wmask()))
+          io.nasti.aw.bits.len := Mux(!is_subblock, UInt(tlDataBeats-1), UInt(0))
         } .otherwise {
           io.nasti.ar.bits.id := tag
           io.nasti.ar.bits.addr := addr
-          io.nasti.ar.bits.len := Mux(io.tl.acquire.bits.isBuiltInType(Acquire.getBlockType),
-                                      UInt(tlDataBeats-1), UInt(0))
+          io.nasti.ar.bits.len := Mux(!is_subblock, UInt(tlDataBeats-1), UInt(0))
           io.nasti.ar.bits.size := io.tl.acquire.bits.op_size()
         }
         tag_out := tag
         addr_out := addr
         has_data := acq_has_data
+        tl_done_out := tl_wrap_out || is_subblock
       }
     }
   }
@@ -1364,24 +1393,36 @@ class NASTIMasterIOTileLinkIOConverter extends TLModule with NASTIParameters {
       }
     }
     when(tl_wrap_out) { tl_done_out := Bool(true) }
-    when(cmd_sent_out && (!has_data || tl_done_out)) { active_out := Bool(false) }
+    when(cmd_sent_out && !roq_full && (!has_data || tl_done_out)) {
+      active_out := Bool(false)
+    }
   }
 
+  assert (!io.nasti.r.valid || !io.nasti.r.bits.resp(1),
+    "NASTI read response error")
+  assert (!io.nasti.b.valid || !io.nasti.b.bits.resp(1),
+    "NASTI write response error")
+
   // Aggregate incoming NASTI responses into TL Grants
   val (tl_cnt_in, tl_wrap_in) = Counter(io.tl.grant.fire() && io.tl.grant.bits.hasMultibeatData(), tlDataBeats)
   val gnt_arb = Module(new Arbiter(new GrantToDst, 2))
   io.tl.grant <> gnt_arb.io.out
 
+  val r_aligned_data = Mux(io.nasti.r.bits.id(0),
+    io.nasti.r.bits.data,
+    io.nasti.r.bits.data << Cat(roq_deq_data, UInt(0, 3)))
+
   gnt_arb.io.in(0).valid := io.nasti.r.valid
   io.nasti.r.ready := gnt_arb.io.in(0).ready
   gnt_arb.io.in(0).bits := Grant(
     dst = (if(dstIdBits > 0) io.nasti.r.bits.id(dst_off, tlClientXactIdBits + 1) else UInt(0)),
-    is_builtin_type = io.nasti.r.bits.id(0),
-    g_type = Mux(io.nasti.r.bits.id(0), Grant.getDataBlockType, UInt(0)), // TODO: Assumes MI or MEI protocol
+    is_builtin_type = Bool(true),
+    g_type = Mux(io.nasti.r.bits.id(0),
+      Grant.getDataBlockType, Grant.getDataBeatType), // TODO: Assumes MI or MEI protocol
     client_xact_id = io.nasti.r.bits.id >> 1,
     manager_xact_id = UInt(0),
     addr_beat = tl_cnt_in,
-    data = io.nasti.r.bits.data)
+    data = r_aligned_data)
 
   gnt_arb.io.in(1).valid := io.nasti.b.valid
   io.nasti.b.ready := gnt_arb.io.in(1).ready
diff --git a/uncore/src/main/scala/uncore.scala b/uncore/src/main/scala/uncore.scala
index a9cc66d6..969edcfd 100644
--- a/uncore/src/main/scala/uncore.scala
+++ b/uncore/src/main/scala/uncore.scala
@@ -20,6 +20,7 @@ trait CoherenceAgentParameters extends UsesParameters {
   val innerTLParams = params.alterPartial({case TLId => params(InnerTLId)})
   val innerDataBeats = innerTLParams(TLDataBeats)
   val innerDataBits = innerTLParams(TLDataBits)
+  val innerWriteMaskBits = innerTLParams(TLWriteMaskBits)
   val innerBeatAddrBits = log2Up(innerDataBeats)
   val innerByteAddrBits = log2Up(innerDataBits/8)
   require(outerDataBeats == innerDataBeats) //TODO: must fix all xact_data Vecs to remove this requirement