From 4ba8acb4aa26901899963136704d065a22e36460 Mon Sep 17 00:00:00 2001 From: "Wesley W. Terpstra" Date: Thu, 22 Mar 2018 14:27:43 -0700 Subject: [PATCH] TLRAM: add support for ECC (#1304) --- src/main/scala/diplomacy/SRAM.scala | 5 +- src/main/scala/tilelink/SRAM.scala | 174 +++++++++++++++++++++----- src/main/scala/unittest/Configs.scala | 5 +- 3 files changed, 146 insertions(+), 38 deletions(-) diff --git a/src/main/scala/diplomacy/SRAM.scala b/src/main/scala/diplomacy/SRAM.scala index d0b700f8..7892ebfd 100644 --- a/src/main/scala/diplomacy/SRAM.scala +++ b/src/main/scala/diplomacy/SRAM.scala @@ -22,10 +22,9 @@ abstract class DiplomaticSRAM( def mask: List[Boolean] = bigBits(address.mask >> log2Ceil(beatBytes)) // Use single-ported memory with byte-write enable - def makeSinglePortedByteWriteSeqMem(size: Int) = { + def makeSinglePortedByteWriteSeqMem(size: Int, lanes: Int = beatBytes, bits: Int = 8) = { // We require the address range to include an entire beat (for the write mask) - require ((address.mask & (beatBytes-1)) == beatBytes-1) - val mem = SeqMem(size, Vec(beatBytes, Bits(width = 8))) + val mem = SeqMem(size, Vec(lanes, Bits(width = bits))) devName.foreach(n => mem.suggestName(n.split("-").last)) mem } diff --git a/src/main/scala/tilelink/SRAM.scala b/src/main/scala/tilelink/SRAM.scala index 4f151678..403f1ee5 100644 --- a/src/main/scala/tilelink/SRAM.scala +++ b/src/main/scala/tilelink/SRAM.scala @@ -13,10 +13,16 @@ class TLRAM( cacheable: Boolean = true, executable: Boolean = true, beatBytes: Int = 4, + eccBytes: Int = 1, devName: Option[String] = None, - errors: Seq[AddressSet] = Nil) + errors: Seq[AddressSet] = Nil, + code: Code = new IdentityCode) (implicit p: Parameters) extends DiplomaticSRAM(address, beatBytes, devName) { + require (eccBytes >= 1 && isPow2(eccBytes)) + require (beatBytes >= 1 && isPow2(beatBytes)) + require (eccBytes <= beatBytes, s"TLRAM eccBytes (${eccBytes}) > beatBytes (${beatBytes}). Use a WidthWidget=>Fragmenter=>SRAM if you need high density and narrow ECC; it will do bursts efficiently") + val node = TLManagerNode(Seq(TLManagerPortParameters( Seq(TLManagerParameters( address = List(address) ++ errors, @@ -33,46 +39,127 @@ class TLRAM( lazy val module = new LazyModuleImp(this) { val (in, edge) = node.in(0) + val width = code.width(eccBytes*8) + val lanes = beatBytes/eccBytes val addrBits = (mask zip edge.addr_hi(in.a.bits).toBools).filter(_._1).map(_._2) - val a_legal = address.contains(in.a.bits.address) - val memAddress = Cat(addrBits.reverse) - val mem = makeSinglePortedByteWriteSeqMem(1 << addrBits.size) + val mem = makeSinglePortedByteWriteSeqMem(1 << addrBits.size, lanes, width) - val d_full = RegInit(Bool(false)) - val d_read = Reg(Bool()) - val d_size = Reg(UInt()) - val d_source = Reg(UInt()) - val d_data = Wire(UInt()) - val d_legal = Reg(Bool()) + /* This block uses a two-stage pipeline; A=>D + * Both stages vie for access to the single SRAM port. + * Stage D has absolute priority over stage A. + * - read-modify-writeback for sub-lane access happens here + * - writeback of correctable data happens here + * - both actions may occur concurrently + * Stage A has lower priority and will stall if blocked + * - read operations happen here + * - full-lane write operations happen here + */ - // Flow control - when (in.d.fire()) { d_full := Bool(false) } - when (in.a.fire()) { d_full := Bool(true) } - in.d.valid := d_full - in.a.ready := in.d.ready || !d_full + // D stage registers from A + val d_full = RegInit(Bool(false)) + val d_ram_valid = RegInit(Bool(false)) // true if we just read-out from SRAM + val d_size = Reg(UInt()) + val d_source = Reg(UInt()) + val d_legal = Reg(Bool()) + val d_read = Reg(Bool()) + val d_address = Reg(UInt(width = addrBits.size)) + val d_rmw_mask = Reg(UInt(width = beatBytes)) + val d_rmw_data = Reg(UInt(width = 8*beatBytes)) + + // Decode raw unregistered SRAM output + val d_raw_data = Wire(Vec(lanes, Bits(width = width))) + val d_decoded = d_raw_data.map(lane => code.decode(lane)) + val d_corrected = Cat(d_decoded.map(_.corrected).reverse) + val d_uncorrected = Cat(d_decoded.map(_.uncorrected).reverse) + val d_correctable = d_decoded.map(_.correctable) + val d_uncorrectable = d_decoded.map(_.uncorrectable) + val d_need_fix = d_correctable.reduce(_ || _) + val d_error = d_uncorrectable.reduce(_ || _) + + // What does D-stage want to write-back? + val d_wb_data = Vec(Seq.tabulate(beatBytes) { i => + val upd = d_rmw_mask(i) + val rmw = d_rmw_data (8*(i+1)-1, 8*i) + val fix = d_corrected(8*(i+1)-1, 8*i) // safe to use, because D-stage write-back always wins arbitration + Mux(upd, rmw, fix) + }.grouped(eccBytes).map(lane => Cat(lane.reverse)).toList) + val (d_wb_lanes, d_wb_poison) = Seq.tabulate(lanes) { i => + val upd = d_rmw_mask(eccBytes*(i+1)-1, eccBytes*i) + (upd.orR || d_correctable(i), + !upd.andR && d_uncorrectable(i)) // sub-lane writes should not correct uncorrectable + }.unzip + val d_wb = d_rmw_mask.orR || (d_ram_valid && d_need_fix) + + // Extend the validity of SRAM read-out + val d_held_data = RegEnable(d_corrected, d_ram_valid) + val d_held_error = RegEnable(d_error, d_ram_valid) - in.d.bits := edge.AccessAck(d_source, d_size, !d_legal) - // avoid data-bus Mux - in.d.bits.data := d_data in.d.bits.opcode := Mux(d_read, TLMessages.AccessAckData, TLMessages.AccessAck) + in.d.bits.param := UInt(0) + in.d.bits.size := d_size + in.d.bits.source := d_source + in.d.bits.sink := UInt(0) + // It is safe to use uncorrected data here because of d_pause + in.d.bits.data := Mux(d_ram_valid, d_uncorrected, d_held_data) + in.d.bits.error := !d_legal || Mux(d_ram_valid, d_error, d_held_error) - val read = in.a.bits.opcode === TLMessages.Get - val rdata = Wire(Vec(beatBytes, Bits(width = 8))) - val wdata = Vec.tabulate(beatBytes) { i => in.a.bits.data(8*(i+1)-1, 8*i) } - d_data := Cat(rdata.reverse) + // Formulate a response only when SRAM output is unused or correct + val d_pause = d_read && d_ram_valid && d_need_fix + in.d.valid := d_full && !d_pause + in.a.ready := !d_full || (in.d.ready && !d_pause && !d_wb) + + val a_legal = Bool(errors.isEmpty) || address.contains(in.a.bits.address) + val a_address = Cat(addrBits.reverse) + val a_read = in.a.bits.opcode === TLMessages.Get + val a_data = Vec(Seq.tabulate(lanes) { i => in.a.bits.data(eccBytes*8*(i+1)-1, eccBytes*8*i) }) + +/* + val a_sublane = Seq.tabulate(lanes) { i => + val upd = in.a.bits.mask(eccBytes*(i+1)-1, eccBytes*i) + upd.orR && !upd.andR + }.reduce(_ || _) +*/ + val a_sublane = if (eccBytes == 1) Bool(false) else + in.a.bits.opcode === TLMessages.PutPartialData || + in.a.bits.size < UInt(log2Ceil(eccBytes)) + val a_ren = a_read || a_sublane + val a_lanes = Seq.tabulate(lanes) { i => in.a.bits.mask(eccBytes*(i+1)-1, eccBytes*i).orR } + + when (in.d.fire()) { d_full := Bool(false) } + d_ram_valid := Bool(false) + d_rmw_mask := UInt(0) when (in.a.fire()) { - d_read := read - d_size := in.a.bits.size - d_source := in.a.bits.source - d_legal := a_legal + d_full := Bool(true) + d_ram_valid := a_ren && a_legal + d_size := in.a.bits.size + d_source := in.a.bits.source + d_legal := a_legal + d_read := a_read + d_address := a_address + d_rmw_mask := UInt(0) + when (!a_read && a_sublane) { + d_rmw_mask := in.a.bits.mask + d_rmw_data := in.a.bits.data + } + d_held_error:= Bool(false) } - // exactly this pattern is required to get a RWM memory - when (in.a.fire() && !read && a_legal) { - mem.write(memAddress, wdata, in.a.bits.mask.toBools) - } - val ren = in.a.fire() && read - rdata := mem.readAndHold(memAddress, ren) + // SRAM arbitration + val a_fire = in.a.fire() && a_legal + val wen = d_wb || (a_fire && !a_ren) +// val ren = !d_wb && (a_fire && a_ren) + val ren = !wen && a_fire // help Chisel infer a RW-port + + val addr = Mux(d_wb, d_address, a_address) + val sel = Mux(d_wb, Vec(d_wb_lanes), Vec(a_lanes)) + val dat = Mux(d_wb, d_wb_data, a_data) + val poison = Mux(d_wb, Vec(d_wb_poison), Vec.fill(lanes) { Bool(false) }) + val coded = Vec((dat zip poison) map { case (d, p) => + if (code.canDetect) code.encode(d, p) else code.encode(d) + }) + + d_raw_data := mem.read(addr, ren) + when (wen) { mem.write(addr, coded, sel) } // Tie off unused channels in.b.valid := Bool(false) @@ -88,10 +175,12 @@ object TLRAM cacheable: Boolean = true, executable: Boolean = true, beatBytes: Int = 4, + eccBytes: Int = 1, devName: Option[String] = None, - errors: Seq[AddressSet] = Nil)(implicit p: Parameters): TLInwardNode = + errors: Seq[AddressSet] = Nil, + code: Code = new IdentityCode)(implicit p: Parameters): TLInwardNode = { - val ram = LazyModule(new TLRAM(address, cacheable, executable, beatBytes, devName, errors)) + val ram = LazyModule(new TLRAM(address, cacheable, executable, beatBytes, eccBytes, devName, errors, code)) ram.node } } @@ -115,3 +204,20 @@ class TLRAMSimpleTest(ramBeatBytes: Int, txns: Int = 5000, timeout: Int = 500000 val dut = Module(LazyModule(new TLRAMSimple(ramBeatBytes, txns)).module) io.finished := dut.io.finished } + +class TLRAMECC(ramBeatBytes: Int, eccBytes: Int, txns: Int)(implicit p: Parameters) extends LazyModule { + val fuzz = LazyModule(new TLFuzzer(txns)) + val model = LazyModule(new TLRAMModel("SRAMSimple")) + val ram = LazyModule(new TLRAM(AddressSet(0x0, 0x3ff), beatBytes = ramBeatBytes, eccBytes = eccBytes, code = new SECDEDCode)) + + ram.node := TLDelayer(0.25) := model.node := fuzz.node + + lazy val module = new LazyModuleImp(this) with UnitTestModule { + io.finished := fuzz.module.io.finished + } +} + +class TLRAMECCTest(ramBeatBytes: Int, eccBytes: Int, txns: Int = 5000, timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { + val dut = Module(LazyModule(new TLRAMECC(ramBeatBytes, eccBytes, txns)).module) + io.finished := dut.io.finished +} diff --git a/src/main/scala/unittest/Configs.scala b/src/main/scala/unittest/Configs.scala index 996141b8..35b7528c 100644 --- a/src/main/scala/unittest/Configs.scala +++ b/src/main/scala/unittest/Configs.scala @@ -50,7 +50,10 @@ class WithTLSimpleUnitTests extends Config((site, here, up) => { Module(new TLRR1Test( txns= 3*txns, timeout=timeout)), Module(new TLRAMRationalCrossingTest(txns= 3*txns, timeout=timeout)), Module(new TLRAMAsyncCrossingTest( txns= 5*txns, timeout=timeout)), - Module(new TLRAMAtomicAutomataTest( txns=10*txns, timeout=timeout)) ) } + Module(new TLRAMAtomicAutomataTest( txns=10*txns, timeout=timeout)), + Module(new TLRAMECCTest(8, 4, txns=15*txns, timeout=timeout)), + Module(new TLRAMECCTest(4, 1, txns=15*txns, timeout=timeout)), + Module(new TLRAMECCTest(1, 1, txns=15*txns, timeout=timeout)) ) } }) class WithTLWidthUnitTests extends Config((site, here, up) => {