TLRAM: add support for ECC (#1304)

2018-03-22 14:27:43 -07:00
parent 12583af4a8
commit 4ba8acb4aa
3 changed files with 146 additions and 38 deletions
--- a/src/main/scala/diplomacy/SRAM.scala
+++ b/src/main/scala/diplomacy/SRAM.scala
@@ -22,10 +22,9 @@ abstract class DiplomaticSRAM(
  def mask: List[Boolean] = bigBits(address.mask >> log2Ceil(beatBytes))

  // Use single-ported memory with byte-write enable
-  def makeSinglePortedByteWriteSeqMem(size: Int) = {
+  def makeSinglePortedByteWriteSeqMem(size: Int, lanes: Int = beatBytes, bits: Int = 8) = {
    // We require the address range to include an entire beat (for the write mask)
-    require ((address.mask & (beatBytes-1)) == beatBytes-1)
-    val mem = SeqMem(size, Vec(beatBytes, Bits(width = 8)))
+    val mem = SeqMem(size, Vec(lanes, Bits(width = bits)))
    devName.foreach(n => mem.suggestName(n.split("-").last))
    mem
  }
--- a/src/main/scala/tilelink/SRAM.scala
+++ b/src/main/scala/tilelink/SRAM.scala
@@ -13,10 +13,16 @@ class TLRAM(
    cacheable: Boolean = true,
    executable: Boolean = true,
    beatBytes: Int = 4,
+    eccBytes: Int = 1,
    devName: Option[String] = None,
-    errors: Seq[AddressSet] = Nil)
+    errors: Seq[AddressSet] = Nil,
+    code: Code = new IdentityCode)
  (implicit p: Parameters) extends DiplomaticSRAM(address, beatBytes, devName)
 {
+  require (eccBytes  >= 1 && isPow2(eccBytes))
+  require (beatBytes >= 1 && isPow2(beatBytes))
+  require (eccBytes <= beatBytes, s"TLRAM eccBytes (${eccBytes}) > beatBytes (${beatBytes}). Use a WidthWidget=>Fragmenter=>SRAM if you need high density and narrow ECC; it will do bursts efficiently")
+
  val node = TLManagerNode(Seq(TLManagerPortParameters(
    Seq(TLManagerParameters(
      address            = List(address) ++ errors,
@@ -33,46 +39,127 @@ class TLRAM(
  lazy val module = new LazyModuleImp(this) {
    val (in, edge) = node.in(0)

+    val width = code.width(eccBytes*8)
+    val lanes = beatBytes/eccBytes
    val addrBits = (mask zip edge.addr_hi(in.a.bits).toBools).filter(_._1).map(_._2)
-    val a_legal = address.contains(in.a.bits.address)
-    val memAddress = Cat(addrBits.reverse)
-    val mem = makeSinglePortedByteWriteSeqMem(1 << addrBits.size)
+    val mem = makeSinglePortedByteWriteSeqMem(1 << addrBits.size, lanes, width)

-    val d_full = RegInit(Bool(false))
-    val d_read = Reg(Bool())
-    val d_size = Reg(UInt())
-    val d_source = Reg(UInt())
-    val d_data = Wire(UInt())
-    val d_legal = Reg(Bool())
+    /* This block uses a two-stage pipeline; A=>D
+     * Both stages vie for access to the single SRAM port.
+     * Stage D has absolute priority over stage A.
+     *   - read-modify-writeback for sub-lane access happens here
+     *   - writeback of correctable data happens here
+     *   - both actions may occur concurrently
+     * Stage A has lower priority and will stall if blocked
+     *   - read operations happen here
+     *   - full-lane write operations happen here
+     */

-    // Flow control
-    when (in.d.fire()) { d_full := Bool(false) }
-    when (in.a.fire()) { d_full := Bool(true)  }
-    in.d.valid := d_full
-    in.a.ready := in.d.ready || !d_full
+    // D stage registers from A
+    val d_full      = RegInit(Bool(false))
+    val d_ram_valid = RegInit(Bool(false)) // true if we just read-out from SRAM
+    val d_size      = Reg(UInt())
+    val d_source    = Reg(UInt())
+    val d_legal     = Reg(Bool())
+    val d_read      = Reg(Bool())
+    val d_address   = Reg(UInt(width = addrBits.size))
+    val d_rmw_mask  = Reg(UInt(width = beatBytes))
+    val d_rmw_data  = Reg(UInt(width = 8*beatBytes))
+
+    // Decode raw unregistered SRAM output
+    val d_raw_data      = Wire(Vec(lanes, Bits(width = width)))
+    val d_decoded       = d_raw_data.map(lane => code.decode(lane))
+    val d_corrected     = Cat(d_decoded.map(_.corrected).reverse)
+    val d_uncorrected   = Cat(d_decoded.map(_.uncorrected).reverse)
+    val d_correctable   = d_decoded.map(_.correctable)
+    val d_uncorrectable = d_decoded.map(_.uncorrectable)
+    val d_need_fix      = d_correctable.reduce(_ || _)
+    val d_error         = d_uncorrectable.reduce(_ || _)
+
+    // What does D-stage want to write-back?
+    val d_wb_data = Vec(Seq.tabulate(beatBytes) { i =>
+      val upd = d_rmw_mask(i)
+      val rmw = d_rmw_data (8*(i+1)-1, 8*i)
+      val fix = d_corrected(8*(i+1)-1, 8*i) // safe to use, because D-stage write-back always wins arbitration
+      Mux(upd, rmw, fix)
+    }.grouped(eccBytes).map(lane => Cat(lane.reverse)).toList)
+    val (d_wb_lanes, d_wb_poison) = Seq.tabulate(lanes) { i =>
+      val upd = d_rmw_mask(eccBytes*(i+1)-1, eccBytes*i)
+      (upd.orR || d_correctable(i),
+       !upd.andR && d_uncorrectable(i)) // sub-lane writes should not correct uncorrectable
+    }.unzip
+    val d_wb = d_rmw_mask.orR || (d_ram_valid && d_need_fix)
+
+    // Extend the validity of SRAM read-out
+    val d_held_data = RegEnable(d_corrected, d_ram_valid)
+    val d_held_error = RegEnable(d_error, d_ram_valid)

-    in.d.bits := edge.AccessAck(d_source, d_size, !d_legal)
-    // avoid data-bus Mux
-    in.d.bits.data := d_data
    in.d.bits.opcode := Mux(d_read, TLMessages.AccessAckData, TLMessages.AccessAck)
+    in.d.bits.param  := UInt(0)
+    in.d.bits.size   := d_size
+    in.d.bits.source := d_source
+    in.d.bits.sink   := UInt(0)
+    // It is safe to use uncorrected data here: d_pause withholds the D response whenever a read's fresh SRAM output has a correctable error
+    in.d.bits.data   := Mux(d_ram_valid, d_uncorrected, d_held_data)
+    in.d.bits.error  := !d_legal || Mux(d_ram_valid, d_error, d_held_error)

-    val read = in.a.bits.opcode === TLMessages.Get
-    val rdata = Wire(Vec(beatBytes, Bits(width = 8)))
-    val wdata = Vec.tabulate(beatBytes) { i => in.a.bits.data(8*(i+1)-1, 8*i) }
-    d_data := Cat(rdata.reverse)
+    // Formulate a response only when SRAM output is unused or correct
+    val d_pause = d_read && d_ram_valid && d_need_fix
+    in.d.valid := d_full && !d_pause
+    in.a.ready := !d_full || (in.d.ready && !d_pause && !d_wb)
+
+    val a_legal = Bool(errors.isEmpty) || address.contains(in.a.bits.address)
+    val a_address = Cat(addrBits.reverse)
+    val a_read = in.a.bits.opcode === TLMessages.Get
+    val a_data = Vec(Seq.tabulate(lanes) { i => in.a.bits.data(eccBytes*8*(i+1)-1, eccBytes*8*i) })
+
+/*
+    val a_sublane = Seq.tabulate(lanes) { i =>
+      val upd = in.a.bits.mask(eccBytes*(i+1)-1, eccBytes*i)
+      upd.orR && !upd.andR
+    }.reduce(_ || _)
+*/
+    val a_sublane = if (eccBytes == 1) Bool(false) else
+      in.a.bits.opcode === TLMessages.PutPartialData ||
+      in.a.bits.size < UInt(log2Ceil(eccBytes))
+    val a_ren = a_read || a_sublane
+    val a_lanes = Seq.tabulate(lanes) { i => in.a.bits.mask(eccBytes*(i+1)-1, eccBytes*i).orR }
+
+    when (in.d.fire()) { d_full := Bool(false) }
+    d_ram_valid := Bool(false)
+    d_rmw_mask  := UInt(0)
    when (in.a.fire()) {
-      d_read   := read
-      d_size   := in.a.bits.size
-      d_source := in.a.bits.source
-      d_legal  := a_legal
+      d_full      := Bool(true)
+      d_ram_valid := a_ren && a_legal
+      d_size      := in.a.bits.size
+      d_source    := in.a.bits.source
+      d_legal     := a_legal
+      d_read      := a_read
+      d_address   := a_address
+      d_rmw_mask  := UInt(0)
+      when (!a_read && a_sublane) {
+        d_rmw_mask := in.a.bits.mask
+        d_rmw_data := in.a.bits.data
+      }
+      d_held_error:= Bool(false)
    }

-    // exactly this pattern is required to get a RWM memory
-    when (in.a.fire() && !read && a_legal) {
-      mem.write(memAddress, wdata, in.a.bits.mask.toBools)
-    }
-    val ren = in.a.fire() && read
-    rdata := mem.readAndHold(memAddress, ren)
+    // SRAM arbitration
+    val a_fire = in.a.fire() && a_legal
+    val wen =  d_wb || (a_fire && !a_ren)
+//  val ren = !d_wb && (a_fire &&  a_ren)
+    val ren = !wen && a_fire // help Chisel infer a RW-port
+
+    val addr   = Mux(d_wb, d_address, a_address)
+    val sel    = Mux(d_wb, Vec(d_wb_lanes), Vec(a_lanes))
+    val dat    = Mux(d_wb, d_wb_data, a_data)
+    val poison = Mux(d_wb, Vec(d_wb_poison), Vec.fill(lanes) { Bool(false) })
+    val coded  = Vec((dat zip poison) map { case (d, p) =>
+      if (code.canDetect) code.encode(d, p) else code.encode(d)
+    })
+
+    d_raw_data := mem.read(addr, ren)
+    when (wen) { mem.write(addr, coded, sel) }

    // Tie off unused channels
    in.b.valid := Bool(false)
@@ -88,10 +175,12 @@ object TLRAM
    cacheable: Boolean = true,
    executable: Boolean = true,
    beatBytes: Int = 4,
+    eccBytes: Int = 1,
    devName: Option[String] = None,
-    errors: Seq[AddressSet] = Nil)(implicit p: Parameters): TLInwardNode =
+    errors: Seq[AddressSet] = Nil,
+    code: Code = new IdentityCode)(implicit p: Parameters): TLInwardNode =
  {
-    val ram = LazyModule(new TLRAM(address, cacheable, executable, beatBytes, devName, errors))
+    val ram = LazyModule(new TLRAM(address, cacheable, executable, beatBytes, eccBytes, devName, errors, code))
    ram.node
  }
 }
@@ -115,3 +204,20 @@ class TLRAMSimpleTest(ramBeatBytes: Int, txns: Int = 5000, timeout: Int = 500000
  val dut = Module(LazyModule(new TLRAMSimple(ramBeatBytes, txns)).module)
  io.finished := dut.io.finished
 }
+
+class TLRAMECC(ramBeatBytes: Int, eccBytes: Int, txns: Int)(implicit p: Parameters) extends LazyModule { // Test harness: a TLFuzzer and a TLRAMModel wired to a TLRAM built with SECDEDCode
+  val fuzz = LazyModule(new TLFuzzer(txns)) // fuzzer constructed with txns; its module's `finished` output ends the test
+  val model = LazyModule(new TLRAMModel("SRAMSimple")) // NOTE(review): the name string says "SRAMSimple" although this is the ECC harness -- confirm intended
+  val ram  = LazyModule(new TLRAM(AddressSet(0x0, 0x3ff), beatBytes = ramBeatBytes, eccBytes = eccBytes, code = new SECDEDCode)) // RAM under test; beat width and ECC granularity come from the constructor arguments

+  ram.node := TLDelayer(0.25) := model.node := fuzz.node // chains fuzz, model, a TLDelayer(0.25) and ram; presumably the fuzzer is the traffic source and ram the sink -- TODO confirm

+  lazy val module = new LazyModuleImp(this) with UnitTestModule { // module implementation exposing the UnitTestModule io
+    io.finished := fuzz.module.io.finished // the harness reports finished exactly when the fuzzer does
+  }
+}
+
+class TLRAMECCTest(ramBeatBytes: Int, eccBytes: Int, txns: Int = 5000, timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { // UnitTest wrapper around TLRAMECC; timeout is passed to the UnitTest base class
+  val dut = Module(LazyModule(new TLRAMECC(ramBeatBytes, eccBytes, txns)).module) // instantiate the TLRAMECC harness with the given beat width, ECC granularity and transaction count
+  io.finished := dut.io.finished // forward the harness's finished signal as the test result
+}
--- a/src/main/scala/unittest/Configs.scala
+++ b/src/main/scala/unittest/Configs.scala
@@ -50,7 +50,10 @@ class WithTLSimpleUnitTests extends Config((site, here, up) => {
      Module(new TLRR1Test(                txns= 3*txns, timeout=timeout)),
      Module(new TLRAMRationalCrossingTest(txns= 3*txns, timeout=timeout)),
      Module(new TLRAMAsyncCrossingTest(   txns= 5*txns, timeout=timeout)),
-      Module(new TLRAMAtomicAutomataTest(  txns=10*txns, timeout=timeout)) ) }
+      Module(new TLRAMAtomicAutomataTest(  txns=10*txns, timeout=timeout)),
+      Module(new TLRAMECCTest(8, 4,        txns=15*txns, timeout=timeout)),
+      Module(new TLRAMECCTest(4, 1,        txns=15*txns, timeout=timeout)),
+      Module(new TLRAMECCTest(1, 1,        txns=15*txns, timeout=timeout)) ) }
 })

 class WithTLWidthUnitTests extends Config((site, here, up) => {