From cfa86dba4f9b71eb4dadbfd674ed94bd476d7f93 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <waterman@cs.berkeley.edu>
Date: Thu, 2 May 2013 04:58:43 -0700
Subject: [PATCH] add FPGA test bench

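The memory models (mm_magic_t, mm_dramsim2_t) now support back pressure
on the response channel: tick() takes a resp_rdy input and pops a
response only when it is both valid and accepted. Word and line sizes
are now passed to init() at run time instead of coming from the
compile-time constants MM_WORD_SIZE and REFILL_COUNT.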
---
 csrc/emulator.cc                | 15 +++++++++------
 csrc/mm.cc                      | 29 +++++++++++++++++------------
 csrc/mm.h                       | 18 +++++++++++++-----
 csrc/mm_dramsim2.cc             | 26 +++++++++++++++-----------
 csrc/mm_dramsim2.h              |  5 +++--
 csrc/vcs_main.cc                | 24 +++++++++++++++---------
 riscv-rocket                    |  2 +-
 riscv-tests                     |  2 +-
 src/main/scala/RocketChip.scala |  5 +++--
 src/main/scala/fpga.scala       | 31 +++++++++----------------------
 uncore                          |  2 +-
 11 files changed, 87 insertions(+), 72 deletions(-)
diff --git a/csrc/emulator.cc b/csrc/emulator.cc
index 35bfaccb..693120ec 100644
--- a/csrc/emulator.cc
+++ b/csrc/emulator.cc
@@ -60,17 +60,18 @@ int main(int argc, char** argv)
     fprintf(vcdfile, "$upscope $end\n");
   }
 
-  mm_t* mm = dramsim2 ? (mm_t*)(new mm_dramsim2_t) : (mm_t*)(new mm_magic_t);
-  mm->init(MEM_SIZE);
-  if (loadmem)
-    load_mem(mm->get_data(), loadmem);
-
 
   // The chisel generated code
   Top_t tile;
   srand(random_seed);
   tile.init(random_seed != 0);
 
+  // Instantiate and initialize main memory (after the tile exists: the word size is its mem resp data width / 8)
+  mm_t* mm = dramsim2 ? (mm_t*)(new mm_dramsim2_t) : (mm_t*)(new mm_magic_t);
+  mm->init(MEM_SIZE, tile.Top__io_mem_resp_bits_data.width()/8, LINE_SIZE);
+  if (loadmem)
+    load_mem(mm->get_data(), loadmem);
+
   // Instantiate HTIF
   htif = new htif_emulator_t(std::vector<std::string>(argv + 1, argv + argc));
   int htif_bits = tile.Top__io_host_in_bits.width();
@@ -105,7 +106,9 @@ int main(int argc, char** argv)
       tile.Top__io_mem_req_cmd_bits_tag.lo_word(),
 
       tile.Top__io_mem_req_data_valid.lo_word(),
-      &tile.Top__io_mem_req_data_bits_data.values[0]
+      &tile.Top__io_mem_req_data_bits_data.values[0],
+
+      tile.Top__io_mem_resp_ready.to_bool()
     );
 
     if (tile.Top__io_host_clk_edge.to_bool())
diff --git a/csrc/mm.cc b/csrc/mm.cc
index afbbfef7..c5399cfd 100644
--- a/csrc/mm.cc
+++ b/csrc/mm.cc
@@ -5,8 +5,11 @@
 #include <cstring>
 #include <cassert>
 
-void mm_t::init(size_t sz)
+void mm_t::init(size_t sz, int wsz, int lsz) // sz: bytes of backing store; wsz/lsz: word/line size in bytes
 {
+  assert(wsz > 0 && lsz > 0 && (lsz & (lsz-1)) == 0 && lsz % wsz == 0); // line size: power of two and a whole number of words
+  word_size = wsz;
+  line_size = lsz;
   data = new char[sz];
   size = sz;
 }
@@ -16,10 +19,10 @@ mm_t::~mm_t()
   delete [] data;
 }
 
-void mm_magic_t::init(size_t sz)
+void mm_magic_t::init(size_t sz, int wsz, int lsz) // same parameters as mm_t::init, which this forwards to
 {
-  mm_t::init(sz);
-  dummy_data.resize(MM_WORD_SIZE);
+  mm_t::init(sz, wsz, lsz);
+  dummy_data.resize(word_size); // dummy_data holds exactly one word
 }
 
 void mm_magic_t::tick
@@ -29,28 +32,30 @@ void mm_magic_t::tick
   uint64_t req_cmd_addr,
   uint64_t req_cmd_tag,
   bool req_data_val,
-  void* req_data_bits
+  void* req_data_bits,
+  bool resp_rdy
 )
 {
   bool req_cmd_fire = req_cmd_val && req_cmd_ready();
   bool req_data_fire = req_data_val && req_data_ready();
+  bool resp_fire = resp_valid() && resp_rdy; // back pressure: a response is consumed only when valid and the receiver is ready
   assert(!(req_cmd_fire && req_data_fire));
 
-  if (resp_valid())
+  if (resp_fire)
     resp.pop();
 
   if (req_data_fire)
   {
-    memcpy(data + store_addr + store_count*MM_WORD_SIZE, req_data_bits, MM_WORD_SIZE);
+    memcpy(data + store_addr + store_count*word_size, req_data_bits, word_size);
 
-    store_count = (store_count + 1) % REFILL_COUNT;
+    store_count = (store_count + 1) % (line_size/word_size);
     if (store_count == 0)
       store_inflight = false;
   }
 
   if (req_cmd_fire)
   {
-    auto byte_addr = req_cmd_addr*REFILL_COUNT*MM_WORD_SIZE;
+    auto byte_addr = req_cmd_addr * line_size;
     assert(byte_addr < size);
 
     if (req_cmd_store)
@@ -58,10 +63,10 @@ void mm_magic_t::tick
       store_inflight = true;
       store_addr = byte_addr;
     }
-    else for (int i = 0; i < REFILL_COUNT; i++)
+    else for (int i = 0; i < line_size/word_size; i++)
     {
-      auto base = data + byte_addr + i*MM_WORD_SIZE;
-      auto dat = std::vector<char>(base, base + MM_WORD_SIZE);
+      auto base = data + byte_addr + i*word_size;
+      auto dat = std::vector<char>(base, base + word_size);
       resp.push(std::make_pair(req_cmd_tag, dat));
     }
   }
diff --git a/csrc/mm.h b/csrc/mm.h
index 64465405..13ccf202 100644
--- a/csrc/mm.h
+++ b/csrc/mm.h
@@ -1,17 +1,19 @@
 #ifndef MM_EMULATOR_H
 #define MM_EMULATOR_H
 
-#include "mm_param.h"
 #include <stdint.h>
 #include <cstring>
 #include <queue>
 
+const int LINE_SIZE = 64; // line size in bytes; all cores assume this.
+const size_t MEM_SIZE = (sizeof(long) > 4 ? 4L : 1L) * 1024*1024*1024; // bytes: 4 GiB when long is wider than 4 bytes, else 1 GiB
+
 class mm_t
 {
  public:
   mm_t() : data(0), size(0) {}
 
-  virtual void init(size_t sz);
+  virtual void init(size_t sz, int word_size, int line_size);
 
   virtual bool req_cmd_ready() = 0;
   virtual bool req_data_ready() = 0;
@@ -26,17 +28,22 @@ class mm_t
     uint64_t req_cmd_addr,
     uint64_t req_cmd_tag,
     bool req_data_val,
-    void* req_data_bits
+    void* req_data_bits,
+    bool resp_rdy
   ) = 0;
 
   virtual void* get_data() { return data; }
   virtual size_t get_size() { return size; }
+  virtual size_t get_word_size() { return word_size; } // bytes per memory word, as passed to init()
+  virtual size_t get_line_size() { return line_size; } // bytes per line, as passed to init()
 
   virtual ~mm_t();
 
  protected:
   char* data;
   size_t size;
+  int word_size; // bytes per request/response data word; set by init() (NOTE(review): not initialized by the constructor)
+  int line_size; // bytes per line (req_cmd_addr is scaled by this); set by init() (NOTE(review): not initialized by the constructor)
 };
 
 class mm_magic_t : public mm_t
@@ -44,7 +51,7 @@ class mm_magic_t : public mm_t
  public:
   mm_magic_t() : store_inflight(false), store_count(0) {}
 
-  virtual void init(size_t sz);
+  virtual void init(size_t sz, int word_size, int line_size);
 
   virtual bool req_cmd_ready() { return !store_inflight; }
   virtual bool req_data_ready() { return store_inflight; }
@@ -59,7 +66,8 @@ class mm_magic_t : public mm_t
     uint64_t req_cmd_addr,
     uint64_t req_cmd_tag,
     bool req_data_val,
-    void* req_data_bits
+    void* req_data_bits,
+    bool resp_rdy
   );
 
  protected:
diff --git a/csrc/mm_dramsim2.cc b/csrc/mm_dramsim2.cc
index 6be9eaf6..904a24b3 100644
--- a/csrc/mm_dramsim2.cc
+++ b/csrc/mm_dramsim2.cc
@@ -19,10 +19,10 @@ void mm_dramsim2_t::read_complete(unsigned id, uint64_t address, uint64_t clock_
   auto tag = req[address];
   req.erase(address);
 
-  for (int i = 0; i < REFILL_COUNT; i++)
+  for (int i = 0; i < line_size/word_size; i++)
   {
-    auto base = data + address + i*MM_WORD_SIZE;
-    auto dat = std::vector<char>(base, base + MM_WORD_SIZE);
+    auto base = data + address + i*word_size;
+    auto dat = std::vector<char>(base, base + word_size);
     resp.push(std::make_pair(tag, dat));
   }
 
@@ -43,10 +43,12 @@ void power_callback(double a, double b, double c, double d)
     //fprintf(stderr, "power callback: %0.3f, %0.3f, %0.3f, %0.3f\n",a,b,c,d);
 }
 
-void mm_dramsim2_t::init(size_t sz)
+void mm_dramsim2_t::init(size_t sz, int wsz, int lsz)
 {
-  mm_t::init(sz);
-  dummy_data.resize(MM_WORD_SIZE);
+  assert(lsz == 64); // assumed by dramsim2
+  mm_t::init(sz, wsz, lsz);
+
+  dummy_data.resize(word_size);
 
   assert(size % (1024*1024) == 0);
   mem = getMemorySystemInstance("DDR3_micron_64M_8B_x4_sg15.ini", "system.ini", "dramsim2_ini", "results", size/(1024*1024));
@@ -67,20 +69,22 @@ void mm_dramsim2_t::tick
   uint64_t req_cmd_addr,
   uint64_t req_cmd_tag,
   bool req_data_val,
-  void* req_data_bits
+  void* req_data_bits,
+  bool resp_rdy
 )
 {
   bool req_cmd_fire = req_cmd_val && req_cmd_ready();
   bool req_data_fire = req_data_val && req_data_ready();
+  bool resp_fire = resp_valid() && resp_rdy; // back pressure: a response is consumed only when valid and the receiver is ready
   assert(!(req_cmd_fire && req_data_fire));
 
-  if (resp_valid())
+  if (resp_fire)
     resp.pop();
 
   if (req_cmd_fire)
   {
     // since the I$ can speculatively ask for address that are out of bounds
-    auto byte_addr = (req_cmd_addr*REFILL_COUNT*MM_WORD_SIZE) % size;
+    auto byte_addr = (req_cmd_addr * line_size) % size;
 
     if (req_cmd_store)
     {
@@ -104,9 +108,9 @@ void mm_dramsim2_t::tick
 
   if (req_data_fire)
   {
-    memcpy(data + store_addr + store_count*MM_WORD_SIZE, req_data_bits, MM_WORD_SIZE);
+    memcpy(data + store_addr + store_count*word_size, req_data_bits, word_size);
 
-    store_count = (store_count + 1) % REFILL_COUNT;
+    store_count = (store_count + 1) % (line_size/word_size);
     if (store_count == 0)
     { // last chunch of cache line arrived.
       store_inflight = 0;
diff --git a/csrc/mm_dramsim2.h b/csrc/mm_dramsim2.h
index 7db22480..1546afe8 100644
--- a/csrc/mm_dramsim2.h
+++ b/csrc/mm_dramsim2.h
@@ -12,7 +12,7 @@ class mm_dramsim2_t : public mm_t
  public:
   mm_dramsim2_t() : store_inflight(false), store_count(0) {}
 
-  virtual void init(size_t sz);
+  virtual void init(size_t sz, int word_size, int line_size);
 
   virtual bool req_cmd_ready() { return mem->willAcceptTransaction() && !store_inflight; }
   virtual bool req_data_ready() { return mem->willAcceptTransaction() && store_inflight; }
@@ -27,7 +27,8 @@ class mm_dramsim2_t : public mm_t
     uint64_t req_cmd_addr,
     uint64_t req_cmd_tag,
     bool req_data_val,
-    void* req_data_bits
+    void* req_data_bits,
+    bool resp_rdy
   );
 
 
diff --git a/csrc/vcs_main.cc b/csrc/vcs_main.cc
index 862621cb..e5f5162b 100644
--- a/csrc/vcs_main.cc
+++ b/csrc/vcs_main.cc
@@ -26,23 +26,24 @@ void memory_tick(
   vc_handle mem_req_data_bits,
 
   vc_handle mem_resp_val,
+  vc_handle mem_resp_rdy,
   vc_handle mem_resp_tag,
   vc_handle mem_resp_data)
 {
-  uint32_t req_data[MM_WORD_SIZE*REFILL_COUNT/sizeof(uint32_t)];
-  for (size_t i = 0; i < MM_WORD_SIZE*REFILL_COUNT/sizeof(uint32_t); i++)
+  uint32_t req_data[mm->get_word_size()/sizeof(uint32_t)]; // one memory word as 32-bit chunks; NOTE(review): variable-length array (compiler extension in C++)
+  for (size_t i = 0; i < mm->get_word_size()/sizeof(uint32_t); i++)
     req_data[i] = vc_4stVectorRef(mem_req_data_bits)[i].d;
 
   vc_putScalar(mem_req_rdy, mm->req_cmd_ready());
   vc_putScalar(mem_req_data_rdy, mm->req_data_ready());
   vc_putScalar(mem_resp_val, mm->resp_valid());
 
-  vec32 d[MM_WORD_SIZE*REFILL_COUNT/sizeof(uint32_t)];
+  vec32 d[mm->get_word_size()/sizeof(uint32_t)]; // NOTE(review): variable-length array; d[0] is written below, so the word must be at least 4 bytes
   d[0].c = 0;
   d[0].d = mm->resp_tag();
   vc_put4stVector(mem_resp_tag, d);
 
-  for (size_t i = 0; i < MM_WORD_SIZE*REFILL_COUNT/sizeof(uint32_t); i++)
+  for (size_t i = 0; i < mm->get_word_size()/sizeof(uint32_t); i++)
   {
     d[i].c = 0;
     d[i].d = ((uint32_t*)mm->resp_data())[i];
@@ -56,22 +57,26 @@ void memory_tick(
     vc_4stVectorRef(mem_req_addr)->d,
     vc_4stVectorRef(mem_req_tag)->d,
     vc_getScalar(mem_req_data_val),
-    req_data
+    req_data,
+    vc_getScalar(mem_resp_rdy)
   );
 }
 
 void htif_init
 (
-  vc_handle width,
+  vc_handle htif_width,
+  vc_handle mem_width,
   vc_handle argv,
   vc_handle loadmem,
   vc_handle dramsim
 )
 {
+  int mw = vc_4stVectorRef(mem_width)->d; // memory data width; divided by 8 below, so presumably in bits
   mm = vc_getScalar(dramsim) ? (mm_t*)(new mm_dramsim2_t) : (mm_t*)(new mm_magic_t);
-  mm->init(MEM_SIZE);
+  assert(mw && (mw & (mw-1)) == 0); // width must be a nonzero power of two
+  mm->init(MEM_SIZE, mw/8, LINE_SIZE);
 
-  vec32* w = vc_4stVectorRef(width);
+  vec32* w = vc_4stVectorRef(htif_width);
   assert(w->d <= 32 && w->d % 8 == 0); // htif_tick assumes data fits in a vec32
   htif_bytes = w->d/8;
 
@@ -134,7 +139,8 @@ void htif_tick
   vc_put4stVector(htif_in_bits, &bits);
   vc_putScalar(htif_in_valid, peek_in_valid);
 
-  vc_putScalar(exit, htif->done() ? (htif->exit_code() << 1 | 1) : 0);
+  bits.d = htif->done() ? (htif->exit_code() << 1 | 1) : 0; // bit 0 = done flag, exit code in the bits above; 0 while still running
+  vc_put4stVector(exit, &bits);
 }
 
 }
diff --git a/riscv-rocket b/riscv-rocket
index ac48cb2a..a5063baf 160000
--- a/riscv-rocket
+++ b/riscv-rocket
@@ -1 +1 @@
-Subproject commit ac48cb2a5d5388b83aacbe1bf6c1b00610069346
+Subproject commit a5063baf1a5806f577e38c8c33d71225619da0c3
diff --git a/riscv-tests b/riscv-tests
index 1f25cfbd..1dd1e131 160000
--- a/riscv-tests
+++ b/riscv-tests
@@ -1 +1 @@
-Subproject commit 1f25cfbde65518f6e7b43d49451eb3ae1f9d2811
+Subproject commit 1dd1e13180dd65ffe3075cbdc5c12fda8c3e755f
diff --git a/src/main/scala/RocketChip.scala b/src/main/scala/RocketChip.scala
index d5d3f802..526907e2 100644
--- a/src/main/scala/RocketChip.scala
+++ b/src/main/scala/RocketChip.scala
@@ -323,7 +323,7 @@ class OuterMemorySystem(htif_width: Int, clientEndpoints: Seq[ClientCoherenceAge
     val incoherent = Vec(conf.ln.nClients) { Bool() }.asInput
     val mem_backup = new ioMemSerialized(htif_width)
     val mem_backup_en = Bool(INPUT)
-    val mem = new ioMemPipe
+    val mem = new ioMem
   }
 
   import rocket.Constants._
@@ -375,6 +375,7 @@ class OuterMemorySystem(htif_width: Int, clientEndpoints: Seq[ClientCoherenceAge
   mem_serdes.io.wide.req_data.bits := mem_dataq.io.deq.bits
 
   llc.io.mem.resp.valid := Mux(io.mem_backup_en, mem_serdes.io.wide.resp.valid, io.mem.resp.valid)
+  io.mem.resp.ready := Bool(true) // responses arriving on io.mem are always accepted here
   llc.io.mem.resp.bits := Mux(io.mem_backup_en, mem_serdes.io.wide.resp.bits, io.mem.resp.bits)
 
   io.mem_backup <> mem_serdes.io.narrow
@@ -388,7 +389,7 @@ class Uncore(htif_width: Int, tileList: Seq[ClientCoherenceAgent])(implicit conf
     val host = new HostIO(htif_width)
     val mem_backup = new ioMemSerialized(htif_width)
     val mem_backup_en = Bool(INPUT)
-    val mem = new ioMemPipe
+    val mem = new ioMem
     val tiles = Vec(conf.ln.nClients) { new TileLinkIO }.flip
     val htif = Vec(conf.ln.nClients) { new HTIFIO(conf.ln.nClients) }.flip
     val incoherent = Vec(conf.ln.nClients) { Bool() }.asInput
diff --git a/src/main/scala/fpga.scala b/src/main/scala/fpga.scala
index 3788864f..f906e3e2 100644
--- a/src/main/scala/fpga.scala
+++ b/src/main/scala/fpga.scala
@@ -13,7 +13,7 @@ class FPGAOuterMemorySystem(htif_width: Int, clientEndpoints: Seq[ClientCoherenc
     val tiles = Vec(conf.ln.nClients) { new TileLinkIO }.flip
     val htif = (new TileLinkIO).flip
     val incoherent = Vec(conf.ln.nClients) { Bool() }.asInput
-    val mem = new ioMemPipe
+    val mem = new ioMem
   }
 
   import rocket.Constants._
@@ -25,9 +25,6 @@ class FPGAOuterMemorySystem(htif_width: Int, clientEndpoints: Seq[ClientCoherenc
   require(clientEndpoints.length == lnWithHtifConf.nClients)
   val masterEndpoints = (0 until lnWithHtifConf.nMasters).map(new L2CoherenceAgent(_)(ucWithHtifConf))
 
-  val llc = new DRAMSideLLCNull(NGLOBAL_XACTS, REFILL_CYCLES)
-  val mem_serdes = new MemSerdes(htif_width)
-
   val net = new ReferenceChipCrossbarNetwork(masterEndpoints++clientEndpoints)(lnWithHtifConf)
   net.io zip (masterEndpoints.map(_.io.client) ++ io.tiles :+ io.htif) map { case (net, end) => net <> end }
   masterEndpoints.map{ _.io.incoherent zip (io.incoherent ++ List(Bool(true))) map { case (m, c) => m := c } }
@@ -40,19 +37,9 @@ class FPGAOuterMemorySystem(htif_width: Int, clientEndpoints: Seq[ClientCoherenc
   } else {
     conv.io.uncached <> masterEndpoints.head.io.master
   }
-  llc.io.cpu.req_cmd <> Queue(conv.io.mem.req_cmd)
-  llc.io.cpu.req_data <> Queue(conv.io.mem.req_data, REFILL_CYCLES)
-  conv.io.mem.resp <> llc.io.cpu.resp
-
-  val mem_cmdq = (new Queue(2)) { new MemReqCmd }
-  mem_cmdq.io.enq <> llc.io.mem.req_cmd
-  mem_cmdq.io.deq <> io.mem.req_cmd
-
-  val mem_dataq = (new Queue(REFILL_CYCLES)) { new MemData }
-  mem_dataq.io.enq <> llc.io.mem.req_data
-  mem_dataq.io.deq <> io.mem.req_data
-
-  llc.io.mem.resp <> io.mem.resp
+  io.mem.req_cmd <> Queue(conv.io.mem.req_cmd) // converter requests now go to io.mem through a queue
+  io.mem.req_data <> Queue(conv.io.mem.req_data, REFILL_CYCLES)
+  conv.io.mem.resp <> Queue(io.mem.resp, 16) // responses from io.mem are queued (16 entries) ahead of the converter
 }
 
 class FPGAUncore(htif_width: Int, tileList: Seq[ClientCoherenceAgent])(implicit conf: UncoreConfiguration) extends Component
@@ -61,7 +48,7 @@ class FPGAUncore(htif_width: Int, tileList: Seq[ClientCoherenceAgent])(implicit
   val io = new Bundle {
     val debug = new DebugIO()
     val host = new HostIO(htif_width)
-    val mem = new ioMemPipe
+    val mem = new ioMem
     val tiles = Vec(conf.ln.nClients) { new TileLinkIO }.flip
     val htif = Vec(conf.ln.nClients) { new HTIFIO(conf.ln.nClients) }.flip
     val incoherent = Vec(conf.ln.nClients) { Bool() }.asInput
@@ -188,11 +175,11 @@ class Slave extends AXISlave
 
   // read cr1 -> mem.req_cmd (nonblocking)
   // the memory system is FIFO from hereon out, so just remember the tags here
-  val tagq = new Queue(NGLOBAL_XACTS)(top.io.mem.req_cmd.bits.tag.clone)
+  val tagq = new Queue(4)(top.io.mem.req_cmd.bits.tag.clone)
   tagq.io.enq.bits := top.io.mem.req_cmd.bits.tag
   tagq.io.enq.valid := ren(1) && top.io.mem.req_cmd.valid && !top.io.mem.req_cmd.bits.rw
   top.io.mem.req_cmd.ready := ren(1)
-  rdata(1) := Cat(top.io.mem.req_cmd.bits.addr, top.io.mem.req_cmd.bits.rw, top.io.mem.req_cmd.valid)
+  rdata(1) := Cat(top.io.mem.req_cmd.bits.addr, top.io.mem.req_cmd.bits.rw, top.io.mem.req_cmd.valid && (tagq.io.enq.ready || top.io.mem.req_cmd.bits.rw)) // report valid only if tagq can take the tag (only non-rw requests enqueue one)
   rvalid(1) := Bool(true)
   require(dw >= top.io.mem.req_cmd.bits.addr.getWidth + 1 + 1)
 
@@ -205,7 +192,7 @@ class Slave extends AXISlave
   top.io.mem.resp.bits.tag := tagq.io.deq.bits
   top.io.mem.resp.valid := wen(1) && in_count.andR
   tagq.io.deq.ready := top.io.mem.resp.fire() && rf_count.andR
-  wready(1) := Bool(true) //top.io.mem.resp.ready
+  wready(1) := top.io.mem.resp.ready // cr1 writes are accepted only when top's memory response port is ready
   when (wen(1) && wready(1)) {
     in_count := in_count + UFix(1)
     in_reg := top.io.mem.resp.bits.data
@@ -222,7 +209,7 @@ class Slave extends AXISlave
   when (ren(2) && rvalid(2)) { out_count := out_count + UFix(1) }
 
   // read cr3 -> error mode (nonblocking)
-  rdata(3) := top.io.debug.error_mode
+  rdata(3) := Cat(top.io.mem.req_cmd.valid, tagq.io.enq.ready, top.io.debug.error_mode) // also exposes req_cmd.valid and tagq.io.enq.ready alongside error_mode (presumably in the higher bits — confirm Cat ordering)
   rvalid(3) := Bool(true)
 
   // writes to cr2, cr3 ignored
diff --git a/uncore b/uncore
index e39b29ba..d154f3fd 160000
--- a/uncore
+++ b/uncore
@@ -1 +1 @@
-Subproject commit e39b29bac3889f43fa666bdd72d86b17d439b9ca
+Subproject commit d154f3fdb673d28e26363e7d22df4ac1770f2c2c