1
0
Fork 0
rocket-chip/src/main/scala/tile/LazyRoCC.scala

386 lines
12 KiB
Scala
Raw Normal View History

// See LICENSE.Berkeley for license details.
// See LICENSE.SiFive for license details.
2014-09-13 03:06:41 +02:00
package freechips.rocketchip.tile
import Chisel._
import freechips.rocketchip.config._
2018-01-12 21:29:27 +01:00
import freechips.rocketchip.subsystem._
import freechips.rocketchip.diplomacy._
import freechips.rocketchip.rocket._
import freechips.rocketchip.tilelink._
import freechips.rocketchip.util.InOrderArbiter
2016-02-25 07:39:00 +01:00
case object RoccNPTWPorts extends Field[Int]
Heterogeneous Tiles (#550) Fundamental new features: * Added tile package: This package is intended to hold components re-usable across different types of tile. Will be the future location of TL2-RoCC accelerators and new diplomatic versions of intra-tile interfaces. * Adopted [ModuleName]Params convention: Code base was very inconsistent about what to name case classes that provide parameters to modules. Settled on calling them [ModuleName]Params to distinguish them from config.Parameters and config.Config. So far applied mostly only to case classes defined within rocket and tile. * Defined RocketTileParams: A nested case class containing case classes for all the components of a tile (L1 caches and core). Allows all such parameters to vary per-tile. * Defined RocketCoreParams: All the parameters that can be varied per-core. * Defined L1CacheParams: A trait defining the parameters common to L1 caches, made concrete in different derived case classes. * Defined RocketTilesKey: A sequence of RocketTileParams, one for every tile to be created. * Provided HeterogeneousDualCoreConfig: An example of making a heterogeneous chip with two cores, one big and one little. * Changes to legacy code: ReplacementPolicy moved to package util. L1Metadata moved to package tile. Legacy L2 cache agent removed because it can no longer share the metadata array implementation with the L1. Legacy GroundTests on life support. Additional changes that got rolled in along the way: * rocket: Fix critical path through BTB for I$ index bits > pgIdxBits * coreplex: tiles connected via :=* * groundtest: updated to use TileParams * tilelink: cache cork requirements are relaxed to allow more cacheless masters
2017-02-09 22:59:09 +01:00
case object BuildRoCC extends Field[Seq[RoCCParams]]
Heterogeneous Tiles (#550) Fundamental new features: * Added tile package: This package is intended to hold components re-usable across different types of tile. Will be the future location of TL2-RoCC accelerators and new diplomatic versions of intra-tile interfaces. * Adopted [ModuleName]Params convention: Code base was very inconsistent about what to name case classes that provide parameters to modules. Settled on calling them [ModuleName]Params to distinguish them from config.Parameters and config.Config. So far applied mostly only to case classes defined within rocket and tile. * Defined RocketTileParams: A nested case class containing case classes for all the components of a tile (L1 caches and core). Allows all such parameters to vary per-tile. * Defined RocketCoreParams: All the parameters that can be varied per-core. * Defined L1CacheParams: A trait defining the parameters common to L1 caches, made concrete in different derived case classes. * Defined RocketTilesKey: A sequence of RocketTileParams, one for every tile to be created. * Provided HeterogeneousDualCoreConfig: An example of making a heterogeneous chip with two cores, one big and one little. * Changes to legacy code: ReplacementPolicy moved to package util. L1Metadata moved to package tile. Legacy L2 cache agent removed because it can no longer share the metadata array implementation with the L1. Legacy GroundTests on life support. Additional changes that got rolled in along the way: * rocket: Fix critical path through BTB for I$ index bits > pgIdxBits * coreplex: tiles connected via :=* * groundtest: updated to use TileParams * tilelink: cache cork requirements are relaxed to allow more cacheless masters
2017-02-09 22:59:09 +01:00
case class RoCCParams(
opcodes: OpcodeSet,
generator: Parameters => LazyRoCC,
nPTWPorts : Int = 0,
useFPU: Boolean = false)
2013-09-15 00:31:50 +02:00
class RoCCInstruction extends Bundle
{
val funct = Bits(width = 7)
val rs2 = Bits(width = 5)
val rs1 = Bits(width = 5)
2013-09-15 00:31:50 +02:00
val xd = Bool()
val xs1 = Bool()
val xs2 = Bool()
val rd = Bits(width = 5)
2013-09-15 00:31:50 +02:00
val opcode = Bits(width = 7)
}
2015-10-06 06:48:05 +02:00
class RoCCCommand(implicit p: Parameters) extends CoreBundle()(p) {
2013-09-15 00:31:50 +02:00
val inst = new RoCCInstruction
val rs1 = Bits(width = xLen)
val rs2 = Bits(width = xLen)
val status = new MStatus
2013-09-15 00:31:50 +02:00
}
2015-10-06 06:48:05 +02:00
class RoCCResponse(implicit p: Parameters) extends CoreBundle()(p) {
2013-09-15 00:31:50 +02:00
val rd = Bits(width = 5)
val data = Bits(width = xLen)
2013-09-15 00:31:50 +02:00
}
class RoCCCoreIO(implicit p: Parameters) extends CoreBundle()(p) {
2013-09-15 00:31:50 +02:00
val cmd = Decoupled(new RoCCCommand).flip
val resp = Decoupled(new RoCCResponse)
val mem = new HellaCacheIO
2014-01-29 07:13:16 +01:00
val busy = Bool(OUTPUT)
val interrupt = Bool(OUTPUT)
val exception = Bool(INPUT)
}
/** Base classes for Diplomatic TL2 RoCC units **/
abstract class LazyRoCC(implicit p: Parameters) extends LazyModule {
val module: LazyRoCCModule
val atlNode: TLNode = TLIdentityNode()
val tlNode: TLNode = TLIdentityNode()
}
class RoCCIO(val outer: LazyRoCC)(implicit p: Parameters) extends RoCCCoreIO()(p) {
// Should be handled differently, eventually
2016-02-25 07:39:00 +01:00
val ptw = Vec(p(RoccNPTWPorts), new TLBPTWIO)
2015-04-02 10:30:11 +02:00
val fpu_req = Decoupled(new FPInput)
val fpu_resp = Decoupled(new FPResult).flip
}
class LazyRoCCModule(outer: LazyRoCC) extends LazyModuleImp(outer) {
val io = IO(new RoCCIO(outer))
}
/** Mixins for including RoCC **/
trait HasLazyRoCC extends CanHavePTW { this: BaseTile =>
val roccs = p(BuildRoCC).zipWithIndex.map { case (accelParams, i) =>
accelParams.generator(p.alterPartial({
case RoccNPTWPorts => accelParams.nPTWPorts
}))}
roccs.map(_.atlNode).foreach { atl => tlMasterXbar.node :=* atl }
roccs.map(_.tlNode).foreach { tl => tlOtherMastersNode :=* tl }
nPTWPorts += p(BuildRoCC).map(_.nPTWPorts).foldLeft(0)(_ + _)
nDCachePorts += roccs.size
}
2016-01-13 00:36:16 +01:00
trait HasLazyRoCCModule[+L <: BaseTile with HasLazyRoCC] extends CanHavePTWModule
with HasCoreParameters { this: BaseTileModuleImp[L] =>
val roccCore = Wire(new RoCCCoreIO()(outer.p))
val buildRocc = outer.p(BuildRoCC)
val usingRocc = !buildRocc.isEmpty
val nRocc = buildRocc.size
val nFPUPorts = buildRocc.filter(_.useFPU).size
val roccOpcodes = buildRocc.map(_.opcodes)
if(usingRocc) {
val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), nRocc))
roccCore.resp <> respArb.io.out
val cmdRouter = Module(new RoccCommandRouter(roccOpcodes)(outer.p))
cmdRouter.io.in <> roccCore.cmd
outer.roccs.zipWithIndex.foreach { case (rocc, i) =>
ptwPorts ++= rocc.module.io.ptw
rocc.module.io.cmd <> cmdRouter.io.out(i)
rocc.module.io.exception := roccCore.exception
val dcIF = Module(new SimpleHellaCacheIF()(outer.p))
dcIF.io.requestor <> rocc.module.io.mem
dcachePorts += dcIF.io.cache
respArb.io.in(i) <> Queue(rocc.module.io.resp)
}
roccCore.busy := cmdRouter.io.busy || outer.roccs.map(_.module.io.busy).reduce(_ || _)
roccCore.interrupt := outer.roccs.map(_.module.io.interrupt).reduce(_ || _)
fpuOpt foreach { fpu =>
if (usingFPU && nFPUPorts > 0) {
val fpArb = Module(new InOrderArbiter(new FPInput()(outer.p), new FPResult()(outer.p), nFPUPorts))
val fp_rocc_ios = outer.roccs.zip(buildRocc)
.filter { case (_, params) => params.useFPU }
.map { case (rocc, _) => rocc.module.io }
fpArb.io.in_req <> fp_rocc_ios.map(_.fpu_req)
fp_rocc_ios.zip(fpArb.io.in_resp).foreach {
case (rocc, arb) => rocc.fpu_resp <> arb
}
fpu.io.cp_req <> fpArb.io.out_req
fpArb.io.out_resp <> fpu.io.cp_resp
} else {
fpu.io.cp_req.valid := Bool(false)
fpu.io.cp_resp.ready := Bool(false)
}
}
}
2013-09-15 00:31:50 +02:00
}
class AccumulatorExample(implicit p: Parameters) extends LazyRoCC {
override lazy val module = new AccumulatorExampleModule(this)
2013-09-15 00:31:50 +02:00
}
class AccumulatorExampleModule(outer: AccumulatorExample, n: Int = 4)(implicit p: Parameters) extends LazyRoCCModule(outer)
with HasCoreParameters {
val regfile = Mem(n, UInt(width = xLen))
val busy = Reg(init = Vec.fill(n){Bool(false)})
2013-09-15 00:31:50 +02:00
2013-09-15 07:34:53 +02:00
val cmd = Queue(io.cmd)
val funct = cmd.bits.inst.funct
val addr = cmd.bits.rs2(log2Up(n)-1,0)
2013-09-15 07:34:53 +02:00
val doWrite = funct === UInt(0)
val doRead = funct === UInt(1)
val doLoad = funct === UInt(2)
val doAccum = funct === UInt(3)
val memRespTag = io.mem.resp.bits.tag(log2Up(n)-1,0)
// datapath
val addend = cmd.bits.rs1
2013-09-15 00:31:50 +02:00
val accum = regfile(addr)
2013-09-15 07:34:53 +02:00
val wdata = Mux(doWrite, addend, accum + addend)
2013-09-15 00:31:50 +02:00
2013-09-15 07:34:53 +02:00
when (cmd.fire() && (doWrite || doAccum)) {
2013-09-15 00:31:50 +02:00
regfile(addr) := wdata
}
2013-09-15 07:34:53 +02:00
when (io.mem.resp.valid) {
regfile(memRespTag) := io.mem.resp.bits.data
2016-08-04 20:17:13 +02:00
busy(memRespTag) := Bool(false)
2013-09-15 07:34:53 +02:00
}
// control
when (io.mem.req.fire()) {
busy(addr) := Bool(true)
}
val doResp = cmd.bits.inst.xd
val stallReg = busy(addr)
val stallLoad = doLoad && !io.mem.req.ready
val stallResp = doResp && !io.resp.ready
cmd.ready := !stallReg && !stallLoad && !stallResp
// command resolved if no stalls AND not issuing a load that will need a request
// PROC RESPONSE INTERFACE
io.resp.valid := cmd.valid && doResp && !stallReg && !stallLoad
// valid response if valid command, need a response, and no stalls
2013-09-15 07:34:53 +02:00
io.resp.bits.rd := cmd.bits.inst.rd
// Must respond with the appropriate tag or undefined behavior
io.resp.bits.data := accum
// Semantics is to always send out prior accumulator register value
2013-09-15 07:34:53 +02:00
io.busy := cmd.valid || busy.reduce(_||_)
// Be busy when have pending memory requests or committed possibility of pending requests
2013-09-15 00:31:50 +02:00
io.interrupt := Bool(false)
// Set this true to trigger an interrupt on the processor (please refer to supervisor documentation)
2013-09-15 07:34:53 +02:00
// MEMORY REQUEST INTERFACE
io.mem.req.valid := cmd.valid && doLoad && !stallReg && !stallResp
2013-09-15 07:34:53 +02:00
io.mem.req.bits.addr := addend
io.mem.req.bits.tag := addr
io.mem.req.bits.cmd := M_XRD // perform a load (M_XWR for stores)
2013-09-15 07:34:53 +02:00
io.mem.req.bits.typ := MT_D // D = 8 bytes, W = 4, H = 2, B = 1
io.mem.req.bits.data := Bits(0) // we're not performing any stores...
io.mem.req.bits.phys := Bool(false)
io.mem.invalidate_lr := Bool(false)
}
2014-05-15 01:17:39 +02:00
class TranslatorExample(implicit p: Parameters) extends LazyRoCC {
override lazy val module = new TranslatorExampleModule(this)
2013-09-15 00:31:50 +02:00
}
class TranslatorExampleModule(outer: TranslatorExample)(implicit p: Parameters) extends LazyRoCCModule(outer)
with HasCoreParameters {
val req_addr = Reg(UInt(width = coreMaxAddrBits))
val req_rd = Reg(io.resp.bits.rd)
val req_offset = req_addr(pgIdxBits - 1, 0)
val req_vpn = req_addr(coreMaxAddrBits - 1, pgIdxBits)
2016-03-03 08:29:58 +01:00
val pte = Reg(new PTE)
val s_idle :: s_ptw_req :: s_ptw_resp :: s_resp :: Nil = Enum(Bits(), 4)
val state = Reg(init = s_idle)
io.cmd.ready := (state === s_idle)
when (io.cmd.fire()) {
req_rd := io.cmd.bits.inst.rd
req_addr := io.cmd.bits.rs1
state := s_ptw_req
}
2016-02-25 07:39:00 +01:00
private val ptw = io.ptw(0)
2016-02-25 07:39:00 +01:00
when (ptw.req.fire()) { state := s_ptw_resp }
when (state === s_ptw_resp && ptw.resp.valid) {
2016-03-03 08:29:58 +01:00
pte := ptw.resp.bits.pte
state := s_resp
}
when (io.resp.fire()) { state := s_idle }
2016-02-25 07:39:00 +01:00
ptw.req.valid := (state === s_ptw_req)
ptw.req.bits.addr := req_vpn
io.resp.valid := (state === s_resp)
io.resp.bits.rd := req_rd
2016-08-04 20:17:13 +02:00
io.resp.bits.data := Mux(pte.leaf(), Cat(pte.ppn, req_offset), SInt(-1, xLen).asUInt)
io.busy := (state =/= s_idle)
io.interrupt := Bool(false)
io.mem.req.valid := Bool(false)
}
class CharacterCountExample(implicit p: Parameters) extends LazyRoCC {
override lazy val module = new CharacterCountExampleModule(this)
override val atlNode = TLClientNode(Seq(TLClientPortParameters(Seq(TLClientParameters("CharacterCountRoCC")))))
}
class CharacterCountExampleModule(outer: CharacterCountExample)(implicit p: Parameters) extends LazyRoCCModule(outer)
with HasCoreParameters
with HasL1CacheParameters {
val cacheParams = tileParams.icache.get
private val blockOffset = blockOffBits
private val beatOffset = log2Up(cacheDataBits/8)
val needle = Reg(UInt(width = 8))
val addr = Reg(UInt(width = coreMaxAddrBits))
val count = Reg(UInt(width = xLen))
val resp_rd = Reg(io.resp.bits.rd)
val addr_block = addr(coreMaxAddrBits - 1, blockOffset)
val offset = addr(blockOffset - 1, 0)
val next_addr = (addr_block + UInt(1)) << UInt(blockOffset)
val s_idle :: s_acq :: s_gnt :: s_check :: s_resp :: Nil = Enum(Bits(), 5)
val state = Reg(init = s_idle)
val (tl_out, edgesOut) = outer.atlNode.out(0)
val gnt = tl_out.d.bits
val recv_data = Reg(UInt(width = cacheDataBits))
val recv_beat = Reg(UInt(width = log2Up(cacheDataBeats+1)), init = UInt(0))
val data_bytes = Vec.tabulate(cacheDataBits/8) { i => recv_data(8 * (i + 1) - 1, 8 * i) }
val zero_match = data_bytes.map(_ === UInt(0))
val needle_match = data_bytes.map(_ === needle)
val first_zero = PriorityEncoder(zero_match)
val chars_found = PopCount(needle_match.zipWithIndex.map {
case (matches, i) =>
val idx = Cat(recv_beat - UInt(1), UInt(i, beatOffset))
matches && idx >= offset && UInt(i) <= first_zero
})
val zero_found = zero_match.reduce(_ || _)
val finished = Reg(Bool())
io.cmd.ready := (state === s_idle)
io.resp.valid := (state === s_resp)
io.resp.bits.rd := resp_rd
io.resp.bits.data := count
tl_out.a.valid := (state === s_acq)
tl_out.a.bits := edgesOut.Get(
fromSource = UInt(0),
toAddress = addr_block << blockOffset,
lgSize = UInt(lgCacheBlockBytes))._2
tl_out.d.ready := (state === s_gnt)
when (io.cmd.fire()) {
addr := io.cmd.bits.rs1
needle := io.cmd.bits.rs2
resp_rd := io.cmd.bits.inst.rd
count := UInt(0)
finished := Bool(false)
state := s_acq
}
when (tl_out.a.fire()) { state := s_gnt }
when (tl_out.d.fire()) {
recv_beat := recv_beat + UInt(1)
recv_data := gnt.data
state := s_check
}
when (state === s_check) {
when (!finished) {
count := count + chars_found
}
when (zero_found) { finished := Bool(true) }
when (recv_beat === UInt(cacheDataBeats)) {
addr := next_addr
state := Mux(zero_found || finished, s_resp, s_acq)
} .otherwise {
state := s_gnt
}
}
when (io.resp.fire()) { state := s_idle }
io.busy := (state =/= s_idle)
io.interrupt := Bool(false)
io.mem.req.valid := Bool(false)
// Tie off unused channels
tl_out.b.ready := Bool(true)
tl_out.c.valid := Bool(false)
tl_out.e.valid := Bool(false)
}
class OpcodeSet(val opcodes: Seq[UInt]) {
def |(set: OpcodeSet) =
new OpcodeSet(this.opcodes ++ set.opcodes)
def matches(oc: UInt) = opcodes.map(_ === oc).reduce(_ || _)
}
object OpcodeSet {
def custom0 = new OpcodeSet(Seq(Bits("b0001011")))
def custom1 = new OpcodeSet(Seq(Bits("b0101011")))
def custom2 = new OpcodeSet(Seq(Bits("b1011011")))
def custom3 = new OpcodeSet(Seq(Bits("b1111011")))
def all = custom0 | custom1 | custom2 | custom3
}
class RoccCommandRouter(opcodes: Seq[OpcodeSet])(implicit p: Parameters)
extends CoreModule()(p) {
val io = new Bundle {
val in = Decoupled(new RoCCCommand).flip
val out = Vec(opcodes.size, Decoupled(new RoCCCommand))
val busy = Bool(OUTPUT)
}
val cmd = Queue(io.in)
val cmdReadys = io.out.zip(opcodes).map { case (out, opcode) =>
val me = opcode.matches(cmd.bits.inst.opcode)
out.valid := cmd.valid && me
out.bits := cmd.bits
out.ready && me
}
cmd.ready := cmdReadys.reduce(_ || _)
io.busy := cmd.valid
assert(PopCount(cmdReadys) <= UInt(1),
"Custom opcode matched for more than one accelerator")
}