From a369d8f17fc95ed8076730f52449927cef1d453c Mon Sep 17 00:00:00 2001 From: Colin Schmidt Date: Thu, 2 Apr 2015 01:30:11 -0700 Subject: [PATCH 01/10] Add fpu port to the rocc interface --- rocket/src/main/scala/core.scala | 6 +++ rocket/src/main/scala/fpu.scala | 67 ++++++++++++++++++++++---------- rocket/src/main/scala/rocc.scala | 2 + 3 files changed, 54 insertions(+), 21 deletions(-) diff --git a/rocket/src/main/scala/core.scala b/rocket/src/main/scala/core.scala index 6c1b7373..0586048c 100644 --- a/rocket/src/main/scala/core.scala +++ b/rocket/src/main/scala/core.scala @@ -74,6 +74,12 @@ class Core extends Module with CoreParameters .foreach { fpu => dpath.io.fpu <> fpu.io.dpath ctrl.io.fpu <> fpu.io.ctrl + if(!params(BuildRoCC).isEmpty) { + io.rocc.fpu_req <> fpu.io.cp_req + io.rocc.fpu_resp <> fpu.io.cp_resp + } else { + fpu.io.cp_req.valid := Bool(false) + } } ctrl.io.dpath <> dpath.io.ctrl diff --git a/rocket/src/main/scala/fpu.scala b/rocket/src/main/scala/fpu.scala index 9f069b45..36216edb 100644 --- a/rocket/src/main/scala/fpu.scala +++ b/rocket/src/main/scala/fpu.scala @@ -348,21 +348,32 @@ class FPU extends Module val io = new Bundle { val ctrl = (new CtrlFPUIO).flip val dpath = (new DpathFPUIO).flip + val cp_req = Decoupled(new FPInput()).flip //cp doesn't pay attn to kill sigs + val cp_resp = Decoupled(new FPResult()) } val ex_reg_valid = Reg(next=io.ctrl.valid, init=Bool(false)) + val req_valid = ex_reg_valid || io.cp_req.valid val ex_reg_inst = RegEnable(io.dpath.inst, io.ctrl.valid) - val mem_reg_valid = Reg(next=ex_reg_valid && !io.ctrl.killx, init=Bool(false)) + val ex_cp_valid = io.cp_req.valid && !ex_reg_valid + val mem_reg_valid = Reg(next=ex_reg_valid && !io.ctrl.killx || ex_cp_valid, init=Bool(false)) val mem_reg_inst = RegEnable(ex_reg_inst, ex_reg_valid) - val killm = io.ctrl.killm || io.ctrl.nack_mem - val wb_reg_valid = Reg(next=mem_reg_valid && !killm, init=Bool(false)) + val mem_cp_valid = Reg(next=ex_cp_valid, init=Bool(false)) + val killm = (io.ctrl.killm || io.ctrl.nack_mem) && !mem_cp_valid + val wb_reg_valid = Reg(next=mem_reg_valid && (!killm || mem_cp_valid), init=Bool(false)) + val wb_cp_valid = Reg(next=mem_cp_valid, init=Bool(false)) val fp_decoder = Module(new FPUDecoder) fp_decoder.io.inst := io.dpath.inst + val cp_ctrl = new FPUCtrlSigs + cp_ctrl <> io.cp_req.bits + io.cp_resp.valid := Bool(false) + io.cp_resp.bits.data := UInt(0) + val id_ctrl = fp_decoder.io.sigs - val ex_ctrl = RegEnable(id_ctrl, io.ctrl.valid) - val mem_ctrl = RegEnable(ex_ctrl, ex_reg_valid) + val ex_ctrl = Mux(ex_reg_valid, RegEnable(id_ctrl, io.ctrl.valid), cp_ctrl) + val mem_ctrl = RegEnable(ex_ctrl, req_valid) val wb_ctrl = RegEnable(mem_ctrl, mem_reg_valid) // load response @@ -391,35 +402,43 @@ class FPU extends Module val ex_rs1::ex_rs2::ex_rs3::Nil = Seq(ex_ra1, ex_ra2, ex_ra3).map(regfile(_)) val ex_rm = Mux(ex_reg_inst(14,12) === Bits(7), io.dpath.fcsr_rm, ex_reg_inst(14,12)) + val cp_rs1 = io.cp_req.bits.in1 + val cp_rs2 = Mux(io.cp_req.bits.swap23, io.cp_req.bits.in3, io.cp_req.bits.in2) + val cp_rs3 = Mux(io.cp_req.bits.swap23, io.cp_req.bits.in2, io.cp_req.bits.in3) + val req = new FPInput req := ex_ctrl - req.rm := ex_rm - req.in1 := ex_rs1 - req.in2 := ex_rs2 - req.in3 := ex_rs3 - req.typ := ex_reg_inst(21,20) + req.rm := Mux(ex_reg_valid, ex_rm, io.cp_req.bits.rm) + req.in1 := Mux(ex_reg_valid, ex_rs1, cp_rs1) + req.in2 := Mux(ex_reg_valid, ex_rs2, cp_rs2) + req.in3 := Mux(ex_reg_valid, ex_rs3, cp_rs3) + req.typ := Mux(ex_reg_valid, ex_reg_inst(21,20), io.cp_req.bits.typ) val sfma = Module(new FPUFMAPipe(params(SFMALatency), 23, 9)) - sfma.io.in.valid := ex_reg_valid && ex_ctrl.fma && ex_ctrl.single + sfma.io.in.valid := req_valid && ex_ctrl.fma && ex_ctrl.single sfma.io.in.bits := req val dfma = Module(new FPUFMAPipe(params(DFMALatency), 52, 12)) - dfma.io.in.valid := ex_reg_valid && ex_ctrl.fma && !ex_ctrl.single + dfma.io.in.valid := req_valid && ex_ctrl.fma && !ex_ctrl.single dfma.io.in.bits := req val fpiu = Module(new FPToInt) - fpiu.io.in.valid := ex_reg_valid && (ex_ctrl.toint || ex_ctrl.cmd === FCMD_MINMAX) + fpiu.io.in.valid := req_valid && (ex_ctrl.toint || ex_ctrl.cmd === FCMD_MINMAX) fpiu.io.in.bits := req io.dpath.store_data := fpiu.io.out.bits.store io.dpath.toint_data := fpiu.io.out.bits.toint + when(fpiu.io.out.valid){//COLIN FIXME: are there conflicts since we now share a port? + io.cp_resp.bits.data := fpiu.io.out.bits.toint + io.cp_resp.valid := Bool(true) + } val ifpu = Module(new IntToFP(3)) - ifpu.io.in.valid := ex_reg_valid && ex_ctrl.fromint + ifpu.io.in.valid := req_valid && ex_ctrl.fromint ifpu.io.in.bits := req - ifpu.io.in.bits.in1 := io.dpath.fromint_data + ifpu.io.in.bits.in1 := Mux(ex_reg_valid, io.dpath.fromint_data, cp_rs1) val fpmu = Module(new FPToFP(2)) - fpmu.io.in.valid := ex_reg_valid && ex_ctrl.fastpipe + fpmu.io.in.valid := req_valid && ex_ctrl.fastpipe fpmu.io.in.bits := req fpmu.io.lt := fpiu.io.out.bits.lt @@ -441,8 +460,8 @@ class FPU extends Module val wen = Reg(init=Bits(0, maxLatency-1)) val winfo = Vec.fill(maxLatency-1){Reg(Bits())} val mem_wen = mem_reg_valid && (mem_ctrl.fma || mem_ctrl.fastpipe || mem_ctrl.fromint) - val write_port_busy = RegEnable(mem_wen && (memLatencyMask & latencyMask(ex_ctrl, 1)).orR || (wen & latencyMask(ex_ctrl, 0)).orR, ex_reg_valid) - val mem_winfo = Cat(pipeid(mem_ctrl), mem_reg_inst(11,7)) + val write_port_busy = RegEnable(mem_wen && (memLatencyMask & latencyMask(ex_ctrl, 1)).orR || (wen & latencyMask(ex_ctrl, 0)).orR, req_valid) + val mem_winfo = Cat(mem_cp_valid, pipeid(mem_ctrl), mem_reg_inst(11,7)) for (i <- 0 until maxLatency-2) { when (wen(i+1)) { winfo(i) := winfo(i+1) } @@ -461,9 +480,15 @@ class FPU extends Module val waddr = winfo(0)(4,0).toUInt val wsrc = winfo(0) >> waddr.getWidth + val wcp = winfo(0)(waddr.getWidth+log2Up(pipes.size)) val wdata = Vec(pipes.map(_.wdata))(wsrc) val wexc = Vec(pipes.map(_.wexc))(wsrc) - when (wen(0)) { regfile(waddr(4,0)) := wdata } + when (wen(0) && !wcp) { regfile(waddr(4,0)) := wdata } + when (wen(0) && wcp) { + io.cp_resp.bits.data := wdata + io.cp_resp.valid := Bool(true) + } + io.cp_req.ready := !ex_reg_valid val wb_toint_valid = wb_reg_valid && wb_ctrl.toint val wb_toint_exc = RegEnable(fpiu.io.out.bits.exc, mem_ctrl.toint) @@ -478,8 +503,8 @@ class FPU extends Module io.ctrl.nack_mem := units_busy || write_port_busy io.ctrl.dec <> fp_decoder.io.sigs def useScoreboard(f: ((Pipe, Int)) => Bool) = pipes.zipWithIndex.filter(_._1.lat > 3).map(x => f(x)).fold(Bool(false))(_||_) - io.ctrl.sboard_set := wb_reg_valid && Reg(next=useScoreboard(_._1.cond(mem_ctrl))) - io.ctrl.sboard_clr := wen(0) && useScoreboard(x => wsrc === UInt(x._2)) + io.ctrl.sboard_set := wb_reg_valid && !wb_cp_valid && Reg(next=useScoreboard(_._1.cond(mem_ctrl))) + io.ctrl.sboard_clr := wen(0) && !wb_cp_valid && useScoreboard(x => wsrc === UInt(x._2)) io.ctrl.sboard_clra := waddr // we don't currently support round-max-magnitude (rm=4) io.ctrl.illegal_rm := ex_rm(2) && ex_ctrl.round diff --git a/rocket/src/main/scala/rocc.scala b/rocket/src/main/scala/rocc.scala index 0f044dae..bf49aca5 100644 --- a/rocket/src/main/scala/rocc.scala +++ b/rocket/src/main/scala/rocc.scala @@ -49,6 +49,8 @@ class RoCCInterface extends Bundle val iptw = new TLBPTWIO val dptw = new TLBPTWIO val pptw = new TLBPTWIO + val fpu_req = Decoupled(new FPInput) + val fpu_resp = Decoupled(new FPResult).flip val exception = Bool(INPUT) } From bd72db92c13db7512ef29b5118d42ef02bf395e5 Mon Sep 17 00:00:00 2001 From: Colin Schmidt Date: Tue, 7 Apr 2015 15:02:02 -0700 Subject: [PATCH 02/10] update rocc port to use fdiv/sqrt --- rocket/src/main/scala/fpu.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/rocket/src/main/scala/fpu.scala b/rocket/src/main/scala/fpu.scala index 29802b83..3b5a6c1f 100644 --- a/rocket/src/main/scala/fpu.scala +++ b/rocket/src/main/scala/fpu.scala @@ -440,7 +440,7 @@ class FPU extends Module fpiu.io.in.bits := req io.dpath.store_data := fpiu.io.out.bits.store io.dpath.toint_data := fpiu.io.out.bits.toint - when(fpiu.io.out.valid){//COLIN FIXME: are there conflicts since we now share a port? + when(fpiu.io.out.valid && mem_cp_valid && !(mem_ctrl.div || mem_ctrl.sqrt)){ io.cp_resp.bits.data := fpiu.io.out.bits.toint io.cp_resp.valid := Bool(true) } @@ -463,6 +463,7 @@ class FPU extends Module val divSqrt_wdata = Bits() val divSqrt_flags = Bits() val divSqrt_in_flight = Reg(init=Bool(false)) + val divSqrt_cp = Reg(init=Bool(false)) // writeback arbitration case class Pipe(p: Module, lat: Int, cond: (FPUCtrlSigs) => Bool, wdata: Bits, wexc: Bits) @@ -505,8 +506,8 @@ class FPU extends Module val wcp = winfo(0)(5+log2Up(pipes.size)) val wdata = Mux(divSqrt_wen, divSqrt_wdata, Vec(pipes.map(_.wdata))(wsrc)) val wexc = Vec(pipes.map(_.wexc))(wsrc) - when (!wcp && (wen(0) || divSqrt_wen)) { regfile(waddr) := wdata } - when (wcp && (wen(0) || divSqrt_wen)) { + when ((!wcp && wen(0)) || (!divSqrt_cp && divSqrt_wen)) { regfile(waddr) := wdata } + when ((wcp && wen(0)) || (divSqrt_cp && divSqrt_wen)) { io.cp_resp.bits.data := wdata io.cp_resp.valid := Bool(true) } @@ -541,7 +542,7 @@ class FPU extends Module def upconvert(x: UInt) = hardfloat.recodedFloatNToRecodedFloatM(x, Bits(0), 23, 9, 52, 12)._1 val divSqrt_wb_hazard = wen.orR - divSqrt.io.inValid := mem_reg_valid && !divSqrt_wb_hazard && !divSqrt_in_flight && !io.ctrl.killm && (mem_ctrl.div || mem_ctrl.sqrt) + divSqrt.io.inValid := mem_reg_valid && !divSqrt_wb_hazard && !divSqrt_in_flight && (!io.ctrl.killm || mem_cp_valid) && (mem_ctrl.div || mem_ctrl.sqrt) divSqrt.io.sqrtOp := mem_ctrl.sqrt divSqrt.io.a := fpiu.io.as_double.in1 divSqrt.io.b := fpiu.io.as_double.in2 @@ -552,6 +553,7 @@ class FPU extends Module divSqrt_single := mem_ctrl.single divSqrt_waddr := mem_reg_inst(11,7) divSqrt_rm := divSqrt.io.roundingMode + divSqrt_cp := mem_cp_valid } when (divSqrt_outValid) { From 1f410ac42cde16c30ec76ec130c5b2ab7fb0006d Mon Sep 17 00:00:00 2001 From: Colin Schmidt Date: Wed, 22 Apr 2015 11:26:03 -0700 Subject: [PATCH 03/10] move fetch buffer into frontend to allow retiming --- rocket/src/main/scala/icache.scala | 85 ++++++++++++++---------------- 1 file changed, 40 insertions(+), 45 deletions(-) diff --git a/rocket/src/main/scala/icache.scala b/rocket/src/main/scala/icache.scala index 1103f22d..69f5d877 100644 --- a/rocket/src/main/scala/icache.scala +++ b/rocket/src/main/scala/icache.scala @@ -59,13 +59,14 @@ class Frontend(btb_updates_out_of_order: Boolean = false) extends FrontendModule val s2_btb_resp_valid = Reg(init=Bool(false)) val s2_btb_resp_bits = Reg(btb.io.resp.bits.clone) val s2_xcpt_if = Reg(init=Bool(false)) + val icbuf = Module(new Queue(new ICacheResp, 1, pipe=true)) val msb = vaddrBits-1 val lsb = log2Up(coreFetchWidth*coreInstBytes) val btbTarget = Cat(btb.io.resp.bits.target(msb), btb.io.resp.bits.target) val ntpc_0 = s1_pc + UInt(coreInstBytes*coreFetchWidth) val ntpc = Cat(s1_pc(msb) & ntpc_0(msb), ntpc_0(msb,lsb), Bits(0,lsb)) // unsure - val icmiss = s2_valid && !icache.io.resp.valid + val icmiss = s2_valid && !icbuf.io.deq.valid val predicted_npc = Mux(btb.io.resp.bits.taken, btbTarget, ntpc) val npc = Mux(icmiss, s2_pc, predicted_npc).toUInt val s0_same_block = !icmiss && !io.cpu.req.valid && !btb.io.resp.bits.taken && ((ntpc & rowBytes) === (s1_pc & rowBytes)) @@ -109,15 +110,17 @@ class Frontend(btb_updates_out_of_order: Boolean = false) extends FrontendModule icache.io.invalidate := io.cpu.invalidate icache.io.req.bits.ppn := tlb.io.resp.ppn icache.io.req.bits.kill := io.cpu.req.valid || tlb.io.resp.miss || icmiss || io.ptw.invalidate - icache.io.resp.ready := !stall && !s1_same_block - io.cpu.resp.valid := s2_valid && (s2_xcpt_if || icache.io.resp.valid) + io.cpu.resp.valid := s2_valid && (s2_xcpt_if || icbuf.io.deq.valid) io.cpu.resp.bits.pc := s2_pc + icbuf.io.enq <> icache.io.resp + icbuf.io.deq.ready := !stall && !s1_same_block + require(coreFetchWidth * coreInstBytes <= rowBytes) val fetch_data = - if (coreFetchWidth * coreInstBytes == rowBytes) icache.io.resp.bits.datablock - else icache.io.resp.bits.datablock >> (s2_pc(log2Up(rowBytes)-1,log2Up(coreFetchWidth*coreInstBytes)) << log2Up(coreFetchWidth*coreInstBits)) + if (coreFetchWidth * coreInstBytes == rowBytes) icbuf.io.deq.bits.datablock + else icbuf.io.deq.bits.datablock >> (s2_pc(log2Up(rowBytes)-1,log2Up(coreFetchWidth*coreInstBytes)) << log2Up(coreFetchWidth*coreInstBits)) for (i <- 0 until coreFetchWidth) { io.cpu.resp.bits.data(i) := fetch_data(i*coreInstBits+coreInstBits-1, i*coreInstBits) @@ -139,7 +142,6 @@ class ICacheReq extends FrontendBundle { } class ICacheResp extends FrontendBundle { - val data = Bits(width = coreInstBits) val datablock = Bits(width = rowBits) } @@ -161,9 +163,8 @@ class ICache extends FrontendModule val stall = !io.resp.ready val rdy = Bool() - val s2_valid = Reg(init=Bool(false)) - val s2_addr = Reg(UInt(width = paddrBits)) - val s2_any_tag_hit = Bool() + val refill_addr = Reg(UInt(width = paddrBits)) + val s1_any_tag_hit = Bool() val s1_valid = Reg(init=Bool(false)) val s1_pgoff = Reg(UInt(width = pgIdxBits)) @@ -178,17 +179,17 @@ class ICache extends FrontendModule s1_pgoff := io.req.bits.idx } - s2_valid := s1_valid && rdy && !io.req.bits.kill || io.resp.valid && stall - when (s1_valid && rdy && !stall) { - s2_addr := s1_addr - } + val out_valid = s1_valid && !io.req.bits.kill && state === s_ready + val s1_idx = s1_addr(untagBits-1,blockOffBits) + val s1_offset = s1_addr(blockOffBits-1,0) + val s1_hit = out_valid && s1_any_tag_hit + val s1_miss = out_valid && !s1_any_tag_hit + rdy := state === s_ready && !s1_miss - val s2_tag = s2_addr(tagBits+untagBits-1,untagBits) - val s2_idx = s2_addr(untagBits-1,blockOffBits) - val s2_offset = s2_addr(blockOffBits-1,0) - val s2_hit = s2_valid && s2_any_tag_hit - val s2_miss = s2_valid && !s2_any_tag_hit - rdy := state === s_ready && !s2_miss + when (s1_valid && state === s_ready && s1_miss) { + refill_addr := s1_addr + } + val refill_tag = refill_addr(tagBits+untagBits-1,untagBits) val ser = Module(new FlowThroughSerializer( io.mem.grant.bits, @@ -200,14 +201,14 @@ class ICache extends FrontendModule val refill_bits = ser.io.out.bits ser.io.out.ready := Bool(true) - val repl_way = if (isDM) UInt(0) else LFSR16(s2_miss)(log2Up(nWays)-1,0) + val repl_way = if (isDM) UInt(0) else LFSR16(s1_miss)(log2Up(nWays)-1,0) val entagbits = code.width(tagBits) val tag_array = Mem(Bits(width = entagbits*nWays), nSets, seqRead = true) val tag_raddr = Reg(UInt()) when (refill_done) { val wmask = FillInterleaved(entagbits, if (isDM) Bits(1) else UIntToOH(repl_way)) - val tag = code.encode(s2_tag).toUInt - tag_array.write(s2_idx, Fill(nWays, tag), wmask) + val tag = code.encode(refill_tag).toUInt + tag_array.write(s1_idx, Fill(nWays, tag), wmask) } // /*.else*/when (s0_valid) { // uncomment ".else" to infer 6T SRAM .elsewhen (s0_valid) { @@ -216,55 +217,49 @@ class ICache extends FrontendModule val vb_array = Reg(init=Bits(0, nSets*nWays)) when (refill_done && !invalidated) { - vb_array := vb_array.bitSet(Cat(repl_way, s2_idx), Bool(true)) + vb_array := vb_array.bitSet(Cat(repl_way, s1_idx), Bool(true)) } when (io.invalidate) { vb_array := Bits(0) invalidated := Bool(true) } - val s2_disparity = Vec.fill(nWays){Bool()} + val s1_disparity = Vec.fill(nWays){Bool()} for (i <- 0 until nWays) - when (s2_valid && s2_disparity(i)) { vb_array := vb_array.bitSet(Cat(UInt(i), s2_idx), Bool(false)) } + when (s1_valid && s1_disparity(i)) { vb_array := vb_array.bitSet(Cat(UInt(i), s1_idx), Bool(false)) } val s1_tag_match = Vec.fill(nWays){Bool()} - val s2_tag_hit = Vec.fill(nWays){Bool()} - val s2_dout = Vec.fill(nWays){Reg(Bits())} + val s1_tag_hit = Vec.fill(nWays){Bool()} + val s1_dout = Vec.fill(nWays){(Bits())} for (i <- 0 until nWays) { val s1_vb = !io.invalidate && vb_array(Cat(UInt(i), s1_pgoff(untagBits-1,blockOffBits))).toBool - val s2_vb = Reg(Bool()) - val s2_tag_disparity = Reg(Bool()) - val s2_tag_match = Reg(Bool()) val tag_out = tag_array(tag_raddr)(entagbits*(i+1)-1, entagbits*i) + val s1_tag_disparity = code.decode(tag_out).error when (s1_valid && rdy && !stall) { - s2_vb := s1_vb - s2_tag_disparity := code.decode(tag_out).error - s2_tag_match := s1_tag_match(i) } s1_tag_match(i) := tag_out(tagBits-1,0) === s1_tag - s2_tag_hit(i) := s2_vb && s2_tag_match - s2_disparity(i) := s2_vb && (s2_tag_disparity || code.decode(s2_dout(i)).error) + s1_tag_hit(i) := s1_vb && s1_tag_match(i) + s1_disparity(i) := s1_vb && (s1_tag_disparity || code.decode(s1_dout(i)).error) } - s2_any_tag_hit := s2_tag_hit.reduceLeft(_||_) && !s2_disparity.reduceLeft(_||_) + s1_any_tag_hit := s1_tag_hit.reduceLeft(_||_) && !s1_disparity.reduceLeft(_||_) for (i <- 0 until nWays) { val data_array = Mem(Bits(width = code.width(rowBits)), nSets*refillCycles, seqRead = true) val s1_raddr = Reg(UInt()) when (refill_valid && repl_way === UInt(i)) { val e_d = code.encode(refill_bits.payload.data) - if(refillCycles > 1) data_array(Cat(s2_idx, refill_bits.payload.addr_beat)) := e_d - else data_array(s2_idx) := e_d + if(refillCycles > 1) data_array(Cat(s1_idx, refill_bits.payload.addr_beat)) := e_d + else data_array(s1_idx) := e_d } // /*.else*/when (s0_valid) { // uncomment ".else" to infer 6T SRAM .elsewhen (s0_valid) { s1_raddr := s0_pgoff(untagBits-1,blockOffBits-(if(refillCycles > 1) refill_cnt.getWidth else 0)) } // if s1_tag_match is critical, replace with partial tag check - when (s1_valid && rdy && !stall && (Bool(isDM) || s1_tag_match(i))) { s2_dout(i) := data_array(s1_raddr) } + s1_dout(i) := 0 + when (s1_valid && rdy && !stall && (Bool(isDM) || s1_tag_match(i))) { s1_dout(i) := data_array(s1_raddr) } } - val s2_dout_word = s2_dout.map(x => (x >> (s2_offset(log2Up(rowBytes)-1,log2Up(coreInstBytes)) << log2Up(coreInstBits)))(coreInstBits-1,0)) - io.resp.bits.data := Mux1H(s2_tag_hit, s2_dout_word) - io.resp.bits.datablock := Mux1H(s2_tag_hit, s2_dout) + io.resp.bits.datablock := Mux1H(s1_tag_hit, s1_dout) val ack_q = Module(new Queue(new LogicalNetworkIO(new Finish), 1)) ack_q.io.enq.valid := refill_done && refill_bits.payload.requiresAck() @@ -272,15 +267,15 @@ class ICache extends FrontendModule ack_q.io.enq.bits.header.dst := refill_bits.header.src // output signals - io.resp.valid := s2_hit + io.resp.valid := s1_hit io.mem.acquire.valid := (state === s_request) && ack_q.io.enq.ready - io.mem.acquire.bits := GetBlock(addr_block = s2_addr >> UInt(blockOffBits)) + io.mem.acquire.bits := GetBlock(addr_block = refill_addr >> UInt(blockOffBits)) io.mem.finish <> ack_q.io.deq // control state machine switch (state) { is (s_ready) { - when (s2_miss) { state := s_request } + when (s1_miss) { state := s_request } invalidated := Bool(false) } is (s_request) { From c746ef8702fb424fe90b2df1e7d1740603cb09f6 Mon Sep 17 00:00:00 2001 From: Colin Schmidt Date: Mon, 4 May 2015 11:20:55 -0700 Subject: [PATCH 04/10] fix bug in rocc port resp for FPtoInt instructions --- rocket/src/main/scala/fpu.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rocket/src/main/scala/fpu.scala b/rocket/src/main/scala/fpu.scala index 3b5a6c1f..d26c9eda 100644 --- a/rocket/src/main/scala/fpu.scala +++ b/rocket/src/main/scala/fpu.scala @@ -440,7 +440,7 @@ class FPU extends Module fpiu.io.in.bits := req io.dpath.store_data := fpiu.io.out.bits.store io.dpath.toint_data := fpiu.io.out.bits.toint - when(fpiu.io.out.valid && mem_cp_valid && !(mem_ctrl.div || mem_ctrl.sqrt)){ + when(fpiu.io.out.valid && mem_cp_valid && mem_ctrl.toint){ io.cp_resp.bits.data := fpiu.io.out.bits.toint io.cp_resp.valid := Bool(true) } From 3d6a060dc37ad0059e808278b668e5d0a0761d1d Mon Sep 17 00:00:00 2001 From: Albert Ou Date: Mon, 10 Aug 2015 23:52:58 -0700 Subject: [PATCH 05/10] Bump Scala to 2.11.6 This change, originally part of commit b978083, was excluded from the merge at commit 47494ec. --- rocket/build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rocket/build.sbt b/rocket/build.sbt index 4a8e9378..bfc36cc5 100644 --- a/rocket/build.sbt +++ b/rocket/build.sbt @@ -4,7 +4,7 @@ version := "1.2" name := "rocket" -scalaVersion := "2.10.2" +scalaVersion := "2.11.6" libraryDependencies ++= (Seq("chisel", "hardfloat", "uncore", "junctions").map { dep: String => sys.props.get(dep + "Version") map { "edu.berkeley.cs" %% dep % _ }}).flatten From d292b6cb1300fe0c705402fb60de503816f391fb Mon Sep 17 00:00:00 2001 From: Colin Schmidt Date: Tue, 8 Sep 2015 14:42:34 -0700 Subject: [PATCH 06/10] don't connect rocc-fpu-port without rocc accel --- rocket/src/main/scala/rocket.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rocket/src/main/scala/rocket.scala b/rocket/src/main/scala/rocket.scala index cd9b8a26..42daa58e 100644 --- a/rocket/src/main/scala/rocket.scala +++ b/rocket/src/main/scala/rocket.scala @@ -492,7 +492,7 @@ class Rocket extends CoreModule io.rocc.cmd.bits.rs1 := wb_reg_wdata io.rocc.cmd.bits.rs2 := wb_reg_rs2 - if (!params(BuildFPU).isEmpty) { + if (!params(BuildFPU).isEmpty && !params(BuildRoCC).isEmpty) { io.fpu.cp_req <> io.rocc.fpu_req io.fpu.cp_resp <> io.rocc.fpu_resp } else { From 0b15b19381f026a05ae791c9d6a0c006e147407c Mon Sep 17 00:00:00 2001 From: Howard Mao Date: Tue, 1 Dec 2015 10:22:31 -0800 Subject: [PATCH 07/10] add arbiter for FPU --- rocket/src/main/scala/arbiter.scala | 49 +++++++++++++++++++++++++++++ rocket/src/main/scala/rocket.scala | 7 ----- rocket/src/main/scala/tile.scala | 14 +++++++-- 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/rocket/src/main/scala/arbiter.scala b/rocket/src/main/scala/arbiter.scala index 27bfcc86..16c04858 100644 --- a/rocket/src/main/scala/arbiter.scala +++ b/rocket/src/main/scala/arbiter.scala @@ -5,6 +5,7 @@ package rocket import Chisel._ import uncore._ import cde.{Parameters, Field} +import junctions.ParameterizedBundle class HellaCacheArbiter(n: Int)(implicit p: Parameters) extends Module { @@ -53,3 +54,51 @@ class HellaCacheArbiter(n: Int)(implicit p: Parameters) extends Module io.requestor(i).replay_next.bits := io.mem.replay_next.bits >> log2Up(n) } } + +class InOrderArbiter[T <: Data, U <: Data](reqTyp: T, respTyp: U, n: Int) + (implicit p: Parameters) extends Module { + val io = new Bundle { + val in_req = Vec(n, Decoupled(reqTyp)).flip + val in_resp = Vec(n, Decoupled(respTyp)) + val out_req = Decoupled(reqTyp) + val out_resp = Decoupled(respTyp).flip + } + + if (n > 1) { + val route_q = Module(new Queue(UInt(width = log2Up(n)), 2)) + val req_arb = Module(new RRArbiter(reqTyp, n)) + req_arb.io.in <> io.in_req + + val req_helper = DecoupledHelper( + req_arb.io.out.valid, + route_q.io.enq.ready, + io.out_req.ready) + + io.out_req.bits := req_arb.io.out.bits + io.out_req.valid := req_helper.fire(io.out_req.ready) + + route_q.io.enq.bits := req_arb.io.chosen + route_q.io.enq.valid := req_helper.fire(route_q.io.enq.ready) + + req_arb.io.out.ready := req_helper.fire(req_arb.io.out.valid) + + val resp_sel = route_q.io.deq.bits + val resp_ready = io.in_resp(resp_sel).ready + val resp_helper = DecoupledHelper( + resp_ready, + route_q.io.deq.valid, + io.out_resp.valid) + + val resp_valid = resp_helper.fire(resp_ready) + for (i <- 0 until n) { + io.in_resp(i).bits := io.out_resp.bits + io.in_resp(i).valid := resp_valid && resp_sel === UInt(i) + } + + route_q.io.deq.ready := resp_helper.fire(route_q.io.deq.valid) + io.out_resp.ready := resp_helper.fire(io.out_resp.valid) + } else { + io.out_req <> io.in_req.head + io.in_resp.head <> io.out_resp + } +} diff --git a/rocket/src/main/scala/rocket.scala b/rocket/src/main/scala/rocket.scala index 251ac330..53824d51 100644 --- a/rocket/src/main/scala/rocket.scala +++ b/rocket/src/main/scala/rocket.scala @@ -537,13 +537,6 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) { io.rocc.cmd.bits.rs1 := wb_reg_wdata io.rocc.cmd.bits.rs2 := wb_reg_rs2 - if (usingFPU && usingRoCC) { - io.fpu.cp_req <> io.rocc.fpu_req - io.fpu.cp_resp <> io.rocc.fpu_resp - } else { - io.fpu.cp_req.valid := Bool(false) - } - if (enableCommitLog) { val pc = Wire(SInt(width=64)) pc := wb_reg_pc diff --git a/rocket/src/main/scala/tile.scala b/rocket/src/main/scala/tile.scala index 7c838475..0420c6a5 100644 --- a/rocket/src/main/scala/tile.scala +++ b/rocket/src/main/scala/tile.scala @@ -52,8 +52,8 @@ class RocketTile(resetSignal: Bool = null)(implicit p: Parameters) extends Tile( icache.io.cpu <> core.io.imem core.io.ptw <> ptw.io.dpath - //If so specified, build an FPU module and wire it in - if (p(UseFPU)) core.io.fpu <> Module(new FPU()(p)).io + val fpuOpt = if (p(UseFPU)) Some(Module(new FPU)) else None + fpuOpt.foreach(fpu => core.io.fpu <> fpu.io) // Connect the caches and ROCC to the outer memory system io.cached.head <> dcache.io.mem @@ -86,6 +86,16 @@ class RocketTile(resetSignal: Bool = null)(implicit p: Parameters) extends Tile( rocc } + fpuOpt.foreach { fpu => + val fpArb = Module(new InOrderArbiter(new FPInput, new FPResult, nRocc)) + fpArb.io.in_req <> roccs.map(_.io.fpu_req) + roccs.zip(fpArb.io.in_resp).foreach { + case (rocc, fpu_resp) => rocc.io.fpu_resp <> fpu_resp + } + fpu.io.cp_req <> fpArb.io.out_req + fpArb.io.out_resp <> fpu.io.cp_resp + } + core.io.rocc.busy := cmdRouter.io.busy || roccs.map(_.io.busy).reduce(_ || _) core.io.rocc.interrupt := roccs.map(_.io.interrupt).reduce(_ || _) respArb.io.in <> roccs.map(rocc => Queue(rocc.io.resp)) From 4833d41dbc62a4e779f49dfef067d482182caa37 Mon Sep 17 00:00:00 2001 From: Howard Mao Date: Tue, 1 Dec 2015 16:48:05 -0800 Subject: [PATCH 08/10] make the connection of FPU ports optional per accelerator --- rocket/src/main/scala/tile.scala | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/rocket/src/main/scala/tile.scala b/rocket/src/main/scala/tile.scala index 0420c6a5..8ded6be8 100644 --- a/rocket/src/main/scala/tile.scala +++ b/rocket/src/main/scala/tile.scala @@ -11,6 +11,7 @@ case object CoreName extends Field[String] case object BuildRoCC extends Field[Seq[Parameters => RoCC]] case object RoccOpcodes extends Field[Seq[OpcodeSet]] case object RoccAcceleratorMemChannels extends Field[Seq[Int]] +case object RoccUseFPU extends Field[Seq[Boolean]] abstract class Tile(resetSignal: Bool = null) (implicit p: Parameters) extends Module(_reset = resetSignal) { @@ -19,6 +20,8 @@ abstract class Tile(resetSignal: Bool = null) val roccMemChannels = p(RoccAcceleratorMemChannels) val usingRocc = !buildRocc.isEmpty val nRocc = buildRocc.size + val roccUseFPU = p(RoccUseFPU) + val nFPUPorts = roccUseFPU.filter(useFPU => useFPU).size val nDCachePorts = 2 + nRocc val nPTWPorts = 2 + 3 * nRocc val nCachedTileLinkPorts = 1 @@ -86,15 +89,20 @@ class RocketTile(resetSignal: Bool = null)(implicit p: Parameters) extends Tile( rocc } - fpuOpt.foreach { fpu => - val fpArb = Module(new InOrderArbiter(new FPInput, new FPResult, nRocc)) - fpArb.io.in_req <> roccs.map(_.io.fpu_req) - roccs.zip(fpArb.io.in_resp).foreach { - case (rocc, fpu_resp) => rocc.io.fpu_resp <> fpu_resp + if (nFPUPorts > 0) { + fpuOpt.foreach { fpu => + val fpArb = Module(new InOrderArbiter(new FPInput, new FPResult, nFPUPorts)) + val fp_roccs = roccs.zip(roccUseFPU) + .filter { case (_, useFPU) => useFPU } + .map { case (rocc, _) => rocc } + fpArb.io.in_req <> fp_roccs.map(_.io.fpu_req) + fp_roccs.zip(fpArb.io.in_resp).foreach { + case (rocc, fpu_resp) => rocc.io.fpu_resp <> fpu_resp + } + fpu.io.cp_req <> fpArb.io.out_req + fpArb.io.out_resp <> fpu.io.cp_resp } - fpu.io.cp_req <> fpArb.io.out_req - fpArb.io.out_resp <> fpu.io.cp_resp - } + } core.io.rocc.busy := cmdRouter.io.busy || roccs.map(_.io.busy).reduce(_ || _) core.io.rocc.interrupt := roccs.map(_.io.interrupt).reduce(_ || _) From dcca0b1d86eb58ce8883f86b81a29446d1ac284c Mon Sep 17 00:00:00 2001 From: Howard Mao Date: Tue, 1 Dec 2015 18:14:58 -0800 Subject: [PATCH 09/10] fix up FPU connection --- rocket/src/main/scala/tile.scala | 45 ++++++++++++++++---------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/rocket/src/main/scala/tile.scala b/rocket/src/main/scala/tile.scala index 8599f543..8878e722 100644 --- a/rocket/src/main/scala/tile.scala +++ b/rocket/src/main/scala/tile.scala @@ -13,15 +13,15 @@ case object BuildRoCC extends Field[Seq[RoccParameters]] case class RoccParameters( opcodes: OpcodeSet, generator: Parameters => RoCC, - nMemChannels: Int = 1) + nMemChannels: Int = 1, + useFPU: Boolean = false) abstract class Tile(resetSignal: Bool = null) (implicit p: Parameters) extends Module(_reset = resetSignal) { val buildRocc = p(BuildRoCC) val usingRocc = !buildRocc.isEmpty val nRocc = buildRocc.size - val roccUseFPU = p(RoccUseFPU) - val nFPUPorts = roccUseFPU.filter(useFPU => useFPU).size + val nFPUPorts = buildRocc.filter(_.useFPU).size val nDCachePorts = 2 + nRocc val nPTWPorts = 2 + 3 * nRocc val nCachedTileLinkPorts = 1 @@ -73,32 +73,31 @@ class RocketTile(resetSignal: Bool = null)(implicit p: Parameters) extends Tile( val cmdRouter = Module(new RoccCommandRouter(roccOpcodes)) cmdRouter.io.in <> core.io.rocc.cmd - val roccs = buildRocc.zipWithIndex.map { - case (RoccParameters(_, generator, nchannels), i) => - val accelParams = p.alterPartial({ case RoccNMemChannels => nchannels }) - val rocc = generator(accelParams) - val dcIF = Module(new SimpleHellaCacheIF()(dcacheParams)) - rocc.io.cmd <> cmdRouter.io.out(i) - rocc.io.s := core.io.rocc.s - rocc.io.exception := core.io.rocc.exception - dcIF.io.requestor <> rocc.io.mem - dcArb.io.requestor(2 + i) <> dcIF.io.cache - iMemArb.io.in(1 + i) <> rocc.io.imem - ptw.io.requestor(2 + 3 * i) <> rocc.io.iptw - ptw.io.requestor(3 + 3 * i) <> rocc.io.dptw - ptw.io.requestor(4 + 3 * i) <> rocc.io.pptw - rocc + val roccs = buildRocc.zipWithIndex.map { case (accelParams, i) => + val rocc = accelParams.generator( + p.alterPartial({ case RoccNMemChannels => accelParams.nMemChannels })) + val dcIF = Module(new SimpleHellaCacheIF()(dcacheParams)) + rocc.io.cmd <> cmdRouter.io.out(i) + rocc.io.s := core.io.rocc.s + rocc.io.exception := core.io.rocc.exception + dcIF.io.requestor <> rocc.io.mem + dcArb.io.requestor(2 + i) <> dcIF.io.cache + iMemArb.io.in(1 + i) <> rocc.io.imem + ptw.io.requestor(2 + 3 * i) <> rocc.io.iptw + ptw.io.requestor(3 + 3 * i) <> rocc.io.dptw + ptw.io.requestor(4 + 3 * i) <> rocc.io.pptw + rocc } if (nFPUPorts > 0) { fpuOpt.foreach { fpu => val fpArb = Module(new InOrderArbiter(new FPInput, new FPResult, nFPUPorts)) - val fp_roccs = roccs.zip(roccUseFPU) - .filter { case (_, useFPU) => useFPU } - .map { case (rocc, _) => rocc } - fpArb.io.in_req <> fp_roccs.map(_.io.fpu_req) + val fp_roccs = roccs.zip(buildRocc) + .filter { case (_, params) => params.useFPU } + .map { case (rocc, _) => rocc.io } + fpArb.io.in_req <> fp_roccs.map(_.fpu_req) fp_roccs.zip(fpArb.io.in_resp).foreach { - case (rocc, fpu_resp) => rocc.io.fpu_resp <> fpu_resp + case (rocc, fpu_resp) => rocc.fpu_resp <> fpu_resp } fpu.io.cp_req <> fpArb.io.out_req fpArb.io.out_resp <> fpu.io.cp_resp From 73b026366301c29b74be13b05b3a8bb16bfd2260 Mon Sep 17 00:00:00 2001 From: Howard Mao Date: Tue, 1 Dec 2015 20:41:58 -0800 Subject: [PATCH 10/10] disconnect fpu port if no fpu-using RoCC accelerators --- rocket/src/main/scala/tile.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/rocket/src/main/scala/tile.scala b/rocket/src/main/scala/tile.scala index 8878e722..09c4f006 100644 --- a/rocket/src/main/scala/tile.scala +++ b/rocket/src/main/scala/tile.scala @@ -110,4 +110,11 @@ class RocketTile(resetSignal: Bool = null)(implicit p: Parameters) extends Tile( roccs.flatMap(_.io.dmem) :+ iMemArb.io.out } else { Seq(icache.io.mem) }) + + if (!usingRocc || nFPUPorts == 0) { + fpuOpt.foreach { fpu => + fpu.io.cp_req.valid := Bool(false) + fpu.io.cp_resp.ready := Bool(false) + } + } }