1
0

Remove FP unboxing from FMA critical path

This commit is contained in:
Andrew Waterman 2017-05-31 14:57:56 -07:00 committed by Wesley W. Terpstra
parent 7504b47bbe
commit 8229bdee03

View File

@ -259,10 +259,11 @@ trait HasFPUParameters {
}
// implement NaN unboxing for FU inputs
def unbox(x: UInt, tag: UInt): UInt = {
def unbox(x: UInt, tag: UInt, exactType: Option[FType]): UInt = {
val outType = exactType.getOrElse(maxType)
def helper(x: UInt, t: FType): Seq[(Bool, UInt)] = {
val prev =
if (typeTag(t) == 0) {
if (t == minType) {
Seq()
} else {
val prevT = prevType(t)
@ -274,13 +275,16 @@ trait HasFPUParameters {
val isbox = isBox(x, t)
prev.map(p => (isbox && p._1, p._2))
}
prev :+ (true.B, t.unsafeConvert(x, maxType))
prev :+ (true.B, t.unsafeConvert(x, outType))
}
val res = helper(x, maxType)
val oks = res.map(_._1)
val floats = res.map(_._2)
Mux(oks(tag), floats(tag), maxType.qNaN)
val (oks, floats) = helper(x, maxType).unzip
if (exactType.isEmpty || floatTypes.size == 1) {
Mux(oks(tag), floats(tag), maxType.qNaN)
} else {
val t = exactType.get
floats(typeTag(t)) | Mux(oks(typeTag(t)), 0.U, t.qNaN)
}
}
// make sure that the redundant bits in the NaN-boxed encoding are consistent
@ -538,7 +542,7 @@ class FPToFP(val latency: Int)(implicit p: Parameters) extends FPUModule()(p) {
io.out <> Pipe(in.valid, mux, latency-1)
}
class FPUFMAPipe(val latency: Int, t: FType)(implicit p: Parameters) extends FPUModule()(p) {
class FPUFMAPipe(val latency: Int, val t: FType)(implicit p: Parameters) extends FPUModule()(p) {
val io = new Bundle {
val in = Valid(new FPInput).flip
val out = Valid(new FPResult)
@ -553,9 +557,8 @@ class FPUFMAPipe(val latency: Int, t: FType)(implicit p: Parameters) extends FPU
val cmd_fma = io.in.bits.ren3
val cmd_addsub = io.in.bits.swap23
in := io.in.bits
in.in1 := maxType.unsafeConvert(io.in.bits.in1, t)
in.in2 := Mux(cmd_addsub, one, maxType.unsafeConvert(io.in.bits.in2, t))
in.in3 := Mux(cmd_fma || cmd_addsub, maxType.unsafeConvert(io.in.bits.in3, t), zero)
when (cmd_addsub) { in.in2 := one }
when (!(cmd_fma || cmd_addsub)) { in.in3 := zero }
}
val fma = Module(new hardfloat.MulAddRecFN(t.exp, t.sig))
@ -615,45 +618,29 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) {
printf("f%d p%d 0x%x\n", load_wb_tag, load_wb_tag + 32, load_wb_data)
}
val ex_ra1::ex_ra2::ex_ra3::Nil = List.fill(3)(Reg(UInt()))
val ex_ra = List.fill(3)(Reg(UInt()))
val ex_rs = ex_ra.map(a => regfile(a))
when (io.valid) {
when (id_ctrl.ren1) {
when (!id_ctrl.swap12) { ex_ra1 := io.inst(19,15) }
when (id_ctrl.swap12) { ex_ra2 := io.inst(19,15) }
when (!id_ctrl.swap12) { ex_ra(0) := io.inst(19,15) }
when (id_ctrl.swap12) { ex_ra(1) := io.inst(19,15) }
}
when (id_ctrl.ren2) {
when (id_ctrl.swap12) { ex_ra1 := io.inst(24,20) }
when (id_ctrl.swap23) { ex_ra3 := io.inst(24,20) }
when (!id_ctrl.swap12 && !id_ctrl.swap23) { ex_ra2 := io.inst(24,20) }
when (id_ctrl.swap12) { ex_ra(0) := io.inst(24,20) }
when (id_ctrl.swap23) { ex_ra(2) := io.inst(24,20) }
when (!id_ctrl.swap12 && !id_ctrl.swap23) { ex_ra(1) := io.inst(24,20) }
}
when (id_ctrl.ren3) { ex_ra3 := io.inst(31,27) }
when (id_ctrl.ren3) { ex_ra(2) := io.inst(31,27) }
}
val ex_rm = Mux(ex_reg_inst(14,12) === Bits(7), io.fcsr_rm, ex_reg_inst(14,12))
val req = Wire(new FPInput)
def readAndUnbox(addr: UInt) = unbox(regfile(addr), !ex_ctrl.singleIn)
req := ex_ctrl
req.rm := ex_rm
req.in1 := readAndUnbox(ex_ra1)
req.in2 := readAndUnbox(ex_ra2)
req.in3 := readAndUnbox(ex_ra3)
req.typ := ex_reg_inst(21,20)
req.fmaCmd := ex_reg_inst(3,2) | (!ex_ctrl.ren3 && ex_reg_inst(27))
when (ex_cp_valid) {
req := io.cp_req.bits
when (io.cp_req.bits.swap23) {
req.in2 := io.cp_req.bits.in3
req.in3 := io.cp_req.bits.in2
}
}
val sfma = Module(new FPUFMAPipe(cfg.sfmaLatency, FType.S))
sfma.io.in.valid := req_valid && ex_ctrl.fma && ex_ctrl.singleOut
sfma.io.in.bits := req
sfma.io.in.bits := fuInput(Some(sfma.t))
val fpiu = Module(new FPToInt)
fpiu.io.in.valid := req_valid && (ex_ctrl.toint || ex_ctrl.div || ex_ctrl.sqrt || (ex_ctrl.fastpipe && ex_ctrl.wflags))
fpiu.io.in.bits := req
fpiu.io.in.bits := fuInput(None)
io.store_data := fpiu.io.out.bits.store
io.toint_data := fpiu.io.out.bits.toint
when(fpiu.io.out.valid && mem_cp_valid && mem_ctrl.toint){
@ -663,12 +650,12 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) {
val ifpu = Module(new IntToFP(2))
ifpu.io.in.valid := req_valid && ex_ctrl.fromint
ifpu.io.in.bits := req
ifpu.io.in.bits := fpiu.io.in.bits
ifpu.io.in.bits.in1 := Mux(ex_cp_valid, io.cp_req.bits.in1, io.fromint_data)
val fpmu = Module(new FPToFP(2))
fpmu.io.in.valid := req_valid && ex_ctrl.fastpipe
fpmu.io.in.bits := req
fpmu.io.in.bits := fpiu.io.in.bits
fpmu.io.lt := fpiu.io.out.bits.lt
val divSqrt_wen = Wire(init = false.B)
@ -687,7 +674,7 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) {
(fLen > 32).option({
val dfma = Module(new FPUFMAPipe(cfg.dfmaLatency, FType.D))
dfma.io.in.valid := req_valid && ex_ctrl.fma && !ex_ctrl.singleOut
dfma.io.in.bits := req
dfma.io.in.bits := fuInput(Some(dfma.t))
Pipe(dfma, dfma.latency, (c: FPUCtrlSigs) => c.fma && !c.singleOut, dfma.io.out.bits)
})
def latencyMask(c: FPUCtrlSigs, offset: Int) = {
@ -800,6 +787,26 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) {
} else {
when (id_ctrl.div || id_ctrl.sqrt) { io.illegal_rm := true }
}
def fuInput(minT: Option[FType]): FPInput = {
val req = Wire(new FPInput)
val tag = !ex_ctrl.singleIn // TODO typeTag
req := ex_ctrl
req.rm := ex_rm
req.in1 := unbox(ex_rs(0), tag, minT)
req.in2 := unbox(ex_rs(1), tag, minT)
req.in3 := unbox(ex_rs(2), tag, minT)
req.typ := ex_reg_inst(21,20)
req.fmaCmd := ex_reg_inst(3,2) | (!ex_ctrl.ren3 && ex_reg_inst(27))
when (ex_cp_valid) {
req := io.cp_req.bits
when (io.cp_req.bits.swap23) {
req.in2 := io.cp_req.bits.in3
req.in3 := io.cp_req.bits.in2
}
}
req
}
}
/** Mix-ins for constructing tiles that may have an FPU external to the core pipeline */