diff --git a/cmmc/Target/RISCV/RISCV.yml b/cmmc/Target/RISCV/RISCV.yml index 9a8e260b..eadd8fbb 100644 --- a/cmmc/Target/RISCV/RISCV.yml +++ b/cmmc/Target/RISCV/RISCV.yml @@ -1787,7 +1787,7 @@ InstSelection: Lhs: $X Rhs: $Y Rhs: $Z - $Predicate: isAggressiveOptEnabled() + $Predicate: isAggressiveOptEnabled() && ctx.hasOneUse($MulDst) Replace: FMADD_S: Rd: $Dst @@ -1795,6 +1795,63 @@ InstSelection: Rs2: $Y Rs3: $Z + - Pattern: + InstFSub: + Dst: $Dst + Lhs: + InstFMul: + Dst: $MulDst + Lhs: $X + Rhs: $Y + Rhs: $Z + $Predicate: isAggressiveOptEnabled() && ctx.hasOneUse($MulDst) + Replace: + FMSUB_S: + Rd: $Dst + Rs1: $X + Rs2: $Y + Rs3: $Z + + - Pattern: + InstFAdd: + Dst: $Dst + Lhs: + InstFNeg: + Dst: $NegDst + Src: + InstFMul: + Dst: $MulDst + Lhs: $X + Rhs: $Y + Rhs: $Z + $Predicate: isAggressiveOptEnabled() && ctx.hasOneUse($MulDst) && ctx.hasOneUse($NegDst) + Replace: + FNMSUB_S: + Rd: $Dst + Rs1: $X + Rs2: $Y + Rs3: $Z + + - Pattern: + InstFSub: + Dst: $Dst + Lhs: + InstFNeg: + Dst: $NegDst + Src: + InstFMul: + Dst: $MulDst + Lhs: $X + Rhs: $Y + Rhs: $Z + $Predicate: isAggressiveOptEnabled() && ctx.hasOneUse($MulDst) && ctx.hasOneUse($NegDst) + Replace: + FNMADD_S: + Rd: $Dst + Rs1: $X + Rs2: $Y + Rs3: $Z + - Pattern: $Template: Dst: $Dst diff --git a/cmmc/Transforms/Combine/ArithmeticReduce.cpp b/cmmc/Transforms/Combine/ArithmeticReduce.cpp index b41108b8..d624a2ea 100644 --- a/cmmc/Transforms/Combine/ArithmeticReduce.cpp +++ b/cmmc/Transforms/Combine/ArithmeticReduce.cpp @@ -1327,28 +1327,30 @@ class ArithmeticReduce final : public TransformPass { return val; } - // TODO: handle nan // x <= f1 || x >= f2 if(or_(fcmp(cmp1, any(v1), fp_(f1)), fcmp(cmp2, exactly(v1), fp_(f2)))(matchCtx)) { if(cmp1 > cmp2) { std::swap(cmp1, cmp2); std::swap(f1, f2); } + auto isNotNan = [&](Value* v) { + return builder.makeOp(InstructionID::FCmp, CompareOp::FCmpOrderedEqual, v, v); + }; if(cmp1 == CompareOp::FCmpOrderedLessEqual && cmp2 == CompareOp::FCmpOrderedGreaterEqual) { if(f1 >= f2) - return builder.getTrue(); + return isNotNan(v1); } if(cmp1 == CompareOp::FCmpOrderedLessEqual && cmp2 == CompareOp::FCmpOrderedGreaterThan) { if(f1 >= f2) - return builder.getTrue(); + return isNotNan(v1); } if(cmp1 == CompareOp::FCmpOrderedLessThan && cmp2 == CompareOp::FCmpOrderedGreaterEqual) { if(f1 >= f2) - return builder.getTrue(); + return isNotNan(v1); } if(cmp1 == CompareOp::FCmpOrderedLessThan && cmp2 == CompareOp::FCmpOrderedGreaterThan) { if(f1 > f2) - return builder.getTrue(); + return isNotNan(v1); // TODO: f1 == f2 -> x une f1 } } diff --git a/tests/Regression/CodeGen/fma.arm.s b/tests/Regression/CodeGen/fma.arm.s index ce3658e4..6f1b9242 100644 --- a/tests/Regression/CodeGen/fma.arm.s +++ b/tests/Regression/CodeGen/fma.arm.s @@ -16,3 +16,23 @@ fmadd2: vmul.f32 s0, s0, s1 vadd.f32 s0, s2, s0 bx lr +.p2align 4 +.globl fmsub +fmsub: + vmul.f32 s0, s0, s1 + vsub.f32 s0, s0, s2 + bx lr +.p2align 4 +.globl fnmadd +fnmadd: + vmul.f32 s0, s0, s1 + vneg.f32 s0, s0 + vsub.f32 s0, s0, s2 + bx lr +.p2align 4 +.globl fnmsub +fnmsub: + vmul.f32 s0, s0, s1 + vneg.f32 s0, s0 + vadd.f32 s0, s0, s2 + bx lr diff --git a/tests/Regression/CodeGen/fma.mips.s b/tests/Regression/CodeGen/fma.mips.s index e287891a..31f90c21 100644 --- a/tests/Regression/CodeGen/fma.mips.s +++ b/tests/Regression/CodeGen/fma.mips.s @@ -16,3 +16,29 @@ fmadd2: add.s $f0, $f4, $f6 jr $ra nop +.p2align 2 +.globl fmsub +fmsub: + mtc1 $a2, $f4 + mul.s $f6, $f12, $f14 + sub.s $f0, $f6, $f4 + jr $ra + nop +.p2align 2 +.globl fnmadd +fnmadd: + mtc1 $a2, $f4 + mul.s $f6, $f12, $f14 + neg.s $f6, $f6 + sub.s $f0, $f6, $f4 + jr $ra + nop +.p2align 2 +.globl fnmsub +fnmsub: + mtc1 $a2, $f4 + mul.s $f6, $f12, $f14 + neg.s $f6, $f6 + add.s $f0, $f6, $f4 + jr $ra + nop diff --git a/tests/Regression/CodeGen/fma.riscv.s b/tests/Regression/CodeGen/fma.riscv.s index 9165ce19..50d02aa2 100644 --- a/tests/Regression/CodeGen/fma.riscv.s +++ b/tests/Regression/CodeGen/fma.riscv.s @@ -13,3 +13,23 @@ fmadd2: fmul.s f13, f10, f11 fadd.s f10, f12, f13 ret +.p2align 2 +.globl fmsub +fmsub: + fmul.s f13, f10, f11 + fsub.s f10, f13, f12 + ret +.p2align 2 +.globl fnmadd +fnmadd: + fmul.s f13, f10, f11 + fneg.s f14, f13 + fsub.s f10, f14, f12 + ret +.p2align 2 +.globl fnmsub +fnmsub: + fmul.s f13, f10, f11 + fneg.s f14, f13 + fadd.s f10, f14, f12 + ret diff --git a/tests/Regression/CodeGen/fma.sy b/tests/Regression/CodeGen/fma.sy index 79ee730c..94eb1773 100644 --- a/tests/Regression/CodeGen/fma.sy +++ b/tests/Regression/CodeGen/fma.sy @@ -4,3 +4,12 @@ float fmadd1(float x, float y, float z) { float fmadd2(float x, float y, float z) { return z + x * y; } +float fmsub(float x, float y, float z) { + return x * y - z; +} +float fnmadd(float x, float y, float z) { + return - (x * y) - z; +} +float fnmsub(float x, float y, float z) { + return - (x * y) + z; +} diff --git a/tests/SysY2022/performance/large_loop_array_1.out b/tests/SysY2022/performance/large_loop_array_1.out index 9aea9e0c..84fa316c 100644 --- a/tests/SysY2022/performance/large_loop_array_1.out +++ b/tests/SysY2022/performance/large_loop_array_1.out @@ -1,2 +1,2 @@ +0x1.45378p+50 0 -0 \ No newline at end of file diff --git a/tests/SysY2022/performance/large_loop_array_1.riscv.s b/tests/SysY2022/performance/large_loop_array_1.riscv.s index e1d10725..99c8866a 100644 --- a/tests/SysY2022/performance/large_loop_array_1.riscv.s +++ b/tests/SysY2022/performance/large_loop_array_1.riscv.s @@ -1,19 +1,270 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data +.section .rodata +.p2align 2 +__cmmc_fp_constant_pool: + .4byte 1045220557 + .4byte 1036831949 +.bss +.p2align 3 +cmmc_parallel_body_payload_0: + .zero 40 +.p2align 3 +y: + .zero 8192 +.p2align 3 +x: + .zero 8192 .text .p2align 2 .globl main main: - addi sp, sp, -8 + addi sp, sp, -120 sd ra, 0(sp) + sd s8, 8(sp) + sd s11, 16(sp) + fsw f9, 24(sp) + fsw f8, 28(sp) + sd s3, 32(sp) + fsw f18, 40(sp) + sd s6, 48(sp) + sd s9, 56(sp) + sd s5, 64(sp) + sd s0, 72(sp) + sd s4, 80(sp) + sd s1, 88(sp) + sd s2, 96(sp) + sd s7, 104(sp) + sd s10, 112(sp) jal getint + mv s2, a0 li a0, 22 jal _sysy_starttime + li s10, 3 +pcrel287: + auipc a2, %pcrel_hi(__cmmc_fp_constant_pool) + addiw s1, s2, -3 + lui s6, 260096 +pcrel288: + auipc a0, %pcrel_hi(x) +pcrel289: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) +pcrel290: + auipc a1, %pcrel_hi(y) + fmv.w.x f8, zero + mv s11, zero + addi s5, a2, %pcrel_lo(pcrel287) + fmv.w.x f9, s6 + addi s4, a0, %pcrel_lo(pcrel288) + addi s3, s9, %pcrel_lo(pcrel289) + addi s0, a1, %pcrel_lo(pcrel290) + fmv.s f18, f8 + lui a2, 122 + mv a0, zero +pcrel291: + auipc a1, %pcrel_hi(cmmc_parallel_body_0) + addiw s7, a2, 288 + addi s8, a1, %pcrel_lo(pcrel291) + j label92 +.p2align 2 +label120: + addi a1, a1, 4 +.p2align 2 +label116: + sh2add a3, a2, s0 + flw f11, 0(a1) + addiw a2, a2, 1 + flw f12, 0(a3) + fmadd.s f10, f11, f12, f10 + bgt s2, a2, label120 + fadd.s f8, f8, f10 + addiw s11, s11, 1 + bge s11, s7, label103 +.p2align 2 +label92: + lui a4, 419430 + fmv.w.x f11, s6 + flw f12, 0(s5) + addiw a3, a4, 1639 + mul a1, s11, a3 + fadd.s f10, f9, f12 + srli a4, a1, 63 + srai a2, a1, 34 + fmv.s f9, f11 + add a3, a4, a2 + sh2add t0, a3, a3 + slliw a5, t0, 1 + subw a2, s11, a5 + sltu a1, zero, a2 + bne a1, zero, label254 + fmv.s f9, f10 +label254: + flw f12, 4(s5) + fmv.w.x f11, zero + fadd.s f10, f18, f12 + fmv.s f18, f11 + bne a1, zero, label256 + fmv.s f18, f10 +label256: + ble s2, a0, label148 +pcrel292: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) + sw a0, %pcrel_lo(pcrel292)(s9) + sd s4, 8(s3) + fsw f18, 16(s3) + sd s0, 24(s3) + fsw f9, 32(s3) + sw s2, 36(s3) + mv a1, s2 + mv a2, s8 + jal cmmcParallelFor + mv a0, s2 + ble s2, zero, label159 +.p2align 2 +label104: + ble s2, s10, label170 + fmv.w.x f10, zero + mv a1, s4 + mv a2, zero +.p2align 2 +label106: + sh2add a3, a2, s0 + flw f12, 0(a1) + addiw a2, a2, 4 + flw f14, 0(a3) + flw f13, 4(a1) + fmadd.s f11, f12, f14, f10 + flw f15, 4(a3) + flw f14, 8(a1) + flw f0, 8(a3) + flw f12, 12(a1) + fmadd.s f10, f13, f15, f11 + flw f13, 12(a3) + fmadd.s f11, f14, f0, f10 + fmadd.s f10, f12, f13, f11 + ble s1, a2, label202 + addi a1, a1, 16 + j label106 +.p2align 2 +label202: + fmv.s f11, f10 + ble s2, a2, label275 +.p2align 2 +label115: + sh2add a1, a2, s4 + fmv.s f10, f11 + j label116 +.p2align 2 +label159: + fmv.w.x f10, zero + fadd.s f8, f8, f10 + addiw s11, s11, 1 + blt s11, s7, label92 + j label103 +.p2align 2 +label170: + fmv.w.x f10, zero + mv a2, zero + fmv.s f11, f10 + bgt s2, zero, label115 + fadd.s f8, f8, f10 + addiw s11, s11, 1 + blt s11, s7, label92 + j label103 +.p2align 2 +label148: + bgt s2, zero, label104 + fmv.w.x f10, zero + addiw s11, s11, 1 + fadd.s f8, f8, f10 + blt s11, s7, label92 +label103: li a0, 39 jal _sysy_stoptime + fmv.s f10, f8 + jal putfloat mv a0, zero - jal putint ld ra, 0(sp) - mv a0, zero - addi sp, sp, 8 + ld s8, 8(sp) + ld s11, 16(sp) + flw f9, 24(sp) + flw f8, 28(sp) + ld s3, 32(sp) + flw f18, 40(sp) + ld s6, 48(sp) + ld s9, 56(sp) + ld s5, 64(sp) + ld s0, 72(sp) + ld s4, 80(sp) + ld s1, 88(sp) + ld s2, 96(sp) + ld s7, 104(sp) + ld s10, 112(sp) + addi sp, sp, 120 + ret +.p2align 2 +label275: + fadd.s f8, f8, f10 + addiw s11, s11, 1 + blt s11, s7, label92 + j label103 +.p2align 2 +cmmc_parallel_body_0: + mv a3, a0 + addiw t0, a0, 3 +pcrel90: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_0) + addiw a5, a1, -3 + addi a0, a2, %pcrel_lo(pcrel90) + ld a4, 8(a0) + flw f10, 16(a0) + ld a2, 24(a0) + flw f11, 32(a0) + ble a1, t0, label7 + sh2add a0, a3, a4 +.p2align 2 +label3: + fcvt.s.w f12, a3 + addiw t1, a3, 1 + sh2add t0, a3, a2 + fadd.s f13, f11, f12 + fadd.s f14, f10, f12 + fcvt.s.w f12, t1 + addiw t1, a3, 2 + fsw f14, 0(a0) + fadd.s f14, f11, f12 + fsw f13, 0(t0) + fadd.s f13, f10, f12 + fcvt.s.w f12, t1 + addiw t1, a3, 3 + addiw a3, a3, 4 + fsw f13, 4(a0) + fadd.s f13, f11, f12 + fsw f14, 4(t0) + fadd.s f14, f10, f12 + fcvt.s.w f12, t1 + fsw f14, 8(a0) + fadd.s f14, f11, f12 + fsw f13, 8(t0) + fadd.s f13, f10, f12 + fsw f13, 12(a0) + fsw f14, 12(t0) + ble a5, a3, label7 + addi a0, a0, 16 + j label3 +label7: + ble a1, a3, label14 + sh2add a0, a3, a4 +label10: + fcvt.s.w f12, a3 + sh2add a4, a3, a2 + fadd.s f13, f11, f12 + addiw a3, a3, 1 + fadd.s f14, f10, f12 + fsw f14, 0(a0) + fsw f13, 0(a4) + ble a1, a3, label14 + addi a0, a0, 4 + j label10 +label14: ret diff --git a/tests/SysY2022/performance/large_loop_array_1.sy b/tests/SysY2022/performance/large_loop_array_1.sy index 3e0aff89..7148b653 100644 --- a/tests/SysY2022/performance/large_loop_array_1.sy +++ b/tests/SysY2022/performance/large_loop_array_1.sy @@ -37,13 +37,6 @@ int main() { i = i + 1; } stoptime(); - if ((total - 1430318598848512.000000) <=0.000001 || (total - 1430318598848512.000000) >= -0.000001) { - putint(0); - return 0; - } - else { - putint(1); - return 1; - } - + putfloat(total); + return 0; } diff --git a/tests/SysY2022/performance/large_loop_array_1.sy.ir b/tests/SysY2022/performance/large_loop_array_1.sy.ir index b35e04aa..c9f77cc2 100644 --- a/tests/SysY2022/performance/large_loop_array_1.sy.ir +++ b/tests/SysY2022/performance/large_loop_array_1.sy.ir @@ -1,12 +1,185 @@ internal func @getint() -> i32 { NoMemoryRead NoMemoryWrite }; -internal func @putint(i32) -> void { NoMemoryRead NoMemoryWrite }; +internal func @putfloat(f32) -> void { NoMemoryRead NoMemoryWrite }; internal func @starttime(i32) -> void { NoMemoryRead NoMemoryWrite }; internal func @stoptime(i32) -> void { NoMemoryRead NoMemoryWrite }; -func @main() -> i32 { NoMemoryRead NoMemoryWrite NoRecurse Entry } { +func @main() -> i32 { NoMemoryWrite NoRecurse Entry } { ^entry: i32 %0 = call () -> i32 @getint(); call (i32) -> void @starttime(i32 22); + i1 %1 = icmp sgt i32 %0, i32 0; + i1 %2 = icmp sgt i32 %0, i32 3; + i32 %3 = add i32 %0, i32 -3; + [2048 * f32]* %4 = ptrcast [2048 * f32]* @x to [2048 * f32]*; + f32* %5 = getelementptr &([2048 * f32]* %4)[i64 0][i64 0]; + [2048 * f32]* %6 = ptrcast [2048 * f32]* @y to [2048 * f32]*; + f32* %7 = getelementptr &([2048 * f32]* %6)[i64 0][i64 0]; + [40 * i8]* %8 = ptrcast [40 * i8]* @cmmc_parallel_body_payload_0 to [40 * i8]*; + i32* %9 = ptradd [40 * i8]* %8, i32 0; + [2048 * f32]** %10 = ptradd [40 * i8]* %8, i32 8; + f32* %11 = ptradd [40 * i8]* %8, i32 16; + [2048 * f32]** %12 = ptradd [40 * i8]* %8, i32 24; + f32* %13 = ptradd [40 * i8]* %8, i32 32; + i32* %14 = ptradd [40 * i8]* %8, i32 36; + i8* %15 = functionptr () -> void @cmmc_parallel_body_0 as i8*; + ubr ^while.body; + ^while.body: + i32 %16 = phi [^entry, i32 0] [^b2, i32 %31]; + f32 %17 = phi [^entry, f32 0] [^b2, f32 %26]; + f32 %18 = phi [^entry, f32 1] [^b2, f32 %24]; + i32 %19 = phi [^entry, i32 0] [^b2, i32 %28]; + f32 %20 = phi [^entry, f32 0] [^b2, f32 %30]; + i32 %21 = srem i32 %16, i32 10; + i1 %22 = icmp neq i32 %21, i32 0; + f32 %23 = fadd f32 %18, f32 0.2; + f32 %24 = select i1 %22 ? f32 1 : f32 %23; + f32 %25 = fadd f32 %17, f32 0.1; + f32 %26 = select i1 %22 ? f32 0 : f32 %25; + i1 %27 = icmp sgt i32 %0, i32 %19; + cbr i1 %27(prob = 0.984615), ^b, ^b1; + ^b: + store i32* %9 with i32 %19; + store [2048 * f32]** %10 with [2048 * f32]* %4; + store f32* %11 with f32 %26; + store [2048 * f32]** %12 with [2048 * f32]* %6; + store f32* %13 with f32 %24; + store i32* %14 with i32 %0; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 %19, i32 %0, i8* %15); + ubr ^b1; + ^b1: + i32 %28 = phi [^while.body, i32 %19] [^b, i32 %0]; + cbr i1 %1(prob = 0.984615), ^super.header, ^b2; + ^super.header: + cbr i1 %2(prob = 0.941176), ^while.body1, ^scalar.header; + ^b2: + f32 %29 = phi [^b1, f32 0] [^scalar.header, f32 %63] [^while.body2, f32 %72]; + f32 %30 = fadd f32 %20, f32 %29; + i32 %31 = add i32 %16, i32 1; + i1 %32 = icmp slt i32 %31, i32 500000; + cbr i1 %32(prob = 0.999998), ^while.body, ^b3; + ^while.body1: + f32 %33 = phi [^super.header, f32 0] [^while.body1, f32 %58]; + i32 %34 = phi [^super.header, i32 0] [^while.body1, i32 %59]; + f32* %35 = getelementptr &(f32* %5)[i32 %34]; + f32 %36 = load f32* %35; + f32* %37 = getelementptr &(f32* %7)[i32 %34]; + f32 %38 = load f32* %37; + f32 %39 = fmul f32 %36, f32 %38; + f32 %40 = fadd f32 %33, f32 %39; + f32* %41 = getelementptr &(f32* %35)[i64 1]; + f32 %42 = load f32* %41; + f32* %43 = getelementptr &(f32* %37)[i64 1]; + f32 %44 = load f32* %43; + f32 %45 = fmul f32 %42, f32 %44; + f32 %46 = fadd f32 %40, f32 %45; + f32* %47 = getelementptr &(f32* %35)[i64 2]; + f32 %48 = load f32* %47; + f32* %49 = getelementptr &(f32* %37)[i64 2]; + f32 %50 = load f32* %49; + f32 %51 = fmul f32 %48, f32 %50; + f32 %52 = fadd f32 %46, f32 %51; + f32* %53 = getelementptr &(f32* %35)[i64 3]; + f32 %54 = load f32* %53; + f32* %55 = getelementptr &(f32* %37)[i64 3]; + f32 %56 = load f32* %55; + f32 %57 = fmul f32 %54, f32 %56; + f32 %58 = fadd f32 %52, f32 %57; + i32 %59 = add i32 %34, i32 4; + i1 %60 = icmp sgt i32 %3, i32 %59; + cbr i1 %60(prob = 0.941176), ^while.body1, ^scalar.header; + ^scalar.header: + f32 %61 = phi [^super.header, f32 0] [^while.body1, f32 %58]; + i32 %62 = phi [^super.header, i32 0] [^while.body1, i32 %59]; + f32 %63 = phi [^super.header, f32 undef] [^while.body1, f32 %58]; + i1 %64 = icmp sgt i32 %0, i32 %62; + cbr i1 %64(prob = 0.75), ^while.body2, ^b2; + ^b3: call (i32) -> void @stoptime(i32 39); - call (i32) -> void @putint(i32 0); + call (f32) -> void @putfloat(f32 %30); ret i32 0; + ^while.body2 {scalar}: + i32 %65 = phi [^scalar.header, i32 %62] [^while.body2, i32 %73]; + f32 %66 = phi [^scalar.header, f32 %61] [^while.body2, f32 %72]; + f32* %67 = getelementptr &(f32* %5)[i32 %65]; + f32 %68 = load f32* %67; + f32* %69 = getelementptr &(f32* %7)[i32 %65]; + f32 %70 = load f32* %69; + f32 %71 = fmul f32 %68, f32 %70; + f32 %72 = fadd f32 %66, f32 %71; + i32 %73 = add i32 %65, i32 1; + i1 %74 = icmp sgt i32 %0, i32 %73; + cbr i1 %74(prob = 0.75), ^while.body2, ^b2; } +internal func @cmmcParallelFor(i32, i32, i8*) -> void { NoRecurse }; +internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse ParallelBody } { + ^b: + i32 %2 = add i32 %0, i32 3; + i1 %3 = icmp sgt i32 %1, i32 %2; + [40 * i8]* %4 = ptrcast [40 * i8]* @cmmc_parallel_body_payload_0 to [40 * i8]*; + [2048 * f32]** %5 = ptradd [40 * i8]* %4, i32 8; + [2048 * f32]* %6 = load [2048 * f32]** %5; + f32* %7 = ptradd [40 * i8]* %4, i32 16; + f32 %8 = load f32* %7; + [2048 * f32]** %9 = ptradd [40 * i8]* %4, i32 24; + [2048 * f32]* %10 = load [2048 * f32]** %9; + f32* %11 = ptradd [40 * i8]* %4, i32 32; + f32 %12 = load f32* %11; + i32 %13 = add i32 %1, i32 -3; + cbr i1 %3(prob = 0.941176), ^b1, ^scalar.header; + ^b1: + i32 %14 = phi [^b, i32 %0] [^b1, i32 %38]; + f32 %15 = s2f i32 %14 to f32; + f32 %16 = fadd f32 %8, f32 %15; + f32* %17 = getelementptr &([2048 * f32]* %6)[i64 0][i32 %14]; + store f32* %17 with f32 %16; + f32 %18 = fadd f32 %12, f32 %15; + f32* %19 = getelementptr &([2048 * f32]* %10)[i64 0][i32 %14]; + store f32* %19 with f32 %18; + i32 %20 = add i32 %14, i32 1; + f32 %21 = s2f i32 %20 to f32; + f32 %22 = fadd f32 %8, f32 %21; + f32* %23 = getelementptr &(f32* %17)[i64 1]; + store f32* %23 with f32 %22; + f32 %24 = fadd f32 %12, f32 %21; + f32* %25 = getelementptr &(f32* %19)[i64 1]; + store f32* %25 with f32 %24; + i32 %26 = add i32 %14, i32 2; + f32 %27 = s2f i32 %26 to f32; + f32 %28 = fadd f32 %8, f32 %27; + f32* %29 = getelementptr &(f32* %17)[i64 2]; + store f32* %29 with f32 %28; + f32 %30 = fadd f32 %12, f32 %27; + f32* %31 = getelementptr &(f32* %19)[i64 2]; + store f32* %31 with f32 %30; + i32 %32 = add i32 %14, i32 3; + f32 %33 = s2f i32 %32 to f32; + f32 %34 = fadd f32 %8, f32 %33; + f32* %35 = getelementptr &(f32* %17)[i64 3]; + store f32* %35 with f32 %34; + f32 %36 = fadd f32 %12, f32 %33; + f32* %37 = getelementptr &(f32* %19)[i64 3]; + store f32* %37 with f32 %36; + i32 %38 = add i32 %14, i32 4; + i1 %39 = icmp sgt i32 %13, i32 %38; + cbr i1 %39(prob = 0.941176), ^b1, ^scalar.header; + ^scalar.header: + i32 %40 = phi [^b, i32 %0] [^b1, i32 %38]; + i1 %41 = icmp sgt i32 %1, i32 %40; + cbr i1 %41(prob = 0.75), ^b2, ^scalar.final; + ^b2 {scalar}: + i32 %42 = phi [^scalar.header, i32 %40] [^b2, i32 %48]; + f32 %43 = s2f i32 %42 to f32; + f32 %44 = fadd f32 %8, f32 %43; + f32* %45 = getelementptr &([2048 * f32]* %6)[i64 0][i32 %42]; + store f32* %45 with f32 %44; + f32 %46 = fadd f32 %12, f32 %43; + f32* %47 = getelementptr &([2048 * f32]* %10)[i64 0][i32 %42]; + store f32* %47 with f32 %46; + i32 %48 = add i32 %42, i32 1; + i1 %49 = icmp sgt i32 %1, i32 %48; + cbr i1 %49(prob = 0.75), ^b2, ^scalar.final; + ^scalar.final: + ret; +} +internal [40 * i8]* @cmmc_parallel_body_payload_0, align 8; +internal [2048 * f32]* @y, align 8; +internal [2048 * f32]* @x, align 8; diff --git a/tests/SysY2022/performance/large_loop_array_2.out b/tests/SysY2022/performance/large_loop_array_2.out index 9aea9e0c..3f247f8c 100644 --- a/tests/SysY2022/performance/large_loop_array_2.out +++ b/tests/SysY2022/performance/large_loop_array_2.out @@ -1,2 +1,2 @@ +0x1.4536bp+53 0 -0 \ No newline at end of file diff --git a/tests/SysY2022/performance/large_loop_array_2.riscv.s b/tests/SysY2022/performance/large_loop_array_2.riscv.s index e1d10725..c9323fe0 100644 --- a/tests/SysY2022/performance/large_loop_array_2.riscv.s +++ b/tests/SysY2022/performance/large_loop_array_2.riscv.s @@ -1,19 +1,270 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data +.section .rodata +.p2align 2 +__cmmc_fp_constant_pool: + .4byte 1045220557 + .4byte 1036831949 +.bss +.p2align 3 +cmmc_parallel_body_payload_0: + .zero 40 +.p2align 3 +y: + .zero 16384 +.p2align 3 +x: + .zero 16384 .text .p2align 2 .globl main main: - addi sp, sp, -8 + addi sp, sp, -120 sd ra, 0(sp) + sd s8, 8(sp) + sd s11, 16(sp) + fsw f9, 24(sp) + fsw f8, 28(sp) + sd s3, 32(sp) + fsw f18, 40(sp) + sd s6, 48(sp) + sd s9, 56(sp) + sd s5, 64(sp) + sd s0, 72(sp) + sd s4, 80(sp) + sd s1, 88(sp) + sd s2, 96(sp) + sd s7, 104(sp) + sd s10, 112(sp) jal getint + mv s2, a0 li a0, 22 jal _sysy_starttime + li s10, 3 +pcrel287: + auipc a2, %pcrel_hi(__cmmc_fp_constant_pool) + addiw s1, s2, -3 + lui s6, 260096 +pcrel288: + auipc a0, %pcrel_hi(x) +pcrel289: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) +pcrel290: + auipc a1, %pcrel_hi(y) + fmv.w.x f8, zero + mv s11, zero + addi s5, a2, %pcrel_lo(pcrel287) + fmv.w.x f9, s6 + addi s4, a0, %pcrel_lo(pcrel288) + addi s3, s9, %pcrel_lo(pcrel289) + addi s0, a1, %pcrel_lo(pcrel290) + fmv.s f18, f8 + lui a2, 122 + mv a0, zero +pcrel291: + auipc a1, %pcrel_hi(cmmc_parallel_body_0) + addiw s7, a2, 288 + addi s8, a1, %pcrel_lo(pcrel291) + j label92 +.p2align 2 +label120: + addi a1, a1, 4 +.p2align 2 +label116: + sh2add a3, a2, s0 + flw f11, 0(a1) + addiw a2, a2, 1 + flw f12, 0(a3) + fmadd.s f10, f11, f12, f10 + bgt s2, a2, label120 + fadd.s f8, f8, f10 + addiw s11, s11, 1 + bge s11, s7, label103 +.p2align 2 +label92: + lui a4, 419430 + fmv.w.x f11, s6 + flw f12, 0(s5) + addiw a3, a4, 1639 + mul a1, s11, a3 + fadd.s f10, f9, f12 + srli a4, a1, 63 + srai a2, a1, 34 + fmv.s f9, f11 + add a3, a4, a2 + sh2add t0, a3, a3 + slliw a5, t0, 1 + subw a2, s11, a5 + sltu a1, zero, a2 + bne a1, zero, label254 + fmv.s f9, f10 +label254: + flw f12, 4(s5) + fmv.w.x f11, zero + fadd.s f10, f18, f12 + fmv.s f18, f11 + bne a1, zero, label256 + fmv.s f18, f10 +label256: + ble s2, a0, label148 +pcrel292: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) + sw a0, %pcrel_lo(pcrel292)(s9) + sd s4, 8(s3) + fsw f18, 16(s3) + sd s0, 24(s3) + fsw f9, 32(s3) + sw s2, 36(s3) + mv a1, s2 + mv a2, s8 + jal cmmcParallelFor + mv a0, s2 + ble s2, zero, label159 +.p2align 2 +label104: + ble s2, s10, label170 + fmv.w.x f10, zero + mv a1, s4 + mv a2, zero +.p2align 2 +label106: + sh2add a3, a2, s0 + flw f12, 0(a1) + addiw a2, a2, 4 + flw f14, 0(a3) + flw f13, 4(a1) + fmadd.s f11, f12, f14, f10 + flw f15, 4(a3) + flw f14, 8(a1) + flw f0, 8(a3) + flw f12, 12(a1) + fmadd.s f10, f13, f15, f11 + flw f13, 12(a3) + fmadd.s f11, f14, f0, f10 + fmadd.s f10, f12, f13, f11 + ble s1, a2, label202 + addi a1, a1, 16 + j label106 +.p2align 2 +label202: + fmv.s f11, f10 + ble s2, a2, label275 +.p2align 2 +label115: + sh2add a1, a2, s4 + fmv.s f10, f11 + j label116 +.p2align 2 +label159: + fmv.w.x f10, zero + fadd.s f8, f8, f10 + addiw s11, s11, 1 + blt s11, s7, label92 + j label103 +.p2align 2 +label170: + fmv.w.x f10, zero + mv a2, zero + fmv.s f11, f10 + bgt s2, zero, label115 + fadd.s f8, f8, f10 + addiw s11, s11, 1 + blt s11, s7, label92 + j label103 +.p2align 2 +label148: + bgt s2, zero, label104 + fmv.w.x f10, zero + addiw s11, s11, 1 + fadd.s f8, f8, f10 + blt s11, s7, label92 +label103: li a0, 39 jal _sysy_stoptime + fmv.s f10, f8 + jal putfloat mv a0, zero - jal putint ld ra, 0(sp) - mv a0, zero - addi sp, sp, 8 + ld s8, 8(sp) + ld s11, 16(sp) + flw f9, 24(sp) + flw f8, 28(sp) + ld s3, 32(sp) + flw f18, 40(sp) + ld s6, 48(sp) + ld s9, 56(sp) + ld s5, 64(sp) + ld s0, 72(sp) + ld s4, 80(sp) + ld s1, 88(sp) + ld s2, 96(sp) + ld s7, 104(sp) + ld s10, 112(sp) + addi sp, sp, 120 + ret +.p2align 2 +label275: + fadd.s f8, f8, f10 + addiw s11, s11, 1 + blt s11, s7, label92 + j label103 +.p2align 2 +cmmc_parallel_body_0: + mv a3, a0 + addiw t0, a0, 3 +pcrel90: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_0) + addiw a5, a1, -3 + addi a0, a2, %pcrel_lo(pcrel90) + ld a4, 8(a0) + flw f10, 16(a0) + ld a2, 24(a0) + flw f11, 32(a0) + ble a1, t0, label7 + sh2add a0, a3, a4 +.p2align 2 +label3: + fcvt.s.w f12, a3 + addiw t1, a3, 1 + sh2add t0, a3, a2 + fadd.s f13, f11, f12 + fadd.s f14, f10, f12 + fcvt.s.w f12, t1 + addiw t1, a3, 2 + fsw f14, 0(a0) + fadd.s f14, f11, f12 + fsw f13, 0(t0) + fadd.s f13, f10, f12 + fcvt.s.w f12, t1 + addiw t1, a3, 3 + addiw a3, a3, 4 + fsw f13, 4(a0) + fadd.s f13, f11, f12 + fsw f14, 4(t0) + fadd.s f14, f10, f12 + fcvt.s.w f12, t1 + fsw f14, 8(a0) + fadd.s f14, f11, f12 + fsw f13, 8(t0) + fadd.s f13, f10, f12 + fsw f13, 12(a0) + fsw f14, 12(t0) + ble a5, a3, label7 + addi a0, a0, 16 + j label3 +label7: + ble a1, a3, label14 + sh2add a0, a3, a4 +label10: + fcvt.s.w f12, a3 + sh2add a4, a3, a2 + fadd.s f13, f11, f12 + addiw a3, a3, 1 + fadd.s f14, f10, f12 + fsw f14, 0(a0) + fsw f13, 0(a4) + ble a1, a3, label14 + addi a0, a0, 4 + j label10 +label14: ret diff --git a/tests/SysY2022/performance/large_loop_array_2.sy b/tests/SysY2022/performance/large_loop_array_2.sy index d380cc41..7ad0b257 100644 --- a/tests/SysY2022/performance/large_loop_array_2.sy +++ b/tests/SysY2022/performance/large_loop_array_2.sy @@ -37,13 +37,6 @@ int main() { i = i + 1; } stoptime(); - if ((total - 11442437121638400.000000) <=0.000001 || (total - 11442437121638400.000000) >= -0.000001) { - putint(0); - return 0; - } - else { - putint(1); - return 1; - } - + putfloat(total); + return 0; } diff --git a/tests/SysY2022/performance/large_loop_array_2.sy.ir b/tests/SysY2022/performance/large_loop_array_2.sy.ir index b35e04aa..a05b860a 100644 --- a/tests/SysY2022/performance/large_loop_array_2.sy.ir +++ b/tests/SysY2022/performance/large_loop_array_2.sy.ir @@ -1,12 +1,185 @@ internal func @getint() -> i32 { NoMemoryRead NoMemoryWrite }; -internal func @putint(i32) -> void { NoMemoryRead NoMemoryWrite }; +internal func @putfloat(f32) -> void { NoMemoryRead NoMemoryWrite }; internal func @starttime(i32) -> void { NoMemoryRead NoMemoryWrite }; internal func @stoptime(i32) -> void { NoMemoryRead NoMemoryWrite }; -func @main() -> i32 { NoMemoryRead NoMemoryWrite NoRecurse Entry } { +func @main() -> i32 { NoMemoryWrite NoRecurse Entry } { ^entry: i32 %0 = call () -> i32 @getint(); call (i32) -> void @starttime(i32 22); + i1 %1 = icmp sgt i32 %0, i32 0; + i1 %2 = icmp sgt i32 %0, i32 3; + i32 %3 = add i32 %0, i32 -3; + [4096 * f32]* %4 = ptrcast [4096 * f32]* @x to [4096 * f32]*; + f32* %5 = getelementptr &([4096 * f32]* %4)[i64 0][i64 0]; + [4096 * f32]* %6 = ptrcast [4096 * f32]* @y to [4096 * f32]*; + f32* %7 = getelementptr &([4096 * f32]* %6)[i64 0][i64 0]; + [40 * i8]* %8 = ptrcast [40 * i8]* @cmmc_parallel_body_payload_0 to [40 * i8]*; + i32* %9 = ptradd [40 * i8]* %8, i32 0; + [4096 * f32]** %10 = ptradd [40 * i8]* %8, i32 8; + f32* %11 = ptradd [40 * i8]* %8, i32 16; + [4096 * f32]** %12 = ptradd [40 * i8]* %8, i32 24; + f32* %13 = ptradd [40 * i8]* %8, i32 32; + i32* %14 = ptradd [40 * i8]* %8, i32 36; + i8* %15 = functionptr () -> void @cmmc_parallel_body_0 as i8*; + ubr ^while.body; + ^while.body: + i32 %16 = phi [^entry, i32 0] [^b2, i32 %31]; + f32 %17 = phi [^entry, f32 0] [^b2, f32 %26]; + f32 %18 = phi [^entry, f32 1] [^b2, f32 %24]; + i32 %19 = phi [^entry, i32 0] [^b2, i32 %28]; + f32 %20 = phi [^entry, f32 0] [^b2, f32 %30]; + i32 %21 = srem i32 %16, i32 10; + i1 %22 = icmp neq i32 %21, i32 0; + f32 %23 = fadd f32 %18, f32 0.2; + f32 %24 = select i1 %22 ? f32 1 : f32 %23; + f32 %25 = fadd f32 %17, f32 0.1; + f32 %26 = select i1 %22 ? f32 0 : f32 %25; + i1 %27 = icmp sgt i32 %0, i32 %19; + cbr i1 %27(prob = 0.984615), ^b, ^b1; + ^b: + store i32* %9 with i32 %19; + store [4096 * f32]** %10 with [4096 * f32]* %4; + store f32* %11 with f32 %26; + store [4096 * f32]** %12 with [4096 * f32]* %6; + store f32* %13 with f32 %24; + store i32* %14 with i32 %0; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 %19, i32 %0, i8* %15); + ubr ^b1; + ^b1: + i32 %28 = phi [^while.body, i32 %19] [^b, i32 %0]; + cbr i1 %1(prob = 0.984615), ^super.header, ^b2; + ^super.header: + cbr i1 %2(prob = 0.941176), ^while.body1, ^scalar.header; + ^b2: + f32 %29 = phi [^b1, f32 0] [^scalar.header, f32 %63] [^while.body2, f32 %72]; + f32 %30 = fadd f32 %20, f32 %29; + i32 %31 = add i32 %16, i32 1; + i1 %32 = icmp slt i32 %31, i32 500000; + cbr i1 %32(prob = 0.999998), ^while.body, ^b3; + ^while.body1: + f32 %33 = phi [^super.header, f32 0] [^while.body1, f32 %58]; + i32 %34 = phi [^super.header, i32 0] [^while.body1, i32 %59]; + f32* %35 = getelementptr &(f32* %5)[i32 %34]; + f32 %36 = load f32* %35; + f32* %37 = getelementptr &(f32* %7)[i32 %34]; + f32 %38 = load f32* %37; + f32 %39 = fmul f32 %36, f32 %38; + f32 %40 = fadd f32 %33, f32 %39; + f32* %41 = getelementptr &(f32* %35)[i64 1]; + f32 %42 = load f32* %41; + f32* %43 = getelementptr &(f32* %37)[i64 1]; + f32 %44 = load f32* %43; + f32 %45 = fmul f32 %42, f32 %44; + f32 %46 = fadd f32 %40, f32 %45; + f32* %47 = getelementptr &(f32* %35)[i64 2]; + f32 %48 = load f32* %47; + f32* %49 = getelementptr &(f32* %37)[i64 2]; + f32 %50 = load f32* %49; + f32 %51 = fmul f32 %48, f32 %50; + f32 %52 = fadd f32 %46, f32 %51; + f32* %53 = getelementptr &(f32* %35)[i64 3]; + f32 %54 = load f32* %53; + f32* %55 = getelementptr &(f32* %37)[i64 3]; + f32 %56 = load f32* %55; + f32 %57 = fmul f32 %54, f32 %56; + f32 %58 = fadd f32 %52, f32 %57; + i32 %59 = add i32 %34, i32 4; + i1 %60 = icmp sgt i32 %3, i32 %59; + cbr i1 %60(prob = 0.941176), ^while.body1, ^scalar.header; + ^scalar.header: + f32 %61 = phi [^super.header, f32 0] [^while.body1, f32 %58]; + i32 %62 = phi [^super.header, i32 0] [^while.body1, i32 %59]; + f32 %63 = phi [^super.header, f32 undef] [^while.body1, f32 %58]; + i1 %64 = icmp sgt i32 %0, i32 %62; + cbr i1 %64(prob = 0.75), ^while.body2, ^b2; + ^b3: call (i32) -> void @stoptime(i32 39); - call (i32) -> void @putint(i32 0); + call (f32) -> void @putfloat(f32 %30); ret i32 0; + ^while.body2 {scalar}: + i32 %65 = phi [^scalar.header, i32 %62] [^while.body2, i32 %73]; + f32 %66 = phi [^scalar.header, f32 %61] [^while.body2, f32 %72]; + f32* %67 = getelementptr &(f32* %5)[i32 %65]; + f32 %68 = load f32* %67; + f32* %69 = getelementptr &(f32* %7)[i32 %65]; + f32 %70 = load f32* %69; + f32 %71 = fmul f32 %68, f32 %70; + f32 %72 = fadd f32 %66, f32 %71; + i32 %73 = add i32 %65, i32 1; + i1 %74 = icmp sgt i32 %0, i32 %73; + cbr i1 %74(prob = 0.75), ^while.body2, ^b2; } +internal func @cmmcParallelFor(i32, i32, i8*) -> void { NoRecurse }; +internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse ParallelBody } { + ^b: + i32 %2 = add i32 %0, i32 3; + i1 %3 = icmp sgt i32 %1, i32 %2; + [40 * i8]* %4 = ptrcast [40 * i8]* @cmmc_parallel_body_payload_0 to [40 * i8]*; + [4096 * f32]** %5 = ptradd [40 * i8]* %4, i32 8; + [4096 * f32]* %6 = load [4096 * f32]** %5; + f32* %7 = ptradd [40 * i8]* %4, i32 16; + f32 %8 = load f32* %7; + [4096 * f32]** %9 = ptradd [40 * i8]* %4, i32 24; + [4096 * f32]* %10 = load [4096 * f32]** %9; + f32* %11 = ptradd [40 * i8]* %4, i32 32; + f32 %12 = load f32* %11; + i32 %13 = add i32 %1, i32 -3; + cbr i1 %3(prob = 0.941176), ^b1, ^scalar.header; + ^b1: + i32 %14 = phi [^b, i32 %0] [^b1, i32 %38]; + f32 %15 = s2f i32 %14 to f32; + f32 %16 = fadd f32 %8, f32 %15; + f32* %17 = getelementptr &([4096 * f32]* %6)[i64 0][i32 %14]; + store f32* %17 with f32 %16; + f32 %18 = fadd f32 %12, f32 %15; + f32* %19 = getelementptr &([4096 * f32]* %10)[i64 0][i32 %14]; + store f32* %19 with f32 %18; + i32 %20 = add i32 %14, i32 1; + f32 %21 = s2f i32 %20 to f32; + f32 %22 = fadd f32 %8, f32 %21; + f32* %23 = getelementptr &(f32* %17)[i64 1]; + store f32* %23 with f32 %22; + f32 %24 = fadd f32 %12, f32 %21; + f32* %25 = getelementptr &(f32* %19)[i64 1]; + store f32* %25 with f32 %24; + i32 %26 = add i32 %14, i32 2; + f32 %27 = s2f i32 %26 to f32; + f32 %28 = fadd f32 %8, f32 %27; + f32* %29 = getelementptr &(f32* %17)[i64 2]; + store f32* %29 with f32 %28; + f32 %30 = fadd f32 %12, f32 %27; + f32* %31 = getelementptr &(f32* %19)[i64 2]; + store f32* %31 with f32 %30; + i32 %32 = add i32 %14, i32 3; + f32 %33 = s2f i32 %32 to f32; + f32 %34 = fadd f32 %8, f32 %33; + f32* %35 = getelementptr &(f32* %17)[i64 3]; + store f32* %35 with f32 %34; + f32 %36 = fadd f32 %12, f32 %33; + f32* %37 = getelementptr &(f32* %19)[i64 3]; + store f32* %37 with f32 %36; + i32 %38 = add i32 %14, i32 4; + i1 %39 = icmp sgt i32 %13, i32 %38; + cbr i1 %39(prob = 0.941176), ^b1, ^scalar.header; + ^scalar.header: + i32 %40 = phi [^b, i32 %0] [^b1, i32 %38]; + i1 %41 = icmp sgt i32 %1, i32 %40; + cbr i1 %41(prob = 0.75), ^b2, ^scalar.final; + ^b2 {scalar}: + i32 %42 = phi [^scalar.header, i32 %40] [^b2, i32 %48]; + f32 %43 = s2f i32 %42 to f32; + f32 %44 = fadd f32 %8, f32 %43; + f32* %45 = getelementptr &([4096 * f32]* %6)[i64 0][i32 %42]; + store f32* %45 with f32 %44; + f32 %46 = fadd f32 %12, f32 %43; + f32* %47 = getelementptr &([4096 * f32]* %10)[i64 0][i32 %42]; + store f32* %47 with f32 %46; + i32 %48 = add i32 %42, i32 1; + i1 %49 = icmp sgt i32 %1, i32 %48; + cbr i1 %49(prob = 0.75), ^b2, ^scalar.final; + ^scalar.final: + ret; +} +internal [40 * i8]* @cmmc_parallel_body_payload_0, align 8; +internal [4096 * f32]* @y, align 8; +internal [4096 * f32]* @x, align 8; diff --git a/tests/SysY2022/performance/large_loop_array_3.out b/tests/SysY2022/performance/large_loop_array_3.out index 9aea9e0c..d1d88252 100644 --- a/tests/SysY2022/performance/large_loop_array_3.out +++ b/tests/SysY2022/performance/large_loop_array_3.out @@ -1,2 +1,2 @@ +0x1.049956p+54 0 -0 \ No newline at end of file diff --git a/tests/SysY2022/performance/large_loop_array_3.riscv.s b/tests/SysY2022/performance/large_loop_array_3.riscv.s index e1d10725..cb64967c 100644 --- a/tests/SysY2022/performance/large_loop_array_3.riscv.s +++ b/tests/SysY2022/performance/large_loop_array_3.riscv.s @@ -1,19 +1,270 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data +.section .rodata +.p2align 2 +__cmmc_fp_constant_pool: + .4byte 1045220557 + .4byte 1036831949 +.bss +.p2align 3 +cmmc_parallel_body_payload_0: + .zero 40 +.p2align 3 +y: + .zero 32768 +.p2align 3 +x: + .zero 32768 .text .p2align 2 .globl main main: - addi sp, sp, -8 + addi sp, sp, -120 sd ra, 0(sp) + sd s8, 8(sp) + sd s11, 16(sp) + fsw f9, 24(sp) + fsw f8, 28(sp) + sd s3, 32(sp) + fsw f18, 40(sp) + sd s6, 48(sp) + sd s9, 56(sp) + sd s5, 64(sp) + sd s0, 72(sp) + sd s4, 80(sp) + sd s1, 88(sp) + sd s2, 96(sp) + sd s7, 104(sp) + sd s10, 112(sp) jal getint + mv s2, a0 li a0, 22 jal _sysy_starttime + li s10, 3 +pcrel287: + auipc a2, %pcrel_hi(__cmmc_fp_constant_pool) + addiw s1, s2, -3 + lui s6, 260096 +pcrel288: + auipc a0, %pcrel_hi(x) +pcrel289: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) +pcrel290: + auipc a1, %pcrel_hi(y) + fmv.w.x f8, zero + mv s11, zero + addi s5, a2, %pcrel_lo(pcrel287) + fmv.w.x f9, s6 + addi s4, a0, %pcrel_lo(pcrel288) + addi s3, s9, %pcrel_lo(pcrel289) + addi s0, a1, %pcrel_lo(pcrel290) + fmv.s f18, f8 + lui a2, 24 + mv a0, zero +pcrel291: + auipc a1, %pcrel_hi(cmmc_parallel_body_0) + addiw s7, a2, 1696 + addi s8, a1, %pcrel_lo(pcrel291) + j label92 +.p2align 2 +label120: + addi a1, a1, 4 +.p2align 2 +label116: + sh2add a3, a2, s0 + flw f11, 0(a1) + addiw a2, a2, 1 + flw f12, 0(a3) + fmadd.s f10, f11, f12, f10 + bgt s2, a2, label120 + fadd.s f8, f8, f10 + addiw s11, s11, 1 + bge s11, s7, label103 +.p2align 2 +label92: + lui a4, 419430 + fmv.w.x f11, s6 + flw f12, 0(s5) + addiw a3, a4, 1639 + mul a1, s11, a3 + fadd.s f10, f9, f12 + srli a4, a1, 63 + srai a2, a1, 34 + fmv.s f9, f11 + add a3, a4, a2 + sh2add t0, a3, a3 + slliw a5, t0, 1 + subw a2, s11, a5 + sltu a1, zero, a2 + bne a1, zero, label254 + fmv.s f9, f10 +label254: + flw f12, 4(s5) + fmv.w.x f11, zero + fadd.s f10, f18, f12 + fmv.s f18, f11 + bne a1, zero, label256 + fmv.s f18, f10 +label256: + ble s2, a0, label148 +pcrel292: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) + sw a0, %pcrel_lo(pcrel292)(s9) + sd s4, 8(s3) + fsw f18, 16(s3) + sd s0, 24(s3) + fsw f9, 32(s3) + sw s2, 36(s3) + mv a1, s2 + mv a2, s8 + jal cmmcParallelFor + mv a0, s2 + ble s2, zero, label159 +.p2align 2 +label104: + ble s2, s10, label170 + fmv.w.x f10, zero + mv a1, s4 + mv a2, zero +.p2align 2 +label106: + sh2add a3, a2, s0 + flw f12, 0(a1) + addiw a2, a2, 4 + flw f14, 0(a3) + flw f13, 4(a1) + fmadd.s f11, f12, f14, f10 + flw f15, 4(a3) + flw f14, 8(a1) + flw f0, 8(a3) + flw f12, 12(a1) + fmadd.s f10, f13, f15, f11 + flw f13, 12(a3) + fmadd.s f11, f14, f0, f10 + fmadd.s f10, f12, f13, f11 + ble s1, a2, label202 + addi a1, a1, 16 + j label106 +.p2align 2 +label202: + fmv.s f11, f10 + ble s2, a2, label275 +.p2align 2 +label115: + sh2add a1, a2, s4 + fmv.s f10, f11 + j label116 +.p2align 2 +label159: + fmv.w.x f10, zero + fadd.s f8, f8, f10 + addiw s11, s11, 1 + blt s11, s7, label92 + j label103 +.p2align 2 +label170: + fmv.w.x f10, zero + mv a2, zero + fmv.s f11, f10 + bgt s2, zero, label115 + fadd.s f8, f8, f10 + addiw s11, s11, 1 + blt s11, s7, label92 + j label103 +.p2align 2 +label148: + bgt s2, zero, label104 + fmv.w.x f10, zero + addiw s11, s11, 1 + fadd.s f8, f8, f10 + blt s11, s7, label92 +label103: li a0, 39 jal _sysy_stoptime + fmv.s f10, f8 + jal putfloat mv a0, zero - jal putint ld ra, 0(sp) - mv a0, zero - addi sp, sp, 8 + ld s8, 8(sp) + ld s11, 16(sp) + flw f9, 24(sp) + flw f8, 28(sp) + ld s3, 32(sp) + flw f18, 40(sp) + ld s6, 48(sp) + ld s9, 56(sp) + ld s5, 64(sp) + ld s0, 72(sp) + ld s4, 80(sp) + ld s1, 88(sp) + ld s2, 96(sp) + ld s7, 104(sp) + ld s10, 112(sp) + addi sp, sp, 120 + ret +.p2align 2 +label275: + fadd.s f8, f8, f10 + addiw s11, s11, 1 + blt s11, s7, label92 + j label103 +.p2align 2 +cmmc_parallel_body_0: + mv a3, a0 + addiw t0, a0, 3 +pcrel90: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_0) + addiw a5, a1, -3 + addi a0, a2, %pcrel_lo(pcrel90) + ld a4, 8(a0) + flw f10, 16(a0) + ld a2, 24(a0) + flw f11, 32(a0) + ble a1, t0, label7 + sh2add a0, a3, a4 +.p2align 2 +label3: + fcvt.s.w f12, a3 + addiw t1, a3, 1 + sh2add t0, a3, a2 + fadd.s f13, f11, f12 + fadd.s f14, f10, f12 + fcvt.s.w f12, t1 + addiw t1, a3, 2 + fsw f14, 0(a0) + fadd.s f14, f11, f12 + fsw f13, 0(t0) + fadd.s f13, f10, f12 + fcvt.s.w f12, t1 + addiw t1, a3, 3 + addiw a3, a3, 4 + fsw f13, 4(a0) + fadd.s f13, f11, f12 + fsw f14, 4(t0) + fadd.s f14, f10, f12 + fcvt.s.w f12, t1 + fsw f14, 8(a0) + fadd.s f14, f11, f12 + fsw f13, 8(t0) + fadd.s f13, f10, f12 + fsw f13, 12(a0) + fsw f14, 12(t0) + ble a5, a3, label7 + addi a0, a0, 16 + j label3 +label7: + ble a1, a3, label14 + sh2add a0, a3, a4 +label10: + fcvt.s.w f12, a3 + sh2add a4, a3, a2 + fadd.s f13, f11, f12 + addiw a3, a3, 1 + fadd.s f14, f10, f12 + fsw f14, 0(a0) + fsw f13, 0(a4) + ble a1, a3, label14 + addi a0, a0, 4 + j label10 +label14: ret diff --git a/tests/SysY2022/performance/large_loop_array_3.sy b/tests/SysY2022/performance/large_loop_array_3.sy index 29019208..89764f90 100644 --- a/tests/SysY2022/performance/large_loop_array_3.sy +++ b/tests/SysY2022/performance/large_loop_array_3.sy @@ -37,13 +37,6 @@ int main() { i = i + 1; } stoptime(); - if ((total - 18338022147751936.000000) <=0.000001 || (total - 18338022147751936.000000) >= -0.000001) { - putint(0); - return 0; - } - else { - putint(1); - return 1; - } - + putfloat(total); + return 0; } diff --git a/tests/SysY2022/performance/large_loop_array_3.sy.ir b/tests/SysY2022/performance/large_loop_array_3.sy.ir index b35e04aa..35994f60 100644 --- a/tests/SysY2022/performance/large_loop_array_3.sy.ir +++ b/tests/SysY2022/performance/large_loop_array_3.sy.ir @@ -1,12 +1,185 @@ internal func @getint() -> i32 { NoMemoryRead NoMemoryWrite }; -internal func @putint(i32) -> void { NoMemoryRead NoMemoryWrite }; +internal func @putfloat(f32) -> void { NoMemoryRead NoMemoryWrite }; internal func @starttime(i32) -> void { NoMemoryRead NoMemoryWrite }; internal func @stoptime(i32) -> void { NoMemoryRead NoMemoryWrite }; -func @main() -> i32 { NoMemoryRead NoMemoryWrite NoRecurse Entry } { +func @main() -> i32 { NoMemoryWrite NoRecurse Entry } { ^entry: i32 %0 = call () -> i32 @getint(); call (i32) -> void @starttime(i32 22); + i1 %1 = icmp sgt i32 %0, i32 0; + i1 %2 = icmp sgt i32 %0, i32 3; + i32 %3 = add i32 %0, i32 -3; + [8192 * f32]* %4 = ptrcast [8192 * f32]* @x to [8192 * f32]*; + f32* %5 = getelementptr &([8192 * f32]* %4)[i64 0][i64 0]; + [8192 * f32]* %6 = ptrcast [8192 * f32]* @y to [8192 * f32]*; + f32* %7 = getelementptr &([8192 * f32]* %6)[i64 0][i64 0]; + [40 * i8]* %8 = ptrcast [40 * i8]* @cmmc_parallel_body_payload_0 to [40 * i8]*; + i32* %9 = ptradd [40 * i8]* %8, i32 0; + [8192 * f32]** %10 = ptradd [40 * i8]* %8, i32 8; + f32* %11 = ptradd [40 * i8]* %8, i32 16; + [8192 * f32]** %12 = ptradd [40 * i8]* %8, i32 24; + f32* %13 = ptradd [40 * i8]* %8, i32 32; + i32* %14 = ptradd [40 * i8]* %8, i32 36; + i8* %15 = functionptr () -> void @cmmc_parallel_body_0 as i8*; + ubr ^while.body; + ^while.body: + i32 %16 = phi [^entry, i32 0] [^b2, i32 %31]; + f32 %17 = phi [^entry, f32 0] [^b2, f32 %26]; + f32 %18 = phi [^entry, f32 1] [^b2, f32 %24]; + i32 %19 = phi [^entry, i32 0] [^b2, i32 %28]; + f32 %20 = phi [^entry, f32 0] [^b2, f32 %30]; + i32 %21 = srem i32 %16, i32 10; + i1 %22 = icmp neq i32 %21, i32 0; + f32 %23 = fadd f32 %18, f32 0.2; + f32 %24 = select i1 %22 ? f32 1 : f32 %23; + f32 %25 = fadd f32 %17, f32 0.1; + f32 %26 = select i1 %22 ? f32 0 : f32 %25; + i1 %27 = icmp sgt i32 %0, i32 %19; + cbr i1 %27(prob = 0.984615), ^b, ^b1; + ^b: + store i32* %9 with i32 %19; + store [8192 * f32]** %10 with [8192 * f32]* %4; + store f32* %11 with f32 %26; + store [8192 * f32]** %12 with [8192 * f32]* %6; + store f32* %13 with f32 %24; + store i32* %14 with i32 %0; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 %19, i32 %0, i8* %15); + ubr ^b1; + ^b1: + i32 %28 = phi [^while.body, i32 %19] [^b, i32 %0]; + cbr i1 %1(prob = 0.984615), ^super.header, ^b2; + ^super.header: + cbr i1 %2(prob = 0.941176), ^while.body1, ^scalar.header; + ^b2: + f32 %29 = phi [^b1, f32 0] [^scalar.header, f32 %63] [^while.body2, f32 %72]; + f32 %30 = fadd f32 %20, f32 %29; + i32 %31 = add i32 %16, i32 1; + i1 %32 = icmp slt i32 %31, i32 100000; + cbr i1 %32(prob = 0.99999), ^while.body, ^b3; + ^while.body1: + f32 %33 = phi [^super.header, f32 0] [^while.body1, f32 %58]; + i32 %34 = phi [^super.header, i32 0] [^while.body1, i32 %59]; + f32* %35 = getelementptr &(f32* %5)[i32 %34]; + f32 %36 = load f32* %35; + f32* %37 = getelementptr &(f32* %7)[i32 %34]; + f32 %38 = load f32* %37; + f32 %39 = fmul f32 %36, f32 %38; + f32 %40 = fadd f32 %33, f32 %39; + f32* %41 = getelementptr &(f32* %35)[i64 1]; + f32 %42 = load f32* %41; + f32* %43 = getelementptr &(f32* %37)[i64 1]; + f32 %44 = load f32* %43; + f32 %45 = fmul f32 %42, f32 %44; + f32 %46 = fadd f32 %40, f32 %45; + f32* %47 = getelementptr &(f32* %35)[i64 2]; + f32 %48 = load f32* %47; + f32* %49 = getelementptr &(f32* %37)[i64 2]; + f32 %50 = load f32* %49; + f32 %51 = fmul f32 %48, f32 %50; + f32 %52 = fadd f32 %46, f32 %51; + f32* %53 = getelementptr &(f32* %35)[i64 3]; + f32 %54 = load f32* %53; + f32* %55 = getelementptr &(f32* %37)[i64 3]; + f32 %56 = load f32* %55; + f32 %57 = fmul f32 %54, f32 %56; + f32 %58 = fadd f32 %52, f32 %57; + i32 %59 = add i32 %34, i32 4; + i1 %60 = icmp sgt i32 %3, i32 %59; + cbr i1 %60(prob = 0.941176), ^while.body1, ^scalar.header; + ^scalar.header: + f32 %61 = phi [^super.header, f32 0] [^while.body1, f32 %58]; + i32 %62 = phi [^super.header, i32 0] [^while.body1, i32 %59]; + f32 %63 = phi [^super.header, f32 undef] [^while.body1, f32 %58]; + i1 %64 = icmp sgt i32 %0, i32 %62; + cbr i1 %64(prob = 0.75), ^while.body2, ^b2; + ^b3: call (i32) -> void @stoptime(i32 39); - call (i32) -> void @putint(i32 0); + call (f32) -> void @putfloat(f32 %30); ret i32 0; + ^while.body2 {scalar}: + i32 %65 = phi [^scalar.header, i32 %62] [^while.body2, i32 %73]; + f32 %66 = phi [^scalar.header, f32 %61] [^while.body2, f32 %72]; + f32* %67 = getelementptr &(f32* %5)[i32 %65]; + f32 %68 = load f32* %67; + f32* %69 = getelementptr &(f32* %7)[i32 %65]; + f32 %70 = load f32* %69; + f32 %71 = fmul f32 %68, f32 %70; + f32 %72 = fadd f32 %66, f32 %71; + i32 %73 = add i32 %65, i32 1; + i1 %74 = icmp sgt i32 %0, i32 %73; + cbr i1 %74(prob = 0.75), ^while.body2, ^b2; } +internal func @cmmcParallelFor(i32, i32, i8*) -> void { NoRecurse }; +internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse ParallelBody } { + ^b: + i32 %2 = add i32 %0, i32 3; + i1 %3 = icmp sgt i32 %1, i32 %2; + [40 * i8]* %4 = ptrcast [40 * i8]* @cmmc_parallel_body_payload_0 to [40 * i8]*; + [8192 * f32]** %5 = ptradd [40 * i8]* %4, i32 8; + [8192 * f32]* %6 = load [8192 * f32]** %5; + f32* %7 = ptradd [40 * i8]* %4, i32 16; + f32 %8 = load f32* %7; + [8192 * f32]** %9 = ptradd [40 * i8]* %4, i32 24; + [8192 * f32]* %10 = load [8192 * f32]** %9; + f32* %11 = ptradd [40 * i8]* %4, i32 32; + f32 %12 = load f32* %11; + i32 %13 = add i32 %1, i32 -3; + cbr i1 %3(prob = 0.941176), ^b1, ^scalar.header; + ^b1: + i32 %14 = phi [^b, i32 %0] [^b1, i32 %38]; + f32 %15 = s2f i32 %14 to f32; + f32 %16 = fadd f32 %8, f32 %15; + f32* %17 = getelementptr &([8192 * f32]* %6)[i64 0][i32 %14]; + store f32* %17 with f32 %16; + f32 %18 = fadd f32 %12, f32 %15; + f32* %19 = getelementptr &([8192 * f32]* %10)[i64 0][i32 %14]; + store f32* %19 with f32 %18; + i32 %20 = add i32 %14, i32 1; + f32 %21 = s2f i32 %20 to f32; + f32 %22 = fadd f32 %8, f32 %21; + f32* %23 = getelementptr &(f32* %17)[i64 1]; + store f32* %23 with f32 %22; + f32 %24 = fadd f32 %12, f32 %21; + f32* %25 = getelementptr &(f32* %19)[i64 1]; + store f32* %25 with f32 %24; + i32 %26 = add i32 %14, i32 2; + f32 %27 = s2f i32 %26 to f32; + f32 %28 = fadd f32 %8, f32 %27; + f32* %29 = getelementptr &(f32* %17)[i64 2]; + store f32* %29 with f32 %28; + f32 %30 = fadd f32 %12, f32 %27; + f32* %31 = getelementptr &(f32* %19)[i64 2]; + store f32* %31 with f32 %30; + i32 %32 = add i32 %14, i32 3; + f32 %33 = s2f i32 %32 to f32; + f32 %34 = fadd f32 %8, f32 %33; + f32* %35 = getelementptr &(f32* %17)[i64 3]; + store f32* %35 with f32 %34; + f32 %36 = fadd f32 %12, f32 %33; + f32* %37 = getelementptr &(f32* %19)[i64 3]; + store f32* %37 with f32 %36; + i32 %38 = add i32 %14, i32 4; + i1 %39 = icmp sgt i32 %13, i32 %38; + cbr i1 %39(prob = 0.941176), ^b1, ^scalar.header; + ^scalar.header: + i32 %40 = phi [^b, i32 %0] [^b1, i32 %38]; + i1 %41 = icmp sgt i32 %1, i32 %40; + cbr i1 %41(prob = 0.75), ^b2, ^scalar.final; + ^b2 {scalar}: + i32 %42 = phi [^scalar.header, i32 %40] [^b2, i32 %48]; + f32 %43 = s2f i32 %42 to f32; + f32 %44 = fadd f32 %8, f32 %43; + f32* %45 = getelementptr &([8192 * f32]* %6)[i64 0][i32 %42]; + store f32* %45 with f32 %44; + f32 %46 = fadd f32 %12, f32 %43; + f32* %47 = getelementptr &([8192 * f32]* %10)[i64 0][i32 %42]; + store f32* %47 with f32 %46; + i32 %48 = add i32 %42, i32 1; + i1 %49 = icmp sgt i32 %1, i32 %48; + cbr i1 %49(prob = 0.75), ^b2, ^scalar.final; + ^scalar.final: + ret; +} +internal [40 * i8]* @cmmc_parallel_body_payload_0, align 8; +internal [8192 * f32]* @y, align 8; +internal [8192 * f32]* @x, align 8; diff --git a/tests/SysY2022/performance/vector_mul2.riscv.s b/tests/SysY2022/performance/vector_mul2.riscv.s index d4711e6a..589e4289 100644 --- a/tests/SysY2022/performance/vector_mul2.riscv.s +++ b/tests/SysY2022/performance/vector_mul2.riscv.s @@ -29,25 +29,25 @@ main: sd s0, 16(sp) jal _sysy_starttime lui a3, 24 -pcrel1015: +pcrel1018: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel1016: +pcrel1019: auipc a1, %pcrel_hi(vectorA) - addi s1, a1, %pcrel_lo(pcrel1016) -pcrel1017: + addi s1, a1, %pcrel_lo(pcrel1019) +pcrel1020: auipc a1, %pcrel_hi(cmmc_parallel_body_0) - sd s1, %pcrel_lo(pcrel1015)(a0) - addi a2, a1, %pcrel_lo(pcrel1017) + sd s1, %pcrel_lo(pcrel1018)(a0) + addi a2, a1, %pcrel_lo(pcrel1020) mv a0, zero addiw a1, a3, 1696 jal cmmcParallelFor -pcrel1018: +pcrel1021: auipc a1, %pcrel_hi(Vectortm) mv a4, zero -pcrel1019: +pcrel1022: auipc a0, %pcrel_hi(vectorB) - addi a3, a1, %pcrel_lo(pcrel1018) - addi s0, a0, %pcrel_lo(pcrel1019) + addi a3, a1, %pcrel_lo(pcrel1021) + addi s0, a0, %pcrel_lo(pcrel1022) j label111 .p2align 2 label201: @@ -57,101 +57,231 @@ label201: .p2align 2 label111: fmv.w.x f10, zero - mv a1, a3 + mv a2, a3 mv a5, zero mv a0, zero - j label113 -.p2align 2 -label118: - fsw f10, 0(a1) - lui a5, 24 - addiw a2, a5, 1696 - bge t0, a2, label121 - addi a1, a1, 4 - mv a5, t0 .p2align 2 label113: addiw t0, a5, 1 lui t1, 24 - addiw a2, t1, 1696 - bge a0, a2, label118 - addiw a2, a0, 3 + addiw a1, t1, 1696 + bge a0, a1, label118 + addiw a1, a0, 3 lui t2, 24 addiw t1, t2, 1696 - bge a2, t1, label752 - sh2add a2, a0, s1 + bge a1, t1, label752 + sh2add a1, a0, s1 addiw t1, a5, 2 addiw t2, a5, 3 addiw t3, a5, 4 j label234 .p2align 2 +label752: + fmv.w.x f12, zero + fmv.s f11, f10 + fmv.s f10, f12 +.p2align 2 +label223: + lui t1, 24 + addiw a1, t1, 1696 + bge a0, a1, label757 + sh2add a1, a0, s1 +.p2align 2 +label228: + addw t2, a5, a0 + addw t4, t0, a0 + mulw t3, t2, t4 + mv t1, t3 + bge t3, zero, label978 + addiw t1, t3, 1 +label978: + sraiw t2, t1, 1 + addiw a0, a0, 1 + flw f14, 0(a1) + addw t3, t0, t2 + lui t2, 24 + fcvt.s.w f12, t3 + addiw t1, t2, 1696 + fdiv.s f13, f14, f12 + fadd.s f10, f11, f13 + bge a0, t1, label773 + addi a1, a1, 4 + fmv.s f11, f10 + j label228 +.p2align 2 +label981: + addiw t4, t4, 1 +.p2align 2 +label982: + sraiw a6, t4, 1 + flw f13, 0(a1) + addw t4, t1, a0 + addw t6, t0, a6 + fcvt.s.w f12, t6 + mulw t6, t5, t4 + fdiv.s f14, f13, f12 + mv t5, t6 + fadd.s f11, f10, f14 + bge t6, zero, label984 + addiw t5, t6, 1 +label984: + sraiw a6, t5, 1 + flw f14, 4(a1) + addw t5, t2, a0 + addw t6, t0, a6 + mulw a6, t4, t5 + fcvt.s.w f12, t6 + mv t6, a6 + fdiv.s f13, f14, f12 + fadd.s f10, f11, f13 + bge a6, zero, label986 + addiw t6, a6, 1 +label986: + sraiw t4, t6, 1 + flw f13, 8(a1) + addw t6, t3, a0 + addw a6, t0, t4 + fcvt.s.w f12, a6 + mulw a6, t5, t6 + fdiv.s f14, f13, f12 + mv t4, a6 + fadd.s f11, f10, f14 + bge a6, zero, label988 + addiw t4, a6, 1 +label988: + sraiw t5, t4, 1 + addiw a0, a0, 4 + flw f13, 12(a1) + addw t6, t0, t5 + lui t5, 24 + fcvt.s.w f12, t6 + addiw t4, t5, 1693 + fdiv.s f14, f13, f12 + fadd.s f10, f11, f14 + bge a0, t4, label820 + addi a1, a1, 16 +.p2align 2 +label234: + addw t6, a5, a0 + addw t5, t0, a0 + mulw a6, t6, t5 + mv t4, a6 + bge a6, zero, label982 + j label981 +label202: + li a0, 76 + jal _sysy_stoptime + mv a1, zero + mv a0, s1 + fmv.w.x f10, zero +.p2align 2 +label203: + sh2add a2, a1, s0 + flw f11, 0(a0) + lui a3, 24 + addiw a1, a1, 16 + flw f14, 0(a2) + flw f13, 4(a0) + fmadd.s f12, f11, f14, f10 + flw f0, 4(a2) + flw f14, 8(a0) + flw f15, 8(a2) + fmadd.s f11, f13, f0, f12 + flw f12, 12(a0) + flw f1, 12(a2) + flw f13, 16(a0) + flw f0, 16(a2) + fmadd.s f10, f14, f15, f11 + flw f14, 20(a0) + flw f15, 20(a2) + fmadd.s f11, f12, f1, f10 + fmadd.s f12, f13, f0, f11 + flw f11, 24(a0) + flw f0, 24(a2) + flw f13, 28(a0) + fmadd.s f10, f14, f15, f12 + flw f15, 28(a2) + flw f14, 32(a0) + fmadd.s f12, f11, f0, f10 + flw f0, 32(a2) + fmadd.s f11, f13, f15, f12 + flw f12, 36(a0) + flw f13, 36(a2) + fmadd.s f10, f14, f0, f11 + flw f14, 40(a0) + flw f15, 40(a2) + fmadd.s f11, f12, f13, f10 + flw f13, 44(a0) + flw f0, 44(a2) + fmadd.s f12, f14, f15, f11 + flw f11, 48(a0) + flw f15, 48(a2) + flw f14, 52(a0) + fmadd.s f10, f13, f0, f12 + flw f0, 52(a2) + flw f13, 56(a0) + fmadd.s f12, f11, f15, f10 + flw f15, 56(a2) + fmadd.s f11, f14, f0, f12 + flw f12, 60(a0) + flw f14, 60(a2) + addiw a2, a3, 1696 + fmadd.s f10, f13, f15, f11 + fmadd.s f11, f12, f14, f10 + bge a1, a2, label623 + addi a0, a0, 64 + fmv.s f10, f11 + j label203 +.p2align 2 +label757: + lui a5, 24 + addiw a0, a5, 1696 +.p2align 2 +label118: + fsw f10, 0(a2) + lui a5, 24 + addiw a1, a5, 1696 + bge t0, a1, label121 + addi a2, a2, 4 + mv a5, t0 + j label113 +.p2align 2 label121: fmv.w.x f10, zero - mv a1, s0 + mv a2, s0 mv t0, zero mv a0, zero + j label122 +.p2align 2 +label272: + lui t0, 24 + addiw a0, t0, 1696 +.p2align 2 +label144: + fsw f10, 0(a2) + lui t0, 24 + addiw a1, t0, 1696 + bge a5, a1, label343 + addi a2, a2, 4 + mv t0, a5 .p2align 2 label122: addiw a5, t0, 1 lui t1, 24 - addiw a2, t1, 1696 - bge a0, a2, label144 - addiw a2, a0, 3 + addiw a1, t1, 1696 + bge a0, a1, label144 + addiw a1, a0, 3 lui t2, 24 addiw t1, t2, 1696 - bge a2, t1, label267 - sh2add a2, a0, a3 + bge a1, t1, label267 + sh2add a1, a0, a3 addiw t1, t0, 2 addiw t2, t0, 3 addiw t3, t0, 4 j label139 .p2align 2 -label267: - fmv.w.x f11, zero -.p2align 2 -label128: - lui t1, 24 - addiw a2, t1, 1696 - bge a0, a2, label272 - sh2add a2, a0, a3 -.p2align 2 -label133: - addw t2, t0, a0 - addw t3, a5, a0 - mulw t4, t2, t3 - mv t1, t4 - bge t4, zero, label923 - addiw t1, t4, 1 -label923: - sraiw t2, t1, 1 - addiw a0, a0, 1 - flw f13, 0(a2) - addw t3, t2, a0 - lui t2, 24 - fcvt.s.w f11, t3 - addiw t1, t2, 1696 - fdiv.s f12, f13, f11 - fadd.s f10, f10, f12 - bge a0, t1, label288 - addi a2, a2, 4 - j label133 -.p2align 2 -label288: - lui a2, 24 - addiw a0, a2, 1696 -.p2align 2 -label144: - fsw f10, 0(a1) - lui t0, 24 - addiw a2, t0, 1696 - bge a5, a2, label343 - addi a1, a1, 4 - mv t0, a5 - j label122 -.p2align 2 label143: - addi a2, a2, 16 - fmv.s f10, f11 + addi a1, a1, 16 .p2align 2 label139: addw t6, t0, a0 @@ -163,7 +293,7 @@ label139: label927: sraiw t6, t4, 1 addiw a6, a0, 1 - flw f13, 0(a2) + flw f13, 0(a1) addw t4, t1, a0 addw a7, t6, a6 mulw t6, t5, t4 @@ -176,7 +306,7 @@ label927: label929: sraiw t6, t5, 1 addiw a7, a0, 2 - flw f14, 4(a2) + flw f14, 4(a1) addw t5, t2, a0 addw a6, t6, a7 fcvt.s.w f12, a6 @@ -189,7 +319,7 @@ label929: label931: sraiw t4, t6, 1 addiw a6, a0, 3 - flw f14, 8(a2) + flw f14, 8(a1) addw t6, t3, a0 addw a7, t4, a6 mulw a6, t5, t6 @@ -202,40 +332,54 @@ label931: label933: sraiw t5, t4, 1 addiw a0, a0, 4 - flw f12, 12(a2) + flw f13, 12(a1) addw t6, t5, a0 lui t5, 24 - fcvt.s.w f10, t6 + fcvt.s.w f12, t6 addiw t4, t5, 1693 - fdiv.s f13, f12, f10 - fadd.s f11, f11, f13 + fdiv.s f14, f13, f12 + fadd.s f10, f11, f14 blt a0, t4, label143 - fmv.s f10, f11 - j label128 -.p2align 2 -label752: - fmv.w.x f12, zero fmv.s f11, f10 - fmv.s f10, f12 -.p2align 2 -label223: - lui t1, 24 - addiw a2, t1, 1696 - bge a0, a2, label757 - sh2add a2, a0, s1 - j label228 -.p2align 2 -label343: - fmv.w.x f10, zero - mv a2, a3 - mv a5, zero - mv a0, zero .p2align 2 -label149: - addiw t0, a5, 1 +label128: lui t1, 24 addiw a1, t1, 1696 - bge a0, a1, label171 + bge a0, a1, label272 + sh2add a1, a0, a3 +.p2align 2 +label133: + addw t2, t0, a0 + addw t3, a5, a0 + mulw t4, t2, t3 + mv t1, t4 + bge t4, zero, label923 + addiw t1, t4, 1 +label923: + sraiw t2, t1, 1 + addiw a0, a0, 1 + flw f13, 0(a1) + addw t3, t2, a0 + lui t2, 24 + fcvt.s.w f10, t3 + addiw t1, t2, 1696 + fdiv.s f12, f13, f10 + fadd.s f11, f11, f12 + bge a0, t1, label288 + addi a1, a1, 4 + j label133 +.p2align 2 +label343: + fmv.w.x f10, zero + mv a2, a3 + mv a5, zero + mv a0, zero +.p2align 2 +label149: + addiw t0, a5, 1 + lui t1, 24 + addiw a1, t1, 1696 + bge a0, a1, label171 addiw a1, a0, 3 lui t2, 24 addiw t1, t2, 1696 @@ -244,7 +388,6 @@ label149: addiw t1, a5, 2 addiw t2, a5, 3 addiw t3, a5, 4 - fmv.s f11, f10 j label156 .p2align 2 label160: @@ -265,375 +408,95 @@ label939: mulw t6, t5, t4 fcvt.s.w f12, a6 mv t5, t6 - fdiv.s f14, f13, f12 - fadd.s f10, f11, f14 - bge t6, zero, label941 - addiw t5, t6, 1 -label941: - sraiw a6, t5, 1 - flw f13, 4(a1) - addw t5, t2, a0 - addw t6, t0, a6 - mulw a6, t4, t5 - fcvt.s.w f12, t6 - mv t6, a6 - fdiv.s f14, f13, f12 - fadd.s f11, f10, f14 - bge a6, zero, label943 - addiw t6, a6, 1 -label943: - sraiw t4, t6, 1 - flw f14, 8(a1) - addw a6, t0, t4 - fcvt.s.w f12, a6 - addw a6, t3, a0 - fdiv.s f13, f14, f12 - mulw t6, t5, a6 - mv t4, t6 - fadd.s f10, f11, f13 - bge t6, zero, label945 - addiw t4, t6, 1 -label945: - sraiw t5, t4, 1 - addiw a0, a0, 4 - flw f13, 12(a1) - addw t6, t0, t5 - lui t5, 24 - fcvt.s.w f12, t6 - addiw t4, t5, 1693 - fdiv.s f14, f13, f12 - fadd.s f11, f10, f14 - blt a0, t4, label160 - fmv.s f10, f11 -.p2align 2 -label161: - lui t1, 24 - addiw a1, t1, 1696 - bge a0, a1, label407 - sh2add a1, a0, s0 -.p2align 2 -label166: - addw t2, a5, a0 - addw t3, t0, a0 - mulw t4, t2, t3 - mv t1, t4 - bge t4, zero, label950 - addiw t1, t4, 1 -label950: - sraiw t2, t1, 1 - addiw a0, a0, 1 - flw f13, 0(a1) - addw t3, t0, t2 - lui t2, 24 - fcvt.s.w f11, t3 - addiw t1, t2, 1696 - fdiv.s f12, f13, f11 - fadd.s f10, f10, f12 - bge a0, t1, label423 - addi a1, a1, 4 - j label166 -.p2align 2 -label407: - lui a5, 24 - fmv.s f10, f11 - addiw a0, a5, 1696 -.p2align 2 -label171: - fsw f10, 0(a2) - lui a5, 24 - addiw a1, a5, 1696 - bge t0, a1, label174 - addi a2, a2, 4 - mv a5, t0 - j label149 -.p2align 2 -label238: - addi a2, a2, 16 -.p2align 2 -label234: - addw t6, a5, a0 - addw t5, t0, a0 - mulw a6, t6, t5 - mv t4, a6 - bge a6, zero, label982 - addiw t4, a6, 1 -label982: - sraiw a6, t4, 1 - flw f13, 0(a2) - addw t4, t1, a0 - addw t6, t0, a6 - fcvt.s.w f12, t6 - mulw t6, t5, t4 - fdiv.s f14, f13, f12 - mv t5, t6 - fadd.s f11, f10, f14 - bge t6, zero, label984 - addiw t5, t6, 1 -label984: - sraiw a6, t5, 1 - flw f14, 4(a2) - addw t5, t2, a0 - addw t6, t0, a6 - mulw a6, t4, t5 - fcvt.s.w f12, t6 - mv t6, a6 - fdiv.s f13, f14, f12 - fadd.s f10, f11, f13 - bge a6, zero, label986 - addiw t6, a6, 1 -label986: - sraiw t4, t6, 1 - flw f13, 8(a2) - addw t6, t3, a0 - addw a6, t0, t4 - fcvt.s.w f12, a6 - mulw a6, t5, t6 - fdiv.s f14, f13, f12 - mv t4, a6 - fadd.s f11, f10, f14 - bge a6, zero, label988 - addiw t4, a6, 1 -label988: - sraiw t5, t4, 1 - addiw a0, a0, 4 - flw f13, 12(a2) - addw t6, t0, t5 - lui t5, 24 - fcvt.s.w f12, t6 - addiw t4, t5, 1693 - fdiv.s f14, f13, f12 - fadd.s f10, f11, f14 - blt a0, t4, label238 - fmv.s f11, f10 - j label223 -.p2align 2 -label174: - fmv.w.x f10, zero - mv a2, s1 - mv t0, zero - mv a0, zero - j label175 -.p2align 2 -label197: - fsw f10, 0(a2) - lui t0, 24 - addiw a1, t0, 1696 - bge a5, a1, label201 - addi a2, a2, 4 - mv t0, a5 -.p2align 2 -label175: - addiw a5, t0, 1 - lui t1, 24 - addiw a1, t1, 1696 - bge a0, a1, label197 - addiw a1, a0, 3 - lui t2, 24 - addiw t1, t2, 1696 - bge a1, t1, label439 - sh2add a1, a0, a3 - addiw t1, t0, 2 - addiw t2, t0, 3 - addiw t3, t0, 4 - j label182 -.p2align 2 -label439: - fmv.w.x f12, zero - fmv.s f11, f10 - fmv.s f10, f12 -.p2align 2 -label187: - lui t1, 24 - addiw a1, t1, 1696 - bge a0, a1, label494 - sh2add a1, a0, a3 -.p2align 2 -label192: - addw t2, t0, a0 - addw t3, a5, a0 - mulw t4, t2, t3 - mv t1, t4 - bge t4, zero, label968 - addiw t1, t4, 1 -label968: - sraiw t2, t1, 1 - addiw a0, a0, 1 - flw f13, 0(a1) - addw t3, t2, a0 - lui t2, 24 - fcvt.s.w f10, t3 - addiw t1, t2, 1696 - fdiv.s f12, f13, f10 - fadd.s f11, f11, f12 - bge a0, t1, label510 - addi a1, a1, 4 - j label192 -.p2align 2 -label186: - addi a1, a1, 16 -.p2align 2 -label182: - addw t6, t0, a0 - addw t4, a5, a0 - mulw a6, t6, t4 - mv t5, a6 - bge a6, zero, label957 - addiw t5, a6, 1 -label957: - sraiw t6, t5, 1 - addiw a6, a0, 1 - flw f14, 0(a1) - addw t5, t1, a0 - addw a7, t6, a6 - mulw a6, t4, t5 - fcvt.s.w f12, a7 - mv t6, a6 - fdiv.s f13, f14, f12 - fadd.s f11, f10, f13 - bge a6, zero, label959 - addiw t6, a6, 1 -label959: - sraiw t4, t6, 1 - addiw a6, a0, 2 - flw f14, 4(a1) - addw a7, t4, a6 - addw t4, t2, a0 - fcvt.s.w f12, a7 - mulw t6, t5, t4 - fdiv.s f13, f14, f12 - mv t5, t6 - fadd.s f10, f11, f13 - bge t6, zero, label961 - addiw t5, t6, 1 -label961: - sraiw t6, t5, 1 - addiw a7, a0, 3 - flw f14, 8(a1) - addw t5, t3, a0 - addw a6, t6, a7 - mulw t6, t4, t5 - fcvt.s.w f12, a6 - mv t4, t6 - fdiv.s f13, f14, f12 - fadd.s f11, f10, f13 - bge t6, zero, label963 - addiw t4, t6, 1 -label963: - sraiw t5, t4, 1 - addiw a0, a0, 4 - flw f13, 12(a1) - addw t6, t5, a0 - lui t5, 24 - fcvt.s.w f12, t6 - addiw t4, t5, 1693 - fdiv.s f14, f13, f12 - fadd.s f10, f11, f14 - blt a0, t4, label186 - fmv.s f11, f10 - j label187 -label202: - li a0, 76 - jal _sysy_stoptime - mv a1, zero - mv a0, s1 - fmv.w.x f10, zero -.p2align 2 -label203: - sh2add a2, a1, s0 - flw f11, 0(a0) - lui a3, 24 - addiw a1, a1, 16 - flw f14, 0(a2) - flw f13, 4(a0) - fmadd.s f12, f11, f14, f10 - flw f0, 4(a2) - flw f14, 8(a0) - flw f15, 8(a2) - fmadd.s f11, f13, f0, f12 - flw f12, 12(a0) - flw f1, 12(a2) - flw f13, 16(a0) - flw f0, 16(a2) - fmadd.s f10, f14, f15, f11 - flw f14, 20(a0) - flw f15, 20(a2) - fmadd.s f11, f12, f1, f10 - fmadd.s f12, f13, f0, f11 - flw f11, 24(a0) - flw f0, 24(a2) - flw f13, 28(a0) - fmadd.s f10, f14, f15, f12 - flw f15, 28(a2) - flw f14, 32(a0) - fmadd.s f12, f11, f0, f10 - flw f0, 32(a2) - fmadd.s f11, f13, f15, f12 - flw f12, 36(a0) - flw f13, 36(a2) - fmadd.s f10, f14, f0, f11 - flw f14, 40(a0) - flw f15, 40(a2) - fmadd.s f11, f12, f13, f10 - flw f13, 44(a0) - flw f0, 44(a2) - fmadd.s f12, f14, f15, f11 - flw f11, 48(a0) - flw f15, 48(a2) - flw f14, 52(a0) - fmadd.s f10, f13, f0, f12 - flw f0, 52(a2) - flw f13, 56(a0) - fmadd.s f12, f11, f15, f10 - flw f15, 56(a2) - fmadd.s f11, f14, f0, f12 - flw f12, 60(a0) - flw f14, 60(a2) - addiw a2, a3, 1696 - fmadd.s f10, f13, f15, f11 - fmadd.s f11, f12, f14, f10 - bge a1, a2, label623 - addi a0, a0, 64 - fmv.s f10, f11 - j label203 + fdiv.s f14, f13, f12 + fadd.s f11, f10, f14 + bge t6, zero, label941 + addiw t5, t6, 1 +label941: + sraiw a6, t5, 1 + flw f13, 4(a1) + addw t5, t2, a0 + addw t6, t0, a6 + mulw a6, t4, t5 + fcvt.s.w f12, t6 + mv t6, a6 + fdiv.s f14, f13, f12 + fadd.s f10, f11, f14 + bge a6, zero, label943 + addiw t6, a6, 1 +label943: + sraiw t4, t6, 1 + flw f14, 8(a1) + addw a6, t0, t4 + fcvt.s.w f12, a6 + addw a6, t3, a0 + fdiv.s f13, f14, f12 + mulw t6, t5, a6 + mv t4, t6 + fadd.s f11, f10, f13 + bge t6, zero, label945 + addiw t4, t6, 1 +label945: + sraiw t5, t4, 1 + addiw a0, a0, 4 + flw f13, 12(a1) + addw t6, t0, t5 + lui t5, 24 + fcvt.s.w f12, t6 + addiw t4, t5, 1693 + fdiv.s f14, f13, f12 + fadd.s f10, f11, f14 + blt a0, t4, label160 + fmv.s f11, f10 +.p2align 2 +label161: + lui t1, 24 + addiw a1, t1, 1696 + bge a0, a1, label407 + sh2add a1, a0, s0 + j label166 +.p2align 2 +label407: + lui a5, 24 + addiw a0, a5, 1696 .p2align 2 -label232: +label171: + fsw f10, 0(a2) + lui a5, 24 + addiw a1, a5, 1696 + bge t0, a1, label174 addi a2, a2, 4 - fmv.s f11, f10 + mv a5, t0 + j label149 .p2align 2 -label228: +label170: + addi a1, a1, 4 +.p2align 2 +label166: addw t2, a5, a0 - addw t4, t0, a0 - mulw t3, t2, t4 - mv t1, t3 - bge t3, zero, label978 - addiw t1, t3, 1 -label978: + addw t3, t0, a0 + mulw t4, t2, t3 + mv t1, t4 + bge t4, zero, label950 + addiw t1, t4, 1 +label950: sraiw t2, t1, 1 addiw a0, a0, 1 - flw f14, 0(a2) + flw f13, 0(a1) addw t3, t0, t2 lui t2, 24 - fcvt.s.w f12, t3 + fcvt.s.w f10, t3 addiw t1, t2, 1696 - fdiv.s f13, f14, f12 - fadd.s f10, f11, f13 - blt a0, t1, label232 - lui a2, 24 - addiw a0, a2, 1696 - j label118 -.p2align 2 -label510: + fdiv.s f12, f13, f10 + fadd.s f11, f11, f12 + blt a0, t1, label170 lui a1, 24 fmv.s f10, f11 addiw a0, a1, 1696 - j label197 + j label171 label623: fmv.w.x f10, zero mv a0, zero - j label209 -.p2align 2 -label219: - addi s0, s0, 64 .p2align 2 label209: flw f15, 0(s0) @@ -671,13 +534,16 @@ label209: flw f14, 60(s0) fmadd.s f13, f15, f15, f12 fmadd.s f10, f14, f14, f13 - blt a0, a1, label219 + bge a0, a1, label213 + addi s0, s0, 64 + j label209 +label213: fdiv.s f12, f11, f10 lui a0, 260096 -pcrel1020: +pcrel1023: auipc a2, %pcrel_hi(__cmmc_fp_constant_pool) fmv.w.x f13, a0 - addi a0, a2, %pcrel_lo(pcrel1020) + addi a0, a2, %pcrel_lo(pcrel1023) fsub.s f10, f13, f12 flw f13, 0(a0) flw f11, 4(a0) @@ -692,10 +558,10 @@ pcrel1020: label214: fadd.s f14, f10, f11 lui a0, 258048 -pcrel1021: +pcrel1024: auipc a2, %pcrel_hi(__cmmc_fp_constant_pool) fmv.w.x f15, a0 - addi a0, a2, %pcrel_lo(pcrel1021) + addi a0, a2, %pcrel_lo(pcrel1024) fmul.s f10, f14, f15 flw f14, 0(a0) flw f15, 4(a0) @@ -705,16 +571,12 @@ pcrel1021: flt.s a1, f14, f13 or a3, a1, a2 bne a3, zero, label214 - j label217 -label709: - lui a0, 260096 - fmv.w.x f10, a0 label217: lui a0, 260096 -pcrel1022: +pcrel1025: auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) fmv.w.x f12, a0 - addi a1, a3, %pcrel_lo(pcrel1022) + addi a1, a3, %pcrel_lo(pcrel1025) fsub.s f11, f10, f12 flw f12, 0(a1) flw f10, 4(a1) @@ -731,30 +593,171 @@ pcrel1022: addi sp, sp, 24 ret .p2align 2 -label272: +label174: + fmv.w.x f10, zero + mv a2, s1 + mv t0, zero + mv a0, zero + j label175 +.p2align 2 +label197: + fsw f10, 0(a2) lui t0, 24 - fmv.s f10, f11 - addiw a0, t0, 1696 - j label144 + addiw a1, t0, 1696 + bge a5, a1, label201 + addi a2, a2, 4 + mv t0, a5 .p2align 2 -label757: - lui a5, 24 - addiw a0, a5, 1696 - j label118 +label175: + addiw a5, t0, 1 + lui t1, 24 + addiw a1, t1, 1696 + bge a0, a1, label197 + addiw a1, a0, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge a1, t1, label439 + sh2add a1, a0, a3 + addiw t1, t0, 2 + addiw t2, t0, 3 + addiw t3, t0, 4 + fmv.s f11, f10 +.p2align 2 +label182: + addw t6, t0, a0 + addw t4, a5, a0 + mulw a6, t6, t4 + mv t5, a6 + bge a6, zero, label957 + addiw t5, a6, 1 +label957: + sraiw t6, t5, 1 + addiw a6, a0, 1 + flw f14, 0(a1) + addw t5, t1, a0 + addw a7, t6, a6 + mulw a6, t4, t5 + fcvt.s.w f12, a7 + mv t6, a6 + fdiv.s f13, f14, f12 + fadd.s f10, f11, f13 + bge a6, zero, label959 + addiw t6, a6, 1 +label959: + sraiw t4, t6, 1 + addiw a6, a0, 2 + flw f14, 4(a1) + addw a7, t4, a6 + addw t4, t2, a0 + fcvt.s.w f12, a7 + mulw t6, t5, t4 + fdiv.s f13, f14, f12 + mv t5, t6 + fadd.s f11, f10, f13 + bge t6, zero, label961 + addiw t5, t6, 1 +label961: + sraiw t6, t5, 1 + addiw a7, a0, 3 + flw f14, 8(a1) + addw t5, t3, a0 + addw a6, t6, a7 + mulw t6, t4, t5 + fcvt.s.w f12, a6 + mv t4, t6 + fdiv.s f13, f14, f12 + fadd.s f10, f11, f13 + bge t6, zero, label963 + addiw t4, t6, 1 +label963: + sraiw t5, t4, 1 + addiw a0, a0, 4 + flw f13, 12(a1) + addw t6, t5, a0 + lui t5, 24 + fcvt.s.w f12, t6 + addiw t4, t5, 1693 + fdiv.s f14, f13, f12 + fadd.s f11, f10, f14 + bge a0, t4, label489 + addi a1, a1, 16 + j label182 +.p2align 2 +label439: + fmv.w.x f11, zero +.p2align 2 +label187: + lui t1, 24 + addiw a1, t1, 1696 + bge a0, a1, label494 + sh2add a1, a0, a3 + j label192 +.p2align 2 +label196: + addi a1, a1, 4 +.p2align 2 +label192: + addw t2, t0, a0 + addw t3, a5, a0 + mulw t4, t2, t3 + mv t1, t4 + bge t4, zero, label968 + addiw t1, t4, 1 +label968: + sraiw t2, t1, 1 + addiw a0, a0, 1 + flw f13, 0(a1) + addw t3, t2, a0 + lui t2, 24 + fcvt.s.w f11, t3 + addiw t1, t2, 1696 + fdiv.s f12, f13, f11 + fadd.s f10, f10, f12 + blt a0, t1, label196 + lui a1, 24 + addiw a0, a1, 1696 + j label197 +label709: + lui a0, 260096 + fmv.w.x f10, a0 + j label217 +.p2align 2 +label489: + fmv.s f10, f11 + j label187 .p2align 2 label494: lui t0, 24 + fmv.s f10, f11 addiw a0, t0, 1696 j label197 .p2align 2 +label820: + fmv.s f11, f10 + j label223 +.p2align 2 label355: - fmv.w.x f11, zero + fmv.w.x f12, zero + fmv.s f11, f10 + fmv.s f10, f12 j label161 .p2align 2 -label423: +label288: lui a1, 24 + fmv.s f10, f11 addiw a0, a1, 1696 - j label171 + j label144 +.p2align 2 +label773: + lui a1, 24 + addiw a0, a1, 1696 + j label118 +.p2align 2 +label267: + fmv.w.x f12, zero + fmv.s f11, f10 + fmv.s f10, f12 + j label128 .p2align 2 cmmc_parallel_body_0: mv a3, a0