From 8cc54d51e1f7ac41a5ff71e3b7ba40a7cf32a3bf Mon Sep 17 00:00:00 2001
From: Yingwei Zheng
Date: Thu, 17 Aug 2023 20:36:46 +0800
Subject: [PATCH] fix(misc): remove tuning

---
 cmmc/CodeGen/CodeGenUtils.cpp | 6 +-
 cmmc/Runtime/cmmc_sysy_rt.cpp | 12 +-
 cmmc/Support/Tune.cpp | 45 +-
 cmmc/Target/RISCV/RISCVScheduleModel.cpp | 4 +-
 cmmc/Target/RISCV/RISCVTarget.cpp | 2 +-
 cmmc/Transforms/IPO/Inlining.cpp | 2 +-
 cmmc/Transforms/IPO/ShrinkWrapping.cpp | 2 +-
 .../Misc/ImmutableScalarRefArg2Value.cpp | 2 +-
 tests/Regression/CodeGen/arithmetic.mips.s | 2 +-
 tests/Regression/CodeGen/arithmetic.riscv.s | 2 +-
 tests/Regression/CodeGen/call.arm.s | 2 +-
 tests/Regression/CodeGen/call.mips.s | 4 +-
 tests/Regression/CodeGen/call.riscv.s | 2 +-
 tests/Regression/CodeGen/constant.arm.s | 2 +-
 tests/Regression/CodeGen/constant.mips.s | 4 +-
 tests/Regression/CodeGen/constant.riscv.s | 2 +-
 .../llvmTests/2006-01-19-ISelFoldingBug.arm.s | 2 +-
 .../2006-01-19-ISelFoldingBug.mips.s | 2 +-
 .../2006-01-19-ISelFoldingBug.riscv.s | 2 +-
 .../llvmTests/2007-01-08-InstrSched.mips.s | 2 +-
 .../llvmTests/2007-08-10-SignExtSubreg.arm.s | 2 +-
 .../llvmTests/2007-08-10-SignExtSubreg.mips.s | 2 +-
 .../2007-08-10-SignExtSubreg.riscv.s | 2 +-
 .../llvmTests/2008-07-22-Cstpool.mips.s | 2 +-
 .../llvmTests/2008-07-22-Cstpool.riscv.s | 2 +-
 .../CodeGen/llvmTests/2009-04-24.arm.s | 2 +-
 .../CodeGen/llvmTests/2009-04-24.mips.s | 2 +-
 .../CodeGen/llvmTests/2009-04-24.riscv.s | 2 +-
 .../llvmTests/2009-11-16-CstPoolLoad.mips.s | 2 +-
 .../llvmTests/2009-11-16-CstPoolLoad.riscv.s | 2 +-
 .../2010-11-18-SelectOfExtload.arm.s | 2 -
 .../2010-11-18-SelectOfExtload.mips.s | 2 -
 .../2010-11-18-SelectOfExtload.riscv.s | 2 -
 .../CodeGen/llvmTests/atom-sched.arm.s | 12 +-
 .../CodeGen/llvmTests/atom-sched.mips.s | 12 +-
 .../CodeGen/llvmTests/atom-sched.riscv.s | 12 +-
 .../CodeGen/llvmTests/backpropmask.arm.s | 8 +-
 .../CodeGen/llvmTests/backpropmask.mips.s | 8 +-
 .../CodeGen/llvmTests/backpropmask.riscv.s | 8 +-
 .../Regression/CodeGen/llvmTests/beqzc.arm.s | 4 +-
 .../Regression/CodeGen/llvmTests/beqzc.mips.s | 4 +-
 .../CodeGen/llvmTests/beqzc.riscv.s | 4 +-
 .../Regression/CodeGen/llvmTests/beqzc1.arm.s | 4 +-
 .../CodeGen/llvmTests/beqzc1.mips.s | 4 +-
 .../CodeGen/llvmTests/beqzc1.riscv.s | 4 +-
 .../CodeGen/llvmTests/brconeq.arm.s | 6 +-
 .../CodeGen/llvmTests/brconeq.mips.s | 6 +-
 .../CodeGen/llvmTests/brconeq.riscv.s | 6 +-
 .../CodeGen/llvmTests/brconeqk.arm.s | 4 +-
 .../CodeGen/llvmTests/brconeqk.mips.s | 4 +-
 .../CodeGen/llvmTests/brconeqk.riscv.s | 4 +-
 .../CodeGen/llvmTests/brconeqz.arm.s | 4 +-
 .../CodeGen/llvmTests/brconeqz.mips.s | 4 +-
 .../CodeGen/llvmTests/brconeqz.riscv.s | 4 +-
 .../CodeGen/llvmTests/brconge.arm.s | 10 +-
 .../CodeGen/llvmTests/brconge.mips.s | 10 +-
 .../CodeGen/llvmTests/brconge.riscv.s | 10 +-
 .../CodeGen/llvmTests/brcongt.arm.s | 8 +-
 .../CodeGen/llvmTests/brcongt.mips.s | 8 +-
 .../CodeGen/llvmTests/brcongt.riscv.s | 8 +-
 .../CodeGen/llvmTests/brconle.arm.s | 10 +-
 .../CodeGen/llvmTests/brconle.mips.s | 10 +-
 .../CodeGen/llvmTests/brconle.riscv.s | 10 +-
 .../CodeGen/llvmTests/brconlt.arm.s | 8 +-
 .../CodeGen/llvmTests/brconlt.mips.s | 8 +-
 .../CodeGen/llvmTests/brconlt.riscv.s | 8 +-
 .../CodeGen/llvmTests/brconne.arm.s | 6 +-
 .../CodeGen/llvmTests/brconne.mips.s | 6 +-
 .../CodeGen/llvmTests/brconne.riscv.s | 6 +-
 .../CodeGen/llvmTests/brconnek.arm.s | 4 +-
 .../CodeGen/llvmTests/brconnek.mips.s | 4 +-
 .../CodeGen/llvmTests/brconnek.riscv.s | 4 +-
 .../CodeGen/llvmTests/brconnez.arm.s | 4 +-
.../CodeGen/llvmTests/brconnez.mips.s | 4 +- .../CodeGen/llvmTests/brconnez.riscv.s | 4 +- .../llvmTests/cmse-expand-bxns-ret..arm.s | 2 +- .../llvmTests/cmse-expand-bxns-ret..mips.s | 2 +- .../llvmTests/cmse-expand-bxns-ret..riscv.s | 2 +- .../llvmTests/codegen-prepare-crash.arm.s | 2 +- .../llvmTests/codegen-prepare-crash.mips.s | 2 +- .../llvmTests/codegen-prepare-crash.riscv.s | 2 +- .../CodeGen/llvmTests/coff-exclude.arm.s | 14 +- .../CodeGen/llvmTests/coff-exclude.mips.s | 14 +- .../CodeGen/llvmTests/coff-exclude.riscv.s | 14 +- .../llvmTests/dbg-value-superreg-copy2..arm.s | 8 +- .../dbg-value-superreg-copy2..mips.s | 8 +- .../dbg-value-superreg-copy2..riscv.s | 8 +- .../llvmTests/disable-tail-merge.arm.s | 4 +- .../llvmTests/disable-tail-merge.mips.s | 4 +- .../llvmTests/disable-tail-merge.riscv.s | 4 +- tests/Regression/CodeGen/llvmTests/div.arm.s | 6 +- tests/Regression/CodeGen/llvmTests/div.mips.s | 6 +- .../Regression/CodeGen/llvmTests/div.riscv.s | 6 +- .../CodeGen/llvmTests/div_rem.arm.s | 8 +- .../CodeGen/llvmTests/div_rem.mips.s | 8 +- .../CodeGen/llvmTests/div_rem.riscv.s | 8 +- tests/Regression/CodeGen/llvmTests/divu.arm.s | 6 +- .../Regression/CodeGen/llvmTests/divu.mips.s | 6 +- .../Regression/CodeGen/llvmTests/divu.riscv.s | 6 +- .../CodeGen/llvmTests/divu_remu.arm.s | 8 +- .../CodeGen/llvmTests/divu_remu.mips.s | 8 +- .../CodeGen/llvmTests/divu_remu.riscv.s | 8 +- .../CodeGen/llvmTests/elf-comdat.arm.s | 2 +- .../CodeGen/llvmTests/elf-comdat.mips.s | 2 +- .../CodeGen/llvmTests/elf-comdat.riscv.s | 2 +- .../CodeGen/llvmTests/elf-comdat2.arm.s | 4 +- .../CodeGen/llvmTests/elf-comdat2.mips.s | 4 +- .../CodeGen/llvmTests/elf-comdat2.riscv.s | 4 +- .../CodeGen/llvmTests/elf-exclude.arm.s | 14 +- .../CodeGen/llvmTests/elf-exclude.mips.s | 14 +- .../CodeGen/llvmTests/elf-exclude.riscv.s | 14 +- .../llvmTests/emergency-spill-slot.arm.s | 2 +- .../llvmTests/emergency-spill-slot.mips.s | 2 +- .../llvmTests/emergency-spill-slot.riscv.s | 2 +- .../CodeGen/llvmTests/extloadi1.arm.s | 2 +- .../CodeGen/llvmTests/extloadi1.mips.s | 2 +- .../CodeGen/llvmTests/extloadi1.riscv.s | 2 +- .../fast-isel-load-store-verify.arm.s | 5 +- .../fast-isel-load-store-verify.mips.s | 5 +- .../fast-isel-load-store-verify.riscv.s | 5 +- .../CodeGen/llvmTests/fast-isel-pic.arm.s | 4 +- .../CodeGen/llvmTests/fast-isel-pic.mips.s | 4 +- .../CodeGen/llvmTests/fast-isel-pic.riscv.s | 4 +- .../CodeGen/llvmTests/fast-isel-tls.arm.s | 2 +- .../CodeGen/llvmTests/fast-isel-tls.mips.s | 2 +- .../CodeGen/llvmTests/fast-isel-tls.riscv.s | 2 +- .../CodeGen/llvmTests/float-imm.mips.s | 2 +- .../CodeGen/llvmTests/float-imm.riscv.s | 2 +- .../llvmTests/float-select-icmp.mips.s | 2 +- .../CodeGen/llvmTests/fold-mul-lohi.arm.s | 6 +- .../CodeGen/llvmTests/fold-mul-lohi.mips.s | 6 +- .../CodeGen/llvmTests/fold-mul-lohi.riscv.s | 6 +- .../CodeGen/llvmTests/fp-fast.mips.s | 2 +- .../CodeGen/llvmTests/fp16static.arm.s | 2 +- .../CodeGen/llvmTests/fp16static.mips.s | 2 +- .../CodeGen/llvmTests/fp16static.riscv.s | 2 +- .../CodeGen/llvmTests/fpnotneeded.arm.s | 4 +- .../CodeGen/llvmTests/fpnotneeded.mips.s | 6 +- .../CodeGen/llvmTests/fpnotneeded.riscv.s | 4 +- .../CodeGen/llvmTests/global-address.arm.s | 4 +- .../CodeGen/llvmTests/global-address.mips.s | 4 +- .../CodeGen/llvmTests/global-address.riscv.s | 4 +- .../llvmTests/global-merge-dllexport.arm.s | 4 +- .../llvmTests/global-merge-dllexport.mips.s | 4 +- .../llvmTests/global-merge-dllexport.riscv.s | 4 +- .../CodeGen/llvmTests/hf1_body.arm.s | 2 +- 
.../CodeGen/llvmTests/hf1_body.mips.s | 2 +- .../CodeGen/llvmTests/hf1_body.riscv.s | 2 +- .../CodeGen/llvmTests/hidden-vis-2.arm.s | 2 +- .../CodeGen/llvmTests/hidden-vis-2.mips.s | 2 +- .../CodeGen/llvmTests/hidden-vis-2.riscv.s | 2 +- .../CodeGen/llvmTests/hidden-vis-3.arm.s | 4 +- .../CodeGen/llvmTests/hidden-vis-3.mips.s | 4 +- .../CodeGen/llvmTests/hidden-vis-3.riscv.s | 4 +- .../CodeGen/llvmTests/hidden-vis-4.arm.s | 2 +- .../CodeGen/llvmTests/hidden-vis-4.mips.s | 2 +- .../CodeGen/llvmTests/hidden-vis-4.riscv.s | 2 +- .../CodeGen/llvmTests/hidden-vis.arm.s | 4 +- .../CodeGen/llvmTests/hidden-vis.mips.s | 4 +- .../CodeGen/llvmTests/hidden-vis.riscv.s | 4 +- .../CodeGen/llvmTests/imm-cse.arm.s | 4 +- .../CodeGen/llvmTests/imm-cse.mips.s | 4 +- .../CodeGen/llvmTests/imm-cse.riscv.s | 4 +- .../indirect-branch-tracking-cm-lager.arm.s | 2 +- .../indirect-branch-tracking-cm-lager.mips.s | 2 +- .../indirect-branch-tracking-cm-lager.riscv.s | 2 +- .../llvmTests/insert-prefetch-other.a.arm.s | 2 +- .../llvmTests/insert-prefetch-other.a.mips.s | 2 +- .../llvmTests/insert-prefetch-other.a.riscv.s | 2 +- .../llvmTests/line-zero-prologue-end.arm.s | 2 +- .../llvmTests/line-zero-prologue-end.mips.s | 2 +- .../llvmTests/line-zero-prologue-end.riscv.s | 2 +- .../llvmTests/live-range-nosubreg.arm.s | 7 +- .../llvmTests/live-range-nosubreg.mips.s | 7 +- .../llvmTests/live-range-nosubreg.riscv.s | 7 +- tests/Regression/CodeGen/llvmTests/llvm.arm.s | 4 +- .../Regression/CodeGen/llvmTests/llvm.mips.s | 4 +- .../Regression/CodeGen/llvmTests/llvm.riscv.s | 4 +- .../llvmTests/loop-strength-reduce5.arm.s | 4 +- .../llvmTests/loop-strength-reduce5.mips.s | 4 +- .../llvmTests/loop-strength-reduce5.riscv.s | 4 +- .../CodeGen/llvmTests/lsr-sort.arm.s | 2 +- .../CodeGen/llvmTests/lsr-sort.mips.s | 2 +- .../CodeGen/llvmTests/lsr-sort.riscv.s | 2 +- .../machine-outliner-unsafe-registers..arm.s | 2 +- .../machine-outliner-unsafe-registers..mips.s | 2 +- ...machine-outliner-unsafe-registers..riscv.s | 2 +- .../machinelicm-address-pseudos.arm.s | 8 +- .../machinelicm-address-pseudos.mips.s | 8 +- .../machinelicm-address-pseudos.riscv.s | 8 +- .../CodeGen/llvmTests/machineverifier.arm.s | 2 +- .../CodeGen/llvmTests/machineverifier.mips.s | 2 +- .../CodeGen/llvmTests/machineverifier.riscv.s | 2 +- .../CodeGen/llvmTests/macho-comdat.arm.s | 2 +- .../CodeGen/llvmTests/macho-comdat.mips.s | 2 +- .../CodeGen/llvmTests/macho-comdat.riscv.s | 2 +- .../llvmTests/macho-extern-hidden.arm.s | 2 +- .../llvmTests/macho-extern-hidden.mips.s | 2 +- .../llvmTests/macho-extern-hidden.riscv.s | 2 +- .../CodeGen/llvmTests/micromips-li.arm.s | 6 +- .../CodeGen/llvmTests/micromips-li.mips.s | 6 +- .../CodeGen/llvmTests/micromips-li.riscv.s | 6 +- .../llvmTests/micromips-lwc1-swc1.arm.s | 2 +- .../llvmTests/micromips-lwc1-swc1.mips.s | 2 +- .../llvmTests/micromips-lwc1-swc1.riscv.s | 2 +- .../micromips-rdhwr-directives.arm.s | 2 +- .../micromips-rdhwr-directives.mips.s | 2 +- .../micromips-rdhwr-directives.riscv.s | 2 +- .../CodeGen/llvmTests/micromips-shift.arm.s | 16 +- .../CodeGen/llvmTests/micromips-shift.mips.s | 16 +- .../CodeGen/llvmTests/micromips-shift.riscv.s | 16 +- .../CodeGen/llvmTests/minsize-litpools.arm.s | 2 +- .../CodeGen/llvmTests/minsize-litpools.mips.s | 2 +- .../llvmTests/minsize-litpools.riscv.s | 2 +- .../llvmTests/nacl-reserved-regs.arm.s | 2 +- .../llvmTests/nacl-reserved-regs.mips.s | 2 +- .../llvmTests/nacl-reserved-regs.riscv.s | 2 +- .../CodeGen/llvmTests/overlap-shift.arm.s | 2 +- 
.../CodeGen/llvmTests/overlap-shift.mips.s | 2 +- .../CodeGen/llvmTests/overlap-shift.riscv.s | 2 +- .../CodeGen/llvmTests/pr15981.arm.s | 6 +- .../CodeGen/llvmTests/pr15981.mips.s | 6 +- .../CodeGen/llvmTests/pr15981.riscv.s | 6 +- .../Regression/CodeGen/llvmTests/pr3216.arm.s | 1 - .../CodeGen/llvmTests/pr3216.mips.s | 1 - .../CodeGen/llvmTests/pr3216.riscv.s | 1 - .../CodeGen/llvmTests/pr32256.arm.s | 1 - .../CodeGen/llvmTests/pr32256.mips.s | 1 - .../CodeGen/llvmTests/pr32256.riscv.s | 1 - .../CodeGen/llvmTests/pr32588.arm.s | 6 +- .../CodeGen/llvmTests/pr32588.mips.s | 6 +- .../CodeGen/llvmTests/pr32588.riscv.s | 6 +- .../CodeGen/llvmTests/pr34381.arm.s | 7 +- .../CodeGen/llvmTests/pr34381.mips.s | 7 +- .../CodeGen/llvmTests/pr34381.riscv.s | 7 +- .../CodeGen/llvmTests/pr35761.arm.s | 5 +- .../CodeGen/llvmTests/pr35761.mips.s | 5 +- .../CodeGen/llvmTests/pr35761.riscv.s | 5 +- .../CodeGen/llvmTests/pr58286.arm.s | 2 +- .../CodeGen/llvmTests/pr58286.mips.s | 2 +- .../CodeGen/llvmTests/pr58286.riscv.s | 2 +- .../CodeGen/llvmTests/private.arm.s | 2 +- .../CodeGen/llvmTests/private.mips.s | 2 +- .../CodeGen/llvmTests/private.riscv.s | 2 +- .../CodeGen/llvmTests/rdhwr-directives.arm.s | 2 +- .../CodeGen/llvmTests/rdhwr-directives.mips.s | 2 +- .../llvmTests/rdhwr-directives.riscv.s | 2 +- .../Regression/CodeGen/llvmTests/readtp.arm.s | 2 +- .../CodeGen/llvmTests/readtp.mips.s | 2 +- .../CodeGen/llvmTests/readtp.riscv.s | 2 +- .../CodeGen/llvmTests/return-ext.arm.s | 6 +- .../CodeGen/llvmTests/return-ext.mips.s | 6 +- .../CodeGen/llvmTests/return-ext.riscv.s | 6 +- .../Regression/CodeGen/llvmTests/sel1c.arm.s | 6 +- .../Regression/CodeGen/llvmTests/sel1c.mips.s | 6 +- .../CodeGen/llvmTests/sel1c.riscv.s | 6 +- .../Regression/CodeGen/llvmTests/sel2c.arm.s | 6 +- .../Regression/CodeGen/llvmTests/sel2c.mips.s | 6 +- .../CodeGen/llvmTests/sel2c.riscv.s | 6 +- .../CodeGen/llvmTests/select-const.mips.s | 2 +- .../Regression/CodeGen/llvmTests/seleq.arm.s | 18 +- .../Regression/CodeGen/llvmTests/seleq.mips.s | 18 +- .../CodeGen/llvmTests/seleq.riscv.s | 18 +- .../CodeGen/llvmTests/setcc-se.arm.s | 2 +- .../CodeGen/llvmTests/setcc-se.mips.s | 2 +- .../CodeGen/llvmTests/setcc-se.riscv.s | 2 +- .../Regression/CodeGen/llvmTests/seteq.arm.s | 10 +- .../Regression/CodeGen/llvmTests/seteq.mips.s | 10 +- .../CodeGen/llvmTests/seteq.riscv.s | 10 +- .../Regression/CodeGen/llvmTests/seteqz.arm.s | 8 +- .../CodeGen/llvmTests/seteqz.mips.s | 8 +- .../CodeGen/llvmTests/seteqz.riscv.s | 8 +- .../Regression/CodeGen/llvmTests/setgek.arm.s | 8 +- .../CodeGen/llvmTests/setgek.mips.s | 8 +- .../CodeGen/llvmTests/setgek.riscv.s | 8 +- .../Regression/CodeGen/llvmTests/setle.arm.s | 14 +- .../Regression/CodeGen/llvmTests/setle.mips.s | 14 +- .../CodeGen/llvmTests/setle.riscv.s | 14 +- .../Regression/CodeGen/llvmTests/setlt.arm.s | 14 +- .../Regression/CodeGen/llvmTests/setlt.mips.s | 14 +- .../CodeGen/llvmTests/setlt.riscv.s | 14 +- .../Regression/CodeGen/llvmTests/setltk.arm.s | 14 +- .../CodeGen/llvmTests/setltk.mips.s | 14 +- .../CodeGen/llvmTests/setltk.riscv.s | 14 +- .../Regression/CodeGen/llvmTests/setne.arm.s | 10 +- .../Regression/CodeGen/llvmTests/setne.mips.s | 10 +- .../CodeGen/llvmTests/setne.riscv.s | 10 +- .../Regression/CodeGen/llvmTests/setuge.arm.s | 14 +- .../CodeGen/llvmTests/setuge.mips.s | 14 +- .../CodeGen/llvmTests/setuge.riscv.s | 14 +- .../Regression/CodeGen/llvmTests/setugt.arm.s | 14 +- .../CodeGen/llvmTests/setugt.mips.s | 14 +- .../CodeGen/llvmTests/setugt.riscv.s | 14 +- 
.../Regression/CodeGen/llvmTests/setule.arm.s | 14 +- .../CodeGen/llvmTests/setule.mips.s | 14 +- .../CodeGen/llvmTests/setule.riscv.s | 14 +- .../Regression/CodeGen/llvmTests/setult.arm.s | 14 +- .../CodeGen/llvmTests/setult.mips.s | 14 +- .../CodeGen/llvmTests/setult.riscv.s | 14 +- .../CodeGen/llvmTests/setultk.arm.s | 14 +- .../CodeGen/llvmTests/setultk.mips.s | 14 +- .../CodeGen/llvmTests/setultk.riscv.s | 14 +- .../CodeGen/llvmTests/shift-codegen.arm.s | 4 +- .../CodeGen/llvmTests/shift-codegen.mips.s | 4 +- .../CodeGen/llvmTests/shift-codegen.riscv.s | 4 +- .../CodeGen/llvmTests/shift-one.arm.s | 2 +- .../CodeGen/llvmTests/shift-one.mips.s | 2 +- .../CodeGen/llvmTests/shift-one.riscv.s | 2 +- .../CodeGen/llvmTests/stride-reuse.arm.s | 6 +- .../CodeGen/llvmTests/stride-reuse.mips.s | 6 +- .../CodeGen/llvmTests/stride-reuse.riscv.s | 6 +- .../llvmTests/symbol-redefinition.arm.s | 2 +- .../llvmTests/symbol-redefinition.mips.s | 2 +- .../llvmTests/symbol-redefinition.riscv.s | 2 +- .../CodeGen/llvmTests/tailregccpic.arm.s | 2 +- .../CodeGen/llvmTests/tailregccpic.mips.s | 2 +- .../CodeGen/llvmTests/tailregccpic.riscv.s | 2 +- .../llvmTests/tglobaladdr-wrapper.arm.s | 6 +- .../llvmTests/tglobaladdr-wrapper.mips.s | 6 +- .../llvmTests/tglobaladdr-wrapper.riscv.s | 6 +- .../Regression/CodeGen/llvmTests/tls16.arm.s | 2 +- .../Regression/CodeGen/llvmTests/tls16.mips.s | 2 +- .../CodeGen/llvmTests/tls16.riscv.s | 2 +- .../Regression/CodeGen/llvmTests/tlv-3.arm.s | 1 - .../Regression/CodeGen/llvmTests/tlv-3.mips.s | 1 - .../CodeGen/llvmTests/tlv-3.riscv.s | 1 - .../llvmTests/win64-eh-empty-block-2..arm.s | 5 +- .../llvmTests/win64-eh-empty-block-2..mips.s | 5 +- .../llvmTests/win64-eh-empty-block-2..riscv.s | 5 +- .../CodeGen/llvmTests/x86-64-pic-4.arm.s | 2 +- .../CodeGen/llvmTests/x86-64-pic-4.mips.s | 2 +- .../CodeGen/llvmTests/x86-64-pic-4.riscv.s | 2 +- .../CodeGen/llvmTests/x86-64-pic-5.arm.s | 2 +- .../CodeGen/llvmTests/x86-64-pic-5.mips.s | 2 +- .../CodeGen/llvmTests/x86-64-pic-5.riscv.s | 2 +- .../llvmTests/zero-call-used-regs.arm.s | 2 +- .../llvmTests/zero-call-used-regs.mips.s | 2 +- .../llvmTests/zero-call-used-regs.riscv.s | 2 +- .../llvmTests/zero-initialized-in-bss.arm.s | 4 +- .../llvmTests/zero-initialized-in-bss.mips.s | 4 +- .../llvmTests/zero-initialized-in-bss.riscv.s | 4 +- tests/Regression/CodeGen/loadstore.arm.s | 10 +- tests/Regression/CodeGen/loadstore.mips.s | 12 +- tests/Regression/CodeGen/loadstore.riscv.s | 10 +- tests/Regression/CodeGen/select.mips.s | 2 +- tests/Regression/CodeGen/select.riscv.s | 2 +- tests/Regression/CodeGen/switch.arm.s | 2 +- tests/Regression/CodeGen/switch.mips.s | 2 +- tests/Regression/CodeGen/switch.riscv.s | 2 +- .../SysY2022/functional/55_sort_test1.riscv.s | 38 +- .../SysY2022/functional/56_sort_test2.riscv.s | 14 +- .../SysY2022/functional/58_sort_test4.riscv.s | 133 +- .../SysY2022/functional/59_sort_test5.riscv.s | 103 +- tests/SysY2022/functional/61_sort_test7.arm.s | 2 +- .../SysY2022/functional/61_sort_test7.riscv.s | 2 +- .../SysY2022/functional/62_percolation.arm.s | 2 +- .../functional/62_percolation.riscv.s | 130 +- tests/SysY2022/functional/64_calculator.arm.s | 8 +- .../SysY2022/functional/64_calculator.riscv.s | 946 +++--- tests/SysY2022/functional/65_color.arm.s | 2 +- tests/SysY2022/functional/65_color.riscv.s | 30 +- tests/SysY2022/functional/68_brainfk.arm.s | 4 +- tests/SysY2022/functional/68_brainfk.riscv.s | 4 +- tests/SysY2022/functional/69_expr_eval.arm.s | 4 +- .../SysY2022/functional/69_expr_eval.riscv.s | 242 
+- tests/SysY2022/functional/70_dijkstra.arm.s | 2 +- tests/SysY2022/functional/70_dijkstra.riscv.s | 2 +- .../SysY2022/functional/71_full_conn.riscv.s | 1921 ++++++------ tests/SysY2022/functional/74_kmp.arm.s | 6 +- tests/SysY2022/functional/74_kmp.riscv.s | 6 +- tests/SysY2022/functional/75_max_flow.arm.s | 6 +- tests/SysY2022/functional/75_max_flow.riscv.s | 6 +- tests/SysY2022/functional/76_n_queens.arm.s | 8 +- tests/SysY2022/functional/76_n_queens.riscv.s | 8 +- tests/SysY2022/functional/77_substr.arm.s | 2 +- tests/SysY2022/functional/77_substr.riscv.s | 456 +-- .../SysY2022/functional/80_chaos_token.arm.s | 6 +- .../functional/80_chaos_token.riscv.s | 6 +- tests/SysY2022/functional/83_long_array.arm.s | 6 +- .../SysY2022/functional/83_long_array.riscv.s | 708 ++--- .../SysY2022/functional/84_long_array2.arm.s | 4 +- .../functional/84_long_array2.riscv.s | 4 +- tests/SysY2022/functional/85_long_code.arm.s | 2 +- .../SysY2022/functional/85_long_code.riscv.s | 2 +- .../SysY2022/functional/88_many_params2.arm.s | 4 +- .../functional/88_many_params2.riscv.s | 4 +- .../SysY2022/functional/94_nested_loops.arm.s | 4 +- .../functional/94_nested_loops.riscv.s | 792 ++--- tests/SysY2022/functional/95_float.riscv.s | 2 +- tests/SysY2022/hidden_functional/09_BFS.arm.s | 10 +- .../SysY2022/hidden_functional/09_BFS.riscv.s | 490 +-- tests/SysY2022/hidden_functional/10_DFS.arm.s | 8 +- .../SysY2022/hidden_functional/10_DFS.riscv.s | 371 +-- tests/SysY2022/hidden_functional/11_BST.arm.s | 6 +- .../SysY2022/hidden_functional/11_BST.riscv.s | 6 +- tests/SysY2022/hidden_functional/12_DSU.arm.s | 2 +- .../SysY2022/hidden_functional/12_DSU.riscv.s | 92 +- tests/SysY2022/hidden_functional/13_LCA.arm.s | 10 +- .../SysY2022/hidden_functional/13_LCA.riscv.s | 558 ++-- tests/SysY2022/hidden_functional/14_dp.arm.s | 4 +- .../SysY2022/hidden_functional/14_dp.riscv.s | 4 +- .../hidden_functional/16_k_smallest.arm.s | 2 +- .../hidden_functional/16_k_smallest.riscv.s | 2 +- .../hidden_functional/17_maximal_clique.arm.s | 4 +- .../17_maximal_clique.riscv.s | 4 +- .../SysY2022/hidden_functional/18_prim.arm.s | 8 +- .../hidden_functional/18_prim.riscv.s | 80 +- .../hidden_functional/19_search.arm.s | 2 +- .../hidden_functional/19_search.riscv.s | 202 +- .../SysY2022/hidden_functional/20_sort.arm.s | 8 +- .../hidden_functional/20_sort.riscv.s | 8 +- .../hidden_functional/21_union_find.arm.s | 2 +- .../hidden_functional/21_union_find.riscv.s | 2 +- .../22_matrix_multiply.arm.s | 6 +- .../22_matrix_multiply.riscv.s | 6 +- .../SysY2022/hidden_functional/23_json.arm.s | 4 +- .../hidden_functional/23_json.riscv.s | 4 +- .../hidden_functional/28_side_effect2.riscv.s | 70 +- .../hidden_functional/29_long_line.arm.s | 2 +- .../hidden_functional/29_long_line.riscv.s | 2 +- .../30_many_dimensions.arm.s | 2 +- .../30_many_dimensions.riscv.s | 174 +- .../31_many_indirections.arm.s | 2 +- .../31_many_indirections.riscv.s | 554 ++-- .../hidden_functional/35_math.riscv.s | 30 +- .../hidden_functional/36_rotate.arm.s | 2 +- .../hidden_functional/36_rotate.riscv.s | 4 +- tests/SysY2022/hidden_functional/37_dct.arm.s | 6 +- .../SysY2022/hidden_functional/37_dct.riscv.s | 164 +- .../hidden_functional/38_light2d.riscv.s | 232 +- tests/SysY2022/performance/00_bitset1.arm.s | 2 +- tests/SysY2022/performance/00_bitset1.riscv.s | 217 +- tests/SysY2022/performance/00_bitset2.arm.s | 2 +- tests/SysY2022/performance/00_bitset2.riscv.s | 217 +- tests/SysY2022/performance/00_bitset3.arm.s | 2 +- tests/SysY2022/performance/00_bitset3.riscv.s | 
217 +- tests/SysY2022/performance/01_mm1.arm.s | 12 +- tests/SysY2022/performance/01_mm1.riscv.s | 1394 +++++---- tests/SysY2022/performance/01_mm1.sy.ir | 709 +++-- tests/SysY2022/performance/01_mm2.arm.s | 12 +- tests/SysY2022/performance/01_mm2.riscv.s | 1394 +++++---- tests/SysY2022/performance/01_mm2.sy.ir | 709 +++-- tests/SysY2022/performance/01_mm3.arm.s | 12 +- tests/SysY2022/performance/01_mm3.riscv.s | 1394 +++++---- tests/SysY2022/performance/01_mm3.sy.ir | 709 +++-- tests/SysY2022/performance/02_mv1.arm.s | 10 +- tests/SysY2022/performance/02_mv1.riscv.s | 118 +- tests/SysY2022/performance/02_mv2.arm.s | 10 +- tests/SysY2022/performance/02_mv2.riscv.s | 118 +- tests/SysY2022/performance/02_mv3.arm.s | 10 +- tests/SysY2022/performance/02_mv3.riscv.s | 118 +- tests/SysY2022/performance/03_sort1.arm.s | 4 +- tests/SysY2022/performance/03_sort1.riscv.s | 4 +- tests/SysY2022/performance/03_sort2.arm.s | 4 +- tests/SysY2022/performance/03_sort2.riscv.s | 4 +- tests/SysY2022/performance/03_sort3.arm.s | 4 +- tests/SysY2022/performance/03_sort3.riscv.s | 4 +- tests/SysY2022/performance/04_spmv1.arm.s | 14 +- tests/SysY2022/performance/04_spmv1.riscv.s | 140 +- tests/SysY2022/performance/04_spmv2.arm.s | 14 +- tests/SysY2022/performance/04_spmv2.riscv.s | 140 +- tests/SysY2022/performance/04_spmv3.arm.s | 14 +- tests/SysY2022/performance/04_spmv3.riscv.s | 140 +- .../performance/brainfuck-bootstrap.arm.s | 10 +- .../performance/brainfuck-bootstrap.riscv.s | 358 ++- .../performance/brainfuck-bootstrap.sy.ir | 342 ++- .../brainfuck-mandelbrot-nerf.arm.s | 10 +- .../brainfuck-mandelbrot-nerf.riscv.s | 358 ++- .../brainfuck-mandelbrot-nerf.sy.ir | 342 ++- .../performance/brainfuck-pi-nerf.arm.s | 10 +- .../performance/brainfuck-pi-nerf.riscv.s | 358 ++- .../performance/brainfuck-pi-nerf.sy.ir | 342 ++- tests/SysY2022/performance/conv0.arm.s | 22 +- tests/SysY2022/performance/conv0.riscv.s | 2394 +++++++-------- tests/SysY2022/performance/conv0.sy.ir | 652 ++-- tests/SysY2022/performance/conv1.arm.s | 22 +- tests/SysY2022/performance/conv1.riscv.s | 2394 +++++++-------- tests/SysY2022/performance/conv1.sy.ir | 652 ++-- tests/SysY2022/performance/conv2.arm.s | 22 +- tests/SysY2022/performance/conv2.riscv.s | 2394 +++++++-------- tests/SysY2022/performance/conv2.sy.ir | 652 ++-- tests/SysY2022/performance/crypto-1.arm.s | 2 +- tests/SysY2022/performance/crypto-1.riscv.s | 942 +++--- tests/SysY2022/performance/crypto-1.sy.ir | 615 ++-- tests/SysY2022/performance/crypto-2.arm.s | 2 +- tests/SysY2022/performance/crypto-2.riscv.s | 942 +++--- tests/SysY2022/performance/crypto-2.sy.ir | 615 ++-- tests/SysY2022/performance/crypto-3.arm.s | 2 +- tests/SysY2022/performance/crypto-3.riscv.s | 942 +++--- tests/SysY2022/performance/crypto-3.sy.ir | 615 ++-- .../performance/dead-code-elimination-1.arm.s | 4 +- .../performance/dead-code-elimination-2.arm.s | 4 +- .../performance/dead-code-elimination-3.arm.s | 4 +- tests/SysY2022/performance/derich1.arm.s | 16 +- tests/SysY2022/performance/derich1.riscv.s | 758 +++-- tests/SysY2022/performance/derich2.arm.s | 16 +- tests/SysY2022/performance/derich2.riscv.s | 758 +++-- tests/SysY2022/performance/derich3.arm.s | 16 +- tests/SysY2022/performance/derich3.riscv.s | 758 +++-- tests/SysY2022/performance/fft0.arm.s | 6 +- tests/SysY2022/performance/fft0.riscv.s | 6 +- tests/SysY2022/performance/fft1.arm.s | 6 +- tests/SysY2022/performance/fft1.riscv.s | 6 +- tests/SysY2022/performance/fft2.arm.s | 6 +- tests/SysY2022/performance/fft2.riscv.s | 6 +- 
tests/SysY2022/performance/floyd-0.arm.s | 10 +- tests/SysY2022/performance/floyd-0.riscv.s | 10 +- tests/SysY2022/performance/floyd-1.arm.s | 10 +- tests/SysY2022/performance/floyd-1.riscv.s | 10 +- tests/SysY2022/performance/floyd-2.arm.s | 10 +- tests/SysY2022/performance/floyd-2.riscv.s | 10 +- .../performance/gameoflife-gosper.arm.s | 12 +- .../performance/gameoflife-gosper.riscv.s | 656 ++-- .../performance/gameoflife-gosper.sy.ir | 204 +- .../performance/gameoflife-oscillator.arm.s | 12 +- .../performance/gameoflife-oscillator.riscv.s | 656 ++-- .../performance/gameoflife-oscillator.sy.ir | 204 +- .../performance/gameoflife-p61glidergun.arm.s | 12 +- .../gameoflife-p61glidergun.riscv.s | 656 ++-- .../performance/gameoflife-p61glidergun.sy.ir | 204 +- .../performance/large_loop_array_1.arm.s | 6 +- .../performance/large_loop_array_1.riscv.s | 230 +- .../performance/large_loop_array_2.arm.s | 6 +- .../performance/large_loop_array_2.riscv.s | 230 +- .../performance/large_loop_array_3.arm.s | 6 +- .../performance/large_loop_array_3.riscv.s | 230 +- tests/SysY2022/performance/layernorm1.arm.s | 10 +- tests/SysY2022/performance/layernorm1.riscv.s | 44 +- tests/SysY2022/performance/layernorm2.arm.s | 10 +- tests/SysY2022/performance/layernorm2.riscv.s | 44 +- tests/SysY2022/performance/layernorm3.arm.s | 10 +- tests/SysY2022/performance/layernorm3.riscv.s | 44 +- tests/SysY2022/performance/matmul1.arm.s | 10 +- tests/SysY2022/performance/matmul1.riscv.s | 2632 ++++++++--------- tests/SysY2022/performance/matmul2.arm.s | 10 +- tests/SysY2022/performance/matmul2.riscv.s | 2632 ++++++++--------- tests/SysY2022/performance/matmul3.arm.s | 10 +- tests/SysY2022/performance/matmul3.riscv.s | 2632 ++++++++--------- tests/SysY2022/performance/median0.arm.s | 2 +- tests/SysY2022/performance/median0.riscv.s | 2 +- tests/SysY2022/performance/median1.arm.s | 2 +- tests/SysY2022/performance/median1.riscv.s | 2 +- tests/SysY2022/performance/median2.arm.s | 2 +- tests/SysY2022/performance/median2.riscv.s | 2 +- .../performance/recursion_fabonacci-1.arm.s | 2 +- .../performance/recursion_fabonacci-1.riscv.s | 4 +- .../performance/recursion_fabonacci-2.arm.s | 2 +- .../performance/recursion_fabonacci-2.riscv.s | 4 +- .../performance/recursion_fabonacci-3.arm.s | 2 +- .../performance/recursion_fabonacci-3.riscv.s | 4 +- .../performance/recursive_call_1.arm.s | 2 +- .../performance/recursive_call_1.riscv.s | 4 +- .../performance/recursive_call_2.arm.s | 2 +- .../performance/recursive_call_2.riscv.s | 4 +- .../performance/recursive_call_3.arm.s | 2 +- .../performance/recursive_call_3.riscv.s | 4 +- tests/SysY2022/performance/shuffle0.arm.s | 22 +- tests/SysY2022/performance/shuffle0.riscv.s | 22 +- tests/SysY2022/performance/shuffle1.arm.s | 22 +- tests/SysY2022/performance/shuffle1.riscv.s | 22 +- tests/SysY2022/performance/shuffle2.arm.s | 22 +- tests/SysY2022/performance/shuffle2.riscv.s | 22 +- tests/SysY2022/performance/sl1.arm.s | 4 +- tests/SysY2022/performance/sl1.riscv.s | 1441 ++------- tests/SysY2022/performance/sl1.sy.ir | 1041 ++----- tests/SysY2022/performance/sl2.arm.s | 4 +- tests/SysY2022/performance/sl2.riscv.s | 1441 ++------- tests/SysY2022/performance/sl2.sy.ir | 1041 ++----- tests/SysY2022/performance/sl3.arm.s | 4 +- tests/SysY2022/performance/sl3.riscv.s | 1441 ++------- tests/SysY2022/performance/sl3.sy.ir | 1041 ++----- tests/SysY2022/performance/stencil0.arm.s | 4 +- tests/SysY2022/performance/stencil0.riscv.s | 4 +- tests/SysY2022/performance/stencil1.arm.s | 4 +- 
tests/SysY2022/performance/stencil1.riscv.s | 4 +- tests/SysY2022/performance/transpose0.arm.s | 6 +- tests/SysY2022/performance/transpose0.riscv.s | 6 +- tests/SysY2022/performance/transpose1.arm.s | 6 +- tests/SysY2022/performance/transpose1.riscv.s | 6 +- tests/SysY2022/performance/transpose2.arm.s | 6 +- tests/SysY2022/performance/transpose2.riscv.s | 6 +- tests/SysY2022/performance/vector_mul1.arm.s | 934 +++--- .../SysY2022/performance/vector_mul1.riscv.s | 1189 ++++---- tests/SysY2022/performance/vector_mul2.arm.s | 934 +++--- .../SysY2022/performance/vector_mul2.riscv.s | 1189 ++++---- tests/SysY2022/performance/vector_mul3.arm.s | 934 +++--- .../SysY2022/performance/vector_mul3.riscv.s | 1189 ++++---- 587 files changed, 29671 insertions(+), 32738 deletions(-) diff --git a/cmmc/CodeGen/CodeGenUtils.cpp b/cmmc/CodeGen/CodeGenUtils.cpp index 749b23108..92dbfc93d 100644 --- a/cmmc/CodeGen/CodeGenUtils.cpp +++ b/cmmc/CodeGen/CodeGenUtils.cpp @@ -43,8 +43,10 @@ void dumpAssembly(std::ostream& out, const CodeGenContext& ctx, const MIRModule& out << ".data\n"sv; emitData(); const auto dumpSymbol = [&](const MIRGlobal& global) { - if(!global.reloc->isFunc()) - out << ".align " << global.alignment << std::endl; + if(!global.reloc->isFunc()) { + if(global.alignment > 1) + out << ".p2align " << ilog2(global.alignment) << std::endl; + } auto symbol = global.reloc->symbol(); if(global.linkage == Linkage::Global) out << ".globl "sv << symbol << '\n'; diff --git a/cmmc/Runtime/cmmc_sysy_rt.cpp b/cmmc/Runtime/cmmc_sysy_rt.cpp index c83117cb5..2b8ad2cf4 100644 --- a/cmmc/Runtime/cmmc_sysy_rt.cpp +++ b/cmmc/Runtime/cmmc_sysy_rt.cpp @@ -66,7 +66,7 @@ namespace { Futex ready, done; }; - std::array workers; // NOLINT + Worker workers[maxThreads]; // NOLINT static_assert(std::atomic_uint32_t::is_always_lock_free); static_assert(std::atomic_int32_t::is_always_lock_free); @@ -122,17 +122,17 @@ using Time = int64_t; struct ParallelForEntry final { CmmcForLoop func; uint32_t size; - bool valid = false; - uint32_t hitCount = 0; + bool valid; + uint32_t hitCount; static constexpr uint32_t sampleThreshold = 100; static constexpr uint32_t sampleCount = 20; static constexpr uint32_t stopSampleThreshold = sampleThreshold + 3 * sampleCount; Time times[3]; // 1T 2T 4T - uint32_t bestThreads = 0; + uint32_t bestThreads; }; constexpr uint32_t entryCount = 16; -static std::array parallelCache; // NOLINT -static uint32_t lookupPtr = 0; // NOLINT +static ParallelForEntry parallelCache[entryCount]; // NOLINT +static uint32_t lookupPtr; // NOLINT static ParallelForEntry& selectEntry(CmmcForLoop func, uint32_t size) { for(uint32_t i = 0; i < entryCount; ++i, ++lookupPtr) { if(lookupPtr == entryCount) diff --git a/cmmc/Support/Tune.cpp b/cmmc/Support/Tune.cpp index 03ccf5241..a866c8baf 100644 --- a/cmmc/Support/Tune.cpp +++ b/cmmc/Support/Tune.cpp @@ -45,28 +45,31 @@ void initTune(const std::string_view& name, const std::string_view& target) { return; } - if(target == "riscv") { - const std::string_view table[][2] = { - { "/01_mm", - "loop_unswitch 1 loop_parallel 1 unroll_block_size 8 max_unroll_body_size 128 max_constant_hoist_count 9" }, - { "/conv", "prob_predict 1" }, - { "/brainfuck", - "loop_unswitch 1 loop_parallel 0 loop_extract 0 loop_unroll 0 dyn_loop_unroll 0 max_constant_hoist_count 4" }, - { "/crypto", - "loop_unswitch 0 loop_parallel 0 unroll_block_size 8 max_unroll_body_size 128 max_constant_hoist_count 6" }, - { "/sl", "loop_unswitch 1 loop_parallel 1 unroll_block_size 2 max_unroll_body_size 128 
max_constant_hoist_count 3" }, - { "/gameoflife", "loop_parallel 1 unroll_block_size 16 max_unroll_body_size 128 max_constant_hoist_count 6" } - }; - for(auto& [key, value] : table) { - if(key.empty()) - continue; + CMMC_UNUSED(name); + CMMC_UNUSED(target); + // Tuning is illegal in the competition + // if(target == "riscv") { + // const std::string_view table[][2] = { + // { "/01_mm", + // "loop_unswitch 1 loop_parallel 1 unroll_block_size 8 max_unroll_body_size 128 max_constant_hoist_count 9" }, + // { "/conv", "prob_predict 1" }, + // { "/brainfuck", + // "loop_unswitch 1 loop_parallel 0 loop_extract 0 loop_unroll 0 dyn_loop_unroll 0 max_constant_hoist_count 4" }, + // { "/crypto", + // "loop_unswitch 0 loop_parallel 0 unroll_block_size 8 max_unroll_body_size 128 max_constant_hoist_count 6" }, + // { "/sl", "loop_unswitch 1 loop_parallel 1 unroll_block_size 2 max_unroll_body_size 128 max_constant_hoist_count 3" + // }, { "/gameoflife", "loop_parallel 1 unroll_block_size 16 max_unroll_body_size 128 max_constant_hoist_count 6" } + // }; + // for(auto& [key, value] : table) { + // if(key.empty()) + // continue; - if(name.find(key) != std::string_view::npos) { - parseTune(value); - return; - } - } - } + // if(name.find(key) != std::string_view::npos) { + // parseTune(value); + // return; + // } + // } + // } } int32_t queryTuneOpt(const std::string_view& key, int32_t defaultValue) { diff --git a/cmmc/Target/RISCV/RISCVScheduleModel.cpp b/cmmc/Target/RISCV/RISCVScheduleModel.cpp index 9f8184f8a..016077ab6 100644 --- a/cmmc/Target/RISCV/RISCVScheduleModel.cpp +++ b/cmmc/Target/RISCV/RISCVScheduleModel.cpp @@ -631,7 +631,7 @@ static bool earlyFoldStore(MIRBasicBlock& block, CodeGenContext& ctx) { return modified; } -static bool earlyFoldDualWordCopy(MIRFunction& func, CodeGenContext& ctx) { +static bool earlyFoldDoubleWordCopy(MIRFunction& func, CodeGenContext& ctx) { std::unordered_map useCount; std::unordered_map addressDef; for(auto& block : func.blocks()) { @@ -1137,7 +1137,7 @@ bool RISCVScheduleModel_sifive_u74::peepholeOpt(MIRFunction& func, CodeGenContex modified |= removeSExtW(func, ctx); modified |= expandMulWithConstant(func, ctx, static_cast(queryTuneOpt("max_mul_constant_cost", 2))); if(ctx.flags.inSSAForm) - modified |= earlyFoldDualWordCopy(func, ctx); + modified |= earlyFoldDoubleWordCopy(func, ctx); return modified; } diff --git a/cmmc/Target/RISCV/RISCVTarget.cpp b/cmmc/Target/RISCV/RISCVTarget.cpp index 7eab52166..c473d77c2 100644 --- a/cmmc/Target/RISCV/RISCVTarget.cpp +++ b/cmmc/Target/RISCV/RISCVTarget.cpp @@ -218,7 +218,7 @@ class RISCVTarget final : public Target { .branchLimit = static_cast(queryTuneOpt("branch_limit", 400)), .disableSelectionOpt = true, .branchPredictionWarmupThreshold = static_cast(queryTuneOpt("branch_prediction_warmup_threshold", 2)), - .maxConstantHoistCount = static_cast(queryTuneOpt("max_constant_hoist_count", 12)), + .maxConstantHoistCount = static_cast(queryTuneOpt("max_constant_hoist_count", 8)), }; return defaultHeuristic; diff --git a/cmmc/Transforms/IPO/Inlining.cpp b/cmmc/Transforms/IPO/Inlining.cpp index 8ae6711b1..fbce33123 100644 --- a/cmmc/Transforms/IPO/Inlining.cpp +++ b/cmmc/Transforms/IPO/Inlining.cpp @@ -194,7 +194,7 @@ class FuncInlining final : public TransformPass { public: bool run(Function& func, AnalysisPassManager&) const override { - if(func.getSymbol().prefix().find("_inline_wrapped") != std::string_view::npos) { + if(func.getSymbol().prefix().find("_cmmc_inline_wrapped") != std::string_view::npos) { return false; } diff 
--git a/cmmc/Transforms/IPO/ShrinkWrapping.cpp b/cmmc/Transforms/IPO/ShrinkWrapping.cpp index fe0ec5387..e9426deed 100644 --- a/cmmc/Transforms/IPO/ShrinkWrapping.cpp +++ b/cmmc/Transforms/IPO/ShrinkWrapping.cpp @@ -30,7 +30,7 @@ CMMC_NAMESPACE_BEGIN class ShrinkWrapping final : public TransformPass { static Function* cloneFunc(Function& func) { - auto newFunc = make(String::get(std::string(func.getSymbol().prefix()) + "_inline_wrapped"), + auto newFunc = make(String::get(std::string(func.getSymbol().prefix()) + "_cmmc_inline_wrapped"), func.getType()->as()); newFunc->setLinkage(Linkage::Internal); diff --git a/cmmc/Transforms/Misc/ImmutableScalarRefArg2Value.cpp b/cmmc/Transforms/Misc/ImmutableScalarRefArg2Value.cpp index b60c718b3..4574fda2e 100644 --- a/cmmc/Transforms/Misc/ImmutableScalarRefArg2Value.cpp +++ b/cmmc/Transforms/Misc/ImmutableScalarRefArg2Value.cpp @@ -101,7 +101,7 @@ class ImmutableScalarRefArg2Value final : public TransformPass { } for(auto [func, type] : wrappers) { - const auto wrapperFunc = make(String::get(std::string(func->getSymbol().prefix()) + "_wrapper"), type); + const auto wrapperFunc = make(String::get(std::string(func->getSymbol().prefix()) + "_cmmc_wrapper"), type); wrapperFunc->setLinkage(Linkage::Internal); std::vector args; auto& argTypes = type->getArgTypes(); diff --git a/tests/Regression/CodeGen/arithmetic.mips.s b/tests/Regression/CodeGen/arithmetic.mips.s index 46f039430..4d5e64835 100644 --- a/tests/Regression/CodeGen/arithmetic.mips.s +++ b/tests/Regression/CodeGen/arithmetic.mips.s @@ -1,6 +1,6 @@ .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1065353216 .4byte 1056964608 diff --git a/tests/Regression/CodeGen/arithmetic.riscv.s b/tests/Regression/CodeGen/arithmetic.riscv.s index 6df93e5a5..745912402 100644 --- a/tests/Regression/CodeGen/arithmetic.riscv.s +++ b/tests/Regression/CodeGen/arithmetic.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1078530010 .text diff --git a/tests/Regression/CodeGen/call.arm.s b/tests/Regression/CodeGen/call.arm.s index d83f3bbe7..81c674e95 100644 --- a/tests/Regression/CodeGen/call.arm.s +++ b/tests/Regression/CodeGen/call.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl touch touch: .4byte 0 diff --git a/tests/Regression/CodeGen/call.mips.s b/tests/Regression/CodeGen/call.mips.s index c33c49d70..c9ee1a092 100644 --- a/tests/Regression/CodeGen/call.mips.s +++ b/tests/Regression/CodeGen/call.mips.s @@ -1,11 +1,11 @@ .data .data -.align 4 +.p2align 2 .globl touch touch: .4byte 0 .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1065353216 .text diff --git a/tests/Regression/CodeGen/call.riscv.s b/tests/Regression/CodeGen/call.riscv.s index d482979aa..2476395fc 100644 --- a/tests/Regression/CodeGen/call.riscv.s +++ b/tests/Regression/CodeGen/call.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl touch touch: .4byte 0 diff --git a/tests/Regression/CodeGen/constant.arm.s b/tests/Regression/CodeGen/constant.arm.s index 7437738e1..dba052f79 100644 --- a/tests/Regression/CodeGen/constant.arm.s +++ b/tests/Regression/CodeGen/constant.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 .globl x x: .zero 40 diff --git a/tests/Regression/CodeGen/constant.mips.s 
b/tests/Regression/CodeGen/constant.mips.s index dd8261215..6ccd2175e 100644 --- a/tests/Regression/CodeGen/constant.mips.s +++ b/tests/Regression/CodeGen/constant.mips.s @@ -1,10 +1,10 @@ .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1082130432 .bss -.align 8 +.p2align 3 .globl x x: .zero 40 diff --git a/tests/Regression/CodeGen/constant.riscv.s b/tests/Regression/CodeGen/constant.riscv.s index 08f4a5e39..260924640 100644 --- a/tests/Regression/CodeGen/constant.riscv.s +++ b/tests/Regression/CodeGen/constant.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 .globl x x: .zero 40 diff --git a/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.arm.s b/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.arm.s index 02446c225..d9bc2a514 100644 --- a/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.arm.s +++ b/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl A A: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.mips.s b/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.mips.s index 8fbb5056b..9f91ad2a3 100644 --- a/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.mips.s +++ b/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl A A: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.riscv.s b/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.riscv.s index ea9110d31..a6ca223b9 100644 --- a/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/2006-01-19-ISelFoldingBug.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl A A: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/2007-01-08-InstrSched.mips.s b/tests/Regression/CodeGen/llvmTests/2007-01-08-InstrSched.mips.s index 8e46daeb0..d9a050c80 100644 --- a/tests/Regression/CodeGen/llvmTests/2007-01-08-InstrSched.mips.s +++ b/tests/Regression/CodeGen/llvmTests/2007-01-08-InstrSched.mips.s @@ -1,6 +1,6 @@ .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1077936128 .4byte 1084227584 diff --git a/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.arm.s b/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.arm.s index 1c03d5d6d..5c8dddbd3 100644 --- a/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.arm.s +++ b/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl X X: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.mips.s b/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.mips.s index ad6920435..9befac430 100644 --- a/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.mips.s +++ b/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl X X: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.riscv.s b/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.riscv.s index 6baed64d6..dd1dc21ff 100644 --- a/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.riscv.s +++ 
b/tests/Regression/CodeGen/llvmTests/2007-08-10-SignExtSubreg.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl X X: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/2008-07-22-Cstpool.mips.s b/tests/Regression/CodeGen/llvmTests/2008-07-22-Cstpool.mips.s index 3e5cd7a83..e21f9bfca 100644 --- a/tests/Regression/CodeGen/llvmTests/2008-07-22-Cstpool.mips.s +++ b/tests/Regression/CodeGen/llvmTests/2008-07-22-Cstpool.mips.s @@ -1,6 +1,6 @@ .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1082759578 .4byte 1082340147 diff --git a/tests/Regression/CodeGen/llvmTests/2008-07-22-Cstpool.riscv.s b/tests/Regression/CodeGen/llvmTests/2008-07-22-Cstpool.riscv.s index 12322e559..783aba2ad 100644 --- a/tests/Regression/CodeGen/llvmTests/2008-07-22-Cstpool.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/2008-07-22-Cstpool.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1082759578 .4byte 1082340147 diff --git a/tests/Regression/CodeGen/llvmTests/2009-04-24.arm.s b/tests/Regression/CodeGen/llvmTests/2009-04-24.arm.s index c807ba3f1..ece4b42ef 100644 --- a/tests/Regression/CodeGen/llvmTests/2009-04-24.arm.s +++ b/tests/Regression/CodeGen/llvmTests/2009-04-24.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 15 diff --git a/tests/Regression/CodeGen/llvmTests/2009-04-24.mips.s b/tests/Regression/CodeGen/llvmTests/2009-04-24.mips.s index 8a8afc57b..1ca8406c0 100644 --- a/tests/Regression/CodeGen/llvmTests/2009-04-24.mips.s +++ b/tests/Regression/CodeGen/llvmTests/2009-04-24.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 15 diff --git a/tests/Regression/CodeGen/llvmTests/2009-04-24.riscv.s b/tests/Regression/CodeGen/llvmTests/2009-04-24.riscv.s index 20d1bdc18..053214125 100644 --- a/tests/Regression/CodeGen/llvmTests/2009-04-24.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/2009-04-24.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 15 diff --git a/tests/Regression/CodeGen/llvmTests/2009-11-16-CstPoolLoad.mips.s b/tests/Regression/CodeGen/llvmTests/2009-11-16-CstPoolLoad.mips.s index ae13c3c84..6175e0ba1 100644 --- a/tests/Regression/CodeGen/llvmTests/2009-11-16-CstPoolLoad.mips.s +++ b/tests/Regression/CodeGen/llvmTests/2009-11-16-CstPoolLoad.mips.s @@ -1,6 +1,6 @@ .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1079613850 .text diff --git a/tests/Regression/CodeGen/llvmTests/2009-11-16-CstPoolLoad.riscv.s b/tests/Regression/CodeGen/llvmTests/2009-11-16-CstPoolLoad.riscv.s index d2405e733..a1a8a77cb 100644 --- a/tests/Regression/CodeGen/llvmTests/2009-11-16-CstPoolLoad.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/2009-11-16-CstPoolLoad.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1079613850 .text diff --git a/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.arm.s b/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.arm.s index ff5623f72..cb7fe0aaa 100644 --- a/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.arm.s +++ 
b/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.arm.s @@ -1,11 +1,9 @@ .arch armv7ve .data .bss -.align 1 .globl u u: .zero 1 -.align 1 .globl s s: .zero 1 diff --git a/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.mips.s b/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.mips.s index 3da3bd559..d2cf17e6b 100644 --- a/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.mips.s +++ b/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.mips.s @@ -1,10 +1,8 @@ .data .bss -.align 1 .globl u u: .zero 1 -.align 1 .globl s s: .zero 1 diff --git a/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.riscv.s b/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.riscv.s index 55116d024..3c8b95208 100644 --- a/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/2010-11-18-SelectOfExtload.riscv.s @@ -1,11 +1,9 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 1 .globl u u: .zero 1 -.align 1 .globl s s: .zero 1 diff --git a/tests/Regression/CodeGen/llvmTests/atom-sched.arm.s b/tests/Regression/CodeGen/llvmTests/atom-sched.arm.s index 97cd650ab..52c5e90ce 100644 --- a/tests/Regression/CodeGen/llvmTests/atom-sched.arm.s +++ b/tests/Regression/CodeGen/llvmTests/atom-sched.arm.s @@ -1,27 +1,27 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 4 +.p2align 2 .globl d d: .4byte 0 -.align 4 +.p2align 2 .globl e e: .4byte 0 -.align 4 +.p2align 2 .globl f f: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/atom-sched.mips.s b/tests/Regression/CodeGen/llvmTests/atom-sched.mips.s index 11caa95ab..282bd9aae 100644 --- a/tests/Regression/CodeGen/llvmTests/atom-sched.mips.s +++ b/tests/Regression/CodeGen/llvmTests/atom-sched.mips.s @@ -1,26 +1,26 @@ .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 4 +.p2align 2 .globl d d: .4byte 0 -.align 4 +.p2align 2 .globl e e: .4byte 0 -.align 4 +.p2align 2 .globl f f: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/atom-sched.riscv.s b/tests/Regression/CodeGen/llvmTests/atom-sched.riscv.s index a73e2a758..206244e4f 100644 --- a/tests/Regression/CodeGen/llvmTests/atom-sched.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/atom-sched.riscv.s @@ -1,27 +1,27 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 4 +.p2align 2 .globl d d: .4byte 0 -.align 4 +.p2align 2 .globl e e: .4byte 0 -.align 4 +.p2align 2 .globl f f: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/backpropmask.arm.s b/tests/Regression/CodeGen/llvmTests/backpropmask.arm.s index b8f29ed46..353040845 100644 --- a/tests/Regression/CodeGen/llvmTests/backpropmask.arm.s +++ b/tests/Regression/CodeGen/llvmTests/backpropmask.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl b b: .4byte 918 -.align 4 +.p2align 2 .globl d d: .4byte 8089 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/backpropmask.mips.s b/tests/Regression/CodeGen/llvmTests/backpropmask.mips.s index f1c686f68..59cc58451 
100644 --- a/tests/Regression/CodeGen/llvmTests/backpropmask.mips.s +++ b/tests/Regression/CodeGen/llvmTests/backpropmask.mips.s @@ -1,18 +1,18 @@ .data .data -.align 4 +.p2align 2 .globl b b: .4byte 918 -.align 4 +.p2align 2 .globl d d: .4byte 8089 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/backpropmask.riscv.s b/tests/Regression/CodeGen/llvmTests/backpropmask.riscv.s index 9fe31f6bb..786f86181 100644 --- a/tests/Regression/CodeGen/llvmTests/backpropmask.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/backpropmask.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl b b: .4byte 918 -.align 4 +.p2align 2 .globl d d: .4byte 8089 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/beqzc.arm.s b/tests/Regression/CodeGen/llvmTests/beqzc.arm.s index f7a2177c8..290fb57e6 100644 --- a/tests/Regression/CodeGen/llvmTests/beqzc.arm.s +++ b/tests/Regression/CodeGen/llvmTests/beqzc.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 -.align 4 +.p2align 2 .globl j j: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/beqzc.mips.s b/tests/Regression/CodeGen/llvmTests/beqzc.mips.s index b45d61f58..b93c636d9 100644 --- a/tests/Regression/CodeGen/llvmTests/beqzc.mips.s +++ b/tests/Regression/CodeGen/llvmTests/beqzc.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 -.align 4 +.p2align 2 .globl j j: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/beqzc.riscv.s b/tests/Regression/CodeGen/llvmTests/beqzc.riscv.s index bcfa420c9..d2dc5dec5 100644 --- a/tests/Regression/CodeGen/llvmTests/beqzc.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/beqzc.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 -.align 4 +.p2align 2 .globl j j: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/beqzc1.arm.s b/tests/Regression/CodeGen/llvmTests/beqzc1.arm.s index 9214b4133..bd476c443 100644 --- a/tests/Regression/CodeGen/llvmTests/beqzc1.arm.s +++ b/tests/Regression/CodeGen/llvmTests/beqzc1.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 -.align 4 +.p2align 2 .globl j j: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/beqzc1.mips.s b/tests/Regression/CodeGen/llvmTests/beqzc1.mips.s index 775898571..7bc9f2bed 100644 --- a/tests/Regression/CodeGen/llvmTests/beqzc1.mips.s +++ b/tests/Regression/CodeGen/llvmTests/beqzc1.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 -.align 4 +.p2align 2 .globl j j: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/beqzc1.riscv.s b/tests/Regression/CodeGen/llvmTests/beqzc1.riscv.s index e8f6cb558..60463f372 100644 --- a/tests/Regression/CodeGen/llvmTests/beqzc1.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/beqzc1.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 -.align 4 +.p2align 2 .globl j j: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconeq.arm.s b/tests/Regression/CodeGen/llvmTests/brconeq.arm.s index 4c12067a7..3c0e38196 100644 --- a/tests/Regression/CodeGen/llvmTests/brconeq.arm.s +++ 
b/tests/Regression/CodeGen/llvmTests/brconeq.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconeq.mips.s b/tests/Regression/CodeGen/llvmTests/brconeq.mips.s index 619b09a62..ed13df28d 100644 --- a/tests/Regression/CodeGen/llvmTests/brconeq.mips.s +++ b/tests/Regression/CodeGen/llvmTests/brconeq.mips.s @@ -1,14 +1,14 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconeq.riscv.s b/tests/Regression/CodeGen/llvmTests/brconeq.riscv.s index c90569a79..0e347eb1e 100644 --- a/tests/Regression/CodeGen/llvmTests/brconeq.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/brconeq.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconeqk.arm.s b/tests/Regression/CodeGen/llvmTests/brconeqk.arm.s index eeac2f07f..fda49f887 100644 --- a/tests/Regression/CodeGen/llvmTests/brconeqk.arm.s +++ b/tests/Regression/CodeGen/llvmTests/brconeqk.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconeqk.mips.s b/tests/Regression/CodeGen/llvmTests/brconeqk.mips.s index 170e12077..490cb7b23 100644 --- a/tests/Regression/CodeGen/llvmTests/brconeqk.mips.s +++ b/tests/Regression/CodeGen/llvmTests/brconeqk.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconeqk.riscv.s b/tests/Regression/CodeGen/llvmTests/brconeqk.riscv.s index 055083ff2..f0d0e02bc 100644 --- a/tests/Regression/CodeGen/llvmTests/brconeqk.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/brconeqk.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconeqz.arm.s b/tests/Regression/CodeGen/llvmTests/brconeqz.arm.s index 266f54a53..745d8989c 100644 --- a/tests/Regression/CodeGen/llvmTests/brconeqz.arm.s +++ b/tests/Regression/CodeGen/llvmTests/brconeqz.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconeqz.mips.s b/tests/Regression/CodeGen/llvmTests/brconeqz.mips.s index 71a98a15c..676eb22fb 100644 --- a/tests/Regression/CodeGen/llvmTests/brconeqz.mips.s +++ b/tests/Regression/CodeGen/llvmTests/brconeqz.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconeqz.riscv.s b/tests/Regression/CodeGen/llvmTests/brconeqz.riscv.s index 70b3fb698..00ed34b5d 100644 --- a/tests/Regression/CodeGen/llvmTests/brconeqz.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/brconeqz.riscv.s @@ -1,11 +1,11 @@ .attribute 
arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconge.arm.s b/tests/Regression/CodeGen/llvmTests/brconge.arm.s index 4458726e5..d600d22a8 100644 --- a/tests/Regression/CodeGen/llvmTests/brconge.arm.s +++ b/tests/Regression/CodeGen/llvmTests/brconge.arm.s @@ -1,23 +1,23 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 5 -.align 4 +.p2align 2 .globl result1 result1: .4byte 0 -.align 4 +.p2align 2 .globl result2 result2: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/brconge.mips.s b/tests/Regression/CodeGen/llvmTests/brconge.mips.s index 9eeba6841..81ecc179c 100644 --- a/tests/Regression/CodeGen/llvmTests/brconge.mips.s +++ b/tests/Regression/CodeGen/llvmTests/brconge.mips.s @@ -1,22 +1,22 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 5 -.align 4 +.p2align 2 .globl result1 result1: .4byte 0 -.align 4 +.p2align 2 .globl result2 result2: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/brconge.riscv.s b/tests/Regression/CodeGen/llvmTests/brconge.riscv.s index 0ed34ac28..8cda5b8d4 100644 --- a/tests/Regression/CodeGen/llvmTests/brconge.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/brconge.riscv.s @@ -1,23 +1,23 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 5 -.align 4 +.p2align 2 .globl result1 result1: .4byte 0 -.align 4 +.p2align 2 .globl result2 result2: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/brcongt.arm.s b/tests/Regression/CodeGen/llvmTests/brcongt.arm.s index 06159929c..3a6b2d085 100644 --- a/tests/Regression/CodeGen/llvmTests/brcongt.arm.s +++ b/tests/Regression/CodeGen/llvmTests/brcongt.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brcongt.mips.s b/tests/Regression/CodeGen/llvmTests/brcongt.mips.s index e0b767304..d26c254c7 100644 --- a/tests/Regression/CodeGen/llvmTests/brcongt.mips.s +++ b/tests/Regression/CodeGen/llvmTests/brcongt.mips.s @@ -1,18 +1,18 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brcongt.riscv.s b/tests/Regression/CodeGen/llvmTests/brcongt.riscv.s index 463bab0f3..f429abecb 100644 --- a/tests/Regression/CodeGen/llvmTests/brcongt.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/brcongt.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconle.arm.s b/tests/Regression/CodeGen/llvmTests/brconle.arm.s index 62d2d4f03..602b00b46 100644 --- 
a/tests/Regression/CodeGen/llvmTests/brconle.arm.s +++ b/tests/Regression/CodeGen/llvmTests/brconle.arm.s @@ -1,23 +1,23 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 4294967291 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 4294967291 -.align 4 +.p2align 2 .globl result1 result1: .4byte 0 -.align 4 +.p2align 2 .globl result2 result2: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/brconle.mips.s b/tests/Regression/CodeGen/llvmTests/brconle.mips.s index 5ac699127..3a1896278 100644 --- a/tests/Regression/CodeGen/llvmTests/brconle.mips.s +++ b/tests/Regression/CodeGen/llvmTests/brconle.mips.s @@ -1,22 +1,22 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 4294967291 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 4294967291 -.align 4 +.p2align 2 .globl result1 result1: .4byte 0 -.align 4 +.p2align 2 .globl result2 result2: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/brconle.riscv.s b/tests/Regression/CodeGen/llvmTests/brconle.riscv.s index c69083097..4f5a2b259 100644 --- a/tests/Regression/CodeGen/llvmTests/brconle.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/brconle.riscv.s @@ -1,23 +1,23 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 4294967291 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 4294967291 -.align 4 +.p2align 2 .globl result1 result1: .4byte 0 -.align 4 +.p2align 2 .globl result2 result2: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/brconlt.arm.s b/tests/Regression/CodeGen/llvmTests/brconlt.arm.s index 2bea9028f..ccc2234b4 100644 --- a/tests/Regression/CodeGen/llvmTests/brconlt.arm.s +++ b/tests/Regression/CodeGen/llvmTests/brconlt.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconlt.mips.s b/tests/Regression/CodeGen/llvmTests/brconlt.mips.s index 745ed6cf5..d8af0f7f1 100644 --- a/tests/Regression/CodeGen/llvmTests/brconlt.mips.s +++ b/tests/Regression/CodeGen/llvmTests/brconlt.mips.s @@ -1,18 +1,18 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconlt.riscv.s b/tests/Regression/CodeGen/llvmTests/brconlt.riscv.s index 458e69e2c..aa4fa3e41 100644 --- a/tests/Regression/CodeGen/llvmTests/brconlt.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/brconlt.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconne.arm.s b/tests/Regression/CodeGen/llvmTests/brconne.arm.s index c5791504a..1e3434a81 100644 --- a/tests/Regression/CodeGen/llvmTests/brconne.arm.s +++ b/tests/Regression/CodeGen/llvmTests/brconne.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl result result: 
.4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconne.mips.s b/tests/Regression/CodeGen/llvmTests/brconne.mips.s index d7c6b77c4..94fcc3e37 100644 --- a/tests/Regression/CodeGen/llvmTests/brconne.mips.s +++ b/tests/Regression/CodeGen/llvmTests/brconne.mips.s @@ -1,14 +1,14 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconne.riscv.s b/tests/Regression/CodeGen/llvmTests/brconne.riscv.s index 22c4f2786..5bd3d0696 100644 --- a/tests/Regression/CodeGen/llvmTests/brconne.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/brconne.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 5 -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconnek.arm.s b/tests/Regression/CodeGen/llvmTests/brconnek.arm.s index bd8d62c77..d2851037c 100644 --- a/tests/Regression/CodeGen/llvmTests/brconnek.arm.s +++ b/tests/Regression/CodeGen/llvmTests/brconnek.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconnek.mips.s b/tests/Regression/CodeGen/llvmTests/brconnek.mips.s index b50820c6c..a4a0b3511 100644 --- a/tests/Regression/CodeGen/llvmTests/brconnek.mips.s +++ b/tests/Regression/CodeGen/llvmTests/brconnek.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconnek.riscv.s b/tests/Regression/CodeGen/llvmTests/brconnek.riscv.s index 087ac2128..46519f958 100644 --- a/tests/Regression/CodeGen/llvmTests/brconnek.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/brconnek.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconnez.arm.s b/tests/Regression/CodeGen/llvmTests/brconnez.arm.s index 3a49d50b2..431b3da32 100644 --- a/tests/Regression/CodeGen/llvmTests/brconnez.arm.s +++ b/tests/Regression/CodeGen/llvmTests/brconnez.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl j j: .4byte 0 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconnez.mips.s b/tests/Regression/CodeGen/llvmTests/brconnez.mips.s index ea6d29cbc..aadc23e5c 100644 --- a/tests/Regression/CodeGen/llvmTests/brconnez.mips.s +++ b/tests/Regression/CodeGen/llvmTests/brconnez.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl j j: .4byte 0 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/brconnez.riscv.s b/tests/Regression/CodeGen/llvmTests/brconnez.riscv.s index f888e130d..6b8be4ff1 100644 --- a/tests/Regression/CodeGen/llvmTests/brconnez.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/brconnez.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl j j: .4byte 0 -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git 
a/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..arm.s b/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..arm.s index 2862f7131..22d4812cf 100644 --- a/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..arm.s +++ b/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl counter counter: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..mips.s b/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..mips.s index 24c1a4f12..15171c337 100644 --- a/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..mips.s +++ b/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl counter counter: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..riscv.s b/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..riscv.s index 8756bdbce..3195c7743 100644 --- a/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..riscv.s +++ b/tests/Regression/CodeGen/llvmTests/cmse-expand-bxns-ret..riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl counter counter: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.arm.s b/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.arm.s index 8c6f7d7c4..dbd24eeed 100644 --- a/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.arm.s +++ b/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 .globl g g: .zero 40 diff --git a/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.mips.s b/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.mips.s index eb42bd1c3..aada3ee96 100644 --- a/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.mips.s +++ b/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 8 +.p2align 3 .globl g g: .zero 40 diff --git a/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.riscv.s b/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.riscv.s index 5a1e7dd10..84ab31289 100644 --- a/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/codegen-prepare-crash.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 .globl g g: .zero 40 diff --git a/tests/Regression/CodeGen/llvmTests/coff-exclude.arm.s b/tests/Regression/CodeGen/llvmTests/coff-exclude.arm.s index 9fd2b6dc9..ba5616541 100644 --- a/tests/Regression/CodeGen/llvmTests/coff-exclude.arm.s +++ b/tests/Regression/CodeGen/llvmTests/coff-exclude.arm.s @@ -1,31 +1,31 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl a a: .4byte 1 -.align 4 +.p2align 2 .globl b b: .4byte 1 -.align 4 +.p2align 2 .globl c c: .4byte 1 -.align 4 +.p2align 2 .globl d d: .4byte 1 -.align 4 +.p2align 2 .globl e e: .4byte 1 -.align 4 +.p2align 2 .globl f f: .4byte 1 -.align 4 +.p2align 2 .globl g g: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/coff-exclude.mips.s b/tests/Regression/CodeGen/llvmTests/coff-exclude.mips.s index 699d58081..bfe064808 100644 --- a/tests/Regression/CodeGen/llvmTests/coff-exclude.mips.s +++ b/tests/Regression/CodeGen/llvmTests/coff-exclude.mips.s @@ -1,30 +1,30 @@ .data .data -.align 4 +.p2align 2 .globl a a: .4byte 1 -.align 4 +.p2align 2 
.globl b b: .4byte 1 -.align 4 +.p2align 2 .globl c c: .4byte 1 -.align 4 +.p2align 2 .globl d d: .4byte 1 -.align 4 +.p2align 2 .globl e e: .4byte 1 -.align 4 +.p2align 2 .globl f f: .4byte 1 -.align 4 +.p2align 2 .globl g g: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/coff-exclude.riscv.s b/tests/Regression/CodeGen/llvmTests/coff-exclude.riscv.s index a8204567d..d7e29dc84 100644 --- a/tests/Regression/CodeGen/llvmTests/coff-exclude.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/coff-exclude.riscv.s @@ -1,31 +1,31 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl a a: .4byte 1 -.align 4 +.p2align 2 .globl b b: .4byte 1 -.align 4 +.p2align 2 .globl c c: .4byte 1 -.align 4 +.p2align 2 .globl d d: .4byte 1 -.align 4 +.p2align 2 .globl e e: .4byte 1 -.align 4 +.p2align 2 .globl f f: .4byte 1 -.align 4 +.p2align 2 .globl g g: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..arm.s b/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..arm.s index b8f29ed46..353040845 100644 --- a/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..arm.s +++ b/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl b b: .4byte 918 -.align 4 +.p2align 2 .globl d d: .4byte 8089 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..mips.s b/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..mips.s index f1c686f68..59cc58451 100644 --- a/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..mips.s +++ b/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..mips.s @@ -1,18 +1,18 @@ .data .data -.align 4 +.p2align 2 .globl b b: .4byte 918 -.align 4 +.p2align 2 .globl d d: .4byte 8089 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..riscv.s b/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..riscv.s index 9fe31f6bb..786f86181 100644 --- a/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..riscv.s +++ b/tests/Regression/CodeGen/llvmTests/dbg-value-superreg-copy2..riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl b b: .4byte 918 -.align 4 +.p2align 2 .globl d d: .4byte 8089 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/disable-tail-merge.arm.s b/tests/Regression/CodeGen/llvmTests/disable-tail-merge.arm.s index 496b97cc0..ff4cfc486 100644 --- a/tests/Regression/CodeGen/llvmTests/disable-tail-merge.arm.s +++ b/tests/Regression/CodeGen/llvmTests/disable-tail-merge.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl g0 g0: .4byte 0 -.align 4 +.p2align 2 .globl g1 g1: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/disable-tail-merge.mips.s b/tests/Regression/CodeGen/llvmTests/disable-tail-merge.mips.s index f265f39d5..45aee99a7 100644 --- a/tests/Regression/CodeGen/llvmTests/disable-tail-merge.mips.s +++ b/tests/Regression/CodeGen/llvmTests/disable-tail-merge.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl g0 g0: .4byte 0 -.align 4 +.p2align 2 .globl g1 g1: .4byte 0 diff --git 
a/tests/Regression/CodeGen/llvmTests/disable-tail-merge.riscv.s b/tests/Regression/CodeGen/llvmTests/disable-tail-merge.riscv.s index b05e73bb7..5f9f22780 100644 --- a/tests/Regression/CodeGen/llvmTests/disable-tail-merge.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/disable-tail-merge.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl g0 g0: .4byte 0 -.align 4 +.p2align 2 .globl g1 g1: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/div.arm.s b/tests/Regression/CodeGen/llvmTests/div.arm.s index 6d7a5f6f0..d0a3001ea 100644 --- a/tests/Regression/CodeGen/llvmTests/div.arm.s +++ b/tests/Regression/CodeGen/llvmTests/div.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 100 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4294967292 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/div.mips.s b/tests/Regression/CodeGen/llvmTests/div.mips.s index 01ef53f2a..535c69799 100644 --- a/tests/Regression/CodeGen/llvmTests/div.mips.s +++ b/tests/Regression/CodeGen/llvmTests/div.mips.s @@ -1,14 +1,14 @@ .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 100 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4294967292 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/div.riscv.s b/tests/Regression/CodeGen/llvmTests/div.riscv.s index 577490b2b..6dab38849 100644 --- a/tests/Regression/CodeGen/llvmTests/div.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/div.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 100 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4294967292 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/div_rem.arm.s b/tests/Regression/CodeGen/llvmTests/div_rem.arm.s index 00a7833ad..96e8c456a 100644 --- a/tests/Regression/CodeGen/llvmTests/div_rem.arm.s +++ b/tests/Regression/CodeGen/llvmTests/div_rem.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 103 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4294967292 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 -.align 4 +.p2align 2 .globl llll llll: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/div_rem.mips.s b/tests/Regression/CodeGen/llvmTests/div_rem.mips.s index 07a482a94..799ba5b4d 100644 --- a/tests/Regression/CodeGen/llvmTests/div_rem.mips.s +++ b/tests/Regression/CodeGen/llvmTests/div_rem.mips.s @@ -1,18 +1,18 @@ .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 103 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4294967292 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 -.align 4 +.p2align 2 .globl llll llll: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/div_rem.riscv.s b/tests/Regression/CodeGen/llvmTests/div_rem.riscv.s index 6f549d307..aa058478f 100644 --- a/tests/Regression/CodeGen/llvmTests/div_rem.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/div_rem.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 103 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4294967292 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 -.align 4 +.p2align 2 .globl llll llll: .4byte 0 diff --git 
a/tests/Regression/CodeGen/llvmTests/divu.arm.s b/tests/Regression/CodeGen/llvmTests/divu.arm.s index 7d3657add..3d2fcf4a3 100644 --- a/tests/Regression/CodeGen/llvmTests/divu.arm.s +++ b/tests/Regression/CodeGen/llvmTests/divu.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 100 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/divu.mips.s b/tests/Regression/CodeGen/llvmTests/divu.mips.s index 421e48a3d..1be96b89c 100644 --- a/tests/Regression/CodeGen/llvmTests/divu.mips.s +++ b/tests/Regression/CodeGen/llvmTests/divu.mips.s @@ -1,14 +1,14 @@ .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 100 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/divu.riscv.s b/tests/Regression/CodeGen/llvmTests/divu.riscv.s index 267589a84..c8f10dd51 100644 --- a/tests/Regression/CodeGen/llvmTests/divu.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/divu.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 100 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/divu_remu.arm.s b/tests/Regression/CodeGen/llvmTests/divu_remu.arm.s index 5870f8ff3..bb8184247 100644 --- a/tests/Regression/CodeGen/llvmTests/divu_remu.arm.s +++ b/tests/Regression/CodeGen/llvmTests/divu_remu.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 103 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 -.align 4 +.p2align 2 .globl llll llll: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/divu_remu.mips.s b/tests/Regression/CodeGen/llvmTests/divu_remu.mips.s index 41bfa261b..95b7e9cd8 100644 --- a/tests/Regression/CodeGen/llvmTests/divu_remu.mips.s +++ b/tests/Regression/CodeGen/llvmTests/divu_remu.mips.s @@ -1,18 +1,18 @@ .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 103 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 -.align 4 +.p2align 2 .globl llll llll: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/divu_remu.riscv.s b/tests/Regression/CodeGen/llvmTests/divu_remu.riscv.s index be9863fc8..52de257b4 100644 --- a/tests/Regression/CodeGen/llvmTests/divu_remu.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/divu_remu.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl iiii iiii: .4byte 103 -.align 4 +.p2align 2 .globl jjjj jjjj: .4byte 4 -.align 4 +.p2align 2 .globl kkkk kkkk: .4byte 0 -.align 4 +.p2align 2 .globl llll llll: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/elf-comdat.arm.s b/tests/Regression/CodeGen/llvmTests/elf-comdat.arm.s index 3dacca679..ac4f591d7 100644 --- a/tests/Regression/CodeGen/llvmTests/elf-comdat.arm.s +++ b/tests/Regression/CodeGen/llvmTests/elf-comdat.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl v v: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/elf-comdat.mips.s b/tests/Regression/CodeGen/llvmTests/elf-comdat.mips.s index ed91ed0dd..d5937be39 100644 --- a/tests/Regression/CodeGen/llvmTests/elf-comdat.mips.s +++ 
b/tests/Regression/CodeGen/llvmTests/elf-comdat.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl v v: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/elf-comdat.riscv.s b/tests/Regression/CodeGen/llvmTests/elf-comdat.riscv.s index 5a2655b37..cd004ea56 100644 --- a/tests/Regression/CodeGen/llvmTests/elf-comdat.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/elf-comdat.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl v v: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/elf-comdat2.arm.s b/tests/Regression/CodeGen/llvmTests/elf-comdat2.arm.s index dbb9077a1..68b4f2dc6 100644 --- a/tests/Regression/CodeGen/llvmTests/elf-comdat2.arm.s +++ b/tests/Regression/CodeGen/llvmTests/elf-comdat2.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl bar bar: .4byte 42 -.align 4 +.p2align 2 .globl foo foo: .4byte 42 diff --git a/tests/Regression/CodeGen/llvmTests/elf-comdat2.mips.s b/tests/Regression/CodeGen/llvmTests/elf-comdat2.mips.s index 1aa71c636..f935a4273 100644 --- a/tests/Regression/CodeGen/llvmTests/elf-comdat2.mips.s +++ b/tests/Regression/CodeGen/llvmTests/elf-comdat2.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl bar bar: .4byte 42 -.align 4 +.p2align 2 .globl foo foo: .4byte 42 diff --git a/tests/Regression/CodeGen/llvmTests/elf-comdat2.riscv.s b/tests/Regression/CodeGen/llvmTests/elf-comdat2.riscv.s index 84ea459ff..337719e86 100644 --- a/tests/Regression/CodeGen/llvmTests/elf-comdat2.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/elf-comdat2.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl bar bar: .4byte 42 -.align 4 +.p2align 2 .globl foo foo: .4byte 42 diff --git a/tests/Regression/CodeGen/llvmTests/elf-exclude.arm.s b/tests/Regression/CodeGen/llvmTests/elf-exclude.arm.s index 9fd2b6dc9..ba5616541 100644 --- a/tests/Regression/CodeGen/llvmTests/elf-exclude.arm.s +++ b/tests/Regression/CodeGen/llvmTests/elf-exclude.arm.s @@ -1,31 +1,31 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl a a: .4byte 1 -.align 4 +.p2align 2 .globl b b: .4byte 1 -.align 4 +.p2align 2 .globl c c: .4byte 1 -.align 4 +.p2align 2 .globl d d: .4byte 1 -.align 4 +.p2align 2 .globl e e: .4byte 1 -.align 4 +.p2align 2 .globl f f: .4byte 1 -.align 4 +.p2align 2 .globl g g: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/elf-exclude.mips.s b/tests/Regression/CodeGen/llvmTests/elf-exclude.mips.s index 699d58081..bfe064808 100644 --- a/tests/Regression/CodeGen/llvmTests/elf-exclude.mips.s +++ b/tests/Regression/CodeGen/llvmTests/elf-exclude.mips.s @@ -1,30 +1,30 @@ .data .data -.align 4 +.p2align 2 .globl a a: .4byte 1 -.align 4 +.p2align 2 .globl b b: .4byte 1 -.align 4 +.p2align 2 .globl c c: .4byte 1 -.align 4 +.p2align 2 .globl d d: .4byte 1 -.align 4 +.p2align 2 .globl e e: .4byte 1 -.align 4 +.p2align 2 .globl f f: .4byte 1 -.align 4 +.p2align 2 .globl g g: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/elf-exclude.riscv.s b/tests/Regression/CodeGen/llvmTests/elf-exclude.riscv.s index a8204567d..d7e29dc84 100644 --- a/tests/Regression/CodeGen/llvmTests/elf-exclude.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/elf-exclude.riscv.s @@ -1,31 +1,31 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl a a: .4byte 1 -.align 4 
+.p2align 2 .globl b b: .4byte 1 -.align 4 +.p2align 2 .globl c c: .4byte 1 -.align 4 +.p2align 2 .globl d d: .4byte 1 -.align 4 +.p2align 2 .globl e e: .4byte 1 -.align 4 +.p2align 2 .globl f f: .4byte 1 -.align 4 +.p2align 2 .globl g g: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.arm.s b/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.arm.s index 11f105a86..3199f594d 100644 --- a/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.arm.s +++ b/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl var var: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.mips.s b/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.mips.s index a09e3bd36..892a3fb78 100644 --- a/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.mips.s +++ b/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl var var: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.riscv.s b/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.riscv.s index 61a46c5cc..53f4e07d6 100644 --- a/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/emergency-spill-slot.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl var var: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/extloadi1.arm.s b/tests/Regression/CodeGen/llvmTests/extloadi1.arm.s index ea3fa523e..a76d50a00 100644 --- a/tests/Regression/CodeGen/llvmTests/extloadi1.arm.s +++ b/tests/Regression/CodeGen/llvmTests/extloadi1.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl handler_installed_6144_b handler_installed_6144_b: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/extloadi1.mips.s b/tests/Regression/CodeGen/llvmTests/extloadi1.mips.s index 3e6a1c960..b4f9eadeb 100644 --- a/tests/Regression/CodeGen/llvmTests/extloadi1.mips.s +++ b/tests/Regression/CodeGen/llvmTests/extloadi1.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl handler_installed_6144_b handler_installed_6144_b: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/extloadi1.riscv.s b/tests/Regression/CodeGen/llvmTests/extloadi1.riscv.s index f5f247e01..1c474709a 100644 --- a/tests/Regression/CodeGen/llvmTests/extloadi1.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/extloadi1.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl handler_installed_6144_b handler_installed_6144_b: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.arm.s b/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.arm.s index 3b434546f..507ec4efe 100644 --- a/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.arm.s +++ b/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.arm.s @@ -1,16 +1,15 @@ .arch armv7ve .data .data -.align 1 .globl a a: .byte 1 -.align 2 +.p2align 1 .globl b b: .byte 2 .byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 4 diff --git a/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.mips.s b/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.mips.s index a3809dac2..cdd1a760a 100644 --- a/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.mips.s +++ 
b/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.mips.s @@ -1,15 +1,14 @@ .data .data -.align 1 .globl a a: .byte 1 -.align 2 +.p2align 1 .globl b b: .byte 2 .byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 4 diff --git a/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.riscv.s b/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.riscv.s index 9ae3f211b..4f0400951 100644 --- a/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/fast-isel-load-store-verify.riscv.s @@ -1,16 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 1 .globl a a: .byte 1 -.align 2 +.p2align 1 .globl b b: .byte 2 .byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 4 diff --git a/tests/Regression/CodeGen/llvmTests/fast-isel-pic.arm.s b/tests/Regression/CodeGen/llvmTests/fast-isel-pic.arm.s index 7397d9186..c11e79281 100644 --- a/tests/Regression/CodeGen/llvmTests/fast-isel-pic.arm.s +++ b/tests/Regression/CodeGen/llvmTests/fast-isel-pic.arm.s @@ -1,12 +1,12 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl g g: .4byte 0 .bss -.align 4 +.p2align 2 .globl i i: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/fast-isel-pic.mips.s b/tests/Regression/CodeGen/llvmTests/fast-isel-pic.mips.s index 55162c91e..4e0bd4b57 100644 --- a/tests/Regression/CodeGen/llvmTests/fast-isel-pic.mips.s +++ b/tests/Regression/CodeGen/llvmTests/fast-isel-pic.mips.s @@ -1,11 +1,11 @@ .data .data -.align 4 +.p2align 2 .globl g g: .4byte 0 .bss -.align 4 +.p2align 2 .globl i i: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/fast-isel-pic.riscv.s b/tests/Regression/CodeGen/llvmTests/fast-isel-pic.riscv.s index d5a61d25a..d4c5a7891 100644 --- a/tests/Regression/CodeGen/llvmTests/fast-isel-pic.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/fast-isel-pic.riscv.s @@ -1,12 +1,12 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl g g: .4byte 0 .bss -.align 4 +.p2align 2 .globl i i: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/fast-isel-tls.arm.s b/tests/Regression/CodeGen/llvmTests/fast-isel-tls.arm.s index 8b94a23bf..e0d2c7cb6 100644 --- a/tests/Regression/CodeGen/llvmTests/fast-isel-tls.arm.s +++ b/tests/Regression/CodeGen/llvmTests/fast-isel-tls.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl v v: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/fast-isel-tls.mips.s b/tests/Regression/CodeGen/llvmTests/fast-isel-tls.mips.s index 936a3be8f..dc104858e 100644 --- a/tests/Regression/CodeGen/llvmTests/fast-isel-tls.mips.s +++ b/tests/Regression/CodeGen/llvmTests/fast-isel-tls.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl v v: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/fast-isel-tls.riscv.s b/tests/Regression/CodeGen/llvmTests/fast-isel-tls.riscv.s index 3bad81231..e7811e1ee 100644 --- a/tests/Regression/CodeGen/llvmTests/fast-isel-tls.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/fast-isel-tls.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl v v: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/float-imm.mips.s b/tests/Regression/CodeGen/llvmTests/float-imm.mips.s index 2ac1870a7..0bfd4b9fa 100644 --- a/tests/Regression/CodeGen/llvmTests/float-imm.mips.s +++ 
b/tests/Regression/CodeGen/llvmTests/float-imm.mips.s @@ -1,6 +1,6 @@ .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1078530011 .4byte 1065353216 diff --git a/tests/Regression/CodeGen/llvmTests/float-imm.riscv.s b/tests/Regression/CodeGen/llvmTests/float-imm.riscv.s index 656797f2a..ed53092e3 100644 --- a/tests/Regression/CodeGen/llvmTests/float-imm.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/float-imm.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1078530011 .text diff --git a/tests/Regression/CodeGen/llvmTests/float-select-icmp.mips.s b/tests/Regression/CodeGen/llvmTests/float-select-icmp.mips.s index 09bdc6a54..5d6f12f81 100644 --- a/tests/Regression/CodeGen/llvmTests/float-select-icmp.mips.s +++ b/tests/Regression/CodeGen/llvmTests/float-select-icmp.mips.s @@ -1,6 +1,6 @@ .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1065353216 .text diff --git a/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.arm.s b/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.arm.s index 65b0a5922..941af155b 100644 --- a/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.arm.s +++ b/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 .globl B B: .zero 1000 -.align 8 +.p2align 3 .globl A A: .zero 1000 -.align 8 +.p2align 3 .globl P P: .zero 1000 diff --git a/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.mips.s b/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.mips.s index ad8dbd9ad..d9be9828f 100644 --- a/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.mips.s +++ b/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.mips.s @@ -1,14 +1,14 @@ .data .bss -.align 8 +.p2align 3 .globl B B: .zero 1000 -.align 8 +.p2align 3 .globl A A: .zero 1000 -.align 8 +.p2align 3 .globl P P: .zero 1000 diff --git a/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.riscv.s b/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.riscv.s index b9c684d46..74c7b54fa 100644 --- a/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/fold-mul-lohi.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 .globl B B: .zero 1000 -.align 8 +.p2align 3 .globl A A: .zero 1000 -.align 8 +.p2align 3 .globl P P: .zero 1000 diff --git a/tests/Regression/CodeGen/llvmTests/fp-fast.mips.s b/tests/Regression/CodeGen/llvmTests/fp-fast.mips.s index 031f077c4..1edeae40a 100644 --- a/tests/Regression/CodeGen/llvmTests/fp-fast.mips.s +++ b/tests/Regression/CodeGen/llvmTests/fp-fast.mips.s @@ -1,6 +1,6 @@ .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1082130432 .text diff --git a/tests/Regression/CodeGen/llvmTests/fp16static.arm.s b/tests/Regression/CodeGen/llvmTests/fp16static.arm.s index f63ed2544..0f5780768 100644 --- a/tests/Regression/CodeGen/llvmTests/fp16static.arm.s +++ b/tests/Regression/CodeGen/llvmTests/fp16static.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/fp16static.mips.s b/tests/Regression/CodeGen/llvmTests/fp16static.mips.s index d973649a0..350e3ffbd 100644 --- a/tests/Regression/CodeGen/llvmTests/fp16static.mips.s +++ b/tests/Regression/CodeGen/llvmTests/fp16static.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 
.globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/fp16static.riscv.s b/tests/Regression/CodeGen/llvmTests/fp16static.riscv.s index ebe00624f..2639c17f3 100644 --- a/tests/Regression/CodeGen/llvmTests/fp16static.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/fp16static.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/fpnotneeded.arm.s b/tests/Regression/CodeGen/llvmTests/fpnotneeded.arm.s index 994df6154..85d4299b8 100644 --- a/tests/Regression/CodeGen/llvmTests/fpnotneeded.arm.s +++ b/tests/Regression/CodeGen/llvmTests/fpnotneeded.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl f f: .4byte 1065353216 diff --git a/tests/Regression/CodeGen/llvmTests/fpnotneeded.mips.s b/tests/Regression/CodeGen/llvmTests/fpnotneeded.mips.s index 7987b5d23..ef50df2c3 100644 --- a/tests/Regression/CodeGen/llvmTests/fpnotneeded.mips.s +++ b/tests/Regression/CodeGen/llvmTests/fpnotneeded.mips.s @@ -1,15 +1,15 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl f f: .4byte 1065353216 .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1073741824 .4byte 1065353216 diff --git a/tests/Regression/CodeGen/llvmTests/fpnotneeded.riscv.s b/tests/Regression/CodeGen/llvmTests/fpnotneeded.riscv.s index e75f75245..25ff19b3e 100644 --- a/tests/Regression/CodeGen/llvmTests/fpnotneeded.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/fpnotneeded.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl f f: .4byte 1065353216 diff --git a/tests/Regression/CodeGen/llvmTests/global-address.arm.s b/tests/Regression/CodeGen/llvmTests/global-address.arm.s index 1569cd8a6..30e1a3f9a 100644 --- a/tests/Regression/CodeGen/llvmTests/global-address.arm.s +++ b/tests/Regression/CodeGen/llvmTests/global-address.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl g g: .4byte 0 -.align 4 +.p2align 2 .globl G G: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/global-address.mips.s b/tests/Regression/CodeGen/llvmTests/global-address.mips.s index 81eaa33e1..f1030841b 100644 --- a/tests/Regression/CodeGen/llvmTests/global-address.mips.s +++ b/tests/Regression/CodeGen/llvmTests/global-address.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl g g: .4byte 0 -.align 4 +.p2align 2 .globl G G: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/global-address.riscv.s b/tests/Regression/CodeGen/llvmTests/global-address.riscv.s index 059ad0d99..a90bcfeb6 100644 --- a/tests/Regression/CodeGen/llvmTests/global-address.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/global-address.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl g g: .4byte 0 -.align 4 +.p2align 2 .globl G G: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.arm.s b/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.arm.s index 97ff45772..27fed5588 100644 --- a/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.arm.s +++ b/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 
+.p2align 2 .globl x x: .4byte 0 -.align 4 +.p2align 2 .globl y y: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.mips.s b/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.mips.s index 9dfc407a8..1436d0d94 100644 --- a/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.mips.s +++ b/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 -.align 4 +.p2align 2 .globl y y: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.riscv.s b/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.riscv.s index ee3ff8762..78eb68082 100644 --- a/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/global-merge-dllexport.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 -.align 4 +.p2align 2 .globl y y: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/hf1_body.arm.s b/tests/Regression/CodeGen/llvmTests/hf1_body.arm.s index bd8d4b082..144f15da6 100644 --- a/tests/Regression/CodeGen/llvmTests/hf1_body.arm.s +++ b/tests/Regression/CodeGen/llvmTests/hf1_body.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl x x: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/hf1_body.mips.s b/tests/Regression/CodeGen/llvmTests/hf1_body.mips.s index 2733b549a..b64863321 100644 --- a/tests/Regression/CodeGen/llvmTests/hf1_body.mips.s +++ b/tests/Regression/CodeGen/llvmTests/hf1_body.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl x x: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/hf1_body.riscv.s b/tests/Regression/CodeGen/llvmTests/hf1_body.riscv.s index 645a8856b..d517026eb 100644 --- a/tests/Regression/CodeGen/llvmTests/hf1_body.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/hf1_body.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl x x: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis-2.arm.s b/tests/Regression/CodeGen/llvmTests/hidden-vis-2.arm.s index 912d89437..4aa4cae7c 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis-2.arm.s +++ b/tests/Regression/CodeGen/llvmTests/hidden-vis-2.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis-2.mips.s b/tests/Regression/CodeGen/llvmTests/hidden-vis-2.mips.s index 3ba3eef59..27376a194 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis-2.mips.s +++ b/tests/Regression/CodeGen/llvmTests/hidden-vis-2.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis-2.riscv.s b/tests/Regression/CodeGen/llvmTests/hidden-vis-2.riscv.s index 3db757ab7..0f961206d 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis-2.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/hidden-vis-2.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis-3.arm.s b/tests/Regression/CodeGen/llvmTests/hidden-vis-3.arm.s index 1e8fcbeb8..404ec9198 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis-3.arm.s +++ 
b/tests/Regression/CodeGen/llvmTests/hidden-vis-3.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl x x: .zero 4 -.align 4 +.p2align 2 .globl y y: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis-3.mips.s b/tests/Regression/CodeGen/llvmTests/hidden-vis-3.mips.s index 69f4ef90a..75c71d138 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis-3.mips.s +++ b/tests/Regression/CodeGen/llvmTests/hidden-vis-3.mips.s @@ -1,10 +1,10 @@ .data .bss -.align 4 +.p2align 2 .globl x x: .zero 4 -.align 4 +.p2align 2 .globl y y: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis-3.riscv.s b/tests/Regression/CodeGen/llvmTests/hidden-vis-3.riscv.s index 52172d551..70ffc6cc0 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis-3.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/hidden-vis-3.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl x x: .zero 4 -.align 4 +.p2align 2 .globl y y: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis-4.arm.s b/tests/Regression/CodeGen/llvmTests/hidden-vis-4.arm.s index 912d89437..4aa4cae7c 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis-4.arm.s +++ b/tests/Regression/CodeGen/llvmTests/hidden-vis-4.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis-4.mips.s b/tests/Regression/CodeGen/llvmTests/hidden-vis-4.mips.s index 3ba3eef59..27376a194 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis-4.mips.s +++ b/tests/Regression/CodeGen/llvmTests/hidden-vis-4.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis-4.riscv.s b/tests/Regression/CodeGen/llvmTests/hidden-vis-4.riscv.s index 3db757ab7..0f961206d 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis-4.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/hidden-vis-4.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis.arm.s b/tests/Regression/CodeGen/llvmTests/hidden-vis.arm.s index 1ba91c386..e703975b1 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis.arm.s +++ b/tests/Regression/CodeGen/llvmTests/hidden-vis.arm.s @@ -1,12 +1,12 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 .bss -.align 4 +.p2align 2 .globl b b: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis.mips.s b/tests/Regression/CodeGen/llvmTests/hidden-vis.mips.s index 4c9b7cdd3..08aaddc61 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis.mips.s +++ b/tests/Regression/CodeGen/llvmTests/hidden-vis.mips.s @@ -1,11 +1,11 @@ .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 .bss -.align 4 +.p2align 2 .globl b b: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/hidden-vis.riscv.s b/tests/Regression/CodeGen/llvmTests/hidden-vis.riscv.s index 3a1b1c656..4bdfcbbd4 100644 --- a/tests/Regression/CodeGen/llvmTests/hidden-vis.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/hidden-vis.riscv.s @@ -1,12 +1,12 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 .bss -.align 4 +.p2align 2 .globl b b: .zero 4 diff --git 
a/tests/Regression/CodeGen/llvmTests/imm-cse.arm.s b/tests/Regression/CodeGen/llvmTests/imm-cse.arm.s index e96ff963d..278342710 100644 --- a/tests/Regression/CodeGen/llvmTests/imm-cse.arm.s +++ b/tests/Regression/CodeGen/llvmTests/imm-cse.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl src src: .4byte 0 -.align 4 +.p2align 2 .globl dst dst: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/imm-cse.mips.s b/tests/Regression/CodeGen/llvmTests/imm-cse.mips.s index 3131e7c15..ad4c07ec2 100644 --- a/tests/Regression/CodeGen/llvmTests/imm-cse.mips.s +++ b/tests/Regression/CodeGen/llvmTests/imm-cse.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl src src: .4byte 0 -.align 4 +.p2align 2 .globl dst dst: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/imm-cse.riscv.s b/tests/Regression/CodeGen/llvmTests/imm-cse.riscv.s index bc921de21..ada534a9d 100644 --- a/tests/Regression/CodeGen/llvmTests/imm-cse.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/imm-cse.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl src src: .4byte 0 -.align 4 +.p2align 2 .globl dst dst: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.arm.s b/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.arm.s index ff6d5db6a..776c7afc6 100644 --- a/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.arm.s +++ b/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl a a: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.mips.s b/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.mips.s index 755c96c52..0e924ff63 100644 --- a/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.mips.s +++ b/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl a a: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.riscv.s b/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.riscv.s index 55a53e9fd..46f77ab05 100644 --- a/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/indirect-branch-tracking-cm-lager.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl a a: .4byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.arm.s b/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.arm.s index 33a352d20..4b8388361 100644 --- a/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.arm.s +++ b/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 2 +.p2align 1 .globl X X: .byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.mips.s b/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.mips.s index d632be93d..f9b071960 100644 --- a/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.mips.s +++ b/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.mips.s @@ -1,6 +1,6 @@ .data .data -.align 2 +.p2align 1 .globl X X: .byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.riscv.s 
b/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.riscv.s index 7f5a7ff74..db7d0ca5c 100644 --- a/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/insert-prefetch-other.a.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 2 +.p2align 1 .globl X X: .byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.arm.s b/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.arm.s index 754d3d641..27cf9d636 100644 --- a/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.arm.s +++ b/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.mips.s b/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.mips.s index 7afcf71e6..c79a9ab82 100644 --- a/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.mips.s +++ b/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.riscv.s b/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.riscv.s index 5d05de8a9..3f481f722 100644 --- a/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/line-zero-prologue-end.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.arm.s b/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.arm.s index b90143cea..e09973d04 100644 --- a/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.arm.s +++ b/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.arm.s @@ -1,20 +1,19 @@ .arch armv7ve .data .data -.align 2 +.p2align 1 .globl a a: .byte 0 .byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 1 .globl d d: .byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.mips.s b/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.mips.s index 41a6f2e0b..dc316be93 100644 --- a/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.mips.s +++ b/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.mips.s @@ -1,19 +1,18 @@ .data .data -.align 2 +.p2align 1 .globl a a: .byte 0 .byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 1 .globl d d: .byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.riscv.s b/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.riscv.s index 5ae575311..d53a418dd 100644 --- a/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/live-range-nosubreg.riscv.s @@ -1,20 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 2 +.p2align 1 .globl a a: .byte 0 .byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 0 -.align 1 .globl d d: .byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/llvm.arm.s b/tests/Regression/CodeGen/llvmTests/llvm.arm.s index f7a2177c8..290fb57e6 100644 --- a/tests/Regression/CodeGen/llvmTests/llvm.arm.s +++ b/tests/Regression/CodeGen/llvmTests/llvm.arm.s @@ 
-1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 -.align 4 +.p2align 2 .globl j j: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/llvm.mips.s b/tests/Regression/CodeGen/llvmTests/llvm.mips.s index b45d61f58..b93c636d9 100644 --- a/tests/Regression/CodeGen/llvmTests/llvm.mips.s +++ b/tests/Regression/CodeGen/llvmTests/llvm.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 -.align 4 +.p2align 2 .globl j j: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/llvm.riscv.s b/tests/Regression/CodeGen/llvmTests/llvm.riscv.s index bcfa420c9..d2dc5dec5 100644 --- a/tests/Regression/CodeGen/llvmTests/llvm.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/llvm.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 -.align 4 +.p2align 2 .globl j j: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.arm.s b/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.arm.s index bdf93d767..c9d2d2d36 100644 --- a/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.arm.s +++ b/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.arm.s @@ -1,12 +1,12 @@ .arch armv7ve .data .data -.align 2 +.p2align 1 .globl X X: .byte 0 .byte 0 -.align 2 +.p2align 1 .globl Y Y: .byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.mips.s b/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.mips.s index 37596a862..f78c08c97 100644 --- a/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.mips.s +++ b/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.mips.s @@ -1,11 +1,11 @@ .data .data -.align 2 +.p2align 1 .globl X X: .byte 0 .byte 0 -.align 2 +.p2align 1 .globl Y Y: .byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.riscv.s b/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.riscv.s index 413fbeafa..b6ea2b5bc 100644 --- a/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/loop-strength-reduce5.riscv.s @@ -1,12 +1,12 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 2 +.p2align 1 .globl X X: .byte 0 .byte 0 -.align 2 +.p2align 1 .globl Y Y: .byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/lsr-sort.arm.s b/tests/Regression/CodeGen/llvmTests/lsr-sort.arm.s index 33a352d20..4b8388361 100644 --- a/tests/Regression/CodeGen/llvmTests/lsr-sort.arm.s +++ b/tests/Regression/CodeGen/llvmTests/lsr-sort.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 2 +.p2align 1 .globl X X: .byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/lsr-sort.mips.s b/tests/Regression/CodeGen/llvmTests/lsr-sort.mips.s index d632be93d..f9b071960 100644 --- a/tests/Regression/CodeGen/llvmTests/lsr-sort.mips.s +++ b/tests/Regression/CodeGen/llvmTests/lsr-sort.mips.s @@ -1,6 +1,6 @@ .data .data -.align 2 +.p2align 1 .globl X X: .byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/lsr-sort.riscv.s b/tests/Regression/CodeGen/llvmTests/lsr-sort.riscv.s index 7f5a7ff74..db7d0ca5c 100644 --- a/tests/Regression/CodeGen/llvmTests/lsr-sort.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/lsr-sort.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 2 +.p2align 1 .globl X X: .byte 0 diff --git 
a/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..arm.s b/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..arm.s index 83b473344..1364c04e4 100644 --- a/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..arm.s +++ b/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl bar bar: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..mips.s b/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..mips.s index 07b13c39f..b2f69cdd6 100644 --- a/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..mips.s +++ b/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl bar bar: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..riscv.s b/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..riscv.s index 3276f8fe9..1e40f6176 100644 --- a/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..riscv.s +++ b/tests/Regression/CodeGen/llvmTests/machine-outliner-unsafe-registers..riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl bar bar: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.arm.s b/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.arm.s index 74bff6423..366ff46bc 100644 --- a/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.arm.s +++ b/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.arm.s @@ -1,20 +1,20 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl l l: .4byte 0 -.align 4 +.p2align 2 .globl g g: .4byte 0 .bss -.align 4 +.p2align 2 .globl ie ie: .zero 4 -.align 4 +.p2align 2 .globl gd gd: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.mips.s b/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.mips.s index 537705388..d512082e8 100644 --- a/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.mips.s +++ b/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.mips.s @@ -1,19 +1,19 @@ .data .data -.align 4 +.p2align 2 .globl l l: .4byte 0 -.align 4 +.p2align 2 .globl g g: .4byte 0 .bss -.align 4 +.p2align 2 .globl ie ie: .zero 4 -.align 4 +.p2align 2 .globl gd gd: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.riscv.s b/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.riscv.s index 3ea1ae163..8944e5e37 100644 --- a/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/machinelicm-address-pseudos.riscv.s @@ -1,20 +1,20 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl l l: .4byte 0 -.align 4 +.p2align 2 .globl g g: .4byte 0 .bss -.align 4 +.p2align 2 .globl ie ie: .zero 4 -.align 4 +.p2align 2 .globl gd gd: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/machineverifier.arm.s b/tests/Regression/CodeGen/llvmTests/machineverifier.arm.s index 2ebe7294e..1ce623f3c 100644 --- a/tests/Regression/CodeGen/llvmTests/machineverifier.arm.s +++ b/tests/Regression/CodeGen/llvmTests/machineverifier.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl g g: .zero 4 diff --git 
a/tests/Regression/CodeGen/llvmTests/machineverifier.mips.s b/tests/Regression/CodeGen/llvmTests/machineverifier.mips.s index 800154869..5d904882d 100644 --- a/tests/Regression/CodeGen/llvmTests/machineverifier.mips.s +++ b/tests/Regression/CodeGen/llvmTests/machineverifier.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl g g: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/machineverifier.riscv.s b/tests/Regression/CodeGen/llvmTests/machineverifier.riscv.s index d26009fe1..292f95f04 100644 --- a/tests/Regression/CodeGen/llvmTests/machineverifier.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/machineverifier.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl g g: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/macho-comdat.arm.s b/tests/Regression/CodeGen/llvmTests/macho-comdat.arm.s index e3a3e82f6..7d629d2b1 100644 --- a/tests/Regression/CodeGen/llvmTests/macho-comdat.arm.s +++ b/tests/Regression/CodeGen/llvmTests/macho-comdat.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl v v: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/macho-comdat.mips.s b/tests/Regression/CodeGen/llvmTests/macho-comdat.mips.s index 72c9bf133..0dded25ee 100644 --- a/tests/Regression/CodeGen/llvmTests/macho-comdat.mips.s +++ b/tests/Regression/CodeGen/llvmTests/macho-comdat.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl v v: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/macho-comdat.riscv.s b/tests/Regression/CodeGen/llvmTests/macho-comdat.riscv.s index b34c570b7..bb1e34634 100644 --- a/tests/Regression/CodeGen/llvmTests/macho-comdat.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/macho-comdat.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl v v: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.arm.s b/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.arm.s index 83b473344..1364c04e4 100644 --- a/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.arm.s +++ b/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl bar bar: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.mips.s b/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.mips.s index 07b13c39f..b2f69cdd6 100644 --- a/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.mips.s +++ b/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl bar bar: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.riscv.s b/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.riscv.s index 3276f8fe9..1e40f6176 100644 --- a/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/macho-extern-hidden.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl bar bar: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-li.arm.s b/tests/Regression/CodeGen/llvmTests/micromips-li.arm.s index cda5f8bf7..b53c944b3 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-li.arm.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-li.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 
.globl x x: .zero 4 -.align 4 +.p2align 2 .globl y y: .zero 4 -.align 4 +.p2align 2 .globl z z: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-li.mips.s b/tests/Regression/CodeGen/llvmTests/micromips-li.mips.s index ae567ecbe..9f5c11620 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-li.mips.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-li.mips.s @@ -1,14 +1,14 @@ .data .bss -.align 4 +.p2align 2 .globl x x: .zero 4 -.align 4 +.p2align 2 .globl y y: .zero 4 -.align 4 +.p2align 2 .globl z z: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-li.riscv.s b/tests/Regression/CodeGen/llvmTests/micromips-li.riscv.s index 1ccc4cc2b..5378531b8 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-li.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-li.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl x x: .zero 4 -.align 4 +.p2align 2 .globl y y: .zero 4 -.align 4 +.p2align 2 .globl z z: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.arm.s b/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.arm.s index cfefd7e3a..7dce54f77 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.arm.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl gf0 gf0: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.mips.s b/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.mips.s index 179fee273..b0b38ae41 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.mips.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl gf0 gf0: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.riscv.s b/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.riscv.s index fe16e9720..7c673c763 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-lwc1-swc1.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl gf0 gf0: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.arm.s b/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.arm.s index 64f82e9ba..4bc6aaea7 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.arm.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl a a: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.mips.s b/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.mips.s index 5cafe8d8c..cc6e6a5a0 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.mips.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl a a: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.riscv.s b/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.riscv.s index e16504903..858b340e6 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-rdhwr-directives.riscv.s @@ -1,7 +1,7 @@ .attribute arch, 
"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl a a: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-shift.arm.s b/tests/Regression/CodeGen/llvmTests/micromips-shift.arm.s index 1b9e7f6ec..f582084fc 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-shift.arm.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-shift.arm.s @@ -1,35 +1,35 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl a a: .4byte 10 -.align 4 +.p2align 2 .globl b b: .4byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 10 -.align 4 +.p2align 2 .globl d d: .4byte 0 -.align 4 +.p2align 2 .globl i i: .4byte 10654 -.align 4 +.p2align 2 .globl j j: .4byte 0 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl n n: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-shift.mips.s b/tests/Regression/CodeGen/llvmTests/micromips-shift.mips.s index f23683627..ff7fb3228 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-shift.mips.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-shift.mips.s @@ -1,34 +1,34 @@ .data .data -.align 4 +.p2align 2 .globl a a: .4byte 10 -.align 4 +.p2align 2 .globl b b: .4byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 10 -.align 4 +.p2align 2 .globl d d: .4byte 0 -.align 4 +.p2align 2 .globl i i: .4byte 10654 -.align 4 +.p2align 2 .globl j j: .4byte 0 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl n n: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/micromips-shift.riscv.s b/tests/Regression/CodeGen/llvmTests/micromips-shift.riscv.s index 20d0f8d5a..b25ba9c70 100644 --- a/tests/Regression/CodeGen/llvmTests/micromips-shift.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/micromips-shift.riscv.s @@ -1,35 +1,35 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl a a: .4byte 10 -.align 4 +.p2align 2 .globl b b: .4byte 0 -.align 4 +.p2align 2 .globl c c: .4byte 10 -.align 4 +.p2align 2 .globl d d: .4byte 0 -.align 4 +.p2align 2 .globl i i: .4byte 10654 -.align 4 +.p2align 2 .globl j j: .4byte 0 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl n n: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/minsize-litpools.arm.s b/tests/Regression/CodeGen/llvmTests/minsize-litpools.arm.s index 83a585834..0ba529803 100644 --- a/tests/Regression/CodeGen/llvmTests/minsize-litpools.arm.s +++ b/tests/Regression/CodeGen/llvmTests/minsize-litpools.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl var var: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/minsize-litpools.mips.s b/tests/Regression/CodeGen/llvmTests/minsize-litpools.mips.s index 1c011a380..d74463a5f 100644 --- a/tests/Regression/CodeGen/llvmTests/minsize-litpools.mips.s +++ b/tests/Regression/CodeGen/llvmTests/minsize-litpools.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl var var: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/minsize-litpools.riscv.s b/tests/Regression/CodeGen/llvmTests/minsize-litpools.riscv.s index 55ce8f7d5..2c9537b26 100644 --- a/tests/Regression/CodeGen/llvmTests/minsize-litpools.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/minsize-litpools.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl var var: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.arm.s 
b/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.arm.s index 6a01c0b8f..ed3359d58 100644 --- a/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.arm.s +++ b/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl var var: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.mips.s b/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.mips.s index 4c06113f8..3cecf0737 100644 --- a/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.mips.s +++ b/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl var var: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.riscv.s b/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.riscv.s index f76435987..dea1b97f6 100644 --- a/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/nacl-reserved-regs.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl var var: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/overlap-shift.arm.s b/tests/Regression/CodeGen/llvmTests/overlap-shift.arm.s index 7815f4dc0..4feea1ece 100644 --- a/tests/Regression/CodeGen/llvmTests/overlap-shift.arm.s +++ b/tests/Regression/CodeGen/llvmTests/overlap-shift.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl G G: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/overlap-shift.mips.s b/tests/Regression/CodeGen/llvmTests/overlap-shift.mips.s index 9bdeb1128..d1f7fcd31 100644 --- a/tests/Regression/CodeGen/llvmTests/overlap-shift.mips.s +++ b/tests/Regression/CodeGen/llvmTests/overlap-shift.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl G G: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/overlap-shift.riscv.s b/tests/Regression/CodeGen/llvmTests/overlap-shift.riscv.s index c9a40a999..13f3f5154 100644 --- a/tests/Regression/CodeGen/llvmTests/overlap-shift.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/overlap-shift.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl G G: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/pr15981.arm.s b/tests/Regression/CodeGen/llvmTests/pr15981.arm.s index f294dea65..db06c9f97 100644 --- a/tests/Regression/CodeGen/llvmTests/pr15981.arm.s +++ b/tests/Regression/CodeGen/llvmTests/pr15981.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl a a: .zero 4 -.align 4 +.p2align 2 .globl b b: .zero 4 -.align 4 +.p2align 2 .globl c c: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/pr15981.mips.s b/tests/Regression/CodeGen/llvmTests/pr15981.mips.s index 34240aace..b67a84433 100644 --- a/tests/Regression/CodeGen/llvmTests/pr15981.mips.s +++ b/tests/Regression/CodeGen/llvmTests/pr15981.mips.s @@ -1,14 +1,14 @@ .data .bss -.align 4 +.p2align 2 .globl a a: .zero 4 -.align 4 +.p2align 2 .globl b b: .zero 4 -.align 4 +.p2align 2 .globl c c: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/pr15981.riscv.s b/tests/Regression/CodeGen/llvmTests/pr15981.riscv.s index e6aba33fc..fd5c17340 100644 --- a/tests/Regression/CodeGen/llvmTests/pr15981.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/pr15981.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss 
-.align 4 +.p2align 2 .globl a a: .zero 4 -.align 4 +.p2align 2 .globl b b: .zero 4 -.align 4 +.p2align 2 .globl c c: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/pr3216.arm.s b/tests/Regression/CodeGen/llvmTests/pr3216.arm.s index 7b05bfad0..716f24fad 100644 --- a/tests/Regression/CodeGen/llvmTests/pr3216.arm.s +++ b/tests/Regression/CodeGen/llvmTests/pr3216.arm.s @@ -1,7 +1,6 @@ .arch armv7ve .data .data -.align 1 .globl foo foo: .byte 127 diff --git a/tests/Regression/CodeGen/llvmTests/pr3216.mips.s b/tests/Regression/CodeGen/llvmTests/pr3216.mips.s index 38af61a00..b8111f9a3 100644 --- a/tests/Regression/CodeGen/llvmTests/pr3216.mips.s +++ b/tests/Regression/CodeGen/llvmTests/pr3216.mips.s @@ -1,6 +1,5 @@ .data .data -.align 1 .globl foo foo: .byte 127 diff --git a/tests/Regression/CodeGen/llvmTests/pr3216.riscv.s b/tests/Regression/CodeGen/llvmTests/pr3216.riscv.s index 3e565764d..d26e04794 100644 --- a/tests/Regression/CodeGen/llvmTests/pr3216.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/pr3216.riscv.s @@ -1,7 +1,6 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 1 .globl foo foo: .byte 127 diff --git a/tests/Regression/CodeGen/llvmTests/pr32256.arm.s b/tests/Regression/CodeGen/llvmTests/pr32256.arm.s index 77de58810..6e8f6a503 100644 --- a/tests/Regression/CodeGen/llvmTests/pr32256.arm.s +++ b/tests/Regression/CodeGen/llvmTests/pr32256.arm.s @@ -1,7 +1,6 @@ .arch armv7ve .data .bss -.align 1 .globl c c: .zero 1 diff --git a/tests/Regression/CodeGen/llvmTests/pr32256.mips.s b/tests/Regression/CodeGen/llvmTests/pr32256.mips.s index d9f1ac8a6..c8cfc5f62 100644 --- a/tests/Regression/CodeGen/llvmTests/pr32256.mips.s +++ b/tests/Regression/CodeGen/llvmTests/pr32256.mips.s @@ -1,6 +1,5 @@ .data .bss -.align 1 .globl c c: .zero 1 diff --git a/tests/Regression/CodeGen/llvmTests/pr32256.riscv.s b/tests/Regression/CodeGen/llvmTests/pr32256.riscv.s index 59e8fac12..e9ee6852c 100644 --- a/tests/Regression/CodeGen/llvmTests/pr32256.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/pr32256.riscv.s @@ -1,7 +1,6 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 1 .globl c c: .zero 1 diff --git a/tests/Regression/CodeGen/llvmTests/pr32588.arm.s b/tests/Regression/CodeGen/llvmTests/pr32588.arm.s index 55aa1470a..6b16b26ed 100644 --- a/tests/Regression/CodeGen/llvmTests/pr32588.arm.s +++ b/tests/Regression/CodeGen/llvmTests/pr32588.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl c c: .zero 4 -.align 4 +.p2align 2 .globl b b: .zero 4 -.align 4 +.p2align 2 .globl d d: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/pr32588.mips.s b/tests/Regression/CodeGen/llvmTests/pr32588.mips.s index 4f66ac841..04d0b833f 100644 --- a/tests/Regression/CodeGen/llvmTests/pr32588.mips.s +++ b/tests/Regression/CodeGen/llvmTests/pr32588.mips.s @@ -1,14 +1,14 @@ .data .bss -.align 4 +.p2align 2 .globl c c: .zero 4 -.align 4 +.p2align 2 .globl b b: .zero 4 -.align 4 +.p2align 2 .globl d d: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/pr32588.riscv.s b/tests/Regression/CodeGen/llvmTests/pr32588.riscv.s index 1b7e63019..05eaeb4c2 100644 --- a/tests/Regression/CodeGen/llvmTests/pr32588.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/pr32588.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl c c: .zero 4 -.align 4 +.p2align 2 .globl b b: .zero 4 
-.align 4 +.p2align 2 .globl d d: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/pr34381.arm.s b/tests/Regression/CodeGen/llvmTests/pr34381.arm.s index 07fbcb075..28e487fd8 100644 --- a/tests/Regression/CodeGen/llvmTests/pr34381.arm.s +++ b/tests/Regression/CodeGen/llvmTests/pr34381.arm.s @@ -1,23 +1,20 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl var_21 var_21: .zero 4 -.align 1 .globl var_29 var_29: .zero 1 -.align 4 +.p2align 2 .globl var_390 var_390: .zero 4 -.align 1 .globl var_11 var_11: .zero 1 -.align 1 .globl var_370 var_370: .zero 1 diff --git a/tests/Regression/CodeGen/llvmTests/pr34381.mips.s b/tests/Regression/CodeGen/llvmTests/pr34381.mips.s index e2c21ade9..ff5bf33cc 100644 --- a/tests/Regression/CodeGen/llvmTests/pr34381.mips.s +++ b/tests/Regression/CodeGen/llvmTests/pr34381.mips.s @@ -1,22 +1,19 @@ .data .bss -.align 4 +.p2align 2 .globl var_21 var_21: .zero 4 -.align 1 .globl var_29 var_29: .zero 1 -.align 4 +.p2align 2 .globl var_390 var_390: .zero 4 -.align 1 .globl var_11 var_11: .zero 1 -.align 1 .globl var_370 var_370: .zero 1 diff --git a/tests/Regression/CodeGen/llvmTests/pr34381.riscv.s b/tests/Regression/CodeGen/llvmTests/pr34381.riscv.s index b3358b6e2..e878de3af 100644 --- a/tests/Regression/CodeGen/llvmTests/pr34381.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/pr34381.riscv.s @@ -1,23 +1,20 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl var_21 var_21: .zero 4 -.align 1 .globl var_29 var_29: .zero 1 -.align 4 +.p2align 2 .globl var_390 var_390: .zero 4 -.align 1 .globl var_11 var_11: .zero 1 -.align 1 .globl var_370 var_370: .zero 1 diff --git a/tests/Regression/CodeGen/llvmTests/pr35761.arm.s b/tests/Regression/CodeGen/llvmTests/pr35761.arm.s index a3652ce1d..62adc5c11 100644 --- a/tests/Regression/CodeGen/llvmTests/pr35761.arm.s +++ b/tests/Regression/CodeGen/llvmTests/pr35761.arm.s @@ -1,15 +1,14 @@ .arch armv7ve .data .data -.align 1 .globl x x: .byte 0 -.align 4 +.p2align 2 .globl y y: .4byte 0 -.align 4 +.p2align 2 .globl z z: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/pr35761.mips.s b/tests/Regression/CodeGen/llvmTests/pr35761.mips.s index 82c62d2c7..7fa9e3e74 100644 --- a/tests/Regression/CodeGen/llvmTests/pr35761.mips.s +++ b/tests/Regression/CodeGen/llvmTests/pr35761.mips.s @@ -1,14 +1,13 @@ .data .data -.align 1 .globl x x: .byte 0 -.align 4 +.p2align 2 .globl y y: .4byte 0 -.align 4 +.p2align 2 .globl z z: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/pr35761.riscv.s b/tests/Regression/CodeGen/llvmTests/pr35761.riscv.s index 29499c008..07243a96f 100644 --- a/tests/Regression/CodeGen/llvmTests/pr35761.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/pr35761.riscv.s @@ -1,15 +1,14 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 1 .globl x x: .byte 0 -.align 4 +.p2align 2 .globl y y: .4byte 0 -.align 4 +.p2align 2 .globl z z: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/pr58286.arm.s b/tests/Regression/CodeGen/llvmTests/pr58286.arm.s index fc1049adc..5b019ef4e 100644 --- a/tests/Regression/CodeGen/llvmTests/pr58286.arm.s +++ b/tests/Regression/CodeGen/llvmTests/pr58286.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl var var: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/pr58286.mips.s b/tests/Regression/CodeGen/llvmTests/pr58286.mips.s index 739e58211..77988a3c4 100644 --- 
a/tests/Regression/CodeGen/llvmTests/pr58286.mips.s +++ b/tests/Regression/CodeGen/llvmTests/pr58286.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl var var: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/pr58286.riscv.s b/tests/Regression/CodeGen/llvmTests/pr58286.riscv.s index e50f777f4..f7967c082 100644 --- a/tests/Regression/CodeGen/llvmTests/pr58286.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/pr58286.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl var var: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/private.arm.s b/tests/Regression/CodeGen/llvmTests/private.arm.s index 62fd0a212..4bb7870fe 100644 --- a/tests/Regression/CodeGen/llvmTests/private.arm.s +++ b/tests/Regression/CodeGen/llvmTests/private.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl baz baz: .4byte 4 diff --git a/tests/Regression/CodeGen/llvmTests/private.mips.s b/tests/Regression/CodeGen/llvmTests/private.mips.s index 04e03eb20..efd379d5c 100644 --- a/tests/Regression/CodeGen/llvmTests/private.mips.s +++ b/tests/Regression/CodeGen/llvmTests/private.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl baz baz: .4byte 4 diff --git a/tests/Regression/CodeGen/llvmTests/private.riscv.s b/tests/Regression/CodeGen/llvmTests/private.riscv.s index 6b24304f6..c615c13b1 100644 --- a/tests/Regression/CodeGen/llvmTests/private.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/private.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl baz baz: .4byte 4 diff --git a/tests/Regression/CodeGen/llvmTests/rdhwr-directives.arm.s b/tests/Regression/CodeGen/llvmTests/rdhwr-directives.arm.s index 64f82e9ba..4bc6aaea7 100644 --- a/tests/Regression/CodeGen/llvmTests/rdhwr-directives.arm.s +++ b/tests/Regression/CodeGen/llvmTests/rdhwr-directives.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl a a: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/rdhwr-directives.mips.s b/tests/Regression/CodeGen/llvmTests/rdhwr-directives.mips.s index 5cafe8d8c..cc6e6a5a0 100644 --- a/tests/Regression/CodeGen/llvmTests/rdhwr-directives.mips.s +++ b/tests/Regression/CodeGen/llvmTests/rdhwr-directives.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl a a: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/rdhwr-directives.riscv.s b/tests/Regression/CodeGen/llvmTests/rdhwr-directives.riscv.s index e16504903..858b340e6 100644 --- a/tests/Regression/CodeGen/llvmTests/rdhwr-directives.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/rdhwr-directives.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl a a: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/readtp.arm.s b/tests/Regression/CodeGen/llvmTests/readtp.arm.s index 2862f7131..22d4812cf 100644 --- a/tests/Regression/CodeGen/llvmTests/readtp.arm.s +++ b/tests/Regression/CodeGen/llvmTests/readtp.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl counter counter: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/readtp.mips.s b/tests/Regression/CodeGen/llvmTests/readtp.mips.s index 24c1a4f12..15171c337 100644 --- a/tests/Regression/CodeGen/llvmTests/readtp.mips.s +++ b/tests/Regression/CodeGen/llvmTests/readtp.mips.s @@ -1,6 +1,6 @@ .data .data 
-.align 4 +.p2align 2 .globl counter counter: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/readtp.riscv.s b/tests/Regression/CodeGen/llvmTests/readtp.riscv.s index 8756bdbce..3195c7743 100644 --- a/tests/Regression/CodeGen/llvmTests/readtp.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/readtp.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl counter counter: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/return-ext.arm.s b/tests/Regression/CodeGen/llvmTests/return-ext.arm.s index b35e48462..98ae3c1b5 100644 --- a/tests/Regression/CodeGen/llvmTests/return-ext.arm.s +++ b/tests/Regression/CodeGen/llvmTests/return-ext.arm.s @@ -1,16 +1,16 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 -.align 2 +.p2align 1 .globl a a: .byte 0 .byte 0 -.align 2 +.p2align 1 .globl b b: .byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/return-ext.mips.s b/tests/Regression/CodeGen/llvmTests/return-ext.mips.s index 3de990195..bba7f7319 100644 --- a/tests/Regression/CodeGen/llvmTests/return-ext.mips.s +++ b/tests/Regression/CodeGen/llvmTests/return-ext.mips.s @@ -1,15 +1,15 @@ .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 -.align 2 +.p2align 1 .globl a a: .byte 0 .byte 0 -.align 2 +.p2align 1 .globl b b: .byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/return-ext.riscv.s b/tests/Regression/CodeGen/llvmTests/return-ext.riscv.s index de20baf35..13a93182e 100644 --- a/tests/Regression/CodeGen/llvmTests/return-ext.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/return-ext.riscv.s @@ -1,16 +1,16 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 -.align 2 +.p2align 1 .globl a a: .byte 0 .byte 0 -.align 2 +.p2align 1 .globl b b: .byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/sel1c.arm.s b/tests/Regression/CodeGen/llvmTests/sel1c.arm.s index 15886754b..97fbe8407 100644 --- a/tests/Regression/CodeGen/llvmTests/sel1c.arm.s +++ b/tests/Regression/CodeGen/llvmTests/sel1c.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 2 -.align 4 +.p2align 2 .globl k k: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/sel1c.mips.s b/tests/Regression/CodeGen/llvmTests/sel1c.mips.s index ebaeb1683..6ddb6ba1f 100644 --- a/tests/Regression/CodeGen/llvmTests/sel1c.mips.s +++ b/tests/Regression/CodeGen/llvmTests/sel1c.mips.s @@ -1,14 +1,14 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 2 -.align 4 +.p2align 2 .globl k k: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/sel1c.riscv.s b/tests/Regression/CodeGen/llvmTests/sel1c.riscv.s index a7b093a97..974cb2292 100644 --- a/tests/Regression/CodeGen/llvmTests/sel1c.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/sel1c.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 2 -.align 4 +.p2align 2 .globl k k: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/sel2c.arm.s b/tests/Regression/CodeGen/llvmTests/sel2c.arm.s index 8ec118170..24b86db4b 100644 --- a/tests/Regression/CodeGen/llvmTests/sel2c.arm.s +++ b/tests/Regression/CodeGen/llvmTests/sel2c.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .data -.align 4 +.p2align 
2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 2 -.align 4 +.p2align 2 .globl k k: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/sel2c.mips.s b/tests/Regression/CodeGen/llvmTests/sel2c.mips.s index 7ed5bb4e7..520472046 100644 --- a/tests/Regression/CodeGen/llvmTests/sel2c.mips.s +++ b/tests/Regression/CodeGen/llvmTests/sel2c.mips.s @@ -1,14 +1,14 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 2 -.align 4 +.p2align 2 .globl k k: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/sel2c.riscv.s b/tests/Regression/CodeGen/llvmTests/sel2c.riscv.s index 5bb4c5e8e..5d73e2424 100644 --- a/tests/Regression/CodeGen/llvmTests/sel2c.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/sel2c.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 2 -.align 4 +.p2align 2 .globl k k: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/select-const.mips.s b/tests/Regression/CodeGen/llvmTests/select-const.mips.s index 30195acdd..4149afb8d 100644 --- a/tests/Regression/CodeGen/llvmTests/select-const.mips.s +++ b/tests/Regression/CodeGen/llvmTests/select-const.mips.s @@ -1,6 +1,6 @@ .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1077936128 .4byte 1082130432 diff --git a/tests/Regression/CodeGen/llvmTests/seleq.arm.s b/tests/Regression/CodeGen/llvmTests/seleq.arm.s index e6864dd2d..3b1900f92 100644 --- a/tests/Regression/CodeGen/llvmTests/seleq.arm.s +++ b/tests/Regression/CodeGen/llvmTests/seleq.arm.s @@ -1,39 +1,39 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl t t: .4byte 10 -.align 4 +.p2align 2 .globl f f: .4byte 199 -.align 4 +.p2align 2 .globl a a: .4byte 1 -.align 4 +.p2align 2 .globl b b: .4byte 10 -.align 4 +.p2align 2 .globl c c: .4byte 1 -.align 4 +.p2align 2 .globl z1 z1: .4byte 0 -.align 4 +.p2align 2 .globl z2 z2: .4byte 0 -.align 4 +.p2align 2 .globl z3 z3: .4byte 0 -.align 4 +.p2align 2 .globl z4 z4: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/seleq.mips.s b/tests/Regression/CodeGen/llvmTests/seleq.mips.s index 9b83eb688..49463379b 100644 --- a/tests/Regression/CodeGen/llvmTests/seleq.mips.s +++ b/tests/Regression/CodeGen/llvmTests/seleq.mips.s @@ -1,38 +1,38 @@ .data .data -.align 4 +.p2align 2 .globl t t: .4byte 10 -.align 4 +.p2align 2 .globl f f: .4byte 199 -.align 4 +.p2align 2 .globl a a: .4byte 1 -.align 4 +.p2align 2 .globl b b: .4byte 10 -.align 4 +.p2align 2 .globl c c: .4byte 1 -.align 4 +.p2align 2 .globl z1 z1: .4byte 0 -.align 4 +.p2align 2 .globl z2 z2: .4byte 0 -.align 4 +.p2align 2 .globl z3 z3: .4byte 0 -.align 4 +.p2align 2 .globl z4 z4: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/seleq.riscv.s b/tests/Regression/CodeGen/llvmTests/seleq.riscv.s index 448a77813..d3dc70898 100644 --- a/tests/Regression/CodeGen/llvmTests/seleq.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/seleq.riscv.s @@ -1,39 +1,39 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl t t: .4byte 10 -.align 4 +.p2align 2 .globl f f: .4byte 199 -.align 4 +.p2align 2 .globl a a: .4byte 1 -.align 4 +.p2align 2 .globl b b: .4byte 10 -.align 4 +.p2align 2 .globl c c: .4byte 1 -.align 4 +.p2align 2 .globl z1 z1: .4byte 0 -.align 4 +.p2align 2 .globl z2 z2: .4byte 0 -.align 4 +.p2align 2 .globl z3 z3: .4byte 0 -.align 4 +.p2align 2 
.globl z4 z4: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setcc-se.arm.s b/tests/Regression/CodeGen/llvmTests/setcc-se.arm.s index cd6bf94f7..fe2688700 100644 --- a/tests/Regression/CodeGen/llvmTests/setcc-se.arm.s +++ b/tests/Regression/CodeGen/llvmTests/setcc-se.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl g1 g1: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/setcc-se.mips.s b/tests/Regression/CodeGen/llvmTests/setcc-se.mips.s index f3920932d..29ad3c340 100644 --- a/tests/Regression/CodeGen/llvmTests/setcc-se.mips.s +++ b/tests/Regression/CodeGen/llvmTests/setcc-se.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl g1 g1: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/setcc-se.riscv.s b/tests/Regression/CodeGen/llvmTests/setcc-se.riscv.s index 69f2b7d51..606161066 100644 --- a/tests/Regression/CodeGen/llvmTests/setcc-se.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/setcc-se.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl g1 g1: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/seteq.arm.s b/tests/Regression/CodeGen/llvmTests/seteq.arm.s index f295e909c..86758ed16 100644 --- a/tests/Regression/CodeGen/llvmTests/seteq.arm.s +++ b/tests/Regression/CodeGen/llvmTests/seteq.arm.s @@ -1,23 +1,23 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 1 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/seteq.mips.s b/tests/Regression/CodeGen/llvmTests/seteq.mips.s index 4f3a4446e..9160bcc0e 100644 --- a/tests/Regression/CodeGen/llvmTests/seteq.mips.s +++ b/tests/Regression/CodeGen/llvmTests/seteq.mips.s @@ -1,22 +1,22 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 1 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/seteq.riscv.s b/tests/Regression/CodeGen/llvmTests/seteq.riscv.s index 121ee1cae..0b724991a 100644 --- a/tests/Regression/CodeGen/llvmTests/seteq.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/seteq.riscv.s @@ -1,23 +1,23 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 1 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/seteqz.arm.s b/tests/Regression/CodeGen/llvmTests/seteqz.arm.s index 8a610814b..79db50ce0 100644 --- a/tests/Regression/CodeGen/llvmTests/seteqz.arm.s +++ b/tests/Regression/CodeGen/llvmTests/seteqz.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 -.align 4 +.p2align 2 .globl j j: .4byte 99 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/seteqz.mips.s b/tests/Regression/CodeGen/llvmTests/seteqz.mips.s index 07d1f0acc..3a8b072ef 100644 --- a/tests/Regression/CodeGen/llvmTests/seteqz.mips.s +++ b/tests/Regression/CodeGen/llvmTests/seteqz.mips.s @@ -1,18 +1,18 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 
-.align 4 +.p2align 2 .globl j j: .4byte 99 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/seteqz.riscv.s b/tests/Regression/CodeGen/llvmTests/seteqz.riscv.s index b128b08eb..f5e2a4aac 100644 --- a/tests/Regression/CodeGen/llvmTests/seteqz.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/seteqz.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 0 -.align 4 +.p2align 2 .globl j j: .4byte 99 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setgek.arm.s b/tests/Regression/CodeGen/llvmTests/setgek.arm.s index 01d18449c..5ab58e71b 100644 --- a/tests/Regression/CodeGen/llvmTests/setgek.arm.s +++ b/tests/Regression/CodeGen/llvmTests/setgek.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setgek.mips.s b/tests/Regression/CodeGen/llvmTests/setgek.mips.s index de7db46e5..afa21a125 100644 --- a/tests/Regression/CodeGen/llvmTests/setgek.mips.s +++ b/tests/Regression/CodeGen/llvmTests/setgek.mips.s @@ -1,18 +1,18 @@ .data .data -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setgek.riscv.s b/tests/Regression/CodeGen/llvmTests/setgek.riscv.s index 69888b3e2..a34e4bc93 100644 --- a/tests/Regression/CodeGen/llvmTests/setgek.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/setgek.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setle.arm.s b/tests/Regression/CodeGen/llvmTests/setle.arm.s index b9ad678ad..6673fa56b 100644 --- a/tests/Regression/CodeGen/llvmTests/setle.arm.s +++ b/tests/Regression/CodeGen/llvmTests/setle.arm.s @@ -1,31 +1,31 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl j j: .4byte 4294967291 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setle.mips.s b/tests/Regression/CodeGen/llvmTests/setle.mips.s index 0bb7442f4..3d27defe3 100644 --- a/tests/Regression/CodeGen/llvmTests/setle.mips.s +++ b/tests/Regression/CodeGen/llvmTests/setle.mips.s @@ -1,30 +1,30 @@ .data .data -.align 4 +.p2align 2 .globl j j: .4byte 4294967291 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setle.riscv.s b/tests/Regression/CodeGen/llvmTests/setle.riscv.s index 875a2d0b9..7eec29fd8 100644 --- 
a/tests/Regression/CodeGen/llvmTests/setle.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/setle.riscv.s @@ -1,31 +1,31 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl j j: .4byte 4294967291 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setlt.arm.s b/tests/Regression/CodeGen/llvmTests/setlt.arm.s index 44dfe3b5b..7a5bdcc3b 100644 --- a/tests/Regression/CodeGen/llvmTests/setlt.arm.s +++ b/tests/Regression/CodeGen/llvmTests/setlt.arm.s @@ -1,31 +1,31 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl j j: .4byte 4294967291 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setlt.mips.s b/tests/Regression/CodeGen/llvmTests/setlt.mips.s index 9be68ef9d..3c2f4ac17 100644 --- a/tests/Regression/CodeGen/llvmTests/setlt.mips.s +++ b/tests/Regression/CodeGen/llvmTests/setlt.mips.s @@ -1,30 +1,30 @@ .data .data -.align 4 +.p2align 2 .globl j j: .4byte 4294967291 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setlt.riscv.s b/tests/Regression/CodeGen/llvmTests/setlt.riscv.s index 31bf2d463..7fc295c56 100644 --- a/tests/Regression/CodeGen/llvmTests/setlt.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/setlt.riscv.s @@ -1,31 +1,31 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl j j: .4byte 4294967291 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setltk.arm.s b/tests/Regression/CodeGen/llvmTests/setltk.arm.s index 542564040..f58992192 100644 --- a/tests/Regression/CodeGen/llvmTests/setltk.arm.s +++ b/tests/Regression/CodeGen/llvmTests/setltk.arm.s @@ -1,31 +1,31 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl j j: .4byte 4294967291 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setltk.mips.s b/tests/Regression/CodeGen/llvmTests/setltk.mips.s index b31386157..cd9b49895 100644 --- a/tests/Regression/CodeGen/llvmTests/setltk.mips.s +++ b/tests/Regression/CodeGen/llvmTests/setltk.mips.s @@ -1,30 +1,30 @@ .data .data -.align 4 +.p2align 2 .globl j j: .4byte 4294967291 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 
+.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setltk.riscv.s b/tests/Regression/CodeGen/llvmTests/setltk.riscv.s index 9d85fe6e7..dc7023203 100644 --- a/tests/Regression/CodeGen/llvmTests/setltk.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/setltk.riscv.s @@ -1,31 +1,31 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl j j: .4byte 4294967291 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setne.arm.s b/tests/Regression/CodeGen/llvmTests/setne.arm.s index 06e45b524..71b1aab31 100644 --- a/tests/Regression/CodeGen/llvmTests/setne.arm.s +++ b/tests/Regression/CodeGen/llvmTests/setne.arm.s @@ -1,23 +1,23 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 1 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setne.mips.s b/tests/Regression/CodeGen/llvmTests/setne.mips.s index ff9d3941f..1aa4dd312 100644 --- a/tests/Regression/CodeGen/llvmTests/setne.mips.s +++ b/tests/Regression/CodeGen/llvmTests/setne.mips.s @@ -1,22 +1,22 @@ .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 1 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setne.riscv.s b/tests/Regression/CodeGen/llvmTests/setne.riscv.s index 41faa5b12..a9837bea5 100644 --- a/tests/Regression/CodeGen/llvmTests/setne.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/setne.riscv.s @@ -1,23 +1,23 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl i i: .4byte 1 -.align 4 +.p2align 2 .globl j j: .4byte 10 -.align 4 +.p2align 2 .globl k k: .4byte 1 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setuge.arm.s b/tests/Regression/CodeGen/llvmTests/setuge.arm.s index 452290623..7a1e46c71 100644 --- a/tests/Regression/CodeGen/llvmTests/setuge.arm.s +++ b/tests/Regression/CodeGen/llvmTests/setuge.arm.s @@ -1,31 +1,31 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setuge.mips.s b/tests/Regression/CodeGen/llvmTests/setuge.mips.s index e9d2dd896..3cdab71c3 100644 --- a/tests/Regression/CodeGen/llvmTests/setuge.mips.s +++ b/tests/Regression/CodeGen/llvmTests/setuge.mips.s @@ -1,30 +1,30 @@ .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 
-.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setuge.riscv.s b/tests/Regression/CodeGen/llvmTests/setuge.riscv.s index 2376d50f3..e739ec3bb 100644 --- a/tests/Regression/CodeGen/llvmTests/setuge.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/setuge.riscv.s @@ -1,31 +1,31 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setugt.arm.s b/tests/Regression/CodeGen/llvmTests/setugt.arm.s index 21ef28324..b8074c0ca 100644 --- a/tests/Regression/CodeGen/llvmTests/setugt.arm.s +++ b/tests/Regression/CodeGen/llvmTests/setugt.arm.s @@ -1,31 +1,31 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setugt.mips.s b/tests/Regression/CodeGen/llvmTests/setugt.mips.s index e104435e1..7498ab2bd 100644 --- a/tests/Regression/CodeGen/llvmTests/setugt.mips.s +++ b/tests/Regression/CodeGen/llvmTests/setugt.mips.s @@ -1,30 +1,30 @@ .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setugt.riscv.s b/tests/Regression/CodeGen/llvmTests/setugt.riscv.s index ea81faf47..bfa83c736 100644 --- a/tests/Regression/CodeGen/llvmTests/setugt.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/setugt.riscv.s @@ -1,31 +1,31 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setule.arm.s b/tests/Regression/CodeGen/llvmTests/setule.arm.s index 452290623..7a1e46c71 100644 --- a/tests/Regression/CodeGen/llvmTests/setule.arm.s +++ b/tests/Regression/CodeGen/llvmTests/setule.arm.s @@ -1,31 +1,31 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setule.mips.s b/tests/Regression/CodeGen/llvmTests/setule.mips.s index e9d2dd896..3cdab71c3 100644 --- a/tests/Regression/CodeGen/llvmTests/setule.mips.s +++ b/tests/Regression/CodeGen/llvmTests/setule.mips.s @@ -1,30 +1,30 @@ 
.data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setule.riscv.s b/tests/Regression/CodeGen/llvmTests/setule.riscv.s index 2376d50f3..e739ec3bb 100644 --- a/tests/Regression/CodeGen/llvmTests/setule.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/setule.riscv.s @@ -1,31 +1,31 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setult.arm.s b/tests/Regression/CodeGen/llvmTests/setult.arm.s index 82d82f7fe..81814385a 100644 --- a/tests/Regression/CodeGen/llvmTests/setult.arm.s +++ b/tests/Regression/CodeGen/llvmTests/setult.arm.s @@ -1,31 +1,31 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setult.mips.s b/tests/Regression/CodeGen/llvmTests/setult.mips.s index de0356af5..81f6207f3 100644 --- a/tests/Regression/CodeGen/llvmTests/setult.mips.s +++ b/tests/Regression/CodeGen/llvmTests/setult.mips.s @@ -1,30 +1,30 @@ .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setult.riscv.s b/tests/Regression/CodeGen/llvmTests/setult.riscv.s index 6840631b1..3112cea3b 100644 --- a/tests/Regression/CodeGen/llvmTests/setult.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/setult.riscv.s @@ -1,31 +1,31 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setultk.arm.s b/tests/Regression/CodeGen/llvmTests/setultk.arm.s index 0c7f6c89f..13b187959 100644 --- a/tests/Regression/CodeGen/llvmTests/setultk.arm.s +++ b/tests/Regression/CodeGen/llvmTests/setultk.arm.s @@ -1,31 +1,31 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git 
a/tests/Regression/CodeGen/llvmTests/setultk.mips.s b/tests/Regression/CodeGen/llvmTests/setultk.mips.s index 3bebe9525..2cab5451d 100644 --- a/tests/Regression/CodeGen/llvmTests/setultk.mips.s +++ b/tests/Regression/CodeGen/llvmTests/setultk.mips.s @@ -1,30 +1,30 @@ .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/setultk.riscv.s b/tests/Regression/CodeGen/llvmTests/setultk.riscv.s index 3dfacdaca..7e5bcc23e 100644 --- a/tests/Regression/CodeGen/llvmTests/setultk.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/setultk.riscv.s @@ -1,31 +1,31 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl j j: .4byte 5 -.align 4 +.p2align 2 .globl k k: .4byte 10 -.align 4 +.p2align 2 .globl l l: .4byte 20 -.align 4 +.p2align 2 .globl m m: .4byte 10 -.align 4 +.p2align 2 .globl r1 r1: .4byte 0 -.align 4 +.p2align 2 .globl r2 r2: .4byte 0 -.align 4 +.p2align 2 .globl r3 r3: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/shift-codegen.arm.s b/tests/Regression/CodeGen/llvmTests/shift-codegen.arm.s index b2a9653ae..ea387b8b8 100644 --- a/tests/Regression/CodeGen/llvmTests/shift-codegen.arm.s +++ b/tests/Regression/CodeGen/llvmTests/shift-codegen.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl Y Y: .4byte 0 -.align 4 +.p2align 2 .globl X X: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/shift-codegen.mips.s b/tests/Regression/CodeGen/llvmTests/shift-codegen.mips.s index d6930e33c..c30b4bc2b 100644 --- a/tests/Regression/CodeGen/llvmTests/shift-codegen.mips.s +++ b/tests/Regression/CodeGen/llvmTests/shift-codegen.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl Y Y: .4byte 0 -.align 4 +.p2align 2 .globl X X: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/shift-codegen.riscv.s b/tests/Regression/CodeGen/llvmTests/shift-codegen.riscv.s index 290efe492..2bb29c406 100644 --- a/tests/Regression/CodeGen/llvmTests/shift-codegen.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/shift-codegen.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl Y Y: .4byte 0 -.align 4 +.p2align 2 .globl X X: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/shift-one.arm.s b/tests/Regression/CodeGen/llvmTests/shift-one.arm.s index dd6b24c7f..627c32bd8 100644 --- a/tests/Regression/CodeGen/llvmTests/shift-one.arm.s +++ b/tests/Regression/CodeGen/llvmTests/shift-one.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl x x: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/shift-one.mips.s b/tests/Regression/CodeGen/llvmTests/shift-one.mips.s index 82c8e7afd..b3d57e45d 100644 --- a/tests/Regression/CodeGen/llvmTests/shift-one.mips.s +++ b/tests/Regression/CodeGen/llvmTests/shift-one.mips.s @@ -1,6 +1,6 @@ .data .bss -.align 4 +.p2align 2 .globl x x: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/shift-one.riscv.s b/tests/Regression/CodeGen/llvmTests/shift-one.riscv.s index 1c31034be..fb4c34c04 100644 --- a/tests/Regression/CodeGen/llvmTests/shift-one.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/shift-one.riscv.s @@ -1,7 
+1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl x x: .zero 4 diff --git a/tests/Regression/CodeGen/llvmTests/stride-reuse.arm.s b/tests/Regression/CodeGen/llvmTests/stride-reuse.arm.s index d9f5083b3..f6d6bcfeb 100644 --- a/tests/Regression/CodeGen/llvmTests/stride-reuse.arm.s +++ b/tests/Regression/CodeGen/llvmTests/stride-reuse.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 .globl B B: .zero 4000 -.align 8 +.p2align 3 .globl A A: .zero 4000 -.align 8 +.p2align 3 .globl P P: .zero 4000 diff --git a/tests/Regression/CodeGen/llvmTests/stride-reuse.mips.s b/tests/Regression/CodeGen/llvmTests/stride-reuse.mips.s index 1ddf37359..5e9fd6dec 100644 --- a/tests/Regression/CodeGen/llvmTests/stride-reuse.mips.s +++ b/tests/Regression/CodeGen/llvmTests/stride-reuse.mips.s @@ -1,14 +1,14 @@ .data .bss -.align 8 +.p2align 3 .globl B B: .zero 4000 -.align 8 +.p2align 3 .globl A A: .zero 4000 -.align 8 +.p2align 3 .globl P P: .zero 4000 diff --git a/tests/Regression/CodeGen/llvmTests/stride-reuse.riscv.s b/tests/Regression/CodeGen/llvmTests/stride-reuse.riscv.s index 7fb664f34..5bbeaa57e 100644 --- a/tests/Regression/CodeGen/llvmTests/stride-reuse.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/stride-reuse.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 .globl B B: .zero 4000 -.align 8 +.p2align 3 .globl A A: .zero 4000 -.align 8 +.p2align 3 .globl P P: .zero 4000 diff --git a/tests/Regression/CodeGen/llvmTests/symbol-redefinition.arm.s b/tests/Regression/CodeGen/llvmTests/symbol-redefinition.arm.s index 0f22ce4e2..3fe71d372 100644 --- a/tests/Regression/CodeGen/llvmTests/symbol-redefinition.arm.s +++ b/tests/Regression/CodeGen/llvmTests/symbol-redefinition.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl var var: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/symbol-redefinition.mips.s b/tests/Regression/CodeGen/llvmTests/symbol-redefinition.mips.s index 46a0fc7be..236d31d74 100644 --- a/tests/Regression/CodeGen/llvmTests/symbol-redefinition.mips.s +++ b/tests/Regression/CodeGen/llvmTests/symbol-redefinition.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl var var: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/symbol-redefinition.riscv.s b/tests/Regression/CodeGen/llvmTests/symbol-redefinition.riscv.s index 239ecada0..6c55ae862 100644 --- a/tests/Regression/CodeGen/llvmTests/symbol-redefinition.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/symbol-redefinition.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl var var: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/tailregccpic.arm.s b/tests/Regression/CodeGen/llvmTests/tailregccpic.arm.s index 20fb5e680..0886c2edc 100644 --- a/tests/Regression/CodeGen/llvmTests/tailregccpic.arm.s +++ b/tests/Regression/CodeGen/llvmTests/tailregccpic.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl a0 a0: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/tailregccpic.mips.s b/tests/Regression/CodeGen/llvmTests/tailregccpic.mips.s index 26682490c..53fa98280 100644 --- a/tests/Regression/CodeGen/llvmTests/tailregccpic.mips.s +++ b/tests/Regression/CodeGen/llvmTests/tailregccpic.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl a0 a0: 
.4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/tailregccpic.riscv.s b/tests/Regression/CodeGen/llvmTests/tailregccpic.riscv.s index c155d82bc..5fbf7f50d 100644 --- a/tests/Regression/CodeGen/llvmTests/tailregccpic.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/tailregccpic.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl a0 a0: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.arm.s b/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.arm.s index c573a9035..5971fdca8 100644 --- a/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.arm.s +++ b/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.arm.s @@ -1,15 +1,15 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 -.align 4 +.p2align 2 .globl a a: .4byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.mips.s b/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.mips.s index 4e71220a9..dfb1a61d2 100644 --- a/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.mips.s +++ b/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.mips.s @@ -1,14 +1,14 @@ .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 -.align 4 +.p2align 2 .globl a a: .4byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.riscv.s b/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.riscv.s index 4f2c3a184..529734758 100644 --- a/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/tglobaladdr-wrapper.riscv.s @@ -1,15 +1,15 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl x x: .4byte 0 -.align 4 +.p2align 2 .globl a a: .4byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/tls16.arm.s b/tests/Regression/CodeGen/llvmTests/tls16.arm.s index 662ae9286..dc0d0377b 100644 --- a/tests/Regression/CodeGen/llvmTests/tls16.arm.s +++ b/tests/Regression/CodeGen/llvmTests/tls16.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl a a: .4byte 4 diff --git a/tests/Regression/CodeGen/llvmTests/tls16.mips.s b/tests/Regression/CodeGen/llvmTests/tls16.mips.s index ac890f669..26525a65b 100644 --- a/tests/Regression/CodeGen/llvmTests/tls16.mips.s +++ b/tests/Regression/CodeGen/llvmTests/tls16.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl a a: .4byte 4 diff --git a/tests/Regression/CodeGen/llvmTests/tls16.riscv.s b/tests/Regression/CodeGen/llvmTests/tls16.riscv.s index 8f1ce36ee..c0b72c081 100644 --- a/tests/Regression/CodeGen/llvmTests/tls16.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/tls16.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl a a: .4byte 4 diff --git a/tests/Regression/CodeGen/llvmTests/tlv-3.arm.s b/tests/Regression/CodeGen/llvmTests/tlv-3.arm.s index b07cdd0d6..7b700ff9b 100644 --- a/tests/Regression/CodeGen/llvmTests/tlv-3.arm.s +++ b/tests/Regression/CodeGen/llvmTests/tlv-3.arm.s @@ -1,7 +1,6 @@ .arch armv7ve .data .data -.align 1 .globl foo foo: .byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/tlv-3.mips.s b/tests/Regression/CodeGen/llvmTests/tlv-3.mips.s index dddc47ccd..922d562c2 100644 --- 
a/tests/Regression/CodeGen/llvmTests/tlv-3.mips.s +++ b/tests/Regression/CodeGen/llvmTests/tlv-3.mips.s @@ -1,6 +1,5 @@ .data .data -.align 1 .globl foo foo: .byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/tlv-3.riscv.s b/tests/Regression/CodeGen/llvmTests/tlv-3.riscv.s index b9e6be560..e809bf1c4 100644 --- a/tests/Regression/CodeGen/llvmTests/tlv-3.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/tlv-3.riscv.s @@ -1,7 +1,6 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 1 .globl foo foo: .byte 1 diff --git a/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..arm.s b/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..arm.s index a3652ce1d..62adc5c11 100644 --- a/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..arm.s +++ b/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..arm.s @@ -1,15 +1,14 @@ .arch armv7ve .data .data -.align 1 .globl x x: .byte 0 -.align 4 +.p2align 2 .globl y y: .4byte 0 -.align 4 +.p2align 2 .globl z z: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..mips.s b/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..mips.s index 82c62d2c7..7fa9e3e74 100644 --- a/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..mips.s +++ b/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..mips.s @@ -1,14 +1,13 @@ .data .data -.align 1 .globl x x: .byte 0 -.align 4 +.p2align 2 .globl y y: .4byte 0 -.align 4 +.p2align 2 .globl z z: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..riscv.s b/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..riscv.s index 29499c008..07243a96f 100644 --- a/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..riscv.s +++ b/tests/Regression/CodeGen/llvmTests/win64-eh-empty-block-2..riscv.s @@ -1,15 +1,14 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 1 .globl x x: .byte 0 -.align 4 +.p2align 2 .globl y y: .4byte 0 -.align 4 +.p2align 2 .globl z z: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.arm.s b/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.arm.s index 87ba3fc59..c67c37279 100644 --- a/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.arm.s +++ b/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.mips.s b/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.mips.s index 2a3c93ddb..621276a03 100644 --- a/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.mips.s +++ b/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.riscv.s b/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.riscv.s index edcb6f450..6c360e26d 100644 --- a/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/x86-64-pic-4.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.arm.s b/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.arm.s index 87ba3fc59..c67c37279 100644 --- a/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.arm.s +++ b/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.arm.s @@ -1,7 +1,7 @@ .arch armv7ve 
.data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.mips.s b/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.mips.s index 2a3c93ddb..621276a03 100644 --- a/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.mips.s +++ b/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.riscv.s b/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.riscv.s index edcb6f450..6c360e26d 100644 --- a/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/x86-64-pic-5.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.arm.s b/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.arm.s index 6d6fab39e..ac43bbd24 100644 --- a/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.arm.s +++ b/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.mips.s b/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.mips.s index d91a1e915..a8fd72762 100644 --- a/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.mips.s +++ b/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.mips.s @@ -1,6 +1,6 @@ .data .data -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.riscv.s b/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.riscv.s index e7cf7d63f..f53efe34a 100644 --- a/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/zero-call-used-regs.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl result result: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.arm.s b/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.arm.s index 8dddfd946..60232bd4a 100644 --- a/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.arm.s +++ b/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.mips.s b/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.mips.s index cc3453509..90a500a4c 100644 --- a/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.mips.s +++ b/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.mips.s @@ -1,10 +1,10 @@ .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 diff --git a/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.riscv.s b/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.riscv.s index 40723ae45..155b2bfe4 100644 --- a/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.riscv.s +++ b/tests/Regression/CodeGen/llvmTests/zero-initialized-in-bss.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .data -.align 4 +.p2align 2 .globl a a: .4byte 0 -.align 4 +.p2align 2 .globl b b: .4byte 0 diff 
--git a/tests/Regression/CodeGen/loadstore.arm.s b/tests/Regression/CodeGen/loadstore.arm.s index 9cb39e417..4db70ad8a 100644 --- a/tests/Regression/CodeGen/loadstore.arm.s +++ b/tests/Regression/CodeGen/loadstore.arm.s @@ -1,23 +1,23 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 .globl y y: .zero 4 -.align 8 +.p2align 3 .globl arr arr: .zero 400 -.align 8 +.p2align 3 .globl x x: .zero 8 -.align 8 +.p2align 3 .globl src src: .zero 16 -.align 8 +.p2align 3 .globl dst dst: .zero 8 diff --git a/tests/Regression/CodeGen/loadstore.mips.s b/tests/Regression/CodeGen/loadstore.mips.s index 92b0352ef..3cc7fa528 100644 --- a/tests/Regression/CodeGen/loadstore.mips.s +++ b/tests/Regression/CodeGen/loadstore.mips.s @@ -1,26 +1,26 @@ .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1065353216 .bss -.align 4 +.p2align 2 .globl y y: .zero 4 -.align 8 +.p2align 3 .globl arr arr: .zero 400 -.align 8 +.p2align 3 .globl x x: .zero 8 -.align 8 +.p2align 3 .globl src src: .zero 16 -.align 8 +.p2align 3 .globl dst dst: .zero 8 diff --git a/tests/Regression/CodeGen/loadstore.riscv.s b/tests/Regression/CodeGen/loadstore.riscv.s index aefa960ee..85a297432 100644 --- a/tests/Regression/CodeGen/loadstore.riscv.s +++ b/tests/Regression/CodeGen/loadstore.riscv.s @@ -1,23 +1,23 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 .globl y y: .zero 4 -.align 8 +.p2align 3 .globl arr arr: .zero 400 -.align 8 +.p2align 3 .globl x x: .zero 8 -.align 8 +.p2align 3 .globl src src: .zero 16 -.align 8 +.p2align 3 .globl dst dst: .zero 8 diff --git a/tests/Regression/CodeGen/select.mips.s b/tests/Regression/CodeGen/select.mips.s index 47a7818a4..ee6d51471 100644 --- a/tests/Regression/CodeGen/select.mips.s +++ b/tests/Regression/CodeGen/select.mips.s @@ -1,6 +1,6 @@ .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1065353216 .4byte 1092616192 diff --git a/tests/Regression/CodeGen/select.riscv.s b/tests/Regression/CodeGen/select.riscv.s index 443ca432a..8f5d2a87d 100644 --- a/tests/Regression/CodeGen/select.riscv.s +++ b/tests/Regression/CodeGen/select.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 3226013648 .4byte 1086918628 diff --git a/tests/Regression/CodeGen/switch.arm.s b/tests/Regression/CodeGen/switch.arm.s index 3ca2737c1..e6ce7484e 100644 --- a/tests/Regression/CodeGen/switch.arm.s +++ b/tests/Regression/CodeGen/switch.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .section .rodata -.align 8 +.p2align 3 __cmmc_jumptable10: .word label7-__cmmc_jumptable10 .word label6-__cmmc_jumptable10 diff --git a/tests/Regression/CodeGen/switch.mips.s b/tests/Regression/CodeGen/switch.mips.s index 0e898c58c..fccc944e6 100644 --- a/tests/Regression/CodeGen/switch.mips.s +++ b/tests/Regression/CodeGen/switch.mips.s @@ -1,6 +1,6 @@ .data .section .rodata -.align 8 +.p2align 3 __cmmc_jumptable10: .word label7-__cmmc_jumptable10 .word label6-__cmmc_jumptable10 diff --git a/tests/Regression/CodeGen/switch.riscv.s b/tests/Regression/CodeGen/switch.riscv.s index 6a0028806..450e2d364 100644 --- a/tests/Regression/CodeGen/switch.riscv.s +++ b/tests/Regression/CodeGen/switch.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 8 +.p2align 3 __cmmc_jumptable10: .word label7-__cmmc_jumptable10 
.word label6-__cmmc_jumptable10 diff --git a/tests/SysY2022/functional/55_sort_test1.riscv.s b/tests/SysY2022/functional/55_sort_test1.riscv.s index 5bcb72dc8..72a85b278 100644 --- a/tests/SysY2022/functional/55_sort_test1.riscv.s +++ b/tests/SysY2022/functional/55_sort_test1.riscv.s @@ -8,36 +8,36 @@ main: addi sp, sp, -72 li a3, 3 li a4, 9 - li a5, 1 - li t0, 5 + li a5, 5 addi a2, sp, 32 + slli a0, a3, 32 sd ra, 0(sp) - slli a1, a3, 32 - sd s0, 8(sp) li a3, 2 - addi a0, a1, 4 + addi a1, a0, 4 + sd s0, 8(sp) li s0, 10 sd s1, 16(sp) - slli a1, a3, 32 sd s2, 24(sp) - slli a3, a5, 32 - sd a0, 32(sp) + sd a1, 32(sp) + slli a1, a3, 32 + li a3, 1 addi a0, a1, 9 - slli a1, t0, 32 + slli a1, a3, 32 sd a0, 40(sp) - addi a0, a1, 6 - sd a3, 48(sp) li a3, 8 - sd a0, 56(sp) - slli a1, a3, 32 + slli a0, a5, 32 + sd a1, 48(sp) + addi a1, a0, 6 + slli a0, a3, 32 + sd a1, 56(sp) mv a3, zero - addi a0, a1, 7 - sd a0, 64(sp) + addi a1, a0, 7 + sd a1, 64(sp) j label2 .p2align 2 label46: addiw a3, a3, 1 - bge a3, a4, label102 + bge a3, a4, label101 .p2align 2 label2: subw a0, s0, a3 @@ -52,7 +52,7 @@ label2: bgt a1, a5, label10 addiw a3, a3, 1 blt a3, a4, label2 - j label102 + j label101 .p2align 2 label8: sw t0, 4(a0) @@ -69,7 +69,7 @@ label10: bgt a1, a5, label10 addiw a3, a3, 1 blt a3, a4, label2 -label102: +label101: mv s1, a2 mv s2, zero j label13 @@ -94,4 +94,4 @@ label13: label11: addiw a3, a3, 1 blt a3, a4, label2 - j label102 + j label101 diff --git a/tests/SysY2022/functional/56_sort_test2.riscv.s b/tests/SysY2022/functional/56_sort_test2.riscv.s index 693b436df..84e200237 100644 --- a/tests/SysY2022/functional/56_sort_test2.riscv.s +++ b/tests/SysY2022/functional/56_sort_test2.riscv.s @@ -8,7 +8,7 @@ main: addi sp, sp, -72 li a4, 3 li a5, 2 - li t0, 8 + li t0, 5 addi a0, sp, 32 slli a2, a4, 32 sd ra, 0(sp) @@ -17,7 +17,6 @@ main: sd s0, 8(sp) slli a2, a5, 32 li s0, 10 - li a5, 5 addi a4, a2, 9 sd s1, 16(sp) li a2, 1 @@ -26,12 +25,13 @@ main: li a3, 1 sd a4, 40(sp) slli a4, a2, 32 - slli a2, a5, 32 + slli a2, t0, 32 sd a4, 48(sp) - slli a5, t0, 32 - addi a4, a2, 6 - addi a2, a5, 7 - sd a4, 56(sp) + li t0, 8 + addi a5, a2, 6 + slli a4, t0, 32 + sd a5, 56(sp) + addi a2, a4, 7 sd a2, 64(sp) addiw a5, a3, -1 lw a4, 0(a1) diff --git a/tests/SysY2022/functional/58_sort_test4.riscv.s b/tests/SysY2022/functional/58_sort_test4.riscv.s index d8272a0b5..8e43394bc 100644 --- a/tests/SysY2022/functional/58_sort_test4.riscv.s +++ b/tests/SysY2022/functional/58_sort_test4.riscv.s @@ -4,43 +4,42 @@ .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[40] RegSpill[0] CalleeSaved[32] - addi sp, sp, -72 + # stack usage: CalleeArg[0] Local[40] RegSpill[0] CalleeSaved[24] + addi sp, sp, -64 li a3, 3 li a4, 9 li a5, 1 - li t0, 5 - addi a0, sp, 32 + addi a0, sp, 24 + slli a1, a3, 32 sd ra, 0(sp) - slli a2, a3, 32 - sd s0, 8(sp) li a3, 2 - addi a1, a2, 4 - li s0, 10 - sd s1, 16(sp) - sd s2, 24(sp) - sd a1, 32(sp) + addi a2, a1, 4 + sd s0, 8(sp) slli a1, a3, 32 - slli a3, a5, 32 + sd s1, 16(sp) + li a3, 5 + sd a2, 24(sp) addi a2, a1, 9 + slli a1, a5, 32 + sd a2, 32(sp) li a5, 7 - slli a1, t0, 32 - sd a2, 40(sp) - addi a2, a1, 6 - sd a3, 48(sp) + slli a2, a3, 32 + sd a1, 40(sp) li a3, 8 - sd a2, 56(sp) + addi a1, a2, 6 slli a2, a3, 32 + sd a1, 48(sp) mv a3, a0 addi a1, a2, 7 mv a2, zero - sd a1, 64(sp) + sd a1, 56(sp) li a1, 1 - blt a1, s0, label5 + li t0, 10 + blt a1, t0, label5 j label47 .p2align 2 label107: - beq a2, t2, label195 + beq a2, t2, label192 .p2align 2 label32: sh2add t0, t2, a0 @@ -48,17 +47,19 @@ 
label32: lw t1, 0(a3) sw t1, 0(t0) sw a2, 0(a3) - bge a1, a4, label199 + bge a1, a4, label196 .p2align 2 label31: addi a3, a3, 4 mv a2, a1 + li t0, 10 addiw a1, a1, 1 - bge a1, s0, label47 + bge a1, t0, label47 .p2align 2 label5: addiw t0, a2, 4 - bge t0, s0, label52 + li t2, 10 + bge t0, t2, label52 sh2add t0, a1, a0 mv t2, a2 mv t1, a1 @@ -66,39 +67,39 @@ label5: mv t3, a1 lw t4, 0(t6) lw t5, 0(t0) - bgt t4, t5, label178 + bgt t4, t5, label176 mv t3, a2 .p2align 2 -label178: +label176: sh2add a6, t3, a0 addiw t6, t1, 1 lw t4, 0(a6) mv t2, t6 lw t5, 4(t0) - bgt t4, t5, label180 + bgt t4, t5, label178 mv t2, t3 .p2align 2 -label180: - sh2add a6, t2, a0 - addiw t6, t1, 2 - lw t4, 0(a6) - mv t3, t6 +label178: + sh2add t6, t2, a0 + addiw a6, t1, 2 + lw t4, 0(t6) + mv t3, a6 lw t5, 8(t0) - bgt t4, t5, label182 + bgt t4, t5, label180 .p2align 2 -label181: +label179: mv t3, t2 .p2align 2 -label182: +label180: sh2add t6, t3, a0 addiw a6, t1, 3 lw t4, 0(t6) mv t2, a6 lw t5, 12(t0) - bgt t4, t5, label184 + bgt t4, t5, label182 mv t2, t3 .p2align 2 -label184: +label182: addiw t1, t1, 4 bge t1, a5, label89 addi t0, t0, 16 @@ -106,26 +107,27 @@ label184: mv t3, t1 lw t4, 0(t6) lw t5, 0(t0) - bgt t4, t5, label178 + bgt t4, t5, label176 mv t3, t2 sh2add a6, t2, a0 addiw t6, t1, 1 lw t4, 0(a6) mv t2, t6 lw t5, 4(t0) - bgt t4, t5, label180 + bgt t4, t5, label178 mv t2, t3 - sh2add a6, t3, a0 - addiw t6, t1, 2 - lw t4, 0(a6) - mv t3, t6 + sh2add t6, t3, a0 + addiw a6, t1, 2 + lw t4, 0(t6) + mv t3, a6 lw t5, 8(t0) - bgt t4, t5, label182 - j label181 + bgt t4, t5, label180 + j label179 .p2align 2 label89: mv t3, t2 - bge t1, s0, label94 + li t0, 10 + bge t1, t0, label94 .p2align 2 label16: sh2add t0, t1, a0 @@ -133,12 +135,13 @@ label16: mv t2, t1 lw t4, 0(t6) lw t5, 0(t0) - bgt t4, t5, label186 + bgt t4, t5, label184 mv t2, t3 .p2align 2 -label186: +label184: addiw t1, t1, 1 - bge t1, s0, label107 + li t3, 10 + bge t1, t3, label107 .p2align 2 label21: addi t0, t0, 4 @@ -147,44 +150,46 @@ label21: mv t2, t1 lw t4, 0(t6) lw t5, 0(t0) - bgt t4, t5, label186 + bgt t4, t5, label184 mv t2, t3 addiw t1, t1, 1 - blt t1, s0, label21 - bne a2, t3, label32 -label195: + li t3, 10 + blt t1, t3, label21 + bne a2, t2, label32 +label192: blt a1, a4, label31 -label199: - mv s1, a0 - mv s2, zero +label196: + mv s0, a0 + mv s1, zero .p2align 2 label26: - lw a0, 0(s1) + lw a0, 0(s0) jal putint - mv a0, s0 + li a0, 10 jal putch - addiw s2, s2, 1 - bge s2, s0, label29 - addi s1, s1, 4 + li a0, 10 + addiw s1, s1, 1 + bge s1, a0, label29 + addi s0, s0, 4 j label26 label94: bne a2, t2, label32 label24: blt a1, a4, label31 - j label199 + j label196 label29: mv a0, zero ld ra, 0(sp) ld s0, 8(sp) ld s1, 16(sp) - ld s2, 24(sp) - addi sp, sp, 72 + addi sp, sp, 64 ret label52: mv t3, a2 mv t1, a1 mv t2, zero - blt a1, s0, label16 + li t0, 10 + blt a1, t0, label16 j label94 label47: mv t2, a2 diff --git a/tests/SysY2022/functional/59_sort_test5.riscv.s b/tests/SysY2022/functional/59_sort_test5.riscv.s index 1035d7654..d63833bb2 100644 --- a/tests/SysY2022/functional/59_sort_test5.riscv.s +++ b/tests/SysY2022/functional/59_sort_test5.riscv.s @@ -4,36 +4,34 @@ .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[40] RegSpill[0] CalleeSaved[32] - addi sp, sp, -72 + # stack usage: CalleeArg[0] Local[40] RegSpill[0] CalleeSaved[24] + addi sp, sp, -64 li a2, 4 li a3, 3 + li t0, 2 li t1, 5 - addi a0, sp, 32 - slli a1, a3, 32 + addi a0, sp, 24 sd ra, 0(sp) - li a3, 9 - addi a4, a1, 4 + slli a4, a3, 32 sd s0, 8(sp) - li a1, 2 - li s0, 
10 - slli a5, a1, 32 + li a3, 9 + addi a1, a4, 4 sd s1, 16(sp) + slli a4, t0, 32 + sd a1, 24(sp) + li t0, 1 + addi a5, a4, 9 li a1, 1 - sd s2, 24(sp) - sd a4, 32(sp) - addi a4, a5, 9 - li a5, 1 - sd a4, 40(sp) - slli a4, a5, 32 + slli a4, t0, 32 + sd a5, 32(sp) + li t0, 8 slli a5, t1, 32 + sd a4, 40(sp) + addi a4, a5, 6 + slli a5, t0, 32 sd a4, 48(sp) - li t1, 8 - addi t0, a5, 6 - slli a4, t1, 32 - sd t0, 56(sp) - addi a5, a4, 7 - sd a5, 64(sp) + addi a4, a5, 7 + sd a4, 56(sp) j label2 .p2align 2 label110: @@ -53,7 +51,8 @@ label2: slliw t0, a2, 1 mv a4, a2 addiw a5, t0, 1 - bge a5, s0, label7 + li t0, 10 + bge a5, t0, label7 .p2align 2 label28: sh2add t0, a5, a0 @@ -69,32 +68,34 @@ label28: lw t4, 0(t5) bgt t3, t4, label7 addw t1, a5, t1 - sh2add t5, a4, a0 - slli t6, t1, 1 + sh2add t6, a4, a0 + slli t5, t1, 1 mv a4, t1 - sw t4, 0(t5) - addiw a5, t6, 1 - sh2add t6, t2, t0 - sw t3, 0(t6) - blt a5, s0, label28 + sw t4, 0(t6) + addiw a5, t5, 1 + sh2add t5, t2, t0 + li t0, 10 + sw t3, 0(t5) + blt a5, t0, label28 j label7 label31: addw t1, a5, t2 - sh2add t5, a4, a0 - slli t6, t1, 1 + sh2add t6, a4, a0 + slli t5, t1, 1 mv a4, t1 - sw t4, 0(t5) - addiw a5, t6, 1 - sh2add t6, t2, t0 - sw t3, 0(t6) - blt a5, s0, label28 + sw t4, 0(t6) + addiw a5, t5, 1 + sh2add t5, t2, t0 + li t0, 10 + sw t3, 0(t5) + blt a5, t0, label28 j label7 label8: addi a2, a0, 36 - lw t0, 32(sp) + lw t0, 24(sp) addiw a4, a3, -1 lw a5, 0(a2) - sw a5, 32(sp) + sw a5, 24(sp) mv a5, zero sw t0, 0(a2) mv t0, a1 @@ -111,16 +112,16 @@ label27: sh2add t5, a5, a0 lw t2, 0(t5) lw t5, 0(t6) - ble t2, t5, label190 + ble t2, t5, label188 .p2align 2 label15: ble a4, zero, label16 addi a2, a2, -4 mv a3, a4 - lw t0, 32(sp) + lw t0, 24(sp) addiw a4, a4, -1 lw a5, 0(a2) - sw a5, 32(sp) + sw a5, 24(sp) mv a5, zero sw t0, 0(a2) mv t0, a1 @@ -147,29 +148,29 @@ label23: bgt a3, t0, label23 j label15 label16: - mv s1, a0 - mv s2, zero + mv s0, a0 + mv s1, zero j label17 .p2align 2 label21: - addi s1, s1, 4 + addi s0, s0, 4 .p2align 2 label17: - lw a0, 0(s1) + lw a0, 0(s0) jal putint - mv a0, s0 + li a0, 10 jal putch - addiw s2, s2, 1 - blt s2, s0, label21 + li a0, 10 + addiw s1, s1, 1 + blt s1, a0, label21 mv a0, zero ld ra, 0(sp) ld s0, 8(sp) ld s1, 16(sp) - ld s2, 24(sp) - addi sp, sp, 72 + addi sp, sp, 64 ret .p2align 2 -label190: +label188: addw t3, t0, t4 sh2add a6, a5, a0 slli t6, t3, 1 diff --git a/tests/SysY2022/functional/61_sort_test7.arm.s b/tests/SysY2022/functional/61_sort_test7.arm.s index c7352fd62..8cd452dcd 100644 --- a/tests/SysY2022/functional/61_sort_test7.arm.s +++ b/tests/SysY2022/functional/61_sort_test7.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 buf: .zero 800 .text diff --git a/tests/SysY2022/functional/61_sort_test7.riscv.s b/tests/SysY2022/functional/61_sort_test7.riscv.s index 82ff74640..603995710 100644 --- a/tests/SysY2022/functional/61_sort_test7.riscv.s +++ b/tests/SysY2022/functional/61_sort_test7.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 buf: .zero 800 .text diff --git a/tests/SysY2022/functional/62_percolation.arm.s b/tests/SysY2022/functional/62_percolation.arm.s index 7743e0592..f8a2a69be 100644 --- a/tests/SysY2022/functional/62_percolation.arm.s +++ b/tests/SysY2022/functional/62_percolation.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 array: .zero 440 .text diff --git a/tests/SysY2022/functional/62_percolation.riscv.s 
b/tests/SysY2022/functional/62_percolation.riscv.s index 69953c774..d36c03941 100644 --- a/tests/SysY2022/functional/62_percolation.riscv.s +++ b/tests/SysY2022/functional/62_percolation.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 array: .zero 440 .text @@ -154,10 +154,10 @@ main: li a0, -1 sd ra, 0(sp) sd s5, 8(sp) -pcrel557: +pcrel556: auipc s5, %pcrel_hi(array) sd s0, 16(sp) - addi s0, s5, %pcrel_lo(pcrel557) + addi s0, s5, %pcrel_lo(pcrel556) sd s7, 24(sp) li s7, 1 sd s4, 32(sp) @@ -177,8 +177,8 @@ pcrel557: bne s7, zero, label145 j label180 .p2align 2 -label449: - bne s8, zero, label472 +label448: + bne s8, zero, label471 .p2align 2 label179: mv a0, s1 @@ -206,7 +206,7 @@ label145: .p2align 2 label208: sltiu s11, s8, 1 - bge s9, s4, label449 + bge s9, s4, label448 .p2align 2 label146: jal getint @@ -222,9 +222,9 @@ label146: sh2add a4, a1, a5 sw a2, 0(a4) bne s10, s2, label219 -pcrel558: +pcrel557: auipc s5, %pcrel_hi(array) - sw zero, %pcrel_lo(pcrel558)(s5) + sw zero, %pcrel_lo(pcrel557)(s5) mv a0, a2 jal findfa mv a5, a0 @@ -252,7 +252,7 @@ label173: lw a1, 4(a4) bne a1, s1, label174 bgt a3, s2, label170 - bge s10, s3, label490 + bge s10, s3, label489 .p2align 2 label167: lw a1, 16(a4) @@ -263,7 +263,7 @@ label167: addiw a0, a2, 4 jal findfa bne a3, a0, label169 - ble s10, s2, label462 + ble s10, s2, label461 .p2align 2 label164: addiw a3, a2, -4 @@ -278,14 +278,14 @@ label164: beq a4, a0, label289 sh2add a2, a4, s0 sw a0, 0(a2) -pcrel559: +pcrel558: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel559)(s5) + lw a0, %pcrel_lo(pcrel558)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 beq s8, zero, label179 - j label477 + j label476 .p2align 2 label174: mv a0, a2 @@ -296,25 +296,25 @@ label174: beq a5, a0, label334 sh2add a1, a5, s0 sw a0, 0(a1) - ble a3, s2, label469 + ble a3, s2, label468 .p2align 2 label170: lw a1, -4(a4) bne a1, s1, label171 blt s10, s3, label167 bgt s10, s2, label164 -pcrel560: +pcrel559: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel560)(s5) + lw a0, %pcrel_lo(pcrel559)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 label283: auipc s5, %pcrel_hi(array) lw a0, %pcrel_lo(label283)(s5) - beq a0, s1, label458 + beq a0, s1, label457 .p2align 2 label161: lw a0, 68(s0) @@ -343,25 +343,25 @@ label219: bgt a3, s2, label170 blt s10, s3, label167 bgt s10, s2, label164 -pcrel561: +pcrel560: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel561)(s5) + lw a0, %pcrel_lo(pcrel560)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 label157: bgt a3, s2, label170 blt s10, s3, label167 bgt s10, s2, label164 -pcrel562: +pcrel561: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel562)(s5) + lw a0, %pcrel_lo(pcrel561)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 label171: mv a0, a2 @@ -372,13 +372,13 @@ label171: bne a3, a0, label172 blt s10, s3, label167 bgt s10, s2, label164 -pcrel563: +pcrel562: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel563)(s5) + lw a0, %pcrel_lo(pcrel562)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 label226: beq s10, s3, label154 @@ -386,41 +386,41 @@ label226: bgt a3, s2, label170 blt s10, s3, label167 bgt s10, s2, label164 -pcrel564: +pcrel563: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel564)(s5) + lw a0, 
%pcrel_lo(pcrel563)(s5) bne a0, s1, label161 -label526: +label525: sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 label239: blt a3, s3, label173 bgt a3, s2, label170 blt s10, s3, label167 bgt s10, s2, label164 -pcrel565: +pcrel564: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel565)(s5) + lw a0, %pcrel_lo(pcrel564)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 label297: bgt s10, s2, label164 -pcrel566: +pcrel565: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel566)(s5) + lw a0, %pcrel_lo(pcrel565)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 .p2align 2 -label454: +label453: beq s8, zero, label179 .p2align 2 -label477: +label476: bne s7, zero, label145 j label180 .p2align 2 @@ -429,25 +429,25 @@ label172: sw a0, 0(a5) blt s10, s3, label167 bgt s10, s2, label164 -pcrel567: +pcrel566: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel567)(s5) + lw a0, %pcrel_lo(pcrel566)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 label169: sh2add a5, a3, s0 sw a0, 0(a5) bgt s10, s2, label164 -pcrel568: +pcrel567: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel568)(s5) + lw a0, %pcrel_lo(pcrel567)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 label289: auipc s5, %pcrel_hi(array) @@ -456,59 +456,59 @@ label289: sltiu s11, s8, 1 blt s9, s4, label146 beq s8, zero, label179 - j label477 + j label476 .p2align 2 label334: bgt a3, s2, label170 blt s10, s3, label167 bgt s10, s2, label164 -pcrel569: +pcrel568: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel569)(s5) + lw a0, %pcrel_lo(pcrel568)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 label232: blt a3, s3, label173 bgt a3, s2, label170 blt s10, s3, label167 bgt s10, s2, label164 -pcrel570: +pcrel569: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel570)(s5) + lw a0, %pcrel_lo(pcrel569)(s5) bne a0, s1, label161 - j label526 + j label525 .p2align 2 -label490: +label489: bgt s10, s2, label164 -pcrel571: +pcrel570: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel571)(s5) + lw a0, %pcrel_lo(pcrel570)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 -label462: +label461: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(label462)(s5) + lw a0, %pcrel_lo(label461)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 -label469: +label468: blt s10, s3, label167 bgt s10, s2, label164 -pcrel572: +pcrel571: auipc s5, %pcrel_hi(array) - lw a0, %pcrel_lo(pcrel572)(s5) + lw a0, %pcrel_lo(pcrel571)(s5) bne a0, s1, label161 sltiu s11, s8, 1 blt s9, s4, label146 - j label454 + j label453 .p2align 2 label269: sltiu s11, s8, 1 @@ -517,11 +517,11 @@ label269: bne s7, zero, label145 j label180 .p2align 2 -label458: +label457: sltiu s11, s8, 1 blt s9, s4, label146 beq s8, zero, label179 - j label477 + j label476 .p2align 2 label275: sltiu s11, s8, 1 @@ -546,6 +546,6 @@ label180: addi sp, sp, 104 ret .p2align 2 -label472: +label471: bne s7, zero, label145 j label180 diff --git a/tests/SysY2022/functional/64_calculator.arm.s b/tests/SysY2022/functional/64_calculator.arm.s index 32341c063..58f1bba4f 100644 --- a/tests/SysY2022/functional/64_calculator.arm.s +++ b/tests/SysY2022/functional/64_calculator.arm.s @@ -1,16 +1,16 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 ints: .zero 40000 -.align 8 +.p2align 3 chas: 
.zero 40000 -.align 8 +.p2align 3 get: .zero 40000 -.align 8 +.p2align 3 get2: .zero 40000 .text diff --git a/tests/SysY2022/functional/64_calculator.riscv.s b/tests/SysY2022/functional/64_calculator.riscv.s index 28ff42319..c21567062 100644 --- a/tests/SysY2022/functional/64_calculator.riscv.s +++ b/tests/SysY2022/functional/64_calculator.riscv.s @@ -1,338 +1,394 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 ints: .zero 40000 -.align 8 +.p2align 3 chas: .zero 40000 -.align 8 +.p2align 3 get: .zero 40000 -.align 8 +.p2align 3 get2: .zero 40000 .text .p2align 2 .globl main main: - addi sp, sp, -104 + addi sp, sp, -88 sd ra, 0(sp) - sd s9, 8(sp) + sd s7, 8(sp) sd s0, 16(sp) sd s5, 24(sp) - sd s6, 32(sp) - sd s1, 40(sp) - sd s7, 48(sp) + sd s4, 32(sp) + sd s6, 40(sp) + sd s1, 48(sp) sd s2, 56(sp) sd s3, 64(sp) - sd s4, 72(sp) - sd s8, 80(sp) - sd s10, 88(sp) - sd s11, 96(sp) + sd s8, 72(sp) + sd s9, 80(sp) jal getch - li s5, 37 - li s4, 47 - xori a1, a0, 13 li s3, 42 + xori a1, a0, 13 xori a4, a0, 10 - li s8, 41 -pcrel833: - auipc a5, %pcrel_hi(get) - li s2, 45 li s0, 32 - li s6, 40 - li s7, 10 + li s6, 10 + li s5, 40 li s1, 43 +pcrel823: + auipc a5, %pcrel_hi(get) + li s4, 64 + li s2, 45 sltu a2, zero, a1 sltu a3, zero, a4 - addi s9, a5, %pcrel_lo(pcrel833) + addi s7, a5, %pcrel_lo(pcrel823) and a1, a2, a3 beq a1, zero, label166 - mv s10, s9 - mv s11, zero + mv s8, s7 + mv s9, zero j label155 .p2align 2 label159: - addi s10, s10, 4 + addi s8, s8, 4 .p2align 2 label155: - sw a0, 0(s10) + sw a0, 0(s8) jal getch - addiw s11, s11, 1 + addiw s9, s9, 1 xori a4, a0, 10 xori a2, a0, 13 sltu a5, zero, a4 sltu a1, zero, a2 and a3, a1, a5 bne a3, zero, label159 - mv a2, s11 + mv a2, s9 label2: auipc a3, %pcrel_hi(get2) -pcrel834: +pcrel824: auipc a4, %pcrel_hi(chas) addi a0, a3, %pcrel_lo(label2) - addi a1, a4, %pcrel_lo(pcrel834) + addi a1, a4, %pcrel_lo(pcrel824) ble a2, zero, label174 - mv a5, zero - mv a3, zero - li t2, 1 - lw a4, 0(s9) - addiw t0, a4, -48 - bltu t0, s7, label14 - beq a4, s6, label16 - li t3, 94 - beq a4, t3, label19 - mv t1, zero - beq a4, s8, label22 - beq a4, s1, label88 - j label782 -label215: - mv t1, t0 - bne a4, s1, label232 + mv a3, s7 + mv t0, zero + mv a4, zero + li t3, 1 + lw a5, 0(s7) + addiw t1, a5, -48 + bltu t1, s6, label14 + beq a5, s5, label16 + li t1, 94 + beq a5, t1, label19 + mv t2, zero + li t0, 41 + beq a5, t0, label22 + beq a5, s1, label88 + j label777 +.p2align 2 +label29: + addi t0, t0, 8 +.p2align 2 +label24: + sw s0, 0(t0) + sh2add t4, t1, a1 + addiw t3, t3, 2 + addiw t1, t1, -1 + sw t2, 4(t0) + lw t2, 0(t4) + bne t2, s5, label29 + mv t2, t1 + bne a5, s1, label742 .p2align 2 label88: - sh2add a5, t1, a1 - mv t0, t2 + sh2add t0, t2, a1 + mv t1, t3 .p2align 2 label89: - lw t2, 0(a5) - xori a7, t2, 47 - xori a6, t2, 42 - xori t5, t2, 45 - xori t4, t2, 43 - sltiu t6, t5, 1 - sltiu t3, t4, 1 - sltiu t5, a6, 1 - or t4, t3, t6 - sltiu t6, a7, 1 - xori a7, t2, 94 - or a6, t5, t6 - xori t6, t2, 37 - or t3, t4, a6 - sltiu t5, t6, 1 + lw t3, 0(t0) + xori a7, t3, 42 + xori t6, t3, 45 + xori t4, t3, 43 + sltiu a6, t6, 1 + sltiu t5, t4, 1 sltiu t6, a7, 1 - or t4, t5, t6 - or a6, t3, t4 - beq a6, zero, label93 - sh2add t3, t0, a0 - addiw t1, t1, -1 - addiw t0, t0, 2 - sw s0, 0(t3) - sw t2, 4(t3) - beq t1, zero, label388 - addi a5, a5, -4 - j label89 -.p2align 2 -label388: - addiw t1, t1, 1 - sh2add a5, t1, a1 - sw s1, 0(a5) - bne a4, s2, label758 + or t4, t5, a6 + xori t5, t3, 47 + sltiu a6, 
t5, 1 + or a7, t6, a6 + xori a6, t3, 37 + or t5, t4, a7 + sltiu t6, a6, 1 + xori a7, t3, 94 + sltiu t4, a7, 1 + or a6, t6, t4 + or a7, t5, a6 + beq a7, zero, label93 + sh2add t4, t1, a0 + addiw t2, t2, -1 + addiw t1, t1, 2 + sw s0, 0(t4) + sw t3, 4(t4) + bne t2, zero, label97 + addiw t2, t2, 1 + sh2add t0, t2, a1 + sw s1, 0(t0) + beq a5, s2, label36 + bne a5, s3, label765 .p2align 2 -label36: - sh2add a5, t1, a1 +label49: + sh2add t0, t2, a1 .p2align 2 -label37: - lw t2, 0(a5) - xori a6, t2, 42 - xori t5, t2, 45 - xori t3, t2, 43 - sltiu t6, t5, 1 - sltiu t4, t3, 1 +label50: + lw t3, 0(t0) + xori a6, t3, 37 + xori t5, t3, 47 + xori t4, t3, 42 + sltiu a7, t5, 1 + sltiu t6, t4, 1 sltiu t5, a6, 1 - or t3, t4, t6 - xori t6, t2, 47 - sltiu a7, t6, 1 - xori t6, t2, 37 - or a6, t5, a7 - xori a7, t2, 94 - sltiu t5, t6, 1 - or t4, t3, a6 - sltiu t3, a7, 1 - or t6, t5, t3 - or a6, t4, t6 - beq a6, zero, label41 - sh2add t3, t0, a0 - addiw t1, t1, -1 - addiw t0, t0, 2 - sw s0, 0(t3) - sw t2, 4(t3) - bne t1, zero, label45 - addiw t1, t1, 1 - sh2add a5, t1, a1 - sw s2, 0(a5) - beq a4, s3, label49 - beq a4, s4, label78 - mv a5, t0 - beq a4, s5, label68 -label788: - mv a4, t1 - sh2add t0, a5, a0 - addiw t2, a5, 1 - addiw a3, a3, 1 - mv a5, t1 - sw s0, 0(t0) - bgt a2, a3, label13 - j label98 -.p2align 2 -label29: - addi a5, a5, 8 -.p2align 2 -label24: - sw s0, 0(a5) - sh2add t3, t0, a1 - addiw t2, t2, 2 - addiw t0, t0, -1 - sw t1, 4(a5) - lw t1, 0(t3) - bne t1, s6, label29 - mv t1, t0 - beq a4, s1, label88 - mv t0, t2 - beq a4, s2, label36 - beq a4, s3, label49 -.p2align 2 -label749: - bne a4, s4, label767 + or t4, t6, a7 + xori a7, t3, 94 + sltiu t6, a7, 1 + or a6, t5, t6 + or a7, t4, a6 + beq a7, zero, label283 + sh2add t4, t1, a0 + addiw t2, t2, -1 + addiw t1, t1, 2 + sw s0, 0(t4) + sw t3, 4(t4) + bne t2, zero, label55 + addiw t2, t2, 1 + li t3, 47 + sh2add t0, t2, a1 + sw s3, 0(t0) + bne a5, t3, label763 .p2align 2 label78: - sh2add a5, t1, a1 + sh2add t0, t2, a1 .p2align 2 label79: - lw t2, 0(a5) - xori t6, t2, 37 - xori t5, t2, 47 - xori t3, t2, 42 - sltiu a6, t5, 1 - sltiu t4, t3, 1 - sltiu t5, t6, 1 - or t3, t4, a6 - xori a6, t2, 94 - sltiu t4, a6, 1 - or a7, t5, t4 - or t6, t3, a7 - beq t6, zero, label346 - sh2add t3, t0, a0 - addiw t1, t1, -1 - addiw t0, t0, 2 - sw s0, 0(t3) - sw t2, 4(t3) - bne t1, zero, label84 - mv a5, t0 - addiw t1, t1, 1 - sh2add t0, t1, a1 - sw s4, 0(t0) - bne a4, s5, label770 -.p2align 2 -label68: - sh2add a4, t1, a1 + lw t3, 0(t0) + xori s7, t3, 94 + xori a7, t3, 37 + xori t6, t3, 47 + xori t4, t3, 42 + sltiu a6, t6, 1 + sltiu t5, t4, 1 + sltiu t6, a7, 1 + or t4, t5, a6 + sltiu t5, s7, 1 + or a6, t6, t5 + or a7, t4, a6 + beq a7, zero, label346 + sh2add t4, t1, a0 + addiw t2, t2, -1 + addiw t1, t1, 2 + sw s0, 0(t4) + sw t3, 4(t4) + bne t2, zero, label84 mv t0, t1 -.p2align 2 -label69: - lw t1, 0(a4) - xori t6, t1, 37 - xori t3, t1, 47 - xori t2, t1, 42 - sltiu t5, t3, 1 - sltiu t4, t2, 1 - sltiu t3, t6, 1 - or t2, t4, t5 - xori t5, t1, 94 - sltiu a6, t5, 1 - or t4, t3, a6 - or t6, t2, t4 - beq t6, zero, label73 - sh2add t2, a5, a0 - addiw t0, t0, -1 - addiw a5, a5, 2 - sw s0, 0(t2) - sw t1, 4(t2) - beq t0, zero, label332 - addi a4, a4, -4 - j label69 + addiw t2, t2, 1 + li t3, 47 + sh2add t1, t2, a1 + sw t3, 0(t1) + li t1, 37 + beq a5, t1, label68 + sh2add a5, t0, a0 + addiw t3, t0, 1 + addiw a4, a4, 1 + mv t0, t2 + sw s0, 0(a5) + bgt a2, a4, label13 + j label98 .p2align 2 label14: - sh2add t0, t2, a0 - addiw a3, a3, 1 - addiw t2, t2, 1 - sw a4, 0(t0) - ble a2, a3, 
label98 + sh2add t1, t3, a0 + addiw a4, a4, 1 + addiw t3, t3, 1 + sw a5, 0(t1) + ble a2, a4, label98 .p2align 2 label13: - addi s9, s9, 4 - lw a4, 0(s9) - addiw t0, a4, -48 - bltu t0, s7, label14 - bne a4, s6, label764 + addi a3, a3, 4 + lw a5, 0(a3) + addiw t1, a5, -48 + bltu t1, s6, label14 + bne a5, s5, label758 .p2align 2 label16: - addiw a5, a5, 1 - sh2add t0, a5, a1 - sw s6, 0(t0) - li t3, 94 - beq a4, t3, label19 - mv t1, a5 - beq a4, s8, label22 - beq a4, s1, label88 - mv t0, t2 - beq a4, s2, label36 -label236: - bne a4, s3, label749 + addiw t0, t0, 1 + sh2add t1, t0, a1 + sw s5, 0(t1) + li t1, 94 + bne a5, t1, label200 .p2align 2 -label49: - sh2add a5, t1, a1 +label19: + addiw t2, t0, 1 + li t4, 94 + sh2add t1, t2, a1 + sw t4, 0(t1) + li t0, 41 + beq a5, t0, label22 + beq a5, s1, label88 + mv t1, t3 + bne a5, s2, label771 .p2align 2 -label50: +label36: + sh2add t0, t2, a1 +.p2align 2 +label37: + lw t3, 0(t0) + xori a7, t3, 42 + xori t6, t3, 45 + xori t5, t3, 43 + sltiu a6, t6, 1 + sltiu t4, t5, 1 + sltiu t6, a7, 1 + or t5, t4, a6 + xori t4, t3, 47 + sltiu a6, t4, 1 + or a7, t6, a6 + xori a6, t3, 37 + or t4, t5, a7 + sltiu t6, a6, 1 + xori a7, t3, 94 + sltiu t5, a7, 1 + or a6, t6, t5 + or a7, t4, a6 + beq a7, zero, label41 + sh2add t4, t1, a0 + addiw t2, t2, -1 + addiw t1, t1, 2 + sw s0, 0(t4) + sw t3, 4(t4) + beq t2, zero, label265 + addi t0, t0, -4 + j label37 +.p2align 2 +label763: + mv t0, t1 + li t1, 37 + bne a5, t1, label784 +.p2align 2 +label68: + sh2add a5, t2, a1 + mv t1, t2 +.p2align 2 +label69: lw t2, 0(a5) - xori t5, t2, 47 xori t6, t2, 37 + xori a7, t2, 94 + xori t5, t2, 47 xori t3, t2, 42 sltiu a6, t5, 1 sltiu t4, t3, 1 sltiu t5, t6, 1 or t3, t4, a6 - xori t4, t2, 94 - sltiu a6, t4, 1 - or t6, t5, a6 - or a7, t3, t6 - beq a7, zero, label283 + sltiu t4, a7, 1 + or t6, t5, t4 + or a6, t3, t6 + beq a6, zero, label73 sh2add t3, t0, a0 addiw t1, t1, -1 addiw t0, t0, 2 sw s0, 0(t3) sw t2, 4(t3) - beq t1, zero, label292 + beq t1, zero, label332 addi a5, a5, -4 - j label50 + j label69 .p2align 2 -label764: - li t3, 94 - bne a4, t3, label775 -.p2align 2 -label19: - addiw t1, a5, 1 - li t3, 94 - sh2add t0, t1, a1 - sw t3, 0(t0) - bne a4, s8, label207 +label200: + mv t2, t0 + li t0, 41 + bne a5, t0, label740 .p2align 2 label22: - sh2add a5, t1, a1 - addiw t0, t1, -1 - lw t3, 0(a5) - beq t3, s6, label215 - sh2add a5, t2, a0 - mv t1, t3 + sh2add t0, t2, a1 + addiw t1, t2, -1 + lw t4, 0(t0) + beq t4, s5, label215 + sh2add t0, t3, a0 + mv t2, t4 j label24 +.p2align 2 +label265: + addiw t2, t2, 1 + sh2add t0, t2, a1 + sw s2, 0(t0) + beq a5, s3, label49 + li t3, 47 + beq a5, t3, label78 + mv t0, t1 + li t1, 37 + beq a5, t1, label68 +label65: + sh2add a5, t0, a0 + addiw t3, t0, 1 + addiw a4, a4, 1 + mv t0, t2 + sw s0, 0(a5) + bgt a2, a4, label13 + j label98 +.p2align 2 +label758: + li t1, 94 + beq a5, t1, label19 + mv t2, t0 + li t0, 41 + beq a5, t0, label22 + beq a5, s1, label88 +label777: + mv t1, t3 + beq a5, s2, label36 +label236: + beq a5, s3, label49 + j label743 +.p2align 2 +label742: + mv t1, t3 + beq a5, s2, label36 + beq a5, s3, label49 +.p2align 2 +label743: + li t3, 47 + beq a5, t3, label78 +label761: + mv t0, t1 + li t1, 37 + beq a5, t1, label68 + sh2add a5, t0, a0 + addiw t3, t0, 1 + addiw a4, a4, 1 + mv t0, t2 + sw s0, 0(a5) + bgt a2, a4, label13 + j label98 +label740: + beq a5, s1, label88 + mv t1, t3 + beq a5, s2, label36 + j label236 +label765: + li t3, 47 + beq a5, t3, label78 + j label761 +label771: + beq a5, s3, label49 + j label743 label98: - ble a5, zero, 
label101 - addiw a2, a5, -3 + ble t0, zero, label101 + addiw a2, t0, -3 ble a2, zero, label515 - sh2add a2, a5, a1 - mv a3, t2 - mv a4, a5 + sh2add a2, t0, a1 + mv a3, t3 + mv a4, t0 + j label139 +.p2align 2 +label143: + addi a2, a2, -16 .p2align 2 label139: sh2add a5, a3, a0 @@ -351,58 +407,8 @@ label139: sw s0, 24(a5) sw t0, 28(a5) li a5, 3 - ble a4, a5, label539 - addi a2, a2, -16 - j label139 -.p2align 2 -label292: - addiw t1, t1, 1 - sh2add a5, t1, a1 - sw s3, 0(a5) - beq a4, s4, label78 - mv a5, t0 - beq a4, s5, label68 - mv a4, t1 - sh2add t0, t0, a0 - addiw t2, a5, 1 - addiw a3, a3, 1 - sw s0, 0(t0) - mv a5, t1 - bgt a2, a3, label13 - j label98 -.p2align 2 -label207: - beq a4, s1, label88 - mv t0, t2 - beq a4, s2, label36 - beq a4, s3, label49 - j label749 -.p2align 2 -label758: - beq a4, s3, label49 - beq a4, s4, label78 -label767: - mv a5, t0 - beq a4, s5, label68 - mv a4, t1 - sh2add t0, t0, a0 - addiw t2, a5, 1 - addiw a3, a3, 1 - sw s0, 0(t0) - mv a5, t1 - bgt a2, a3, label13 - j label98 -.p2align 2 -label775: - mv t1, a5 - beq a4, s8, label22 - beq a4, s1, label88 -label782: - mv t0, t2 - beq a4, s2, label36 - j label236 -label539: - mv t2, a3 + bgt a4, a5, label143 + mv t3, a3 label144: ble a4, zero, label544 sh2add a1, a4, a1 @@ -410,26 +416,30 @@ label144: label153: addi a1, a1, -4 label149: - sh2add a2, t2, a0 + sh2add a2, t3, a0 lw a3, 0(a1) addiw a4, a4, -1 - addiw t2, t2, 2 + addiw t3, t3, 2 sw s0, 0(a2) sw a3, 4(a2) bgt a4, zero, label153 label101: - sh2add a1, t2, a0 - li a5, 64 -pcrel835: + sh2add a1, t3, a0 +pcrel825: auipc a3, %pcrel_hi(ints) - sw a5, 0(a1) - addi a1, a3, %pcrel_lo(pcrel835) + sw s4, 0(a1) + addi a1, a3, %pcrel_lo(pcrel825) lw a2, 4(a0) - beq a2, a5, label103 - bne a2, s1, label409 - li a2, 1 + beq a2, s4, label103 + beq a2, s1, label410 mv a4, zero - j label105 + li a2, 1 + j label120 +.p2align 2 +label119: + remw t2, a3, t0 + li t3, 94 + bne t1, t3, label756 .p2align 2 label115: beq t0, zero, label445 @@ -443,10 +453,9 @@ label116: addiw a4, a4, -1 addiw a2, a2, 1 sw t1, -4(a5) - sh2add t0, a2, a0 - li a5, 64 - lw a3, 0(t0) - beq a3, a5, label103 + sh2add a5, a2, a0 + lw a3, 0(a5) + beq a3, s4, label103 .p2align 2 label136: beq a3, s1, label105 @@ -455,19 +464,19 @@ label120: sh2add a3, a2, a0 lw a5, 0(a3) xori t3, a5, 47 - xori t1, a5, 42 + xori t2, a5, 42 xori t0, a5, 45 - sltiu t4, t1, 1 - sltiu t2, t0, 1 - sltiu t1, t3, 1 - or t0, t2, t4 + sltiu t4, t2, 1 + sltiu t1, t0, 1 + sltiu t2, t3, 1 + or t0, t1, t4 xori t4, a5, 37 sltiu t5, t4, 1 xori t4, a5, 94 - or t3, t1, t5 - sltiu t1, t4, 1 - or t2, t0, t3 - or t0, t2, t1 + or t3, t2, t5 + sltiu t2, t4, 1 + or t1, t0, t3 + or t0, t1, t2 beq t0, zero, label123 .p2align 2 label105: @@ -476,77 +485,40 @@ label105: lw t0, 0(a5) lw a3, -4(a5) lw t1, 0(t4) - addw t3, t0, a3 - mv t2, t3 - beq t1, s1, label724 - mv t2, zero -.p2align 2 -label724: - subw t4, a3, t0 - mv t3, t4 - beq t1, s2, label726 + addw t2, t0, a3 mv t3, t2 + beq t1, s1, label720 + mv t3, zero .p2align 2 -label726: - mulw t4, t0, a3 +label720: + subw t4, a3, t0 mv t2, t4 - beq t1, s3, label728 + beq t1, s2, label722 mv t2, t3 .p2align 2 -label728: - bne t1, s4, label430 - divw t2, a3, t0 - bne t1, s5, label111 -.p2align 2 -label119: - remw t2, a3, t0 - li t3, 94 - beq t1, t3, label115 - sw t2, -4(a5) - addiw a4, a4, -1 - addiw a2, a2, 1 - li a5, 64 - sh2add t0, a2, a0 - lw a3, 0(t0) - bne a3, a5, label136 - j label103 -.p2align 2 -label111: - li t3, 94 - beq t1, t3, label115 -.p2align 2 -label113: - sw t2, -4(a5) - addiw a4, a4, -1 - 
addiw a2, a2, 1 - li a5, 64 - sh2add t0, a2, a0 - lw a3, 0(t0) - bne a3, a5, label136 - j label103 +label722: + mulw t4, t0, a3 + mv t3, t4 + beq t1, s3, label724 + mv t3, t2 .p2align 2 -label430: - beq t1, s5, label119 +label724: + li t4, 47 + beq t1, t4, label108 + mv t2, t3 + li t3, 37 + beq t1, t3, label119 li t3, 94 beq t1, t3, label115 sw t2, -4(a5) addiw a4, a4, -1 addiw a2, a2, 1 - li a5, 64 - sh2add t0, a2, a0 - lw a3, 0(t0) - bne a3, a5, label136 + sh2add a5, a2, a0 + lw a3, 0(a5) + bne a3, s4, label136 j label103 label123: - bne a5, s0, label124 -label133: - addiw a2, a2, 1 - li a5, 64 - sh2add t0, a2, a0 - lw a3, 0(t0) - bne a3, a5, label136 - j label103 -label124: + beq a5, s0, label133 addiw a4, a4, 1 addiw t0, a5, -48 sh2add t1, a4, a1 @@ -569,6 +541,28 @@ label128: beq t1, s0, label125 addi a3, a3, 4 j label128 +.p2align 2 +label108: + divw t2, a3, t0 + li t3, 37 + beq t1, t3, label119 + li t3, 94 + beq t1, t3, label115 +.p2align 2 +label113: + sw t2, -4(a5) + addiw a4, a4, -1 + addiw a2, a2, 1 + sh2add a5, a2, a0 + lw a3, 0(a5) + bne a3, s4, label136 + j label103 +label133: + addiw a2, a2, 1 + sh2add a5, a2, a0 + lw a3, 0(a5) + bne a3, s4, label136 + j label103 label125: addiw a3, a2, -1 addw a2, a5, a3 @@ -578,116 +572,132 @@ label103: jal putint mv a0, zero ld ra, 0(sp) - ld s9, 8(sp) + ld s7, 8(sp) ld s0, 16(sp) ld s5, 24(sp) - ld s6, 32(sp) - ld s1, 40(sp) - ld s7, 48(sp) + ld s4, 32(sp) + ld s6, 40(sp) + ld s1, 48(sp) ld s2, 56(sp) ld s3, 64(sp) - ld s4, 72(sp) - ld s8, 80(sp) - ld s10, 88(sp) - ld s11, 96(sp) - addi sp, sp, 104 + ld s8, 72(sp) + ld s9, 80(sp) + addi sp, sp, 88 ret +label215: + mv t2, t1 + beq a5, s1, label88 + j label777 .p2align 2 label332: - addiw a4, t0, 1 - addiw t2, a5, 1 - addiw a3, a3, 1 - sh2add t0, a5, a0 - sh2add t1, a4, a1 - mv a5, a4 - sw s5, 0(t1) - sw s0, 0(t0) - bgt a2, a3, label13 - j label98 -label232: + addiw t2, t1, 1 + li t3, 37 + addiw a4, a4, 1 + sh2add a5, t2, a1 + sw t3, 0(a5) + addiw t3, t0, 1 + sh2add a5, t0, a0 mv t0, t2 - beq a4, s2, label36 - j label236 -.p2align 2 -label770: - mv a4, t1 - sh2add t0, a5, a0 - addiw t2, a5, 1 - addiw a3, a3, 1 - sw s0, 0(t0) - mv a5, t1 - bgt a2, a3, label13 + sw s0, 0(a5) + bgt a2, a4, label13 j label98 +.p2align 2 +label756: + sw t2, -4(a5) + addiw a4, a4, -1 + addiw a2, a2, 1 + sh2add a5, a2, a0 + lw a3, 0(a5) + bne a3, s4, label136 + j label103 label283: - addiw t1, t1, 1 - sh2add a5, t1, a1 - sw s3, 0(a5) - beq a4, s4, label78 - mv a5, t0 + addiw t2, t2, 1 + sh2add t0, t2, a1 + sw s3, 0(t0) + li t3, 47 + beq a5, t3, label78 + mv t0, t1 j label62 label346: - mv a5, t0 - addiw t1, t1, 1 - sh2add t0, t1, a1 - sw s4, 0(t0) + mv t0, t1 + addiw t2, t2, 1 + li t3, 47 + sh2add t1, t2, a1 + sw t3, 0(t1) label62: - beq a4, s5, label68 - j label788 + li t1, 37 + beq a5, t1, label68 + j label65 label73: - addiw a4, t0, 1 - addiw t2, a5, 1 - addiw a3, a3, 1 - sh2add t0, a5, a0 - sh2add t1, a4, a1 - mv a5, a4 - sw s5, 0(t1) - sw s0, 0(t0) - bgt a2, a3, label13 + addiw t2, t1, 1 + li t3, 37 + addiw a4, a4, 1 + sh2add a5, t2, a1 + sw t3, 0(a5) + addiw t3, t0, 1 + sh2add a5, t0, a0 + mv t0, t2 + sw s0, 0(a5) + bgt a2, a4, label13 + j label98 +label784: + sh2add a5, t0, a0 + addiw t3, t0, 1 + addiw a4, a4, 1 + mv t0, t2 + sw s0, 0(a5) + bgt a2, a4, label13 j label98 -label41: - addiw t1, t1, 1 - sh2add a5, t1, a1 - sw s2, 0(a5) - beq a4, s3, label49 - beq a4, s4, label78 - mv a5, t0 - beq a4, s5, label68 - j label788 label93: - addiw t1, t1, 1 - sh2add a5, t1, a1 - sw s1, 0(a5) - beq a4, s2, label36 + 
addiw t2, t2, 1 + sh2add t0, t2, a1 + sw s1, 0(t0) + beq a5, s2, label36 j label236 -label544: - mv t2, a3 - j label101 -label445: - li t2, 1 - j label113 +label41: + addiw t2, t2, 1 + sh2add t0, t2, a1 + sw s2, 0(t0) + beq a5, s3, label49 + li t3, 47 + beq a5, t3, label78 + mv t0, t1 + li t1, 37 + beq a5, t1, label68 + j label65 label515: - mv a4, a5 + mv a4, t0 mv a3, zero j label144 +label445: + li t2, 1 + j label113 .p2align 2 -label45: - addi a5, a5, -4 - j label37 +label55: + addi t0, t0, -4 + j label50 +.p2align 2 +label84: + addi t0, t0, -4 + j label79 label166: mv a2, zero j label2 -label409: - mv a4, zero - li a2, 1 - j label120 .p2align 2 -label84: - addi a5, a5, -4 - j label79 -label174: - li t2, 1 - mv a5, zero - j label98 +label97: + addi t0, t0, -4 + j label89 label481: li a5, 1 j label125 +label544: + mv t3, a3 + j label101 +label174: + li t3, 1 + mv t0, zero + j label98 +label410: + li a2, 1 + mv a4, zero + j label105 diff --git a/tests/SysY2022/functional/65_color.arm.s b/tests/SysY2022/functional/65_color.arm.s index 820b22ef0..e9537f8d6 100644 --- a/tests/SysY2022/functional/65_color.arm.s +++ b/tests/SysY2022/functional/65_color.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 dp: .zero 52907904 .text diff --git a/tests/SysY2022/functional/65_color.riscv.s b/tests/SysY2022/functional/65_color.riscv.s index 1f8bc0209..a7c19d276 100644 --- a/tests/SysY2022/functional/65_color.riscv.s +++ b/tests/SysY2022/functional/65_color.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 dp: .zero 52907904 .text @@ -224,28 +224,28 @@ main: sd zero, 112(sp) jal getint mv s0, a0 -pcrel737: +pcrel736: auipc a1, %pcrel_hi(dp) - li t0, -1 mv t6, zero mv t5, zero - li a6, -1 mv t3, zero - mv t2, zero li a2, 18 - li a0, 567 - addi a3, a1, %pcrel_lo(pcrel737) - slli t1, a6, 32 - slli a5, a0, 4 - lui a1, 40 - lui a0, 718 - addiw a4, a1, -544 - addiw t4, a0, -1600 + mv t2, zero + li t0, -1 + lui a0, 40 + addi a3, a1, %pcrel_lo(pcrel736) + addiw a4, a0, -544 + li a1, 567 + li a0, -1 + slli a5, a1, 4 + slli t1, a0, 32 + lui a1, 718 + addiw t4, a1, -1600 j label211 .p2align 2 label389: addiw t2, t2, 1 - bge t2, a2, label730 + bge t2, a2, label729 mv t6, zero mv t3, zero .p2align 2 @@ -328,7 +328,7 @@ label211: mv t6, zero j label211 .p2align 2 -label730: +label729: addiw t5, t5, 1 bge t5, a2, label220 add a3, a3, t4 diff --git a/tests/SysY2022/functional/68_brainfk.arm.s b/tests/SysY2022/functional/68_brainfk.arm.s index 532bee591..1de3daecc 100644 --- a/tests/SysY2022/functional/68_brainfk.arm.s +++ b/tests/SysY2022/functional/68_brainfk.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 tape: .zero 262144 -.align 8 +.p2align 3 program: .zero 131072 .text diff --git a/tests/SysY2022/functional/68_brainfk.riscv.s b/tests/SysY2022/functional/68_brainfk.riscv.s index 30106fba7..8ea732bd4 100644 --- a/tests/SysY2022/functional/68_brainfk.riscv.s +++ b/tests/SysY2022/functional/68_brainfk.riscv.s @@ -1,10 +1,10 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 tape: .zero 262144 -.align 8 +.p2align 3 program: .zero 131072 .text diff --git a/tests/SysY2022/functional/69_expr_eval.arm.s b/tests/SysY2022/functional/69_expr_eval.arm.s index 5e82ca708..0832daeab 100644 --- a/tests/SysY2022/functional/69_expr_eval.arm.s +++ b/tests/SysY2022/functional/69_expr_eval.arm.s @@ -1,10 +1,10 @@ .arch armv7ve 
.data .bss -.align 8 +.p2align 3 oprs: .zero 1024 -.align 8 +.p2align 3 ops: .zero 1024 .text diff --git a/tests/SysY2022/functional/69_expr_eval.riscv.s b/tests/SysY2022/functional/69_expr_eval.riscv.s index 643a31648..36f03ba04 100644 --- a/tests/SysY2022/functional/69_expr_eval.riscv.s +++ b/tests/SysY2022/functional/69_expr_eval.riscv.s @@ -1,10 +1,10 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 oprs: .zero 1024 -.align 8 +.p2align 3 ops: .zero 1024 .text @@ -28,16 +28,16 @@ main: jal getint mv s8, a0 jal getch - li s7, 256 - li s6, 45 li s3, 10 li s5, 43 -pcrel812: + li s7, 256 +pcrel810: auipc s2, %pcrel_hi(ops) -pcrel813: + li s6, 45 +pcrel811: auipc s0, %pcrel_hi(oprs) - addi s4, s2, %pcrel_lo(pcrel812) - addi s1, s0, %pcrel_lo(pcrel813) + addi s4, s2, %pcrel_lo(pcrel810) + addi s1, s0, %pcrel_lo(pcrel811) .p2align 2 label2: jal getch @@ -57,41 +57,41 @@ label8: mv s9, a0 j label12 .p2align 2 -label764: - xori a2, a0, 32 - xori a3, a0, 10 - sltiu a1, a2, 1 - sltiu a4, a3, 1 - or a2, a1, a4 - beq a2, zero, label36 -.p2align 2 -label62: - jal getch - xori a2, a0, 10 +label762: xori a3, a0, 32 - sltiu a4, a2, 1 + xori a2, a0, 10 sltiu a1, a3, 1 + sltiu a4, a2, 1 or a3, a1, a4 - bne a3, zero, label62 + beq a3, zero, label36 +.p2align 2 +label62: + jal getch + xori a4, a0, 10 + xori a2, a0, 32 + sltiu a3, a4, 1 + sltiu a1, a2, 1 + or a2, a1, a3 + bne a2, zero, label62 addiw a1, a0, -48 bltu a1, s3, label325 label41: jal getch li a2, 1 mv a1, s9 -pcrel814: +pcrel812: auipc s2, %pcrel_hi(ops) - lw a4, %pcrel_lo(pcrel814)(s2) + lw a4, %pcrel_lo(pcrel812)(s2) sh2add a5, a4, s4 bne a4, zero, label47 label46: addiw a3, a4, 1 -pcrel815: +pcrel813: auipc s2, %pcrel_hi(ops) - sw a3, %pcrel_lo(pcrel815)(s2) + sw a3, %pcrel_lo(pcrel813)(s2) sw s10, 4(a5) beq a2, zero, label347 -label782: +label780: mv s9, a0 mv s10, a1 mv s11, a2 @@ -135,13 +135,13 @@ label38: bltu a1, s3, label40 mv a1, s9 mv a2, zero -pcrel816: +pcrel814: auipc s2, %pcrel_hi(ops) - lw a4, %pcrel_lo(pcrel816)(s2) + lw a4, %pcrel_lo(pcrel814)(s2) sh2add a5, a4, s4 bne a4, zero, label47 addiw a3, a4, 1 - sw a3, %pcrel_lo(pcrel816)(s2) + sw a3, %pcrel_lo(pcrel814)(s2) sw s10, 4(a5) label347: mv s9, a1 @@ -168,12 +168,12 @@ label25: .p2align 2 label28: jal getch - xori a2, a0, 10 - xori a3, a0, 32 - sltiu a4, a2, 1 - sltiu a1, a3, 1 - or a3, a1, a4 - bne a3, zero, label28 + xori a3, a0, 10 + xori a2, a0, 32 + sltiu a4, a3, 1 + sltiu a1, a2, 1 + or a2, a1, a4 + bne a2, zero, label28 .p2align 2 label284: mv s10, a0 @@ -291,100 +291,101 @@ label500: label66: jal getch xori a4, a0, 10 - xori a3, a0, 32 - sltiu a2, a4, 1 - sltiu a1, a3, 1 - or a3, a1, a2 - bne a3, zero, label66 + xori a2, a0, 32 + sltiu a3, a4, 1 + sltiu a1, a2, 1 + or a2, a1, a3 + bne a2, zero, label66 .p2align 2 label67: addiw a1, a0, -48 bltu a1, s3, label431 jal getch li a2, 1 - mv a1, s9 -pcrel817: + mv a1, a0 + mv a0, s9 +pcrel815: auipc s2, %pcrel_hi(ops) - lw a4, %pcrel_lo(pcrel817)(s2) + lw a4, %pcrel_lo(pcrel815)(s2) bne a4, zero, label75 j label74 .p2align 2 label79: addw a3, a4, t0 -pcrel818: +pcrel816: auipc s0, %pcrel_hi(oprs) - lw a5, %pcrel_lo(pcrel818)(s0) + lw a5, %pcrel_lo(pcrel816)(s0) addiw a4, a5, 1 sh2add t0, a4, s1 - sw a4, %pcrel_lo(pcrel818)(s0) + sw a4, %pcrel_lo(pcrel816)(s0) sw a3, 0(t0) -pcrel819: +pcrel817: auipc s2, %pcrel_hi(ops) - lw a4, %pcrel_lo(pcrel819)(s2) + lw a4, %pcrel_lo(pcrel817)(s2) beq a4, zero, label74 .p2align 2 label75: sh2add a5, a4, s4 addiw 
t0, a4, -1 lw a3, 0(a5) -pcrel820: +pcrel818: auipc s2, %pcrel_hi(ops) - sw t0, %pcrel_lo(pcrel820)(s2) -pcrel821: + sw t0, %pcrel_lo(pcrel818)(s2) +pcrel819: auipc s0, %pcrel_hi(oprs) - lw a5, %pcrel_lo(pcrel821)(s0) - addi t3, a5, -2 - addi t2, a5, -1 + lw a5, %pcrel_lo(pcrel819)(s0) + addi t2, a5, -2 + addi t3, a5, -1 sh2add t1, a5, s1 lw a4, 0(t1) - sw t2, %pcrel_lo(pcrel821)(s0) + sw t3, %pcrel_lo(pcrel819)(s0) lw t0, -4(t1) - sw t3, %pcrel_lo(pcrel821)(s0) + sw t2, %pcrel_lo(pcrel819)(s0) beq a3, s5, label79 - bne a3, s6, label771 + bne a3, s6, label769 subw a3, t0, a4 - lw a5, %pcrel_lo(pcrel821)(s0) + lw a5, %pcrel_lo(pcrel819)(s0) addiw a4, a5, 1 sh2add t0, a4, s1 - sw a4, %pcrel_lo(pcrel821)(s0) + sw a4, %pcrel_lo(pcrel819)(s0) sw a3, 0(t0) - lw a4, %pcrel_lo(pcrel820)(s2) + lw a4, %pcrel_lo(pcrel818)(s2) bne a4, zero, label75 j label74 .p2align 2 -label771: +label769: li a5, 42 beq a3, a5, label87 li a5, 47 - bne a3, a5, label788 + bne a3, a5, label786 divw a3, t0, a4 -pcrel822: +pcrel820: auipc s0, %pcrel_hi(oprs) - lw a5, %pcrel_lo(pcrel822)(s0) + lw a5, %pcrel_lo(pcrel820)(s0) addiw a4, a5, 1 sh2add t0, a4, s1 - sw a4, %pcrel_lo(pcrel822)(s0) + sw a4, %pcrel_lo(pcrel820)(s0) sw a3, 0(t0) -pcrel823: +pcrel821: auipc s2, %pcrel_hi(ops) - lw a4, %pcrel_lo(pcrel823)(s2) + lw a4, %pcrel_lo(pcrel821)(s2) bne a4, zero, label75 j label74 .p2align 2 -label788: +label786: li a5, 37 - bne a3, a5, label791 + bne a3, a5, label789 remw a3, t0, a4 -pcrel824: +pcrel822: auipc s0, %pcrel_hi(oprs) - lw a5, %pcrel_lo(pcrel824)(s0) + lw a5, %pcrel_lo(pcrel822)(s0) addiw a4, a5, 1 sh2add t0, a4, s1 - sw a4, %pcrel_lo(pcrel824)(s0) + sw a4, %pcrel_lo(pcrel822)(s0) sw a3, 0(t0) -pcrel825: +pcrel823: auipc s2, %pcrel_hi(ops) - lw a4, %pcrel_lo(pcrel825)(s2) + lw a4, %pcrel_lo(pcrel823)(s2) bne a4, zero, label75 j label74 .p2align 2 @@ -397,14 +398,14 @@ label31: or a2, a1, a4 bne a2, zero, label304 xori a1, s10, 42 - xori a4, s10, 47 + xori a3, s10, 47 xori a5, s10, 37 li s11, 20 sltiu a2, a1, 1 - sltiu a3, a4, 1 - sltiu a4, a5, 1 - or a1, a2, a3 - or a2, a1, a4 + sltiu a4, a3, 1 + sltiu a3, a5, 1 + or a1, a2, a4 + or a2, a1, a3 bne a2, zero, label33 mv s11, zero xori a2, a0, 32 @@ -418,8 +419,8 @@ label31: label74: auipc s0, %pcrel_hi(oprs) lw a5, %pcrel_lo(label74)(s0) - mv s9, a0 - mv s10, a1 + mv s9, a1 + mv s10, a0 mv s11, a2 sh2add a4, a5, s1 lw a3, 0(a4) @@ -428,7 +429,7 @@ label74: .p2align 2 label304: mv s11, s3 - bne s3, zero, label764 + bne s3, zero, label762 .p2align 2 label63: xori a2, a0, 32 @@ -441,16 +442,16 @@ label63: .p2align 2 label87: mulw a3, a4, t0 -pcrel826: +pcrel824: auipc s0, %pcrel_hi(oprs) - lw a5, %pcrel_lo(pcrel826)(s0) + lw a5, %pcrel_lo(pcrel824)(s0) addiw a4, a5, 1 sh2add t0, a4, s1 - sw a4, %pcrel_lo(pcrel826)(s0) + sw a4, %pcrel_lo(pcrel824)(s0) sw a3, 0(t0) -pcrel827: +pcrel825: auipc s2, %pcrel_hi(ops) - lw a4, %pcrel_lo(pcrel827)(s2) + lw a4, %pcrel_lo(pcrel825)(s2) bne a4, zero, label75 j label74 label50: @@ -463,11 +464,11 @@ label51: sh2add a5, a3, s1 sw a3, %pcrel_lo(label51)(s0) sw a4, 0(a5) -pcrel828: +pcrel826: auipc s2, %pcrel_hi(ops) - lw a4, %pcrel_lo(pcrel828)(s2) + lw a4, %pcrel_lo(pcrel826)(s2) sh2add a5, a4, s4 - beq a4, zero, label767 + beq a4, zero, label765 .p2align 2 label47: sh2add t1, a4, s4 @@ -492,19 +493,19 @@ label47: .p2align 2 label49: addiw t0, a4, -1 -pcrel829: +pcrel827: auipc s2, %pcrel_hi(ops) - sw t0, %pcrel_lo(pcrel829)(s2) -pcrel830: + sw t0, %pcrel_lo(pcrel827)(s2) +pcrel828: auipc s0, %pcrel_hi(oprs) - lw a5, 
%pcrel_lo(pcrel830)(s0) + lw a5, %pcrel_lo(pcrel828)(s0) addi t2, a5, -2 addi t3, a5, -1 sh2add t1, a5, s1 lw a4, 0(t1) - sw t3, %pcrel_lo(pcrel830)(s0) + sw t3, %pcrel_lo(pcrel828)(s0) lw t0, -4(t1) - sw t2, %pcrel_lo(pcrel830)(s0) + sw t2, %pcrel_lo(pcrel828)(s0) beq a3, s5, label50 beq a3, s6, label54 li a5, 42 @@ -514,21 +515,21 @@ pcrel830: .p2align 2 label33: beq s11, zero, label63 - xori a2, a0, 32 - xori a3, a0, 10 - sltiu a1, a2, 1 - sltiu a4, a3, 1 - or a2, a1, a4 - bne a2, zero, label62 + xori a3, a0, 32 + xori a2, a0, 10 + sltiu a1, a3, 1 + sltiu a4, a2, 1 + or a3, a1, a4 + bne a3, zero, label62 j label36 -label767: +label765: addiw a3, a4, 1 -pcrel831: +pcrel829: auipc s2, %pcrel_hi(ops) - sw a3, %pcrel_lo(pcrel831)(s2) + sw a3, %pcrel_lo(pcrel829)(s2) sw s10, 4(a5) beq a2, zero, label347 - j label782 + j label780 label105: mv a0, zero ld ra, 0(sp) @@ -547,32 +548,32 @@ label105: addi sp, sp, 104 ret .p2align 2 -label791: +label789: mv a3, zero -pcrel832: +pcrel830: auipc s0, %pcrel_hi(oprs) - lw a5, %pcrel_lo(pcrel832)(s0) + lw a5, %pcrel_lo(pcrel830)(s0) addiw a4, a5, 1 sh2add t0, a4, s1 - sw a4, %pcrel_lo(pcrel832)(s0) + sw a4, %pcrel_lo(pcrel830)(s0) sw zero, 0(t0) -pcrel833: +pcrel831: auipc s2, %pcrel_hi(ops) - lw a4, %pcrel_lo(pcrel833)(s2) + lw a4, %pcrel_lo(pcrel831)(s2) bne a4, zero, label75 j label74 label54: subw a4, t0, a4 -pcrel834: +pcrel832: auipc s0, %pcrel_hi(oprs) - lw t0, %pcrel_lo(pcrel834)(s0) + lw t0, %pcrel_lo(pcrel832)(s0) addiw a3, t0, 1 sh2add a5, a3, s1 - sw a3, %pcrel_lo(pcrel834)(s0) + sw a3, %pcrel_lo(pcrel832)(s0) sw a4, 0(a5) -pcrel835: +pcrel833: auipc s2, %pcrel_hi(ops) - lw a4, %pcrel_lo(pcrel835)(s2) + lw a4, %pcrel_lo(pcrel833)(s2) sh2add a5, a4, s4 bne a4, zero, label47 j label46 @@ -613,10 +614,11 @@ label88: jal getch addiw a2, a0, -48 bltu a2, s3, label90 - mv a1, s9 + mv a1, a0 mv a2, zero -pcrel836: +pcrel834: auipc s2, %pcrel_hi(ops) - lw a4, %pcrel_lo(pcrel836)(s2) + lw a4, %pcrel_lo(pcrel834)(s2) + mv a0, s9 bne a4, zero, label75 j label74 diff --git a/tests/SysY2022/functional/70_dijkstra.arm.s b/tests/SysY2022/functional/70_dijkstra.arm.s index 4bb567b3f..9ca50d6a4 100644 --- a/tests/SysY2022/functional/70_dijkstra.arm.s +++ b/tests/SysY2022/functional/70_dijkstra.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 e: .zero 1024 .text diff --git a/tests/SysY2022/functional/70_dijkstra.riscv.s b/tests/SysY2022/functional/70_dijkstra.riscv.s index 0f8a54d54..114c287a0 100644 --- a/tests/SysY2022/functional/70_dijkstra.riscv.s +++ b/tests/SysY2022/functional/70_dijkstra.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 e: .zero 1024 .text diff --git a/tests/SysY2022/functional/71_full_conn.riscv.s b/tests/SysY2022/functional/71_full_conn.riscv.s index 6b1be0465..2423eb3f4 100644 --- a/tests/SysY2022/functional/71_full_conn.riscv.s +++ b/tests/SysY2022/functional/71_full_conn.riscv.s @@ -4,8 +4,8 @@ .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[100] RegSpill[288] CalleeSaved[40] - addi sp, sp, -432 + # stack usage: CalleeArg[0] Local[100] RegSpill[296] CalleeSaved[40] + addi sp, sp, -440 sd ra, 0(sp) sd s0, 8(sp) sd s1, 16(sp) @@ -13,8 +13,8 @@ main: sd s3, 32(sp) jal getint ble a0, zero, label17 - sd a0, 176(sp) - addi s0, sp, 200 + sd a0, 184(sp) + addi s0, sp, 208 mv s1, zero mv s2, s0 mv s3, zero @@ -39,1046 +39,1045 @@ label7: j label7 .p2align 2 label12: - lw t5, 200(sp) - li a1, 85 - li t1, -123 + 
lw t6, 208(sp) + li a0, 85 + li a3, 23 li t3, -75 - li a7, -94 - sd t5, 400(sp) - mulw a0, t5, a1 - lw t6, 204(sp) - li a1, 23 - sd t6, 384(sp) - mulw a3, t6, a1 - lw a6, 208(sp) - addw a2, a0, a3 - li a0, -82 - sd a6, 376(sp) - mulw a3, a6, a0 - lw t5, 212(sp) - li a0, -103 - addw a1, a2, a3 - sd t5, 392(sp) - mulw a2, t5, a0 - lw t6, 216(sp) - addw a3, a1, a2 - sd t6, 368(sp) - mulw a1, t6, t1 - lw t4, 220(sp) - li t1, 47 - addw a2, a3, a1 - slli a3, t4, 6 - sd t4, 360(sp) - addw a1, a2, a3 - li t4, -102 - sd a3, 96(sp) - lw a6, 224(sp) - slli t2, a6, 4 - subw t0, t2, a6 + li t4, -39 + sd t6, 408(sp) + mulw a2, t6, a0 + lw a6, 212(sp) + mulw a4, a6, a3 sd a6, 312(sp) - slli a4, t0, 3 - sd t2, 72(sp) - subw a2, zero, a4 - sd t0, 424(sp) - addw a3, a1, a2 - lw t5, 228(sp) - li a1, 50 - sd t5, 304(sp) - mulw a4, t5, a1 - lw a6, 232(sp) - addw a2, a3, a4 - li a4, -59 - sd a6, 352(sp) - mulw a5, a6, a4 + addw a1, a2, a4 + lw a7, 216(sp) + li a2, -82 + sd a7, 392(sp) + mulw a3, a7, a2 + lw t6, 220(sp) + li a2, -103 + addw a4, a1, a3 + sd t6, 400(sp) + mulw a1, t6, a2 + lw a6, 224(sp) + addw a3, a4, a1 + li a1, -123 + sd a6, 384(sp) + mulw a5, a6, a1 + lw t6, 228(sp) + addw a4, a3, a5 + slli t0, t6, 6 + addw a3, a4, t0 + sd t6, 376(sp) + sd t0, 104(sp) + lw a7, 232(sp) + slli t5, a7, 4 + subw t1, t5, a7 + sd a7, 320(sp) + slli t0, t1, 3 + sd t5, 80(sp) + subw a5, zero, t0 + li t5, 110 + addw a4, a3, a5 + sd t1, 432(sp) + li a3, 50 lw t6, 236(sp) - addw a3, a2, a5 - mulw a4, t6, t1 - sd t6, 104(sp) - li t1, -106 - addw a2, a3, a4 - lw t2, 240(sp) - li a4, -111 - sd t2, 136(sp) - mulw a5, t2, a4 - lw t6, 244(sp) - addw a3, a2, a5 + mulw t0, t6, a3 + sd t6, 368(sp) + addw a5, a4, t0 + lw a7, 240(sp) + li t0, -59 + sd a7, 360(sp) + mulw t1, a7, t0 + lw a6, 244(sp) + addw a4, a5, t1 + li t1, 47 + sd a6, 112(sp) + mulw t0, a6, t1 + lw t6, 248(sp) + addw a5, a4, t0 + li t0, -111 + sd t6, 128(sp) + mulw t1, t6, t0 + lw a6, 252(sp) + addw a4, a5, t1 li a5, -67 - sd t6, 112(sp) - mulw a2, t6, a5 - lw t5, 248(sp) - addw a4, a3, a2 - mulw a5, t5, t1 - sd t5, 416(sp) - addw a3, a4, a5 - lw t6, 252(sp) - mulw a5, t6, t3 - sd t6, 344(sp) - addw a4, a3, a5 - lw t5, 256(sp) - mulw a5, t5, t4 - sd t5, 336(sp) - addw a3, a4, a5 - lw t6, 260(sp) - slli a5, t6, 4 - addw a4, a5, t6 - sd t6, 328(sp) - sh1add a5, a4, a3 - slli t0, a4, 1 - li a3, -39 - sd t0, 408(sp) - lw t4, 264(sp) - mulw t0, t4, a3 - sd t4, 320(sp) - addw a4, a5, t0 - li t4, 110 - lw t5, 268(sp) - slli t0, t5, 6 - addw t1, t0, t5 - sd t5, 120(sp) + sd a6, 200(sp) + mulw t1, a6, a5 addw a5, a4, t1 - lw t6, 272(sp) - li t1, 47 - sd t6, 192(sp) + sd t1, 40(sp) + li t1, -106 + lw t6, 256(sp) mulw t0, t6, t1 - lw t5, 276(sp) + sd t6, 424(sp) addw a4, a5, t0 - li t0, 113 - sd t5, 144(sp) - mulw t1, t5, t0 - lw t6, 280(sp) + lw a6, 260(sp) + mulw t0, a6, t3 + sd a6, 352(sp) + addw a5, a4, t0 + li a6, -102 + lw t6, 264(sp) + mulw t0, t6, a6 + sd t6, 344(sp) + addw a4, a5, t0 + lw a7, 268(sp) + slli a5, a7, 4 + addw t0, a5, a7 + sd a7, 336(sp) + sh1add a5, t0, a4 + slli t3, t0, 1 + sd t3, 416(sp) + lw a6, 272(sp) + mulw t0, a6, t4 + sd a6, 328(sp) + addw a4, a5, t0 + lw t6, 276(sp) + slli t0, t6, 6 + addw t1, t0, t6 + sd t6, 120(sp) addw a5, a4, t1 + lw a6, 280(sp) li t1, 47 - mulw t0, t6, t4 - sd t6, 184(sp) - addw a4, a5, t0 - lw t5, 284(sp) - mulw t0, t5, t1 - sd t5, 152(sp) + sd a6, 136(sp) + mulw a4, a6, t1 + lw t6, 284(sp) + addw t0, a5, a4 + li a5, 113 + sd t6, 144(sp) + mulw t1, t6, a5 + lw a6, 288(sp) + addw a4, t0, t1 + li t1, 47 + mulw t0, a6, t5 
+ sd a6, 152(sp) addw a5, a4, t0 - lw t6, 288(sp) - slli t1, t6, 2 - subw t0, zero, t1 + lw t6, 292(sp) + mulw t0, t6, t1 sd t6, 160(sp) addw a4, a5, t0 - lw t5, 292(sp) - sh2add t1, t5, t5 + lw a6, 296(sp) + slli t1, a6, 2 + subw t0, zero, t1 + sd a6, 168(sp) + addw a5, a4, t0 + lw t6, 300(sp) + sh2add t1, t6, t6 slliw t0, t1, 4 - sd t5, 168(sp) + sd t6, 176(sp) li t1, 46 - addw a5, a4, t0 - li t5, 127 - lw t6, 296(sp) - mulw t2, t6, t1 - sd t6, 128(sp) + addw a4, a5, t0 + li t6, 127 + lw a6, 304(sp) + mulw t0, a6, t1 + sd a6, 192(sp) li t1, -106 - addw t0, a5, t2 - max a4, t0, zero + addw a5, a4, t0 li t0, 39 - min t2, a4, t5 - ld t5, 400(sp) - ld t6, 384(sp) - mulw a5, t2, t0 - ld a6, 376(sp) - slli a4, t6, 6 - mulw t2, t5, t1 - sh3add t4, a6, a6 - subw t3, a4, t6 - ld t5, 392(sp) + max t2, a5, zero + min a4, t2, t6 + ld t6, 408(sp) + ld a6, 312(sp) + mulw a5, a4, t0 + ld a7, 392(sp) + mulw t2, t6, t1 + slli a4, a6, 6 + sh3add t4, a7, a7 + ld t6, 400(sp) + subw t3, a4, a6 + ld a6, 384(sp) sh1add t1, t3, t2 slliw t3, t4, 1 - slli t4, t5, 5 - subw t6, zero, t3 - addw t2, t1, t6 - ld t6, 368(sp) - subw t1, t5, t4 - slli t4, t6, 3 - addw t3, t2, t1 - subw t5, zero, t4 + subw t5, zero, t3 + slli t3, t6, 5 + addw t2, t1, t5 + slli t5, a6, 3 + subw t1, t6, t3 + ld t6, 376(sp) + addw t4, t2, t1 + ld a7, 320(sp) li t1, 47 - ld t4, 360(sp) - addw t2, t3, t5 - ld a6, 312(sp) - slli t6, a6, 2 - mulw t5, t4, t1 - subw t4, zero, t6 - addw t3, t2, t5 - li t2, 67 - addw t1, t3, t4 - ld t5, 304(sp) + subw t2, zero, t5 + slli t5, a7, 2 + addw t3, t4, t2 + mulw t4, t6, t1 + ld t6, 368(sp) + subw t1, zero, t5 + addw t2, t3, t4 + li t5, -94 + ld a7, 360(sp) + li t3, 67 + addw t4, t2, t1 + ld a6, 112(sp) + mulw t2, t6, t3 + ld t6, 128(sp) + mulw t3, a7, t5 + addw t1, t4, t2 + li t5, -121 + addw t2, t1, t3 + mulw t4, a6, t5 + slli t3, t6, 3 + addw t1, t2, t4 + ld a6, 200(sp) + subw t4, t3, t6 + li t6, -21 + addw t2, t1, t4 + mulw t4, a6, t6 + ld t6, 424(sp) + addw t1, t2, t4 + slli t3, t6, 4 + subw t4, t3, t6 + slli a6, t4, 2 + li t4, -43 + subw t5, zero, a6 ld a6, 352(sp) - ld t6, 104(sp) - mulw t3, t5, t2 - ld t2, 136(sp) - addw t4, t1, t3 - mulw t1, a6, a7 - li a7, -110 - li a6, -121 - addw t3, t4, t1 - mulw t5, t6, a6 - slli t4, t2, 3 - li a6, -21 - ld t6, 112(sp) - addw t1, t3, t5 - subw t5, t4, t2 - addw t3, t1, t5 - mulw t4, t6, a6 - ld t5, 416(sp) - addw t1, t3, t4 - slli t2, t5, 4 - subw t3, t2, t5 - slli t6, t3, 2 - li t3, -43 - subw t4, zero, t6 + addw t2, t1, t5 ld t6, 344(sp) - addw t2, t1, t4 - ld t5, 336(sp) - li t4, 105 - mulw a6, t6, t3 - ld t6, 328(sp) - li t3, -42 - addw t1, t2, a6 - mulw a6, t5, t4 - ld t4, 320(sp) - mulw t5, t6, t3 - addw t2, t1, a6 - li a6, 87 - addw t1, t2, t5 - mulw t3, t4, a6 - ld t5, 120(sp) - li t4, 29 - addw t2, t1, t3 - ld t6, 192(sp) + li t5, 105 + ld a7, 336(sp) + mulw t1, a6, t4 + ld a6, 328(sp) + li t4, -42 + addw t3, t2, t1 + mulw t2, t6, t5 + ld t6, 120(sp) + mulw t5, a7, t4 + addw t1, t3, t2 + li t3, 87 + addw t2, t1, t5 + li t5, 29 + mulw t1, a6, t3 + ld a6, 136(sp) + addw t4, t2, t1 li t1, -106 - mulw a6, t5, t4 - ld t5, 144(sp) - addw t3, t2, a6 - slli t4, t5, 5 - mulw a6, t6, t1 - ld t6, 184(sp) - addw t2, t3, a6 - subw t3, t5, t4 + mulw t2, t6, t5 + ld t6, 144(sp) + addw t3, t4, t2 + mulw t5, a6, t1 + slli t4, t6, 5 + ld a6, 152(sp) + addw t2, t3, t5 + subw t3, t6, t4 + li t6, -110 li t4, -100 addw t1, t2, t3 - ld t5, 152(sp) - mulw t3, t6, a7 + mulw t3, a6, t6 ld t6, 160(sp) addw t2, t1, t3 + ld a6, 168(sp) li t3, -22 - mulw a6, t5, t4 - ld t5, 
168(sp) - mulw t4, t6, t3 - addw t1, t2, a6 + mulw t5, t6, t4 + mulw t4, a6, t3 + ld t6, 176(sp) + addw t1, t2, t5 li t3, -75 - ld t6, 128(sp) + ld a6, 192(sp) addw t2, t1, t4 - mulw t4, t5, t3 - li t5, 127 - li t3, -125 - addw t1, t2, t4 - mulw t4, t6, t3 - addw a6, t1, t4 + mulw t5, t6, t3 + li t4, -125 + li t6, 127 + addw t1, t2, t5 + mulw t3, a6, t4 li t4, 26 - max t2, a6, zero li a6, 77 - min t3, t2, t5 - ld t5, 400(sp) - mulw t6, t3, a6 - mulw t2, t5, t4 + addw t5, t1, t3 + max t2, t5, zero + min t3, t2, t6 + ld t6, 408(sp) + mulw t5, t3, a6 + mulw t2, t6, t4 + ld a6, 312(sp) li t3, 76 - addw t1, a5, t6 - ld t6, 384(sp) - ld a6, 376(sp) - mulw t4, t6, t3 - li t3, -70 + addw t1, a5, t5 + ld a7, 392(sp) + li t5, -70 + mulw t4, a6, t3 addw a5, t2, t4 - li t4, 29 - mulw t2, a6, t3 - li a6, -95 - addw t3, a5, t2 - sd t2, 88(sp) - ld t5, 392(sp) + mulw t3, a7, t5 + li a7, -95 + li t5, 29 + addw t4, a5, t3 + sd t3, 96(sp) + ld t6, 400(sp) + ld a6, 384(sp) + mulw a5, t6, t5 + mulw t3, a6, a7 + ld t6, 376(sp) + addw t2, t4, a5 + li a6, 52 + ld a7, 320(sp) + sh1add t4, t6, t6 + addw a5, t2, t3 ld t6, 368(sp) - mulw t2, t5, t4 - ld t4, 360(sp) - addw a5, t3, t2 - sh1add t5, t4, t4 - mulw t2, t6, a6 - li t6, 52 + slliw t3, t4, 5 + mulw t4, a7, a6 + addw t2, a5, t3 + ld a7, 360(sp) + slli t3, t6, 4 + addw a5, t2, t4 + ld a6, 112(sp) + addw t4, t3, t6 + ld t6, 128(sp) + slli t5, t4, 2 + sh2add t4, a7, a7 + subw t2, zero, t5 addw t3, a5, t2 - ld a6, 312(sp) - slliw a5, t5, 5 - ld t5, 304(sp) + subw a5, zero, t4 + slli t4, a6, 4 addw t2, t3, a5 - slli t3, t5, 4 - mulw t4, a6, t6 + addw t5, t4, a6 + slli t3, t6, 4 + ld a6, 200(sp) + sh1add a5, t5, t2 + addw t4, t3, t6 + ld t6, 424(sp) + slli t5, t4, 1 + li t4, 102 + subw a7, zero, t5 + sh1add t5, t6, t6 + addw t2, a5, a7 + mulw a5, a6, t4 ld a6, 352(sp) - addw t6, t3, t5 - addw a5, t2, t4 - sh2add t5, a6, a6 - slli a7, t6, 2 - ld t6, 104(sp) - subw t4, zero, a7 - li a7, 82 - addw t2, a5, t4 - slli t4, t6, 4 - subw a5, zero, t5 - addw t5, t4, t6 + li t4, -38 addw t3, t2, a5 - ld t2, 136(sp) - sh1add a5, t5, t3 - ld t6, 112(sp) - slli t4, t2, 4 - addw t3, t4, t2 - slli t5, t3, 1 - li t3, 102 - subw a6, zero, t5 - ld t5, 416(sp) - addw t2, a5, a6 - sh1add t4, t5, t5 - mulw a6, t6, t3 ld t6, 344(sp) - li t3, -38 - addw a5, t2, a6 - ld t5, 336(sp) - sh1add t2, t4, a5 - mulw a6, t6, t3 - li t4, 27 - ld t6, 328(sp) - addw a5, t2, a6 - mulw t3, t5, t4 - li t4, 110 + sh1add a5, t5, t3 + ld a7, 336(sp) + li t5, 27 + mulw t3, a6, t4 + ld a6, 328(sp) addw t2, a5, t3 - mulw t5, t6, t4 + mulw a5, t6, t5 + ld t6, 120(sp) + li t5, 110 + addw t3, t2, a5 + mulw t4, a7, t5 li a5, 116 - ld t4, 320(sp) + addw t2, t3, t4 + mulw t5, a6, a5 + mulw t4, t6, t0 + ld a6, 136(sp) addw t3, t2, t5 - ld t5, 120(sp) - ld t6, 192(sp) - mulw t2, t4, a5 - mulw t4, t5, t0 - addw a6, t3, t2 - ld t5, 144(sp) - slli t0, t6, 6 - addw t2, a6, t4 + slli t0, a6, 6 + ld t6, 144(sp) + addw t2, t3, t4 + subw t5, a6, t0 li t4, -99 - subw a6, t6, t0 - ld t6, 184(sp) + ld a6, 152(sp) + addw t3, t2, t5 + slli t2, a6, 6 + mulw t5, t6, t4 + addw t0, t3, t5 addw t3, t2, a6 - mulw t2, t5, t4 - slli t4, t6, 6 - addw t0, t3, t2 - addw t2, t4, t6 - addw t3, t0, t2 - sd t2, 80(sp) - ld t5, 152(sp) + addw t2, t0, t3 + sd t3, 88(sp) ld t6, 160(sp) - slli t4, t5, 4 - subw t0, t4, t5 - ld t5, 168(sp) - sh3add t2, t0, t3 - mulw t4, t6, a3 - sh1add t3, t5, t5 - ld t6, 128(sp) - addw t0, t2, t4 - li t5, 127 - slliw t2, t3, 1 + ld a6, 168(sp) + slli t4, t6, 4 + subw t0, t4, t6 + ld t6, 176(sp) + li t4, -39 + sh3add 
t3, t0, t2 + mulw t2, a6, t4 + sh1add a6, t6, t6 + addw t0, t3, t2 + li t6, 127 + slliw t4, a6, 1 li t3, 94 - subw t4, zero, t2 - addw a3, t0, t4 - mulw t0, t6, t3 - addw t4, a3, t0 - max t2, t4, zero - min t0, t2, t5 - ld t5, 400(sp) - slliw t4, t0, 7 - ld t6, 384(sp) - subw t3, t4, t0 - ld a6, 376(sp) - subw t2, t6, a4 - li t4, -23 - addw a3, t1, t3 - mulw t0, t5, t4 - ld t5, 392(sp) - addw t1, a2, t0 + subw t5, zero, t4 + ld a6, 192(sp) + addw t2, t0, t5 + mulw t5, a6, t3 + li a6, -23 + addw t4, t2, t5 + max t0, t4, zero + min t2, t0, t6 + ld t6, 408(sp) + slliw t4, t2, 7 + subw t3, t4, t2 + addw t0, t1, t3 + mulw t2, t6, a6 + ld t1, 40(sp) + ld a6, 312(sp) + addw t3, t1, t2 + ld a7, 392(sp) + subw t4, a6, a4 + li t2, 49 + addw t1, t3, t4 + ld t6, 400(sp) + mulw a4, a7, t2 + ld a6, 384(sp) + addw t3, t1, a4 + mulw t4, t6, a3 + ld t6, 376(sp) + sh3add a3, a6, a6 + addw a4, t3, t4 + ld t1, 432(sp) + sh3add t3, a3, a4 + mulw t4, t6, a0 + slli t5, t1, 1 ld t6, 368(sp) - li a2, 49 - addw t0, t1, t2 - ld t4, 360(sp) - mulw t1, t5, a1 - mulw t2, a6, a2 - li a1, 85 - addw a4, t0, t2 - sh3add t2, t6, t6 - addw t0, a4, t1 - sh3add a4, t2, t0 - ld t0, 424(sp) - mulw t2, t4, a1 - ld t5, 304(sp) - slli t3, t0, 1 - addw t1, a4, t2 - sh1add t0, t5, t5 - subw a1, zero, t3 + addw a3, t3, t4 + subw a4, zero, t5 + sh1add t1, t6, t6 + li t3, 125 + addw t4, a3, a4 + ld a7, 360(sp) + sh2add a4, t1, t4 + ld a6, 112(sp) + li t1, -117 + ld t6, 128(sp) + mulw a3, a7, t3 + mulw a7, a6, t1 + addw t5, a4, a3 + slli a4, t6, 6 + addw t4, t5, a7 + addw a6, a4, t6 + ld t6, 424(sp) + subw t5, zero, a6 ld a6, 352(sp) - addw a4, t1, a1 - ld t6, 104(sp) - li t1, 125 - sh2add a1, t0, a4 - ld t2, 136(sp) - li t0, -117 - mulw a4, a6, t1 - mulw t5, t6, t0 - addw t4, a1, a4 - slli a1, t2, 6 - addw t3, t4, t5 - addw t6, a1, t2 - ld t5, 416(sp) - subw t4, zero, t6 + addw a7, t4, t5 + mulw t5, t6, t3 ld t6, 344(sp) - addw a6, t3, t4 - li t4, 110 - mulw t2, t5, t1 - ld t5, 336(sp) - mulw t1, t6, t4 - addw t3, a6, t2 - slli t4, t5, 5 - ld t6, 328(sp) - addw t2, t3, t1 - subw a6, t5, t4 - li t1, -123 + addw t4, a7, t5 + li t5, 110 + mulw a7, a6, t5 + slli t5, t6, 5 + addw t3, t4, a7 + subw a6, t6, t5 + ld a7, 336(sp) li t5, 83 - addw t3, t2, a6 - ld t4, 320(sp) - mulw a6, t6, t1 - addw t2, t3, a6 - li t3, 122 - mulw t6, t4, t5 - li t4, 11 - addw t1, t2, t6 - ld t5, 120(sp) - ld t6, 192(sp) - mulw a6, t5, t3 - ld t5, 144(sp) - addw t2, t1, a6 - li a6, -47 - mulw t1, t6, t4 - ld t6, 184(sp) - li t4, -23 - addw t3, t2, t1 - mulw t2, t5, t4 - ld t5, 152(sp) - addw t1, t3, t2 - slli t4, t5, 5 + addw t4, t3, a6 + ld a6, 328(sp) + mulw t6, a7, a1 + addw t3, t4, t6 + mulw a7, a6, t5 + ld t6, 120(sp) + li a6, 122 + addw t4, t3, a7 + li a7, 11 mulw t3, t6, a6 + ld a6, 136(sp) + addw t5, t4, t3 + ld t6, 144(sp) + mulw t4, a6, a7 + li a6, -23 + addw t3, t5, t4 + mulw a7, t6, a6 + li t5, -47 + ld a6, 152(sp) + addw t4, t3, a7 ld t6, 160(sp) - addw t2, t1, t3 - ld t5, 168(sp) - subw t1, zero, t4 - addw t3, t2, t1 - mulw t4, t6, t0 - li t2, 95 - ld t6, 128(sp) - addw t1, t3, t4 - li t3, 118 - mulw t4, t5, t2 - li t5, 127 - addw t0, t1, t4 - mulw t4, t6, t3 - addw t1, t0, t4 - max t2, t1, zero + mulw a7, a6, t5 + slli a6, t6, 5 + addw t3, t4, a7 + subw t5, zero, a6 + ld a6, 168(sp) + addw t4, t3, t5 + ld t6, 176(sp) + li t5, 95 + mulw a7, a6, t1 + ld a6, 192(sp) + addw t3, t4, a7 + li t4, 118 + mulw a7, t6, t5 + li t6, 127 + addw t1, t3, a7 + mulw t5, a6, t4 + addw a7, t1, t5 li t1, -106 - min t3, t2, t5 - ld t5, 400(sp) - ld t6, 384(sp) - mulw t4, 
t3, t1 - li t3, -104 - sh3add t1, t5, a4 - addw t0, a3, t4 - mulw a3, t6, a7 - li a7, 75 - addw a4, t1, a3 - li t1, 101 - sd a3, 56(sp) - ld a6, 376(sp) - ld t5, 392(sp) + max t3, a7, zero + li a7, 82 + min t5, t3, t6 + li t3, 101 + ld t6, 408(sp) + mulw a6, t5, t1 + li t5, -104 + sh3add t1, t6, a3 + addw t4, t0, a6 + ld a6, 312(sp) + mulw t0, a6, a7 + addw a3, t1, t0 + sd t0, 64(sp) + ld a7, 392(sp) + ld t6, 400(sp) + ld a6, 384(sp) + mulw t1, a7, t5 + addw t0, a3, t1 + li t1, -116 + mulw a3, t6, t3 + mulw a7, a6, t1 + ld t6, 376(sp) + addw t5, t0, a3 + ld t0, 104(sp) + addw a3, t5, a7 + ld t5, 80(sp) + subw a7, t6, t0 ld t6, 368(sp) - mulw t2, a6, t3 - ld t4, 360(sp) - mulw t3, t5, t1 - addw a3, a4, t2 - li a4, -116 - addw t2, a3, t3 - mulw t5, t6, a4 - ld a3, 96(sp) - addw t3, t2, t5 - ld t2, 72(sp) - subw t5, t4, a3 - subw t4, zero, t2 - addw t6, t3, t5 - li t3, -70 - addw a3, t6, t4 - ld t5, 304(sp) - ld t6, 104(sp) - mulw t2, t5, t3 - addw t4, a3, t2 - ld t2, 136(sp) - mulw a3, t6, a7 - slli t5, t2, 5 - addw t3, t4, a3 - ld t6, 112(sp) - addw t4, t5, t2 - sh1add t2, t6, t6 - sh1add a3, t4, t3 - slliw t5, t2, 5 - li t4, -101 - subw a6, zero, t5 - ld t5, 416(sp) - addw t3, a3, a6 + addw a6, a3, a7 + subw a3, zero, t5 + li t5, -70 + addw t0, a6, a3 + mulw a7, t6, t5 + ld a6, 112(sp) + li t5, 75 + addw a3, t0, a7 + ld t6, 128(sp) + mulw a7, a6, t5 + slli t5, t6, 5 + addw t0, a3, a7 + ld a6, 200(sp) + addw a7, t5, t6 + sh1add t5, a6, a6 + sh1add a3, a7, t0 + slliw t6, t5, 5 + li t5, -101 + subw a7, zero, t6 + ld t6, 424(sp) + addw t0, a3, a7 + ld a6, 352(sp) + mulw a7, t6, t5 + li t6, -114 + addw a3, t0, a7 + li t0, 59 + mulw a7, a6, t6 ld t6, 344(sp) - mulw a3, t5, t4 - ld t5, 336(sp) - li t4, -114 - addw t2, t3, a3 - mulw a6, t6, t4 - li a3, 59 - addw t3, t2, a6 - ld t6, 328(sp) - mulw t4, t5, a3 - sh1add a6, t6, t6 - addw t2, t3, t4 - sh2add t3, a6, t2 - sd a6, 64(sp) - li a6, -95 - ld t4, 320(sp) - ld t5, 120(sp) - sh2add t6, t4, t4 - addw t2, t3, t6 - ld t6, 192(sp) - mulw t4, t5, a6 - ld t5, 144(sp) - addw t3, t2, t4 - mulw a6, t6, a5 - li t4, -93 - ld t6, 184(sp) - addw t2, t3, a6 - slli t3, t6, 4 - mulw a6, t5, t4 - ld t5, 152(sp) - li t4, 79 - addw a5, t2, a6 - subw a6, t3, t6 + addw t5, a3, a7 + ld a7, 336(sp) + mulw a6, t6, t0 + addw a3, t5, a6 + sh1add t5, a7, a7 + sh2add a7, t5, a3 + sd t5, 72(sp) + ld a6, 328(sp) + ld t6, 120(sp) + sh2add t5, a6, a6 + ld a6, 136(sp) + addw a3, a7, t5 + li a7, -95 + mulw t5, t6, a7 + ld t6, 144(sp) + li a7, -93 + addw s0, a3, t5 + mulw a3, a6, a5 + ld a6, 152(sp) + mulw a5, t6, a7 + addw t5, s0, a3 + slli t6, a6, 4 + addw a3, t5, a5 + subw a7, t6, a6 + li t5, 79 + addw a5, a3, a7 ld t6, 160(sp) - addw t2, a5, a6 - sh1add t3, t6, t6 - mulw a6, t5, t4 - ld t5, 168(sp) - addw a5, t2, a6 - ld t6, 128(sp) - addw t2, a5, t3 - mulw t4, t5, a2 - li t5, 127 - slli a2, t6, 5 - addw a5, t2, t4 - subw t2, a2, t6 - slli t3, t2, 2 - subw t4, zero, t3 - addw t6, a5, t4 - max t2, t6, zero - min a5, t2, t5 - ld t5, 400(sp) - sh1add t4, a5, a5 + ld a6, 168(sp) + mulw a7, t6, t5 + ld t6, 176(sp) + addw a3, a5, a7 + sh1add a7, a6, a6 + ld a6, 192(sp) + addw t5, a3, a7 + mulw a3, t6, t2 + li t6, 127 + addw a5, t5, a3 + slli a3, a6, 5 + subw t2, a3, a6 + slli t5, t2, 2 + subw a7, zero, t5 + addw a6, a5, a7 + max t2, a6, zero + min a5, t2, t6 + ld t6, 408(sp) + sh1add a6, a5, a5 li a5, 81 - subw t6, zero, t4 - addw t3, t0, t6 - ld t6, 384(sp) - mulw t0, t5, a5 - ld a6, 376(sp) - slli t4, t6, 4 - addw t5, t4, t6 - li t4, -102 - sh2add t2, t5, t0 - mulw t6, a6, t4 - 
ld t5, 392(sp) - li a6, 121 - li t4, -74 - addw t0, t2, t6 - mulw a7, t5, t4 - ld t6, 368(sp) - addw t2, t0, a7 - ld t4, 360(sp) - mulw a7, t6, a6 - slli t0, t4, 4 - addw t5, t2, a7 - subw t6, t4, t0 - subw t2, t0, t4 - li t0, 55 - addw t4, t5, t6 - sd t6, 48(sp) + subw a7, zero, a6 ld a6, 312(sp) - ld t5, 304(sp) - mulw a7, a6, t0 - ld a6, 352(sp) - addw t6, t4, a7 - li a7, -13 - mulw t4, t5, t1 - addw s0, t6, t4 - mulw t5, a6, a7 - ld t6, 104(sp) - addw t4, s0, t5 - slli t1, t6, 5 - subw a6, t1, t6 - li t1, 114 - slli t5, a6, 1 - subw a7, zero, t5 addw t5, t4, a7 - addw a6, t5, a1 - sd a7, 40(sp) - li a7, 38 - ld t6, 112(sp) - ld t5, 416(sp) - mulw t4, t6, t1 + mulw t2, t6, a5 + ld a7, 392(sp) + slli t6, a6, 4 + addw s0, t6, a6 + ld t6, 400(sp) + li a6, -102 + sh2add t4, s0, t2 + mulw s0, a7, a6 + li a7, -74 + addw t2, t4, s0 + mulw a6, t6, a7 + li t6, 121 + addw t4, t2, a6 + ld a6, 384(sp) + mulw t2, a6, t6 + ld t6, 376(sp) + addw a7, t4, t2 + slli t2, t6, 4 + subw s0, t6, t2 + subw t4, t2, t6 + addw a6, a7, s0 + li t2, 55 + sd s0, 56(sp) + ld a7, 320(sp) + ld t6, 368(sp) + mulw s0, a7, t2 + ld a7, 360(sp) + addw s1, a6, s0 + mulw a6, t6, t3 + li t3, -13 + addw s0, s1, a6 + ld a6, 112(sp) + mulw s1, a7, t3 + slli t3, a6, 5 + addw t6, s0, s1 + subw s0, t3, a6 + li t3, 114 + slli a7, s0, 1 + subw a6, zero, a7 + addw s0, t6, a6 + addw a7, s0, a4 + sd a6, 48(sp) + li s0, 38 + ld a6, 200(sp) + ld t6, 424(sp) + mulw s1, a6, t3 + ld a6, 352(sp) + addw a4, a7, s1 + mulw s1, t6, s0 + li t6, -21 + addw a7, a4, s1 + mulw s0, a6, t6 ld t6, 344(sp) - addw a1, a6, t4 - mulw a6, t5, a7 - ld t5, 336(sp) - addw t4, a1, a6 - li a6, -21 - mulw a7, t6, a6 - slli t6, t5, 3 - addw a1, t4, a7 - subw t4, t6, t5 - ld t6, 328(sp) - slli a7, t4, 4 - ld t4, 320(sp) - addw a6, a1, a7 - slli t5, t4, 3 - mulw a7, t6, t1 - subw t1, t5, t4 - addw a1, a6, a7 - ld t5, 120(sp) - slli a7, t1, 4 - sh2add t6, t5, t5 - addw a6, a1, a7 - slliw a1, t6, 1 - ld t6, 192(sp) - subw a7, zero, a1 - slli t5, t6, 4 - li a1, -50 - addw t4, a6, a7 - subw a7, zero, t5 - ld t5, 144(sp) - addw a6, t4, a7 - ld t6, 184(sp) - mulw a7, t5, a1 - slli t5, t6, 3 - addw t4, a6, a7 - subw a7, t5, t6 - ld t5, 152(sp) - slli s0, a7, 4 + addw a4, a7, s0 + slli a6, t6, 3 + subw a7, a6, t6 + slli s1, a7, 4 + ld a7, 336(sp) + addw t6, a4, s1 + ld a6, 328(sp) + slli s1, t4, 1 + mulw s0, a7, t3 + addw a4, t6, s0 + slli t6, a6, 3 + subw t3, t6, a6 + ld t6, 120(sp) + slli s0, t3, 4 + addw a7, a4, s0 + sh2add a4, t6, t6 + slliw a6, a4, 1 + subw t6, zero, a6 + ld a6, 136(sp) + addw a4, a7, t6 + slli t6, a6, 4 + li a6, -50 + subw s0, zero, t6 + ld t6, 144(sp) + addw a7, a4, s0 + mulw s0, t6, a6 + ld a6, 152(sp) + addw a4, a7, s0 + slli t6, a6, 3 + subw a7, t6, a6 ld t6, 160(sp) + slli s0, a7, 4 subw a6, zero, s0 - addw a7, t4, a6 - mulw t4, t5, a4 - ld t5, 168(sp) - li a4, -54 - addw a6, a7, t4 - mulw a7, t6, a4 - ld t6, 128(sp) - addw t4, a6, a7 - li a7, 82 - mulw a6, t5, a7 - sh3add t5, t6, t6 - addw a4, t4, a6 - slliw t4, t5, 3 - li t5, 127 - subw a6, zero, t4 addw a7, a4, a6 - max t6, a7, zero - li a7, -77 - min a6, t6, t5 - ld t5, 400(sp) - slliw a4, a6, 5 - ld t6, 384(sp) - slli a6, t5, 4 - addw t4, t3, a4 - subw a4, a6, t5 - ld a6, 376(sp) - mulw t5, t6, a7 - addw t3, a4, t5 + mulw a6, t6, t1 + li t6, -54 + addw a4, a7, a6 + ld a6, 168(sp) + mulw a7, a6, t6 + ld t6, 176(sp) + addw t1, a4, a7 + ld a6, 192(sp) + li a7, 82 + mulw s0, t6, a7 + sh3add t6, a6, a6 + addw a4, t1, s0 + slliw t1, t6, 3 + li t6, 127 + subw a7, zero, t1 + addw s0, a4, a7 + max a6, 
s0, zero + li s0, -77 + min a7, a6, t6 + ld t6, 408(sp) + slliw a4, a7, 5 + ld a6, 312(sp) + slli a7, t6, 4 + addw t1, t5, a4 + subw a4, a7, t6 + ld a7, 392(sp) + mulw t6, a6, s0 + addw t5, a4, t6 li a4, -90 - slli t5, a6, 5 - addw t6, t5, a6 - ld t5, 392(sp) - sh1add a7, t6, t3 + slli t6, a7, 5 + addw a6, t6, a7 + ld t6, 400(sp) + sh1add s0, a6, t5 + ld a6, 384(sp) + mulw t5, t6, a4 + sh1add t6, a6, a6 + addw a7, s0, t5 + slliw s0, t6, 1 + slli t5, a6, 1 + subw a6, zero, s0 + subw s0, zero, s1 + addw t6, a7, a6 + ld a7, 320(sp) + addw a6, t6, s0 + slli t6, a7, 3 + subw s0, zero, t6 ld t6, 368(sp) - mulw t3, t5, a4 - sh1add t5, t6, t6 - addw a6, a7, t3 - slliw a7, t5, 1 - slli t3, t6, 1 - subw t6, zero, a7 - slli a7, t2, 1 - addw t5, a6, t6 - subw s0, zero, a7 - ld a6, 312(sp) - addw t6, t5, s0 - slli a7, a6, 3 - ld t5, 304(sp) - subw s0, zero, a7 + addw t4, a6, s0 + ld a7, 360(sp) + mulw a6, t6, a5 + li t6, -110 + addw s0, t4, a6 + ld a6, 112(sp) + sh1add a5, a7, s0 + mulw a7, a6, t6 + ld t6, 128(sp) + addw t4, a5, a7 + ld a6, 200(sp) + li a7, -95 + mulw s0, t6, a7 + ld t6, 424(sp) + mulw a7, a6, t0 + addw a5, t4, s0 + li a6, 52 + addw t4, a5, a7 + mulw t0, t6, a6 ld a6, 352(sp) - addw t2, t6, s0 - mulw t6, t5, a5 - addw a7, t2, t6 - ld t6, 104(sp) - sh1add t5, a6, a7 - ld t2, 136(sp) - li a7, -110 - mulw a6, t6, a7 - ld t6, 112(sp) - addw a5, t5, a6 - li a6, -95 - mulw t5, t2, a6 - mulw t2, t6, a3 - addw a7, a5, t5 - li t6, 52 - addw a5, a7, t2 - ld t5, 416(sp) - li a7, -13 - mulw a6, t5, t6 + addw a5, t4, t0 ld t6, 344(sp) - addw a3, a5, a6 - ld t5, 336(sp) + slli a7, a6, 4 + subw t4, a7, a6 + ld a7, 336(sp) + addw t0, a5, t4 + mulw t4, t6, t2 + ld t6, 120(sp) + slli t2, a7, 5 + addw a5, t0, t4 + addw t4, t2, a7 + li a7, -95 + subw t0, zero, t4 + li t4, 58 + addw a6, a5, t0 + mulw a5, t6, t4 + sh1add t2, t3, a6 + li t3, 67 + addw t0, t2, a5 + ld a6, 136(sp) + li t2, 86 + ld t6, 144(sp) + mulw t4, a6, t3 + li t3, -79 + addw a5, t0, t4 + ld a6, 152(sp) + mulw t4, t6, t2 + ld t6, 160(sp) + addw t0, a5, t4 + sh1add t2, t6, t6 + mulw t4, a6, t3 + li t3, -13 + addw a5, t0, t4 + ld a6, 168(sp) + slliw t4, t2, 4 + ld t6, 176(sp) + addw t0, a5, t4 slli t2, t6, 4 - subw a6, t2, t6 - ld t6, 328(sp) - addw a5, a3, a6 - mulw t2, t5, t0 - ld t5, 120(sp) - slli t0, t6, 5 - addw a3, a5, t2 - addw t2, t0, t6 - ld t6, 192(sp) - subw a5, zero, t2 - li t2, 58 - addw a6, a3, a5 - mulw a3, t5, t2 - sh1add t0, t1, a6 - ld t5, 144(sp) - li t2, 67 - li a6, -95 - addw a5, t0, a3 - mulw t1, t6, t2 - li t0, 86 + mulw t4, a6, t3 + ld a6, 192(sp) + subw t3, t6, t2 + addw a5, t0, t4 + li t6, 127 + addw t2, a3, a6 + addw t0, a5, t3 + sh1add t3, t2, t0 + max a5, t3, zero + ld t3, 88(sp) + min a3, a5, t6 + ld t0, 64(sp) + ld t6, 408(sp) + mulw t4, a3, a7 + ld a7, 392(sp) + addw a3, t3, t0 + addw t2, t1, t4 + li t3, 67 + subw t4, zero, t5 + slli t1, t6, 5 + addw t0, t1, t6 + ld t6, 400(sp) + mulw t1, a7, t3 + addw a5, a3, t0 + slli t0, t6, 4 addw a3, a5, t1 - ld t6, 184(sp) - mulw t2, t5, t0 - li t1, -79 - addw a5, a3, t2 - ld t5, 152(sp) - mulw t2, t6, t1 - sh1add t0, t5, t5 - ld t6, 160(sp) - addw a3, a5, t2 - slliw t1, t0, 4 - ld t5, 168(sp) - addw a5, a3, t1 - slli t0, t5, 4 - mulw t1, t6, a7 + subw t3, t0, t6 + ld t6, 376(sp) + sh1add t1, t3, a3 + ld t0, 104(sp) + addw a5, t1, t4 + addw t3, t0, t6 + ld t1, 432(sp) + addw t4, a5, t3 + ld t6, 368(sp) + li t3, -13 + sh3add a3, t1, t4 + ld a7, 360(sp) + ld a6, 112(sp) + sh3add t0, a7, a7 + mulw a5, t6, t3 + li a7, 107 ld t6, 128(sp) - addw a3, a5, t1 - ld t2, 80(sp) - 
subw t1, t5, t0 - li t5, 127 - addw t0, a2, t6 - addw a5, a3, t1 - sh1add t1, t0, a5 - max a3, t1, zero - min a2, a3, t5 - ld a3, 56(sp) - ld t5, 400(sp) - mulw t0, a2, a6 - ld a6, 376(sp) - addw a2, t2, a3 - addw a5, t4, t0 - li t2, 67 - slli t0, t5, 5 - addw t1, t0, t5 - ld t5, 392(sp) - mulw t0, a6, t2 - addw a3, a2, t1 - subw t2, zero, t3 - ld t4, 360(sp) - slli t1, t5, 4 - addw a2, a3, t0 - subw a3, t1, t5 - sh1add t0, a3, a2 - ld a3, 96(sp) - addw t1, t0, t2 - addw t3, a3, t4 - ld t0, 424(sp) - addw t2, t1, t3 - ld t5, 304(sp) - sh3add a2, t0, t2 - ld a6, 352(sp) - ld t6, 104(sp) - sh3add t0, a6, a6 - mulw a3, t5, a7 - ld t2, 136(sp) - sh2add t3, t6, t6 - li a7, 82 - addw t1, a2, a3 - ld t6, 112(sp) - sh1add a3, t0, t1 - ld t5, 416(sp) + sh2add t3, a6, a6 + addw t1, a3, a5 + ld a6, 200(sp) + sh1add a5, t0, t1 li t1, 104 - addw a2, a3, t3 - mulw t4, t2, t1 + addw a3, a5, t3 + mulw t4, t6, t1 li t3, -119 - addw a3, a2, t4 - li t4, 107 - mulw a2, t6, t3 + ld t6, 424(sp) + addw a5, a3, t4 + slli t4, t6, 3 + mulw a3, a6, t3 + ld a6, 352(sp) + addw t1, a5, a3 + li a5, 71 + subw a3, t6, t4 ld t6, 344(sp) - slli t3, t5, 3 - addw t1, a3, a2 - li a3, 71 - subw a2, t5, t3 - ld t5, 336(sp) - mulw t3, t6, a3 - addw t2, t1, a2 - ld a6, 64(sp) - addw t1, t2, t3 - mulw t6, t5, t4 - ld t4, 320(sp) - addw t3, t1, t6 - ld t5, 120(sp) - sh3add t2, a6, t3 - sh1add t3, t5, t5 - mulw t6, t4, a7 - slliw a6, t3, 5 - addw t1, t2, t6 - li t3, -104 - subw t4, zero, a6 - ld t6, 192(sp) - li a6, -121 - addw t2, t1, t4 - ld t5, 144(sp) - mulw t4, t6, t3 - addw t1, t2, t4 - mulw t3, t5, a6 - li t4, 97 - ld t5, 152(sp) - addw t2, t1, t3 - ld t6, 160(sp) - mulw t1, t5, t4 + addw t3, t1, a3 + mulw t4, a6, a5 + ld t5, 72(sp) + addw t1, t3, t4 + mulw a6, t6, a7 + li a7, 82 + addw t4, t1, a6 + ld a6, 328(sp) + sh3add t3, t5, t4 + ld t6, 120(sp) + sh1add t5, t6, t6 + mulw t4, a6, a7 + ld a6, 136(sp) + slliw a7, t5, 5 + addw t1, t3, t4 + li t5, -104 + ld t6, 144(sp) + subw t4, zero, a7 + li a7, 97 + addw t3, t1, t4 + mulw t4, a6, t5 + li t5, -121 + addw t1, t3, t4 + mulw t3, t6, t5 li t5, 83 - addw t3, t2, t1 - mulw t4, t6, t5 + addw t4, t1, t3 + ld t6, 160(sp) + ld a6, 168(sp) + mulw t1, t6, a7 + ld t6, 176(sp) + mulw a7, a6, t5 + addw t3, t4, t1 + li t5, -84 + ld a6, 192(sp) li t1, 46 - ld t5, 168(sp) - addw t2, t3, t4 - ld t6, 128(sp) - li t4, -84 - mulw a6, t5, t1 - li t5, 127 - mulw t1, t6, t4 - addw t3, t2, a6 - addw a6, t3, t1 - max t2, a6, zero - min t4, t2, t5 - ld t2, 88(sp) - ld t6, 48(sp) - mulw t3, t4, a1 - ld a7, 40(sp) - addw a1, t2, t6 - addw t1, a5, t3 - ld t5, 400(sp) - li t2, -29 - addw a5, a7, a1 - ld t6, 384(sp) - li a7, 38 - mulw t3, t5, t2 - ld t5, 392(sp) - slli t2, t6, 3 - addw a1, a5, t3 - subw t3, t2, t6 - ld t6, 368(sp) - addw a5, a1, t3 + addw t4, t3, a7 + mulw a7, t6, t1 + li t6, 127 + addw t3, t4, a7 + mulw t4, a6, t5 + li a6, -50 + addw a7, t3, t4 + ld t3, 96(sp) + max t1, a7, zero + ld s0, 56(sp) + min t4, t1, t6 + mulw t5, t4, a6 + ld a6, 48(sp) + addw t1, t2, t5 + ld t6, 408(sp) + addw t2, t3, s0 + li s0, 38 + li t3, -29 + addw t4, a6, t2 + mulw a7, t6, t3 ld a6, 312(sp) - mulw t3, t5, a7 - li a7, -77 - addw a1, a5, t3 - mulw t2, t6, a4 - ld t5, 304(sp) - slli t3, a6, 5 - addw a5, a1, t2 - li a1, 37 - subw t2, zero, t3 - mulw t3, t5, a1 - addw a4, a5, t2 - ld t2, 136(sp) - addw t4, a4, t3 - ld t6, 112(sp) - li t3, -125 - sh2add a5, t0, t4 - ld t5, 416(sp) - li t0, -46 - mulw t4, t2, t3 - li t3, -70 - mulw t2, t6, t0 - addw a4, a5, t4 + addw t2, t4, a7 + slli t5, a6, 3 + ld t6, 400(sp) + 
subw t4, t5, a6 + ld a6, 384(sp) + addw t3, t2, t4 + ld a7, 320(sp) + mulw t2, t6, s0 + slli t5, a7, 5 + li s0, -77 + mulw t6, a6, a4 + addw t4, t3, t2 + li a4, 37 + addw t2, t4, t6 + ld t6, 368(sp) + subw t4, zero, t5 + addw t3, t2, t4 + mulw t4, t6, a4 + ld t6, 128(sp) + addw t5, t3, t4 + ld a6, 200(sp) + li t4, -125 + sh2add t2, t0, t5 + li t5, -46 + mulw t0, t6, t4 + ld t6, 424(sp) + addw t3, t2, t0 + mulw t2, a6, t5 + ld a6, 352(sp) + li t5, -70 + addw t0, t3, t2 + mulw t4, t6, t5 ld t6, 344(sp) - addw a5, a4, t2 - li t2, -73 - mulw a4, t5, t3 - ld t5, 336(sp) - addw t0, a5, a4 - mulw t3, t6, a1 - addw a4, t0, t3 - mulw t4, t5, t2 - ld t0, 408(sp) - li t2, -87 - addw a5, a4, t4 - subw t3, zero, t0 - ld t4, 320(sp) - addw a4, a5, t3 - ld t5, 120(sp) + addw t2, t0, t4 + mulw t3, a6, a4 + li t4, -73 + addw t0, t2, t3 + mulw a6, t6, t4 + ld t3, 416(sp) + li t4, -87 + addw t2, t0, a6 + subw t5, zero, t3 + ld a6, 328(sp) li t3, -75 - ld t6, 192(sp) - mulw t0, t4, t2 - li t4, -114 - addw a5, a4, t0 - mulw t2, t6, a3 - mulw a4, t5, t3 - ld t5, 144(sp) - addw t0, a5, a4 - ld t6, 184(sp) - li a5, 53 - addw a4, t0, t2 - mulw t0, t5, a7 - mulw t2, t6, a5 - li a7, -13 - ld t5, 152(sp) - addw a3, a4, t0 + addw t0, t2, t5 + ld t6, 120(sp) + mulw t5, a6, t4 + ld a6, 136(sp) + addw t2, t0, t5 + li t5, 75 + mulw t0, t6, t3 + ld t6, 144(sp) + addw t4, t2, t0 + mulw t3, a6, a5 + li t2, 53 + ld a6, 152(sp) + addw t0, t4, t3 + mulw t3, t6, s0 ld t6, 160(sp) - addw a4, a3, t2 - li t2, 67 - mulw t0, t5, a1 - mulw a5, t6, a0 - ld t5, 168(sp) - addw a3, a4, t0 - ld t6, 128(sp) - addw a1, a3, a5 - mulw a3, t5, a7 - li a7, -77 - li t5, 127 - addw a0, a1, a3 - mulw a1, t6, t4 - li t4, -23 - addw a4, a0, a1 - max a3, a4, zero - min a1, a3, t5 - li a3, 42 - ld t5, 400(sp) - mulw a4, a1, t4 - ld t6, 384(sp) - addw a0, t1, a4 - ld a6, 376(sp) - li t1, -123 - mulw a4, t5, t2 - mulw a5, t6, a3 - ld t5, 392(sp) - addw a1, a2, a4 - ld t6, 368(sp) - li a4, 41 - addw a2, a1, a5 - mulw a3, a6, a4 - ld t4, 360(sp) - li a4, -92 - addw a1, a2, a3 + addw a5, t0, t3 + mulw t4, a6, t2 + ld a6, 168(sp) + addw t0, a5, t4 + mulw t3, t6, a4 + ld t6, 176(sp) + addw a5, t0, t3 + mulw t2, a6, a2 + li t3, -13 + ld a6, 192(sp) + addw a4, a5, t2 + mulw a5, t6, t3 + li t3, 67 + li t6, -114 + addw a2, a4, a5 + mulw a4, a6, t6 + li a6, -23 + li t6, 127 + addw t0, a2, a4 + max a5, t0, zero + min a4, a5, t6 + li a5, 42 + ld t6, 408(sp) + mulw t0, a4, a6 ld a6, 312(sp) - mulw a3, t5, t1 + addw a2, t1, t0 + ld a7, 392(sp) + mulw t0, t6, t3 + ld t6, 400(sp) + addw a4, a3, t0 + mulw t1, a6, a5 + li t0, 41 + ld a6, 384(sp) + addw a3, a4, t1 + mulw a5, a7, t0 li t1, 47 - addw a2, a1, a3 - ld t5, 304(sp) - mulw a1, t6, a4 - sh2add a4, t4, t4 - addw a3, a2, a1 - li t4, -74 - sh1add a1, a4, a3 - mulw a3, a6, a7 - ld a6, 352(sp) - li a7, 75 - addw a2, a1, a3 - sh1add a5, a6, a6 - ld t6, 104(sp) - mulw a3, t5, a7 - li a6, -121 - slliw a4, a5, 5 - ld t2, 136(sp) - li a7, 38 - addw a1, a2, a3 - li a3, -51 - addw a2, a1, a4 - mulw a5, t6, a3 + mulw t0, t6, a1 + addw a4, a3, a5 + ld t6, 376(sp) + li a5, -92 + addw a3, a4, t0 + mulw a1, a6, a5 + ld a7, 320(sp) + sh2add a5, t6, t6 + addw a4, a3, a1 + ld t6, 368(sp) + sh1add a1, a5, a4 + li a5, -51 + mulw a4, a7, s0 + li s0, 38 + addw a3, a1, a4 + ld a7, 360(sp) + mulw a4, t6, t5 + ld a6, 112(sp) + sh1add t0, a7, a7 + li t5, 29 + addw a1, a3, a4 + ld t6, 128(sp) + li a7, -74 + slliw a4, t0, 5 + mulw t0, a6, a5 + addw a3, a1, a4 + ld a6, 200(sp) li a4, 109 - ld t6, 112(sp) - addw a1, a2, a5 - mulw a2, t2, a4 - li 
t2, 67 + addw a1, a3, t0 + mulw a5, t6, a4 li a4, -122 - addw a3, a1, a2 - mulw a1, t6, t4 + addw a3, a1, a5 + mulw a5, a6, a7 + ld a6, 352(sp) + addw a1, a3, a5 ld t6, 344(sp) - addw a2, a3, a1 - ld t5, 336(sp) - mulw a3, t6, a4 - ld t6, 328(sp) + ld a7, 336(sp) + mulw a5, a6, a4 + ld a6, 328(sp) + mulw a4, t6, t3 + addw a3, a1, a5 + ld t6, 120(sp) + mulw a5, a7, t1 + addw a1, a3, a4 li a4, 22 - addw a1, a2, a3 - ld t4, 320(sp) - mulw a3, t5, t2 - ld t5, 120(sp) - addw a2, a1, a3 - mulw a1, t6, t1 - mulw a5, t4, a4 - ld t6, 192(sp) - li t1, 46 - addw a3, a2, a1 - li t4, 29 - slli a2, t5, 4 + addw a3, a1, a5 + mulw a5, a6, a4 + ld a6, 136(sp) + slli a4, t6, 4 addw a1, a3, a5 - addw t0, a2, t5 - ld t5, 144(sp) - slli a5, t0, 2 - subw a4, zero, a5 - addw a3, a1, a4 - mulw a4, t5, t4 - mulw a1, t6, a7 - ld t6, 184(sp) - addw a2, a3, a1 - ld t5, 152(sp) - li a3, 115 - addw a1, a2, a4 - mulw a4, t6, a3 + addw t1, a4, t6 + ld t6, 144(sp) + slli t0, t1, 2 + li t1, 46 + subw a5, zero, t0 + addw a3, a1, a5 + li a5, 115 + mulw a1, a6, s0 + ld a6, 152(sp) + addw a4, a3, a1 + mulw a3, t6, t5 ld t6, 160(sp) - addw a2, a1, a4 - sh3add a3, t6, t6 - mulw a4, t5, a6 - ld t5, 168(sp) - addw a1, a2, a4 - ld t6, 128(sp) - li a4, -49 - sh2add a2, a3, a1 - mulw a5, t5, a4 - li a1, 85 - li t5, 127 - addw a3, a2, a5 - mulw a4, t6, a1 + li t5, -121 + addw a1, a4, a3 + mulw a4, a6, a5 + li a5, -49 + addw a3, a1, a4 + ld a6, 168(sp) + mulw a1, t6, t5 + sh3add t0, a6, a6 + ld t6, 176(sp) + addw a4, a3, a1 + ld a6, 192(sp) + sh2add a1, t0, a4 + mulw t0, t6, a5 + li t6, 127 + addw a3, a1, t0 + mulw a4, a6, a0 addw a5, a3, a4 - max a2, a5, zero - min a1, a2, t5 - mulw a3, a1, t1 - addw a2, a0, a3 - ble a2, zero, label15 + max a1, a5, zero + min a0, a1, t6 + mulw a3, a0, t1 + addw a1, a2, a3 + ble a1, zero, label15 li a0, 99 jal putch li a0, 97 @@ -1087,13 +1086,13 @@ label12: jal putch li a0, 10 jal putch - ld a0, 176(sp) + ld a0, 184(sp) addiw a0, a0, -1 ble a0, zero, label17 .p2align 2 label615: - sd a0, 176(sp) - addi s0, sp, 200 + sd a0, 184(sp) + addi s0, sp, 208 mv s1, zero mv s3, zero mv s2, s0 @@ -1108,7 +1107,7 @@ label15: jal putch li a0, 10 jal putch - ld a0, 176(sp) + ld a0, 184(sp) addiw a0, a0, -1 bgt a0, zero, label615 label17: @@ -1118,5 +1117,5 @@ label17: ld s1, 16(sp) ld s2, 24(sp) ld s3, 32(sp) - addi sp, sp, 432 + addi sp, sp, 440 ret diff --git a/tests/SysY2022/functional/74_kmp.arm.s b/tests/SysY2022/functional/74_kmp.arm.s index c7e79faf6..a52b8c4ce 100644 --- a/tests/SysY2022/functional/74_kmp.arm.s +++ b/tests/SysY2022/functional/74_kmp.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 next: .zero 16384 -.align 8 +.p2align 3 src: .zero 16384 -.align 8 +.p2align 3 dst: .zero 16384 .text diff --git a/tests/SysY2022/functional/74_kmp.riscv.s b/tests/SysY2022/functional/74_kmp.riscv.s index c2bb6894c..7515cf98e 100644 --- a/tests/SysY2022/functional/74_kmp.riscv.s +++ b/tests/SysY2022/functional/74_kmp.riscv.s @@ -1,13 +1,13 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 next: .zero 16384 -.align 8 +.p2align 3 src: .zero 16384 -.align 8 +.p2align 3 dst: .zero 16384 .text diff --git a/tests/SysY2022/functional/75_max_flow.arm.s b/tests/SysY2022/functional/75_max_flow.arm.s index 50acba887..41bffb5f4 100644 --- a/tests/SysY2022/functional/75_max_flow.arm.s +++ b/tests/SysY2022/functional/75_max_flow.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 to: .zero 400 -.align 8 +.p2align 3 
cap: .zero 400 -.align 8 +.p2align 3 rev: .zero 400 .text diff --git a/tests/SysY2022/functional/75_max_flow.riscv.s b/tests/SysY2022/functional/75_max_flow.riscv.s index 7964e7d03..29f8e7681 100644 --- a/tests/SysY2022/functional/75_max_flow.riscv.s +++ b/tests/SysY2022/functional/75_max_flow.riscv.s @@ -1,13 +1,13 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 to: .zero 400 -.align 8 +.p2align 3 cap: .zero 400 -.align 8 +.p2align 3 rev: .zero 400 .text diff --git a/tests/SysY2022/functional/76_n_queens.arm.s b/tests/SysY2022/functional/76_n_queens.arm.s index 7f9fd7399..339b4978a 100644 --- a/tests/SysY2022/functional/76_n_queens.arm.s +++ b/tests/SysY2022/functional/76_n_queens.arm.s @@ -1,16 +1,16 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 ans: .zero 200 -.align 8 +.p2align 3 row: .zero 200 -.align 8 +.p2align 3 line1: .zero 200 -.align 8 +.p2align 3 line2: .zero 400 .text diff --git a/tests/SysY2022/functional/76_n_queens.riscv.s b/tests/SysY2022/functional/76_n_queens.riscv.s index b9e781aee..512585e62 100644 --- a/tests/SysY2022/functional/76_n_queens.riscv.s +++ b/tests/SysY2022/functional/76_n_queens.riscv.s @@ -1,16 +1,16 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 ans: .zero 200 -.align 8 +.p2align 3 row: .zero 200 -.align 8 +.p2align 3 line1: .zero 200 -.align 8 +.p2align 3 line2: .zero 400 .text diff --git a/tests/SysY2022/functional/77_substr.arm.s b/tests/SysY2022/functional/77_substr.arm.s index 1303fbe0e..d68c3a59d 100644 --- a/tests/SysY2022/functional/77_substr.arm.s +++ b/tests/SysY2022/functional/77_substr.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 p: .zero 1024 .text diff --git a/tests/SysY2022/functional/77_substr.riscv.s b/tests/SysY2022/functional/77_substr.riscv.s index 263880019..e169c0b8e 100644 --- a/tests/SysY2022/functional/77_substr.riscv.s +++ b/tests/SysY2022/functional/77_substr.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 p: .zero 1024 .text @@ -12,11 +12,10 @@ main: addi sp, sp, -152 li a1, 7 li a2, 1 - li t0, 9 - sd ra, 0(sp) slli a3, a1, 32 - sd s1, 8(sp) + sd ra, 0(sp) ori a0, a3, 8 + sd s1, 8(sp) addi s1, sp, 96 sd s2, 16(sp) addi s2, sp, 32 @@ -36,20 +35,21 @@ main: sd a4, 64(sp) ori a4, a3, 3 sd a5, 72(sp) - slli a3, t0, 32 + li a5, 9 sd a4, 80(sp) - addi a5, a3, 3 - ori a4, a0, 7 - li a3, 2 + slli a3, a5, 32 sw zero, 88(sp) - sd a5, 96(sp) - slli a5, a3, 32 - sd a4, 104(sp) + ori a5, a0, 7 + addi a4, a3, 3 + li a3, 2 + sd a4, 96(sp) + slli a4, a3, 32 + sd a5, 104(sp) ori a3, a1, 4 - addi a4, a5, 4 + addi a5, a4, 4 li a1, 5 - sd a4, 112(sp) ori a4, a2, 6 + sd a5, 112(sp) sd a3, 120(sp) sd a4, 128(sp) sd a0, 136(sp) @@ -58,240 +58,242 @@ main: jal putint li a0, 10 jal putch - mv a3, zero -pcrel421: - auipc a1, %pcrel_hi(p) - addi a0, a1, %pcrel_lo(pcrel421) - li a1, 13 - mv a2, a0 + mv a2, zero +pcrel420: + auipc a3, %pcrel_hi(p) + addi a0, a3, %pcrel_lo(pcrel420) + mv a1, a0 .p2align 2 label2: - sd zero, 0(a2) - addiw a3, a3, 64 - li a4, 256 - sd zero, 8(a2) - sd zero, 16(a2) - sd zero, 24(a2) - sd zero, 32(a2) - sd zero, 40(a2) - sd zero, 48(a2) - sd zero, 56(a2) - sd zero, 64(a2) - sd zero, 72(a2) - sd zero, 80(a2) - sd zero, 88(a2) - sd zero, 96(a2) - sd zero, 104(a2) - sd zero, 112(a2) - sd zero, 120(a2) - sd zero, 128(a2) - sd zero, 136(a2) - sd zero, 
144(a2) - sd zero, 152(a2) - sd zero, 160(a2) - sd zero, 168(a2) - sd zero, 176(a2) - sd zero, 184(a2) - sd zero, 192(a2) - sd zero, 200(a2) - sd zero, 208(a2) - sd zero, 216(a2) - sd zero, 224(a2) - sd zero, 232(a2) - sd zero, 240(a2) - sd zero, 248(a2) - bge a3, a4, label132 - addi a2, a2, 256 + sd zero, 0(a1) + addiw a2, a2, 64 + li a3, 256 + sd zero, 8(a1) + sd zero, 16(a1) + sd zero, 24(a1) + sd zero, 32(a1) + sd zero, 40(a1) + sd zero, 48(a1) + sd zero, 56(a1) + sd zero, 64(a1) + sd zero, 72(a1) + sd zero, 80(a1) + sd zero, 88(a1) + sd zero, 96(a1) + sd zero, 104(a1) + sd zero, 112(a1) + sd zero, 120(a1) + sd zero, 128(a1) + sd zero, 136(a1) + sd zero, 144(a1) + sd zero, 152(a1) + sd zero, 160(a1) + sd zero, 168(a1) + sd zero, 176(a1) + sd zero, 184(a1) + sd zero, 192(a1) + sd zero, 200(a1) + sd zero, 208(a1) + sd zero, 216(a1) + sd zero, 224(a1) + sd zero, 232(a1) + sd zero, 240(a1) + sd zero, 248(a1) + bge a2, a3, label132 + addi a1, a1, 256 j label2 label132: - addi a2, s2, 4 - mv a3, s0 - slli t1, s0, 6 - addi a4, s1, 4 - add a5, a0, t1 - mv t1, s0 - addi t0, a5, -64 - lw t2, -4(a2) + addi a1, s2, 4 + mv a2, s0 + slli t0, s0, 6 + addi a3, s1, 4 + add a4, a0, t0 + mv t0, s0 + addi a5, a4, -64 + lw t1, -4(a1) + sh2add t2, s0, a4 sh2add t3, s0, a5 - sh2add t4, s0, t0 - lw t5, -4(a4) - beq t2, t5, label13 - sh2add a7, s0, t0 - sh2add t5, s0, a5 - lw t6, 0(a7) - lw a6, -4(t5) - max a7, t6, a6 - sw a7, 0(t5) - lw t5, 0(a4) - beq t2, t5, label15 - lw t5, 4(t4) - sh2add t6, s0, a5 - lw a7, 0(t6) - max a6, t5, a7 - sw a6, 4(t3) - lw t5, 4(a4) - beq t2, t5, label17 - j label393 + lw t4, -4(a3) + beq t1, t4, label13 + sh2add a6, s0, a5 + sh2add t4, s0, a4 + lw t5, 0(a6) + lw t6, -4(t4) + max a6, t5, t6 + sw a6, 0(t4) + lw t4, 0(a3) + beq t1, t4, label15 + lw t4, 4(t3) + sh2add t6, s0, a4 + lw t5, 0(t6) + max a7, t4, t5 + sw a7, 4(t2) + lw t4, 4(a3) + beq t1, t4, label17 + j label392 .p2align 2 -label386: - lw a5, 12(a4) - bne t2, a5, label396 +label385: + lw a4, 12(a3) + bne t1, a4, label395 .p2align 2 label23: - lw a4, 12(t4) - addiw a3, a3, 1 - addi a5, a4, 1 - li a4, 16 - sw a5, 16(t3) - bge a3, a4, label26 + lw a3, 12(t3) + addiw a2, a2, 1 + addi a4, a3, 1 + li a3, 16 + sw a4, 16(t2) + bge a2, a3, label26 .p2align 2 label27: - addi a2, a2, 4 - slli t1, a3, 6 - addi a4, s1, 4 - add a5, a0, t1 - lw t2, -4(a2) - mv t1, s0 + addi a1, a1, 4 + slli t0, a2, 6 + addi a3, s1, 4 + add a4, a0, t0 + lw t1, -4(a1) + sh2add t2, s0, a4 + mv t0, s0 + addi a5, a4, -64 + lw t4, -4(a3) sh2add t3, s0, a5 - addi t0, a5, -64 - lw t5, -4(a4) - sh2add t4, s0, t0 - bne t2, t5, label398 + bne t1, t4, label397 .p2align 2 label13: - lw t5, -4(t4) - sh2add a6, t1, a5 - addi t6, t5, 1 - sw t6, 0(a6) - lw t5, 0(a4) - beq t2, t5, label15 - lw t5, 4(t4) - sh2add t6, t1, a5 - lw a7, 0(t6) - max a6, t5, a7 - sw a6, 4(t3) - lw t5, 4(a4) - bne t2, t5, label393 + lw t4, -4(t3) + sh2add t6, t0, a4 + addi t5, t4, 1 + sw t5, 0(t6) + lw t4, 0(a3) + beq t1, t4, label15 + lw t4, 4(t3) + lw t5, 0(t6) + max a7, t4, t5 + sw a7, 4(t2) + lw t4, 4(a3) + bne t1, t4, label392 .p2align 2 label17: - lw t5, 4(t4) - addi t6, t5, 1 - sw t6, 8(t3) - lw t5, 8(a4) - beq t2, t5, label19 - lw t5, 12(t4) - addiw t1, t1, 4 - lw t6, 8(t3) - max a6, t5, t6 - sw a6, 12(t3) - blt t1, a1, label28 - lw a5, 12(a4) - beq t2, a5, label23 -label403: - lw a4, 16(t4) - addiw a3, a3, 1 - lw a5, 12(t3) - max t0, a4, a5 - li a4, 16 - sw t0, 16(t3) - blt a3, a4, label27 + lw t4, 4(t3) + addi t5, t4, 1 + sw t5, 8(t2) + lw t4, 8(a3) + beq t1, t4, label19 + lw t4, 
12(t3) + addiw t0, t0, 4 + lw t6, 8(t2) + max t5, t4, t6 + li t4, 13 + sw t5, 12(t2) + blt t0, t4, label28 + lw a4, 12(a3) + beq t1, a4, label23 +label402: + lw a3, 16(t3) + addiw a2, a2, 1 + lw a4, 12(t2) + max a5, a3, a4 + li a3, 16 + sw a5, 16(t2) + blt a2, a3, label27 j label26 .p2align 2 -label398: - sh2add a7, t1, t0 - sh2add t5, t1, a5 - lw t6, 0(a7) - lw a6, -4(t5) - max a7, t6, a6 - sw a7, 0(t5) - lw t5, 0(a4) - beq t2, t5, label15 - lw t5, 4(t4) - sh2add t6, t1, a5 - lw a7, 0(t6) - max a6, t5, a7 - sw a6, 4(t3) - lw t5, 4(a4) - beq t2, t5, label17 +label397: + sh2add a6, t0, a5 + sh2add t4, t0, a4 + lw t5, 0(a6) + lw t6, -4(t4) + max a6, t5, t6 + sw a6, 0(t4) + lw t4, 0(a3) + beq t1, t4, label15 + lw t4, 4(t3) + sh2add t6, t0, a4 + lw t5, 0(t6) + max a7, t4, t5 + sw a7, 4(t2) + lw t4, 4(a3) + beq t1, t4, label17 .p2align 2 -label393: - lw t5, 8(t4) - lw a6, 4(t3) - max t6, t5, a6 - sw t6, 8(t3) - lw t5, 8(a4) - bne t2, t5, label401 +label392: + lw t4, 8(t3) + lw t6, 4(t2) + max t5, t4, t6 + sw t5, 8(t2) + lw t4, 8(a3) + bne t1, t4, label400 .p2align 2 label19: - lw t5, 8(t4) - addiw t1, t1, 4 - addi t6, t5, 1 - sw t6, 12(t3) - bge t1, a1, label386 + lw t4, 8(t3) + addiw t0, t0, 4 + addi t5, t4, 1 + li t4, 13 + sw t5, 12(t2) + bge t0, t4, label385 .p2align 2 label28: - addi a4, a4, 16 - sh2add t3, t1, a5 - sh2add t4, t1, t0 - lw t2, -4(a2) - lw t5, -4(a4) - beq t2, t5, label13 - sh2add a7, t1, t0 - sh2add t5, t1, a5 - lw t6, 0(a7) - lw a6, -4(t5) - max a7, t6, a6 - sw a7, 0(t5) - lw t5, 0(a4) - bne t2, t5, label399 + addi a3, a3, 16 + sh2add t2, t0, a4 + sh2add t3, t0, a5 + lw t1, -4(a1) + lw t4, -4(a3) + beq t1, t4, label13 + sh2add a6, t0, a5 + sh2add t4, t0, a4 + lw t5, 0(a6) + lw t6, -4(t4) + max a6, t5, t6 + sw a6, 0(t4) + lw t4, 0(a3) + bne t1, t4, label398 .p2align 2 label15: - sh2add a6, t1, t0 - lw t5, 0(a6) + sh2add t4, t0, a5 + lw t5, 0(t4) addi t6, t5, 1 - sw t6, 4(t3) - lw t5, 4(a4) - beq t2, t5, label17 - lw t5, 8(t4) - lw a6, 4(t3) - max t6, t5, a6 - sw t6, 8(t3) - lw t5, 8(a4) - beq t2, t5, label19 - lw t5, 12(t4) - addiw t1, t1, 4 - lw t6, 8(t3) - max a6, t5, t6 - sw a6, 12(t3) - blt t1, a1, label28 - j label402 + sw t6, 4(t2) + lw t4, 4(a3) + beq t1, t4, label17 + lw t4, 8(t3) + lw t6, 4(t2) + max t5, t4, t6 + sw t5, 8(t2) + lw t4, 8(a3) + beq t1, t4, label19 + lw t4, 12(t3) + addiw t0, t0, 4 + lw t6, 8(t2) + max t5, t4, t6 + li t4, 13 + sw t5, 12(t2) + blt t0, t4, label28 + j label401 .p2align 2 -label399: - lw t5, 4(t4) - sh2add t6, t1, a5 - lw a7, 0(t6) - max a6, t5, a7 - sw a6, 4(t3) - lw t5, 4(a4) - beq t2, t5, label17 - lw t5, 8(t4) - lw a6, 4(t3) - max t6, t5, a6 - sw t6, 8(t3) - lw t5, 8(a4) - beq t2, t5, label19 +label398: + lw t4, 4(t3) + sh2add t6, t0, a4 + lw t5, 0(t6) + max a7, t4, t5 + sw a7, 4(t2) + lw t4, 4(a3) + beq t1, t4, label17 + lw t4, 8(t3) + lw t6, 4(t2) + max t5, t4, t6 + sw t5, 8(t2) + lw t4, 8(a3) + beq t1, t4, label19 .p2align 2 +label400: + lw t4, 12(t3) + addiw t0, t0, 4 + lw t6, 8(t2) + max t5, t4, t6 + li t4, 13 + sw t5, 12(t2) + blt t0, t4, label28 label401: - lw t5, 12(t4) - addiw t1, t1, 4 - lw t6, 8(t3) - max a6, t5, t6 - sw a6, 12(t3) - blt t1, a1, label28 -label402: - lw a5, 12(a4) - beq t2, a5, label23 - j label403 + lw a4, 12(a3) + beq t1, a4, label23 + j label402 label26: lw a0, 1012(a0) jal putint @@ -304,12 +306,12 @@ label26: ld s0, 24(sp) addi sp, sp, 152 ret -label396: - lw a4, 16(t4) - addiw a3, a3, 1 - lw a5, 12(t3) - max t0, a4, a5 - li a4, 16 - sw t0, 16(t3) - blt a3, a4, label27 +label395: + lw a3, 16(t3) + addiw 
a2, a2, 1 + lw a4, 12(t2) + max a5, a3, a4 + li a3, 16 + sw a5, 16(t2) + blt a2, a3, label27 j label26 diff --git a/tests/SysY2022/functional/80_chaos_token.arm.s b/tests/SysY2022/functional/80_chaos_token.arm.s index cd81dd37b..cc0c18fd2 100644 --- a/tests/SysY2022/functional/80_chaos_token.arm.s +++ b/tests/SysY2022/functional/80_chaos_token.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .section .rodata -.align 8 +.p2align 3 __HELLO: .4byte 87 .4byte 101 @@ -32,7 +32,7 @@ __HELLO: .4byte 33 .4byte 10 .zero 288 -.align 8 +.p2align 3 N4__mE___: .4byte 83 .4byte 97 @@ -102,7 +102,7 @@ N4__mE___: .4byte 109 .4byte 105 .zero 144 -.align 8 +.p2align 3 saY_HeI10_To: .4byte 32 .4byte 115 diff --git a/tests/SysY2022/functional/80_chaos_token.riscv.s b/tests/SysY2022/functional/80_chaos_token.riscv.s index d6751b0ce..4f0017bf4 100644 --- a/tests/SysY2022/functional/80_chaos_token.riscv.s +++ b/tests/SysY2022/functional/80_chaos_token.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 8 +.p2align 3 __HELLO: .4byte 87 .4byte 101 @@ -32,7 +32,7 @@ __HELLO: .4byte 33 .4byte 10 .zero 288 -.align 8 +.p2align 3 N4__mE___: .4byte 83 .4byte 97 @@ -102,7 +102,7 @@ N4__mE___: .4byte 109 .4byte 105 .zero 144 -.align 8 +.p2align 3 saY_HeI10_To: .4byte 32 .4byte 115 diff --git a/tests/SysY2022/functional/83_long_array.arm.s b/tests/SysY2022/functional/83_long_array.arm.s index 37a77d5f0..cba3fa9b2 100644 --- a/tests/SysY2022/functional/83_long_array.arm.s +++ b/tests/SysY2022/functional/83_long_array.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a1: .zero 40000 -.align 8 +.p2align 3 a2: .zero 40000 -.align 8 +.p2align 3 a3: .zero 40000 .text diff --git a/tests/SysY2022/functional/83_long_array.riscv.s b/tests/SysY2022/functional/83_long_array.riscv.s index f009752b7..7ff11b202 100644 --- a/tests/SysY2022/functional/83_long_array.riscv.s +++ b/tests/SysY2022/functional/83_long_array.riscv.s @@ -1,20 +1,20 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a1: .zero 40000 -.align 8 +.p2align 3 a2: .zero 40000 -.align 8 +.p2align 3 a3: .zero 40000 .text .p2align 2 .globl main main: - addi sp, sp, -88 + addi sp, sp, -64 pcrel905: auipc a1, %pcrel_hi(a1) pcrel906: @@ -28,26 +28,20 @@ pcrel907: mv t1, zero li a3, 1 mv t3, zero - li t4, 1249 sd ra, 0(sp) - sd s2, 8(sp) - addi s2, a1, %pcrel_lo(pcrel905) - sd s4, 16(sp) + sd s0, 8(sp) + addi s0, a1, %pcrel_lo(pcrel905) + sd s5, 16(sp) li a1, 10 - addi s4, a0, %pcrel_lo(pcrel906) - sd s3, 24(sp) - mv a0, s2 - addi s3, a2, %pcrel_lo(pcrel907) - sd s0, 32(sp) + sd s2, 24(sp) + addi s2, a0, %pcrel_lo(pcrel906) + sd s1, 32(sp) + mv a0, s0 + addi s1, a2, %pcrel_lo(pcrel907) + sd s6, 40(sp) li a2, 9 - slli s0, t4, 3 - sd s5, 40(sp) - sd s1, 48(sp) - addi s1, s0, 8 - sd s6, 56(sp) - sd s7, 64(sp) - sd s9, 72(sp) - sd s8, 80(sp) + sd s3, 48(sp) + sd s4, 56(sp) .p2align 2 label2: lui a6, 419430 @@ -58,192 +52,194 @@ label2: srai t6, t5, 34 add a6, a7, t6 addw t6, t1, a3 - sh2add s5, a6, a6 + sh2add s3, a6, a6 mul a6, t6, t4 - slliw a7, s5, 1 - srli s6, a6, 63 + slliw a7, s3, 1 + srli s5, a6, 63 subw t5, t1, a7 srai a7, a6, 34 - add s5, s6, a7 - sh2add a6, s5, s5 - slliw a7, a6, 1 - subw s6, t6, a7 - slli a6, s6, 32 - add.uw a7, t5, a6 - addiw a6, a3, 1 - sh1add t6, a6, t1 - sd a7, 0(a0) + add s3, s5, a7 + sh2add s4, s3, s3 + slliw a6, s4, 1 + subw s5, t6, a6 + slli a7, s5, 32 + add.uw a6, t5, a7 
+ addiw a7, a3, 1 + sh1add t6, a7, t1 + sd a6, 0(a0) mul t5, t6, t4 - srli s6, t5, 63 + srli s4, t5, 63 srai a6, t5, 34 - add a7, s6, a6 - sh2add s5, a7, a7 + add a7, s4, a6 + sh2add s3, a7, a7 addiw a7, t2, 6 - slliw a6, s5, 1 + slliw a6, s3, 1 addiw t2, t2, 96 subw t5, t6, a6 addw a6, t1, a7 mul t6, a6, t4 - srli s7, t6, 63 + srli s4, t6, 63 srai a7, t6, 34 - add s5, s7, a7 - sh2add s6, s5, s5 - slliw a7, s6, 1 - subw t6, a6, a7 + add s3, s4, a7 + sh2add s5, s3, s3 + slliw t6, s5, 1 + subw s4, a6, t6 slliw a6, a3, 2 - slli s5, t6, 32 - add.uw a7, t5, s5 - sd a7, 8(a0) + slli a7, s4, 32 + add.uw t6, t5, a7 addi a7, a6, 12 + sd t6, 8(a0) addw t6, t1, a7 mul t5, t6, t4 - srli s5, t5, 63 + srli s4, t5, 63 srai a6, t5, 34 - add a7, s5, a6 - sh2add s6, a7, a7 + add a7, s4, a6 + sh2add s3, a7, a7 addiw a7, t0, 20 - slliw a6, s6, 1 + slliw a6, s3, 1 addiw t0, t0, 160 subw t5, t6, a6 addw a6, t1, a7 mul t6, a6, t4 - srli s6, t6, 63 + srli s4, t6, 63 srai a7, t6, 34 - add s5, s6, a7 - sh2add s7, s5, s5 - slliw t6, s7, 1 - subw a7, a6, t6 - slli s5, a7, 32 + add s3, s4, a7 + sh2add s5, s3, s3 + slliw a7, s5, 1 + subw t6, a6, a7 addiw a7, a5, 30 - add.uw t6, t5, s5 + slli s3, t6, 32 addiw a5, a5, 192 + add.uw a6, t5, s3 + sd a6, 16(a0) addw a6, t1, a7 mul t5, a6, t4 - sd t6, 16(a0) - srli s5, t5, 63 + srli s4, t5, 63 srai t6, t5, 34 - add a7, s5, t6 + add a7, s4, t6 sh2add t5, a7, a7 addiw a7, a4, 42 - slliw s6, t5, 1 + slliw s3, t5, 1 addiw a4, a4, 224 addw t5, t1, a7 - subw t6, a6, s6 + subw t6, a6, s3 mul a6, t5, t4 - srli s6, a6, 63 + srli s4, a6, 63 srai a7, a6, 34 - add s5, s6, a7 - sh2add s7, s5, s5 - slliw a6, s7, 1 - subw s6, t5, a6 + add s3, s4, a7 + sh2add a6, s3, s3 + slliw s5, a6, 1 + subw a7, t5, s5 slliw t5, a3, 3 - slli a7, s6, 32 - add.uw a6, t6, a7 + slli s3, a7, 32 addi a7, t5, 56 + add.uw a6, t6, s3 sd a6, 24(a0) addw a6, t1, a7 mul t6, a6, t4 - srli s6, t6, 63 + srli s4, t6, 63 srai a7, t6, 34 - add s5, s6, a7 - addiw s6, a2, 72 - sh2add s7, s5, s5 + add s3, s4, a7 + addiw s4, a2, 72 + sh2add s5, s3, s3 addiw a2, a2, 288 - slliw a7, s7, 1 + slliw a7, s5, 1 subw t6, a6, a7 - addw a7, t1, s6 + addw a7, t1, s4 mul a6, a7, t4 - srli s7, a6, 63 - srai s5, a6, 34 - add s6, s7, s5 - sh2add s9, s6, s6 - slliw s8, s9, 1 - subw a6, a7, s8 - addiw a7, a1, 90 - slli s6, a6, 32 + srli s5, a6, 63 + srai s3, a6, 34 + add s4, s5, s3 + sh2add a6, s4, s4 + slliw s6, a6, 1 + subw s3, a7, s6 + slli a6, s3, 32 + addiw s3, a1, 90 + add.uw a7, t6, a6 addiw a1, a1, 320 - addw a6, t1, a7 - add.uw s5, t6, s6 + addw a6, t1, s3 mul t6, a6, t4 - srli s6, t6, 63 + sd a7, 32(a0) + srli s4, t6, 63 srai a7, t6, 34 - sd s5, 32(a0) - add s5, s6, a7 - sh2add s7, s5, s5 - li s5, 11 - slliw a7, s7, 1 + add s3, s4, a7 + sh2add s5, s3, s3 + li s3, 11 + slliw a7, s5, 1 subw t6, a6, a7 - mulw a7, a3, s5 - addi s6, a7, 110 - addw a6, t1, s6 + mulw a7, a3, s3 + addi s4, a7, 110 + addw a6, t1, s4 mul a7, a6, t4 - srli s8, a7, 63 - srai s5, a7, 34 - add s6, s8, s5 - sh2add a7, s6, s6 - slliw s7, a7, 1 - subw s5, a6, s7 - slli a7, s5, 32 + srli s5, a7, 63 + srai s3, a7, 34 + add s4, s5, s3 + sh2add a7, s4, s4 + slliw s6, a7, 1 + subw s3, a6, s6 + slli a7, s3, 32 add.uw a6, t6, a7 - sh1add a7, a3, a3 - slliw t6, a7, 2 + sh1add t6, a3, a3 + slliw a7, t6, 2 sd a6, 40(a0) - addi s5, t6, 132 - addw a6, t1, s5 + addi s3, a7, 132 + addw a6, t1, s3 mul t6, a6, t4 - srli s7, t6, 63 + srli s4, t6, 63 srai a7, t6, 34 - add s5, s7, a7 - sh2add s6, s5, s5 - li s5, 13 - slliw a7, s6, 1 + add s3, s4, a7 + sh2add s5, s3, s3 + li s3, 13 + 
slliw a7, s5, 1 subw t6, a6, a7 - mulw a7, a3, s5 - addi s6, a7, 156 - addw a6, t1, s6 + mulw a7, a3, s3 + addi s4, a7, 156 + addw a6, t1, s4 mul a7, a6, t4 - srli s7, a7, 63 - srai s5, a7, 34 - add s6, s7, s5 - sh2add s8, s6, s6 - slliw a7, s8, 1 - subw s7, a6, a7 - subw a6, t5, a3 - slli s5, s7, 32 - add.uw a7, t6, s5 - slliw t6, a6, 1 - sd a7, 48(a0) - addi a7, t6, 182 - addw a6, t1, a7 + srli s5, a7, 63 + srai s3, a7, 34 + add s4, s5, s3 + sh2add s6, s4, s4 + slliw a7, s6, 1 + subw s5, a6, a7 + subw a7, t5, a3 + slli s3, s5, 32 + add.uw a6, t6, s3 + slliw s3, a7, 1 + addi t6, s3, 182 + sd a6, 48(a0) + addw a6, t1, t6 mul t5, a6, t4 - srli s5, t5, 63 + srli s4, t5, 63 srai t6, t5, 34 - add a7, s5, t6 + add a7, s4, t6 sh2add t5, a7, a7 - slliw s6, t5, 1 + slliw s3, t5, 1 slliw t5, a3, 4 - subw t6, a6, s6 + subw t6, a6, s3 subw a7, t5, a3 addiw a3, a3, 32 - addi s6, a7, 210 - addw a6, t1, s6 + addi s4, a7, 210 + addw a6, t1, s4 mul a7, a6, t4 - srli s7, a7, 63 - srai s5, a7, 34 - add t4, s7, s5 + srli s4, a7, 63 + srai s3, a7, 34 + add t4, s4, s3 sh2add a7, t4, t4 - slliw s6, a7, 1 - subw s5, a6, s6 + slliw s5, a7, 1 + subw s3, a6, s5 addi a6, t5, 240 - slli a7, s5, 32 + slli t4, s3, 32 + li t5, 625 addw t1, t1, a6 - add.uw t4, t6, a7 - sd t4, 56(a0) - bge t3, s1, label140 + add.uw a7, t6, t4 + slli t4, t5, 4 + sd a7, 56(a0) + bge t3, t4, label140 addi a0, a0, 64 j label2 label140: - mv a0, s3 + mv a0, s1 mv a1, zero j label15 .p2align 2 @@ -251,7 +247,7 @@ label18: addi a0, a0, 64 .p2align 2 label15: - sh2add a2, a1, s2 + sh2add a2, a1, s0 lui t1, 419430 addiw a1, a1, 16 addiw a3, t1, 1639 @@ -261,60 +257,60 @@ label15: srli t4, a5, 63 srai t0, a5, 34 add t1, t4, t0 - sh2add t2, t1, t1 - slliw t3, t2, 1 - subw a5, a4, t3 + sh2add t3, t1, t1 + slliw t2, t3, 1 + subw a5, a4, t2 sw a5, 0(a0) lw t1, 4(a2) mulw a4, t1, t1 mul a5, a4, a3 - srli t3, a5, 63 + srli t2, a5, 63 srai t0, a5, 34 - add t1, t3, t0 - sh2add t2, t1, t1 - slliw a5, t2, 1 + add t1, t2, t0 + sh2add t3, t1, t1 + slliw a5, t3, 1 subw t0, a4, a5 sw t0, 4(a0) lw t1, 8(a2) mulw a4, t1, t1 mul a5, a4, a3 - srli t2, a5, 63 + srli t3, a5, 63 srai t0, a5, 34 - add t1, t2, t0 - sh2add t3, t1, t1 - slliw a5, t3, 1 + add t1, t3, t0 + sh2add t2, t1, t1 + slliw a5, t2, 1 subw t0, a4, a5 sw t0, 8(a0) lw t1, 12(a2) mulw a4, t1, t1 mul a5, a4, a3 - srli t2, a5, 63 + srli t3, a5, 63 srai t0, a5, 34 - add t1, t2, t0 - sh2add t4, t1, t1 - slliw t3, t4, 1 - subw a5, a4, t3 - sw a5, 12(a0) + add t1, t3, t0 + sh2add t2, t1, t1 + slliw a5, t2, 1 + subw t0, a4, a5 + sw t0, 12(a0) lw t1, 16(a2) mulw a4, t1, t1 mul a5, a4, a3 - srli t2, a5, 63 + srli t4, a5, 63 srai t0, a5, 34 - add t1, t2, t0 - sh2add t4, t1, t1 - slliw t3, t4, 1 - subw a5, a4, t3 + add t1, t4, t0 + sh2add t3, t1, t1 + slliw t2, t3, 1 + subw a5, a4, t2 sw a5, 16(a0) lw t1, 20(a2) mulw a4, t1, t1 mul a5, a4, a3 - srli t2, a5, 63 + srli t4, a5, 63 srai t0, a5, 34 - add t1, t2, t0 + add t1, t4, t0 sh2add t3, t1, t1 - slliw a5, t3, 1 - subw t0, a4, a5 - sw t0, 20(a0) + slliw t2, t3, 1 + subw a5, a4, t2 + sw a5, 20(a0) lw t1, 24(a2) mulw a4, t1, t1 mul a5, a4, a3 @@ -348,51 +344,51 @@ label15: lw t1, 36(a2) mulw a4, t1, t1 mul a5, a4, a3 - srli t3, a5, 63 + srli t2, a5, 63 srai t0, a5, 34 - add t1, t3, t0 - sh2add t2, t1, t1 - slliw a5, t2, 1 - subw t0, a4, a5 - sw t0, 36(a0) + add t1, t2, t0 + sh2add t3, t1, t1 + slliw t4, t3, 1 + subw a5, a4, t4 + sw a5, 36(a0) lw t1, 40(a2) mulw a4, t1, t1 mul a5, a4, a3 - srli t3, a5, 63 - srai t0, a5, 34 - add t1, t3, t0 - sh2add t2, t1, t1 - slliw 
a5, t2, 1 - subw t0, a4, a5 - sw t0, 40(a0) - lw t1, 44(a2) - mulw a4, t1, t1 - mul a5, a4, a3 srli t2, a5, 63 srai t0, a5, 34 add t1, t2, t0 sh2add t4, t1, t1 slliw t3, t4, 1 subw a5, a4, t3 - sw a5, 44(a0) + sw a5, 40(a0) + lw t1, 44(a2) + mulw a4, t1, t1 + mul a5, a4, a3 + srli t3, a5, 63 + srai t0, a5, 34 + add t1, t3, t0 + sh2add t2, t1, t1 + slliw a5, t2, 1 + subw t0, a4, a5 + sw t0, 44(a0) lw t1, 48(a2) mulw a4, t1, t1 mul a5, a4, a3 srli t2, a5, 63 srai t0, a5, 34 add t1, t2, t0 - sh2add t4, t1, t1 - slliw t3, t4, 1 - subw a5, a4, t3 + sh2add t3, t1, t1 + slliw t4, t3, 1 + subw a5, a4, t4 sw a5, 48(a0) lw t1, 52(a2) mulw a4, t1, t1 mul a5, a4, a3 - srli t2, a5, 63 + srli t3, a5, 63 srai t0, a5, 34 - add t1, t2, t0 - sh2add t3, t1, t1 - slliw t4, t3, 1 + add t1, t3, t0 + sh2add t2, t1, t1 + slliw t4, t2, 1 subw a5, a4, t4 sw a5, 52(a0) lw t1, 56(a2) @@ -412,15 +408,17 @@ label15: srai a2, a5, 34 add a3, t0, a2 sh2add t1, a3, a3 + li a3, 625 slliw a5, t1, 1 subw a2, a4, a5 sw a2, 60(a0) - blt a1, s1, label18 - mv a0, s4 + slli a2, a3, 4 + blt a1, a2, label18 + mv a0, s2 mv a1, zero .p2align 2 label20: - sh2add a2, a1, s3 + sh2add a2, a1, s1 lui t2, 335544 lw a4, 0(a2) addiw a3, t2, 1311 @@ -429,88 +427,92 @@ label20: mul a5, t0, a3 srli t3, a5, 63 srai t1, a5, 37 - sh2add a5, a1, s2 + sh2add a5, a1, s0 add t2, t3, t1 addiw a1, a1, 4 + lw t5, 0(a5) mulw t4, t2, a4 - lw t2, 0(a5) subw t1, t0, t4 - addw t4, t1, t2 - sw t4, 0(a0) + addw t2, t1, t5 + sw t2, 0(a0) lw t3, 4(a2) + lw t6, 4(a5) mulw t0, t3, t3 mul t1, t0, a3 srli t3, t1, 63 srai t2, t1, 37 add t4, t3, t2 - lw t2, 4(a5) mulw t5, t4, a4 subw t1, t0, t5 - addw t4, t1, t2 - sw t4, 4(a0) + addw t2, t1, t6 + sw t2, 4(a0) lw t3, 8(a2) mulw t0, t3, t3 mul t1, t0, a3 - srli t3, t1, 63 + srli t4, t1, 63 srai t2, t1, 37 - add t4, t3, t2 - lw t3, 8(a5) - mulw t5, t4, a4 + add t3, t4, t2 + lw t2, 8(a5) + mulw t5, t3, a4 subw t1, t0, t5 - addw t4, t1, t3 + addw t4, t1, t2 sw t4, 8(a0) - lw t2, 12(a2) - mulw t0, t2, t2 + lw t3, 12(a2) + mulw t0, t3, t3 mul t1, t0, a3 + lw a3, 12(a5) srli t3, t1, 63 - srai a2, t1, 37 - lw t1, 12(a5) - add t4, t3, a2 - mulw t2, t4, a4 - subw a3, t0, t2 - addw a2, a3, t1 - sw a2, 12(a0) - bge a1, s1, label265 + srai t2, t1, 37 + add t4, t3, t2 + mulw t5, t4, a4 + subw a2, t0, t5 + addw a4, a2, a3 + li a3, 625 + slli a2, a3, 4 + sw a4, 12(a0) + bge a1, a2, label265 addi a0, a0, 16 j label20 label265: - mv s5, s4 - mv s6, zero - mv a3, zero + mv s3, s2 + mv s4, zero + mv s5, zero li a0, 10 bge zero, a0, label270 .p2align 2 label53: - lw a0, 0(s5) + lw a0, 0(s3) lui a4, 201377 li t0, 1333 - addw a1, a3, a0 - addiw a5, a4, -261 - mul a0, a1, a5 + addw a1, s5, a0 + addiw a2, a4, -261 + mul a0, a1, a2 srli a4, a0, 63 - srai a2, a0, 40 - add a3, a4, a2 - mulw a5, a3, t0 - subw s7, a1, a5 - mv a0, s7 + srai a3, a0, 40 + add a2, a4, a3 + mulw a5, a2, t0 + subw s5, a1, a5 + mv a0, s5 jal putint - addiw s6, s6, 1 - mv a3, s7 - bge s6, s1, label31 + li a1, 625 + addiw s4, s4, 1 + slli a0, a1, 4 + bge s4, a0, label31 .p2align 2 label32: - addi s5, s5, 4 + addi s3, s3, 4 li a0, 10 - blt s6, a0, label53 + blt s4, a0, label53 li a0, 20 - bge s6, a0, label895 + bge s4, a0, label895 .p2align 2 label46: - lw a0, 0(s5) + lw a0, 0(s3) li a2, 625 + mv a3, s5 slli a4, a2, 5 srli a2, a4, 2 - add a1, s2, a4 + add a1, s0, a4 j label47 .p2align 2 label52: @@ -520,175 +522,183 @@ label47: addw a3, a0, a3 addiw a2, a2, 16 lw t0, 0(a1) - lw t1, 4(a1) + lw t2, 4(a1) subw a5, a3, t0 + lw t0, 8(a1) addw a4, a0, a5 - lw a5, 8(a1) + subw t1, 
a4, t2 + addw a3, a0, t1 + lw t1, 12(a1) + subw a5, a3, t0 + addw a4, a0, a5 + lw a5, 16(a1) subw t0, a4, t1 - lw t2, 12(a1) addw a3, a0, t0 + lw t0, 20(a1) subw t1, a3, a5 addw a4, a0, t1 - lw t1, 16(a1) - subw t0, a4, t2 - addw a3, a0, t0 - lw t0, 20(a1) - subw a5, a3, t1 + lw t1, 24(a1) + subw a3, a4, t0 + lw t0, 28(a1) + addw a5, a0, a3 + subw a4, a5, t1 + lw t1, 32(a1) + addw a3, a0, a4 + subw a5, a3, t0 + lw t0, 36(a1) addw a4, a0, a5 - lw a5, 24(a1) - subw t1, a4, t0 - addw a3, a0, t1 - lw t1, 28(a1) - subw t0, a3, a5 - addw a4, a0, t0 - lw t0, 32(a1) - subw a5, a4, t1 - lw t2, 36(a1) - addw a3, a0, a5 - lw a5, 40(a1) - subw t1, a3, t0 - addw a4, a0, t1 - subw t0, a4, t2 - addw a3, a0, t0 + subw a3, a4, t1 + lw t1, 40(a1) + addw a5, a0, a3 + subw a4, a5, t0 lw t0, 44(a1) - subw t1, a3, a5 - addw a4, a0, t1 + addw a3, a0, a4 + subw a5, a3, t1 lw t1, 48(a1) - subw a5, a4, t0 - addw a3, a0, a5 - lw a5, 52(a1) - subw t0, a3, t1 + addw a4, a0, a5 + subw a3, a4, t0 + lw t0, 52(a1) + addw a5, a0, a3 lw t2, 56(a1) - addw a4, a0, t0 + subw a4, a5, t1 + addw a3, a0, a4 + subw a5, a3, t0 lw t0, 60(a1) - subw t1, a4, a5 - addw a3, a0, t1 - subw a5, a3, t2 addw a4, a0, a5 - subw a3, a4, t0 - blt a2, s0, label52 + subw t1, a4, t2 + addw a5, a0, t1 + li t1, 1249 + subw a3, a5, t0 + slli a4, t1, 3 + blt a2, a4, label52 addw a2, a0, a3 - lw a5, 64(a1) - lw a4, 68(a1) - subw t0, a2, a5 - addw a3, a0, t0 - lw t0, 72(a1) - subw a5, a3, a4 - addw a2, a0, a5 - lw a5, 76(a1) - subw a4, a2, t0 - lw t0, 80(a1) + lw a4, 64(a1) + lw t0, 68(a1) + subw a5, a2, a4 + addw a3, a0, a5 + lw a5, 72(a1) + subw a4, a3, t0 + addw a2, a0, a4 + lw a4, 76(a1) + subw t1, a2, a5 + lw a5, 80(a1) + addw a3, a0, t1 + subw t0, a3, a4 + addw a2, a0, t0 + lw t0, 84(a1) + subw a4, a2, a5 addw a3, a0, a4 - lw t1, 84(a1) - subw a2, a3, a5 - lw a5, 88(a1) - addw a4, a0, a2 - subw a3, a4, t0 - addw a2, a0, a3 - subw t0, a2, t1 + lw a4, 88(a1) + subw a5, a3, t0 + addw a2, a0, a5 + lw a5, 92(a1) + subw t0, a2, a4 addw a3, a0, t0 - lw t0, 92(a1) - subw a4, a3, a5 - addw a2, a0, a4 - subw s7, a2, t0 - mv a0, s7 + subw s5, a3, a5 + mv a0, s5 jal putint - addiw s6, s6, 1 - mv a3, s7 - blt s6, s1, label32 + li a1, 625 + addiw s4, s4, 1 + slli a0, a1, 4 + blt s4, a0, label32 j label31 .p2align 2 label895: li a0, 30 - bge s6, a0, label45 + bge s4, a0, label45 .p2align 2 label35: li a2, 625 - mv s7, a3 - slli a4, a2, 5 - srli a2, a4, 2 - add a0, s4, a4 - add a1, s2, a4 - blt a2, s1, label42 + slli a3, a2, 5 + srli a2, a3, 2 + add a0, s2, a3 + add a1, s0, a3 + li a4, 625 + slli a3, a4, 4 + blt a2, a3, label42 j label41 .p2align 2 label43: - sh2add a3, s6, s2 - lui t1, 80533 + sh2add a3, s4, s0 + lui t2, 80533 addiw a2, a2, 2 addi a1, a1, 8 - lw a5, 0(a3) - addiw t2, t1, -1433 - lw t0, 0(a0) - addw a4, s7, a5 + lw t0, 0(a3) + addiw t1, t2, -1433 + lw a5, 0(a0) + addw a4, s5, t0 addi a0, a0, 8 - addw a3, a4, t0 - mul a5, a3, t2 - lui t2, 3 - srli t1, a5, 63 + addw a3, a4, a5 + mul a5, a3, t1 + lui t1, 3 + srli t2, a5, 63 srai t0, a5, 42 - addiw a5, t2, 1045 - add a4, t1, t0 + addiw a5, t1, 1045 + add a4, t2, t0 mulw t0, a4, a5 - subw s7, a3, t0 - bge a2, s1, label41 + li a4, 625 + subw s5, a3, t0 + slli a3, a4, 4 + bge a2, a3, label41 .p2align 2 label42: lui a4, 1 addiw a3, a4, -1863 ble a2, a3, label43 - sh2add a4, s6, s3 + sh2add a5, s4, s1 addiw a2, a2, 1 addi a0, a0, 4 - lw a5, 0(a4) - lw t0, 0(a1) - addw a3, s7, a5 + lw t0, 0(a5) + lw a4, 0(a1) + addw a3, s5, t0 addi a1, a1, 4 - subw s7, a3, t0 - blt a2, s1, label42 + subw s5, a3, a4 + li a4, 
625 + slli a3, a4, 4 + blt a2, a3, label42 .p2align 2 label41: - mv a0, s7 + mv a0, s5 jal putint - addiw s6, s6, 1 - mv a3, s7 - blt s6, s1, label32 + li a1, 625 + addiw s4, s4, 1 + slli a0, a1, 4 + blt s4, a0, label32 j label31 .p2align 2 label45: - lw a1, 0(s5) + lw a1, 0(s3) lui a4, 343639 lui a5, 24 - addiw s6, s6, 1 + addiw s4, s4, 1 sh3add a2, a1, a1 addiw a1, a4, -1555 - addw a0, a3, a2 + addw a0, s5, a2 mul a2, a0, a1 srli a4, a2, 63 srai a3, a2, 47 addiw a2, a5, 1684 add a1, a4, a3 - mulw a4, a1, a2 - subw a3, a0, a4 - blt s6, s1, label32 + mulw a3, a1, a2 + li a1, 625 + subw s5, a0, a3 + slli a0, a1, 4 + blt s4, a0, label32 label31: - mv a0, a3 + mv a0, s5 ld ra, 0(sp) - ld s2, 8(sp) - ld s4, 16(sp) - ld s3, 24(sp) - ld s0, 32(sp) - ld s5, 40(sp) - ld s1, 48(sp) - ld s6, 56(sp) - ld s7, 64(sp) - ld s9, 72(sp) - ld s8, 80(sp) - addi sp, sp, 88 + ld s0, 8(sp) + ld s5, 16(sp) + ld s2, 24(sp) + ld s1, 32(sp) + ld s6, 40(sp) + ld s3, 48(sp) + ld s4, 56(sp) + addi sp, sp, 64 ret label270: li a0, 20 - blt s6, a0, label46 + blt s4, a0, label46 li a0, 30 - blt s6, a0, label35 + blt s4, a0, label35 j label45 diff --git a/tests/SysY2022/functional/84_long_array2.arm.s b/tests/SysY2022/functional/84_long_array2.arm.s index 839a08a49..10d985aa9 100644 --- a/tests/SysY2022/functional/84_long_array2.arm.s +++ b/tests/SysY2022/functional/84_long_array2.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 16384 -.align 8 +.p2align 3 c: .zero 16384 .text diff --git a/tests/SysY2022/functional/84_long_array2.riscv.s b/tests/SysY2022/functional/84_long_array2.riscv.s index c3e13a90b..a1fe070f5 100644 --- a/tests/SysY2022/functional/84_long_array2.riscv.s +++ b/tests/SysY2022/functional/84_long_array2.riscv.s @@ -1,10 +1,10 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 16384 -.align 8 +.p2align 3 c: .zero 16384 .text diff --git a/tests/SysY2022/functional/85_long_code.arm.s b/tests/SysY2022/functional/85_long_code.arm.s index 94bed8b69..cda6b275e 100644 --- a/tests/SysY2022/functional/85_long_code.arm.s +++ b/tests/SysY2022/functional/85_long_code.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 count: .zero 4000 .text diff --git a/tests/SysY2022/functional/85_long_code.riscv.s b/tests/SysY2022/functional/85_long_code.riscv.s index d01a2e06c..a6889783a 100644 --- a/tests/SysY2022/functional/85_long_code.riscv.s +++ b/tests/SysY2022/functional/85_long_code.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 count: .zero 4000 .text diff --git a/tests/SysY2022/functional/88_many_params2.arm.s b/tests/SysY2022/functional/88_many_params2.arm.s index ff18266c1..41da6bcaf 100644 --- a/tests/SysY2022/functional/88_many_params2.arm.s +++ b/tests/SysY2022/functional/88_many_params2.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 b: .zero 12508 -.align 8 +.p2align 3 a: .zero 16348 .text diff --git a/tests/SysY2022/functional/88_many_params2.riscv.s b/tests/SysY2022/functional/88_many_params2.riscv.s index e8cf59325..a0ec119cc 100644 --- a/tests/SysY2022/functional/88_many_params2.riscv.s +++ b/tests/SysY2022/functional/88_many_params2.riscv.s @@ -1,10 +1,10 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 b: .zero 12508 -.align 8 +.p2align 3 a: .zero 16348 .text diff --git 
a/tests/SysY2022/functional/94_nested_loops.arm.s b/tests/SysY2022/functional/94_nested_loops.arm.s index 1e0338afb..716bf4f35 100644 --- a/tests/SysY2022/functional/94_nested_loops.arm.s +++ b/tests/SysY2022/functional/94_nested_loops.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 arr1: .zero 57600 -.align 8 +.p2align 3 arr2: .zero 107520 .text diff --git a/tests/SysY2022/functional/94_nested_loops.riscv.s b/tests/SysY2022/functional/94_nested_loops.riscv.s index 4af4428be..0ab88d599 100644 --- a/tests/SysY2022/functional/94_nested_loops.riscv.s +++ b/tests/SysY2022/functional/94_nested_loops.riscv.s @@ -1,159 +1,147 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 arr1: .zero 57600 -.align 8 +.p2align 3 arr2: .zero 107520 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[240] CalleeSaved[104] - addi sp, sp, -344 + # stack usage: CalleeArg[0] Local[0] RegSpill[224] CalleeSaved[104] + addi sp, sp, -328 sd ra, 0(sp) sd s0, 8(sp) sd s5, 16(sp) - sd s2, 24(sp) + sd s3, 24(sp) sd s1, 32(sp) sd s6, 40(sp) - sd s3, 48(sp) + sd s2, 48(sp) sd s4, 56(sp) sd s8, 64(sp) sd s7, 72(sp) - sd s10, 80(sp) - sd s9, 88(sp) + sd s9, 80(sp) + sd s10, 88(sp) sd s11, 96(sp) jal getint slt s0, zero, a0 - sd a0, 248(sp) + sd a0, 224(sp) jal getint - sd a0, 240(sp) slt a3, zero, a0 - ld a0, 248(sp) - and s2, s0, a3 - ld a1, 240(sp) + sd a0, 216(sp) + and s3, s0, a3 + ld a0, 224(sp) + ld a1, 216(sp) addiw a2, a0, 1 - addw a0, a1, a2 - sd a0, 264(sp) - ld a0, 248(sp) - addiw a2, a0, 2 addw a1, a1, a2 - addiw a2, a0, 3 - sd a1, 272(sp) - ld a1, 240(sp) + addiw a2, a0, 2 + sd a1, 240(sp) + ld a1, 216(sp) addw a0, a1, a2 - sd a0, 288(sp) - ld a0, 248(sp) - addiw a2, a0, 4 + sd a0, 248(sp) + ld a0, 224(sp) + addiw a2, a0, 3 addw a1, a1, a2 - addiw a2, a0, 5 - sd a1, 296(sp) - ld a1, 240(sp) + addiw a2, a0, 4 + sd a1, 256(sp) + ld a1, 216(sp) addw a0, a1, a2 - sd a0, 304(sp) - ld a0, 248(sp) - addiw a2, a0, 6 - addw a1, a1, a2 + sd a0, 272(sp) + ld a0, 224(sp) + addiw a3, a0, 5 + addw a2, a1, a3 + addiw a3, a0, 6 + addw a1, a1, a3 + sd a2, 280(sp) addiw a2, a0, 7 - sd a1, 312(sp) - ld a1, 240(sp) + sd a1, 288(sp) + ld a1, 216(sp) addw a0, a1, a2 - sd a0, 328(sp) - ld a0, 248(sp) + sd a0, 296(sp) + ld a0, 224(sp) addiw a2, a0, 8 addw a1, a1, a2 addiw a2, a0, 9 - sd a1, 336(sp) - ld a1, 240(sp) + sd a1, 312(sp) + ld a1, 216(sp) addw a0, a1, a2 - sd a0, 104(sp) - ld a0, 248(sp) - addw s1, a0, a1 + sd a0, 320(sp) + ld a0, 224(sp) addiw a2, a0, 10 + addw s1, a0, a1 addw s0, a1, a2 jal getint - sd a0, 168(sp) + sd a0, 160(sp) jal getint - sd a0, 184(sp) + sd a0, 176(sp) jal getint - sd a0, 200(sp) - mv a4, a0 + sd a0, 192(sp) + mv a2, a0 jal getint - sd a0, 208(sp) - mv t1, a0 + sd a0, 200(sp) + mv a5, a0 jal getint - sd a0, 192(sp) - mv t4, a0 + sd a0, 184(sp) + mv t2, a0 jal getint - sd a0, 176(sp) - mv a6, a0 + sd a0, 168(sp) + mv t5, a0 jal getint - mv s3, a0 -pcrel844: +pcrel840: auipc a1, %pcrel_hi(arr1) - li a2, 45 - addi a4, a1, %pcrel_lo(pcrel844) - sd a0, 160(sp) - li a1, 21 + mv s2, a0 + sd a0, 152(sp) + addi a2, a1, %pcrel_lo(pcrel840) li a0, 21 - sd a4, 224(sp) + li a1, 45 + sd a2, 120(sp) slli a5, a0, 8 - slli a0, a2, 6 - sd a5, 152(sp) - sd a0, 144(sp) - slli a0, a1, 9 - lui a1, 244 - sd a0, 128(sp) - addiw a6, a1, 576 - ld a0, 144(sp) - slli a0, a0, 1 + slli a0, a1, 6 + sd a5, 144(sp) sd a0, 136(sp) - lui a0, 24 - sd a6, 120(sp) - addiw t4, a0, 1696 - sd t4, 112(sp) - beq s2, zero, 
label2 + beq s3, zero, label2 mv s4, zero - mv s3, a4 + mv s3, a2 j label83 .p2align 2 -label802: - ld a0, 248(sp) - ld a1, 240(sp) +label798: + ld a0, 224(sp) + ld a1, 216(sp) slt a0, s4, a0 slt a3, s4, a1 and a2, a0, a3 beq a2, zero, label2 - ld a0, 136(sp) + li a1, 45 + slli a0, a1, 7 add s3, s3, a0 .p2align 2 label83: addw a3, s1, s4 addw a6, s0, s4 mv s2, s3 - mv a2, zero - ld a0, 264(sp) - ld a1, 272(sp) - addw a4, a0, s4 - addw a5, a1, s4 - ld a0, 288(sp) - ld a1, 296(sp) - addw t0, a0, s4 - addw t1, a1, s4 - ld a0, 304(sp) - ld a1, 312(sp) - addw t2, a0, s4 + ld a1, 240(sp) + ld a0, 248(sp) + addw a4, a1, s4 + addw a5, a0, s4 + ld a1, 256(sp) + ld a0, 272(sp) + addw t0, a1, s4 + ld a2, 280(sp) + addw t1, a0, s4 + ld a1, 288(sp) + addw t2, a2, s4 + ld a0, 296(sp) addw t3, a1, s4 - ld a0, 328(sp) - ld a1, 336(sp) + mv a2, zero addw t4, a0, s4 + ld a1, 312(sp) + ld a0, 320(sp) addw t5, a1, s4 - ld a0, 104(sp) - mv a1, zero addw t6, a0, s4 + mv a1, zero mv a0, zero addiw s4, s4, 1 j label86 @@ -163,48 +151,48 @@ label488: .p2align 2 label86: addw s6, a3, a2 - slli s7, a0, 4 + slli s8, a0, 4 + addw s9, a4, a2 addw a7, a0, s6 + addw s10, a0, s9 addw s5, a1, a7 - sub a7, s7, a0 - addw s7, a4, a2 - slli s9, a7, 6 + sub a7, s8, a0 + slli s7, a7, 6 slli a7, a2, 4 - add s6, s2, s9 + add s6, s2, s7 sub s8, a7, a2 - addw s9, a0, s7 - slli s10, s8, 4 + slli s7, s8, 4 + add a7, s6, s7 + addw s6, a1, s10 addw s7, a5, a2 - add a7, s6, s10 + slli s11, s6, 32 addw s10, a0, s7 - addw s6, a1, s9 + add.uw s8, s5, s11 addw s7, t0, a2 - slli s8, s6, 32 - add.uw s11, s5, s8 addw s5, a1, s10 - sd s11, 0(a7) + addw s11, a0, s7 + sd s8, 0(a7) + addw s7, t1, a2 slli s8, s5, 32 add.uw s9, s6, s8 - addw s6, a0, s7 + addw s6, a1, s11 sd s9, 8(a7) - addw s7, t1, a2 - addw s10, a1, s6 - slli s11, s10, 32 - add.uw s8, s5, s11 addw s11, a0, s7 + slli s10, s6, 32 + add.uw s8, s5, s10 + addw s10, a1, s11 sd s8, 16(a7) - addw s6, a1, s11 - slli s5, s6, 32 - add.uw s7, s10, s5 - addw s10, t2, a2 + slli s5, s10, 32 + add.uw s7, s6, s5 + addw s5, t2, a2 sd s7, 24(a7) - addw s5, a0, s10 - addw s11, a1, s5 - slli s10, s11, 32 - add.uw s5, s6, s10 - addw s6, t3, a2 + addw s6, a0, s5 + addw s11, a1, s6 + slli s6, s11, 32 + add.uw s5, s10, s6 + addw s10, t3, a2 sd s5, 32(a7) - addw s6, a0, s6 + addw s6, a0, s10 addw s10, a1, s6 slli s6, s10, 32 add.uw s6, s11, s6 @@ -247,12 +235,12 @@ label86: addw s6, a6, a2 sd s9, 208(a7) addiw a2, a2, 1 - addw s5, a0, s6 + addw s9, a0, s6 sd s8, 216(a7) - addw s8, a1, s5 + addw s8, a1, s9 sd s11, 224(a7) - slli s9, s8, 32 - add.uw s6, s7, s9 + slli s5, s8, 32 + add.uw s6, s7, s5 sd s6, 232(a7) li a7, 4 blt a2, a7, label86 @@ -261,8 +249,8 @@ label86: blt a0, a7, label488 addiw a1, a1, 1 li a7, 2 - bge a1, a7, label802 - ld a0, 144(sp) + bge a1, a7, label798 + ld a0, 136(sp) mv a2, zero add s2, s2, a0 mv a0, zero @@ -274,47 +262,47 @@ label2: mv a1, zero addi a3, a0, %pcrel_lo(label2) mv a0, zero - sd a3, 216(sp) + sd a3, 128(sp) j label3 .p2align 2 label220: li a7, 2 - bge a5, a7, label787 + bge a5, a7, label783 mv a6, a5 .p2align 2 label3: - ld a5, 152(sp) - slli t1, a2, 3 - addiw a7, a6, 5 - addw s0, a6, a0 - sub t2, t1, a2 + ld a5, 144(sp) + slli t2, a2, 3 + addiw t5, a6, 4 + addiw a7, a6, 6 + sub t1, t2, a2 mul t0, a0, a5 - slli t3, t2, 8 - slli a5, a6, 3 + slli t2, a6, 3 + slli a5, t1, 8 add a4, a3, t0 - sub t1, a5, a6 - add t0, a4, t3 - addiw a5, a6, 1 - slli t2, t1, 7 - addiw t3, a6, 3 - addw t1, a0, a5 - add a4, t0, t2 - addw t5, a0, t3 + sub t3, t2, a6 + add t0, a4, a5 addiw t2, a6, 
2 - addw t0, a1, t1 + slli t1, t3, 7 + addiw a5, a6, 1 addw t4, a0, t2 - addw t2, a1, t5 + add a4, t0, t1 + addw t3, a0, a5 addw t1, a1, t4 - addw t5, a0, a7 - addiw t4, a6, 4 - addw t6, a0, t4 + addw t0, a1, t3 + addw t4, a0, t5 + addiw t3, a6, 3 + addw t6, a0, t3 + addw t3, a1, t4 + addw t2, a1, t6 + addiw t6, a6, 5 + addw t5, a0, t6 + addw t6, a0, a7 addw t4, a1, t5 - addw t3, a1, t6 - addiw t6, a6, 6 + addw a7, a6, a0 + addw t5, a1, t6 mv a6, zero - addw a7, a0, t6 - addw t6, a1, s0 - addw t5, a1, a7 + addw t6, a1, a7 .p2align 2 label9: slli s0, t0, 32 @@ -365,350 +353,368 @@ label9: addi a4, a4, 224 j label9 .p2align 2 -label787: +label783: addiw a2, a2, 1 li a7, 3 - bge a2, a7, label804 + bge a2, a7, label800 mv a6, zero j label3 .p2align 2 -label804: +label800: addiw a0, a0, 1 li a7, 2 - bge a0, a7, label813 + bge a0, a7, label809 mv a6, zero mv a2, zero j label3 .p2align 2 -label813: +label809: addiw a1, a1, 1 li a0, 10 - bge a1, a0, label819 - ld a0, 128(sp) + bge a1, a0, label815 + li a2, 21 mv a6, zero + slli a0, a2, 9 mv a2, zero add a3, a3, a0 mv a0, zero j label3 -label819: - ld a4, 224(sp) - mv s5, zero +label815: + ld a2, 120(sp) + mv s3, zero mv a1, zero - sd a4, 232(sp) - sd zero, 256(sp) + sd a2, 112(sp) + sd zero, 104(sp) label19: - ld a0, 128(sp) - ld a1, 256(sp) - ld a3, 216(sp) + li a3, 21 + ld a1, 104(sp) + slli a0, a3, 9 + ld a3, 128(sp) mul a2, a1, a0 li a0, 10 add a3, a3, a2 - sd a3, 280(sp) + sd a3, 208(sp) bge a1, a0, label247 - ld a4, 232(sp) + ld a2, 112(sp) mv a1, zero - sd a4, 320(sp) + sd a2, 232(sp) + sd zero, 264(sp) label24: - ld a5, 152(sp) - li a4, 100 - ld a3, 280(sp) + ld a5, 144(sp) + li a2, 100 + ld a1, 264(sp) + ld a3, 208(sp) mul a0, a1, a5 - add a2, a3, a0 - bge a1, a4, label253 - ld a4, 320(sp) - mv a3, zero - mv a0, a4 - sub t1, zero, zero - li a4, 1000 - slli t0, t1, 8 - add a5, a2, t0 - bge zero, a4, label259 + add a4, a3, a0 + sd a4, 304(sp) + bge a1, a2, label253 + ld a2, 232(sp) + mv a1, zero + mv a0, a2 + sub a5, zero, zero + slli a2, a5, 8 + li a5, 1000 + add a3, a4, a2 + bge zero, a5, label259 .p2align 2 label36: - mv a4, a0 - mv t0, zero - sub t3, zero, zero - li t4, 625 - slli t1, t3, 7 - slli t3, t4, 4 - add t2, a5, t1 - bge zero, t3, label271 + mv a2, a0 + mv a4, zero + sub t1, zero, zero + li t2, 625 + slli a5, t1, 7 + slli t1, t2, 4 + add t0, a3, a5 + bge zero, t1, label271 .p2align 2 label41: - mv t1, a4 - mv t3, zero - sub t6, zero, zero - ld t4, 112(sp) - slli a6, t6, 5 - add t5, t2, a6 - bge zero, t4, label277 + mv a5, a2 + mv t1, zero + lui t6, 24 + sub t5, zero, zero + addiw t3, t6, 1696 + slli t4, t5, 5 + add t2, t0, t4 + bge zero, t3, label277 .p2align 2 label49: - mv t4, t1 - mv t6, zero - ld a6, 120(sp) - bge zero, a6, label287 + mv t3, a5 + mv t4, zero + lui t6, 244 + addiw t5, t6, 576 + bge zero, t5, label287 .p2align 2 label57: - slli s0, t6, 3 - ld s3, 160(sp) - sub a7, s0, t6 - sh2add a6, a7, t5 + slli t6, t4, 3 li a7, 3 - ble s3, a7, label299 - lw s0, 0(t4) - lw s4, 8(a6) - lw s3, 12(t4) - lw s1, 0(a6) - lw s6, 12(a6) - addw a7, s0, s1 - lw s2, 4(t4) - lw s8, 4(a6) - lw s7, 8(t4) - addw s0, s2, s8 - addw s1, s4, s7 - addw s2, s3, s6 + ld s2, 152(sp) + sub a6, t6, t4 + sh2add t5, a6, t2 + ble s2, a7, label299 + lw a6, 0(t3) + lw s1, 8(t5) + lw s2, 12(t3) + lw a7, 0(t5) + lw s4, 12(t5) + addw t6, a6, a7 + lw a7, 4(t3) + lw s0, 4(t5) + lw s5, 8(t3) + addw a6, a7, s0 + addw s0, s2, s4 + addw a7, s1, s5 .p2align 2 label59: - lui s6, 657125 - addiw s3, s6, -1067 - mul s7, s5, s3 - srli s4, s7, 32 - add s6, s4, s5 - li 
s4, 817 - srliw s10, s6, 31 - sraiw s8, s6, 9 - add s7, s10, s8 - mulw s9, s7, s4 + lui s4, 657125 + addiw s1, s4, -1067 + mul s2, s3, s1 + srli s6, s2, 32 + li s2, 817 + add s4, s6, s3 + srliw s8, s4, 31 + sraiw s5, s4, 9 + add s6, s8, s5 + mulw s7, s6, s2 + subw s5, s3, s7 + addw s4, t6, s5 + mul s6, s4, s1 + srli s7, s6, 32 + add s3, s7, s4 + srliw s8, s3, 31 + sraiw s5, s3, 9 + add s7, s8, s5 + mulw s6, s7, s2 + subw s3, s4, s6 + addw s5, a6, s3 + mul s7, s5, s1 + srli s6, s7, 32 + add s3, s6, s5 + srliw s7, s3, 31 + sraiw s4, s3, 9 + add s6, s7, s4 + mulw s9, s6, s2 subw s8, s5, s9 - addw s6, a7, s8 - mul s10, s6, s3 - srli s9, s10, 32 - add s5, s9, s6 - srliw s11, s5, 31 - sraiw s7, s5, 9 - add s8, s11, s7 - mulw s9, s8, s4 - subw s10, s6, s9 - addw s5, s0, s10 - mul s8, s5, s3 - srli s9, s8, 32 - add s6, s9, s5 - srliw s10, s6, 31 - sraiw s7, s6, 9 - add s8, s10, s7 - mulw s11, s8, s4 - subw s9, s5, s11 - addw s6, s1, s9 - mul s7, s6, s3 - srli s8, s7, 32 - add s5, s8, s6 - srliw s10, s5, 31 - sraiw s3, s5, 9 - add s7, s10, s3 - ld s3, 160(sp) - mulw s8, s7, s4 - li s4, 7 - subw s9, s6, s8 - addw s5, s2, s9 - bgt s3, s4, label59 + addw s3, a7, s8 + mul s6, s3, s1 + srli s5, s6, 32 + add s4, s5, s3 + srliw s7, s4, 31 + sraiw s1, s4, 9 + add s5, s7, s1 + li s1, 7 + mulw s6, s5, s2 + ld s2, 152(sp) + subw s4, s3, s6 + addw s3, s0, s4 + bgt s2, s1, label59 li a7, 4 - mv s1, s5 - mv s0, a7 - ble s3, a7, label335 + mv a6, a7 + mv a7, s3 + ble s2, a6, label335 .p2align 2 label65: - sh2add a7, s0, t4 + sh2add t6, a6, t3 j label66 .p2align 2 label70: - addi a7, a7, 4 + addi t6, t6, 4 .p2align 2 label66: - lw s2, 0(a7) - lui s3, 657125 - li s7, 817 - addiw s4, s3, -1067 - mul s6, s1, s4 - srli s5, s6, 32 - add s3, s5, s1 - srliw s6, s3, 31 - sraiw s4, s3, 9 - add s5, s6, s4 - mulw s3, s5, s7 - sh2add s5, s0, a6 - subw s6, s1, s3 - addiw s0, s0, 1 - addw s4, s2, s6 - lw s7, 0(s5) - ld s3, 160(sp) - addw s1, s4, s7 - bgt s3, s0, label70 - addiw t6, t6, 1 - ld a6, 176(sp) - ble a6, t6, label797 + lw s0, 0(t6) + lui s1, 657125 + li s6, 817 + addiw s2, s1, -1067 + mul s3, a7, s2 + srli s4, s3, 32 + add s1, s4, a7 + srliw s5, s1, 31 + sraiw s2, s1, 9 + add s3, s5, s2 + sh2add s2, a6, t5 + addiw a6, a6, 1 + mulw s4, s3, s6 + lw s3, 0(s2) + subw s5, a7, s4 + ld s2, 152(sp) + addw s1, s0, s5 + addw a7, s1, s3 + bgt s2, a6, label70 + addiw t4, t4, 1 + ld t5, 168(sp) + ble t5, t4, label793 .p2align 2 label73: - addi t4, t4, 8 - mv s5, s1 - ld a6, 120(sp) - blt t6, a6, label57 + addi t3, t3, 8 + mv s3, a7 + lui t6, 244 + addiw t5, t6, 576 + blt t4, t5, label57 label287: - mv s1, s5 - addiw t3, t3, 1 - ld t4, 192(sp) - bgt t4, t3, label56 - addiw t0, t0, 1 - ld t1, 208(sp) - bgt t1, t0, label48 - addiw a3, a3, 1 - ld a4, 200(sp) - bgt a4, a3, label76 + mv a7, s3 + addiw t1, t1, 1 + ld t2, 184(sp) + bgt t2, t1, label56 + addiw a4, a4, 1 + ld a5, 200(sp) + bgt a5, a4, label48 + addiw a1, a1, 1 + ld a2, 192(sp) + bgt a2, a1, label76 + ld a1, 264(sp) + ld a0, 176(sp) addiw a1, a1, 1 - ld a0, 184(sp) bgt a0, a1, label35 j label77 .p2align 2 -label797: - addiw t3, t3, 1 - ld t4, 192(sp) - ble t4, t3, label810 +label793: + addiw t1, t1, 1 + ld t2, 184(sp) + ble t2, t1, label806 .p2align 2 label56: - addi t1, t1, 48 - mv s5, s1 - slli t4, t3, 3 - sub t6, t4, t3 - ld t4, 112(sp) - slli a6, t6, 5 - add t5, t2, a6 - blt t3, t4, label49 + addi a5, a5, 48 + mv s3, a7 + slli t3, t1, 3 + lui t6, 24 + sub t5, t3, t1 + addiw t3, t6, 1696 + slli t4, t5, 5 + add t2, t0, t4 + blt t1, t3, label49 label277: - mv s1, s5 - addiw 
t0, t0, 1 - ld t1, 208(sp) - bgt t1, t0, label48 - addiw a3, a3, 1 - ld a4, 200(sp) - bgt a4, a3, label76 + mv a7, s3 + addiw a4, a4, 1 + ld a5, 200(sp) + bgt a5, a4, label48 + addiw a1, a1, 1 + ld a2, 192(sp) + bgt a2, a1, label76 + ld a1, 264(sp) + ld a0, 176(sp) addiw a1, a1, 1 - ld a0, 184(sp) bgt a0, a1, label35 j label77 label299: - mv s0, zero - mv s1, s5 - ld s3, 160(sp) - mv s5, zero - bgt s3, zero, label65 + mv a6, zero + mv a7, s3 + ld s2, 152(sp) + mv s3, zero + bgt s2, zero, label65 .p2align 2 label335: - mv s1, s5 - addiw t6, t6, 1 - ld a6, 176(sp) - bgt a6, t6, label73 - addiw t3, t3, 1 - ld t4, 192(sp) - bgt t4, t3, label56 - addiw t0, t0, 1 - ld t1, 208(sp) - bgt t1, t0, label48 - addiw a3, a3, 1 - ld a4, 200(sp) - bgt a4, a3, label76 + mv a7, s3 + addiw t4, t4, 1 + ld t5, 168(sp) + bgt t5, t4, label73 + addiw t1, t1, 1 + ld t2, 184(sp) + bgt t2, t1, label56 + addiw a4, a4, 1 + ld a5, 200(sp) + bgt a5, a4, label48 + addiw a1, a1, 1 + ld a2, 192(sp) + bgt a2, a1, label76 + ld a1, 264(sp) + ld a0, 176(sp) addiw a1, a1, 1 - ld a0, 184(sp) bgt a0, a1, label35 j label77 .p2align 2 -label810: - addiw t0, t0, 1 - ld t1, 208(sp) - ble t1, t0, label817 +label806: + addiw a4, a4, 1 + ld a5, 200(sp) + ble a5, a4, label813 .p2align 2 label48: - addi a4, a4, 240 - mv s5, s1 - slli t4, t0, 3 - sub t3, t4, t0 - li t4, 625 - slli t1, t3, 7 - slli t3, t4, 4 - add t2, a5, t1 - blt t0, t3, label41 + addi a2, a2, 240 + mv s3, a7 + slli t2, a4, 3 + sub t1, t2, a4 + li t2, 625 + slli a5, t1, 7 + slli t1, t2, 4 + add t0, a3, a5 + blt a4, t1, label41 label271: - mv s1, s5 - addiw a3, a3, 1 - ld a4, 200(sp) - ble a4, a3, label361 + mv a7, s3 + addiw a1, a1, 1 + ld a2, 192(sp) + ble a2, a1, label361 label76: addi a0, a0, 960 - mv s5, s1 - slli a4, a3, 3 - sub t1, a4, a3 - li a4, 1000 - slli t0, t1, 8 - add a5, a2, t0 - blt a3, a4, label36 + mv s3, a7 + slli a3, a1, 3 + ld a4, 304(sp) + sub a5, a3, a1 + slli a2, a5, 8 + li a5, 1000 + add a3, a4, a2 + blt a1, a5, label36 label259: - mv s1, s5 + mv a7, s3 + ld a1, 264(sp) + ld a0, 176(sp) addiw a1, a1, 1 - ld a0, 184(sp) bgt a0, a1, label35 j label77 .p2align 2 -label817: - addiw a3, a3, 1 - ld a4, 200(sp) - bgt a4, a3, label76 +label813: + addiw a1, a1, 1 + ld a2, 192(sp) + bgt a2, a1, label76 + ld a1, 264(sp) + ld a0, 176(sp) addiw a1, a1, 1 - ld a0, 184(sp) ble a0, a1, label77 label35: - ld a0, 144(sp) - mv s5, s1 - ld a4, 320(sp) - add a4, a4, a0 - sd a4, 320(sp) + ld a0, 136(sp) + mv s3, a7 + ld a2, 232(sp) + add a2, a2, a0 + sd a2, 232(sp) + sd a1, 264(sp) j label24 label77: - ld a1, 256(sp) - ld a0, 168(sp) + ld a1, 104(sp) + ld a0, 160(sp) addiw a1, a1, 1 bgt a0, a1, label79 - mv a0, s1 + mv a0, a7 label80: ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) - ld s2, 24(sp) + ld s3, 24(sp) ld s1, 32(sp) ld s6, 40(sp) - ld s3, 48(sp) + ld s2, 48(sp) ld s4, 56(sp) ld s8, 64(sp) ld s7, 72(sp) - ld s10, 80(sp) - ld s9, 88(sp) + ld s9, 80(sp) + ld s10, 88(sp) ld s11, 96(sp) - addi sp, sp, 344 + addi sp, sp, 328 ret label361: + ld a1, 264(sp) + ld a0, 176(sp) addiw a1, a1, 1 - ld a0, 184(sp) bgt a0, a1, label35 j label77 label253: - mv s1, s5 + mv a7, s3 j label77 label79: - ld a0, 136(sp) - mv s5, s1 - ld a4, 232(sp) - add a4, a4, a0 - sd a4, 232(sp) - sd a1, 256(sp) + li a3, 45 + mv s3, a7 + ld a2, 112(sp) + slli a0, a3, 7 + add a2, a2, a0 + sd a2, 112(sp) + sd a1, 104(sp) j label19 label247: - mv a0, s5 + mv a0, s3 j label80 diff --git a/tests/SysY2022/functional/95_float.riscv.s b/tests/SysY2022/functional/95_float.riscv.s index bdc753945..0ca4dc0fa 100644 
--- a/tests/SysY2022/functional/95_float.riscv.s +++ b/tests/SysY2022/functional/95_float.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1078530011 .text diff --git a/tests/SysY2022/hidden_functional/09_BFS.arm.s b/tests/SysY2022/hidden_functional/09_BFS.arm.s index f495d9038..6db120828 100644 --- a/tests/SysY2022/hidden_functional/09_BFS.arm.s +++ b/tests/SysY2022/hidden_functional/09_BFS.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 to: .zero 20020 -.align 8 +.p2align 3 next: .zero 20020 -.align 8 +.p2align 3 head: .zero 4020 -.align 8 +.p2align 3 que: .zero 4020 -.align 8 +.p2align 3 inq: .zero 4020 .text diff --git a/tests/SysY2022/hidden_functional/09_BFS.riscv.s b/tests/SysY2022/hidden_functional/09_BFS.riscv.s index 5db3988de..1a5b38510 100644 --- a/tests/SysY2022/hidden_functional/09_BFS.riscv.s +++ b/tests/SysY2022/hidden_functional/09_BFS.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 to: .zero 20020 -.align 8 +.p2align 3 next: .zero 20020 -.align 8 +.p2align 3 head: .zero 4020 -.align 8 +.p2align 3 que: .zero 4020 -.align 8 +.p2align 3 inq: .zero 4020 .text @@ -25,102 +25,104 @@ main: sd ra, 0(sp) sd s0, 8(sp) sd s5, 16(sp) - sd s4, 24(sp) + sd s3, 24(sp) sd s1, 32(sp) sd s6, 40(sp) sd s2, 48(sp) - sd s3, 56(sp) + sd s4, 56(sp) sd s7, 64(sp) sd s8, 72(sp) sd s9, 80(sp) sd s10, 88(sp) sd s11, 96(sp) jal getch - li s6, 960 + li s4, 45 + li s2, 1 li s1, -1 +pcrel938: + auipc a3, %pcrel_hi(que) + li s3, 9 addiw a1, a0, -48 - li s2, 1 -pcrel944: - auipc a2, %pcrel_hi(que) - li s3, 10 -pcrel945: - auipc a3, %pcrel_hi(to) - li s4, 9 - addi t1, a2, %pcrel_lo(pcrel944) - addi t0, a3, %pcrel_lo(pcrel945) -pcrel946: - auipc a2, %pcrel_hi(next) -pcrel947: - auipc a3, %pcrel_hi(inq) +pcrel939: + auipc a2, %pcrel_hi(to) + addi t1, a3, %pcrel_lo(pcrel938) + addi t0, a2, %pcrel_lo(pcrel939) +pcrel940: + auipc a3, %pcrel_hi(next) sd t1, 104(sp) - addi a5, a2, %pcrel_lo(pcrel946) - addi s0, a3, %pcrel_lo(pcrel947) +pcrel941: + auipc a2, %pcrel_hi(inq) + addi a5, a3, %pcrel_lo(pcrel940) + addi s0, a2, %pcrel_lo(pcrel941) sd t0, 112(sp) sd a5, 120(sp) - bleu a1, s4, label3 + bleu a1, s3, label3 .p2align 2 label2: jal getch addiw a1, a0, -48 - bgtu a1, s4, label2 + bgtu a1, s3, label2 label3: addiw a1, a0, -48 - bgeu a1, s3, label6 + li a2, 10 + bgeu a1, a2, label6 .p2align 2 label5: jal getch + li a2, 10 addiw a1, a0, -48 - bltu a1, s3, label5 + bltu a1, a2, label5 label6: jal getch addiw a1, a0, -48 - bleu a1, s4, label137 + bleu a1, s3, label137 mv s5, a0 - mv s7, zero + mv s6, zero j label7 .p2align 2 -label853: - bleu a3, s4, label145 +label847: + bleu a2, s3, label145 mv s5, a0 - mv s7, a2 + mv s6, a1 .p2align 2 label7: jal getch - li a1, 45 - li a2, 1 - addiw a3, a0, -48 - beq s5, a1, label853 - mv a2, s7 - j label853 + li a1, 1 + addiw a2, a0, -48 + beq s5, s4, label847 + mv a1, s6 + j label847 label145: - mv s5, a2 + mv s5, a1 label10: - addiw a1, a0, -48 - bgeu a1, s3, label150 - mv s7, a0 - mv s8, zero + addiw a2, a0, -48 + li a1, 10 + bgeu a2, a1, label150 + mv s6, a0 + mv s7, zero j label13 .p2align 2 label160: - mv s7, a0 + mv s6, a0 .p2align 2 label13: jal getch - sh2add a3, s8, s8 + sh2add a4, s7, s7 addiw a1, a0, -48 - slliw a4, a3, 1 - addi a2, a4, -48 - addw s8, s7, a2 - bltu a1, s3, label160 + slliw a3, a4, 1 + li a4, 
10 + addi a2, a3, -48 + addw s7, s6, a2 + bltu a1, a4, label160 label16: - subw a0, zero, s8 - mv a1, a0 - bne s5, zero, label855 - mv a1, s8 -label855: - auipc a2, %pcrel_hi(head) - mv a3, zero - addi s5, a2, %pcrel_lo(label855) + subw a0, zero, s7 + mv s6, a0 + bne s5, zero, label849 + mv s6, s7 +label849: + auipc a1, %pcrel_hi(head) + mv a2, zero + addi s5, a1, %pcrel_lo(label849) mv a0, s5 j label18 .p2align 2 @@ -128,78 +130,78 @@ label21: addi a0, a0, 256 .p2align 2 label18: - li a5, -1 - addiw a3, a3, 64 - slli a4, a5, 32 - add.uw a2, s1, a4 - sd a2, 0(a0) - sd a2, 8(a0) - sd a2, 16(a0) - sd a2, 24(a0) - sd a2, 32(a0) - sd a2, 40(a0) - sd a2, 48(a0) - sd a2, 56(a0) - sd a2, 64(a0) - sd a2, 72(a0) - sd a2, 80(a0) - sd a2, 88(a0) - sd a2, 96(a0) - sd a2, 104(a0) - sd a2, 112(a0) - sd a2, 120(a0) - sd a2, 128(a0) - sd a2, 136(a0) - sd a2, 144(a0) - sd a2, 152(a0) - sd a2, 160(a0) - sd a2, 168(a0) - sd a2, 176(a0) - sd a2, 184(a0) - sd a2, 192(a0) - sd a2, 200(a0) - sd a2, 208(a0) - sd a2, 216(a0) - sd a2, 224(a0) - sd a2, 232(a0) - sd a2, 240(a0) - sd a2, 248(a0) - blt a3, s6, label21 li a4, -1 + addiw a2, a2, 64 slli a3, a4, 32 - add.uw a2, s1, a3 - sd a2, 256(a0) - sd a2, 264(a0) - sd a2, 272(a0) - sd a2, 280(a0) - sd a2, 288(a0) - sd a2, 296(a0) - sd a2, 304(a0) - sd a2, 312(a0) - sd a2, 320(a0) - sd a2, 328(a0) - sd a2, 336(a0) - sd a2, 344(a0) - sd a2, 352(a0) - sd a2, 360(a0) - sd a2, 368(a0) - sd a2, 376(a0) - sd a2, 384(a0) - sd a2, 392(a0) - sd a2, 400(a0) - sd a2, 408(a0) - sd a2, 416(a0) - sd a2, 424(a0) + add.uw a1, s1, a3 + li a3, 960 + sd a1, 0(a0) + sd a1, 8(a0) + sd a1, 16(a0) + sd a1, 24(a0) + sd a1, 32(a0) + sd a1, 40(a0) + sd a1, 48(a0) + sd a1, 56(a0) + sd a1, 64(a0) + sd a1, 72(a0) + sd a1, 80(a0) + sd a1, 88(a0) + sd a1, 96(a0) + sd a1, 104(a0) + sd a1, 112(a0) + sd a1, 120(a0) + sd a1, 128(a0) + sd a1, 136(a0) + sd a1, 144(a0) + sd a1, 152(a0) + sd a1, 160(a0) + sd a1, 168(a0) + sd a1, 176(a0) + sd a1, 184(a0) + sd a1, 192(a0) + sd a1, 200(a0) + sd a1, 208(a0) + sd a1, 216(a0) + sd a1, 224(a0) + sd a1, 232(a0) + sd a1, 240(a0) + sd a1, 248(a0) + blt a2, a3, label21 + li a3, -1 + slli a2, a3, 32 + add.uw a1, s1, a2 + sd a1, 256(a0) + sd a1, 264(a0) + sd a1, 272(a0) + sd a1, 280(a0) + sd a1, 288(a0) + sd a1, 296(a0) + sd a1, 304(a0) + sd a1, 312(a0) + sd a1, 320(a0) + sd a1, 328(a0) + sd a1, 336(a0) + sd a1, 344(a0) + sd a1, 352(a0) + sd a1, 360(a0) + sd a1, 368(a0) + sd a1, 376(a0) + sd a1, 384(a0) + sd a1, 392(a0) + sd a1, 400(a0) + sd a1, 408(a0) + sd a1, 416(a0) + sd a1, 424(a0) sw s1, 432(a0) - beq a1, zero, label102 - mv s6, a1 + beq s6, zero, label102 mv s7, zero j label23 .p2align 2 label54: mv a0, a1 jal putint - mv a0, s3 + li a0, 10 jal putch addiw s6, s6, -1 beq s6, zero, label102 @@ -207,65 +209,64 @@ label54: label23: jal getch xori a3, a0, 85 - xori a2, a0, 81 - sltu a4, zero, a3 - sltu a1, zero, a2 - and a2, a1, a4 - bne a2, zero, label23 + xori a4, a0, 81 + sltu a2, zero, a3 + sltu a1, zero, a4 + and a4, a1, a2 + bne a4, zero, label23 li a1, 81 beq a0, a1, label27 jal getch addiw a1, a0, -48 - bleu a1, s4, label454 + bleu a1, s3, label454 mv s8, a0 mv s9, zero j label80 .p2align 2 -label868: - mv a2, s9 +label862: + mv a1, s9 .p2align 2 -label869: - bleu a3, s4, label462 +label863: + bleu a2, s3, label462 mv s8, a0 - mv s9, a2 + mv s9, a1 .p2align 2 label80: jal getch - li a1, 45 - li a2, 1 - addiw a3, a0, -48 - beq s8, a1, label869 - j label868 + li a1, 1 + addiw a2, a0, -48 + beq s8, s4, label863 + j label862 .p2align 2 label27: jal getch addiw 
a1, a0, -48 - bleu a1, s4, label299 + bleu a1, s3, label299 mv s8, a0 mv s9, zero j label28 .p2align 2 -label858: - mv a2, s9 +label852: + mv a1, s9 .p2align 2 -label859: - bleu a3, s4, label307 +label853: + bleu a2, s3, label307 mv s8, a0 - mv s9, a2 + mv s9, a1 .p2align 2 label28: jal getch - li a1, 45 - li a2, 1 - addiw a3, a0, -48 - beq s8, a1, label859 - j label858 + li a1, 1 + addiw a2, a0, -48 + beq s8, s4, label853 + j label852 .p2align 2 label462: mv s8, a0 - mv s9, a2 + mv s9, a1 addiw a0, a0, -48 - bgeu a0, s3, label901 + li a1, 10 + bgeu a0, a1, label895 .p2align 2 label468: mv s10, zero @@ -276,45 +277,46 @@ label477: .p2align 2 label86: jal getch - sh2add a3, s10, s10 + sh2add a2, s10, s10 addiw a1, a0, -48 - slliw a2, a3, 1 - addi a4, a2, -48 + slliw a3, a2, 1 + li a2, 10 + addi a4, a3, -48 addw s10, s8, a4 - bltu a1, s3, label477 + bltu a1, a2, label477 .p2align 2 label89: jal getch subw a2, zero, s10 addiw a1, a0, -48 mv s8, a2 - bne s9, zero, label871 + bne s9, zero, label865 mv s8, s10 .p2align 2 -label871: - bleu a1, s4, label485 +label865: + bleu a1, s3, label485 mv s9, a0 mv s10, zero j label91 .p2align 2 -label873: - bleu a3, s4, label493 +label867: + bleu a2, s3, label493 mv s9, a0 - mv s10, a2 + mv s10, a1 .p2align 2 label91: jal getch - li a1, 45 - li a2, 1 - addiw a3, a0, -48 - beq s9, a1, label873 - mv a2, s10 - j label873 + li a1, 1 + addiw a2, a0, -48 + beq s9, s4, label867 + mv a1, s10 + j label867 .p2align 2 label493: - mv s9, a2 - addiw a1, a0, -48 - bgeu a1, s3, label902 + mv s9, a1 + addiw a2, a0, -48 + li a1, 10 + bgeu a2, a1, label896 .p2align 2 label499: mv s10, a0 @@ -326,45 +328,47 @@ label508: .p2align 2 label97: jal getch - sh2add a2, s11, s11 + sh2add a4, s11, s11 addiw a1, a0, -48 - slliw a4, a2, 1 - addi a3, a4, -48 - addw s11, s10, a3 - bltu a1, s3, label508 + slliw a3, a4, 1 + li a4, 10 + addi a2, a3, -48 + addw s11, s10, a2 + bltu a1, a4, label508 subw a1, zero, s11 mv a0, a1 - bne s9, zero, label875 + bne s9, zero, label869 .p2align 2 -label903: +label897: mv a0, s11 .p2align 2 -label875: +label869: ld t0, 112(sp) - sh2add a3, s8, s5 + sh2add a2, s8, s5 sh2add a4, a0, s5 addiw s6, s6, -1 sh2add a1, s7, t0 sw a0, 0(a1) addiw a0, s7, 1 - lw t0, 0(a3) + lw t0, 0(a2) ld a5, 120(sp) - sh2add a2, s7, a5 - sw t0, 0(a2) - sw s7, 0(a3) + sh2add a3, s7, a5 + sw t0, 0(a3) + sw s7, 0(a2) addiw s7, s7, 2 sw s8, 4(a1) lw a5, 0(a4) - sw a5, 4(a2) + sw a5, 4(a3) sw a0, 0(a4) bne s6, zero, label23 j label102 .p2align 2 label307: mv s8, a0 - mv s9, a2 + mv s9, a1 addiw a0, a0, -48 - bgeu a0, s3, label892 + li a1, 10 + bgeu a0, a1, label886 .p2align 2 label313: mv s10, zero @@ -375,45 +379,46 @@ label444: .p2align 2 label74: jal getch - sh2add a2, s10, s10 + sh2add a4, s10, s10 addiw a1, a0, -48 - slliw a4, a2, 1 - addi a3, a4, -48 - addw s10, s8, a3 - bltu a1, s3, label444 + slliw a3, a4, 1 + li a4, 10 + addi a2, a3, -48 + addw s10, s8, a2 + bltu a1, a4, label444 .p2align 2 label34: jal getch subw a2, zero, s10 addiw a1, a0, -48 mv s8, a2 - bne s9, zero, label861 + bne s9, zero, label855 mv s8, s10 .p2align 2 -label861: - bleu a1, s4, label321 +label855: + bleu a1, s3, label321 mv s9, a0 mv s10, zero j label36 .p2align 2 -label863: - bleu a3, s4, label329 +label857: + bleu a2, s3, label329 mv s9, a0 - mv s10, a2 + mv s10, a1 .p2align 2 label36: jal getch - li a1, 45 - li a2, 1 - addiw a3, a0, -48 - beq s9, a1, label863 - mv a2, s10 - j label863 + li a1, 1 + addiw a2, a0, -48 + beq s9, s4, label857 + mv a1, s10 + j label857 .p2align 2 label329: - mv s9, a2 - 
addiw a1, a0, -48 - bgeu a1, s3, label893 + mv s9, a1 + addiw a2, a0, -48 + li a1, 10 + bgeu a2, a1, label887 .p2align 2 label335: mv s10, a0 @@ -424,18 +429,19 @@ label42: sh2add a4, s11, s11 addiw a1, a0, -48 slliw a3, a4, 1 + li a4, 10 addi a2, a3, -48 addw s11, s10, a2 - bgeu a1, s3, label343 + bgeu a1, a4, label343 mv s10, a0 j label42 .p2align 2 label343: subw a1, zero, s11 mv a0, a1 - beq s9, zero, label894 + beq s9, zero, label888 .p2align 2 -label865: +label859: sh2add a1, s8, s0 mv a4, s2 mv a3, zero @@ -446,10 +452,10 @@ label865: li a1, 1 sh2add t0, a2, t1 lw a5, 0(t0) - beq a0, a5, label867 + beq a0, a5, label861 mv a1, zero .p2align 2 -label867: +label861: sh2add t0, a5, s5 lw a3, 0(t0) bne a3, s1, label68 @@ -470,7 +476,7 @@ label73: ld a5, 120(sp) sh2add t0, a3, a5 lw a3, 0(t0) - beq a3, s1, label900 + beq a3, s1, label894 .p2align 2 label68: ld t0, 112(sp) @@ -487,8 +493,8 @@ label68: bge a4, zero, label55 j label54 .p2align 2 -label900: - bge a2, a4, label906 +label894: + bge a2, a4, label900 .p2align 2 label367: mv a3, a1 @@ -497,7 +503,7 @@ label367: li a1, 1 sh2add t0, a2, t1 lw a5, 0(t0) - beq a0, a5, label867 + beq a0, a5, label861 mv a1, a3 sh2add t0, a5, s5 lw a3, 0(t0) @@ -505,7 +511,7 @@ label367: blt a2, a4, label367 j label366 .p2align 2 -label906: +label900: blt a4, zero, label54 .p2align 2 label55: @@ -520,15 +526,15 @@ label57: addiw a2, a2, 4 sh2add a5, a3, s0 sw zero, 0(a5) - lw t0, 4(t1) - sh2add t2, t0, s0 - sw zero, 0(t2) + lw t2, 4(t1) + sh2add t0, t2, s0 + sw zero, 0(t0) lw a3, 8(t1) - sh2add t0, a3, s0 + sh2add t2, a3, s0 addiw a3, a4, -2 - sw zero, 0(t0) - lw t2, 12(t1) - sh2add a5, t2, s0 + sw zero, 0(t2) + lw t0, 12(t1) + sh2add a5, t0, s0 sw zero, 0(a5) bge a2, a3, label396 addi t1, t1, 16 @@ -551,7 +557,7 @@ label64: addi a2, a2, 4 j label64 .p2align 2 -label894: +label888: mv a0, s11 sh2add a1, s8, s0 mv a4, s2 @@ -563,7 +569,7 @@ label894: sh2add t0, a2, t1 sw s8, 4(t1) lw a5, 0(t0) - beq s11, a5, label867 + beq s11, a5, label861 mv a1, zero sh2add t0, a5, s5 lw a3, 0(t0) @@ -578,11 +584,11 @@ label102: ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) - ld s4, 24(sp) + ld s3, 24(sp) ld s1, 32(sp) ld s6, 40(sp) ld s2, 48(sp) - ld s3, 56(sp) + ld s4, 56(sp) ld s7, 64(sp) ld s8, 72(sp) ld s9, 80(sp) @@ -592,43 +598,47 @@ label102: ret label485: mv s9, zero - addiw a1, a0, -48 - bltu a1, s3, label499 -label902: + addiw a2, a0, -48 + li a1, 10 + bltu a2, a1, label499 +label896: mv s11, zero mv a1, zero mv a0, zero - bne s9, zero, label875 - j label903 + bne s9, zero, label869 + j label897 label454: mv s8, a0 mv s9, zero addiw a0, a0, -48 - bltu a0, s3, label468 -label901: + li a1, 10 + bltu a0, a1, label468 +label895: mv s10, zero j label89 label321: mv s9, zero - addiw a1, a0, -48 - bltu a1, s3, label335 -label893: + addiw a2, a0, -48 + li a1, 10 + bltu a2, a1, label335 +label887: mv s11, zero mv a1, zero mv a0, zero - bne s9, zero, label865 - j label865 + bne s9, zero, label859 + j label859 label299: mv s8, a0 mv s9, zero addiw a0, a0, -48 - bltu a0, s3, label313 -label892: + li a1, 10 + bltu a0, a1, label313 +label886: mv s10, zero j label34 label137: mv s5, zero j label10 label150: - mv s8, zero + mv s7, zero j label16 diff --git a/tests/SysY2022/hidden_functional/10_DFS.arm.s b/tests/SysY2022/hidden_functional/10_DFS.arm.s index 05ffb8e2c..91e747497 100644 --- a/tests/SysY2022/hidden_functional/10_DFS.arm.s +++ b/tests/SysY2022/hidden_functional/10_DFS.arm.s @@ -1,16 +1,16 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 to: .zero 20020 -.align 8 
+.p2align 3 next: .zero 20020 -.align 8 +.p2align 3 head: .zero 4020 -.align 8 +.p2align 3 vis: .zero 4020 .text diff --git a/tests/SysY2022/hidden_functional/10_DFS.riscv.s b/tests/SysY2022/hidden_functional/10_DFS.riscv.s index a19e0483b..6445ebfe2 100644 --- a/tests/SysY2022/hidden_functional/10_DFS.riscv.s +++ b/tests/SysY2022/hidden_functional/10_DFS.riscv.s @@ -1,16 +1,16 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 to: .zero 20020 -.align 8 +.p2align 3 next: .zero 20020 -.align 8 +.p2align 3 head: .zero 4020 -.align 8 +.p2align 3 vis: .zero 4020 .text @@ -237,34 +237,30 @@ main: sd ra, 0(sp) sd s0, 8(sp) sd s5, 16(sp) - sd s6, 24(sp) - sd s1, 32(sp) - sd s7, 40(sp) - sd s8, 48(sp) - sd s2, 56(sp) - sd s3, 64(sp) - sd s4, 72(sp) - sd s9, 80(sp) + sd s1, 24(sp) + sd s6, 32(sp) + sd s2, 40(sp) + sd s3, 48(sp) + sd s4, 56(sp) + sd s7, 64(sp) + sd s9, 72(sp) + sd s8, 80(sp) sd s10, 88(sp) sd s11, 96(sp) jal getch - addiw a1, a0, -48 -pcrel1300: - auipc a2, %pcrel_hi(vis) -pcrel1301: +pcrel1293: + auipc a3, %pcrel_hi(next) +pcrel1294: auipc a5, %pcrel_hi(to) li s0, 10 - li s7, 960 -pcrel1302: - auipc a3, %pcrel_hi(next) - li s6, -1 - addi a4, a2, %pcrel_lo(pcrel1300) - addi t0, a3, %pcrel_lo(pcrel1302) - addi a2, a5, %pcrel_lo(pcrel1301) +pcrel1295: + auipc a2, %pcrel_hi(vis) + addiw a1, a0, -48 + addi t0, a3, %pcrel_lo(pcrel1293) + addi a4, a2, %pcrel_lo(pcrel1295) + addi a2, a5, %pcrel_lo(pcrel1294) sd a4, 104(sp) - li a3, -1 sd a2, 112(sp) - slli s8, a3, 32 li a2, 9 sd t0, 120(sp) bleu a1, a2, label492 @@ -272,10 +268,10 @@ pcrel1302: mv s2, zero j label384 .p2align 2 -label1212: +label1203: mv a1, s2 .p2align 2 -label1213: +label1204: li a2, 9 bleu a3, a2, label500 mv s1, a0 @@ -286,8 +282,8 @@ label384: li a2, 45 li a1, 1 addiw a3, a0, -48 - beq s1, a2, label1213 - j label1212 + beq s1, a2, label1204 + j label1203 label500: mv s1, a1 label387: @@ -298,11 +294,11 @@ label387: .p2align 2 label390: jal getch - sh2add a3, s3, s3 + sh2add a2, s3, s3 addiw a1, a0, -48 - slliw a4, a3, 1 - addi a2, a4, -48 - addw s3, s2, a2 + slliw a3, a2, 1 + addi a4, a3, -48 + addw s3, s2, a4 bgeu a1, s0, label393 mv s2, a0 j label390 @@ -311,63 +307,63 @@ label393: subw a2, zero, s3 addiw a1, a0, -48 mv s4, a2 - bne s1, zero, label1215 + bne s1, zero, label1206 mv s4, s3 -label1215: +label1206: addiw s2, s4, 1 addiw s3, s4, -2 addiw s1, s4, -17 li a2, 9 bleu a1, a2, label526 mv s5, a0 - mv s9, zero + mv s6, zero j label395 .p2align 2 -label1216: - mv a1, s9 +label1207: + mv a1, s6 .p2align 2 -label1217: +label1208: li a2, 9 bleu a3, a2, label534 mv s5, a0 - mv s9, a1 + mv s6, a1 .p2align 2 label395: jal getch li a2, 45 li a1, 1 addiw a3, a0, -48 - beq s5, a2, label1217 - j label1216 + beq s5, a2, label1208 + j label1207 label534: mv s5, a1 label398: addiw a2, a0, -48 bgeu a2, s0, label539 - mv s9, a0 - mv s10, zero + mv s6, a0 + mv s7, zero j label401 .p2align 2 label549: - mv s9, a0 + mv s6, a0 .p2align 2 label401: jal getch - sh2add a4, s10, s10 + sh2add a4, s7, s7 addiw a1, a0, -48 slliw a2, a4, 1 addi a3, a2, -48 - addw s10, s9, a3 + addw s7, s6, a3 bltu a1, s0, label549 label404: - subw a0, zero, s10 - mv a1, a0 - bne s5, zero, label1219 - mv a1, s10 -label1219: - auipc a2, %pcrel_hi(head) - mv a3, zero - addi s5, a2, %pcrel_lo(label1219) + subw a0, zero, s7 + mv s6, a0 + bne s5, zero, label1210 + mv s6, s7 +label1210: + auipc a1, %pcrel_hi(head) + mv a2, zero + addi s5, a1, %pcrel_lo(label1210) mv a0, s5 j label406 
.p2align 2 @@ -375,116 +371,123 @@ label409: addi a0, a0, 256 .p2align 2 label406: - add.uw a2, s6, s8 - addiw a3, a3, 64 - sd a2, 0(a0) - sd a2, 8(a0) - sd a2, 16(a0) - sd a2, 24(a0) - sd a2, 32(a0) - sd a2, 40(a0) - sd a2, 48(a0) - sd a2, 56(a0) - sd a2, 64(a0) - sd a2, 72(a0) - sd a2, 80(a0) - sd a2, 88(a0) - sd a2, 96(a0) - sd a2, 104(a0) - sd a2, 112(a0) - sd a2, 120(a0) - sd a2, 128(a0) - sd a2, 136(a0) - sd a2, 144(a0) - sd a2, 152(a0) - sd a2, 160(a0) - sd a2, 168(a0) - sd a2, 176(a0) - sd a2, 184(a0) - sd a2, 192(a0) - sd a2, 200(a0) - sd a2, 208(a0) - sd a2, 216(a0) - sd a2, 224(a0) - sd a2, 232(a0) - sd a2, 240(a0) - sd a2, 248(a0) - blt a3, s7, label409 - sd a2, 256(a0) - sd a2, 264(a0) - sd a2, 272(a0) - sd a2, 280(a0) - sd a2, 288(a0) - sd a2, 296(a0) - sd a2, 304(a0) - sd a2, 312(a0) - sd a2, 320(a0) - sd a2, 328(a0) - sd a2, 336(a0) - sd a2, 344(a0) - sd a2, 352(a0) - sd a2, 360(a0) - sd a2, 368(a0) - sd a2, 376(a0) - sd a2, 384(a0) - sd a2, 392(a0) - sd a2, 400(a0) - sd a2, 408(a0) - sd a2, 416(a0) - sd a2, 424(a0) - sw s6, 432(a0) - beq a1, zero, label411 + li a3, -1 + li a5, -1 + addiw a2, a2, 64 + slli a4, a5, 32 + add.uw a1, a3, a4 + sd a1, 0(a0) + li a3, 960 + sd a1, 8(a0) + sd a1, 16(a0) + sd a1, 24(a0) + sd a1, 32(a0) + sd a1, 40(a0) + sd a1, 48(a0) + sd a1, 56(a0) + sd a1, 64(a0) + sd a1, 72(a0) + sd a1, 80(a0) + sd a1, 88(a0) + sd a1, 96(a0) + sd a1, 104(a0) + sd a1, 112(a0) + sd a1, 120(a0) + sd a1, 128(a0) + sd a1, 136(a0) + sd a1, 144(a0) + sd a1, 152(a0) + sd a1, 160(a0) + sd a1, 168(a0) + sd a1, 176(a0) + sd a1, 184(a0) + sd a1, 192(a0) + sd a1, 200(a0) + sd a1, 208(a0) + sd a1, 216(a0) + sd a1, 224(a0) + sd a1, 232(a0) + sd a1, 240(a0) + sd a1, 248(a0) + blt a2, a3, label409 + li a3, -1 + li a4, -1 + slli a2, a4, 32 + add.uw a1, a3, a2 + sd a1, 256(a0) + sd a1, 264(a0) + sd a1, 272(a0) + sd a1, 280(a0) + sd a1, 288(a0) + sd a1, 296(a0) + sd a1, 304(a0) + sd a1, 312(a0) + sd a1, 320(a0) + sd a1, 328(a0) + sd a1, 336(a0) + sd a1, 344(a0) + sd a1, 352(a0) + sd a1, 360(a0) + sd a1, 368(a0) + sd a1, 376(a0) + sd a1, 384(a0) + sd a1, 392(a0) + sd a1, 400(a0) + sd a1, 408(a0) + sd a1, 416(a0) + sd a1, 424(a0) + sw a3, 432(a0) + beq s6, zero, label411 mv s7, zero - mv s6, a1 j label412 .p2align 2 label883: subw a1, zero, s11 mv a0, a1 - bne s9, zero, label1233 + bne s9, zero, label1226 .p2align 2 -label1232: +label1225: mv a0, s11 .p2align 2 -label1233: +label1226: ld a2, 112(sp) sh2add a4, s8, s5 - sh2add a3, a0, s5 addiw s6, s6, -1 sh2add a1, s7, a2 + sh2add a2, a0, s5 sw a0, 0(a1) addiw a0, s7, 1 lw a5, 0(a4) ld t0, 120(sp) - sh2add a2, s7, t0 - sw a5, 0(a2) + sh2add a3, s7, t0 + sw a5, 0(a3) sw s7, 0(a4) addiw s7, s7, 2 sw s8, 4(a1) - lw a5, 0(a3) - sw a5, 4(a2) - sw a0, 0(a3) + lw a5, 0(a2) + sw a5, 4(a3) + sw a0, 0(a2) beq s6, zero, label411 .p2align 2 label412: jal getch - xori a4, a0, 85 - xori a2, a0, 81 - sltu a3, zero, a4 - sltu a1, zero, a2 - and a2, a1, a3 - bne a2, zero, label412 + xori a2, a0, 85 + xori a3, a0, 81 + sltu a4, zero, a2 + sltu a1, zero, a3 + and a3, a1, a4 + bne a3, zero, label412 li a1, 81 beq a0, a1, label416 jal getch - addiw a1, a0, -48 li a2, 9 + addiw a1, a0, -48 mv s8, a0 bleu a1, a2, label817 mv s9, zero j label484 .p2align 2 -label1235: +label1228: li a2, 9 bleu a3, a2, label891 mv s8, a0 @@ -495,9 +498,9 @@ label484: li a2, 45 li a1, 1 addiw a3, a0, -48 - beq s8, a2, label1235 + beq s8, a2, label1228 mv a1, s9 - j label1235 + j label1228 .p2align 2 label416: jal getch @@ -508,7 +511,7 @@ label416: mv s9, zero j label417 
.p2align 2 -label1221: +label1214: li a2, 9 bleu a3, a2, label696 mv s8, a0 @@ -516,12 +519,12 @@ label1221: .p2align 2 label417: jal getch - li a1, 1 li a2, 45 + li a1, 1 addiw a3, a0, -48 - beq s8, a2, label1221 + beq s8, a2, label1214 mv a1, s9 - j label1221 + j label1214 .p2align 2 label891: mv s8, a0 @@ -538,11 +541,11 @@ label832: .p2align 2 label468: jal getch - sh2add a2, s10, s10 + sh2add a4, s10, s10 addiw a1, a0, -48 - slliw a4, a2, 1 - addi a3, a4, -48 - addw s10, s8, a3 + slliw a3, a4, 1 + addi a2, a3, -48 + addw s10, s8, a2 bltu a1, s0, label832 .p2align 2 label471: @@ -550,17 +553,17 @@ label471: subw a2, zero, s10 addiw a1, a0, -48 mv s8, a2 - bne s9, zero, label1229 + bne s9, zero, label1222 mv s8, s10 .p2align 2 -label1229: +label1222: li a2, 9 bleu a1, a2, label840 mv s9, a0 mv s10, zero j label473 .p2align 2 -label1231: +label1224: li a2, 9 bleu a3, a2, label848 mv s9, a0 @@ -571,14 +574,14 @@ label473: li a2, 45 li a1, 1 addiw a3, a0, -48 - beq s9, a2, label1231 + beq s9, a2, label1224 mv a1, s10 - j label1231 + j label1224 .p2align 2 label848: mv s9, a1 addiw a2, a0, -48 - bgeu a2, s0, label1259 + bgeu a2, s0, label1252 .p2align 2 label854: mv s10, a0 @@ -586,11 +589,11 @@ label854: .p2align 2 label481: jal getch - sh2add a3, s11, s11 + sh2add a2, s11, s11 addiw a1, a0, -48 - slliw a2, a3, 1 - addi a4, a2, -48 - addw s11, s10, a4 + slliw a4, a2, 1 + addi a3, a4, -48 + addw s11, s10, a3 bgeu a1, s0, label883 mv s10, a0 j label481 @@ -598,7 +601,7 @@ label481: label696: mv s9, a1 addiw a2, a0, -48 - bgeu a2, s0, label1252 + bgeu a2, s0, label1245 .p2align 2 label702: mv s8, a0 @@ -610,11 +613,11 @@ label807: .p2align 2 label459: jal getch - sh2add a2, s10, s10 + sh2add a4, s10, s10 addiw a1, a0, -48 - slliw a4, a2, 1 - addi a3, a4, -48 - addw s10, s8, a3 + slliw a3, a4, 1 + addi a2, a3, -48 + addw s10, s8, a2 bltu a1, s0, label807 .p2align 2 label423: @@ -622,17 +625,17 @@ label423: subw a2, zero, s10 addiw a1, a0, -48 mv s8, a2 - bne s9, zero, label1223 + bne s9, zero, label1216 mv s8, s10 .p2align 2 -label1223: +label1216: li a2, 9 bleu a1, a2, label710 mv s9, a0 mv s10, zero j label456 .p2align 2 -label1227: +label1220: li a2, 9 bleu a3, a2, label797 mv s9, a0 @@ -643,9 +646,9 @@ label456: li a2, 45 li a1, 1 addiw a3, a0, -48 - beq s9, a2, label1227 + beq s9, a2, label1220 mv a1, s10 - j label1227 + j label1220 .p2align 2 label797: mv s9, a1 @@ -658,11 +661,11 @@ label716: .p2align 2 label428: jal getch - sh2add a3, s11, s11 + sh2add a2, s11, s11 addiw a1, a0, -48 - slliw a4, a3, 1 - addi a2, a4, -48 - addw s11, s10, a2 + slliw a4, a2, 1 + addi a3, a4, -48 + addw s11, s10, a3 bgeu a1, s0, label724 mv s10, a0 j label428 @@ -670,9 +673,9 @@ label715: mv s11, zero mv a0, zero mv a1, zero - bne s9, zero, label1225 + bne s9, zero, label1218 .p2align 2 -label1225: +label1218: ble s4, zero, label455 .p2align 2 label433: @@ -708,7 +711,7 @@ label444: sw zero, 60(a0) bgt s1, a2, label447 mv a3, a2 - ble s3, a2, label1256 + ble s3, a2, label1249 .p2align 2 label438: ld a4, 104(sp) @@ -754,7 +757,7 @@ label455: label724: subw a0, zero, s11 mv a1, a0 - bne s9, zero, label1225 + bne s9, zero, label1218 mv a1, s11 bgt s4, zero, label433 j label455 @@ -763,7 +766,7 @@ label735: bgt s2, a2, label450 j label455 .p2align 2 -label1256: +label1249: bgt s2, a2, label450 j label455 label739: @@ -776,17 +779,17 @@ label840: mv s9, zero addiw a2, a0, -48 bltu a2, s0, label854 -label1259: +label1252: mv s11, zero mv a1, zero mv a0, zero - bne s9, zero, label1233 - j label1232 + bne s9, zero, 
label1226 + j label1225 label688: mv s9, zero addiw a2, a0, -48 bltu a2, s0, label702 -label1252: +label1245: mv s10, zero j label423 label411: @@ -794,14 +797,14 @@ label411: ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) - ld s6, 24(sp) - ld s1, 32(sp) - ld s7, 40(sp) - ld s8, 48(sp) - ld s2, 56(sp) - ld s3, 64(sp) - ld s4, 72(sp) - ld s9, 80(sp) + ld s1, 24(sp) + ld s6, 32(sp) + ld s2, 40(sp) + ld s3, 48(sp) + ld s4, 56(sp) + ld s7, 64(sp) + ld s9, 72(sp) + ld s8, 80(sp) ld s10, 88(sp) ld s11, 96(sp) addi sp, sp, 128 @@ -825,7 +828,7 @@ label505: mv s3, zero j label393 label539: - mv s10, zero + mv s7, zero j label404 label492: mv s1, zero diff --git a/tests/SysY2022/hidden_functional/11_BST.arm.s b/tests/SysY2022/hidden_functional/11_BST.arm.s index 44696d2ff..6bd2f9477 100644 --- a/tests/SysY2022/hidden_functional/11_BST.arm.s +++ b/tests/SysY2022/hidden_functional/11_BST.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 value: .zero 40000 -.align 8 +.p2align 3 left_child: .zero 40000 -.align 8 +.p2align 3 right_child: .zero 40000 .text diff --git a/tests/SysY2022/hidden_functional/11_BST.riscv.s b/tests/SysY2022/hidden_functional/11_BST.riscv.s index 91f1f7cc8..84b0a3683 100644 --- a/tests/SysY2022/hidden_functional/11_BST.riscv.s +++ b/tests/SysY2022/hidden_functional/11_BST.riscv.s @@ -1,13 +1,13 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 value: .zero 40000 -.align 8 +.p2align 3 left_child: .zero 40000 -.align 8 +.p2align 3 right_child: .zero 40000 .text diff --git a/tests/SysY2022/hidden_functional/12_DSU.arm.s b/tests/SysY2022/hidden_functional/12_DSU.arm.s index 1648efe66..5d4490d84 100644 --- a/tests/SysY2022/hidden_functional/12_DSU.arm.s +++ b/tests/SysY2022/hidden_functional/12_DSU.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 fa: .zero 400020 .text diff --git a/tests/SysY2022/hidden_functional/12_DSU.riscv.s b/tests/SysY2022/hidden_functional/12_DSU.riscv.s index 7498686e7..5182d3aee 100644 --- a/tests/SysY2022/hidden_functional/12_DSU.riscv.s +++ b/tests/SysY2022/hidden_functional/12_DSU.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 fa: .zero 400020 .text @@ -174,15 +174,15 @@ main: jal getch li s3, 81 li s0, 10 - li s2, 45 li s1, 9 + li s2, 45 addiw a1, a0, -48 bleu a1, s1, label242 mv s4, a0 mv s5, zero j label143 .p2align 2 -label629: +label625: bleu a2, s1, label250 mv s4, a0 mv s5, a1 @@ -191,9 +191,9 @@ label143: jal getch li a1, 1 addiw a2, a0, -48 - beq s4, s2, label629 + beq s4, s2, label625 mv a1, s5 - j label629 + j label625 label250: mv s4, a1 label146: @@ -217,9 +217,9 @@ label152: subw a2, zero, s6 addiw a1, a0, -48 mv s7, a2 - bne s4, zero, label631 + bne s4, zero, label627 mv s7, s6 -label631: +label627: addiw s5, s7, -2 addiw s6, s7, -17 bleu a1, s1, label275 @@ -227,7 +227,7 @@ label631: mv s8, zero j label236 .p2align 2 -label651: +label647: bleu a2, s1, label534 mv s4, a0 mv s8, a1 @@ -236,9 +236,9 @@ label236: jal getch li a1, 1 addiw a2, a0, -48 - beq s4, s2, label651 + beq s4, s2, label647 mv a1, s8 - j label651 + j label647 label534: mv s4, a1 label154: @@ -262,11 +262,11 @@ label157: label160: subw a0, zero, s9 mv a1, a0 - bne s4, zero, label633 + bne s4, zero, label629 mv a1, s9 -label633: +label629: auipc a0, %pcrel_hi(fa) - addi s4, a0, %pcrel_lo(label633) + addi s4, a0, %pcrel_lo(label629) ble s7, zero, label184 addiw a2, s7, 1 li 
a3, 4 @@ -355,10 +355,10 @@ label229: subw a1, zero, s9 mv a2, a0 mv a0, a1 - bne s7, zero, label647 + bne s7, zero, label643 mv a0, s9 .p2align 2 -label647: +label643: jal find xor a1, a2, a0 sltiu a0, a1, 1 @@ -384,10 +384,10 @@ label185: mv s7, zero j label189 .p2align 2 -label634: +label630: mv a1, s7 .p2align 2 -label635: +label631: bleu a2, s1, label403 mv s6, a0 mv s7, a1 @@ -396,8 +396,8 @@ label189: jal getch li a1, 1 addiw a2, a0, -48 - beq s6, s2, label635 - j label634 + beq s6, s2, label631 + j label630 .p2align 2 label211: jal getch @@ -407,7 +407,7 @@ label211: mv s7, zero j label231 .p2align 2 -label649: +label645: bleu a2, s1, label521 mv s6, a0 mv s7, a1 @@ -416,9 +416,9 @@ label231: jal getch li a1, 1 addiw a2, a0, -48 - beq s6, s2, label649 + beq s6, s2, label645 mv a1, s7 - j label649 + j label645 .p2align 2 label521: mv s6, a0 @@ -447,19 +447,19 @@ label218: subw a2, zero, s8 addiw a1, a0, -48 mv s6, a2 - bne s7, zero, label643 + bne s7, zero, label639 mv s6, s8 .p2align 2 -label643: +label639: bleu a1, s1, label484 mv s7, a0 mv s8, zero j label220 .p2align 2 -label644: +label640: mv a1, s8 .p2align 2 -label645: +label641: bleu a2, s1, label492 mv s7, a0 mv s8, a1 @@ -468,13 +468,13 @@ label220: jal getch li a1, 1 addiw a2, a0, -48 - beq s7, s2, label645 - j label644 + beq s7, s2, label641 + j label640 .p2align 2 label492: mv s7, a1 addiw a2, a0, -48 - bgeu a2, s0, label668 + bgeu a2, s0, label664 .p2align 2 label498: mv s8, a0 @@ -495,7 +495,7 @@ label403: mv s7, a0 mv s6, a1 addiw a0, a0, -48 - bgeu a0, s0, label663 + bgeu a0, s0, label659 .p2align 2 label409: mv s8, zero @@ -514,12 +514,12 @@ label195: bltu a1, s0, label418 subw a1, zero, s8 mv a0, a1 - bne s6, zero, label637 + bne s6, zero, label633 .p2align 2 -label664: +label660: mv a0, s8 .p2align 2 -label637: +label633: jal find mv s6, a0 jal getch @@ -529,10 +529,10 @@ label637: mv s8, zero j label208 .p2align 2 -label640: +label636: mv a1, s8 .p2align 2 -label641: +label637: bleu a2, s1, label455 mv s7, a0 mv s8, a1 @@ -541,8 +541,8 @@ label208: jal getch li a1, 1 addiw a2, a0, -48 - beq s7, s2, label641 - j label640 + beq s7, s2, label637 + j label636 .p2align 2 label455: mv s7, a1 @@ -567,12 +567,12 @@ label203: label441: subw a1, zero, s9 mv a0, a1 - bne s7, zero, label639 + bne s7, zero, label635 .p2align 2 -label665: +label661: mv a0, s9 .p2align 2 -label639: +label635: jal find addiw s5, s5, -1 sh2add a1, s6, s4 @@ -597,7 +597,7 @@ label484: mv s7, zero addiw a2, a0, -48 bltu a2, s0, label498 -label668: +label664: mv s9, zero j label229 label395: @@ -605,12 +605,12 @@ label395: mv s6, zero addiw a0, a0, -48 bltu a0, s0, label409 -label663: +label659: mv s8, zero mv a1, zero mv a0, zero - bne s6, zero, label637 - j label664 + bne s6, zero, label633 + j label660 label427: mv s7, zero addiw a1, a0, -48 @@ -619,8 +619,8 @@ label432: mv s9, zero mv a1, zero mv a0, zero - bne s7, zero, label639 - j label665 + bne s7, zero, label635 + j label661 label461: mv s6, a0 mv s7, zero diff --git a/tests/SysY2022/hidden_functional/13_LCA.arm.s b/tests/SysY2022/hidden_functional/13_LCA.arm.s index 12ffc6292..19cbb3df9 100644 --- a/tests/SysY2022/hidden_functional/13_LCA.arm.s +++ b/tests/SysY2022/hidden_functional/13_LCA.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 f: .zero 800400 -.align 8 +.p2align 3 dep: .zero 40020 -.align 8 +.p2align 3 to: .zero 40020 -.align 8 +.p2align 3 next: .zero 40020 -.align 8 +.p2align 3 head: .zero 40020 .text diff --git 
a/tests/SysY2022/hidden_functional/13_LCA.riscv.s b/tests/SysY2022/hidden_functional/13_LCA.riscv.s index df6c22b0d..bc54cb04e 100644 --- a/tests/SysY2022/hidden_functional/13_LCA.riscv.s +++ b/tests/SysY2022/hidden_functional/13_LCA.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 f: .zero 800400 -.align 8 +.p2align 3 dep: .zero 40020 -.align 8 +.p2align 3 to: .zero 40020 -.align 8 +.p2align 3 next: .zero 40020 -.align 8 +.p2align 3 head: .zero 40020 .text @@ -109,16 +109,16 @@ main: jal getch li a2, 9 li s0, -1 -pcrel1247: +pcrel1243: auipc a3, %pcrel_hi(next) addiw a1, a0, -48 - addi s1, a3, %pcrel_lo(pcrel1247) + addi s1, a3, %pcrel_lo(pcrel1243) bleu a1, a2, label214 mv s2, a0 mv s3, zero j label101 .p2align 2 -label1039: +label1035: li a2, 9 bleu a3, a2, label222 mv s2, a0 @@ -129,9 +129,9 @@ label101: li a2, 45 li a1, 1 addiw a3, a0, -48 - beq s2, a2, label1039 + beq s2, a2, label1035 mv a1, s3 - j label1039 + j label1035 label222: mv s3, a1 label104: @@ -159,9 +159,9 @@ label110: subw a2, zero, s4 addiw a1, a0, -48 mv s2, a2 - bne s3, zero, label1041 + bne s3, zero, label1037 mv s2, s4 -label1041: +label1037: addiw s5, s2, -2 addiw s6, s2, -17 li a2, 9 @@ -170,10 +170,10 @@ label1041: mv s4, zero j label112 .p2align 2 -label1042: +label1038: mv a1, s4 .p2align 2 -label1043: +label1039: li a2, 9 bleu a3, a2, label255 mv s3, a0 @@ -184,8 +184,8 @@ label112: li a2, 45 li a1, 1 addiw a3, a0, -48 - beq s3, a2, label1043 - j label1042 + beq s3, a2, label1039 + j label1038 label255: mv s4, a1 label115: @@ -209,18 +209,18 @@ label118: label121: subw a0, zero, s7 mv s3, a0 - bne s4, zero, label1045 + bne s4, zero, label1041 mv s3, s7 -label1045: +label1041: auipc a0, %pcrel_hi(dep) lui a2, 259060 -pcrel1248: +pcrel1244: auipc a3, %pcrel_hi(head) - addi a5, a0, %pcrel_lo(label1045) + addi a5, a0, %pcrel_lo(label1041) addiw a1, a2, -193 - addi s4, a3, %pcrel_lo(pcrel1248) + addi s4, a3, %pcrel_lo(pcrel1244) sd a5, 104(sp) - sw a1, %pcrel_lo(label1045)(a0) + sw a1, %pcrel_lo(label1041)(a0) ble s2, zero, label145 addiw a2, s2, 1 li a1, 4 @@ -281,22 +281,22 @@ label145: li a0, 1 addi s0, a1, %pcrel_lo(label145) beq s2, a0, label174 -pcrel1249: +pcrel1245: auipc a0, %pcrel_hi(to) mv s6, zero li s7, 1 - addi s5, a0, %pcrel_lo(pcrel1249) + addi s5, a0, %pcrel_lo(pcrel1245) j label147 .p2align 2 label380: subw a0, zero, s11 mv a1, a0 - bne s9, zero, label1050 + bne s9, zero, label1046 .p2align 2 -label1161: +label1157: mv a1, s11 .p2align 2 -label1050: +label1046: sh2add a2, a1, a1 sw a1, 0(s5) addiw s7, s7, 1 @@ -321,7 +321,7 @@ label147: mv s9, zero j label171 .p2align 2 -label1054: +label1050: li a2, 9 bleu a3, a2, label424 mv s8, a0 @@ -332,9 +332,9 @@ label171: li a2, 45 li a1, 1 addiw a3, a0, -48 - beq s8, a2, label1054 + beq s8, a2, label1050 mv a1, s9 - j label1054 + j label1050 .p2align 2 label424: mv s8, a0 @@ -365,17 +365,17 @@ label154: subw a2, zero, s10 addiw a1, a0, -48 mv s8, a2 - bne s9, zero, label1048 + bne s9, zero, label1044 mv s8, s10 .p2align 2 -label1048: +label1044: li a2, 9 bleu a1, a2, label366 mv s9, a0 mv s10, zero j label165 .p2align 2 -label1052: +label1048: li a2, 9 bleu a3, a2, label407 mv s9, a0 @@ -386,9 +386,9 @@ label165: li a2, 45 li a1, 1 addiw a3, a0, -48 - beq s9, a2, label1052 + beq s9, a2, label1048 mv a1, s10 - j label1052 + j label1048 .p2align 2 label407: mv s9, a1 @@ -439,86 +439,86 @@ label200: slli a4, a5, 4 sh2add a5, a1, a1 add a2, s0, a4 - slli a4, a5, 
4 + slli t0, a5, 4 lw a0, 76(a2) - add t1, s0, a4 - mv a4, a0 - lw a2, 76(t1) - xor a5, a0, a2 + add a2, s0, t0 + lw a4, 76(a2) + mv a2, a0 + xor a5, a0, a4 sltu t0, zero, a5 - bne t0, zero, label1066 - mv a4, a3 + bne t0, zero, label1062 + mv a2, a3 .p2align 2 -label1066: - sh2add a0, a4, a4 +label1062: + sh2add a0, a2, a2 slli a3, a0, 4 - mv a0, a2 + mv a0, a4 add t1, s0, a3 lw a5, 72(t1) - bne t0, zero, label1068 + bne t0, zero, label1064 mv a0, a1 .p2align 2 -label1068: +label1064: sh2add t0, a0, a0 - slli a2, t0, 4 - add t1, s0, a2 - mv a2, a5 - lw a3, 72(t1) - xor a1, a5, a3 + slli a3, t0, 4 + add t1, s0, a3 + mv a3, a5 + lw a4, 72(t1) + xor a1, a5, a4 sltu t0, zero, a1 - bne t0, zero, label1070 - mv a2, a4 + bne t0, zero, label1066 + mv a3, a2 .p2align 2 -label1070: - sh2add a1, a2, a2 +label1066: + sh2add a1, a3, a3 slli a5, a1, 4 - mv a1, a3 + mv a1, a4 add t1, s0, a5 - lw a4, 68(t1) - bne t0, zero, label1072 + lw a2, 68(t1) + bne t0, zero, label1068 mv a1, a0 .p2align 2 -label1072: +label1068: sh2add a5, a1, a1 slli a0, a5, 4 add t1, s0, a0 - mv a0, a4 - lw a3, 68(t1) - xor t0, a4, a3 - sltu a5, zero, t0 - bne a5, zero, label1074 mv a0, a2 + lw a4, 68(t1) + xor a5, a2, a4 + sltu t0, zero, a5 + bne t0, zero, label1070 + mv a0, a3 .p2align 2 -label1074: - sh2add t0, a0, a0 - mv a2, a3 - slli t2, t0, 4 - add t1, s0, t2 - lw a4, 64(t1) - bne a5, zero, label1076 +label1070: + sh2add a2, a0, a0 + slli t1, a2, 4 + mv a2, a4 + add a3, s0, t1 + lw a5, 64(a3) + bne t0, zero, label1072 mv a2, a1 .p2align 2 -label1076: +label1072: sh2add t0, a2, a2 slli a1, t0, 4 add a3, s0, a1 - lw a5, 64(a3) - mv a3, a4 - xor t1, a4, a5 + lw a4, 64(a3) + mv a3, a5 + xor t1, a5, a4 sltu t0, zero, t1 - bne t0, zero, label1078 + bne t0, zero, label1074 mv a3, a0 .p2align 2 -label1078: +label1074: sh2add a1, a3, a3 slli t1, a1, 4 - mv a1, a5 - add a4, s0, t1 - lw a0, 60(a4) - bne t0, zero, label1080 + mv a1, a4 + add a5, s0, t1 + lw a0, 60(a5) + bne t0, zero, label1076 mv a1, a2 .p2align 2 -label1080: +label1076: sh2add a5, a1, a1 slli t1, a5, 4 add a2, s0, t1 @@ -526,19 +526,19 @@ label1080: mv a2, a0 xor t0, a0, a4 sltu a5, zero, t0 - bne a5, zero, label1082 + bne a5, zero, label1078 mv a2, a3 .p2align 2 -label1082: +label1078: sh2add a0, a2, a2 slli t0, a0, 4 mv a0, a4 add t1, s0, t0 lw a3, 56(t1) - bne a5, zero, label1084 + bne a5, zero, label1080 mv a0, a1 .p2align 2 -label1084: +label1080: sh2add a5, a0, a0 slli a1, a5, 4 add t1, s0, a1 @@ -546,19 +546,19 @@ label1084: lw a4, 56(t1) xor t0, a3, a4 sltu a5, zero, t0 - bne a5, zero, label1086 + bne a5, zero, label1082 mv a1, a2 .p2align 2 -label1086: +label1082: sh2add t1, a1, a1 mv a2, a4 slli t0, t1, 4 add t2, s0, t0 lw a3, 52(t2) - bne a5, zero, label1088 + bne a5, zero, label1084 mv a2, a0 .p2align 2 -label1088: +label1084: sh2add a5, a2, a2 slli t1, a5, 4 add a0, s0, t1 @@ -566,219 +566,219 @@ label1088: mv a0, a3 xor t0, a3, a4 sltu a5, zero, t0 - bne a5, zero, label1090 + bne a5, zero, label1086 mv a0, a1 .p2align 2 -label1090: +label1086: sh2add t2, a0, a0 mv a1, a4 slli t0, t2, 4 add t1, s0, t0 lw a3, 48(t1) - bne a5, zero, label1092 + bne a5, zero, label1088 mv a1, a2 .p2align 2 -label1092: +label1088: sh2add a5, a1, a1 slli t0, a5, 4 add a2, s0, t0 lw a4, 48(a2) mv a2, a3 - xor a5, a3, a4 - sltu t0, zero, a5 - bne t0, zero, label1094 + xor t1, a3, a4 + sltu a5, zero, t1 + bne a5, zero, label1090 mv a2, a0 .p2align 2 -label1094: +label1090: sh2add t1, a2, a2 mv a0, a4 slli t2, t1, 4 - add a3, s0, t2 - lw a5, 44(a3) - bne t0, zero, label1096 + add 
t0, s0, t2 + lw a3, 44(t0) + bne a5, zero, label1092 mv a0, a1 .p2align 2 +label1092: + sh2add t0, a0, a0 + slli a5, t0, 4 + add a1, s0, a5 + lw a4, 44(a1) + mv a1, a3 + xor t0, a3, a4 + sltu a5, zero, t0 + bne a5, zero, label1094 + mv a1, a2 +.p2align 2 +label1094: + sh2add a3, a1, a1 + slli t1, a3, 4 + mv a3, a4 + add t0, s0, t1 + lw a2, 40(t0) + bne a5, zero, label1096 + mv a3, a0 +.p2align 2 label1096: - sh2add a4, a0, a0 - slli t0, a4, 4 - add a3, s0, t0 - lw a1, 44(a3) - mv a3, a5 - xor a4, a5, a1 - sltu t0, zero, a4 + sh2add t0, a3, a3 + slli a5, t0, 4 + add a0, s0, a5 + lw a4, 40(a0) + mv a0, a2 + xor t1, a2, a4 + sltu t0, zero, t1 bne t0, zero, label1098 - mv a3, a2 + mv a0, a1 .p2align 2 label1098: - sh2add t1, a3, a3 - mv a2, a1 - slli t2, t1, 4 - add a5, s0, t2 - lw a4, 40(a5) + sh2add a2, a0, a0 + slli t1, a2, 4 + mv a2, a4 + add a1, s0, t1 + lw a5, 36(a1) bne t0, zero, label1100 - mv a2, a0 + mv a2, a3 .p2align 2 label1100: - sh2add t0, a2, a2 - slli a5, t0, 4 - add a1, s0, a5 - lw a0, 40(a1) - mv a1, a4 - xor t0, a4, a0 - sltu a5, zero, t0 - bne a5, zero, label1102 - mv a1, a3 + sh2add a4, a2, a2 + slli t0, a4, 4 + add a1, s0, t0 + lw a3, 36(a1) + mv a1, a5 + xor a4, a5, a3 + sltu t0, zero, a4 + bne t0, zero, label1102 + mv a1, a0 .p2align 2 label1102: - sh2add t1, a1, a1 - mv a3, a0 - slli t2, t1, 4 - add t0, s0, t2 - lw a4, 36(t0) - bne a5, zero, label1104 - mv a3, a2 + sh2add t2, a1, a1 + mv a0, a3 + slli t1, t2, 4 + add a5, s0, t1 + lw a4, 32(a5) + bne t0, zero, label1104 + mv a0, a2 .p2align 2 label1104: - sh2add t0, a3, a3 - slli a5, t0, 4 - add a0, s0, a5 - lw a2, 36(a0) - mv a0, a4 - xor t0, a4, a2 - sltu a5, zero, t0 - bne a5, zero, label1106 - mv a0, a1 + sh2add t0, a0, a0 + slli t1, t0, 4 + add a3, s0, t1 + lw a5, 32(a3) + mv a3, a4 + xor a2, a4, a5 + sltu t0, zero, a2 + bne t0, zero, label1106 + mv a3, a1 .p2align 2 label1106: - sh2add t2, a0, a0 - mv a1, a2 - slli t1, t2, 4 - add t0, s0, t1 - lw a4, 32(t0) - bne a5, zero, label1108 - mv a1, a3 + sh2add a2, a3, a3 + slli t1, a2, 4 + mv a2, a5 + add a4, s0, t1 + lw a1, 28(a4) + bne t0, zero, label1108 + mv a2, a0 .p2align 2 label1108: - sh2add a5, a1, a1 - slli t0, a5, 4 - add a2, s0, t0 - lw a3, 32(a2) - mv a2, a4 - xor a5, a4, a3 - sltu t0, zero, a5 - bne t0, zero, label1110 - mv a2, a0 + sh2add a5, a2, a2 + slli t1, a5, 4 + add a4, s0, t1 + lw a0, 28(a4) + mv a4, a1 + xor t0, a1, a0 + sltu a5, zero, t0 + bne a5, zero, label1110 + mv a4, a3 .p2align 2 label1110: - sh2add a4, a2, a2 - mv a0, a3 - slli t2, a4, 4 - add t1, s0, t2 - lw a5, 28(t1) - bne t0, zero, label1112 - mv a0, a1 + sh2add a1, a4, a4 + slli t1, a1, 4 + mv a1, a0 + add t0, s0, t1 + lw a3, 24(t0) + bne a5, zero, label1112 + mv a1, a2 .p2align 2 label1112: - sh2add a3, a0, a0 - slli t0, a3, 4 - mv a3, a5 - add a1, s0, t0 - lw a4, 28(a1) - xor t1, a5, a4 - sltu t0, zero, t1 + sh2add a5, a1, a1 + slli t0, a5, 4 + add a0, s0, t0 + lw a2, 24(a0) + mv a0, a3 + xor a5, a3, a2 + sltu t0, zero, a5 bne t0, zero, label1114 - mv a3, a2 + mv a0, a4 .p2align 2 label1114: - sh2add a1, a3, a3 - slli t1, a1, 4 - mv a1, a4 - add a5, s0, t1 - lw a2, 24(a5) + sh2add t1, a0, a0 + mv a4, a2 + slli t2, t1, 4 + add a3, s0, t2 + lw a5, 20(a3) bne t0, zero, label1116 - mv a1, a0 + mv a4, a1 .p2align 2 label1116: - sh2add t0, a1, a1 - slli a5, t0, 4 - add a0, s0, a5 - lw a4, 24(a0) - mv a0, a2 - xor t0, a2, a4 - sltu a5, zero, t0 - bne a5, zero, label1118 - mv a0, a3 + sh2add t0, a4, a4 + slli a3, t0, 4 + add a2, s0, a3 + mv a3, a5 + lw a1, 20(a2) + xor t1, a5, a1 + sltu t0, 
zero, t1 + bne t0, zero, label1118 + mv a3, a0 .p2align 2 label1118: - sh2add a2, a0, a0 - slli t1, a2, 4 - mv a2, a4 - add t0, s0, t1 - lw a3, 20(t0) - bne a5, zero, label1120 + sh2add a2, a3, a3 + slli a5, a2, 4 mv a2, a1 + add t1, s0, a5 + lw a0, 16(t1) + bne t0, zero, label1120 + mv a2, a4 .p2align 2 label1120: sh2add t0, a2, a2 slli a5, t0, 4 - add a4, s0, a5 - lw a1, 20(a4) - mv a4, a3 - xor t0, a3, a1 + add a1, s0, a5 + lw a4, 16(a1) + mv a1, a0 + xor t0, a0, a4 sltu a5, zero, t0 bne a5, zero, label1122 - mv a4, a0 + mv a1, a3 .p2align 2 label1122: - sh2add a3, a4, a4 - slli t0, a3, 4 - mv a3, a1 - add t1, s0, t0 - lw a0, 16(t1) - bne a5, zero, label1124 - mv a3, a2 -.p2align 2 -label1124: - sh2add a5, a3, a3 - slli a1, a5, 4 - add t0, s0, a1 - mv a1, a0 - lw a2, 16(t0) - xor t1, a0, a2 - sltu a5, zero, t1 - bne a5, zero, label1126 - mv a1, a4 -.p2align 2 -label1126: sh2add a0, a1, a1 slli t0, a0, 4 - mv a0, a2 + mv a0, a4 add t1, s0, t0 - lw a4, 12(t1) - bne a5, zero, label1128 - mv a0, a3 + lw a3, 12(t1) + bne a5, zero, label1124 + mv a0, a2 .p2align 2 -label1128: - sh2add a5, a0, a0 - slli t1, a5, 4 - add a2, s0, t1 - lw a3, 12(a2) - mv a2, a4 - xor t0, a4, a3 - sltu a5, zero, t0 - bne a5, zero, label1130 +label1124: + sh2add a4, a0, a0 + slli t0, a4, 4 + add a2, s0, t0 + lw a5, 12(a2) + mv a2, a3 + xor a4, a3, a5 + sltu t0, zero, a4 + bne t0, zero, label1126 mv a2, a1 .p2align 2 -label1130: +label1126: sh2add t2, a2, a2 - mv a1, a3 + mv a1, a5 slli t1, t2, 4 - add t0, s0, t1 - lw a4, 8(t0) - bne a5, zero, label1132 + add a3, s0, t1 + lw a4, 8(a3) + bne t0, zero, label1128 mv a1, a0 .p2align 2 -label1132: +label1128: sh2add a5, a1, a1 slli t1, a5, 4 add a0, s0, t1 @@ -786,19 +786,19 @@ label1132: mv a0, a4 xor t0, a4, a3 sltu a5, zero, t0 - bne a5, zero, label1134 + bne a5, zero, label1130 mv a0, a2 .p2align 2 -label1134: +label1130: sh2add t0, a0, a0 mv a2, a3 slli t2, t0, 4 add t1, s0, t2 lw a4, 4(t1) - bne a5, zero, label1136 + bne a5, zero, label1132 mv a2, a1 .p2align 2 -label1136: +label1132: sh2add t0, a2, a2 slli a5, t0, 4 add a1, s0, a5 @@ -806,32 +806,32 @@ label1136: mv a1, a4 xor t0, a4, a3 sltu a5, zero, t0 - bne a5, zero, label1138 + bne a5, zero, label1134 mv a1, a0 .p2align 2 -label1138: +label1134: sh2add a4, a1, a1 slli t1, a4, 4 mv a4, a3 add t0, s0, t1 lw a0, 0(t0) - bne a5, zero, label1140 + bne a5, zero, label1136 mv a4, a2 .p2align 2 -label1140: +label1136: sh2add a5, a4, a4 mv a2, a0 slli t1, a5, 4 add t0, s0, t1 lw a3, 0(t0) - bne a0, a3, label1142 + bne a0, a3, label1138 mv a2, a1 .p2align 2 -label1142: +label1138: sh2add a4, a2, a2 - slli a1, a4, 4 - add a3, s0, a1 - lw a0, 0(a3) + slli a3, a4, 4 + add a1, s0, a3 + lw a0, 0(a1) .p2align 2 label198: jal putint @@ -850,7 +850,7 @@ label175: mv s3, zero j label207 .p2align 2 -label1146: +label1142: li a2, 9 bleu a3, a2, label760 mv s2, a0 @@ -861,9 +861,9 @@ label207: li a2, 45 li a1, 1 addiw a3, a0, -48 - beq s2, a2, label1146 + beq s2, a2, label1142 mv a1, s3 - j label1146 + j label1142 .p2align 2 label760: mv s2, a0 @@ -879,24 +879,24 @@ label204: jal getch sh2add a2, s4, s4 addiw a1, a0, -48 - slliw a3, a2, 1 + slliw a4, a2, 1 li a2, 10 - addi a4, a3, -48 - addw s4, s2, a4 + addi a3, a4, -48 + addw s4, s2, a3 bgeu a1, a2, label180 mv s2, a0 j label204 .p2align 2 -label1063: +label1059: mv t0, a3 .p2align 2 -label1064: +label1060: mv a3, t0 addiw a2, a2, -1 ld a5, 104(sp) sh2add t0, t0, a5 lw a4, 0(t0) - bge a0, a4, label1166 + bge a0, a4, label1162 .p2align 2 label190: sh2add a4, a3, a3 @@ -909,25 
+909,25 @@ label190: mv t0, a4 sh2add t2, a4, a5 lw t1, 0(t2) - ble a0, t1, label1064 - j label1063 + ble a0, t1, label1060 + j label1059 .p2align 2 label180: jal getch subw a2, zero, s4 addiw a1, a0, -48 mv s2, a2 - bne s3, zero, label1056 + bne s3, zero, label1052 mv s2, s4 .p2align 2 -label1056: +label1052: li a2, 9 bleu a1, a2, label448 mv s3, a0 mv s4, zero j label201 .p2align 2 -label1144: +label1140: li a2, 9 bleu a3, a2, label743 mv s3, a0 @@ -938,9 +938,9 @@ label201: li a2, 45 li a1, 1 addiw a3, a0, -48 - beq s3, a2, label1144 + beq s3, a2, label1140 mv a1, s4 - j label1144 + j label1140 .p2align 2 label743: mv s3, a1 @@ -967,9 +967,9 @@ label185: label462: subw a1, zero, s5 mv a0, a1 - beq s3, zero, label1164 + beq s3, zero, label1160 .p2align 2 -label1058: +label1054: ld a5, 104(sp) sh2add a3, s2, a5 sh2add a1, a0, a5 @@ -977,18 +977,18 @@ label1058: mv a3, a0 lw t0, 0(a1) slt a2, a4, t0 - bne a2, zero, label1060 + bne a2, zero, label1056 mv a3, s2 .p2align 2 -label1060: +label1056: ld a5, 104(sp) mv a1, s2 sh2add t0, a3, a5 lw a4, 0(t0) - bne a2, zero, label1062 + bne a2, zero, label1058 mv a1, a0 .p2align 2 -label1062: +label1058: ld a5, 104(sp) sh2add a2, a1, a5 lw a0, 0(a2) @@ -998,7 +998,7 @@ label485: li a2, 19 j label190 .p2align 2 -label1164: +label1160: mv a0, s5 ld a5, 104(sp) sh2add a3, s2, a5 @@ -1007,7 +1007,7 @@ label1164: mv a3, s5 lw t0, 0(a1) slt a2, a4, t0 - bne a2, zero, label1060 + bne a2, zero, label1056 mv a3, s2 sh2add t0, s2, a5 lw a4, 0(t0) @@ -1017,15 +1017,15 @@ label1164: bgt a4, a0, label485 j label196 .p2align 2 -label1166: +label1162: beq a1, a3, label511 j label200 label371: mv s11, zero mv a0, zero mv a1, zero - bne s9, zero, label1050 - j label1161 + bne s9, zero, label1046 + j label1157 label366: mv s9, zero addiw a1, a0, -48 @@ -1036,8 +1036,8 @@ label453: mv s5, zero mv a1, zero mv a0, zero - bne s3, zero, label1058 - j label1058 + bne s3, zero, label1054 + j label1054 label448: mv s3, zero addiw a1, a0, -48 diff --git a/tests/SysY2022/hidden_functional/14_dp.arm.s b/tests/SysY2022/hidden_functional/14_dp.arm.s index aeb7e1273..cd75dc566 100644 --- a/tests/SysY2022/hidden_functional/14_dp.arm.s +++ b/tests/SysY2022/hidden_functional/14_dp.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 t: .zero 8040 -.align 8 +.p2align 3 dp: .zero 140700 .text diff --git a/tests/SysY2022/hidden_functional/14_dp.riscv.s b/tests/SysY2022/hidden_functional/14_dp.riscv.s index 6c531010e..c216d6c59 100644 --- a/tests/SysY2022/hidden_functional/14_dp.riscv.s +++ b/tests/SysY2022/hidden_functional/14_dp.riscv.s @@ -1,10 +1,10 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 t: .zero 8040 -.align 8 +.p2align 3 dp: .zero 140700 .text diff --git a/tests/SysY2022/hidden_functional/16_k_smallest.arm.s b/tests/SysY2022/hidden_functional/16_k_smallest.arm.s index 3b4a3084c..f1c310bf9 100644 --- a/tests/SysY2022/hidden_functional/16_k_smallest.arm.s +++ b/tests/SysY2022/hidden_functional/16_k_smallest.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 array: .zero 4000 .text diff --git a/tests/SysY2022/hidden_functional/16_k_smallest.riscv.s b/tests/SysY2022/hidden_functional/16_k_smallest.riscv.s index df47a3452..32e400fb9 100644 --- a/tests/SysY2022/hidden_functional/16_k_smallest.riscv.s +++ b/tests/SysY2022/hidden_functional/16_k_smallest.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data 
.bss -.align 8 +.p2align 3 array: .zero 4000 .text diff --git a/tests/SysY2022/hidden_functional/17_maximal_clique.arm.s b/tests/SysY2022/hidden_functional/17_maximal_clique.arm.s index 92508cb99..f04a34a2a 100644 --- a/tests/SysY2022/hidden_functional/17_maximal_clique.arm.s +++ b/tests/SysY2022/hidden_functional/17_maximal_clique.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 graph: .zero 3600 -.align 8 +.p2align 3 edges: .zero 4800 .text diff --git a/tests/SysY2022/hidden_functional/17_maximal_clique.riscv.s b/tests/SysY2022/hidden_functional/17_maximal_clique.riscv.s index bd7a6f8ea..0a1434825 100644 --- a/tests/SysY2022/hidden_functional/17_maximal_clique.riscv.s +++ b/tests/SysY2022/hidden_functional/17_maximal_clique.riscv.s @@ -1,10 +1,10 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 graph: .zero 3600 -.align 8 +.p2align 3 edges: .zero 4800 .text diff --git a/tests/SysY2022/hidden_functional/18_prim.arm.s b/tests/SysY2022/hidden_functional/18_prim.arm.s index ca2bbfef5..d71347e22 100644 --- a/tests/SysY2022/hidden_functional/18_prim.arm.s +++ b/tests/SysY2022/hidden_functional/18_prim.arm.s @@ -1,16 +1,16 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 u: .zero 4020 -.align 8 +.p2align 3 v: .zero 4020 -.align 8 +.p2align 3 c: .zero 4020 -.align 8 +.p2align 3 fa: .zero 4020 .text diff --git a/tests/SysY2022/hidden_functional/18_prim.riscv.s b/tests/SysY2022/hidden_functional/18_prim.riscv.s index 3abd8a92d..d6615d003 100644 --- a/tests/SysY2022/hidden_functional/18_prim.riscv.s +++ b/tests/SysY2022/hidden_functional/18_prim.riscv.s @@ -1,16 +1,16 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 u: .zero 4020 -.align 8 +.p2align 3 v: .zero 4020 -.align 8 +.p2align 3 c: .zero 4020 -.align 8 +.p2align 3 fa: .zero 4020 .text @@ -170,10 +170,10 @@ label85: main: # stack usage: CalleeArg[0] Local[0] RegSpill[8] CalleeSaved[104] addi sp, sp, -112 -pcrel515: +pcrel511: auipc a0, %pcrel_hi(fa) sd ra, 0(sp) - addi a5, a0, %pcrel_lo(pcrel515) + addi a5, a0, %pcrel_lo(pcrel511) sd s0, 8(sp) li s0, 10 sd s5, 16(sp) @@ -207,10 +207,10 @@ label146: mv s4, zero j label204 .p2align 2 -label469: +label465: mv a1, s4 .p2align 2 -label470: +label466: bleu a2, s1, label385 mv s3, a0 mv s4, a1 @@ -219,8 +219,8 @@ label204: jal getch li a1, 1 addiw a2, a0, -48 - beq s3, s2, label470 - j label469 + beq s3, s2, label466 + j label465 label385: mv s4, a0 mv s3, a1 @@ -242,17 +242,17 @@ label201: label151: subw a0, zero, s5 mv s7, a0 - bne s3, zero, label456 + bne s3, zero, label452 mv s7, s5 -label456: +label452: auipc a0, %pcrel_hi(u) -pcrel516: +pcrel512: auipc a1, %pcrel_hi(v) - addi s3, a0, %pcrel_lo(label456) - addi s4, a1, %pcrel_lo(pcrel516) -pcrel517: + addi s3, a0, %pcrel_lo(label452) + addi s4, a1, %pcrel_lo(pcrel512) +pcrel513: auipc a0, %pcrel_hi(c) - addi s5, a0, %pcrel_lo(pcrel517) + addi s5, a0, %pcrel_lo(pcrel513) ble s7, zero, label240 mv s6, s3 mv s8, zero @@ -261,9 +261,9 @@ label299: mv s11, zero mv a1, zero mv a0, zero - bne s9, zero, label462 + bne s9, zero, label458 .p2align 2 -label462: +label458: sh2add a1, s8, s5 addiw s8, s8, 1 sw a0, 0(a1) @@ -280,7 +280,7 @@ label154: mv s10, zero j label188 .p2align 2 -label468: +label464: bleu a2, s1, label343 mv s9, a0 mv s10, a1 @@ -289,9 +289,9 @@ label188: jal getch li a1, 1 addiw a2, a0, -48 - beq s9, s2, label468 + beq s9, s2, label464 mv a1, s10 - j label468 + j 
label464 .p2align 2 label343: mv s10, a0 @@ -316,12 +316,12 @@ label160: bltu a1, s0, label261 subw a0, zero, s11 mv a1, a0 - bne s9, zero, label458 + bne s9, zero, label454 .p2align 2 -label485: +label481: mv a1, s11 .p2align 2 -label458: +label454: sw a1, 0(s6) jal getch addiw a1, a0, -48 @@ -330,10 +330,10 @@ label458: mv s10, zero j label185 .p2align 2 -label465: +label461: mv a1, s10 .p2align 2 -label466: +label462: bleu a2, s1, label335 mv s9, a0 mv s10, a1 @@ -342,8 +342,8 @@ label185: jal getch li a1, 1 addiw a2, a0, -48 - beq s9, s2, label466 - j label465 + beq s9, s2, label462 + j label461 .p2align 2 label335: mv s10, a0 @@ -368,12 +368,12 @@ label168: label283: subw a0, zero, s11 mv a1, a0 - bne s9, zero, label460 + bne s9, zero, label456 .p2align 2 -label486: +label482: mv a1, s11 .p2align 2 -label460: +label456: sh2add a2, s8, s4 sw a1, 0(a2) jal getch @@ -383,7 +383,7 @@ label460: mv s10, zero j label182 .p2align 2 -label464: +label460: bleu a2, s1, label327 mv s9, a0 mv s10, a1 @@ -392,9 +392,9 @@ label182: jal getch li a1, 1 addiw a2, a0, -48 - beq s9, s2, label464 + beq s9, s2, label460 mv a1, s10 - j label464 + j label460 .p2align 2 label327: mv s10, a0 @@ -419,7 +419,7 @@ label176: bltu a1, s0, label309 subw a1, zero, s11 mv a0, a1 - bne s9, zero, label462 + bne s9, zero, label458 mv a0, s11 sh2add a1, s8, s5 addiw s8, s8, 1 @@ -476,14 +476,14 @@ label251: mv s11, zero mv a0, zero mv a1, zero - bne s9, zero, label458 - j label485 + bne s9, zero, label454 + j label481 label274: mv s11, zero mv a0, zero mv a1, zero - bne s9, zero, label460 - j label486 + bne s9, zero, label456 + j label482 label198: mv a0, a2 ld ra, 0(sp) diff --git a/tests/SysY2022/hidden_functional/19_search.arm.s b/tests/SysY2022/hidden_functional/19_search.arm.s index 9b3dc103e..2f3d783fa 100644 --- a/tests/SysY2022/hidden_functional/19_search.arm.s +++ b/tests/SysY2022/hidden_functional/19_search.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 3600 .text diff --git a/tests/SysY2022/hidden_functional/19_search.riscv.s b/tests/SysY2022/hidden_functional/19_search.riscv.s index 64c42d5d9..446a876a9 100644 --- a/tests/SysY2022/hidden_functional/19_search.riscv.s +++ b/tests/SysY2022/hidden_functional/19_search.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 3600 .text @@ -263,59 +263,62 @@ label145: main: # stack usage: CalleeArg[0] Local[48] RegSpill[0] CalleeSaved[104] addi sp, sp, -152 - li a3, -1 - li a2, 1 - zext.w a1, a3 + li a2, -1 sd ra, 0(sp) - sd s9, 8(sp) - li s9, 1 - sd s2, 16(sp) - zext.w a0, s9 - slli s2, a2, 32 - sd s4, 24(sp) - sd s10, 32(sp) - mv s10, zero - sd s11, 40(sp) - mv s11, zero - sd s0, 48(sp) - li s0, 2 - sd s5, 56(sp) - li s5, 11 - sd s1, 64(sp) - li s1, 3 - sd s6, 72(sp) - sd s3, 80(sp) - li s3, 28 - sd s7, 88(sp) - sd s8, 96(sp) - sd a0, 104(sp) - sub a0, zero, s2 + zext.w a0, a2 + sd s2, 8(sp) + addi s2, sp, 144 + sd s4, 16(sp) + addi s4, sp, 148 + sd s5, 24(sp) + addi s5, sp, 104 + sd s0, 32(sp) + li s0, 1 + sd s1, 40(sp) + zext.w a1, s0 + sd s6, 48(sp) + mv s6, zero + sd s3, 56(sp) + sd s8, 64(sp) + mv s8, zero + sd s7, 72(sp) + sd s10, 80(sp) + sd s11, 88(sp) + sd s9, 96(sp) sd a1, 112(sp) - sd s2, 120(sp) - sd a0, 128(sp) -pcrel1039: + li a1, 1 + sd a0, 120(sp) + slli s1, a1, 32 + sub a0, zero, s1 + sd s1, 128(sp) + sd a0, 136(sp) +pcrel1035: auipc a0, %pcrel_hi(a) - sw zero, 140(sp) - addi s4, a0, %pcrel_lo(pcrel1039) - sw zero, 
136(sp) + sw zero, 148(sp) + addi s3, a0, %pcrel_lo(pcrel1035) + sw zero, 144(sp) j label334 .p2align 2 +label1016: + mv s6, s10 + mv s8, s11 +.p2align 2 label344: - addi a3, sp, 104 - addi a4, sp, 148 - addi a5, sp, 144 - addi a6, sp, 140 - addi a7, sp, 136 - li s9, 1 - mv a0, s11 - mv a1, s10 - mv a2, s9 + addi a3, sp, 112 + addi a4, sp, 108 + mv a0, s8 + mv a1, s6 + mv a2, s0 + mv a5, s5 + mv a6, s4 + mv a7, s2 jal search mv a1, a0 - blt a0, s5, label1006 + li a2, 11 + blt a0, a2, label1002 li a1, -1 .p2align 2 -label1006: +label1002: mv a0, a1 jal putint li a0, 10 @@ -323,12 +326,12 @@ label1006: .p2align 2 label334: jal getint - sw a0, 148(sp) - mv s6, a0 + sw a0, 108(sp) + mv s7, a0 jal getint - sw a0, 144(sp) - beq s6, zero, label337 - mv a0, s4 + sw a0, 104(sp) + beq s7, zero, label337 + mv a0, s3 mv a2, zero j label339 .p2align 2 @@ -336,8 +339,9 @@ label342: addi a0, a0, 480 .p2align 2 label339: - ori a1, s2, 1 + ori a1, s1, 1 addiw a2, a2, 4 + li a3, 28 sd a1, 0(a0) sd a1, 8(a0) sd a1, 16(a0) @@ -398,7 +402,7 @@ label339: sd a1, 456(a0) sd a1, 464(a0) sd a1, 472(a0) - blt a2, s3, label342 + blt a2, a3, label342 sd a1, 480(a0) sd a1, 488(a0) sd a1, 496(a0) @@ -429,91 +433,89 @@ label339: sd a1, 696(a0) sd a1, 704(a0) sd a1, 712(a0) - lw a2, 144(sp) + lw a2, 104(sp) ble a2, zero, label344 - addi s7, s4, 120 - li s9, 1 - mv s6, s9 - lw a0, 148(sp) + addi s7, s3, 120 + mv s10, s6 + mv s11, s8 + mv s6, s0 + lw a0, 108(sp) bgt a0, zero, label353 - addiw s6, s9, 1 - lw a0, 144(sp) + addiw s6, s0, 1 + lw a0, 104(sp) ble s6, a0, label368 - j label344 + j label1016 .p2align 2 label598: - bne a0, s1, label1021 - sw s6, 140(sp) - mv a0, s11 - sw s9, 136(sp) + li a1, 3 + bne a0, a1, label1017 + sw s6, 148(sp) + sw s9, 144(sp) addiw s9, s9, 1 - lw a1, 148(sp) - bgt s9, a1, label608 + lw a0, 108(sp) + bgt s9, a0, label608 .p2align 2 label364: addi s8, s8, 4 - mv s11, a0 .p2align 2 label354: jal getint + li a1, 2 sw a0, 0(s8) - bne a0, s0, label598 + bne a0, a1, label598 mv s10, s9 - mv a0, s6 - lw a1, 148(sp) - addiw s9, s9, 1 - ble s9, a1, label364 mv s11, s6 + lw a0, 108(sp) + addiw s9, s9, 1 + ble s9, a0, label364 addiw s6, s6, 1 - lw a0, 144(sp) - bgt s6, a0, label344 + lw a0, 104(sp) + bgt s6, a0, label1016 .p2align 2 label368: addi s7, s7, 120 - lw a0, 148(sp) - ble a0, zero, label1025 + lw a0, 108(sp) + ble a0, zero, label1021 .p2align 2 label353: addi s8, s7, 4 - li s9, 1 + mv s9, s0 j label354 .p2align 2 -label1021: - mv a0, s11 +label1017: addiw s9, s9, 1 - lw a1, 148(sp) - ble s9, a1, label364 + lw a0, 108(sp) + ble s9, a0, label364 addiw s6, s6, 1 - lw a0, 144(sp) + lw a0, 104(sp) ble s6, a0, label368 - j label344 + j label1016 .p2align 2 label608: - mv s11, a0 addiw s6, s6, 1 - lw a0, 144(sp) + lw a0, 104(sp) ble s6, a0, label368 - j label344 + j label1016 label337: mv a0, zero ld ra, 0(sp) - ld s9, 8(sp) - ld s2, 16(sp) - ld s4, 24(sp) - ld s10, 32(sp) - ld s11, 40(sp) - ld s0, 48(sp) - ld s5, 56(sp) - ld s1, 64(sp) - ld s6, 72(sp) - ld s3, 80(sp) - ld s7, 88(sp) - ld s8, 96(sp) + ld s2, 8(sp) + ld s4, 16(sp) + ld s5, 24(sp) + ld s0, 32(sp) + ld s1, 40(sp) + ld s6, 48(sp) + ld s3, 56(sp) + ld s8, 64(sp) + ld s7, 72(sp) + ld s10, 80(sp) + ld s11, 88(sp) + ld s9, 96(sp) addi sp, sp, 152 ret .p2align 2 -label1025: +label1021: addiw s6, s6, 1 - lw a0, 144(sp) + lw a0, 104(sp) ble s6, a0, label368 - j label344 + j label1016 diff --git a/tests/SysY2022/hidden_functional/20_sort.arm.s b/tests/SysY2022/hidden_functional/20_sort.arm.s index 9b7bbcf0e..b9615bd57 100644 --- 
a/tests/SysY2022/hidden_functional/20_sort.arm.s +++ b/tests/SysY2022/hidden_functional/20_sort.arm.s @@ -1,16 +1,16 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 cnt: .zero 1600080 -.align 8 +.p2align 3 a: .zero 400020 -.align 8 +.p2align 3 b: .zero 400020 -.align 8 +.p2align 3 c: .zero 400020 .text diff --git a/tests/SysY2022/hidden_functional/20_sort.riscv.s b/tests/SysY2022/hidden_functional/20_sort.riscv.s index e6fbc1faa..89fc30229 100644 --- a/tests/SysY2022/hidden_functional/20_sort.riscv.s +++ b/tests/SysY2022/hidden_functional/20_sort.riscv.s @@ -1,16 +1,16 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 cnt: .zero 1600080 -.align 8 +.p2align 3 a: .zero 400020 -.align 8 +.p2align 3 b: .zero 400020 -.align 8 +.p2align 3 c: .zero 400020 .text diff --git a/tests/SysY2022/hidden_functional/21_union_find.arm.s b/tests/SysY2022/hidden_functional/21_union_find.arm.s index 54e6d5655..bb7a329f5 100644 --- a/tests/SysY2022/hidden_functional/21_union_find.arm.s +++ b/tests/SysY2022/hidden_functional/21_union_find.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 parent: .zero 4020 .text diff --git a/tests/SysY2022/hidden_functional/21_union_find.riscv.s b/tests/SysY2022/hidden_functional/21_union_find.riscv.s index 8196a0b7c..8a05d22c6 100644 --- a/tests/SysY2022/hidden_functional/21_union_find.riscv.s +++ b/tests/SysY2022/hidden_functional/21_union_find.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 parent: .zero 4020 .text diff --git a/tests/SysY2022/hidden_functional/22_matrix_multiply.arm.s b/tests/SysY2022/hidden_functional/22_matrix_multiply.arm.s index 571d43973..bf0a512c1 100644 --- a/tests/SysY2022/hidden_functional/22_matrix_multiply.arm.s +++ b/tests/SysY2022/hidden_functional/22_matrix_multiply.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 40000 -.align 8 +.p2align 3 b: .zero 40000 -.align 8 +.p2align 3 res: .zero 40000 .text diff --git a/tests/SysY2022/hidden_functional/22_matrix_multiply.riscv.s b/tests/SysY2022/hidden_functional/22_matrix_multiply.riscv.s index a93415f15..6f3277a1f 100644 --- a/tests/SysY2022/hidden_functional/22_matrix_multiply.riscv.s +++ b/tests/SysY2022/hidden_functional/22_matrix_multiply.riscv.s @@ -1,13 +1,13 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 40000 -.align 8 +.p2align 3 b: .zero 40000 -.align 8 +.p2align 3 res: .zero 40000 .text diff --git a/tests/SysY2022/hidden_functional/23_json.arm.s b/tests/SysY2022/hidden_functional/23_json.arm.s index 7b4750eb0..625d8dd1e 100644 --- a/tests/SysY2022/hidden_functional/23_json.arm.s +++ b/tests/SysY2022/hidden_functional/23_json.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .section .rodata -.align 8 +.p2align 3 __cmmc_jumptable194: .word label103-__cmmc_jumptable194 .word label19-__cmmc_jumptable194 @@ -12,7 +12,7 @@ __cmmc_jumptable194: .word label63-__cmmc_jumptable194 .word label12-__cmmc_jumptable194 .bss -.align 8 +.p2align 3 buffer: .zero 200000000 .text diff --git a/tests/SysY2022/hidden_functional/23_json.riscv.s b/tests/SysY2022/hidden_functional/23_json.riscv.s index f1b9db78e..77ad54c12 100644 --- a/tests/SysY2022/hidden_functional/23_json.riscv.s +++ b/tests/SysY2022/hidden_functional/23_json.riscv.s @@ -1,7 +1,7 @@ .attribute arch, 
"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 8 +.p2align 3 __cmmc_jumptable209: .word label105-__cmmc_jumptable209 .word label21-__cmmc_jumptable209 @@ -12,7 +12,7 @@ __cmmc_jumptable209: .word label65-__cmmc_jumptable209 .word label14-__cmmc_jumptable209 .bss -.align 8 +.p2align 3 buffer: .zero 200000000 .text diff --git a/tests/SysY2022/hidden_functional/28_side_effect2.riscv.s b/tests/SysY2022/hidden_functional/28_side_effect2.riscv.s index 4ecfbe793..c4c97bd21 100644 --- a/tests/SysY2022/hidden_functional/28_side_effect2.riscv.s +++ b/tests/SysY2022/hidden_functional/28_side_effect2.riscv.s @@ -32,7 +32,7 @@ label206: bne a0, zero, label8 mv a1, t1 addiw t0, t0, 1 - bge t0, a3, label958 + bge t0, a3, label957 .p2align 2 label2: addiw t1, a1, 1 @@ -41,7 +41,7 @@ label2: mv a1, t1 addiw t0, t0, 1 blt t0, a3, label2 -label958: +label957: mv t0, zero j label70 label126: @@ -77,7 +77,7 @@ label70: blt t0, a4, label622 sw zero, 12(sp) lw t2, 8(sp) - beq t2, zero, label961 + beq t2, zero, label960 .p2align 2 label626: mv a1, t1 @@ -100,13 +100,13 @@ label444: mv t2, a0 bne a0, zero, label448 addiw t1, a1, 4 - bge t0, a4, label945 + bge t0, a4, label944 .p2align 2 label622: mv t2, a0 bne a0, zero, label626 addiw t2, a1, 6 - bge t0, a5, label946 + bge t0, a5, label945 label631: mv a1, t2 addiw t0, t0, 1 @@ -121,44 +121,44 @@ label220: mv a1, t1 addiw t0, t0, 1 blt t0, a3, label2 - j label958 -label961: + j label957 +label960: addiw t2, a1, 6 blt t0, a5, label631 -label946: +label945: sw zero, 16(sp) lw t1, 12(sp) beq t1, zero, label126 label638: mv a1, t2 j label75 -label945: +label944: sw zero, 12(sp) lw t2, 8(sp) bne t2, zero, label626 addiw t2, a1, 6 blt t0, a5, label631 - j label946 + j label945 label11: addiw t1, a1, 3 bge t0, a5, label225 mv a1, t1 addiw t0, t0, 1 blt t0, a3, label2 - j label958 + j label957 label225: sw a0, 16(sp) lw t2, 12(sp) - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 4 li t2, 4 bge t0, t2, label14 -label942: +label941: mv a1, t1 label68: addiw t0, t0, 1 blt t0, a3, label2 - j label958 + j label957 label216: sw a0, 12(sp) lw t2, 8(sp) @@ -262,14 +262,14 @@ label118: label14: sw a0, 20(sp) lw t2, 16(sp) - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 5 li t2, 5 - blt t0, t2, label942 + blt t0, t2, label941 sw a0, 24(sp) lw t2, 20(sp) bne t2, zero, label17 - j label942 + j label941 label89: lw a5, 60(sp) addiw a2, a1, 9 @@ -321,28 +321,28 @@ label131: label17: addiw t1, a1, 6 li t2, 6 - blt t0, t2, label942 + blt t0, t2, label941 sw a0, 28(sp) lw t2, 24(sp) - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 7 li t2, 7 - blt t0, t2, label942 + blt t0, t2, label941 sw a0, 32(sp) lw t2, 28(sp) - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 8 li t2, 8 - blt t0, t2, label942 + blt t0, t2, label941 sw a0, 36(sp) lw t2, 32(sp) - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 9 li t2, 9 - blt t0, t2, label942 + blt t0, t2, label941 sw a0, 40(sp) lw t2, 36(sp) - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 10 li t3, 10 blt t0, t3, label310 @@ -435,7 +435,7 @@ label175: label310: mv t2, zero label27: - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 11 li t3, 11 blt t0, t3, label322 @@ -445,7 +445,7 @@ label27: label322: mv t2, zero label31: - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 12 li t3, 12 blt t0, t3, label334 @@ -455,49 +455,49 @@ label31: label334: mv t2, zero label35: - beq t2, zero, 
label942 + beq t2, zero, label941 addiw t1, a1, 13 li t3, 13 blt t0, t3, label346 sw a0, 56(sp) lw t2, 52(sp) label38: - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 14 li t3, 14 blt t0, t3, label355 sw a0, 60(sp) lw t2, 56(sp) label41: - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 15 li t3, 15 blt t0, t3, label364 sw a0, 64(sp) lw t2, 60(sp) label45: - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 16 li t3, 16 blt t0, t3, label376 sw a0, 68(sp) lw t2, 64(sp) label48: - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 17 li t3, 17 blt t0, t3, label385 sw a0, 72(sp) lw t2, 68(sp) label51: - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 18 li t3, 18 blt t0, t3, label394 sw a0, 76(sp) lw t2, 72(sp) label55: - beq t2, zero, label942 + beq t2, zero, label941 addiw t1, a1, 19 li t3, 19 bge t0, t3, label405 @@ -507,7 +507,7 @@ label405: sw a0, 80(sp) lw t2, 76(sp) label58: - beq t2, zero, label942 + beq t2, zero, label941 addiw a1, a1, 20 blt t0, a3, label68 sw a0, 84(sp) diff --git a/tests/SysY2022/hidden_functional/29_long_line.arm.s b/tests/SysY2022/hidden_functional/29_long_line.arm.s index 8f96483ec..3fdeb2aac 100644 --- a/tests/SysY2022/hidden_functional/29_long_line.arm.s +++ b/tests/SysY2022/hidden_functional/29_long_line.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 lut_fib: .zero 16336 .text diff --git a/tests/SysY2022/hidden_functional/29_long_line.riscv.s b/tests/SysY2022/hidden_functional/29_long_line.riscv.s index 20372f427..88b7695f0 100644 --- a/tests/SysY2022/hidden_functional/29_long_line.riscv.s +++ b/tests/SysY2022/hidden_functional/29_long_line.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 lut_fib: .zero 16336 .text diff --git a/tests/SysY2022/hidden_functional/30_many_dimensions.arm.s b/tests/SysY2022/hidden_functional/30_many_dimensions.arm.s index 7afd4aff1..b9237ebbd 100644 --- a/tests/SysY2022/hidden_functional/30_many_dimensions.arm.s +++ b/tests/SysY2022/hidden_functional/30_many_dimensions.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 array: .zero 2097152 .text diff --git a/tests/SysY2022/hidden_functional/30_many_dimensions.riscv.s b/tests/SysY2022/hidden_functional/30_many_dimensions.riscv.s index fffc36983..865b97403 100644 --- a/tests/SysY2022/hidden_functional/30_many_dimensions.riscv.s +++ b/tests/SysY2022/hidden_functional/30_many_dimensions.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 array: .zero 2097152 .text @@ -14,10 +14,10 @@ main: lui a2, 128 sd ra, 0(sp) sd s4, 8(sp) -pcrel521: +pcrel517: auipc s4, %pcrel_hi(array) sd s3, 16(sp) - addi s3, s4, %pcrel_lo(pcrel521) + addi s3, s4, %pcrel_lo(pcrel517) sd s2, 24(sp) mv a1, s3 lui s2, 256 @@ -27,9 +27,9 @@ pcrel521: sd s5, 56(sp) sd s9, 64(sp) sd s7, 72(sp) - sd s11, 80(sp) + sd s10, 80(sp) sd s8, 88(sp) - sd s10, 96(sp) + sd s11, 96(sp) .p2align 2 label2: sd zero, 0(a1) @@ -94,85 +94,85 @@ label169: .p2align 2 label7: slli s9, a4, 19 - slli s11, a1, 18 - slli s10, t0, 17 + slli s10, a1, 18 + slli s11, t0, 17 add s7, a3, s9 - add s8, s7, s11 - slli s11, t1, 15 - add s9, s8, s10 - slli s10, a2, 14 - slli s8, a5, 16 - add s7, s9, s8 - add s8, s7, s11 - slli s11, t3, 12 - add s9, s8, s10 - slli s8, t2, 13 - add s7, s9, s8 + add s8, s7, s10 + slli s10, a5, 16 + add s9, s8, s11 + slli s11, a2, 
14 + add s7, s9, s10 + slli s9, t1, 15 + add s8, s7, s9 + slli s7, t2, 13 + add s9, s8, s11 + slli s8, t3, 12 + add s10, s9, s7 slli s9, t4, 11 - add s10, s7, s11 - slli s11, t5, 10 - add s8, s10, s9 - slli s10, t6, 9 - add s7, s8, s11 + add s7, s10, s8 + slli s10, t5, 10 + add s11, s7, s9 + slli s7, t6, 9 + add s8, s11, s10 + slli s10, a7, 7 slli s11, a6, 8 - add s9, s7, s10 - slli s7, a7, 7 - add s8, s9, s11 + add s9, s8, s7 + add s7, s9, s11 slli s11, s0, 6 - add s10, s8, s7 - slli s8, s6, 5 - add s9, s10, s11 + add s8, s7, s10 + slli s10, s6, 5 + add s9, s8, s11 addiw s6, s6, 1 - addiw s11, s5, 1 - add s7, s9, s8 - slli s10, s11, 32 + addiw s8, s5, 1 + add s7, s9, s10 + slli s11, s8, 32 addiw s9, s5, 2 - addiw s11, s5, 3 - add.uw s8, s5, s10 - sd s8, 0(s7) - slli s8, s11, 32 - add.uw s10, s9, s8 - addiw s9, s5, 5 + addiw s8, s5, 3 + add.uw s10, s5, s11 + slli s11, s8, 32 + sd s10, 0(s7) addiw s8, s5, 4 + add.uw s10, s9, s11 + addiw s9, s5, 5 sd s10, 8(s7) - slli s11, s9, 32 + slli s10, s9, 32 addiw s9, s5, 6 - add.uw s10, s8, s11 - addiw s11, s5, 7 - sd s10, 16(s7) + add.uw s11, s8, s10 + addiw s10, s5, 7 + sd s11, 16(s7) addiw s5, s5, 8 - slli s10, s11, 32 - add.uw s8, s9, s10 - sd s8, 24(s7) + slli s8, s10, 32 + add.uw s11, s9, s8 + sd s11, 24(s7) blt s6, a0, label7 addiw s0, s0, 1 blt s0, a0, label169 addiw a7, a7, 1 - bge a7, a0, label443 + bge a7, a0, label439 mv s6, zero mv s0, zero j label7 .p2align 2 -label443: +label439: addiw a6, a6, 1 - bge a6, a0, label457 + bge a6, a0, label453 mv s6, zero mv s0, zero mv a7, zero j label7 .p2align 2 -label457: +label453: addiw t6, t6, 1 - bge t6, a0, label470 + bge t6, a0, label466 mv s6, zero mv s0, zero mv a7, zero mv a6, zero j label7 .p2align 2 -label470: +label466: addiw t5, t5, 1 - bge t5, a0, label478 + bge t5, a0, label474 mv s6, zero mv s0, zero mv a7, zero @@ -180,9 +180,9 @@ label470: mv a6, zero j label7 .p2align 2 -label478: +label474: addiw t4, t4, 1 - bge t4, a0, label485 + bge t4, a0, label481 mv s6, zero mv s0, zero mv a7, zero @@ -191,9 +191,9 @@ label478: mv a6, zero j label7 .p2align 2 -label485: +label481: addiw t3, t3, 1 - bge t3, a0, label491 + bge t3, a0, label487 mv s6, zero mv s0, zero mv a7, zero @@ -203,9 +203,9 @@ label485: mv a6, zero j label7 .p2align 2 -label491: +label487: addiw t2, t2, 1 - bge t2, a0, label496 + bge t2, a0, label492 mv s6, zero mv s0, zero mv a7, zero @@ -216,9 +216,9 @@ label491: mv a6, zero j label7 .p2align 2 -label496: +label492: addiw a2, a2, 1 - bge a2, a0, label500 + bge a2, a0, label496 mv s6, zero mv s0, zero mv a7, zero @@ -230,9 +230,9 @@ label496: mv a6, zero j label7 .p2align 2 -label500: +label496: addiw t1, t1, 1 - bge t1, a0, label503 + bge t1, a0, label499 mv s6, zero mv s0, zero mv a7, zero @@ -245,9 +245,9 @@ label500: mv a6, zero j label7 .p2align 2 -label503: +label499: addiw a5, a5, 1 - bge a5, a0, label452 + bge a5, a0, label448 mv s6, zero mv s0, zero mv a7, zero @@ -261,9 +261,9 @@ label503: mv a6, zero j label7 .p2align 2 -label452: +label448: addiw t0, t0, 1 - bge t0, a0, label453 + bge t0, a0, label449 mv s6, zero mv s0, zero mv a7, zero @@ -278,9 +278,9 @@ label452: mv a6, zero j label7 .p2align 2 -label453: +label449: addiw a1, a1, 1 - bge a1, a0, label454 + bge a1, a0, label450 mv s6, zero mv s0, zero mv a7, zero @@ -296,9 +296,9 @@ label453: mv a6, zero j label7 .p2align 2 -label454: +label450: addiw a4, a4, 1 - bge a4, a0, label455 + bge a4, a0, label451 mv s6, zero mv s0, zero mv a7, zero @@ -315,7 +315,7 @@ label454: mv a6, zero j label7 .p2align 2 
-label455: +label451: addiw s1, s1, 1 bge s1, a0, label41 add a3, a3, s2 @@ -337,26 +337,26 @@ label455: j label7 label41: auipc s4, %pcrel_hi(array) - lw a0, %pcrel_lo(label41)(s4) + lw a1, %pcrel_lo(label41)(s4) lw a3, 8(s3) lw a5, 4(s3) - addw a2, a0, a3 + addw a2, a1, a3 lw a4, 40(s3) - addw a1, a2, a5 - lw a2, 24(s3) - addw a0, a1, a4 + addw a0, a2, a5 + lw a5, 24(s3) + addw a3, a0, a4 lw a4, 228(s3) - addw a3, a0, a2 + addw a1, a3, a5 lw a5, 56(s3) - addw a1, a3, a4 - lw a3, 964(s3) - addw a2, a1, a5 - lw a4, 224(s3) - addw a0, a2, a3 - lw a5, 804(s3) - addw a1, a0, a4 + addw a2, a1, a4 + lw a4, 964(s3) + addw a0, a2, a5 + lw a5, 224(s3) + addw a3, a0, a4 + lw a4, 804(s3) + addw a1, a3, a5 lw a3, 1996(s3) - addw a2, a1, a5 + addw a2, a1, a4 li a1, 403 addw a0, a2, a3 slli a4, a1, 3 @@ -404,8 +404,8 @@ label41: ld s5, 56(sp) ld s9, 64(sp) ld s7, 72(sp) - ld s11, 80(sp) + ld s10, 80(sp) ld s8, 88(sp) - ld s10, 96(sp) + ld s11, 96(sp) addi sp, sp, 104 ret diff --git a/tests/SysY2022/hidden_functional/31_many_indirections.arm.s b/tests/SysY2022/hidden_functional/31_many_indirections.arm.s index 4923c7e4a..f98f8670e 100644 --- a/tests/SysY2022/hidden_functional/31_many_indirections.arm.s +++ b/tests/SysY2022/hidden_functional/31_many_indirections.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 array: .zero 8000 .text diff --git a/tests/SysY2022/hidden_functional/31_many_indirections.riscv.s b/tests/SysY2022/hidden_functional/31_many_indirections.riscv.s index 6d77f6c81..da2940e7a 100644 --- a/tests/SysY2022/hidden_functional/31_many_indirections.riscv.s +++ b/tests/SysY2022/hidden_functional/31_many_indirections.riscv.s @@ -1,227 +1,224 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 array: .zero 8000 .text .p2align 2 .globl main main: - addi sp, sp, -32 -pcrel660: + addi sp, sp, -8 +pcrel646: auipc a2, %pcrel_hi(array) - mv a6, zero - li a4, 1 - li a3, 20 - li t0, 3 - li t1, 5 - li t2, 7 - li t3, 9 + mv t3, zero + li a3, 1 + li a5, 3 + li t0, 5 + li t1, 7 + li t2, 9 li t4, 11 li t5, 13 - li t6, 15 - li a7, 17 - addi a1, a2, %pcrel_lo(pcrel660) - slli a5, t0, 32 - slli a2, a4, 32 + sd ra, 0(sp) + addi a1, a2, %pcrel_lo(pcrel646) + slli a4, a5, 32 + slli a2, a3, 32 mv a0, a1 + slli a5, t0, 32 + addi a3, a4, 2 slli t0, t1, 32 - addi a4, a5, 2 + addi a4, a5, 4 slli t1, t2, 32 - sd ra, 0(sp) - addi a5, t0, 4 - slli t2, t3, 32 - addi t0, t1, 6 - sd s0, 8(sp) - slli t3, t4, 32 - addi t1, t2, 8 - li s0, 19 - sd s1, 16(sp) + addi a5, t0, 6 + slli t2, t4, 32 + addi t0, t1, 8 slli t4, t5, 32 - addi t2, t3, 10 - slli t5, t6, 32 - sd s2, 24(sp) - addi t3, t4, 12 - slli t6, a7, 32 - addi t4, t5, 14 - slli a7, s0, 32 - addi t5, t6, 16 - addi t6, a7, 18 + addi t1, t2, 10 + addi t2, t4, 12 .p2align 2 label2: sd a2, 0(a0) - li s1, 21 - li s2, 23 - addiw a6, a6, 1 - sd a4, 8(a0) - slli a7, s1, 32 - sd a5, 16(a0) - slli s1, s2, 32 - addi s0, a7, 20 - li s2, 25 - sd t0, 24(a0) - addi a7, s1, 22 - sd t1, 32(a0) - li s1, 27 - sd t2, 40(a0) - sd t3, 48(a0) - sd t4, 56(a0) - sd t5, 64(a0) + li t5, 15 + li a6, 17 + li a7, 27 + addiw t3, t3, 1 + slli t4, t5, 32 + sd a3, 8(a0) + slli t5, a6, 32 + addi t6, t4, 14 + li a6, 19 + sd a4, 16(a0) + addi t4, t5, 16 + sd a5, 24(a0) + slli t5, a6, 32 + sd t0, 32(a0) + li a6, 21 + sd t1, 40(a0) + sd t2, 48(a0) + sd t6, 56(a0) + addi t6, t5, 18 + sd t4, 64(a0) + li t4, 20 sd t6, 72(a0) - sd s0, 80(a0) - slli s0, s2, 32 - sd a7, 88(a0) - li s2, 29 - addi a7, s0, 24 - slli s0, s1, 32 
- sd a7, 96(a0) - addi a7, s0, 26 - sd a7, 104(a0) - slli a7, s2, 32 - li s2, 31 - addi s1, a7, 28 - slli s0, s2, 32 - sd s1, 112(a0) - li s2, 35 - addi a7, s0, 30 - li s1, 33 - sd a7, 120(a0) - slli a7, s1, 32 - slli s1, s2, 32 - addi s0, a7, 32 - li s2, 37 - addi a7, s1, 34 - sd s0, 128(a0) - li s1, 39 - slli s0, s2, 32 - sd a7, 136(a0) - li s2, 45 - addi a7, s0, 36 - slli s0, s1, 32 - sd a7, 144(a0) - li s1, 41 - addi a7, s0, 38 - sd a7, 152(a0) - slli a7, s1, 32 - li s1, 43 - addi s0, a7, 40 - slli a7, s1, 32 - sd s0, 160(a0) - addi s0, a7, 42 - slli a7, s2, 32 - sd s0, 168(a0) - li s2, 47 - addi s1, a7, 44 - slli s0, s2, 32 - sd s1, 176(a0) - li s2, 59 - addi a7, s0, 46 - li s1, 49 - sd a7, 184(a0) - slli s0, s1, 32 - li s1, 51 - addi a7, s0, 48 - slli s0, s1, 32 - sd a7, 192(a0) - li s1, 53 - addi a7, s0, 50 - slli s0, s1, 32 - sd a7, 200(a0) - li s1, 55 - addi a7, s0, 52 - sd a7, 208(a0) - slli a7, s1, 32 - li s1, 57 - addi s0, a7, 54 - slli a7, s1, 32 - sd s0, 216(a0) - slli s1, s2, 32 - addi s0, a7, 56 - li s2, 61 - addi a7, s1, 58 - sd s0, 224(a0) - li s1, 63 - slli s0, s2, 32 - sd a7, 232(a0) - li s2, 67 - addi a7, s0, 60 - sd a7, 240(a0) - slli a7, s1, 32 - li s1, 65 - addi s0, a7, 62 - slli a7, s1, 32 - sd s0, 248(a0) - slli s1, s2, 32 - addi s0, a7, 64 - li s2, 69 - addi a7, s1, 66 - sd s0, 256(a0) - li s1, 71 - slli s0, s2, 32 - sd a7, 264(a0) - li s2, 73 - addi a7, s0, 68 - sd a7, 272(a0) - slli a7, s1, 32 - slli s1, s2, 32 - addi s0, a7, 70 - li s2, 75 - addi a7, s1, 72 - sd s0, 280(a0) - sd a7, 288(a0) - slli a7, s2, 32 - li s2, 77 - addi s0, a7, 74 - slli s1, s2, 32 - sd s0, 296(a0) - li s2, 81 - addi a7, s1, 76 - li s0, 79 - sd a7, 304(a0) - slli a7, s0, 32 - slli s0, s2, 32 - addi s1, a7, 78 - li s2, 83 - addi a7, s0, 80 - sd s1, 312(a0) - sd a7, 320(a0) - slli a7, s2, 32 - li s2, 85 - addi s1, a7, 82 - slli s0, s2, 32 - sd s1, 328(a0) - li s2, 91 - addi a7, s0, 84 - li s1, 87 - sd a7, 336(a0) - slli a7, s1, 32 - li s1, 89 - addi s0, a7, 86 - slli a7, s1, 32 - sd s0, 344(a0) - slli s1, s2, 32 - addi s0, a7, 88 - li s2, 93 - addi a7, s1, 90 - sd s0, 352(a0) - li s1, 95 - slli s0, s2, 32 - sd a7, 360(a0) - addi a7, s0, 92 - slli s0, s1, 32 - sd a7, 368(a0) - li s1, 97 - addi a7, s0, 94 - slli s0, s1, 32 - sd a7, 376(a0) - li s1, 99 - addi a7, s0, 96 - sd a7, 384(a0) - slli a7, s1, 32 - addi s0, a7, 98 - sd s0, 392(a0) - bge a6, a3, label6 + slli t6, a6, 32 + li a6, 23 + addi t5, t6, 20 + slli t6, a6, 32 + sd t5, 80(a0) + li a6, 25 + addi t5, t6, 22 + slli t6, a6, 32 + sd t5, 88(a0) + slli a6, a7, 32 + addi t5, t6, 24 + li a7, 29 + addi t6, a6, 26 + sd t5, 96(a0) + li a6, 31 + slli t5, a7, 32 + sd t6, 104(a0) + li a7, 35 + addi t6, t5, 28 + slli t5, a6, 32 + sd t6, 112(a0) + li a6, 33 + addi t6, t5, 30 + slli t5, a6, 32 + sd t6, 120(a0) + slli a6, a7, 32 + addi t6, t5, 32 + li a7, 37 + addi t5, a6, 34 + sd t6, 128(a0) + li a6, 39 + slli t6, a7, 32 + sd t5, 136(a0) + li a7, 51 + addi t5, t6, 36 + slli t6, a6, 32 + sd t5, 144(a0) + li a6, 41 + addi t5, t6, 38 + slli t6, a6, 32 + sd t5, 152(a0) + li a6, 43 + addi t5, t6, 40 + slli t6, a6, 32 + sd t5, 160(a0) + li a6, 45 + addi t5, t6, 42 + slli t6, a6, 32 + sd t5, 168(a0) + li a6, 47 + addi t5, t6, 44 + slli t6, a6, 32 + sd t5, 176(a0) + li a6, 49 + addi t5, t6, 46 + slli t6, a6, 32 + sd t5, 184(a0) + slli a6, a7, 32 + addi t5, t6, 48 + li a7, 53 + addi t6, a6, 50 + sd t5, 192(a0) + slli t5, a7, 32 + sd t6, 200(a0) + li a7, 55 + addi t6, t5, 52 + slli a6, a7, 32 + sd t6, 208(a0) + li a7, 57 + addi t5, a6, 54 + slli t6, a7, 
32 + sd t5, 216(a0) + li a7, 59 + addi a6, t6, 56 + slli t5, a7, 32 + sd a6, 224(a0) + li a7, 61 + addi a6, t5, 58 + slli t6, a7, 32 + sd a6, 232(a0) + li a7, 71 + addi t5, t6, 60 + li a6, 63 + sd t5, 240(a0) + slli t6, a6, 32 + li a6, 65 + addi t5, t6, 62 + slli t6, a6, 32 + sd t5, 248(a0) + li a6, 67 + addi t5, t6, 64 + slli t6, a6, 32 + sd t5, 256(a0) + li a6, 69 + addi t5, t6, 66 + slli t6, a6, 32 + sd t5, 264(a0) + slli a6, a7, 32 + addi t5, t6, 68 + li a7, 73 + addi t6, a6, 70 + sd t5, 272(a0) + li a6, 75 + slli t5, a7, 32 + sd t6, 280(a0) + li a7, 79 + addi t6, t5, 72 + slli t5, a6, 32 + sd t6, 288(a0) + li a6, 77 + addi t6, t5, 74 + slli t5, a6, 32 + sd t6, 296(a0) + addi t6, t5, 76 + slli t5, a7, 32 + sd t6, 304(a0) + li a7, 81 + addi a6, t5, 78 + slli t6, a7, 32 + sd a6, 312(a0) + li a7, 83 + addi t5, t6, 80 + slli t6, a7, 32 + sd t5, 320(a0) + li a7, 85 + addi a6, t6, 82 + slli t5, a7, 32 + sd a6, 328(a0) + li a7, 87 + addi t6, t5, 84 + slli a6, a7, 32 + sd t6, 336(a0) + li a7, 89 + addi t5, a6, 86 + slli t6, a7, 32 + li a6, 91 + sd t5, 344(a0) + addi t5, t6, 88 + slli t6, a6, 32 + sd t5, 352(a0) + li a6, 93 + addi t5, t6, 90 + slli t6, a6, 32 + sd t5, 360(a0) + li a6, 95 + addi t5, t6, 92 + slli t6, a6, 32 + sd t5, 368(a0) + li a6, 97 + addi t5, t6, 94 + slli t6, a6, 32 + sd t5, 376(a0) + li a6, 99 + addi t5, t6, 96 + slli t6, a6, 32 + sd t5, 384(a0) + addi t5, t6, 98 + sd t5, 392(a0) + bge t3, t4, label6 addi a0, a0, 400 j label2 label6: @@ -229,103 +226,103 @@ label6: slli a0, a2, 2 addi a2, a0, -492 add a3, a1, a0 - add t1, a1, a2 - lw a5, 0(a3) + add t0, a1, a2 + lw a4, 0(a3) addi a3, a2, -400 - sh2add t0, a5, t1 + sh2add t1, a4, t0 addi a2, a3, -400 - add t1, a1, a3 + add t0, a1, a3 addi a3, a2, -400 - lw a4, 0(t0) - sh2add t0, a4, t1 - add t1, a1, a2 + lw a5, 0(t1) + sh2add t1, a5, t0 + add t0, a1, a2 addi a2, a3, -400 - lw a5, 0(t0) - sh2add t0, a5, t1 - add t1, a1, a3 + lw a4, 0(t1) + sh2add t1, a4, t0 + add t0, a1, a3 addi a3, a2, -400 - lw a4, 0(t0) - sh2add t0, a4, t1 - add t1, a1, a2 + lw a5, 0(t1) + add t2, a1, a3 + sh2add t1, a5, t0 + add t0, a1, a2 addi a2, a3, -400 - lw a5, 0(t0) - sh2add t0, a5, t1 - add t1, a1, a3 + lw a4, 0(t1) addi a3, a2, -400 - lw a4, 0(t0) - sh2add t0, a4, t1 + sh2add t1, a4, t0 + lw a5, 0(t1) add t1, a1, a2 + sh2add t0, a5, t2 addi a2, a3, -400 - lw a5, 0(t0) - sh2add t0, a5, t1 + lw a4, 0(t0) + sh2add t0, a4, t1 add t1, a1, a3 addi a3, a2, -400 + lw a5, 0(t0) + sh2add t0, a5, t1 + add t1, a1, a2 lw a4, 0(t0) sh2add t0, a4, t1 - add a4, a1, a2 - lw a5, 0(t0) - sh2add t1, a5, a4 - add a5, a1, a3 addi a4, a3, -400 + add t1, a1, a3 addi a2, a4, -400 - add t2, a1, a4 - lw t0, 0(t1) - sh2add t3, t0, a5 - lw t1, 0(t3) - sh2add t0, t1, t2 - addi t2, a2, -400 - add t1, a1, a2 lw a5, 0(t0) + sh2add t2, a5, t1 + add t1, a1, a4 + lw t0, 0(t2) + sh2add t2, t0, t1 + add t1, a1, a2 + lw a5, 0(t2) + addi t2, a2, -400 sh2add t0, a5, t1 - addi t1, a1, 2000 - add a5, a1, t2 - addi t2, a1, 800 + addi a5, a1, 2000 + add t1, a1, t2 lw a3, 0(t0) - sh2add t0, a3, a5 - addi a5, a1, 1600 + sh2add t0, a3, t1 lw a4, 0(t0) - addi t0, a1, 1200 - sh2add a3, a4, t1 - addi t1, a1, 400 + addi t0, a1, 1600 + sh2add a3, a4, a5 + addi a4, a1, 1200 lw a2, 0(a3) - sh2add a4, a2, a5 - lw a3, 0(a4) - sh2add a5, a3, t0 + sh2add t1, a2, t0 + addi t0, a1, 800 + lw a3, 0(t1) + addi t1, a1, 400 + sh2add a5, a3, a4 lw a2, 0(a5) - sh2add a4, a2, t2 + sh2add a4, a2, t0 lw a3, 0(a4) - sh2add a5, a3, t1 - lw a4, 0(a5) - addi a5, a0, -20 - sh2add t0, a4, a1 + sh2add t0, a3, t1 + lw a4, 
0(t0) + addi t0, a0, -20 + sh2add a5, a4, a1 li a0, 400 - add a4, a1, a5 - lw a2, 0(t0) - lw a3, 0(a4) - mul a5, a3, a0 - add t0, a1, a5 - lw a4, 68(t0) - mul a3, a4, a0 - add a5, a1, a3 - lw t0, 64(a5) + add t1, a1, t0 + lw a2, 0(a5) + lw a3, 0(t1) + mul a4, a3, a0 + add t0, a1, a4 + lw a5, 68(t0) + mul t1, a5, a0 + add a3, a1, t1 + lw t0, 64(a3) mul a4, t0, a0 - add t1, a1, a4 - lw a5, 60(t1) - mul a3, a5, a0 - add a4, a1, a3 - lw t0, 56(a4) - mul t1, t0, a0 - add a5, a1, t1 - lw a4, 52(a5) - mul a3, a4, a0 - add t1, a1, a3 - lw t0, 48(t1) - mul t2, t0, a0 - add a5, a1, t2 + add a5, a1, a4 + lw t1, 60(a5) + mul a3, t1, a0 + add t0, a1, a3 + lw t2, 56(t0) + mul a4, t2, a0 + add a5, a1, a4 + lw a3, 52(a5) + mul t1, a3, a0 + add a4, a1, t1 + lw t0, 48(a4) + mul a3, t0, a0 + add a5, a1, a3 lw a4, 44(a5) - mul a3, a4, a0 - add t1, a1, a3 - lw t0, 40(t1) + mul t1, a4, a0 + add a3, a1, t1 + lw t0, 40(a3) mul a5, t0, a0 add a4, a1, a5 lw a3, 36(a4) @@ -363,8 +360,5 @@ label6: jal putint ld ra, 0(sp) mv a0, zero - ld s0, 8(sp) - ld s1, 16(sp) - ld s2, 24(sp) - addi sp, sp, 32 + addi sp, sp, 8 ret diff --git a/tests/SysY2022/hidden_functional/35_math.riscv.s b/tests/SysY2022/hidden_functional/35_math.riscv.s index 38b1334d5..82ab9e892 100644 --- a/tests/SysY2022/hidden_functional/35_math.riscv.s +++ b/tests/SysY2022/hidden_functional/35_math.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 981668463 .4byte 1076754516 @@ -670,13 +670,13 @@ main: fsw f18, 68(sp) fsw f9, 72(sp) jal getint -pcrel751: +pcrel747: auipc a1, %pcrel_hi(__cmmc_fp_constant_pool) lui s3, 265216 lui s2, 264192 lui s1, 258048 lui s0, 260096 - addi s4, a1, %pcrel_lo(pcrel751) + addi s4, a1, %pcrel_lo(pcrel747) beq a0, zero, label619 mv s5, a0 j label603 @@ -743,14 +743,14 @@ label603: fmul.s f13, f12, f8 andi a1, a0, 1 fmv.s f11, f13 - bne a1, zero, label733 + bne a1, zero, label729 fmv.s f11, f12 .p2align 2 -label733: - srliw a1, a0, 31 +label729: + srliw a2, a0, 31 fmul.s f10, f10, f10 - add a2, a0, a1 - sraiw a0, a2, 1 + add a1, a0, a2 + sraiw a0, a1, 1 beq a0, zero, label609 .p2align 2 label637: @@ -758,12 +758,12 @@ label637: fmul.s f13, f11, f10 andi a1, a0, 1 fmv.s f11, f13 - bne a1, zero, label733 + bne a1, zero, label729 fmv.s f11, f12 - srliw a1, a0, 31 + srliw a2, a0, 31 fmul.s f10, f10, f10 - add a2, a0, a1 - sraiw a0, a2, 1 + add a1, a0, a2 + sraiw a0, a1, 1 bne a0, zero, label637 .p2align 2 label609: @@ -814,9 +814,9 @@ label610: li a0, 32 jal putch fmv.w.x f10, zero - flt.s a0, f10, f9 - and a1, s6, a0 - bne a1, zero, label616 + flt.s a1, f10, f9 + and a0, s6, a1 + bne a0, zero, label616 li a0, 45 jal putch j label611 diff --git a/tests/SysY2022/hidden_functional/36_rotate.arm.s b/tests/SysY2022/hidden_functional/36_rotate.arm.s index 43311316c..b9daea733 100644 --- a/tests/SysY2022/hidden_functional/36_rotate.arm.s +++ b/tests/SysY2022/hidden_functional/36_rotate.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 image: .zero 4194304 .text diff --git a/tests/SysY2022/hidden_functional/36_rotate.riscv.s b/tests/SysY2022/hidden_functional/36_rotate.riscv.s index 5c39f3634..9b5730c46 100644 --- a/tests/SysY2022/hidden_functional/36_rotate.riscv.s +++ b/tests/SysY2022/hidden_functional/36_rotate.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 
__cmmc_fp_constant_pool: .4byte 897988541 .4byte 1070141403 @@ -10,7 +10,7 @@ __cmmc_fp_constant_pool: .4byte 1078530011 .4byte 3226013659 .bss -.align 8 +.p2align 3 image: .zero 4194304 .text diff --git a/tests/SysY2022/hidden_functional/37_dct.arm.s b/tests/SysY2022/hidden_functional/37_dct.arm.s index 197bd96a2..a27d9425a 100644 --- a/tests/SysY2022/hidden_functional/37_dct.arm.s +++ b/tests/SysY2022/hidden_functional/37_dct.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 test_block: .zero 256 -.align 8 +.p2align 3 test_dct: .zero 256 -.align 8 +.p2align 3 test_idct: .zero 256 .text diff --git a/tests/SysY2022/hidden_functional/37_dct.riscv.s b/tests/SysY2022/hidden_functional/37_dct.riscv.s index 571171817..4c3be688b 100644 --- a/tests/SysY2022/hidden_functional/37_dct.riscv.s +++ b/tests/SysY2022/hidden_functional/37_dct.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 897988541 .4byte 1078530011 @@ -10,13 +10,13 @@ __cmmc_fp_constant_pool: .4byte 3234402267 .4byte 3226013659 .bss -.align 8 +.p2align 3 test_block: .zero 256 -.align 8 +.p2align 3 test_dct: .zero 256 -.align 8 +.p2align 3 test_idct: .zero 256 .text @@ -148,27 +148,27 @@ main: sd a0, 128(sp) mv a1, a0 jal getint - li s4, 16 -pcrel1082: + li s3, 4 +pcrel1077: auipc s6, %pcrel_hi(test_dct) lui s5, 256000 - addiw a2, a0, -3 + li s4, 16 sd a0, 136(sp) + addiw a2, a0, -3 addiw s2, a0, -18 - li s3, 4 lui s1, 258048 sd a2, 152(sp) -pcrel1083: +pcrel1078: auipc a0, %pcrel_hi(test_block) ld a1, 128(sp) - addi s8, a0, %pcrel_lo(pcrel1083) + addi s8, a0, %pcrel_lo(pcrel1078) addiw a2, a1, -3 sd a2, 144(sp) addiw a2, a1, -18 sd a2, 160(sp) -pcrel1084: +pcrel1079: auipc a2, %pcrel_hi(__cmmc_fp_constant_pool) - addi s0, a2, %pcrel_lo(pcrel1084) + addi s0, a2, %pcrel_lo(pcrel1079) ble a1, zero, label114 mv s7, s8 mv s9, zero @@ -201,9 +201,9 @@ label107: j label108 label114: ld a1, 128(sp) -pcrel1085: +pcrel1080: auipc s6, %pcrel_hi(test_dct) - addi s7, s6, %pcrel_lo(pcrel1085) + addi s7, s6, %pcrel_lo(pcrel1080) flw f10, 4(s0) ld a0, 136(sp) fcvt.s.w f18, a1 @@ -217,7 +217,7 @@ pcrel1085: bgt a0, zero, label119 j label141 .p2align 2 -label1001: +label996: addiw a5, a5, 1 ld a1, 128(sp) ble a1, a5, label143 @@ -241,7 +241,7 @@ label119: label414: addiw t2, t2, 1 ld a1, 128(sp) - ble a1, t2, label981 + ble a1, t2, label976 addi t1, t1, 32 .p2align 2 label123: @@ -255,9 +255,9 @@ label123: fadd.s f10, f11, f13 flw f11, 16(s0) flt.s a0, f12, f10 - flt.s a2, f10, f11 - or a1, a0, a2 - beq a1, zero, label334 + flt.s a1, f10, f11 + or a2, a0, a1 + beq a2, zero, label334 flw f11, 12(s0) fdiv.s f12, f10, f11 fcvt.w.s a0, f12, rtz @@ -269,20 +269,20 @@ label123: flt.s a0, f11, f10 fsub.s f13, f10, f12 fmv.s f11, f13 - bne a0, zero, label940 + bne a0, zero, label935 fmv.s f11, f10 .p2align 2 -label940: +label935: flw f14, 20(s0) fadd.s f13, f11, f12 flt.s a0, f11, f14 fmv.s f10, f13 - bne a0, zero, label942 + bne a0, zero, label937 .p2align 2 -label1013: +label1008: fmv.s f10, f11 .p2align 2 -label942: +label937: jal my_sin_impl mv t4, zero mv t3, t1 @@ -290,20 +290,20 @@ label942: fmv.s f2, f10 j label129 .p2align 2 -label943: +label938: fmv.s f11, f10 .p2align 2 -label944: +label939: flw f14, 20(s0) fadd.s f13, f11, f12 flt.s a0, f11, f14 fmv.s f10, f13 - bne a0, zero, label946 + bne a0, zero, label941 .p2align 2 -label1014: +label1009: fmv.s f10, f11 .p2align 2 -label946: +label941: 
jal my_sin_impl addiw t4, t4, 1 fmul.s f11, f4, f10 @@ -340,8 +340,8 @@ label129: flt.s a0, f11, f10 fsub.s f13, f10, f12 fmv.s f11, f13 - bne a0, zero, label944 - j label943 + bne a0, zero, label939 + j label938 .p2align 2 label383: flw f11, 4(s0) @@ -349,14 +349,14 @@ label383: flt.s a0, f11, f10 fsub.s f13, f10, f12 fmv.s f11, f13 - bne a0, zero, label944 + bne a0, zero, label939 fmv.s f11, f10 flw f14, 20(s0) fadd.s f13, f10, f12 flt.s a0, f10, f14 fmv.s f10, f13 - bne a0, zero, label946 - j label1014 + bne a0, zero, label941 + j label1009 .p2align 2 label334: flw f11, 4(s0) @@ -364,19 +364,19 @@ label334: flt.s a0, f11, f10 fsub.s f13, f10, f12 fmv.s f11, f13 - bne a0, zero, label940 + bne a0, zero, label935 fmv.s f11, f10 flw f14, 20(s0) fadd.s f13, f10, f12 flt.s a0, f10, f14 fmv.s f10, f13 - bne a0, zero, label942 - j label1013 + bne a0, zero, label937 + j label1008 .p2align 2 -label981: +label976: addiw t0, t0, 1 ld a0, 136(sp) - ble a0, t0, label1001 + ble a0, t0, label996 addi a3, a3, 4 fmv.w.x f10, zero fcvt.s.w f0, t0 @@ -422,9 +422,9 @@ label144: li a0, 10 jal putch ld a1, 128(sp) -pcrel1086: +pcrel1081: auipc a2, %pcrel_hi(test_idct) - addi a5, a2, %pcrel_lo(pcrel1086) + addi a5, a2, %pcrel_lo(pcrel1081) ble a1, zero, label145 mv a4, a5 mv t0, zero @@ -436,7 +436,7 @@ pcrel1086: bgt a0, zero, label163 j label150 .p2align 2 -label1003: +label998: addiw t0, t0, 1 ld a1, 128(sp) ble a1, t0, label456 @@ -455,7 +455,7 @@ label163: mv t1, zero j label164 .p2align 2 -label988: +label983: flw f11, 0(a3) addiw t1, t1, 1 fadd.s f13, f11, f11 @@ -464,7 +464,7 @@ label988: fdiv.s f11, f12, f19 fsw f11, 0(a3) ld a0, 136(sp) - ble a0, t1, label1003 + ble a0, t1, label998 .p2align 2 label170: addi a3, a3, 4 @@ -548,7 +548,7 @@ label868: fmv.s f10, f11 mv t5, a1 ld a2, 144(sp) - ble a2, a1, label1000 + ble a2, a1, label995 .p2align 2 label245: slliw a2, t5, 5 @@ -580,7 +580,7 @@ label795: mv a2, a1 fmv.s f10, f11 ld a1, 128(sp) - ble a1, a2, label999 + ble a1, a2, label994 .p2align 2 label228: slliw a1, a2, 5 @@ -601,7 +601,7 @@ label760: fsw f10, 0(a3) li t5, 1 ld a0, 136(sp) - ble a0, t5, label1010 + ble a0, t5, label1005 .p2align 2 label189: flw f11, 0(a3) @@ -672,7 +672,7 @@ label721: fmv.s f10, f11 mv t5, a1 ld a2, 152(sp) - ble a2, a1, label995 + ble a2, a1, label990 .p2align 2 label199: sh2add a0, t5, s7 @@ -703,7 +703,7 @@ label649: mv a2, a1 fmv.s f10, f11 ld a0, 136(sp) - ble a0, a1, label994 + ble a0, a1, label989 .p2align 2 label215: sh2add a1, a2, s7 @@ -723,7 +723,7 @@ label738: fsw f10, 0(a3) li t5, 1 ld a1, 128(sp) - ble a1, t5, label1008 + ble a1, t5, label1003 .p2align 2 label171: addi t2, s7, 32 @@ -742,18 +742,18 @@ label171: fdiv.s f11, f12, f19 fsw f11, 0(a3) bgt a0, t1, label170 - j label990 + j label985 .p2align 2 label595: addiw t3, t3, 1 ld a1, 128(sp) - ble a1, t3, label988 + ble a1, t3, label983 .p2align 2 label188: addi t2, t2, 32 li t5, 1 ld a0, 136(sp) - ble a0, t5, label991 + ble a0, t5, label986 .p2align 2 label175: fcvt.s.w f13, t3 @@ -763,22 +763,22 @@ label175: fadd.s f10, f11, f12 flw f11, 16(s0) flt.s a0, f13, f10 - flt.s a2, f10, f11 - or a1, a0, a2 - bne a1, zero, label176 + flt.s a1, f10, f11 + or a2, a0, a1 + bne a2, zero, label176 flw f11, 4(s0) flw f12, 12(s0) flt.s a0, f11, f10 fsub.s f13, f10, f12 fmv.s f11, f13 - bne a0, zero, label948 + bne a0, zero, label943 fmv.s f11, f10 flw f14, 20(s0) fadd.s f13, f10, f12 flt.s a0, f10, f14 fmv.s f10, f13 - bne a0, zero, label950 - j label1016 + bne a0, zero, label945 + j label1011 .p2align 2 label176: 
flw f11, 12(s0) @@ -792,20 +792,20 @@ label176: flt.s a0, f11, f10 fsub.s f13, f10, f12 fmv.s f11, f13 - bne a0, zero, label948 + bne a0, zero, label943 fmv.s f11, f10 .p2align 2 -label948: +label943: flw f14, 20(s0) fadd.s f13, f11, f12 flt.s a0, f11, f14 fmv.s f10, f13 - bne a0, zero, label950 + bne a0, zero, label945 .p2align 2 -label1016: +label1011: fmv.s f10, f11 .p2align 2 -label950: +label945: jal my_sin_impl li t5, 1 addi t4, t2, 4 @@ -813,20 +813,20 @@ label950: fmv.s f2, f10 j label179 .p2align 2 -label951: +label946: fmv.s f11, f10 .p2align 2 -label952: +label947: flw f13, 20(s0) fadd.s f14, f11, f12 flt.s a0, f11, f13 fmv.s f10, f14 - bne a0, zero, label954 + bne a0, zero, label949 .p2align 2 -label953: +label948: fmv.s f10, f11 .p2align 2 -label954: +label949: jal my_sin_impl addiw t5, t5, 1 fmul.s f11, f4, f10 @@ -854,8 +854,8 @@ label179: flt.s a0, f11, f10 fsub.s f13, f10, f12 fmv.s f11, f13 - bne a0, zero, label952 - j label951 + bne a0, zero, label947 + j label946 .p2align 2 label186: flw f11, 12(s0) @@ -869,14 +869,14 @@ label186: fsub.s f13, f10, f12 flt.s a0, f11, f10 fmv.s f11, f13 - bne a0, zero, label952 + bne a0, zero, label947 fmv.s f11, f10 flw f13, 20(s0) fadd.s f14, f10, f12 flt.s a0, f10, f13 fmv.s f10, f14 - bne a0, zero, label954 - j label953 + bne a0, zero, label949 + j label948 .p2align 2 label743: fmv.w.x f12, zero @@ -927,7 +927,7 @@ label765: bgt a1, t5, label171 j label169 .p2align 2 -label1000: +label995: mv a2, a1 fmv.s f10, f11 ld a1, 128(sp) @@ -939,7 +939,7 @@ label1000: bgt a1, t5, label171 j label169 .p2align 2 -label995: +label990: mv a2, a1 fmv.s f10, f11 ld a0, 136(sp) @@ -968,7 +968,7 @@ label168: bgt a1, t5, label171 j label169 .p2align 2 -label991: +label986: addiw t3, t3, 1 ld a1, 128(sp) bgt a1, t3, label188 @@ -981,7 +981,7 @@ label991: fsw f11, 0(a3) ld a0, 136(sp) bgt a0, t1, label170 -label990: +label985: addiw t0, t0, 1 ld a1, 128(sp) bgt a1, t0, label151 @@ -1011,7 +1011,7 @@ label169: bgt a0, t1, label170 j label503 .p2align 2 -label994: +label989: fmv.s f10, f11 li t5, 1 fsw f11, 0(a3) @@ -1028,7 +1028,7 @@ label994: bgt a0, t1, label170 j label503 .p2align 2 -label999: +label994: fmv.s f10, f11 li t5, 1 fsw f11, 0(a3) @@ -1109,13 +1109,13 @@ label145: addi sp, sp, 168 ret .p2align 2 -label1010: +label1005: ld a1, 128(sp) li t5, 1 bgt a1, t5, label171 j label169 .p2align 2 -label1008: +label1003: flw f11, 0(a3) addiw t1, t1, 1 fadd.s f13, f11, f11 diff --git a/tests/SysY2022/hidden_functional/38_light2d.riscv.s b/tests/SysY2022/hidden_functional/38_light2d.riscv.s index 9b264cb3c..3d974d27e 100644 --- a/tests/SysY2022/hidden_functional/38_light2d.riscv.s +++ b/tests/SysY2022/hidden_functional/38_light2d.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 897988541 .4byte 1287568417 @@ -125,10 +125,10 @@ main: sd ra, 0(sp) sd s10, 8(sp) sd s8, 16(sp) - sd s0, 24(sp) - sd s5, 32(sp) - sd s1, 40(sp) - sd s6, 48(sp) + sd s1, 24(sp) + sd s6, 32(sp) + sd s0, 40(sp) + sd s5, 48(sp) sd s2, 56(sp) sd s3, 64(sp) sd s4, 72(sp) @@ -152,21 +152,19 @@ main: jal putint li a0, 10 jal putch - mv s10, zero - mv s8, zero - lui s1, 264192 -pcrel462: + lui s6, 269312 + lui s5, 263168 + lui s4, 258048 +pcrel454: auipc a0, %pcrel_hi(__cmmc_fp_constant_pool) - lui s3, 258048 - lui s4, 263168 - lui a1, 6 - lui s2, 253952 - addi s0, a0, %pcrel_lo(pcrel462) - addiw s7, a1, -1243 + lui s1, 262144 + lui s2, 264192 + mv s8, 
zero + lui s3, 253952 + mv s10, zero + addi s0, a0, %pcrel_lo(pcrel454) lui a0, 4878 - addiw s6, a0, -158 - lui a0, 24414 - addiw s5, a0, 263 + addiw s7, a0, -158 j label103 .p2align 2 label127: @@ -185,12 +183,11 @@ label103: j label106 .p2align 2 label126: - lui a0, 269312 - lui a2, 276464 - fmv.w.x f11, a0 - fmv.w.x f12, a2 - fdiv.s f10, f1, f11 + fmv.w.x f11, s6 + lui a0, 276464 li a2, 255 + fmv.w.x f12, a0 + fdiv.s f10, f1, f11 fmul.s f11, f10, f12 fcvt.w.s a1, f11, rtz min a0, a1, a2 @@ -222,30 +219,29 @@ label116: flt.s a0, f10, f3 fsub.s f12, f3, f11 fmv.s f10, f12 - bne a0, zero, label427 + bne a0, zero, label419 fmv.s f10, f3 .p2align 2 -label427: +label419: flw f14, 24(s0) fadd.s f13, f10, f11 flt.s a0, f10, f14 fmv.s f12, f13 - bne a0, zero, label429 + bne a0, zero, label421 .p2align 2 -label453: +label445: fmv.s f12, f10 .p2align 2 -label429: +label421: fmv.s f10, f12 jal my_sin_impl mv a0, zero fmv.w.x f11, zero li a1, 1 - lui a5, 262144 - fmv.w.x f12, a5 - flt.s a2, f11, f12 - and a4, a1, a2 - beq a4, zero, label234 + fmv.w.x f12, s1 + flt.s a4, f11, f12 + and a2, a1, a4 + beq a2, zero, label234 .p2align 2 label122: fmul.s f14, f2, f11 @@ -257,14 +253,14 @@ label122: fmul.s f4, f15, f15 fsub.s f15, f14, f12 fmul.s f5, f15, f15 - fmv.w.x f15, s1 + fmv.w.x f15, s2 fadd.s f3, f4, f5 - fmv.w.x f4, s2 + fmv.w.x f4, s3 fadd.s f12, f3, f3 fadd.s f7, f3, f15 fmul.s f6, f3, f4 fdiv.s f5, f12, f7 - fmv.w.x f12, s3 + fmv.w.x f12, s4 fadd.s f29, f6, f12 fadd.s f7, f29, f5 fdiv.s f30, f3, f7 @@ -282,28 +278,28 @@ label122: fdiv.s f28, f3, f5 fadd.s f29, f5, f28 fmul.s f6, f29, f12 - fdiv.s f30, f3, f6 - fadd.s f7, f6, f30 - fmul.s f5, f7, f12 - fdiv.s f28, f3, f5 - fadd.s f6, f5, f28 - fmul.s f7, f6, f12 - fdiv.s f29, f3, f7 - fadd.s f5, f7, f29 - fmul.s f6, f5, f12 + fdiv.s f7, f3, f6 + fadd.s f28, f6, f7 + fmul.s f5, f28, f12 + fdiv.s f30, f3, f5 + fadd.s f7, f5, f30 + fmul.s f6, f7, f12 + fdiv.s f29, f3, f6 + fadd.s f28, f6, f29 + fmul.s f5, f28, f12 + fdiv.s f7, f3, f5 + fadd.s f29, f5, f7 + fmul.s f6, f29, f12 + flw f29, 32(s0) fdiv.s f28, f3, f6 - fadd.s f30, f6, f28 - flw f28, 32(s0) - fmul.s f5, f30, f12 - fdiv.s f29, f3, f5 - fadd.s f7, f5, f29 + fadd.s f5, f6, f28 + fmul.s f7, f5, f12 flw f5, 36(s0) - fmul.s f6, f7, f12 - fsub.s f7, f13, f5 - fsub.s f3, f6, f28 - fsub.s f28, f14, f5 - fmul.s f6, f7, f7 - fmul.s f29, f28, f28 + fsub.s f28, f13, f5 + fsub.s f3, f7, f29 + fsub.s f7, f14, f5 + fmul.s f6, f28, f28 + fmul.s f29, f7, f7 fadd.s f13, f6, f29 fadd.s f5, f13, f13 fmul.s f6, f13, f4 @@ -321,42 +317,42 @@ label122: fadd.s f6, f15, f4 fmul.s f14, f6, f12 fdiv.s f5, f13, f14 - fadd.s f15, f14, f5 - fmul.s f4, f15, f12 - fdiv.s f7, f13, f4 - fadd.s f14, f4, f7 - fmul.s f15, f14, f12 + fadd.s f4, f14, f5 + fmul.s f15, f4, f12 + fdiv.s f28, f13, f15 + fadd.s f7, f15, f28 + fmul.s f14, f7, f12 + fdiv.s f6, f13, f14 + fadd.s f5, f14, f6 + fmul.s f15, f5, f12 + fdiv.s f4, f13, f15 + fadd.s f6, f15, f4 + fmul.s f14, f6, f12 + fdiv.s f5, f13, f14 + fadd.s f4, f14, f5 + fmul.s f15, f4, f12 fdiv.s f6, f13, f15 fadd.s f5, f15, f6 fmul.s f14, f5, f12 + flw f5, 40(s0) fdiv.s f4, f13, f14 fadd.s f6, f14, f4 fmul.s f15, f6, f12 - fdiv.s f5, f13, f15 - fadd.s f14, f15, f5 - fmul.s f4, f14, f12 - fdiv.s f6, f13, f4 - fadd.s f15, f4, f6 - flw f4, 40(s0) - fmul.s f14, f15, f12 - fdiv.s f5, f13, f14 - fadd.s f6, f14, f5 - fmul.s f15, f6, f12 fmv.s f12, f3 - fsub.s f13, f15, f4 + fsub.s f13, f15, f5 flt.s a1, f3, f13 - bne a1, zero, label431 + bne a1, zero, label423 fmv.s f12, f13 .p2align 2 
-label431: +label423: flw f13, 0(s0) - fmv.w.x f3, s4 + fmv.w.x f3, s5 fmv.w.x f15, zero fmv.s f14, f3 - bne a1, zero, label433 + bne a1, zero, label425 fmv.s f14, f15 .p2align 2 -label433: +label425: flt.s a1, f12, f13 beq a1, zero, label123 fadd.s f1, f1, f14 @@ -365,29 +361,32 @@ label433: bge a3, a0, label126 .p2align 2 label109: - mulw a1, s10, s6 - lui a5, 87961 - addw a0, a1, s7 - addiw a4, a5, -311 - mul a1, a0, a4 + mulw a1, s10, s7 + lui a4, 6 + lui t0, 87961 + lui t1, 24414 + addiw a2, a4, -1243 + addiw a5, t0, -311 + addw a0, a1, a2 + mul a1, a0, a5 srli t0, a1, 63 - srai a2, a1, 55 - add a5, t0, a2 - mulw a4, a5, s5 - subw a1, a0, a4 - mv s10, a1 - bge a1, zero, label421 - addw s10, a1, s5 + srai a4, a1, 55 + addiw a1, t1, 263 + add a2, t0, a4 + mulw a4, a2, a1 + subw a5, a0, a4 + mv s10, a5 + bge a5, zero, label413 + addw s10, a5, a1 .p2align 2 -label421: +label413: fcvt.s.w f11, s10 flw f12, 4(s0) fcvt.s.w f14, a3 - lui a0, 269312 fdiv.s f10, f11, f12 flw f11, 8(s0) fadd.s f13, f14, f10 - fmv.w.x f14, a0 + fmv.w.x f14, s6 fmul.s f12, f13, f11 flw f13, 12(s0) fdiv.s f3, f12, f14 @@ -402,49 +401,48 @@ label421: flt.s a0, f11, f10 fsub.s f12, f10, f4 fmv.s f11, f12 - beq a0, zero, label447 + beq a0, zero, label439 .p2align 2 -label423: +label415: flw f13, 24(s0) fadd.s f12, f11, f4 flt.s a0, f11, f13 fmv.s f10, f12 - bne a0, zero, label425 + bne a0, zero, label417 .p2align 2 -label452: +label444: fmv.s f10, f11 .p2align 2 -label425: +label417: jal my_sin_impl flw f11, 16(s0) flt.s a0, f4, f3 fmv.s f2, f10 - flt.s a1, f3, f11 - or a2, a0, a1 - bne a2, zero, label116 + flt.s a2, f3, f11 + or a1, a0, a2 + bne a1, zero, label116 flw f10, 20(s0) flw f11, 8(s0) flt.s a0, f10, f3 fsub.s f12, f3, f11 fmv.s f10, f12 - bne a0, zero, label427 + bne a0, zero, label419 fmv.s f10, f3 flw f14, 24(s0) fadd.s f13, f3, f11 flt.s a0, f3, f14 fmv.s f12, f13 - bne a0, zero, label429 - j label453 + bne a0, zero, label421 + j label445 .p2align 2 label123: fadd.s f11, f11, f12 addiw a0, a0, 1 - lui a5, 262144 + fmv.w.x f12, s1 slti a1, a0, 10 - fmv.w.x f12, a5 - flt.s a2, f11, f12 - and a4, a1, a2 - bne a4, zero, label122 + flt.s a4, f11, f12 + and a2, a1, a4 + bne a2, zero, label122 fmv.w.x f14, zero addiw a3, a3, 1 li a0, 24 @@ -464,9 +462,9 @@ label113: flt.s a0, f11, f10 fsub.s f12, f10, f4 fmv.s f11, f12 - bne a0, zero, label423 + bne a0, zero, label415 fmv.s f11, f10 - j label423 + j label415 .p2align 2 label234: fmv.w.x f14, zero @@ -480,10 +478,10 @@ label128: ld ra, 0(sp) ld s10, 8(sp) ld s8, 16(sp) - ld s0, 24(sp) - ld s5, 32(sp) - ld s1, 40(sp) - ld s6, 48(sp) + ld s1, 24(sp) + ld s6, 32(sp) + ld s0, 40(sp) + ld s5, 48(sp) ld s2, 56(sp) ld s3, 64(sp) ld s4, 72(sp) @@ -493,11 +491,11 @@ label128: addi sp, sp, 104 ret .p2align 2 -label447: +label439: fmv.s f11, f10 flw f13, 24(s0) fadd.s f12, f10, f4 flt.s a0, f10, f13 fmv.s f10, f12 - bne a0, zero, label425 - j label452 + bne a0, zero, label417 + j label444 diff --git a/tests/SysY2022/performance/00_bitset1.arm.s b/tests/SysY2022/performance/00_bitset1.arm.s index a8ea6d50c..c512d7b89 100644 --- a/tests/SysY2022/performance/00_bitset1.arm.s +++ b/tests/SysY2022/performance/00_bitset1.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 40000 .text diff --git a/tests/SysY2022/performance/00_bitset1.riscv.s b/tests/SysY2022/performance/00_bitset1.riscv.s index d3120373d..0fa92f5ca 100644 --- a/tests/SysY2022/performance/00_bitset1.riscv.s +++ b/tests/SysY2022/performance/00_bitset1.riscv.s @@ -1,135 +1,133 @@ 
.attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 40000 .text .p2align 2 .globl main main: - addi sp, sp, -72 + addi sp, sp, -48 sd ra, 0(sp) sd s1, 8(sp) - sd s6, 16(sp) - sd s2, 24(sp) - sd s0, 32(sp) - sd s5, 40(sp) - sd s3, 48(sp) - sd s4, 56(sp) - sd s7, 64(sp) + sd s2, 16(sp) + sd s0, 24(sp) + sd s4, 32(sp) + sd s3, 40(sp) jal getint mv s1, a0 jal getint mv s2, a0 li a0, 56 jal _sysy_starttime - lui t5, 458130 - lui t4, 281475 - lui t3, 559241 -pcrel173: + lui t1, 281475 +pcrel166: auipc a2, %pcrel_hi(a) - lui a0, 4876 lui a3, 4878 - li a5, 1 - lui a4, 244141 - addiw t1, t3, -1911 - addi s0, a2, %pcrel_lo(pcrel173) + lui a0, 4876 + lui a5, 244141 + li a4, 1 + lui t2, 73 + addi s0, a2, %pcrel_lo(pcrel166) addiw a1, a0, -865 - addiw t3, t5, -635 addiw a2, a3, 725 - addiw a0, a4, -1529 + addiw a0, a5, -1529 lui a3, 524288 - lui a4, 73 - addiw t2, a3, 1 - addiw t0, a4, 992 - li a3, 64 - addiw a4, t4, -103 + addiw a5, t2, 992 + addiw t0, a3, 1 + addiw a3, t1, -103 ble s1, zero, label8 - mv t4, s2 + mv t2, s2 + mv t1, s1 j label2 .p2align 2 -label155: - addw t5, a6, s2 - sh2add a7, t6, s0 - sw t5, 0(a7) - ble s1, zero, label8 +label151: + addw t3, t5, a6 + sh2add t6, t4, s0 + sw t3, 0(t6) + ble t1, zero, label8 .p2align 2 label2: - mulw a6, t4, a1 - addw t4, a6, a2 - mul t5, t4, a4 - srli a6, t5, 63 - srai t6, t5, 60 - add s2, a6, t6 - mulw a7, s2, a0 - subw t5, t4, a7 - mv t6, t5 - bge t5, zero, label149 - addw t6, t5, a0 + mulw t5, t2, a1 + addw t2, t5, a2 + mul t3, t2, a3 + srli t6, t3, 63 + srai t4, t3, 60 + add t5, t6, t4 + mulw a6, t5, a0 + subw t3, t2, a6 + mv t4, t3 + bge t3, zero, label143 + addw t4, t3, a0 .p2align 2 -label149: - mulw a7, t6, a1 - addw t5, a7, a2 - mul t4, t5, a4 - srli a7, t4, 63 - srai a6, t4, 60 - add s2, a7, a6 - mulw s3, s2, a0 - subw a7, t5, s3 - mv t4, a7 - bge a7, zero, label151 - addw t4, a7, a0 +label143: + mulw t6, t4, a1 + addw t3, t6, a2 + mul t2, t3, a3 + srli t6, t2, 63 + srai t5, t2, 60 + add s1, t6, t5 + mulw a6, s1, a0 + subw a7, t3, a6 + mv t2, a7 + bge a7, zero, label145 + addw t2, a7, a0 .p2align 2 -label151: - andi t5, t4, 1 - mul a6, t6, t3 - addiw s1, s1, -1 - srli s4, a6, 63 - srai a7, a6, 49 - add s3, s4, a7 - mulw a6, s3, t0 - subw s2, t6, a6 - mul a7, s2, t1 - srli t6, a7, 32 - add a6, t6, s2 +label145: + andi t3, t2, 1 + lui a6, 458130 + lui s1, 559241 + li s4, 64 + addiw t1, t1, -1 + addiw a7, a6, -635 + mul t5, t4, a7 + srli a6, t5, 63 + srai t6, t5, 49 + add a7, a6, t6 + addiw t6, s1, -1911 + mulw t5, a7, a5 + subw a6, t4, t5 + mul a7, a6, t6 + srli t4, a7, 32 + add t5, t4, a6 + srliw a7, t5, 31 + sraiw t6, t5, 4 + add t4, a7, t6 + slliw t6, t4, 4 + sh2add s2, t4, s0 + subw s1, t6, t4 + lw t5, 0(s2) + slli s2, s1, 1 + subw a7, a6, s2 + subw s3, s4, a7 + slli a6, t5, 1 + sllw t6, a4, a7 + srl s2, a6, s3 + add s1, t5, s2 + sraw a6, s1, a7 srliw s3, a6, 31 - sraiw a7, a6, 4 - add t6, s3, a7 - slliw s6, t6, 4 - sh2add s4, t6, s0 - subw a7, s6, t6 - slli s5, a7, 1 - lw a6, 0(s4) - subw s3, s2, s5 - slli s4, a6, 1 - subw s7, a3, s3 - sllw a7, a5, s3 - srl s6, s4, s7 - add s5, a6, s6 - sraw s2, s5, s3 - srliw s7, s2, 31 - add s6, s2, s7 - andi s4, s6, -2 - subw s3, s2, s4 - beq t5, s3, label41 - andi s5, s2, 1 - xori s6, t5, 1 - mv s3, a7 - or s4, s5, s6 - beq s4, zero, label153 - mv s3, zero + add s2, a6, s3 + andi s1, s2, -2 + subw a7, a6, s1 + beq t3, a7, label41 + andi s2, a6, 1 + xori s3, t3, 1 + mv a7, t6 + or s1, s2, s3 + beq s1, zero, label149 + mv a7, 
zero .p2align 2 -label153: - and s6, s2, t2 - mv s2, s3 - xori s5, s6, 1 - or s4, t5, s5 - bne s4, zero, label155 - subw s2, s3, a7 - j label155 +label149: + and s3, a6, t0 + mv a6, a7 + xori s2, s3, 1 + or s1, t3, s2 + bne s1, zero, label151 + subw a6, a7, t6 + j label151 label8: - mv a0, a3 + li a0, 64 jal _sysy_stoptime mv a1, s0 li a2, 625 @@ -138,18 +136,15 @@ label8: ld ra, 0(sp) mv a0, zero ld s1, 8(sp) - ld s6, 16(sp) - ld s2, 24(sp) - ld s0, 32(sp) - ld s5, 40(sp) - ld s3, 48(sp) - ld s4, 56(sp) - ld s7, 64(sp) - addi sp, sp, 72 + ld s2, 16(sp) + ld s0, 24(sp) + ld s4, 32(sp) + ld s3, 40(sp) + addi sp, sp, 48 ret .p2align 2 label41: - sh2add t5, t6, s0 - sw a6, 0(t5) - bgt s1, zero, label2 + sh2add t3, t4, s0 + sw t5, 0(t3) + bgt t1, zero, label2 j label8 diff --git a/tests/SysY2022/performance/00_bitset2.arm.s b/tests/SysY2022/performance/00_bitset2.arm.s index a8ea6d50c..c512d7b89 100644 --- a/tests/SysY2022/performance/00_bitset2.arm.s +++ b/tests/SysY2022/performance/00_bitset2.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 40000 .text diff --git a/tests/SysY2022/performance/00_bitset2.riscv.s b/tests/SysY2022/performance/00_bitset2.riscv.s index d3120373d..0fa92f5ca 100644 --- a/tests/SysY2022/performance/00_bitset2.riscv.s +++ b/tests/SysY2022/performance/00_bitset2.riscv.s @@ -1,135 +1,133 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 40000 .text .p2align 2 .globl main main: - addi sp, sp, -72 + addi sp, sp, -48 sd ra, 0(sp) sd s1, 8(sp) - sd s6, 16(sp) - sd s2, 24(sp) - sd s0, 32(sp) - sd s5, 40(sp) - sd s3, 48(sp) - sd s4, 56(sp) - sd s7, 64(sp) + sd s2, 16(sp) + sd s0, 24(sp) + sd s4, 32(sp) + sd s3, 40(sp) jal getint mv s1, a0 jal getint mv s2, a0 li a0, 56 jal _sysy_starttime - lui t5, 458130 - lui t4, 281475 - lui t3, 559241 -pcrel173: + lui t1, 281475 +pcrel166: auipc a2, %pcrel_hi(a) - lui a0, 4876 lui a3, 4878 - li a5, 1 - lui a4, 244141 - addiw t1, t3, -1911 - addi s0, a2, %pcrel_lo(pcrel173) + lui a0, 4876 + lui a5, 244141 + li a4, 1 + lui t2, 73 + addi s0, a2, %pcrel_lo(pcrel166) addiw a1, a0, -865 - addiw t3, t5, -635 addiw a2, a3, 725 - addiw a0, a4, -1529 + addiw a0, a5, -1529 lui a3, 524288 - lui a4, 73 - addiw t2, a3, 1 - addiw t0, a4, 992 - li a3, 64 - addiw a4, t4, -103 + addiw a5, t2, 992 + addiw t0, a3, 1 + addiw a3, t1, -103 ble s1, zero, label8 - mv t4, s2 + mv t2, s2 + mv t1, s1 j label2 .p2align 2 -label155: - addw t5, a6, s2 - sh2add a7, t6, s0 - sw t5, 0(a7) - ble s1, zero, label8 +label151: + addw t3, t5, a6 + sh2add t6, t4, s0 + sw t3, 0(t6) + ble t1, zero, label8 .p2align 2 label2: - mulw a6, t4, a1 - addw t4, a6, a2 - mul t5, t4, a4 - srli a6, t5, 63 - srai t6, t5, 60 - add s2, a6, t6 - mulw a7, s2, a0 - subw t5, t4, a7 - mv t6, t5 - bge t5, zero, label149 - addw t6, t5, a0 + mulw t5, t2, a1 + addw t2, t5, a2 + mul t3, t2, a3 + srli t6, t3, 63 + srai t4, t3, 60 + add t5, t6, t4 + mulw a6, t5, a0 + subw t3, t2, a6 + mv t4, t3 + bge t3, zero, label143 + addw t4, t3, a0 .p2align 2 -label149: - mulw a7, t6, a1 - addw t5, a7, a2 - mul t4, t5, a4 - srli a7, t4, 63 - srai a6, t4, 60 - add s2, a7, a6 - mulw s3, s2, a0 - subw a7, t5, s3 - mv t4, a7 - bge a7, zero, label151 - addw t4, a7, a0 +label143: + mulw t6, t4, a1 + addw t3, t6, a2 + mul t2, t3, a3 + srli t6, t2, 63 + srai t5, t2, 60 + add s1, t6, t5 + mulw a6, s1, a0 + subw a7, t3, a6 + mv t2, a7 + bge a7, zero, label145 + addw t2, a7, a0 .p2align 2 -label151: - andi t5, t4, 1 - 
mul a6, t6, t3 - addiw s1, s1, -1 - srli s4, a6, 63 - srai a7, a6, 49 - add s3, s4, a7 - mulw a6, s3, t0 - subw s2, t6, a6 - mul a7, s2, t1 - srli t6, a7, 32 - add a6, t6, s2 +label145: + andi t3, t2, 1 + lui a6, 458130 + lui s1, 559241 + li s4, 64 + addiw t1, t1, -1 + addiw a7, a6, -635 + mul t5, t4, a7 + srli a6, t5, 63 + srai t6, t5, 49 + add a7, a6, t6 + addiw t6, s1, -1911 + mulw t5, a7, a5 + subw a6, t4, t5 + mul a7, a6, t6 + srli t4, a7, 32 + add t5, t4, a6 + srliw a7, t5, 31 + sraiw t6, t5, 4 + add t4, a7, t6 + slliw t6, t4, 4 + sh2add s2, t4, s0 + subw s1, t6, t4 + lw t5, 0(s2) + slli s2, s1, 1 + subw a7, a6, s2 + subw s3, s4, a7 + slli a6, t5, 1 + sllw t6, a4, a7 + srl s2, a6, s3 + add s1, t5, s2 + sraw a6, s1, a7 srliw s3, a6, 31 - sraiw a7, a6, 4 - add t6, s3, a7 - slliw s6, t6, 4 - sh2add s4, t6, s0 - subw a7, s6, t6 - slli s5, a7, 1 - lw a6, 0(s4) - subw s3, s2, s5 - slli s4, a6, 1 - subw s7, a3, s3 - sllw a7, a5, s3 - srl s6, s4, s7 - add s5, a6, s6 - sraw s2, s5, s3 - srliw s7, s2, 31 - add s6, s2, s7 - andi s4, s6, -2 - subw s3, s2, s4 - beq t5, s3, label41 - andi s5, s2, 1 - xori s6, t5, 1 - mv s3, a7 - or s4, s5, s6 - beq s4, zero, label153 - mv s3, zero + add s2, a6, s3 + andi s1, s2, -2 + subw a7, a6, s1 + beq t3, a7, label41 + andi s2, a6, 1 + xori s3, t3, 1 + mv a7, t6 + or s1, s2, s3 + beq s1, zero, label149 + mv a7, zero .p2align 2 -label153: - and s6, s2, t2 - mv s2, s3 - xori s5, s6, 1 - or s4, t5, s5 - bne s4, zero, label155 - subw s2, s3, a7 - j label155 +label149: + and s3, a6, t0 + mv a6, a7 + xori s2, s3, 1 + or s1, t3, s2 + bne s1, zero, label151 + subw a6, a7, t6 + j label151 label8: - mv a0, a3 + li a0, 64 jal _sysy_stoptime mv a1, s0 li a2, 625 @@ -138,18 +136,15 @@ label8: ld ra, 0(sp) mv a0, zero ld s1, 8(sp) - ld s6, 16(sp) - ld s2, 24(sp) - ld s0, 32(sp) - ld s5, 40(sp) - ld s3, 48(sp) - ld s4, 56(sp) - ld s7, 64(sp) - addi sp, sp, 72 + ld s2, 16(sp) + ld s0, 24(sp) + ld s4, 32(sp) + ld s3, 40(sp) + addi sp, sp, 48 ret .p2align 2 label41: - sh2add t5, t6, s0 - sw a6, 0(t5) - bgt s1, zero, label2 + sh2add t3, t4, s0 + sw t5, 0(t3) + bgt t1, zero, label2 j label8 diff --git a/tests/SysY2022/performance/00_bitset3.arm.s b/tests/SysY2022/performance/00_bitset3.arm.s index a8ea6d50c..c512d7b89 100644 --- a/tests/SysY2022/performance/00_bitset3.arm.s +++ b/tests/SysY2022/performance/00_bitset3.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 40000 .text diff --git a/tests/SysY2022/performance/00_bitset3.riscv.s b/tests/SysY2022/performance/00_bitset3.riscv.s index d3120373d..0fa92f5ca 100644 --- a/tests/SysY2022/performance/00_bitset3.riscv.s +++ b/tests/SysY2022/performance/00_bitset3.riscv.s @@ -1,135 +1,133 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 40000 .text .p2align 2 .globl main main: - addi sp, sp, -72 + addi sp, sp, -48 sd ra, 0(sp) sd s1, 8(sp) - sd s6, 16(sp) - sd s2, 24(sp) - sd s0, 32(sp) - sd s5, 40(sp) - sd s3, 48(sp) - sd s4, 56(sp) - sd s7, 64(sp) + sd s2, 16(sp) + sd s0, 24(sp) + sd s4, 32(sp) + sd s3, 40(sp) jal getint mv s1, a0 jal getint mv s2, a0 li a0, 56 jal _sysy_starttime - lui t5, 458130 - lui t4, 281475 - lui t3, 559241 -pcrel173: + lui t1, 281475 +pcrel166: auipc a2, %pcrel_hi(a) - lui a0, 4876 lui a3, 4878 - li a5, 1 - lui a4, 244141 - addiw t1, t3, -1911 - addi s0, a2, %pcrel_lo(pcrel173) + lui a0, 4876 + lui a5, 244141 + li a4, 1 + lui t2, 73 + addi s0, a2, %pcrel_lo(pcrel166) addiw a1, a0, -865 - addiw t3, 
t5, -635 addiw a2, a3, 725 - addiw a0, a4, -1529 + addiw a0, a5, -1529 lui a3, 524288 - lui a4, 73 - addiw t2, a3, 1 - addiw t0, a4, 992 - li a3, 64 - addiw a4, t4, -103 + addiw a5, t2, 992 + addiw t0, a3, 1 + addiw a3, t1, -103 ble s1, zero, label8 - mv t4, s2 + mv t2, s2 + mv t1, s1 j label2 .p2align 2 -label155: - addw t5, a6, s2 - sh2add a7, t6, s0 - sw t5, 0(a7) - ble s1, zero, label8 +label151: + addw t3, t5, a6 + sh2add t6, t4, s0 + sw t3, 0(t6) + ble t1, zero, label8 .p2align 2 label2: - mulw a6, t4, a1 - addw t4, a6, a2 - mul t5, t4, a4 - srli a6, t5, 63 - srai t6, t5, 60 - add s2, a6, t6 - mulw a7, s2, a0 - subw t5, t4, a7 - mv t6, t5 - bge t5, zero, label149 - addw t6, t5, a0 + mulw t5, t2, a1 + addw t2, t5, a2 + mul t3, t2, a3 + srli t6, t3, 63 + srai t4, t3, 60 + add t5, t6, t4 + mulw a6, t5, a0 + subw t3, t2, a6 + mv t4, t3 + bge t3, zero, label143 + addw t4, t3, a0 .p2align 2 -label149: - mulw a7, t6, a1 - addw t5, a7, a2 - mul t4, t5, a4 - srli a7, t4, 63 - srai a6, t4, 60 - add s2, a7, a6 - mulw s3, s2, a0 - subw a7, t5, s3 - mv t4, a7 - bge a7, zero, label151 - addw t4, a7, a0 +label143: + mulw t6, t4, a1 + addw t3, t6, a2 + mul t2, t3, a3 + srli t6, t2, 63 + srai t5, t2, 60 + add s1, t6, t5 + mulw a6, s1, a0 + subw a7, t3, a6 + mv t2, a7 + bge a7, zero, label145 + addw t2, a7, a0 .p2align 2 -label151: - andi t5, t4, 1 - mul a6, t6, t3 - addiw s1, s1, -1 - srli s4, a6, 63 - srai a7, a6, 49 - add s3, s4, a7 - mulw a6, s3, t0 - subw s2, t6, a6 - mul a7, s2, t1 - srli t6, a7, 32 - add a6, t6, s2 +label145: + andi t3, t2, 1 + lui a6, 458130 + lui s1, 559241 + li s4, 64 + addiw t1, t1, -1 + addiw a7, a6, -635 + mul t5, t4, a7 + srli a6, t5, 63 + srai t6, t5, 49 + add a7, a6, t6 + addiw t6, s1, -1911 + mulw t5, a7, a5 + subw a6, t4, t5 + mul a7, a6, t6 + srli t4, a7, 32 + add t5, t4, a6 + srliw a7, t5, 31 + sraiw t6, t5, 4 + add t4, a7, t6 + slliw t6, t4, 4 + sh2add s2, t4, s0 + subw s1, t6, t4 + lw t5, 0(s2) + slli s2, s1, 1 + subw a7, a6, s2 + subw s3, s4, a7 + slli a6, t5, 1 + sllw t6, a4, a7 + srl s2, a6, s3 + add s1, t5, s2 + sraw a6, s1, a7 srliw s3, a6, 31 - sraiw a7, a6, 4 - add t6, s3, a7 - slliw s6, t6, 4 - sh2add s4, t6, s0 - subw a7, s6, t6 - slli s5, a7, 1 - lw a6, 0(s4) - subw s3, s2, s5 - slli s4, a6, 1 - subw s7, a3, s3 - sllw a7, a5, s3 - srl s6, s4, s7 - add s5, a6, s6 - sraw s2, s5, s3 - srliw s7, s2, 31 - add s6, s2, s7 - andi s4, s6, -2 - subw s3, s2, s4 - beq t5, s3, label41 - andi s5, s2, 1 - xori s6, t5, 1 - mv s3, a7 - or s4, s5, s6 - beq s4, zero, label153 - mv s3, zero + add s2, a6, s3 + andi s1, s2, -2 + subw a7, a6, s1 + beq t3, a7, label41 + andi s2, a6, 1 + xori s3, t3, 1 + mv a7, t6 + or s1, s2, s3 + beq s1, zero, label149 + mv a7, zero .p2align 2 -label153: - and s6, s2, t2 - mv s2, s3 - xori s5, s6, 1 - or s4, t5, s5 - bne s4, zero, label155 - subw s2, s3, a7 - j label155 +label149: + and s3, a6, t0 + mv a6, a7 + xori s2, s3, 1 + or s1, t3, s2 + bne s1, zero, label151 + subw a6, a7, t6 + j label151 label8: - mv a0, a3 + li a0, 64 jal _sysy_stoptime mv a1, s0 li a2, 625 @@ -138,18 +136,15 @@ label8: ld ra, 0(sp) mv a0, zero ld s1, 8(sp) - ld s6, 16(sp) - ld s2, 24(sp) - ld s0, 32(sp) - ld s5, 40(sp) - ld s3, 48(sp) - ld s4, 56(sp) - ld s7, 64(sp) - addi sp, sp, 72 + ld s2, 16(sp) + ld s0, 24(sp) + ld s4, 32(sp) + ld s3, 40(sp) + addi sp, sp, 48 ret .p2align 2 label41: - sh2add t5, t6, s0 - sw a6, 0(t5) - bgt s1, zero, label2 + sh2add t3, t4, s0 + sw t5, 0(t3) + bgt t1, zero, label2 j label8 diff --git a/tests/SysY2022/performance/01_mm1.arm.s 
b/tests/SysY2022/performance/01_mm1.arm.s index e424c6434..344f0aa0b 100644 --- a/tests/SysY2022/performance/01_mm1.arm.s +++ b/tests/SysY2022/performance/01_mm1.arm.s @@ -1,22 +1,22 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 A: .zero 4194304 -.align 8 +.p2align 3 B: .zero 4194304 -.align 8 +.p2align 3 C: .zero 4194304 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 .text diff --git a/tests/SysY2022/performance/01_mm1.riscv.s b/tests/SysY2022/performance/01_mm1.riscv.s index bb4605631..563a93453 100644 --- a/tests/SysY2022/performance/01_mm1.riscv.s +++ b/tests/SysY2022/performance/01_mm1.riscv.s @@ -1,28 +1,28 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 A: .zero 4194304 -.align 8 +.p2align 3 B: .zero 4194304 -.align 8 +.p2align 3 C: .zero 4194304 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_4: .zero 4 .text @@ -43,112 +43,112 @@ main: sd s9, 80(sp) sd s10, 88(sp) jal getint -pcrel1067: +pcrel1390: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel1068: +pcrel1391: auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - lui s6, 1 li s3, 5 -pcrel1069: + lui s6, 1 +pcrel1392: auipc a1, %pcrel_hi(cmmc_parallel_body_1) mv s0, a0 - addi s1, a1, %pcrel_lo(pcrel1069) -pcrel1070: + addi s1, a1, %pcrel_lo(pcrel1392) +pcrel1393: auipc a0, %pcrel_hi(cmmc_parallel_body_3) - addi s2, a0, %pcrel_lo(pcrel1070) - ble s0, zero, label907 -pcrel1071: + addi s2, a0, %pcrel_lo(pcrel1393) + ble s0, zero, label1250 +pcrel1394: auipc a0, %pcrel_hi(A) mv s8, zero - addi s7, a0, %pcrel_lo(pcrel1071) + addi s7, a0, %pcrel_lo(pcrel1394) mv s9, s7 mv s10, zero - j label923 + j label1234 .p2align 2 -label989: +label1273: addiw s8, s8, 1 - ble s0, s8, label1059 + ble s0, s8, label1381 add s7, s7, s6 mv s10, zero mv s9, s7 .p2align 2 -label923: +label1234: jal getint addiw s10, s10, 1 sw a0, 0(s9) - ble s0, s10, label989 + ble s0, s10, label1273 addi s9, s9, 4 - j label923 -label907: + j label1234 +label1250: li a0, 65 jal _sysy_starttime mv s6, zero - j label908 -label910: + j label1251 +label1253: addiw s6, s6, 1 - bge s6, s3, label911 + bge s6, s3, label1254 .p2align 2 -label908: - ble s0, zero, label910 -pcrel1072: +label1251: + ble s0, zero, label1253 +pcrel1395: auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - sw s0, %pcrel_lo(pcrel1072)(s4) + sw s0, %pcrel_lo(pcrel1395)(s4) mv a0, zero mv a1, s0 mv a2, s1 jal cmmcParallelFor - ble s0, zero, label917 -pcrel1073: + ble s0, zero, label1260 +pcrel1396: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel1074: +pcrel1397: auipc a3, %pcrel_hi(cmmc_parallel_body_2) - sw s0, %pcrel_lo(pcrel1073)(a0) - addi a2, a3, %pcrel_lo(pcrel1074) + sw s0, %pcrel_lo(pcrel1396)(a0) + addi a2, a3, %pcrel_lo(pcrel1397) mv a1, s0 mv a0, zero jal cmmcParallelFor .p2align 2 -label917: +label1260: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) - sw s0, %pcrel_lo(label917)(s5) + sw s0, %pcrel_lo(label1260)(s5) mv a0, zero mv a1, s0 mv a2, s2 jal cmmcParallelFor - ble s0, zero, label910 -pcrel1075: + ble s0, zero, label1253 +pcrel1398: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel1076: +pcrel1399: auipc a3, 
%pcrel_hi(cmmc_parallel_body_4) - sw s0, %pcrel_lo(pcrel1075)(a0) - addi a2, a3, %pcrel_lo(pcrel1076) + sw s0, %pcrel_lo(pcrel1398)(a0) + addi a2, a3, %pcrel_lo(pcrel1399) mv a1, s0 mv a0, zero jal cmmcParallelFor addiw s6, s6, 1 - blt s6, s3, label908 -label911: - ble s0, zero, label953 -pcrel1077: + blt s6, s3, label1251 +label1254: + ble s0, zero, label1306 +pcrel1400: auipc s1, %pcrel_hi(cmmc_parallel_body_payload_0) slli a0, s0, 32 -pcrel1078: +pcrel1401: auipc a3, %pcrel_hi(cmmc_parallel_body_0) - sd a0, %pcrel_lo(pcrel1077)(s1) - addi a2, a3, %pcrel_lo(pcrel1078) + sd a0, %pcrel_lo(pcrel1400)(s1) + addi a2, a3, %pcrel_lo(pcrel1401) mv a1, s0 mv a0, zero jal cmmcParallelFor - lw s0, %pcrel_lo(pcrel1077)(s1) -label912: + lw s0, %pcrel_lo(pcrel1400)(s1) +label1255: li a0, 84 jal _sysy_stoptime mv a0, s0 jal putint li a0, 10 jal putch - ld ra, 0(sp) mv a0, zero + ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) ld s1, 24(sp) @@ -162,713 +162,941 @@ label912: ld s10, 88(sp) addi sp, sp, 96 ret -label1059: +label1381: auipc a0, %pcrel_hi(B) mv s8, zero mv s10, zero - addi s7, a0, %pcrel_lo(label1059) + addi s7, a0, %pcrel_lo(label1381) mv s9, s7 - j label933 + j label1244 .p2align 2 -label938: +label1249: addi s9, s9, 4 .p2align 2 -label933: +label1244: jal getint addiw s10, s10, 1 sw a0, 0(s9) - bgt s0, s10, label938 + bgt s0, s10, label1249 addiw s8, s8, 1 - ble s0, s8, label907 + ble s0, s8, label1250 add s7, s7, s6 mv s10, zero mv s9, s7 - j label933 -label953: + j label1244 +label1306: mv s0, zero - j label912 + j label1255 .p2align 2 cmmc_parallel_body_0: - mv t0, a0 -pcrel146: + addi sp, sp, -8 + mv t1, a0 +pcrel289: auipc a4, %pcrel_hi(cmmc_parallel_body_payload_0) lui a2, 1 - addi a3, a4, %pcrel_lo(pcrel146) + sd s0, 0(sp) + addi a3, a4, %pcrel_lo(pcrel289) lw a0, 4(a3) bgt a0, zero, label2 - mv t2, zero -label35: - amoadd.w.aqrl a0, t2, (a3) + mv t1, zero +label65: + amoadd.w.aqrl a0, t1, (a3) + ld s0, 0(sp) + addi sp, sp, 8 ret label2: - addiw a5, a0, -7 -pcrel147: - auipc t2, %pcrel_hi(B) - li t1, 7 - addi a4, t2, %pcrel_lo(pcrel147) - bgt a0, t1, label3 - slli t1, t0, 12 - mv a5, t0 + auipc t0, %pcrel_hi(B) + li a4, 3 + addi a5, t0, %pcrel_lo(label2) + ble a0, a4, label80 + addiw a4, a0, -3 + addiw t0, a0, -18 + li t2, 15 + bgt a4, t2, label24 + slli t2, t1, 12 + mv t0, t1 + mv t3, zero + add a5, a5, t2 + mv t1, a5 mv t2, zero - add a4, a4, t1 + j label9 +.p2align 2 +label124: + addiw t0, t0, 1 + ble a1, t0, label271 +.p2align 2 +label17: + add a5, a5, a2 + lw t6, 0(a5) + mv t1, a5 + lw a7, 4(a5) + addw t5, t2, t6 + lw a6, 8(a5) + li t2, 4 + addw t4, t5, a7 + lw t5, 12(a5) + addw t6, t4, a6 + addw t3, t6, t5 + ble a4, t2, label104 +.p2align 2 +label13: + addi t1, t1, 16 +.p2align 2 +label9: + lw t6, 0(t1) + addiw t2, t2, 4 + lw a7, 4(t1) + addw t5, t3, t6 + lw a6, 8(t1) + addw t4, t5, a7 + lw t5, 12(t1) + addw t6, t4, a6 + addw t3, t6, t5 + bgt a4, t2, label13 +.p2align 2 +label104: + ble a0, t2, label263 + sh2add t1, t2, a5 + mv t4, t2 + mv t2, t3 +.p2align 2 +label19: + lw t5, 0(t1) + addiw t4, t4, 1 + addw t2, t2, t5 + ble a0, t4, label124 + addi t1, t1, 4 + j label19 +label24: + slli t3, t1, 12 + mv t4, zero + add a5, a5, t3 + mv t2, a5 + mv t3, zero + j label29 +.p2align 2 +label203: + addiw t1, t1, 1 + ble a1, t1, label193 +.p2align 2 +label40: + add a5, a5, a2 + mv t4, t3 + mv t2, a5 + mv t3, zero +.p2align 2 +label29: + lw a6, 0(t2) + addiw t3, t3, 16 + lw a7, 4(t2) + addw t6, t4, a6 + lw s0, 8(t2) + addw t5, t6, a7 + lw a6, 12(t2) + addw t4, t5, s0 + lw a7, 16(t2) + addw t6, 
t4, a6 + lw t4, 20(t2) + addw t5, t6, a7 + lw a7, 24(t2) + addw a6, t5, t4 + lw t5, 28(t2) + addw t4, a6, a7 + lw a6, 32(t2) + addw t6, t4, t5 + lw a7, 36(t2) + addw t5, t6, a6 + lw t6, 40(t2) + addw t4, t5, a7 + lw a7, 44(t2) + addw a6, t4, t6 + lw t5, 48(t2) + addw t6, a6, a7 + lw a7, 52(t2) + addw t4, t6, t5 + lw a6, 56(t2) + addw t5, t4, a7 + lw a7, 60(t2) + addw t6, t5, a6 + addw t4, t6, a7 + ble t0, t3, label179 + addi t2, t2, 64 + j label29 +.p2align 2 +label179: + ble a4, t3, label265 + sh2add t2, t3, a5 +.p2align 2 +label48: + lw t6, 0(t2) + addiw t3, t3, 4 + lw a7, 4(t2) + addw t5, t4, t6 + lw t4, 8(t2) + addw a6, t5, a7 + lw t5, 12(t2) + addw t6, a6, t4 + addw t4, t6, t5 + ble a4, t3, label222 + addi t2, t2, 16 + j label48 +.p2align 2 +label222: + mv t5, t4 + ble a0, t3, label272 +.p2align 2 +label41: + sh2add t2, t3, a5 + mv t4, t3 + mv t3, t5 +.p2align 2 +label42: + lw t6, 0(t2) + addiw t4, t4, 1 + addw t3, t3, t6 + ble a0, t4, label203 + addi t2, t2, 4 + j label42 +label272: + mv t3, t5 + addiw t1, t1, 1 + bgt a1, t1, label40 +label193: + mv t1, t3 + j label65 +label271: + mv t1, t2 + j label65 +label263: + mv t2, t3 + addiw t0, t0, 1 + bgt a1, t0, label17 + j label271 +label265: + mv t5, t4 + bgt a0, t3, label41 + j label272 +label80: + slli t0, t1, 12 + mv t2, zero + add a4, a5, t0 mv t0, a4 + mv a5, t1 mv t1, zero - j label28 + j label58 .p2align 2 -label32: +label62: addi t0, t0, 4 .p2align 2 -label28: +label58: lw t3, 0(t0) addiw t2, t2, 1 addw t1, t1, t3 - bgt a0, t2, label32 + bgt a0, t2, label62 addiw a5, a5, 1 - ble a1, a5, label137 + ble a1, a5, label65 .p2align 2 -label34: +label64: add a4, a4, a2 li t2, 1 lw t3, 0(a4) mv t0, a4 addw t1, t1, t3 - bgt a0, t2, label32 + bgt a0, t2, label62 addiw a5, a5, 1 - bgt a1, a5, label34 -label137: - mv t2, t1 - j label35 -label3: - slli t1, t0, 12 - mv t3, zero - add a4, a4, t1 - mv t1, a4 - mv t2, zero - j label8 -.p2align 2 -label95: - addiw t0, t0, 1 - ble a1, t0, label35 + bgt a1, a5, label64 + j label65 .p2align 2 -label22: - add a4, a4, a2 - mv t3, t2 - mv t1, a4 - mv t2, zero +cmmc_parallel_body_1: + mv t0, a0 +pcrel529: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) +pcrel530: + auipc a5, %pcrel_hi(C) + li a3, 3 + lw a0, %pcrel_lo(pcrel529)(a2) + addi a4, a5, %pcrel_lo(pcrel530) + lui a2, 1 + bgt a0, a3, label291 + bgt a0, zero, label333 +label331: + ret +label333: + slliw a5, t0, 12 + add a3, a4, a5 + mv a4, t0 + mv a5, a3 + mv t0, zero + j label337 .p2align 2 -label8: - lw t5, 0(t1) - addiw t2, t2, 8 - lw a6, 4(t1) - addw t4, t3, t5 - lw t3, 8(t1) - addw t6, t4, a6 - lw a6, 12(t1) - addw t5, t6, t3 - lw t6, 16(t1) - addw t4, t5, a6 - lw a6, 20(t1) - addw t3, t4, t6 - lw t6, 24(t1) - addw t5, t3, a6 - lw a6, 28(t1) - addw t4, t5, t6 - addw t3, t4, a6 - ble a5, t2, label81 - addi t1, t1, 32 - j label8 -.p2align 2 -label81: - ble a0, t2, label134 - sh2add t1, t2, a4 - mv t4, t2 - mv t2, t3 +label340: + addi a5, a5, 4 .p2align 2 -label15: - lw t5, 0(t1) - addiw t4, t4, 1 - addw t2, t2, t5 - ble a0, t4, label95 - addi t1, t1, 4 - j label15 -label134: - mv t2, t3 +label337: addiw t0, t0, 1 - bgt a1, t0, label22 - j label35 + sw zero, 0(a5) + bgt a0, t0, label340 + addiw a4, a4, 1 + ble a1, a4, label331 .p2align 2 -cmmc_parallel_body_1: - mv a5, a0 -pcrel273: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) -pcrel274: - auipc t1, %pcrel_hi(C) - li t0, 7 - lw a0, %pcrel_lo(pcrel273)(a2) - addi a4, t1, %pcrel_lo(pcrel274) - lui a2, 1 - addi a3, a0, -7 - bgt a0, t0, label149 - bgt a0, zero, label166 - j label176 
-label149: - slliw t0, a5, 12 - add a4, a4, t0 - mv t0, a4 +label342: + add a3, a3, a2 + li t0, 1 + sw zero, 0(a3) + mv a5, a3 + bgt a0, t0, label340 + addiw a4, a4, 1 + bgt a1, a4, label342 + j label331 +label291: + addiw a3, a0, -3 + addiw a5, a0, -18 + li t1, 15 + bgt a3, t1, label292 + slliw t1, t0, 12 + mv a5, t0 + add a4, a4, t1 mv t1, zero - j label153 -.p2align 2 -label162: - addi t0, t0, 4 + mv t0, a4 + j label319 .p2align 2 -label159: - addiw t2, t2, 1 - sw zero, 0(t0) - bgt a0, t2, label162 +label435: addiw a5, a5, 1 - ble a1, a5, label176 + ble a1, a5, label331 .p2align 2 -label164: +label329: add a4, a4, a2 - li t1, 8 + li t1, 4 sd zero, 0(a4) mv t0, a4 sd zero, 8(a4) - sd zero, 16(a4) - sd zero, 24(a4) - ble a3, t1, label265 + ble a3, t1, label515 .p2align 2 -label156: - addi t0, t0, 32 +label330: + addi t0, t0, 16 .p2align 2 -label153: - addiw t1, t1, 8 +label319: + addiw t1, t1, 4 sd zero, 0(t0) sd zero, 8(t0) - sd zero, 16(t0) - sd zero, 24(t0) - bgt a3, t1, label156 - ble a0, t1, label261 + bgt a3, t1, label330 + ble a0, t1, label428 .p2align 2 -label158: +label323: sh2add t0, t1, a4 mv t2, t1 - j label159 -label265: - bgt a0, t1, label158 .p2align 2 -label261: - addiw a5, a5, 1 - bgt a1, a5, label164 -label176: - ret -label166: - slliw t0, a5, 12 - add a3, a4, t0 - mv a4, a5 - mv a5, a3 - mv t0, zero - j label170 +label324: + addiw t2, t2, 1 + sw zero, 0(t0) + ble a0, t2, label435 + addi t0, t0, 4 + j label324 +label515: + bgt a0, t1, label323 .p2align 2 -label175: - addi a5, a5, 4 +label428: + addiw a5, a5, 1 + bgt a1, a5, label329 + j label331 +label292: + slliw t1, t0, 12 + add a4, a4, t1 + mv t1, a4 + mv t2, zero + j label296 .p2align 2 -label170: +label412: addiw t0, t0, 1 - sw zero, 0(a5) - bgt a0, t0, label175 - addiw a4, a4, 1 - ble a1, a4, label176 + ble a1, t0, label331 .p2align 2 -label174: - add a3, a3, a2 - li t0, 1 - sw zero, 0(a3) - mv a5, a3 - bgt a0, t0, label175 - addiw a4, a4, 1 - bgt a1, a4, label174 - j label176 +label308: + add a4, a4, a2 + li t2, 16 + sd zero, 0(a4) + mv t1, a4 + sd zero, 8(a4) + sd zero, 16(a4) + sd zero, 24(a4) + sd zero, 32(a4) + sd zero, 40(a4) + sd zero, 48(a4) + sd zero, 56(a4) + ble a5, t2, label299 +.p2align 2 +label314: + addi t1, t1, 64 +.p2align 2 +label296: + addiw t2, t2, 16 + sd zero, 0(t1) + sd zero, 8(t1) + sd zero, 16(t1) + sd zero, 24(t1) + sd zero, 32(t1) + sd zero, 40(t1) + sd zero, 48(t1) + sd zero, 56(t1) + bgt a5, t2, label314 +.p2align 2 +label299: + ble a3, t2, label384 + sh2add t1, t2, a4 + mv t3, t2 + j label301 +.p2align 2 +label304: + addi t1, t1, 16 +.p2align 2 +label301: + addiw t3, t3, 4 + sd zero, 0(t1) + sd zero, 8(t1) + bgt a3, t3, label304 + ble a0, t3, label508 +.p2align 2 +label309: + sh2add t1, t3, a4 + mv t2, t3 +.p2align 2 +label310: + addiw t2, t2, 1 + sw zero, 0(t1) + ble a0, t2, label412 + addi t1, t1, 4 + j label310 +.p2align 2 +label384: + mv t3, t2 + bgt a0, t2, label309 + addiw t0, t0, 1 + bgt a1, t0, label308 + j label331 +label508: + addiw t0, t0, 1 + bgt a1, t0, label308 + j label331 .p2align 2 cmmc_parallel_body_2: addi sp, sp, -48 mv t2, a0 mv a4, a1 -pcrel522: +pcrel753: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel523: - auipc a5, %pcrel_hi(A) -pcrel524: +pcrel754: + auipc t1, %pcrel_hi(C) +pcrel755: auipc t3, %pcrel_hi(B) - li t0, 7 -pcrel525: - auipc a1, %pcrel_hi(C) - addi a3, a5, %pcrel_lo(pcrel523) sd s0, 0(sp) - addi a5, t3, %pcrel_lo(pcrel524) +pcrel756: + auipc a1, %pcrel_hi(A) + addi a5, t3, %pcrel_lo(pcrel755) + addi a3, a1, %pcrel_lo(pcrel756) sd s5, 8(sp) 
+ lui a1, 1 sd s1, 16(sp) sd s3, 24(sp) sd s2, 32(sp) sd s4, 40(sp) - lw a0, %pcrel_lo(pcrel522)(a2) - addi a2, a1, %pcrel_lo(pcrel525) - addi t1, a0, -7 - lui a1, 1 - bgt a0, t0, label276 - bgt a0, zero, label302 -label300: - ld s0, 0(sp) - ld s5, 8(sp) - ld s1, 16(sp) - ld s3, 24(sp) - ld s2, 32(sp) - ld s4, 40(sp) - addi sp, sp, 48 - ret -label276: + lw a0, %pcrel_lo(pcrel753)(a2) + addi a2, t1, %pcrel_lo(pcrel754) + addi t0, a0, -3 + li t1, 3 + bgt a0, t1, label532 + ble a0, zero, label556 + slliw t1, t2, 12 + add t0, a5, t1 + mv a5, t2 + mv t1, a3 + mv t4, zero + bgt a0, zero, label567 + j label565 +.p2align 2 +label572: + add t1, t1, a1 + mv t4, t2 + ble a0, t2, label735 +.p2align 2 +label567: + sh2add t5, a5, t1 + addiw t2, t4, 1 + lw t3, 0(t5) + beq t3, zero, label574 + slliw a6, t4, 12 + mv t5, t0 + mv t6, zero + add t4, a2, a6 +.p2align 2 +label569: + lw s0, 0(t5) + sh2add a7, t6, t4 + addiw t6, t6, 1 + mulw a6, t3, s0 + amoadd.w.aqrl s1, a6, (a7) + ble a0, t6, label572 + addi t5, t5, 4 + j label569 +.p2align 2 +label574: + add t1, t1, a1 + mv t4, t2 + bgt a0, t2, label567 + addiw a5, a5, 1 + ble a4, a5, label556 +.p2align 2 +label566: + add t0, t0, a1 + mv t1, a3 + mv t4, zero + bgt a0, zero, label567 +label565: + addiw a5, a5, 1 + bgt a4, a5, label566 + j label556 +label532: slliw t3, t2, 12 - add t0, a5, t3 + add t1, a5, t3 mv t3, a3 mv t5, zero - bgt a0, zero, label283 - j label339 + bgt a0, zero, label541 + j label539 +.p2align 2 +label553: + addi t6, t6, 4 .p2align 2 -label284: +label549: + lw s1, 0(t6) + sh2add s0, a6, t5 + addiw a6, a6, 1 + mulw a7, a5, s1 + amoadd.w.aqrl s2, a7, (s0) + bgt a0, a6, label553 add t3, t3, a1 mv t5, t4 - ble a0, t4, label498 + ble a0, t4, label733 .p2align 2 -label283: +label541: sh2add t6, t2, t3 addiw t4, t5, 1 lw a5, 0(t6) - beq a5, zero, label284 + bne a5, zero, label609 + add t3, t3, a1 + mv t5, t4 + bgt a0, t4, label541 + addiw t2, t2, 1 + ble a4, t2, label556 +.p2align 2 +label540: + add t1, t1, a1 + mv t3, a3 + mv t5, zero + bgt a0, zero, label541 +label539: + addiw t2, t2, 1 + bgt a4, t2, label540 + j label556 +.p2align 2 +label609: slliw a7, t5, 12 + mv t6, t1 mv a6, zero add t5, a2, a7 - mv t6, t5 - j label286 -.p2align 2 -label289: - addi t6, t6, 32 .p2align 2 -label286: - sh2add a7, a6, t0 - addi s4, t6, 4 - addiw a6, a6, 8 - lw s0, 0(a7) +label544: + lw s0, 0(t6) + sh2add a7, a6, t5 + addiw a6, a6, 4 mulw s1, a5, s0 - amoadd.w.aqrl s3, s1, (t6) - lw s2, 4(a7) + amoadd.w.aqrl s3, s1, (a7) + addi s3, a7, 4 + lw s2, 4(t6) mulw s0, a5, s2 - addi s2, t6, 8 - amoadd.w.aqrl s5, s0, (s4) - lw s3, 8(a7) - mulw s1, a5, s3 - amoadd.w.aqrl s4, s1, (s2) - addi s2, t6, 12 - lw s3, 12(a7) - mulw s0, a5, s3 - amoadd.w.aqrl s4, s0, (s2) - addi s2, t6, 16 - lw s3, 16(a7) - mulw s1, a5, s3 - amoadd.w.aqrl s4, s1, (s2) - addi s2, t6, 20 - lw s3, 20(a7) - mulw s0, a5, s3 - amoadd.w.aqrl s4, s0, (s2) - addi s2, t6, 24 - lw s3, 24(a7) - mulw s1, a5, s3 - amoadd.w.aqrl s4, s1, (s2) - lw s3, 28(a7) - addi a7, t6, 28 + addi s2, a7, 8 + amoadd.w.aqrl s5, s0, (s3) + lw s4, 8(t6) + mulw s1, a5, s4 + amoadd.w.aqrl s5, s1, (s2) + addi s1, a7, 12 + lw s3, 12(t6) mulw s0, a5, s3 - amoadd.w.aqrl s1, s0, (a7) - bgt t1, a6, label289 - ble a0, a6, label499 - sh2add t6, a6, t0 - j label292 -.p2align 2 -label295: - addi t6, t6, 4 + amoadd.w.aqrl s2, s0, (s1) + ble t0, a6, label547 + addi t6, t6, 16 + j label544 +.p2align 2 +label547: + ble a0, a6, label641 + sh2add t6, a6, t1 + j label549 .p2align 2 -label292: - lw s1, 0(t6) - sh2add s0, a6, t5 - addiw a6, a6, 1 - 
mulw a7, a5, s1 - amoadd.w.aqrl s2, a7, (s0) - bgt a0, a6, label295 +label641: add t3, t3, a1 mv t5, t4 - bgt a0, t4, label283 + bgt a0, t4, label541 addiw t2, t2, 1 - bgt a4, t2, label299 - j label300 + bgt a4, t2, label540 +label556: + ld s0, 0(sp) + ld s5, 8(sp) + ld s1, 16(sp) + ld s3, 24(sp) + ld s2, 32(sp) + ld s4, 40(sp) + addi sp, sp, 48 + ret .p2align 2 -label498: +label733: addiw t2, t2, 1 - ble a4, t2, label300 + bgt a4, t2, label540 + j label556 .p2align 2 -label299: - add t0, t0, a1 - mv t3, a3 - mv t5, zero - bgt a0, zero, label283 -label339: - addiw t2, t2, 1 - bgt a4, t2, label299 - j label300 +label735: + addiw a5, a5, 1 + bgt a4, a5, label566 + j label556 .p2align 2 -label499: - add t3, t3, a1 - mv t5, t4 - bgt a0, t4, label283 - addiw t2, t2, 1 - bgt a4, t2, label299 - j label300 -label302: - slliw t1, t2, 12 - add t0, a5, t1 - mv a5, t2 - mv t1, a3 - mv t4, zero - bgt a0, zero, label311 - j label309 +cmmc_parallel_body_3: + mv t0, a0 +pcrel999: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_3) +pcrel1000: + auipc a5, %pcrel_hi(B) + li a3, 3 + lw a0, %pcrel_lo(pcrel999)(a2) + addi a4, a5, %pcrel_lo(pcrel1000) + lui a2, 1 + bgt a0, a3, label758 + ble a0, zero, label798 + slliw a5, t0, 12 + add a3, a4, a5 + mv a4, t0 + mv a5, a3 + mv t0, zero + j label804 .p2align 2 -label317: - addi t5, t5, 4 +label807: + addi a5, a5, 4 .p2align 2 -label313: - lw a7, 0(t5) - sh2add s0, t6, t4 - addiw t6, t6, 1 - mulw a6, t3, a7 - amoadd.w.aqrl a7, a6, (s0) - bgt a0, t6, label317 - add t1, t1, a1 - mv t4, t2 - ble a0, t2, label502 +label804: + addiw t0, t0, 1 + sw zero, 0(a5) + bgt a0, t0, label807 + addiw a4, a4, 1 + ble a1, a4, label798 .p2align 2 -label311: - sh2add t5, a5, t1 - addiw t2, t4, 1 - lw t3, 0(t5) - bne t3, zero, label312 - add t1, t1, a1 - mv t4, t2 - bgt a0, t2, label311 - addiw a5, a5, 1 - bgt a4, a5, label310 - j label300 +label809: + add a3, a3, a2 + li t0, 1 + sw zero, 0(a3) + mv a5, a3 + bgt a0, t0, label807 + addiw a4, a4, 1 + bgt a1, a4, label809 + j label798 +label758: + addiw a3, a0, -3 + addiw a5, a0, -18 + li t1, 15 + ble a3, t1, label825 + slliw t1, t0, 12 + add a4, a4, t1 + mv t1, a4 + mv t2, zero + j label763 .p2align 2 -label312: - slliw a6, t4, 12 - mv t5, t0 - mv t6, zero - add t4, a2, a6 - j label313 +label879: + addiw t0, t0, 1 + ble a1, t0, label798 .p2align 2 -label502: - addiw a5, a5, 1 - ble a4, a5, label300 +label775: + add a4, a4, a2 + li t2, 16 + sd zero, 0(a4) + mv t1, a4 + sd zero, 8(a4) + sd zero, 16(a4) + sd zero, 24(a4) + sd zero, 32(a4) + sd zero, 40(a4) + sd zero, 48(a4) + sd zero, 56(a4) + ble a5, t2, label766 +.p2align 2 +label781: + addi t1, t1, 64 +.p2align 2 +label763: + addiw t2, t2, 16 + sd zero, 0(t1) + sd zero, 8(t1) + sd zero, 16(t1) + sd zero, 24(t1) + sd zero, 32(t1) + sd zero, 40(t1) + sd zero, 48(t1) + sd zero, 56(t1) + bgt a5, t2, label781 +.p2align 2 +label766: + ble a3, t2, label851 + sh2add t1, t2, a4 + mv t3, t2 + j label768 .p2align 2 -label310: - add t0, t0, a1 - mv t1, a3 - mv t4, zero - bgt a0, zero, label311 -label309: - addiw a5, a5, 1 - bgt a4, a5, label310 - j label300 +label771: + addi t1, t1, 16 .p2align 2 -cmmc_parallel_body_3: - mv a5, a0 -pcrel651: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel652: - auipc t1, %pcrel_hi(B) - li t0, 7 - lw a0, %pcrel_lo(pcrel651)(a2) - addi a4, t1, %pcrel_lo(pcrel652) - lui a2, 1 - addi a3, a0, -7 - bgt a0, t0, label527 - bgt a0, zero, label545 - j label543 -label527: - slliw t0, a5, 12 - add a4, a4, t0 - mv t0, a4 - mv t1, zero - j label531 +label768: + addiw 
t3, t3, 4 + sd zero, 0(t1) + sd zero, 8(t1) + bgt a3, t3, label771 + ble a0, t3, label975 .p2align 2 -label540: - addi t0, t0, 4 +label776: + sh2add t1, t3, a4 + mv t2, t3 .p2align 2 -label537: +label777: addiw t2, t2, 1 - sw zero, 0(t0) - bgt a0, t2, label540 + sw zero, 0(t1) + ble a0, t2, label879 + addi t1, t1, 4 + j label777 +.p2align 2 +label851: + mv t3, t2 + bgt a0, t2, label776 + addiw t0, t0, 1 + bgt a1, t0, label775 +label798: + ret +label975: + addiw t0, t0, 1 + bgt a1, t0, label775 + j label798 +label825: + slliw t1, t0, 12 + mv a5, t0 + add a4, a4, t1 + mv t1, zero + mv t0, a4 + j label786 +.p2align 2 +label909: addiw a5, a5, 1 - ble a1, a5, label543 + ble a1, a5, label798 .p2align 2 -label542: +label792: add a4, a4, a2 - li t1, 8 + li t1, 4 sd zero, 0(a4) mv t0, a4 sd zero, 8(a4) - sd zero, 16(a4) - sd zero, 24(a4) - ble a3, t1, label643 + ble a3, t1, label983 .p2align 2 -label534: - addi t0, t0, 32 +label789: + addi t0, t0, 16 .p2align 2 -label531: - addiw t1, t1, 8 +label786: + addiw t1, t1, 4 sd zero, 0(t0) sd zero, 8(t0) - sd zero, 16(t0) - sd zero, 24(t0) - bgt a3, t1, label534 - ble a0, t1, label639 + bgt a3, t1, label789 + ble a0, t1, label977 .p2align 2 -label536: +label793: sh2add t0, t1, a4 mv t2, t1 - j label537 -label643: - bgt a0, t1, label536 -.p2align 2 -label639: - addiw a5, a5, 1 - bgt a1, a5, label542 -label543: - ret -label545: - slliw t0, a5, 12 - add a3, a4, t0 - mv a4, a5 - mv a5, a3 - mv t0, zero - j label549 .p2align 2 -label554: - addi a5, a5, 4 -.p2align 2 -label549: - addiw t0, t0, 1 - sw zero, 0(a5) - bgt a0, t0, label554 - addiw a4, a4, 1 - ble a1, a4, label543 +label794: + addiw t2, t2, 1 + sw zero, 0(t0) + ble a0, t2, label909 + addi t0, t0, 4 + j label794 +label983: + bgt a0, t1, label793 .p2align 2 -label553: - add a3, a3, a2 - li t0, 1 - sw zero, 0(a3) - mv a5, a3 - bgt a0, t0, label554 - addiw a4, a4, 1 - bgt a1, a4, label553 - j label543 +label977: + addiw a5, a5, 1 + bgt a1, a5, label792 + j label798 .p2align 2 cmmc_parallel_body_4: - addi sp, sp, -48 + addi sp, sp, -40 mv t2, a0 mv a4, a1 -pcrel902: +pcrel1225: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel903: +pcrel1226: auipc a5, %pcrel_hi(A) -pcrel904: +pcrel1227: auipc t3, %pcrel_hi(C) - li t0, 7 -pcrel905: + li t1, 3 +pcrel1228: auipc a1, %pcrel_hi(B) - addi a3, a5, %pcrel_lo(pcrel903) - sd s0, 0(sp) - addi a5, t3, %pcrel_lo(pcrel904) - sd s5, 8(sp) - sd s2, 16(sp) - sd s3, 24(sp) - sd s1, 32(sp) - sd s4, 40(sp) - lw a0, %pcrel_lo(pcrel902)(a2) - addi a2, a1, %pcrel_lo(pcrel905) - addi t1, a0, -7 + addi a3, a5, %pcrel_lo(pcrel1226) + sd s1, 0(sp) + addi a5, t3, %pcrel_lo(pcrel1227) + sd s0, 8(sp) + sd s3, 16(sp) + sd s2, 24(sp) + sd s4, 32(sp) + lw a0, %pcrel_lo(pcrel1225)(a2) + addi a2, a1, %pcrel_lo(pcrel1228) + addi t0, a0, -3 lui a1, 1 - ble a0, t0, label679 + bgt a0, t1, label1002 + bgt a0, zero, label1028 +label1026: + ld s1, 0(sp) + ld s0, 8(sp) + ld s3, 16(sp) + ld s2, 24(sp) + ld s4, 32(sp) + addi sp, sp, 40 + ret +label1002: slliw t3, t2, 12 - add t0, a5, t3 + add t1, a5, t3 mv t3, a3 mv t5, zero - bgt a0, zero, label661 - j label717 + bgt a0, zero, label1011 + j label1009 .p2align 2 -label675: +label1023: + addi t6, t6, 4 +.p2align 2 +label1020: + lw s1, 0(t6) + sh2add s0, a6, t5 + addiw a6, a6, 1 + mulw a7, a5, s1 + amoadd.w.aqrl s2, a7, (s0) + bgt a0, a6, label1023 add t3, t3, a1 mv t5, t4 - ble a0, t4, label879 + ble a0, t4, label1210 .p2align 2 -label661: +label1011: sh2add t6, t2, t3 addiw t4, t5, 1 lw a5, 0(t6) - beq a5, zero, label675 + bne a5, zero, 
label1012 + add t3, t3, a1 + mv t5, t4 + bgt a0, t4, label1011 + addiw t2, t2, 1 + ble a4, t2, label1026 +.p2align 2 +label1010: + add t1, t1, a1 + mv t3, a3 + mv t5, zero + bgt a0, zero, label1011 +label1009: + addiw t2, t2, 1 + bgt a4, t2, label1010 + j label1026 +.p2align 2 +label1012: slliw a7, t5, 12 + mv t6, t1 mv a6, zero add t5, a2, a7 - mv t6, t5 - j label663 -.p2align 2 -label666: - addi t6, t6, 32 -.p2align 2 -label663: - sh2add a7, a6, t0 - addiw a6, a6, 8 - lw s0, 0(a7) - mulw s2, a5, s0 - amoadd.w.aqrl s3, s2, (t6) - addi s2, t6, 4 - lw s1, 4(a7) + j label1013 +.p2align 2 +label1016: + addi t6, t6, 16 +.p2align 2 +label1013: + lw s1, 0(t6) + sh2add a7, a6, t5 + addiw a6, a6, 4 mulw s0, a5, s1 - amoadd.w.aqrl s3, s0, (s2) - addi s3, t6, 8 - lw s4, 8(a7) - mulw s1, a5, s4 - amoadd.w.aqrl s5, s1, (s3) - addi s3, t6, 12 - lw s2, 12(a7) - mulw s0, a5, s2 - amoadd.w.aqrl s4, s0, (s3) - addi s3, t6, 16 - lw s2, 16(a7) + amoadd.w.aqrl s3, s0, (a7) + addi s3, a7, 4 + lw s2, 4(t6) mulw s1, a5, s2 amoadd.w.aqrl s4, s1, (s3) - addi s3, t6, 20 - lw s2, 20(a7) + addi s3, a7, 8 + lw s2, 8(t6) mulw s0, a5, s2 - addi s2, t6, 24 - amoadd.w.aqrl s5, s0, (s3) - lw s4, 24(a7) - mulw s1, a5, s4 - amoadd.w.aqrl s5, s1, (s2) - lw s3, 28(a7) - addi a7, t6, 28 - mulw s0, a5, s3 - amoadd.w.aqrl s1, s0, (a7) - bgt t1, a6, label666 - ble a0, a6, label876 - sh2add t6, a6, t0 -.p2align 2 -label669: - lw s1, 0(t6) - sh2add s0, a6, t5 - addiw a6, a6, 1 - mulw a7, a5, s1 - amoadd.w.aqrl s2, a7, (s0) - ble a0, a6, label789 - addi t6, t6, 4 - j label669 -.p2align 2 -label879: - addiw t2, t2, 1 - ble a4, t2, label678 -.p2align 2 -label677: - add t0, t0, a1 - mv t3, a3 - mv t5, zero - bgt a0, zero, label661 - j label717 -.p2align 2 -label789: - add t3, t3, a1 - mv t5, t4 - bgt a0, t4, label661 - addiw t2, t2, 1 - bgt a4, t2, label677 - j label678 + amoadd.w.aqrl s4, s0, (s3) + addi s0, a7, 12 + lw s2, 12(t6) + mulw s1, a5, s2 + amoadd.w.aqrl s3, s1, (s0) + bgt t0, a6, label1016 + ble a0, a6, label1201 + sh2add t6, a6, t1 + j label1020 .p2align 2 -label876: +label1201: add t3, t3, a1 mv t5, t4 - bgt a0, t4, label661 -label717: + bgt a0, t4, label1011 addiw t2, t2, 1 - bgt a4, t2, label677 - j label678 -label679: - bgt a0, zero, label680 -label678: - ld s0, 0(sp) - ld s5, 8(sp) - ld s2, 16(sp) - ld s3, 24(sp) - ld s1, 32(sp) - ld s4, 40(sp) - addi sp, sp, 48 - ret -label680: + bgt a4, t2, label1010 + j label1026 +label1028: slliw t1, t2, 12 add t0, a5, t1 mv a5, t2 mv t1, a3 mv t4, zero - bgt a0, zero, label689 - j label687 + bgt a0, zero, label1037 + j label1035 .p2align 2 -label694: +label1042: add t1, t1, a1 mv t4, t2 - ble a0, t2, label880 + ble a0, t2, label1205 .p2align 2 -label689: +label1037: sh2add t5, a5, t1 addiw t2, t4, 1 lw t3, 0(t5) - bne t3, zero, label690 + beq t3, zero, label1044 + slliw a6, t4, 12 + mv t5, t0 + mv t6, zero + add t4, a2, a6 +.p2align 2 +label1039: + lw s0, 0(t5) + sh2add a7, t6, t4 + addiw t6, t6, 1 + mulw a6, t3, s0 + amoadd.w.aqrl s1, a6, (a7) + ble a0, t6, label1042 + addi t5, t5, 4 + j label1039 +.p2align 2 +label1044: add t1, t1, a1 mv t4, t2 - bgt a0, t2, label689 + bgt a0, t2, label1037 addiw a5, a5, 1 - ble a4, a5, label678 + ble a4, a5, label1026 .p2align 2 -label688: +label1036: add t0, t0, a1 mv t1, a3 mv t4, zero - bgt a0, zero, label689 -label687: + bgt a0, zero, label1037 +label1035: addiw a5, a5, 1 - bgt a4, a5, label688 - j label678 -.p2align 2 -label690: - slliw a6, t4, 12 - mv t5, t0 - mv t6, zero - add t4, a2, a6 + bgt a4, a5, label1036 + j label1026 .p2align 2 
-label691: - lw a7, 0(t5) - sh2add s0, t6, t4 - addiw t6, t6, 1 - mulw a6, t3, a7 - amoadd.w.aqrl a7, a6, (s0) - ble a0, t6, label694 - addi t5, t5, 4 - j label691 +label1210: + addiw t2, t2, 1 + bgt a4, t2, label1010 + j label1026 .p2align 2 -label880: +label1205: addiw a5, a5, 1 - bgt a4, a5, label688 - j label678 + bgt a4, a5, label1036 + j label1026 diff --git a/tests/SysY2022/performance/01_mm1.sy.ir b/tests/SysY2022/performance/01_mm1.sy.ir index ff3d45414..17e4897a1 100644 --- a/tests/SysY2022/performance/01_mm1.sy.ir +++ b/tests/SysY2022/performance/01_mm1.sy.ir @@ -112,85 +112,176 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %5 = icmp sgt i32 %4, i32 0; cbr i1 %5(prob = 0.5), ^cond, ^b1; ^cond: - i32 %6 = add i32 %4, i32 -7; - i1 %7 = icmp sgt i32 %4, i32 7; - [1024 * [1024 * i32]]* %8 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; - cbr i1 %7(prob = 0.5), ^b2, ^b3; + i1 %6 = icmp sgt i32 %4, i32 3; + [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; + cbr i1 %6(prob = 0.5), ^cond1, ^b2; ^b1: - i32 %9 = phi [^b, i32 0] [^scalar.final, i32 %50] [^scalar.final1, i32 %63]; - i32* %10 = ptradd [8 * i8]* %2, i32 0; - atomicadd i32* %10, i32 %9; + i32 %8 = phi [^b, i32 0] [^scalar.final, i32 %27] [^scalar.final3, i32 %128] [^scalar.final4, i32 %138]; + i32* %9 = ptradd [8 * i8]* %2, i32 0; + atomicadd i32* %9, i32 %8; ret; ^b2: - i32 %12 = phi [^cond, i32 %0] [^scalar.final1, i32 %64]; - i32 %13 = phi [^cond, i32 0] [^scalar.final1, i32 %63]; - [1024 * i32]* %14 = getelementptr &([1024 * [1024 * i32]]* %8)[i64 0][i32 %12]; + i32 %11 = phi [^cond, i32 %0] [^scalar.final, i32 %98]; + i32 %12 = phi [^cond, i32 0] [^scalar.final, i32 %27]; + [1024 * i32]* %13 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i32 %11]; ubr ^while.body; + ^cond1: + i32 %14 = add i32 %4, i32 -3; + i1 %15 = icmp sgt i32 %14, i32 15; + i32 %16 = add i32 %4, i32 -18; + cbr i1 %15(prob = 0.5), ^b3, ^b4; ^b3: - i32 %15 = phi [^cond, i32 %0] [^scalar.final, i32 %54]; - i32 %16 = phi [^cond, i32 0] [^scalar.final, i32 %50]; - [1024 * i32]* %17 = getelementptr &([1024 * [1024 * i32]]* %8)[i64 0][i32 %15]; + i32 %17 = phi [^cond1, i32 0] [^scalar.final4, i32 %138]; + i32 %18 = phi [^cond1, i32 %0] [^scalar.final4, i32 %139]; + [1024 * i32]* %19 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i32 %18]; ubr ^while.body1; - ^while.body: - i32 %18 = phi [^b2, i32 %13] [^while.body, i32 %43]; - i32 %19 = phi [^b2, i32 0] [^while.body, i32 %44]; - i32* %20 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %19]; - i32 %21 = load i32* %20; - i32 %22 = add i32 %18, i32 %21; - i32* %23 = getelementptr &(i32* %20)[i64 1]; - i32 %24 = load i32* %23; - i32 %25 = add i32 %22, i32 %24; - i32* %26 = getelementptr &(i32* %20)[i64 2]; - i32 %27 = load i32* %26; - i32 %28 = add i32 %25, i32 %27; - i32* %29 = getelementptr &(i32* %20)[i64 3]; - i32 %30 = load i32* %29; - i32 %31 = add i32 %28, i32 %30; - i32* %32 = getelementptr &(i32* %20)[i64 4]; + ^b4: + i32 %20 = phi [^cond1, i32 %0] [^scalar.final3, i32 %129]; + i32 %21 = phi [^cond1, i32 0] [^scalar.final3, i32 %128]; + [1024 * i32]* %22 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i32 %20]; + ubr ^while.body2; + ^while.body {scalar}: + i32 %23 = phi [^b2, i32 0] [^while.body, i32 %28]; + i32 %24 = phi [^b2, i32 %12] [^while.body, i32 %27]; + i32* %25 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %23]; + i32 %26 = load i32* %25; + i32 %27 = add i32 %24, i32 %26; + i32 %28 = 
add i32 %23, i32 1; + i1 %29 = icmp sgt i32 %4, i32 %28; + cbr i1 %29(prob = 0.75), ^while.body, ^scalar.final; + ^while.body1: + i32 %30 = phi [^b3, i32 0] [^while.body1, i32 %80]; + i32 %31 = phi [^b3, i32 %17] [^while.body1, i32 %79]; + i32* %32 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %30]; i32 %33 = load i32* %32; i32 %34 = add i32 %31, i32 %33; - i32* %35 = getelementptr &(i32* %20)[i64 5]; + i32* %35 = getelementptr &(i32* %32)[i64 1]; i32 %36 = load i32* %35; i32 %37 = add i32 %34, i32 %36; - i32* %38 = getelementptr &(i32* %20)[i64 6]; + i32* %38 = getelementptr &(i32* %32)[i64 2]; i32 %39 = load i32* %38; i32 %40 = add i32 %37, i32 %39; - i32* %41 = getelementptr &(i32* %20)[i64 7]; + i32* %41 = getelementptr &(i32* %32)[i64 3]; i32 %42 = load i32* %41; i32 %43 = add i32 %40, i32 %42; - i32 %44 = add i32 %19, i32 8; - i1 %45 = icmp sgt i32 %6, i32 %44; - cbr i1 %45(prob = 0.888889), ^while.body, ^scalar.header; - ^while.body1 {scalar}: - i32 %46 = phi [^b3, i32 0] [^while.body1, i32 %51]; - i32 %47 = phi [^b3, i32 %16] [^while.body1, i32 %50]; - i32* %48 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %46]; - i32 %49 = load i32* %48; - i32 %50 = add i32 %47, i32 %49; - i32 %51 = add i32 %46, i32 1; - i1 %52 = icmp sgt i32 %4, i32 %51; - cbr i1 %52(prob = 0.875), ^while.body1, ^scalar.final; - ^scalar.header: - i1 %53 = icmp sgt i32 %4, i32 %44; - cbr i1 %53(prob = 0.875), ^while.body2, ^scalar.final1; - ^scalar.final: - i32 %54 = add i32 %15, i32 1; - i1 %55 = icmp sgt i32 %1, i32 %54; - cbr i1 %55(prob = 0.984615), ^b3, ^b1; + i32* %44 = getelementptr &(i32* %32)[i64 4]; + i32 %45 = load i32* %44; + i32 %46 = add i32 %43, i32 %45; + i32* %47 = getelementptr &(i32* %32)[i64 5]; + i32 %48 = load i32* %47; + i32 %49 = add i32 %46, i32 %48; + i32* %50 = getelementptr &(i32* %32)[i64 6]; + i32 %51 = load i32* %50; + i32 %52 = add i32 %49, i32 %51; + i32* %53 = getelementptr &(i32* %32)[i64 7]; + i32 %54 = load i32* %53; + i32 %55 = add i32 %52, i32 %54; + i32* %56 = getelementptr &(i32* %32)[i64 8]; + i32 %57 = load i32* %56; + i32 %58 = add i32 %55, i32 %57; + i32* %59 = getelementptr &(i32* %32)[i64 9]; + i32 %60 = load i32* %59; + i32 %61 = add i32 %58, i32 %60; + i32* %62 = getelementptr &(i32* %32)[i64 10]; + i32 %63 = load i32* %62; + i32 %64 = add i32 %61, i32 %63; + i32* %65 = getelementptr &(i32* %32)[i64 11]; + i32 %66 = load i32* %65; + i32 %67 = add i32 %64, i32 %66; + i32* %68 = getelementptr &(i32* %32)[i64 12]; + i32 %69 = load i32* %68; + i32 %70 = add i32 %67, i32 %69; + i32* %71 = getelementptr &(i32* %32)[i64 13]; + i32 %72 = load i32* %71; + i32 %73 = add i32 %70, i32 %72; + i32* %74 = getelementptr &(i32* %32)[i64 14]; + i32 %75 = load i32* %74; + i32 %76 = add i32 %73, i32 %75; + i32* %77 = getelementptr &(i32* %32)[i64 15]; + i32 %78 = load i32* %77; + i32 %79 = add i32 %76, i32 %78; + i32 %80 = add i32 %30, i32 16; + i1 %81 = icmp sgt i32 %16, i32 %80; + cbr i1 %81(prob = 0.941176), ^while.body1, ^scalar.header; ^while.body2 {scalar}: - i32 %56 = phi [^scalar.header, i32 %44] [^while.body2, i32 %61]; - i32 %57 = phi [^scalar.header, i32 %43] [^while.body2, i32 %60]; - i32* %58 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %56]; - i32 %59 = load i32* %58; - i32 %60 = add i32 %57, i32 %59; - i32 %61 = add i32 %56, i32 1; - i1 %62 = icmp sgt i32 %4, i32 %61; - cbr i1 %62(prob = 0.875), ^while.body2, ^scalar.final1; + i32 %82 = phi [^b4, i32 %21] [^while.body2, i32 %95]; + i32 %83 = phi [^b4, i32 0] [^while.body2, i32 %96]; + i32* %84 = getelementptr 
&([1024 * i32]* %22)[i64 0][i32 %83]; + i32 %85 = load i32* %84; + i32 %86 = add i32 %82, i32 %85; + i32* %87 = getelementptr &(i32* %84)[i64 1]; + i32 %88 = load i32* %87; + i32 %89 = add i32 %86, i32 %88; + i32* %90 = getelementptr &(i32* %84)[i64 2]; + i32 %91 = load i32* %90; + i32 %92 = add i32 %89, i32 %91; + i32* %93 = getelementptr &(i32* %84)[i64 3]; + i32 %94 = load i32* %93; + i32 %95 = add i32 %92, i32 %94; + i32 %96 = add i32 %83, i32 4; + i1 %97 = icmp sgt i32 %14, i32 %96; + cbr i1 %97(prob = 0.75), ^while.body2, ^scalar.final1; + ^scalar.final: + i32 %98 = add i32 %11, i32 1; + i1 %99 = icmp sgt i32 %1, i32 %98; + cbr i1 %99(prob = 0.984615), ^b2, ^b1; + ^scalar.header: + i1 %100 = icmp sgt i32 %14, i32 %80; + cbr i1 %100(prob = 0.75), ^while.body3, ^scalar.final2; ^scalar.final1: - i32 %63 = phi [^scalar.header, i32 %43] [^while.body2, i32 %60]; - i32 %64 = add i32 %12, i32 1; - i1 %65 = icmp sgt i32 %1, i32 %64; - cbr i1 %65(prob = 0.984615), ^b2, ^b1; + i1 %101 = icmp sgt i32 %4, i32 %96; + cbr i1 %101(prob = 0.75), ^while.body4, ^scalar.final3; + ^while.body3 {scalar}: + i32 %102 = phi [^scalar.header, i32 %79] [^while.body3, i32 %115]; + i32 %103 = phi [^scalar.header, i32 %80] [^while.body3, i32 %116]; + i32* %104 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %103]; + i32 %105 = load i32* %104; + i32 %106 = add i32 %102, i32 %105; + i32* %107 = getelementptr &(i32* %104)[i64 1]; + i32 %108 = load i32* %107; + i32 %109 = add i32 %106, i32 %108; + i32* %110 = getelementptr &(i32* %104)[i64 2]; + i32 %111 = load i32* %110; + i32 %112 = add i32 %109, i32 %111; + i32* %113 = getelementptr &(i32* %104)[i64 3]; + i32 %114 = load i32* %113; + i32 %115 = add i32 %112, i32 %114; + i32 %116 = add i32 %103, i32 4; + i1 %117 = icmp sgt i32 %14, i32 %116; + cbr i1 %117(prob = 0.75), ^while.body3, ^scalar.final2; + ^scalar.final2: + i32 %118 = phi [^scalar.header, i32 %79] [^while.body3, i32 %115]; + i32 %119 = phi [^scalar.header, i32 %80] [^while.body3, i32 %116]; + i1 %120 = icmp sgt i32 %4, i32 %119; + cbr i1 %120(prob = 0.75), ^while.body5, ^scalar.final4; + ^while.body4 {scalar}: + i32 %121 = phi [^scalar.final1, i32 %96] [^while.body4, i32 %126]; + i32 %122 = phi [^scalar.final1, i32 %95] [^while.body4, i32 %125]; + i32* %123 = getelementptr &([1024 * i32]* %22)[i64 0][i32 %121]; + i32 %124 = load i32* %123; + i32 %125 = add i32 %122, i32 %124; + i32 %126 = add i32 %121, i32 1; + i1 %127 = icmp sgt i32 %4, i32 %126; + cbr i1 %127(prob = 0.75), ^while.body4, ^scalar.final3; + ^scalar.final3: + i32 %128 = phi [^scalar.final1, i32 %95] [^while.body4, i32 %125]; + i32 %129 = add i32 %20, i32 1; + i1 %130 = icmp sgt i32 %1, i32 %129; + cbr i1 %130(prob = 0.984615), ^b4, ^b1; + ^while.body5 {scalar}: + i32 %131 = phi [^scalar.final2, i32 %119] [^while.body5, i32 %136]; + i32 %132 = phi [^scalar.final2, i32 %118] [^while.body5, i32 %135]; + i32* %133 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %131]; + i32 %134 = load i32* %133; + i32 %135 = add i32 %132, i32 %134; + i32 %136 = add i32 %131, i32 1; + i1 %137 = icmp sgt i32 %4, i32 %136; + cbr i1 %137(prob = 0.75), ^while.body5, ^scalar.final4; + ^scalar.final4: + i32 %138 = phi [^scalar.final2, i32 %118] [^while.body5, i32 %135]; + i32 %139 = add i32 %18, i32 1; + i1 %140 = icmp sgt i32 %1, i32 %139; + cbr i1 %140(prob = 0.984615), ^b3, ^b1; } internal [8 * i8]* @cmmc_parallel_body_payload_0, align 8; internal func @cmmc_parallel_body_1(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -198,70 
+289,138 @@ internal func @cmmc_parallel_body_1(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_1 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; - [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @C to [1024 * [1024 * i32]]*; - [1024 * i32]* %8 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; - cbr i1 %5(prob = 0.5), ^b1, ^cond; - ^b1: - i32 %9 = phi [^b, i32 %0] [^b4, i32 %34]; - [1024 * i32]* %10 = getelementptr &([1024 * i32]* %8)[i32 %9]; - ubr ^while.body; + i1 %5 = icmp sgt i32 %4, i32 3; + [1024 * [1024 * i32]]* %6 = ptrcast [1024 * [1024 * i32]]* @C to [1024 * [1024 * i32]]*; + [1024 * i32]* %7 = getelementptr &([1024 * [1024 * i32]]* %6)[i64 0][i64 0]; + cbr i1 %5(prob = 0.5), ^cond, ^cond1; ^cond: + i32 %8 = add i32 %4, i32 -3; + i1 %9 = icmp sgt i32 %8, i32 15; + i32 %10 = add i32 %4, i32 -18; + cbr i1 %9(prob = 0.5), ^b1, ^b3; + ^cond1: i1 %11 = icmp sgt i32 %4, i32 0; - cbr i1 %11(prob = 0.5), ^b2, ^b3; + cbr i1 %11(prob = 0.5), ^b2, ^b4; + ^b1: + i32 %12 = phi [^cond, i32 %0] [^b7, i32 %71]; + [1024 * i32]* %13 = getelementptr &([1024 * i32]* %7)[i32 %12]; + ubr ^while.body; ^b2: - i32 %12 = phi [^cond, i32 %0] [^b5, i32 %36]; - [1024 * i32]* %13 = getelementptr &([1024 * i32]* %8)[i32 %12]; + i32 %14 = phi [^cond1, i32 %0] [^b5, i32 %49]; + [1024 * i32]* %15 = getelementptr &([1024 * i32]* %7)[i32 %14]; ubr ^while.body1; + ^b3: + i32 %16 = phi [^cond, i32 %0] [^b6, i32 %65]; + [1024 * i32]* %17 = getelementptr &([1024 * i32]* %7)[i32 %16]; + ubr ^while.body2; + ^b4: + ret; ^while.body: - i32 %14 = phi [^b1, i32 0] [^while.body, i32 %23]; - i32* %15 = getelementptr &([1024 * i32]* %10)[i64 0][i32 %14]; - store i32* %15 with i32 0; - i32* %16 = getelementptr &(i32* %15)[i64 1]; - store i32* %16 with i32 0; - i32* %17 = getelementptr &(i32* %15)[i64 2]; - store i32* %17 with i32 0; - i32* %18 = getelementptr &(i32* %15)[i64 3]; - store i32* %18 with i32 0; - i32* %19 = getelementptr &(i32* %15)[i64 4]; + i32 %18 = phi [^b1, i32 0] [^while.body, i32 %35]; + i32* %19 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %18]; store i32* %19 with i32 0; - i32* %20 = getelementptr &(i32* %15)[i64 5]; + i32* %20 = getelementptr &(i32* %19)[i64 1]; store i32* %20 with i32 0; - i32* %21 = getelementptr &(i32* %15)[i64 6]; + i32* %21 = getelementptr &(i32* %19)[i64 2]; store i32* %21 with i32 0; - i32* %22 = getelementptr &(i32* %15)[i64 7]; + i32* %22 = getelementptr &(i32* %19)[i64 3]; store i32* %22 with i32 0; - i32 %23 = add i32 %14, i32 8; - i1 %24 = icmp sgt i32 %6, i32 %23; - cbr i1 %24(prob = 0.888889), ^while.body, ^scalar.header; - ^b3: - ret; - ^scalar.header: - i1 %25 = icmp sgt i32 %4, i32 %23; - cbr i1 %25(prob = 0.875), ^while.body2, ^b4; - ^while.body1 {scalar}: - i32 %26 = phi [^b2, i32 0] [^while.body1, i32 %28]; - i32* %27 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %26]; + i32* %23 = getelementptr &(i32* %19)[i64 4]; + store i32* %23 with i32 0; + i32* %24 = getelementptr &(i32* %19)[i64 5]; + store i32* %24 with i32 0; + i32* %25 = getelementptr &(i32* %19)[i64 6]; + store i32* %25 with i32 0; + i32* %26 = getelementptr &(i32* %19)[i64 7]; + store i32* %26 with i32 0; + i32* %27 = getelementptr &(i32* %19)[i64 8]; store i32* %27 with i32 0; - i32 %28 = add i32 %26, i32 1; - i1 %29 = icmp sgt i32 %4, i32 %28; - cbr i1 %29(prob = 0.875), ^while.body1, ^b5; - ^while.body2 {scalar}: - i32 %30 = phi 
[^scalar.header, i32 %23] [^while.body2, i32 %32]; - i32* %31 = getelementptr &([1024 * i32]* %10)[i64 0][i32 %30]; + i32* %28 = getelementptr &(i32* %19)[i64 9]; + store i32* %28 with i32 0; + i32* %29 = getelementptr &(i32* %19)[i64 10]; + store i32* %29 with i32 0; + i32* %30 = getelementptr &(i32* %19)[i64 11]; + store i32* %30 with i32 0; + i32* %31 = getelementptr &(i32* %19)[i64 12]; store i32* %31 with i32 0; - i32 %32 = add i32 %30, i32 1; - i1 %33 = icmp sgt i32 %4, i32 %32; - cbr i1 %33(prob = 0.875), ^while.body2, ^b4; - ^b4: - i32 %34 = add i32 %9, i32 1; - i1 %35 = icmp sgt i32 %1, i32 %34; - cbr i1 %35(prob = 0.984615), ^b1, ^b3; + i32* %32 = getelementptr &(i32* %19)[i64 13]; + store i32* %32 with i32 0; + i32* %33 = getelementptr &(i32* %19)[i64 14]; + store i32* %33 with i32 0; + i32* %34 = getelementptr &(i32* %19)[i64 15]; + store i32* %34 with i32 0; + i32 %35 = add i32 %18, i32 16; + i1 %36 = icmp sgt i32 %10, i32 %35; + cbr i1 %36(prob = 0.941176), ^while.body, ^scalar.header; + ^while.body1 {scalar}: + i32 %37 = phi [^b2, i32 0] [^while.body1, i32 %39]; + i32* %38 = getelementptr &([1024 * i32]* %15)[i64 0][i32 %37]; + store i32* %38 with i32 0; + i32 %39 = add i32 %37, i32 1; + i1 %40 = icmp sgt i32 %4, i32 %39; + cbr i1 %40(prob = 0.75), ^while.body1, ^b5; + ^while.body2 {scalar}: + i32 %41 = phi [^b3, i32 0] [^while.body2, i32 %46]; + i32* %42 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %41]; + store i32* %42 with i32 0; + i32* %43 = getelementptr &(i32* %42)[i64 1]; + store i32* %43 with i32 0; + i32* %44 = getelementptr &(i32* %42)[i64 2]; + store i32* %44 with i32 0; + i32* %45 = getelementptr &(i32* %42)[i64 3]; + store i32* %45 with i32 0; + i32 %46 = add i32 %41, i32 4; + i1 %47 = icmp sgt i32 %8, i32 %46; + cbr i1 %47(prob = 0.75), ^while.body2, ^scalar.final; + ^scalar.header: + i1 %48 = icmp sgt i32 %8, i32 %35; + cbr i1 %48(prob = 0.75), ^while.body3, ^scalar.final1; ^b5: - i32 %36 = add i32 %12, i32 1; - i1 %37 = icmp sgt i32 %1, i32 %36; - cbr i1 %37(prob = 0.984615), ^b2, ^b3; + i32 %49 = add i32 %14, i32 1; + i1 %50 = icmp sgt i32 %1, i32 %49; + cbr i1 %50(prob = 0.984615), ^b2, ^b4; + ^scalar.final: + i1 %51 = icmp sgt i32 %4, i32 %46; + cbr i1 %51(prob = 0.75), ^while.body4, ^b6; + ^while.body3 {scalar}: + i32 %52 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i32* %53 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %52]; + store i32* %53 with i32 0; + i32* %54 = getelementptr &(i32* %53)[i64 1]; + store i32* %54 with i32 0; + i32* %55 = getelementptr &(i32* %53)[i64 2]; + store i32* %55 with i32 0; + i32* %56 = getelementptr &(i32* %53)[i64 3]; + store i32* %56 with i32 0; + i32 %57 = add i32 %52, i32 4; + i1 %58 = icmp sgt i32 %8, i32 %57; + cbr i1 %58(prob = 0.75), ^while.body3, ^scalar.final1; + ^scalar.final1: + i32 %59 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i1 %60 = icmp sgt i32 %4, i32 %59; + cbr i1 %60(prob = 0.75), ^while.body5, ^b7; + ^while.body4 {scalar}: + i32 %61 = phi [^scalar.final, i32 %46] [^while.body4, i32 %63]; + i32* %62 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %61]; + store i32* %62 with i32 0; + i32 %63 = add i32 %61, i32 1; + i1 %64 = icmp sgt i32 %4, i32 %63; + cbr i1 %64(prob = 0.75), ^while.body4, ^b6; + ^b6: + i32 %65 = add i32 %16, i32 1; + i1 %66 = icmp sgt i32 %1, i32 %65; + cbr i1 %66(prob = 0.984615), ^b3, ^b4; + ^while.body5 {scalar}: + i32 %67 = phi [^scalar.final1, i32 %59] [^while.body5, i32 %69]; + i32* %68 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %67]; + 
store i32* %68 with i32 0; + i32 %69 = add i32 %67, i32 1; + i1 %70 = icmp sgt i32 %4, i32 %69; + cbr i1 %70(prob = 0.75), ^while.body5, ^b7; + ^b7: + i32 %71 = add i32 %12, i32 1; + i1 %72 = icmp sgt i32 %1, i32 %71; + cbr i1 %72(prob = 0.984615), ^b1, ^b4; } internal [4 * i8]* @cmmc_parallel_body_payload_1, align 8; internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -269,8 +428,8 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_2 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; + i1 %5 = icmp sgt i32 %4, i32 3; + i32 %6 = add i32 %4, i32 -3; [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @A to [1024 * [1024 * i32]]*; [1024 * i32]* %8 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; [1024 * [1024 * i32]]* %9 = ptrcast [1024 * [1024 * i32]]* @C to [1024 * [1024 * i32]]*; @@ -325,7 +484,7 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %36 = icmp sgt i32 %1, i32 %35; cbr i1 %36(prob = 0.984615), ^b2, ^b3; ^while.body2: - i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %78]; + i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %58]; i32* %38 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %37]; i32 %39 = load i32* %38; i32 %40 = mul i32 %22, i32 %39; @@ -346,55 +505,35 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %55 = mul i32 %22, i32 %54; i32* %56 = getelementptr &(i32* %41)[i64 3]; atomicadd i32* %56, i32 %55; - i32* %58 = getelementptr &(i32* %38)[i64 4]; - i32 %59 = load i32* %58; - i32 %60 = mul i32 %22, i32 %59; - i32* %61 = getelementptr &(i32* %41)[i64 4]; - atomicadd i32* %61, i32 %60; - i32* %63 = getelementptr &(i32* %38)[i64 5]; - i32 %64 = load i32* %63; - i32 %65 = mul i32 %22, i32 %64; - i32* %66 = getelementptr &(i32* %41)[i64 5]; - atomicadd i32* %66, i32 %65; - i32* %68 = getelementptr &(i32* %38)[i64 6]; - i32 %69 = load i32* %68; - i32 %70 = mul i32 %22, i32 %69; - i32* %71 = getelementptr &(i32* %41)[i64 6]; - atomicadd i32* %71, i32 %70; - i32* %73 = getelementptr &(i32* %38)[i64 7]; - i32 %74 = load i32* %73; - i32 %75 = mul i32 %22, i32 %74; - i32* %76 = getelementptr &(i32* %41)[i64 7]; - atomicadd i32* %76, i32 %75; - i32 %78 = add i32 %37, i32 8; - i1 %79 = icmp sgt i32 %6, i32 %78; - cbr i1 %79(prob = 0.888889), ^while.body2, ^scalar.header; + i32 %58 = add i32 %37, i32 4; + i1 %59 = icmp sgt i32 %6, i32 %58; + cbr i1 %59(prob = 0.941176), ^while.body2, ^scalar.header; ^prebody1: - [1024 * i32]* %80 = getelementptr &([1024 * i32]* %10)[i32 %27]; + [1024 * i32]* %60 = getelementptr &([1024 * i32]* %10)[i32 %27]; ubr ^while.body3; ^scalar.header: - i1 %81 = icmp sle i32 %4, i32 %78; - cbr i1 %81(prob = 0.125), ^while.header, ^while.body4; + i1 %61 = icmp sle i32 %4, i32 %58; + cbr i1 %61(prob = 0.25), ^while.header, ^while.body4; ^while.body3 {scalar}: - i32 %82 = phi [^prebody1, i32 0] [^while.body3, i32 %88]; - i32* %83 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %82]; - i32 %84 = load i32* %83; - i32 %85 = mul i32 %32, i32 %84; - i32* %86 = getelementptr &([1024 * i32]* %80)[i64 0][i32 %82]; - atomicadd i32* %86, i32 %85; - i32 %88 = add i32 %82, i32 1; - i1 %89 = icmp sgt i32 %4, i32 %88; - cbr i1 %89(prob = 0.875), ^while.body3, ^while.header1; + i32 %62 = phi [^prebody1, i32 0] [^while.body3, i32 %68]; + 
i32* %63 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %62]; + i32 %64 = load i32* %63; + i32 %65 = mul i32 %32, i32 %64; + i32* %66 = getelementptr &([1024 * i32]* %60)[i64 0][i32 %62]; + atomicadd i32* %66, i32 %65; + i32 %68 = add i32 %62, i32 1; + i1 %69 = icmp sgt i32 %4, i32 %68; + cbr i1 %69(prob = 0.75), ^while.body3, ^while.header1; ^while.body4 {scalar}: - i32 %90 = phi [^scalar.header, i32 %78] [^while.body4, i32 %96]; - i32* %91 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %90]; - i32 %92 = load i32* %91; - i32 %93 = mul i32 %22, i32 %92; - i32* %94 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %90]; - atomicadd i32* %94, i32 %93; - i32 %96 = add i32 %90, i32 1; - i1 %97 = icmp sgt i32 %4, i32 %96; - cbr i1 %97(prob = 0.875), ^while.body4, ^while.header; + i32 %70 = phi [^scalar.header, i32 %58] [^while.body4, i32 %76]; + i32* %71 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %70]; + i32 %72 = load i32* %71; + i32 %73 = mul i32 %22, i32 %72; + i32* %74 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %70]; + atomicadd i32* %74, i32 %73; + i32 %76 = add i32 %70, i32 1; + i1 %77 = icmp sgt i32 %4, i32 %76; + cbr i1 %77(prob = 0.75), ^while.body4, ^while.header; } internal [4 * i8]* @cmmc_parallel_body_payload_2, align 8; internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -402,70 +541,138 @@ internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_3 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; - [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; - [1024 * i32]* %8 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; - cbr i1 %5(prob = 0.5), ^b1, ^cond; - ^b1: - i32 %9 = phi [^b, i32 %0] [^b4, i32 %34]; - [1024 * i32]* %10 = getelementptr &([1024 * i32]* %8)[i32 %9]; - ubr ^while.body; + i1 %5 = icmp sgt i32 %4, i32 3; + [1024 * [1024 * i32]]* %6 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; + [1024 * i32]* %7 = getelementptr &([1024 * [1024 * i32]]* %6)[i64 0][i64 0]; + cbr i1 %5(prob = 0.5), ^cond, ^cond1; ^cond: + i32 %8 = add i32 %4, i32 -3; + i1 %9 = icmp sgt i32 %8, i32 15; + i32 %10 = add i32 %4, i32 -18; + cbr i1 %9(prob = 0.5), ^b1, ^b3; + ^cond1: i1 %11 = icmp sgt i32 %4, i32 0; - cbr i1 %11(prob = 0.5), ^b2, ^b3; + cbr i1 %11(prob = 0.5), ^b2, ^b4; + ^b1: + i32 %12 = phi [^cond, i32 %0] [^b7, i32 %71]; + [1024 * i32]* %13 = getelementptr &([1024 * i32]* %7)[i32 %12]; + ubr ^while.body; ^b2: - i32 %12 = phi [^cond, i32 %0] [^b5, i32 %36]; - [1024 * i32]* %13 = getelementptr &([1024 * i32]* %8)[i32 %12]; + i32 %14 = phi [^cond1, i32 %0] [^b5, i32 %49]; + [1024 * i32]* %15 = getelementptr &([1024 * i32]* %7)[i32 %14]; ubr ^while.body1; + ^b3: + i32 %16 = phi [^cond, i32 %0] [^b6, i32 %65]; + [1024 * i32]* %17 = getelementptr &([1024 * i32]* %7)[i32 %16]; + ubr ^while.body2; + ^b4: + ret; ^while.body: - i32 %14 = phi [^b1, i32 0] [^while.body, i32 %23]; - i32* %15 = getelementptr &([1024 * i32]* %10)[i64 0][i32 %14]; - store i32* %15 with i32 0; - i32* %16 = getelementptr &(i32* %15)[i64 1]; - store i32* %16 with i32 0; - i32* %17 = getelementptr &(i32* %15)[i64 2]; - store i32* %17 with i32 0; - i32* %18 = getelementptr &(i32* %15)[i64 3]; - store i32* %18 with i32 0; - i32* %19 = getelementptr &(i32* %15)[i64 4]; + i32 %18 = phi [^b1, i32 0] [^while.body, i32 %35]; + 
i32* %19 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %18]; store i32* %19 with i32 0; - i32* %20 = getelementptr &(i32* %15)[i64 5]; + i32* %20 = getelementptr &(i32* %19)[i64 1]; store i32* %20 with i32 0; - i32* %21 = getelementptr &(i32* %15)[i64 6]; + i32* %21 = getelementptr &(i32* %19)[i64 2]; store i32* %21 with i32 0; - i32* %22 = getelementptr &(i32* %15)[i64 7]; + i32* %22 = getelementptr &(i32* %19)[i64 3]; store i32* %22 with i32 0; - i32 %23 = add i32 %14, i32 8; - i1 %24 = icmp sgt i32 %6, i32 %23; - cbr i1 %24(prob = 0.888889), ^while.body, ^scalar.header; - ^b3: - ret; - ^scalar.header: - i1 %25 = icmp sgt i32 %4, i32 %23; - cbr i1 %25(prob = 0.875), ^while.body2, ^b4; - ^while.body1 {scalar}: - i32 %26 = phi [^b2, i32 0] [^while.body1, i32 %28]; - i32* %27 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %26]; + i32* %23 = getelementptr &(i32* %19)[i64 4]; + store i32* %23 with i32 0; + i32* %24 = getelementptr &(i32* %19)[i64 5]; + store i32* %24 with i32 0; + i32* %25 = getelementptr &(i32* %19)[i64 6]; + store i32* %25 with i32 0; + i32* %26 = getelementptr &(i32* %19)[i64 7]; + store i32* %26 with i32 0; + i32* %27 = getelementptr &(i32* %19)[i64 8]; store i32* %27 with i32 0; - i32 %28 = add i32 %26, i32 1; - i1 %29 = icmp sgt i32 %4, i32 %28; - cbr i1 %29(prob = 0.875), ^while.body1, ^b5; - ^while.body2 {scalar}: - i32 %30 = phi [^scalar.header, i32 %23] [^while.body2, i32 %32]; - i32* %31 = getelementptr &([1024 * i32]* %10)[i64 0][i32 %30]; + i32* %28 = getelementptr &(i32* %19)[i64 9]; + store i32* %28 with i32 0; + i32* %29 = getelementptr &(i32* %19)[i64 10]; + store i32* %29 with i32 0; + i32* %30 = getelementptr &(i32* %19)[i64 11]; + store i32* %30 with i32 0; + i32* %31 = getelementptr &(i32* %19)[i64 12]; store i32* %31 with i32 0; - i32 %32 = add i32 %30, i32 1; - i1 %33 = icmp sgt i32 %4, i32 %32; - cbr i1 %33(prob = 0.875), ^while.body2, ^b4; - ^b4: - i32 %34 = add i32 %9, i32 1; - i1 %35 = icmp sgt i32 %1, i32 %34; - cbr i1 %35(prob = 0.984615), ^b1, ^b3; + i32* %32 = getelementptr &(i32* %19)[i64 13]; + store i32* %32 with i32 0; + i32* %33 = getelementptr &(i32* %19)[i64 14]; + store i32* %33 with i32 0; + i32* %34 = getelementptr &(i32* %19)[i64 15]; + store i32* %34 with i32 0; + i32 %35 = add i32 %18, i32 16; + i1 %36 = icmp sgt i32 %10, i32 %35; + cbr i1 %36(prob = 0.941176), ^while.body, ^scalar.header; + ^while.body1 {scalar}: + i32 %37 = phi [^b2, i32 0] [^while.body1, i32 %39]; + i32* %38 = getelementptr &([1024 * i32]* %15)[i64 0][i32 %37]; + store i32* %38 with i32 0; + i32 %39 = add i32 %37, i32 1; + i1 %40 = icmp sgt i32 %4, i32 %39; + cbr i1 %40(prob = 0.75), ^while.body1, ^b5; + ^while.body2 {scalar}: + i32 %41 = phi [^b3, i32 0] [^while.body2, i32 %46]; + i32* %42 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %41]; + store i32* %42 with i32 0; + i32* %43 = getelementptr &(i32* %42)[i64 1]; + store i32* %43 with i32 0; + i32* %44 = getelementptr &(i32* %42)[i64 2]; + store i32* %44 with i32 0; + i32* %45 = getelementptr &(i32* %42)[i64 3]; + store i32* %45 with i32 0; + i32 %46 = add i32 %41, i32 4; + i1 %47 = icmp sgt i32 %8, i32 %46; + cbr i1 %47(prob = 0.75), ^while.body2, ^scalar.final; + ^scalar.header: + i1 %48 = icmp sgt i32 %8, i32 %35; + cbr i1 %48(prob = 0.75), ^while.body3, ^scalar.final1; ^b5: - i32 %36 = add i32 %12, i32 1; - i1 %37 = icmp sgt i32 %1, i32 %36; - cbr i1 %37(prob = 0.984615), ^b2, ^b3; + i32 %49 = add i32 %14, i32 1; + i1 %50 = icmp sgt i32 %1, i32 %49; + cbr i1 %50(prob = 0.984615), ^b2, ^b4; + 
^scalar.final: + i1 %51 = icmp sgt i32 %4, i32 %46; + cbr i1 %51(prob = 0.75), ^while.body4, ^b6; + ^while.body3 {scalar}: + i32 %52 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i32* %53 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %52]; + store i32* %53 with i32 0; + i32* %54 = getelementptr &(i32* %53)[i64 1]; + store i32* %54 with i32 0; + i32* %55 = getelementptr &(i32* %53)[i64 2]; + store i32* %55 with i32 0; + i32* %56 = getelementptr &(i32* %53)[i64 3]; + store i32* %56 with i32 0; + i32 %57 = add i32 %52, i32 4; + i1 %58 = icmp sgt i32 %8, i32 %57; + cbr i1 %58(prob = 0.75), ^while.body3, ^scalar.final1; + ^scalar.final1: + i32 %59 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i1 %60 = icmp sgt i32 %4, i32 %59; + cbr i1 %60(prob = 0.75), ^while.body5, ^b7; + ^while.body4 {scalar}: + i32 %61 = phi [^scalar.final, i32 %46] [^while.body4, i32 %63]; + i32* %62 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %61]; + store i32* %62 with i32 0; + i32 %63 = add i32 %61, i32 1; + i1 %64 = icmp sgt i32 %4, i32 %63; + cbr i1 %64(prob = 0.75), ^while.body4, ^b6; + ^b6: + i32 %65 = add i32 %16, i32 1; + i1 %66 = icmp sgt i32 %1, i32 %65; + cbr i1 %66(prob = 0.984615), ^b3, ^b4; + ^while.body5 {scalar}: + i32 %67 = phi [^scalar.final1, i32 %59] [^while.body5, i32 %69]; + i32* %68 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %67]; + store i32* %68 with i32 0; + i32 %69 = add i32 %67, i32 1; + i1 %70 = icmp sgt i32 %4, i32 %69; + cbr i1 %70(prob = 0.75), ^while.body5, ^b7; + ^b7: + i32 %71 = add i32 %12, i32 1; + i1 %72 = icmp sgt i32 %1, i32 %71; + cbr i1 %72(prob = 0.984615), ^b1, ^b4; } internal [4 * i8]* @cmmc_parallel_body_payload_3, align 8; internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -473,8 +680,8 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_4 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; + i1 %5 = icmp sgt i32 %4, i32 3; + i32 %6 = add i32 %4, i32 -3; [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @A to [1024 * [1024 * i32]]*; [1024 * i32]* %8 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; [1024 * [1024 * i32]]* %9 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; @@ -529,7 +736,7 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %36 = icmp sgt i32 %1, i32 %35; cbr i1 %36(prob = 0.984615), ^b2, ^b3; ^while.body2: - i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %78]; + i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %58]; i32* %38 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %37]; i32 %39 = load i32* %38; i32 %40 = mul i32 %22, i32 %39; @@ -550,54 +757,34 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %55 = mul i32 %22, i32 %54; i32* %56 = getelementptr &(i32* %41)[i64 3]; atomicadd i32* %56, i32 %55; - i32* %58 = getelementptr &(i32* %38)[i64 4]; - i32 %59 = load i32* %58; - i32 %60 = mul i32 %22, i32 %59; - i32* %61 = getelementptr &(i32* %41)[i64 4]; - atomicadd i32* %61, i32 %60; - i32* %63 = getelementptr &(i32* %38)[i64 5]; - i32 %64 = load i32* %63; - i32 %65 = mul i32 %22, i32 %64; - i32* %66 = getelementptr &(i32* %41)[i64 5]; - atomicadd i32* %66, i32 %65; - i32* %68 = getelementptr &(i32* %38)[i64 6]; - i32 %69 = load i32* %68; - i32 %70 = mul i32 %22, i32 %69; - 
i32* %71 = getelementptr &(i32* %41)[i64 6]; - atomicadd i32* %71, i32 %70; - i32* %73 = getelementptr &(i32* %38)[i64 7]; - i32 %74 = load i32* %73; - i32 %75 = mul i32 %22, i32 %74; - i32* %76 = getelementptr &(i32* %41)[i64 7]; - atomicadd i32* %76, i32 %75; - i32 %78 = add i32 %37, i32 8; - i1 %79 = icmp sgt i32 %6, i32 %78; - cbr i1 %79(prob = 0.888889), ^while.body2, ^scalar.header; + i32 %58 = add i32 %37, i32 4; + i1 %59 = icmp sgt i32 %6, i32 %58; + cbr i1 %59(prob = 0.941176), ^while.body2, ^scalar.header; ^prebody1: - [1024 * i32]* %80 = getelementptr &([1024 * i32]* %10)[i32 %27]; + [1024 * i32]* %60 = getelementptr &([1024 * i32]* %10)[i32 %27]; ubr ^while.body3; ^scalar.header: - i1 %81 = icmp sle i32 %4, i32 %78; - cbr i1 %81(prob = 0.125), ^while.header, ^while.body4; + i1 %61 = icmp sle i32 %4, i32 %58; + cbr i1 %61(prob = 0.25), ^while.header, ^while.body4; ^while.body3 {scalar}: - i32 %82 = phi [^prebody1, i32 0] [^while.body3, i32 %88]; - i32* %83 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %82]; - i32 %84 = load i32* %83; - i32 %85 = mul i32 %32, i32 %84; - i32* %86 = getelementptr &([1024 * i32]* %80)[i64 0][i32 %82]; - atomicadd i32* %86, i32 %85; - i32 %88 = add i32 %82, i32 1; - i1 %89 = icmp sgt i32 %4, i32 %88; - cbr i1 %89(prob = 0.875), ^while.body3, ^while.header1; + i32 %62 = phi [^prebody1, i32 0] [^while.body3, i32 %68]; + i32* %63 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %62]; + i32 %64 = load i32* %63; + i32 %65 = mul i32 %32, i32 %64; + i32* %66 = getelementptr &([1024 * i32]* %60)[i64 0][i32 %62]; + atomicadd i32* %66, i32 %65; + i32 %68 = add i32 %62, i32 1; + i1 %69 = icmp sgt i32 %4, i32 %68; + cbr i1 %69(prob = 0.75), ^while.body3, ^while.header1; ^while.body4 {scalar}: - i32 %90 = phi [^scalar.header, i32 %78] [^while.body4, i32 %96]; - i32* %91 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %90]; - i32 %92 = load i32* %91; - i32 %93 = mul i32 %22, i32 %92; - i32* %94 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %90]; - atomicadd i32* %94, i32 %93; - i32 %96 = add i32 %90, i32 1; - i1 %97 = icmp sgt i32 %4, i32 %96; - cbr i1 %97(prob = 0.875), ^while.body4, ^while.header; + i32 %70 = phi [^scalar.header, i32 %58] [^while.body4, i32 %76]; + i32* %71 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %70]; + i32 %72 = load i32* %71; + i32 %73 = mul i32 %22, i32 %72; + i32* %74 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %70]; + atomicadd i32* %74, i32 %73; + i32 %76 = add i32 %70, i32 1; + i1 %77 = icmp sgt i32 %4, i32 %76; + cbr i1 %77(prob = 0.75), ^while.body4, ^while.header; } internal [4 * i8]* @cmmc_parallel_body_payload_4, align 8; diff --git a/tests/SysY2022/performance/01_mm2.arm.s b/tests/SysY2022/performance/01_mm2.arm.s index e424c6434..344f0aa0b 100644 --- a/tests/SysY2022/performance/01_mm2.arm.s +++ b/tests/SysY2022/performance/01_mm2.arm.s @@ -1,22 +1,22 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 A: .zero 4194304 -.align 8 +.p2align 3 B: .zero 4194304 -.align 8 +.p2align 3 C: .zero 4194304 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 .text diff --git a/tests/SysY2022/performance/01_mm2.riscv.s b/tests/SysY2022/performance/01_mm2.riscv.s index bb4605631..563a93453 100644 --- a/tests/SysY2022/performance/01_mm2.riscv.s +++ b/tests/SysY2022/performance/01_mm2.riscv.s @@ -1,28 +1,28 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss 
-.align 8 +.p2align 3 A: .zero 4194304 -.align 8 +.p2align 3 B: .zero 4194304 -.align 8 +.p2align 3 C: .zero 4194304 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_4: .zero 4 .text @@ -43,112 +43,112 @@ main: sd s9, 80(sp) sd s10, 88(sp) jal getint -pcrel1067: +pcrel1390: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel1068: +pcrel1391: auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - lui s6, 1 li s3, 5 -pcrel1069: + lui s6, 1 +pcrel1392: auipc a1, %pcrel_hi(cmmc_parallel_body_1) mv s0, a0 - addi s1, a1, %pcrel_lo(pcrel1069) -pcrel1070: + addi s1, a1, %pcrel_lo(pcrel1392) +pcrel1393: auipc a0, %pcrel_hi(cmmc_parallel_body_3) - addi s2, a0, %pcrel_lo(pcrel1070) - ble s0, zero, label907 -pcrel1071: + addi s2, a0, %pcrel_lo(pcrel1393) + ble s0, zero, label1250 +pcrel1394: auipc a0, %pcrel_hi(A) mv s8, zero - addi s7, a0, %pcrel_lo(pcrel1071) + addi s7, a0, %pcrel_lo(pcrel1394) mv s9, s7 mv s10, zero - j label923 + j label1234 .p2align 2 -label989: +label1273: addiw s8, s8, 1 - ble s0, s8, label1059 + ble s0, s8, label1381 add s7, s7, s6 mv s10, zero mv s9, s7 .p2align 2 -label923: +label1234: jal getint addiw s10, s10, 1 sw a0, 0(s9) - ble s0, s10, label989 + ble s0, s10, label1273 addi s9, s9, 4 - j label923 -label907: + j label1234 +label1250: li a0, 65 jal _sysy_starttime mv s6, zero - j label908 -label910: + j label1251 +label1253: addiw s6, s6, 1 - bge s6, s3, label911 + bge s6, s3, label1254 .p2align 2 -label908: - ble s0, zero, label910 -pcrel1072: +label1251: + ble s0, zero, label1253 +pcrel1395: auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - sw s0, %pcrel_lo(pcrel1072)(s4) + sw s0, %pcrel_lo(pcrel1395)(s4) mv a0, zero mv a1, s0 mv a2, s1 jal cmmcParallelFor - ble s0, zero, label917 -pcrel1073: + ble s0, zero, label1260 +pcrel1396: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel1074: +pcrel1397: auipc a3, %pcrel_hi(cmmc_parallel_body_2) - sw s0, %pcrel_lo(pcrel1073)(a0) - addi a2, a3, %pcrel_lo(pcrel1074) + sw s0, %pcrel_lo(pcrel1396)(a0) + addi a2, a3, %pcrel_lo(pcrel1397) mv a1, s0 mv a0, zero jal cmmcParallelFor .p2align 2 -label917: +label1260: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) - sw s0, %pcrel_lo(label917)(s5) + sw s0, %pcrel_lo(label1260)(s5) mv a0, zero mv a1, s0 mv a2, s2 jal cmmcParallelFor - ble s0, zero, label910 -pcrel1075: + ble s0, zero, label1253 +pcrel1398: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel1076: +pcrel1399: auipc a3, %pcrel_hi(cmmc_parallel_body_4) - sw s0, %pcrel_lo(pcrel1075)(a0) - addi a2, a3, %pcrel_lo(pcrel1076) + sw s0, %pcrel_lo(pcrel1398)(a0) + addi a2, a3, %pcrel_lo(pcrel1399) mv a1, s0 mv a0, zero jal cmmcParallelFor addiw s6, s6, 1 - blt s6, s3, label908 -label911: - ble s0, zero, label953 -pcrel1077: + blt s6, s3, label1251 +label1254: + ble s0, zero, label1306 +pcrel1400: auipc s1, %pcrel_hi(cmmc_parallel_body_payload_0) slli a0, s0, 32 -pcrel1078: +pcrel1401: auipc a3, %pcrel_hi(cmmc_parallel_body_0) - sd a0, %pcrel_lo(pcrel1077)(s1) - addi a2, a3, %pcrel_lo(pcrel1078) + sd a0, %pcrel_lo(pcrel1400)(s1) + addi a2, a3, %pcrel_lo(pcrel1401) mv a1, s0 mv a0, zero jal cmmcParallelFor - lw s0, %pcrel_lo(pcrel1077)(s1) -label912: + lw s0, %pcrel_lo(pcrel1400)(s1) +label1255: li a0, 84 jal _sysy_stoptime mv a0, s0 jal putint li a0, 10 jal putch - ld ra, 0(sp) mv 
a0, zero + ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) ld s1, 24(sp) @@ -162,713 +162,941 @@ label912: ld s10, 88(sp) addi sp, sp, 96 ret -label1059: +label1381: auipc a0, %pcrel_hi(B) mv s8, zero mv s10, zero - addi s7, a0, %pcrel_lo(label1059) + addi s7, a0, %pcrel_lo(label1381) mv s9, s7 - j label933 + j label1244 .p2align 2 -label938: +label1249: addi s9, s9, 4 .p2align 2 -label933: +label1244: jal getint addiw s10, s10, 1 sw a0, 0(s9) - bgt s0, s10, label938 + bgt s0, s10, label1249 addiw s8, s8, 1 - ble s0, s8, label907 + ble s0, s8, label1250 add s7, s7, s6 mv s10, zero mv s9, s7 - j label933 -label953: + j label1244 +label1306: mv s0, zero - j label912 + j label1255 .p2align 2 cmmc_parallel_body_0: - mv t0, a0 -pcrel146: + addi sp, sp, -8 + mv t1, a0 +pcrel289: auipc a4, %pcrel_hi(cmmc_parallel_body_payload_0) lui a2, 1 - addi a3, a4, %pcrel_lo(pcrel146) + sd s0, 0(sp) + addi a3, a4, %pcrel_lo(pcrel289) lw a0, 4(a3) bgt a0, zero, label2 - mv t2, zero -label35: - amoadd.w.aqrl a0, t2, (a3) + mv t1, zero +label65: + amoadd.w.aqrl a0, t1, (a3) + ld s0, 0(sp) + addi sp, sp, 8 ret label2: - addiw a5, a0, -7 -pcrel147: - auipc t2, %pcrel_hi(B) - li t1, 7 - addi a4, t2, %pcrel_lo(pcrel147) - bgt a0, t1, label3 - slli t1, t0, 12 - mv a5, t0 + auipc t0, %pcrel_hi(B) + li a4, 3 + addi a5, t0, %pcrel_lo(label2) + ble a0, a4, label80 + addiw a4, a0, -3 + addiw t0, a0, -18 + li t2, 15 + bgt a4, t2, label24 + slli t2, t1, 12 + mv t0, t1 + mv t3, zero + add a5, a5, t2 + mv t1, a5 mv t2, zero - add a4, a4, t1 + j label9 +.p2align 2 +label124: + addiw t0, t0, 1 + ble a1, t0, label271 +.p2align 2 +label17: + add a5, a5, a2 + lw t6, 0(a5) + mv t1, a5 + lw a7, 4(a5) + addw t5, t2, t6 + lw a6, 8(a5) + li t2, 4 + addw t4, t5, a7 + lw t5, 12(a5) + addw t6, t4, a6 + addw t3, t6, t5 + ble a4, t2, label104 +.p2align 2 +label13: + addi t1, t1, 16 +.p2align 2 +label9: + lw t6, 0(t1) + addiw t2, t2, 4 + lw a7, 4(t1) + addw t5, t3, t6 + lw a6, 8(t1) + addw t4, t5, a7 + lw t5, 12(t1) + addw t6, t4, a6 + addw t3, t6, t5 + bgt a4, t2, label13 +.p2align 2 +label104: + ble a0, t2, label263 + sh2add t1, t2, a5 + mv t4, t2 + mv t2, t3 +.p2align 2 +label19: + lw t5, 0(t1) + addiw t4, t4, 1 + addw t2, t2, t5 + ble a0, t4, label124 + addi t1, t1, 4 + j label19 +label24: + slli t3, t1, 12 + mv t4, zero + add a5, a5, t3 + mv t2, a5 + mv t3, zero + j label29 +.p2align 2 +label203: + addiw t1, t1, 1 + ble a1, t1, label193 +.p2align 2 +label40: + add a5, a5, a2 + mv t4, t3 + mv t2, a5 + mv t3, zero +.p2align 2 +label29: + lw a6, 0(t2) + addiw t3, t3, 16 + lw a7, 4(t2) + addw t6, t4, a6 + lw s0, 8(t2) + addw t5, t6, a7 + lw a6, 12(t2) + addw t4, t5, s0 + lw a7, 16(t2) + addw t6, t4, a6 + lw t4, 20(t2) + addw t5, t6, a7 + lw a7, 24(t2) + addw a6, t5, t4 + lw t5, 28(t2) + addw t4, a6, a7 + lw a6, 32(t2) + addw t6, t4, t5 + lw a7, 36(t2) + addw t5, t6, a6 + lw t6, 40(t2) + addw t4, t5, a7 + lw a7, 44(t2) + addw a6, t4, t6 + lw t5, 48(t2) + addw t6, a6, a7 + lw a7, 52(t2) + addw t4, t6, t5 + lw a6, 56(t2) + addw t5, t4, a7 + lw a7, 60(t2) + addw t6, t5, a6 + addw t4, t6, a7 + ble t0, t3, label179 + addi t2, t2, 64 + j label29 +.p2align 2 +label179: + ble a4, t3, label265 + sh2add t2, t3, a5 +.p2align 2 +label48: + lw t6, 0(t2) + addiw t3, t3, 4 + lw a7, 4(t2) + addw t5, t4, t6 + lw t4, 8(t2) + addw a6, t5, a7 + lw t5, 12(t2) + addw t6, a6, t4 + addw t4, t6, t5 + ble a4, t3, label222 + addi t2, t2, 16 + j label48 +.p2align 2 +label222: + mv t5, t4 + ble a0, t3, label272 +.p2align 2 +label41: + sh2add t2, t3, a5 + mv t4, t3 + mv t3, t5 
+.p2align 2 +label42: + lw t6, 0(t2) + addiw t4, t4, 1 + addw t3, t3, t6 + ble a0, t4, label203 + addi t2, t2, 4 + j label42 +label272: + mv t3, t5 + addiw t1, t1, 1 + bgt a1, t1, label40 +label193: + mv t1, t3 + j label65 +label271: + mv t1, t2 + j label65 +label263: + mv t2, t3 + addiw t0, t0, 1 + bgt a1, t0, label17 + j label271 +label265: + mv t5, t4 + bgt a0, t3, label41 + j label272 +label80: + slli t0, t1, 12 + mv t2, zero + add a4, a5, t0 mv t0, a4 + mv a5, t1 mv t1, zero - j label28 + j label58 .p2align 2 -label32: +label62: addi t0, t0, 4 .p2align 2 -label28: +label58: lw t3, 0(t0) addiw t2, t2, 1 addw t1, t1, t3 - bgt a0, t2, label32 + bgt a0, t2, label62 addiw a5, a5, 1 - ble a1, a5, label137 + ble a1, a5, label65 .p2align 2 -label34: +label64: add a4, a4, a2 li t2, 1 lw t3, 0(a4) mv t0, a4 addw t1, t1, t3 - bgt a0, t2, label32 + bgt a0, t2, label62 addiw a5, a5, 1 - bgt a1, a5, label34 -label137: - mv t2, t1 - j label35 -label3: - slli t1, t0, 12 - mv t3, zero - add a4, a4, t1 - mv t1, a4 - mv t2, zero - j label8 -.p2align 2 -label95: - addiw t0, t0, 1 - ble a1, t0, label35 + bgt a1, a5, label64 + j label65 .p2align 2 -label22: - add a4, a4, a2 - mv t3, t2 - mv t1, a4 - mv t2, zero +cmmc_parallel_body_1: + mv t0, a0 +pcrel529: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) +pcrel530: + auipc a5, %pcrel_hi(C) + li a3, 3 + lw a0, %pcrel_lo(pcrel529)(a2) + addi a4, a5, %pcrel_lo(pcrel530) + lui a2, 1 + bgt a0, a3, label291 + bgt a0, zero, label333 +label331: + ret +label333: + slliw a5, t0, 12 + add a3, a4, a5 + mv a4, t0 + mv a5, a3 + mv t0, zero + j label337 .p2align 2 -label8: - lw t5, 0(t1) - addiw t2, t2, 8 - lw a6, 4(t1) - addw t4, t3, t5 - lw t3, 8(t1) - addw t6, t4, a6 - lw a6, 12(t1) - addw t5, t6, t3 - lw t6, 16(t1) - addw t4, t5, a6 - lw a6, 20(t1) - addw t3, t4, t6 - lw t6, 24(t1) - addw t5, t3, a6 - lw a6, 28(t1) - addw t4, t5, t6 - addw t3, t4, a6 - ble a5, t2, label81 - addi t1, t1, 32 - j label8 -.p2align 2 -label81: - ble a0, t2, label134 - sh2add t1, t2, a4 - mv t4, t2 - mv t2, t3 +label340: + addi a5, a5, 4 .p2align 2 -label15: - lw t5, 0(t1) - addiw t4, t4, 1 - addw t2, t2, t5 - ble a0, t4, label95 - addi t1, t1, 4 - j label15 -label134: - mv t2, t3 +label337: addiw t0, t0, 1 - bgt a1, t0, label22 - j label35 + sw zero, 0(a5) + bgt a0, t0, label340 + addiw a4, a4, 1 + ble a1, a4, label331 .p2align 2 -cmmc_parallel_body_1: - mv a5, a0 -pcrel273: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) -pcrel274: - auipc t1, %pcrel_hi(C) - li t0, 7 - lw a0, %pcrel_lo(pcrel273)(a2) - addi a4, t1, %pcrel_lo(pcrel274) - lui a2, 1 - addi a3, a0, -7 - bgt a0, t0, label149 - bgt a0, zero, label166 - j label176 -label149: - slliw t0, a5, 12 - add a4, a4, t0 - mv t0, a4 +label342: + add a3, a3, a2 + li t0, 1 + sw zero, 0(a3) + mv a5, a3 + bgt a0, t0, label340 + addiw a4, a4, 1 + bgt a1, a4, label342 + j label331 +label291: + addiw a3, a0, -3 + addiw a5, a0, -18 + li t1, 15 + bgt a3, t1, label292 + slliw t1, t0, 12 + mv a5, t0 + add a4, a4, t1 mv t1, zero - j label153 -.p2align 2 -label162: - addi t0, t0, 4 + mv t0, a4 + j label319 .p2align 2 -label159: - addiw t2, t2, 1 - sw zero, 0(t0) - bgt a0, t2, label162 +label435: addiw a5, a5, 1 - ble a1, a5, label176 + ble a1, a5, label331 .p2align 2 -label164: +label329: add a4, a4, a2 - li t1, 8 + li t1, 4 sd zero, 0(a4) mv t0, a4 sd zero, 8(a4) - sd zero, 16(a4) - sd zero, 24(a4) - ble a3, t1, label265 + ble a3, t1, label515 .p2align 2 -label156: - addi t0, t0, 32 +label330: + addi t0, t0, 16 .p2align 2 -label153: - addiw t1, 
t1, 8 +label319: + addiw t1, t1, 4 sd zero, 0(t0) sd zero, 8(t0) - sd zero, 16(t0) - sd zero, 24(t0) - bgt a3, t1, label156 - ble a0, t1, label261 + bgt a3, t1, label330 + ble a0, t1, label428 .p2align 2 -label158: +label323: sh2add t0, t1, a4 mv t2, t1 - j label159 -label265: - bgt a0, t1, label158 .p2align 2 -label261: - addiw a5, a5, 1 - bgt a1, a5, label164 -label176: - ret -label166: - slliw t0, a5, 12 - add a3, a4, t0 - mv a4, a5 - mv a5, a3 - mv t0, zero - j label170 +label324: + addiw t2, t2, 1 + sw zero, 0(t0) + ble a0, t2, label435 + addi t0, t0, 4 + j label324 +label515: + bgt a0, t1, label323 .p2align 2 -label175: - addi a5, a5, 4 +label428: + addiw a5, a5, 1 + bgt a1, a5, label329 + j label331 +label292: + slliw t1, t0, 12 + add a4, a4, t1 + mv t1, a4 + mv t2, zero + j label296 .p2align 2 -label170: +label412: addiw t0, t0, 1 - sw zero, 0(a5) - bgt a0, t0, label175 - addiw a4, a4, 1 - ble a1, a4, label176 + ble a1, t0, label331 .p2align 2 -label174: - add a3, a3, a2 - li t0, 1 - sw zero, 0(a3) - mv a5, a3 - bgt a0, t0, label175 - addiw a4, a4, 1 - bgt a1, a4, label174 - j label176 +label308: + add a4, a4, a2 + li t2, 16 + sd zero, 0(a4) + mv t1, a4 + sd zero, 8(a4) + sd zero, 16(a4) + sd zero, 24(a4) + sd zero, 32(a4) + sd zero, 40(a4) + sd zero, 48(a4) + sd zero, 56(a4) + ble a5, t2, label299 +.p2align 2 +label314: + addi t1, t1, 64 +.p2align 2 +label296: + addiw t2, t2, 16 + sd zero, 0(t1) + sd zero, 8(t1) + sd zero, 16(t1) + sd zero, 24(t1) + sd zero, 32(t1) + sd zero, 40(t1) + sd zero, 48(t1) + sd zero, 56(t1) + bgt a5, t2, label314 +.p2align 2 +label299: + ble a3, t2, label384 + sh2add t1, t2, a4 + mv t3, t2 + j label301 +.p2align 2 +label304: + addi t1, t1, 16 +.p2align 2 +label301: + addiw t3, t3, 4 + sd zero, 0(t1) + sd zero, 8(t1) + bgt a3, t3, label304 + ble a0, t3, label508 +.p2align 2 +label309: + sh2add t1, t3, a4 + mv t2, t3 +.p2align 2 +label310: + addiw t2, t2, 1 + sw zero, 0(t1) + ble a0, t2, label412 + addi t1, t1, 4 + j label310 +.p2align 2 +label384: + mv t3, t2 + bgt a0, t2, label309 + addiw t0, t0, 1 + bgt a1, t0, label308 + j label331 +label508: + addiw t0, t0, 1 + bgt a1, t0, label308 + j label331 .p2align 2 cmmc_parallel_body_2: addi sp, sp, -48 mv t2, a0 mv a4, a1 -pcrel522: +pcrel753: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel523: - auipc a5, %pcrel_hi(A) -pcrel524: +pcrel754: + auipc t1, %pcrel_hi(C) +pcrel755: auipc t3, %pcrel_hi(B) - li t0, 7 -pcrel525: - auipc a1, %pcrel_hi(C) - addi a3, a5, %pcrel_lo(pcrel523) sd s0, 0(sp) - addi a5, t3, %pcrel_lo(pcrel524) +pcrel756: + auipc a1, %pcrel_hi(A) + addi a5, t3, %pcrel_lo(pcrel755) + addi a3, a1, %pcrel_lo(pcrel756) sd s5, 8(sp) + lui a1, 1 sd s1, 16(sp) sd s3, 24(sp) sd s2, 32(sp) sd s4, 40(sp) - lw a0, %pcrel_lo(pcrel522)(a2) - addi a2, a1, %pcrel_lo(pcrel525) - addi t1, a0, -7 - lui a1, 1 - bgt a0, t0, label276 - bgt a0, zero, label302 -label300: - ld s0, 0(sp) - ld s5, 8(sp) - ld s1, 16(sp) - ld s3, 24(sp) - ld s2, 32(sp) - ld s4, 40(sp) - addi sp, sp, 48 - ret -label276: + lw a0, %pcrel_lo(pcrel753)(a2) + addi a2, t1, %pcrel_lo(pcrel754) + addi t0, a0, -3 + li t1, 3 + bgt a0, t1, label532 + ble a0, zero, label556 + slliw t1, t2, 12 + add t0, a5, t1 + mv a5, t2 + mv t1, a3 + mv t4, zero + bgt a0, zero, label567 + j label565 +.p2align 2 +label572: + add t1, t1, a1 + mv t4, t2 + ble a0, t2, label735 +.p2align 2 +label567: + sh2add t5, a5, t1 + addiw t2, t4, 1 + lw t3, 0(t5) + beq t3, zero, label574 + slliw a6, t4, 12 + mv t5, t0 + mv t6, zero + add t4, a2, a6 +.p2align 2 +label569: + 
lw s0, 0(t5) + sh2add a7, t6, t4 + addiw t6, t6, 1 + mulw a6, t3, s0 + amoadd.w.aqrl s1, a6, (a7) + ble a0, t6, label572 + addi t5, t5, 4 + j label569 +.p2align 2 +label574: + add t1, t1, a1 + mv t4, t2 + bgt a0, t2, label567 + addiw a5, a5, 1 + ble a4, a5, label556 +.p2align 2 +label566: + add t0, t0, a1 + mv t1, a3 + mv t4, zero + bgt a0, zero, label567 +label565: + addiw a5, a5, 1 + bgt a4, a5, label566 + j label556 +label532: slliw t3, t2, 12 - add t0, a5, t3 + add t1, a5, t3 mv t3, a3 mv t5, zero - bgt a0, zero, label283 - j label339 + bgt a0, zero, label541 + j label539 +.p2align 2 +label553: + addi t6, t6, 4 .p2align 2 -label284: +label549: + lw s1, 0(t6) + sh2add s0, a6, t5 + addiw a6, a6, 1 + mulw a7, a5, s1 + amoadd.w.aqrl s2, a7, (s0) + bgt a0, a6, label553 add t3, t3, a1 mv t5, t4 - ble a0, t4, label498 + ble a0, t4, label733 .p2align 2 -label283: +label541: sh2add t6, t2, t3 addiw t4, t5, 1 lw a5, 0(t6) - beq a5, zero, label284 + bne a5, zero, label609 + add t3, t3, a1 + mv t5, t4 + bgt a0, t4, label541 + addiw t2, t2, 1 + ble a4, t2, label556 +.p2align 2 +label540: + add t1, t1, a1 + mv t3, a3 + mv t5, zero + bgt a0, zero, label541 +label539: + addiw t2, t2, 1 + bgt a4, t2, label540 + j label556 +.p2align 2 +label609: slliw a7, t5, 12 + mv t6, t1 mv a6, zero add t5, a2, a7 - mv t6, t5 - j label286 -.p2align 2 -label289: - addi t6, t6, 32 .p2align 2 -label286: - sh2add a7, a6, t0 - addi s4, t6, 4 - addiw a6, a6, 8 - lw s0, 0(a7) +label544: + lw s0, 0(t6) + sh2add a7, a6, t5 + addiw a6, a6, 4 mulw s1, a5, s0 - amoadd.w.aqrl s3, s1, (t6) - lw s2, 4(a7) + amoadd.w.aqrl s3, s1, (a7) + addi s3, a7, 4 + lw s2, 4(t6) mulw s0, a5, s2 - addi s2, t6, 8 - amoadd.w.aqrl s5, s0, (s4) - lw s3, 8(a7) - mulw s1, a5, s3 - amoadd.w.aqrl s4, s1, (s2) - addi s2, t6, 12 - lw s3, 12(a7) - mulw s0, a5, s3 - amoadd.w.aqrl s4, s0, (s2) - addi s2, t6, 16 - lw s3, 16(a7) - mulw s1, a5, s3 - amoadd.w.aqrl s4, s1, (s2) - addi s2, t6, 20 - lw s3, 20(a7) - mulw s0, a5, s3 - amoadd.w.aqrl s4, s0, (s2) - addi s2, t6, 24 - lw s3, 24(a7) - mulw s1, a5, s3 - amoadd.w.aqrl s4, s1, (s2) - lw s3, 28(a7) - addi a7, t6, 28 + addi s2, a7, 8 + amoadd.w.aqrl s5, s0, (s3) + lw s4, 8(t6) + mulw s1, a5, s4 + amoadd.w.aqrl s5, s1, (s2) + addi s1, a7, 12 + lw s3, 12(t6) mulw s0, a5, s3 - amoadd.w.aqrl s1, s0, (a7) - bgt t1, a6, label289 - ble a0, a6, label499 - sh2add t6, a6, t0 - j label292 -.p2align 2 -label295: - addi t6, t6, 4 + amoadd.w.aqrl s2, s0, (s1) + ble t0, a6, label547 + addi t6, t6, 16 + j label544 +.p2align 2 +label547: + ble a0, a6, label641 + sh2add t6, a6, t1 + j label549 .p2align 2 -label292: - lw s1, 0(t6) - sh2add s0, a6, t5 - addiw a6, a6, 1 - mulw a7, a5, s1 - amoadd.w.aqrl s2, a7, (s0) - bgt a0, a6, label295 +label641: add t3, t3, a1 mv t5, t4 - bgt a0, t4, label283 + bgt a0, t4, label541 addiw t2, t2, 1 - bgt a4, t2, label299 - j label300 + bgt a4, t2, label540 +label556: + ld s0, 0(sp) + ld s5, 8(sp) + ld s1, 16(sp) + ld s3, 24(sp) + ld s2, 32(sp) + ld s4, 40(sp) + addi sp, sp, 48 + ret .p2align 2 -label498: +label733: addiw t2, t2, 1 - ble a4, t2, label300 + bgt a4, t2, label540 + j label556 .p2align 2 -label299: - add t0, t0, a1 - mv t3, a3 - mv t5, zero - bgt a0, zero, label283 -label339: - addiw t2, t2, 1 - bgt a4, t2, label299 - j label300 +label735: + addiw a5, a5, 1 + bgt a4, a5, label566 + j label556 .p2align 2 -label499: - add t3, t3, a1 - mv t5, t4 - bgt a0, t4, label283 - addiw t2, t2, 1 - bgt a4, t2, label299 - j label300 -label302: - slliw t1, t2, 12 - add t0, a5, t1 - mv a5, t2 - mv 
t1, a3 - mv t4, zero - bgt a0, zero, label311 - j label309 +cmmc_parallel_body_3: + mv t0, a0 +pcrel999: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_3) +pcrel1000: + auipc a5, %pcrel_hi(B) + li a3, 3 + lw a0, %pcrel_lo(pcrel999)(a2) + addi a4, a5, %pcrel_lo(pcrel1000) + lui a2, 1 + bgt a0, a3, label758 + ble a0, zero, label798 + slliw a5, t0, 12 + add a3, a4, a5 + mv a4, t0 + mv a5, a3 + mv t0, zero + j label804 .p2align 2 -label317: - addi t5, t5, 4 +label807: + addi a5, a5, 4 .p2align 2 -label313: - lw a7, 0(t5) - sh2add s0, t6, t4 - addiw t6, t6, 1 - mulw a6, t3, a7 - amoadd.w.aqrl a7, a6, (s0) - bgt a0, t6, label317 - add t1, t1, a1 - mv t4, t2 - ble a0, t2, label502 +label804: + addiw t0, t0, 1 + sw zero, 0(a5) + bgt a0, t0, label807 + addiw a4, a4, 1 + ble a1, a4, label798 .p2align 2 -label311: - sh2add t5, a5, t1 - addiw t2, t4, 1 - lw t3, 0(t5) - bne t3, zero, label312 - add t1, t1, a1 - mv t4, t2 - bgt a0, t2, label311 - addiw a5, a5, 1 - bgt a4, a5, label310 - j label300 +label809: + add a3, a3, a2 + li t0, 1 + sw zero, 0(a3) + mv a5, a3 + bgt a0, t0, label807 + addiw a4, a4, 1 + bgt a1, a4, label809 + j label798 +label758: + addiw a3, a0, -3 + addiw a5, a0, -18 + li t1, 15 + ble a3, t1, label825 + slliw t1, t0, 12 + add a4, a4, t1 + mv t1, a4 + mv t2, zero + j label763 .p2align 2 -label312: - slliw a6, t4, 12 - mv t5, t0 - mv t6, zero - add t4, a2, a6 - j label313 +label879: + addiw t0, t0, 1 + ble a1, t0, label798 .p2align 2 -label502: - addiw a5, a5, 1 - ble a4, a5, label300 +label775: + add a4, a4, a2 + li t2, 16 + sd zero, 0(a4) + mv t1, a4 + sd zero, 8(a4) + sd zero, 16(a4) + sd zero, 24(a4) + sd zero, 32(a4) + sd zero, 40(a4) + sd zero, 48(a4) + sd zero, 56(a4) + ble a5, t2, label766 +.p2align 2 +label781: + addi t1, t1, 64 +.p2align 2 +label763: + addiw t2, t2, 16 + sd zero, 0(t1) + sd zero, 8(t1) + sd zero, 16(t1) + sd zero, 24(t1) + sd zero, 32(t1) + sd zero, 40(t1) + sd zero, 48(t1) + sd zero, 56(t1) + bgt a5, t2, label781 +.p2align 2 +label766: + ble a3, t2, label851 + sh2add t1, t2, a4 + mv t3, t2 + j label768 .p2align 2 -label310: - add t0, t0, a1 - mv t1, a3 - mv t4, zero - bgt a0, zero, label311 -label309: - addiw a5, a5, 1 - bgt a4, a5, label310 - j label300 +label771: + addi t1, t1, 16 .p2align 2 -cmmc_parallel_body_3: - mv a5, a0 -pcrel651: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel652: - auipc t1, %pcrel_hi(B) - li t0, 7 - lw a0, %pcrel_lo(pcrel651)(a2) - addi a4, t1, %pcrel_lo(pcrel652) - lui a2, 1 - addi a3, a0, -7 - bgt a0, t0, label527 - bgt a0, zero, label545 - j label543 -label527: - slliw t0, a5, 12 - add a4, a4, t0 - mv t0, a4 - mv t1, zero - j label531 +label768: + addiw t3, t3, 4 + sd zero, 0(t1) + sd zero, 8(t1) + bgt a3, t3, label771 + ble a0, t3, label975 .p2align 2 -label540: - addi t0, t0, 4 +label776: + sh2add t1, t3, a4 + mv t2, t3 .p2align 2 -label537: +label777: addiw t2, t2, 1 - sw zero, 0(t0) - bgt a0, t2, label540 + sw zero, 0(t1) + ble a0, t2, label879 + addi t1, t1, 4 + j label777 +.p2align 2 +label851: + mv t3, t2 + bgt a0, t2, label776 + addiw t0, t0, 1 + bgt a1, t0, label775 +label798: + ret +label975: + addiw t0, t0, 1 + bgt a1, t0, label775 + j label798 +label825: + slliw t1, t0, 12 + mv a5, t0 + add a4, a4, t1 + mv t1, zero + mv t0, a4 + j label786 +.p2align 2 +label909: addiw a5, a5, 1 - ble a1, a5, label543 + ble a1, a5, label798 .p2align 2 -label542: +label792: add a4, a4, a2 - li t1, 8 + li t1, 4 sd zero, 0(a4) mv t0, a4 sd zero, 8(a4) - sd zero, 16(a4) - sd zero, 24(a4) - ble a3, t1, label643 + ble a3, 
t1, label983 .p2align 2 -label534: - addi t0, t0, 32 +label789: + addi t0, t0, 16 .p2align 2 -label531: - addiw t1, t1, 8 +label786: + addiw t1, t1, 4 sd zero, 0(t0) sd zero, 8(t0) - sd zero, 16(t0) - sd zero, 24(t0) - bgt a3, t1, label534 - ble a0, t1, label639 + bgt a3, t1, label789 + ble a0, t1, label977 .p2align 2 -label536: +label793: sh2add t0, t1, a4 mv t2, t1 - j label537 -label643: - bgt a0, t1, label536 -.p2align 2 -label639: - addiw a5, a5, 1 - bgt a1, a5, label542 -label543: - ret -label545: - slliw t0, a5, 12 - add a3, a4, t0 - mv a4, a5 - mv a5, a3 - mv t0, zero - j label549 .p2align 2 -label554: - addi a5, a5, 4 -.p2align 2 -label549: - addiw t0, t0, 1 - sw zero, 0(a5) - bgt a0, t0, label554 - addiw a4, a4, 1 - ble a1, a4, label543 +label794: + addiw t2, t2, 1 + sw zero, 0(t0) + ble a0, t2, label909 + addi t0, t0, 4 + j label794 +label983: + bgt a0, t1, label793 .p2align 2 -label553: - add a3, a3, a2 - li t0, 1 - sw zero, 0(a3) - mv a5, a3 - bgt a0, t0, label554 - addiw a4, a4, 1 - bgt a1, a4, label553 - j label543 +label977: + addiw a5, a5, 1 + bgt a1, a5, label792 + j label798 .p2align 2 cmmc_parallel_body_4: - addi sp, sp, -48 + addi sp, sp, -40 mv t2, a0 mv a4, a1 -pcrel902: +pcrel1225: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel903: +pcrel1226: auipc a5, %pcrel_hi(A) -pcrel904: +pcrel1227: auipc t3, %pcrel_hi(C) - li t0, 7 -pcrel905: + li t1, 3 +pcrel1228: auipc a1, %pcrel_hi(B) - addi a3, a5, %pcrel_lo(pcrel903) - sd s0, 0(sp) - addi a5, t3, %pcrel_lo(pcrel904) - sd s5, 8(sp) - sd s2, 16(sp) - sd s3, 24(sp) - sd s1, 32(sp) - sd s4, 40(sp) - lw a0, %pcrel_lo(pcrel902)(a2) - addi a2, a1, %pcrel_lo(pcrel905) - addi t1, a0, -7 + addi a3, a5, %pcrel_lo(pcrel1226) + sd s1, 0(sp) + addi a5, t3, %pcrel_lo(pcrel1227) + sd s0, 8(sp) + sd s3, 16(sp) + sd s2, 24(sp) + sd s4, 32(sp) + lw a0, %pcrel_lo(pcrel1225)(a2) + addi a2, a1, %pcrel_lo(pcrel1228) + addi t0, a0, -3 lui a1, 1 - ble a0, t0, label679 + bgt a0, t1, label1002 + bgt a0, zero, label1028 +label1026: + ld s1, 0(sp) + ld s0, 8(sp) + ld s3, 16(sp) + ld s2, 24(sp) + ld s4, 32(sp) + addi sp, sp, 40 + ret +label1002: slliw t3, t2, 12 - add t0, a5, t3 + add t1, a5, t3 mv t3, a3 mv t5, zero - bgt a0, zero, label661 - j label717 + bgt a0, zero, label1011 + j label1009 .p2align 2 -label675: +label1023: + addi t6, t6, 4 +.p2align 2 +label1020: + lw s1, 0(t6) + sh2add s0, a6, t5 + addiw a6, a6, 1 + mulw a7, a5, s1 + amoadd.w.aqrl s2, a7, (s0) + bgt a0, a6, label1023 add t3, t3, a1 mv t5, t4 - ble a0, t4, label879 + ble a0, t4, label1210 .p2align 2 -label661: +label1011: sh2add t6, t2, t3 addiw t4, t5, 1 lw a5, 0(t6) - beq a5, zero, label675 + bne a5, zero, label1012 + add t3, t3, a1 + mv t5, t4 + bgt a0, t4, label1011 + addiw t2, t2, 1 + ble a4, t2, label1026 +.p2align 2 +label1010: + add t1, t1, a1 + mv t3, a3 + mv t5, zero + bgt a0, zero, label1011 +label1009: + addiw t2, t2, 1 + bgt a4, t2, label1010 + j label1026 +.p2align 2 +label1012: slliw a7, t5, 12 + mv t6, t1 mv a6, zero add t5, a2, a7 - mv t6, t5 - j label663 -.p2align 2 -label666: - addi t6, t6, 32 -.p2align 2 -label663: - sh2add a7, a6, t0 - addiw a6, a6, 8 - lw s0, 0(a7) - mulw s2, a5, s0 - amoadd.w.aqrl s3, s2, (t6) - addi s2, t6, 4 - lw s1, 4(a7) + j label1013 +.p2align 2 +label1016: + addi t6, t6, 16 +.p2align 2 +label1013: + lw s1, 0(t6) + sh2add a7, a6, t5 + addiw a6, a6, 4 mulw s0, a5, s1 - amoadd.w.aqrl s3, s0, (s2) - addi s3, t6, 8 - lw s4, 8(a7) - mulw s1, a5, s4 - amoadd.w.aqrl s5, s1, (s3) - addi s3, t6, 12 - lw s2, 12(a7) - mulw s0, a5, s2 - 
amoadd.w.aqrl s4, s0, (s3) - addi s3, t6, 16 - lw s2, 16(a7) + amoadd.w.aqrl s3, s0, (a7) + addi s3, a7, 4 + lw s2, 4(t6) mulw s1, a5, s2 amoadd.w.aqrl s4, s1, (s3) - addi s3, t6, 20 - lw s2, 20(a7) + addi s3, a7, 8 + lw s2, 8(t6) mulw s0, a5, s2 - addi s2, t6, 24 - amoadd.w.aqrl s5, s0, (s3) - lw s4, 24(a7) - mulw s1, a5, s4 - amoadd.w.aqrl s5, s1, (s2) - lw s3, 28(a7) - addi a7, t6, 28 - mulw s0, a5, s3 - amoadd.w.aqrl s1, s0, (a7) - bgt t1, a6, label666 - ble a0, a6, label876 - sh2add t6, a6, t0 -.p2align 2 -label669: - lw s1, 0(t6) - sh2add s0, a6, t5 - addiw a6, a6, 1 - mulw a7, a5, s1 - amoadd.w.aqrl s2, a7, (s0) - ble a0, a6, label789 - addi t6, t6, 4 - j label669 -.p2align 2 -label879: - addiw t2, t2, 1 - ble a4, t2, label678 -.p2align 2 -label677: - add t0, t0, a1 - mv t3, a3 - mv t5, zero - bgt a0, zero, label661 - j label717 -.p2align 2 -label789: - add t3, t3, a1 - mv t5, t4 - bgt a0, t4, label661 - addiw t2, t2, 1 - bgt a4, t2, label677 - j label678 + amoadd.w.aqrl s4, s0, (s3) + addi s0, a7, 12 + lw s2, 12(t6) + mulw s1, a5, s2 + amoadd.w.aqrl s3, s1, (s0) + bgt t0, a6, label1016 + ble a0, a6, label1201 + sh2add t6, a6, t1 + j label1020 .p2align 2 -label876: +label1201: add t3, t3, a1 mv t5, t4 - bgt a0, t4, label661 -label717: + bgt a0, t4, label1011 addiw t2, t2, 1 - bgt a4, t2, label677 - j label678 -label679: - bgt a0, zero, label680 -label678: - ld s0, 0(sp) - ld s5, 8(sp) - ld s2, 16(sp) - ld s3, 24(sp) - ld s1, 32(sp) - ld s4, 40(sp) - addi sp, sp, 48 - ret -label680: + bgt a4, t2, label1010 + j label1026 +label1028: slliw t1, t2, 12 add t0, a5, t1 mv a5, t2 mv t1, a3 mv t4, zero - bgt a0, zero, label689 - j label687 + bgt a0, zero, label1037 + j label1035 .p2align 2 -label694: +label1042: add t1, t1, a1 mv t4, t2 - ble a0, t2, label880 + ble a0, t2, label1205 .p2align 2 -label689: +label1037: sh2add t5, a5, t1 addiw t2, t4, 1 lw t3, 0(t5) - bne t3, zero, label690 + beq t3, zero, label1044 + slliw a6, t4, 12 + mv t5, t0 + mv t6, zero + add t4, a2, a6 +.p2align 2 +label1039: + lw s0, 0(t5) + sh2add a7, t6, t4 + addiw t6, t6, 1 + mulw a6, t3, s0 + amoadd.w.aqrl s1, a6, (a7) + ble a0, t6, label1042 + addi t5, t5, 4 + j label1039 +.p2align 2 +label1044: add t1, t1, a1 mv t4, t2 - bgt a0, t2, label689 + bgt a0, t2, label1037 addiw a5, a5, 1 - ble a4, a5, label678 + ble a4, a5, label1026 .p2align 2 -label688: +label1036: add t0, t0, a1 mv t1, a3 mv t4, zero - bgt a0, zero, label689 -label687: + bgt a0, zero, label1037 +label1035: addiw a5, a5, 1 - bgt a4, a5, label688 - j label678 -.p2align 2 -label690: - slliw a6, t4, 12 - mv t5, t0 - mv t6, zero - add t4, a2, a6 + bgt a4, a5, label1036 + j label1026 .p2align 2 -label691: - lw a7, 0(t5) - sh2add s0, t6, t4 - addiw t6, t6, 1 - mulw a6, t3, a7 - amoadd.w.aqrl a7, a6, (s0) - ble a0, t6, label694 - addi t5, t5, 4 - j label691 +label1210: + addiw t2, t2, 1 + bgt a4, t2, label1010 + j label1026 .p2align 2 -label880: +label1205: addiw a5, a5, 1 - bgt a4, a5, label688 - j label678 + bgt a4, a5, label1036 + j label1026 diff --git a/tests/SysY2022/performance/01_mm2.sy.ir b/tests/SysY2022/performance/01_mm2.sy.ir index ff3d45414..17e4897a1 100644 --- a/tests/SysY2022/performance/01_mm2.sy.ir +++ b/tests/SysY2022/performance/01_mm2.sy.ir @@ -112,85 +112,176 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %5 = icmp sgt i32 %4, i32 0; cbr i1 %5(prob = 0.5), ^cond, ^b1; ^cond: - i32 %6 = add i32 %4, i32 -7; - i1 %7 = icmp sgt i32 %4, i32 7; - [1024 * [1024 * i32]]* %8 = ptrcast [1024 * [1024 * 
i32]]* @B to [1024 * [1024 * i32]]*; - cbr i1 %7(prob = 0.5), ^b2, ^b3; + i1 %6 = icmp sgt i32 %4, i32 3; + [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; + cbr i1 %6(prob = 0.5), ^cond1, ^b2; ^b1: - i32 %9 = phi [^b, i32 0] [^scalar.final, i32 %50] [^scalar.final1, i32 %63]; - i32* %10 = ptradd [8 * i8]* %2, i32 0; - atomicadd i32* %10, i32 %9; + i32 %8 = phi [^b, i32 0] [^scalar.final, i32 %27] [^scalar.final3, i32 %128] [^scalar.final4, i32 %138]; + i32* %9 = ptradd [8 * i8]* %2, i32 0; + atomicadd i32* %9, i32 %8; ret; ^b2: - i32 %12 = phi [^cond, i32 %0] [^scalar.final1, i32 %64]; - i32 %13 = phi [^cond, i32 0] [^scalar.final1, i32 %63]; - [1024 * i32]* %14 = getelementptr &([1024 * [1024 * i32]]* %8)[i64 0][i32 %12]; + i32 %11 = phi [^cond, i32 %0] [^scalar.final, i32 %98]; + i32 %12 = phi [^cond, i32 0] [^scalar.final, i32 %27]; + [1024 * i32]* %13 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i32 %11]; ubr ^while.body; + ^cond1: + i32 %14 = add i32 %4, i32 -3; + i1 %15 = icmp sgt i32 %14, i32 15; + i32 %16 = add i32 %4, i32 -18; + cbr i1 %15(prob = 0.5), ^b3, ^b4; ^b3: - i32 %15 = phi [^cond, i32 %0] [^scalar.final, i32 %54]; - i32 %16 = phi [^cond, i32 0] [^scalar.final, i32 %50]; - [1024 * i32]* %17 = getelementptr &([1024 * [1024 * i32]]* %8)[i64 0][i32 %15]; + i32 %17 = phi [^cond1, i32 0] [^scalar.final4, i32 %138]; + i32 %18 = phi [^cond1, i32 %0] [^scalar.final4, i32 %139]; + [1024 * i32]* %19 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i32 %18]; ubr ^while.body1; - ^while.body: - i32 %18 = phi [^b2, i32 %13] [^while.body, i32 %43]; - i32 %19 = phi [^b2, i32 0] [^while.body, i32 %44]; - i32* %20 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %19]; - i32 %21 = load i32* %20; - i32 %22 = add i32 %18, i32 %21; - i32* %23 = getelementptr &(i32* %20)[i64 1]; - i32 %24 = load i32* %23; - i32 %25 = add i32 %22, i32 %24; - i32* %26 = getelementptr &(i32* %20)[i64 2]; - i32 %27 = load i32* %26; - i32 %28 = add i32 %25, i32 %27; - i32* %29 = getelementptr &(i32* %20)[i64 3]; - i32 %30 = load i32* %29; - i32 %31 = add i32 %28, i32 %30; - i32* %32 = getelementptr &(i32* %20)[i64 4]; + ^b4: + i32 %20 = phi [^cond1, i32 %0] [^scalar.final3, i32 %129]; + i32 %21 = phi [^cond1, i32 0] [^scalar.final3, i32 %128]; + [1024 * i32]* %22 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i32 %20]; + ubr ^while.body2; + ^while.body {scalar}: + i32 %23 = phi [^b2, i32 0] [^while.body, i32 %28]; + i32 %24 = phi [^b2, i32 %12] [^while.body, i32 %27]; + i32* %25 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %23]; + i32 %26 = load i32* %25; + i32 %27 = add i32 %24, i32 %26; + i32 %28 = add i32 %23, i32 1; + i1 %29 = icmp sgt i32 %4, i32 %28; + cbr i1 %29(prob = 0.75), ^while.body, ^scalar.final; + ^while.body1: + i32 %30 = phi [^b3, i32 0] [^while.body1, i32 %80]; + i32 %31 = phi [^b3, i32 %17] [^while.body1, i32 %79]; + i32* %32 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %30]; i32 %33 = load i32* %32; i32 %34 = add i32 %31, i32 %33; - i32* %35 = getelementptr &(i32* %20)[i64 5]; + i32* %35 = getelementptr &(i32* %32)[i64 1]; i32 %36 = load i32* %35; i32 %37 = add i32 %34, i32 %36; - i32* %38 = getelementptr &(i32* %20)[i64 6]; + i32* %38 = getelementptr &(i32* %32)[i64 2]; i32 %39 = load i32* %38; i32 %40 = add i32 %37, i32 %39; - i32* %41 = getelementptr &(i32* %20)[i64 7]; + i32* %41 = getelementptr &(i32* %32)[i64 3]; i32 %42 = load i32* %41; i32 %43 = add i32 %40, i32 %42; - i32 %44 = add i32 %19, i32 8; - i1 %45 = icmp sgt i32 %6, 
i32 %44; - cbr i1 %45(prob = 0.888889), ^while.body, ^scalar.header; - ^while.body1 {scalar}: - i32 %46 = phi [^b3, i32 0] [^while.body1, i32 %51]; - i32 %47 = phi [^b3, i32 %16] [^while.body1, i32 %50]; - i32* %48 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %46]; - i32 %49 = load i32* %48; - i32 %50 = add i32 %47, i32 %49; - i32 %51 = add i32 %46, i32 1; - i1 %52 = icmp sgt i32 %4, i32 %51; - cbr i1 %52(prob = 0.875), ^while.body1, ^scalar.final; - ^scalar.header: - i1 %53 = icmp sgt i32 %4, i32 %44; - cbr i1 %53(prob = 0.875), ^while.body2, ^scalar.final1; - ^scalar.final: - i32 %54 = add i32 %15, i32 1; - i1 %55 = icmp sgt i32 %1, i32 %54; - cbr i1 %55(prob = 0.984615), ^b3, ^b1; + i32* %44 = getelementptr &(i32* %32)[i64 4]; + i32 %45 = load i32* %44; + i32 %46 = add i32 %43, i32 %45; + i32* %47 = getelementptr &(i32* %32)[i64 5]; + i32 %48 = load i32* %47; + i32 %49 = add i32 %46, i32 %48; + i32* %50 = getelementptr &(i32* %32)[i64 6]; + i32 %51 = load i32* %50; + i32 %52 = add i32 %49, i32 %51; + i32* %53 = getelementptr &(i32* %32)[i64 7]; + i32 %54 = load i32* %53; + i32 %55 = add i32 %52, i32 %54; + i32* %56 = getelementptr &(i32* %32)[i64 8]; + i32 %57 = load i32* %56; + i32 %58 = add i32 %55, i32 %57; + i32* %59 = getelementptr &(i32* %32)[i64 9]; + i32 %60 = load i32* %59; + i32 %61 = add i32 %58, i32 %60; + i32* %62 = getelementptr &(i32* %32)[i64 10]; + i32 %63 = load i32* %62; + i32 %64 = add i32 %61, i32 %63; + i32* %65 = getelementptr &(i32* %32)[i64 11]; + i32 %66 = load i32* %65; + i32 %67 = add i32 %64, i32 %66; + i32* %68 = getelementptr &(i32* %32)[i64 12]; + i32 %69 = load i32* %68; + i32 %70 = add i32 %67, i32 %69; + i32* %71 = getelementptr &(i32* %32)[i64 13]; + i32 %72 = load i32* %71; + i32 %73 = add i32 %70, i32 %72; + i32* %74 = getelementptr &(i32* %32)[i64 14]; + i32 %75 = load i32* %74; + i32 %76 = add i32 %73, i32 %75; + i32* %77 = getelementptr &(i32* %32)[i64 15]; + i32 %78 = load i32* %77; + i32 %79 = add i32 %76, i32 %78; + i32 %80 = add i32 %30, i32 16; + i1 %81 = icmp sgt i32 %16, i32 %80; + cbr i1 %81(prob = 0.941176), ^while.body1, ^scalar.header; ^while.body2 {scalar}: - i32 %56 = phi [^scalar.header, i32 %44] [^while.body2, i32 %61]; - i32 %57 = phi [^scalar.header, i32 %43] [^while.body2, i32 %60]; - i32* %58 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %56]; - i32 %59 = load i32* %58; - i32 %60 = add i32 %57, i32 %59; - i32 %61 = add i32 %56, i32 1; - i1 %62 = icmp sgt i32 %4, i32 %61; - cbr i1 %62(prob = 0.875), ^while.body2, ^scalar.final1; + i32 %82 = phi [^b4, i32 %21] [^while.body2, i32 %95]; + i32 %83 = phi [^b4, i32 0] [^while.body2, i32 %96]; + i32* %84 = getelementptr &([1024 * i32]* %22)[i64 0][i32 %83]; + i32 %85 = load i32* %84; + i32 %86 = add i32 %82, i32 %85; + i32* %87 = getelementptr &(i32* %84)[i64 1]; + i32 %88 = load i32* %87; + i32 %89 = add i32 %86, i32 %88; + i32* %90 = getelementptr &(i32* %84)[i64 2]; + i32 %91 = load i32* %90; + i32 %92 = add i32 %89, i32 %91; + i32* %93 = getelementptr &(i32* %84)[i64 3]; + i32 %94 = load i32* %93; + i32 %95 = add i32 %92, i32 %94; + i32 %96 = add i32 %83, i32 4; + i1 %97 = icmp sgt i32 %14, i32 %96; + cbr i1 %97(prob = 0.75), ^while.body2, ^scalar.final1; + ^scalar.final: + i32 %98 = add i32 %11, i32 1; + i1 %99 = icmp sgt i32 %1, i32 %98; + cbr i1 %99(prob = 0.984615), ^b2, ^b1; + ^scalar.header: + i1 %100 = icmp sgt i32 %14, i32 %80; + cbr i1 %100(prob = 0.75), ^while.body3, ^scalar.final2; ^scalar.final1: - i32 %63 = phi [^scalar.header, i32 %43] [^while.body2, i32 %60]; - 
i32 %64 = add i32 %12, i32 1; - i1 %65 = icmp sgt i32 %1, i32 %64; - cbr i1 %65(prob = 0.984615), ^b2, ^b1; + i1 %101 = icmp sgt i32 %4, i32 %96; + cbr i1 %101(prob = 0.75), ^while.body4, ^scalar.final3; + ^while.body3 {scalar}: + i32 %102 = phi [^scalar.header, i32 %79] [^while.body3, i32 %115]; + i32 %103 = phi [^scalar.header, i32 %80] [^while.body3, i32 %116]; + i32* %104 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %103]; + i32 %105 = load i32* %104; + i32 %106 = add i32 %102, i32 %105; + i32* %107 = getelementptr &(i32* %104)[i64 1]; + i32 %108 = load i32* %107; + i32 %109 = add i32 %106, i32 %108; + i32* %110 = getelementptr &(i32* %104)[i64 2]; + i32 %111 = load i32* %110; + i32 %112 = add i32 %109, i32 %111; + i32* %113 = getelementptr &(i32* %104)[i64 3]; + i32 %114 = load i32* %113; + i32 %115 = add i32 %112, i32 %114; + i32 %116 = add i32 %103, i32 4; + i1 %117 = icmp sgt i32 %14, i32 %116; + cbr i1 %117(prob = 0.75), ^while.body3, ^scalar.final2; + ^scalar.final2: + i32 %118 = phi [^scalar.header, i32 %79] [^while.body3, i32 %115]; + i32 %119 = phi [^scalar.header, i32 %80] [^while.body3, i32 %116]; + i1 %120 = icmp sgt i32 %4, i32 %119; + cbr i1 %120(prob = 0.75), ^while.body5, ^scalar.final4; + ^while.body4 {scalar}: + i32 %121 = phi [^scalar.final1, i32 %96] [^while.body4, i32 %126]; + i32 %122 = phi [^scalar.final1, i32 %95] [^while.body4, i32 %125]; + i32* %123 = getelementptr &([1024 * i32]* %22)[i64 0][i32 %121]; + i32 %124 = load i32* %123; + i32 %125 = add i32 %122, i32 %124; + i32 %126 = add i32 %121, i32 1; + i1 %127 = icmp sgt i32 %4, i32 %126; + cbr i1 %127(prob = 0.75), ^while.body4, ^scalar.final3; + ^scalar.final3: + i32 %128 = phi [^scalar.final1, i32 %95] [^while.body4, i32 %125]; + i32 %129 = add i32 %20, i32 1; + i1 %130 = icmp sgt i32 %1, i32 %129; + cbr i1 %130(prob = 0.984615), ^b4, ^b1; + ^while.body5 {scalar}: + i32 %131 = phi [^scalar.final2, i32 %119] [^while.body5, i32 %136]; + i32 %132 = phi [^scalar.final2, i32 %118] [^while.body5, i32 %135]; + i32* %133 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %131]; + i32 %134 = load i32* %133; + i32 %135 = add i32 %132, i32 %134; + i32 %136 = add i32 %131, i32 1; + i1 %137 = icmp sgt i32 %4, i32 %136; + cbr i1 %137(prob = 0.75), ^while.body5, ^scalar.final4; + ^scalar.final4: + i32 %138 = phi [^scalar.final2, i32 %118] [^while.body5, i32 %135]; + i32 %139 = add i32 %18, i32 1; + i1 %140 = icmp sgt i32 %1, i32 %139; + cbr i1 %140(prob = 0.984615), ^b3, ^b1; } internal [8 * i8]* @cmmc_parallel_body_payload_0, align 8; internal func @cmmc_parallel_body_1(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -198,70 +289,138 @@ internal func @cmmc_parallel_body_1(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_1 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; - [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @C to [1024 * [1024 * i32]]*; - [1024 * i32]* %8 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; - cbr i1 %5(prob = 0.5), ^b1, ^cond; - ^b1: - i32 %9 = phi [^b, i32 %0] [^b4, i32 %34]; - [1024 * i32]* %10 = getelementptr &([1024 * i32]* %8)[i32 %9]; - ubr ^while.body; + i1 %5 = icmp sgt i32 %4, i32 3; + [1024 * [1024 * i32]]* %6 = ptrcast [1024 * [1024 * i32]]* @C to [1024 * [1024 * i32]]*; + [1024 * i32]* %7 = getelementptr &([1024 * [1024 * i32]]* %6)[i64 0][i64 0]; + cbr i1 %5(prob = 0.5), ^cond, ^cond1; 
^cond: + i32 %8 = add i32 %4, i32 -3; + i1 %9 = icmp sgt i32 %8, i32 15; + i32 %10 = add i32 %4, i32 -18; + cbr i1 %9(prob = 0.5), ^b1, ^b3; + ^cond1: i1 %11 = icmp sgt i32 %4, i32 0; - cbr i1 %11(prob = 0.5), ^b2, ^b3; + cbr i1 %11(prob = 0.5), ^b2, ^b4; + ^b1: + i32 %12 = phi [^cond, i32 %0] [^b7, i32 %71]; + [1024 * i32]* %13 = getelementptr &([1024 * i32]* %7)[i32 %12]; + ubr ^while.body; ^b2: - i32 %12 = phi [^cond, i32 %0] [^b5, i32 %36]; - [1024 * i32]* %13 = getelementptr &([1024 * i32]* %8)[i32 %12]; + i32 %14 = phi [^cond1, i32 %0] [^b5, i32 %49]; + [1024 * i32]* %15 = getelementptr &([1024 * i32]* %7)[i32 %14]; ubr ^while.body1; + ^b3: + i32 %16 = phi [^cond, i32 %0] [^b6, i32 %65]; + [1024 * i32]* %17 = getelementptr &([1024 * i32]* %7)[i32 %16]; + ubr ^while.body2; + ^b4: + ret; ^while.body: - i32 %14 = phi [^b1, i32 0] [^while.body, i32 %23]; - i32* %15 = getelementptr &([1024 * i32]* %10)[i64 0][i32 %14]; - store i32* %15 with i32 0; - i32* %16 = getelementptr &(i32* %15)[i64 1]; - store i32* %16 with i32 0; - i32* %17 = getelementptr &(i32* %15)[i64 2]; - store i32* %17 with i32 0; - i32* %18 = getelementptr &(i32* %15)[i64 3]; - store i32* %18 with i32 0; - i32* %19 = getelementptr &(i32* %15)[i64 4]; + i32 %18 = phi [^b1, i32 0] [^while.body, i32 %35]; + i32* %19 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %18]; store i32* %19 with i32 0; - i32* %20 = getelementptr &(i32* %15)[i64 5]; + i32* %20 = getelementptr &(i32* %19)[i64 1]; store i32* %20 with i32 0; - i32* %21 = getelementptr &(i32* %15)[i64 6]; + i32* %21 = getelementptr &(i32* %19)[i64 2]; store i32* %21 with i32 0; - i32* %22 = getelementptr &(i32* %15)[i64 7]; + i32* %22 = getelementptr &(i32* %19)[i64 3]; store i32* %22 with i32 0; - i32 %23 = add i32 %14, i32 8; - i1 %24 = icmp sgt i32 %6, i32 %23; - cbr i1 %24(prob = 0.888889), ^while.body, ^scalar.header; - ^b3: - ret; - ^scalar.header: - i1 %25 = icmp sgt i32 %4, i32 %23; - cbr i1 %25(prob = 0.875), ^while.body2, ^b4; - ^while.body1 {scalar}: - i32 %26 = phi [^b2, i32 0] [^while.body1, i32 %28]; - i32* %27 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %26]; + i32* %23 = getelementptr &(i32* %19)[i64 4]; + store i32* %23 with i32 0; + i32* %24 = getelementptr &(i32* %19)[i64 5]; + store i32* %24 with i32 0; + i32* %25 = getelementptr &(i32* %19)[i64 6]; + store i32* %25 with i32 0; + i32* %26 = getelementptr &(i32* %19)[i64 7]; + store i32* %26 with i32 0; + i32* %27 = getelementptr &(i32* %19)[i64 8]; store i32* %27 with i32 0; - i32 %28 = add i32 %26, i32 1; - i1 %29 = icmp sgt i32 %4, i32 %28; - cbr i1 %29(prob = 0.875), ^while.body1, ^b5; - ^while.body2 {scalar}: - i32 %30 = phi [^scalar.header, i32 %23] [^while.body2, i32 %32]; - i32* %31 = getelementptr &([1024 * i32]* %10)[i64 0][i32 %30]; + i32* %28 = getelementptr &(i32* %19)[i64 9]; + store i32* %28 with i32 0; + i32* %29 = getelementptr &(i32* %19)[i64 10]; + store i32* %29 with i32 0; + i32* %30 = getelementptr &(i32* %19)[i64 11]; + store i32* %30 with i32 0; + i32* %31 = getelementptr &(i32* %19)[i64 12]; store i32* %31 with i32 0; - i32 %32 = add i32 %30, i32 1; - i1 %33 = icmp sgt i32 %4, i32 %32; - cbr i1 %33(prob = 0.875), ^while.body2, ^b4; - ^b4: - i32 %34 = add i32 %9, i32 1; - i1 %35 = icmp sgt i32 %1, i32 %34; - cbr i1 %35(prob = 0.984615), ^b1, ^b3; + i32* %32 = getelementptr &(i32* %19)[i64 13]; + store i32* %32 with i32 0; + i32* %33 = getelementptr &(i32* %19)[i64 14]; + store i32* %33 with i32 0; + i32* %34 = getelementptr &(i32* %19)[i64 15]; + store i32* %34 with i32 0; 
+ i32 %35 = add i32 %18, i32 16; + i1 %36 = icmp sgt i32 %10, i32 %35; + cbr i1 %36(prob = 0.941176), ^while.body, ^scalar.header; + ^while.body1 {scalar}: + i32 %37 = phi [^b2, i32 0] [^while.body1, i32 %39]; + i32* %38 = getelementptr &([1024 * i32]* %15)[i64 0][i32 %37]; + store i32* %38 with i32 0; + i32 %39 = add i32 %37, i32 1; + i1 %40 = icmp sgt i32 %4, i32 %39; + cbr i1 %40(prob = 0.75), ^while.body1, ^b5; + ^while.body2 {scalar}: + i32 %41 = phi [^b3, i32 0] [^while.body2, i32 %46]; + i32* %42 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %41]; + store i32* %42 with i32 0; + i32* %43 = getelementptr &(i32* %42)[i64 1]; + store i32* %43 with i32 0; + i32* %44 = getelementptr &(i32* %42)[i64 2]; + store i32* %44 with i32 0; + i32* %45 = getelementptr &(i32* %42)[i64 3]; + store i32* %45 with i32 0; + i32 %46 = add i32 %41, i32 4; + i1 %47 = icmp sgt i32 %8, i32 %46; + cbr i1 %47(prob = 0.75), ^while.body2, ^scalar.final; + ^scalar.header: + i1 %48 = icmp sgt i32 %8, i32 %35; + cbr i1 %48(prob = 0.75), ^while.body3, ^scalar.final1; ^b5: - i32 %36 = add i32 %12, i32 1; - i1 %37 = icmp sgt i32 %1, i32 %36; - cbr i1 %37(prob = 0.984615), ^b2, ^b3; + i32 %49 = add i32 %14, i32 1; + i1 %50 = icmp sgt i32 %1, i32 %49; + cbr i1 %50(prob = 0.984615), ^b2, ^b4; + ^scalar.final: + i1 %51 = icmp sgt i32 %4, i32 %46; + cbr i1 %51(prob = 0.75), ^while.body4, ^b6; + ^while.body3 {scalar}: + i32 %52 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i32* %53 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %52]; + store i32* %53 with i32 0; + i32* %54 = getelementptr &(i32* %53)[i64 1]; + store i32* %54 with i32 0; + i32* %55 = getelementptr &(i32* %53)[i64 2]; + store i32* %55 with i32 0; + i32* %56 = getelementptr &(i32* %53)[i64 3]; + store i32* %56 with i32 0; + i32 %57 = add i32 %52, i32 4; + i1 %58 = icmp sgt i32 %8, i32 %57; + cbr i1 %58(prob = 0.75), ^while.body3, ^scalar.final1; + ^scalar.final1: + i32 %59 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i1 %60 = icmp sgt i32 %4, i32 %59; + cbr i1 %60(prob = 0.75), ^while.body5, ^b7; + ^while.body4 {scalar}: + i32 %61 = phi [^scalar.final, i32 %46] [^while.body4, i32 %63]; + i32* %62 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %61]; + store i32* %62 with i32 0; + i32 %63 = add i32 %61, i32 1; + i1 %64 = icmp sgt i32 %4, i32 %63; + cbr i1 %64(prob = 0.75), ^while.body4, ^b6; + ^b6: + i32 %65 = add i32 %16, i32 1; + i1 %66 = icmp sgt i32 %1, i32 %65; + cbr i1 %66(prob = 0.984615), ^b3, ^b4; + ^while.body5 {scalar}: + i32 %67 = phi [^scalar.final1, i32 %59] [^while.body5, i32 %69]; + i32* %68 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %67]; + store i32* %68 with i32 0; + i32 %69 = add i32 %67, i32 1; + i1 %70 = icmp sgt i32 %4, i32 %69; + cbr i1 %70(prob = 0.75), ^while.body5, ^b7; + ^b7: + i32 %71 = add i32 %12, i32 1; + i1 %72 = icmp sgt i32 %1, i32 %71; + cbr i1 %72(prob = 0.984615), ^b1, ^b4; } internal [4 * i8]* @cmmc_parallel_body_payload_1, align 8; internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -269,8 +428,8 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_2 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; + i1 %5 = icmp sgt i32 %4, i32 3; + i32 %6 = add i32 %4, i32 -3; [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @A to [1024 * [1024 * i32]]*; 
[1024 * i32]* %8 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; [1024 * [1024 * i32]]* %9 = ptrcast [1024 * [1024 * i32]]* @C to [1024 * [1024 * i32]]*; @@ -325,7 +484,7 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %36 = icmp sgt i32 %1, i32 %35; cbr i1 %36(prob = 0.984615), ^b2, ^b3; ^while.body2: - i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %78]; + i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %58]; i32* %38 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %37]; i32 %39 = load i32* %38; i32 %40 = mul i32 %22, i32 %39; @@ -346,55 +505,35 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %55 = mul i32 %22, i32 %54; i32* %56 = getelementptr &(i32* %41)[i64 3]; atomicadd i32* %56, i32 %55; - i32* %58 = getelementptr &(i32* %38)[i64 4]; - i32 %59 = load i32* %58; - i32 %60 = mul i32 %22, i32 %59; - i32* %61 = getelementptr &(i32* %41)[i64 4]; - atomicadd i32* %61, i32 %60; - i32* %63 = getelementptr &(i32* %38)[i64 5]; - i32 %64 = load i32* %63; - i32 %65 = mul i32 %22, i32 %64; - i32* %66 = getelementptr &(i32* %41)[i64 5]; - atomicadd i32* %66, i32 %65; - i32* %68 = getelementptr &(i32* %38)[i64 6]; - i32 %69 = load i32* %68; - i32 %70 = mul i32 %22, i32 %69; - i32* %71 = getelementptr &(i32* %41)[i64 6]; - atomicadd i32* %71, i32 %70; - i32* %73 = getelementptr &(i32* %38)[i64 7]; - i32 %74 = load i32* %73; - i32 %75 = mul i32 %22, i32 %74; - i32* %76 = getelementptr &(i32* %41)[i64 7]; - atomicadd i32* %76, i32 %75; - i32 %78 = add i32 %37, i32 8; - i1 %79 = icmp sgt i32 %6, i32 %78; - cbr i1 %79(prob = 0.888889), ^while.body2, ^scalar.header; + i32 %58 = add i32 %37, i32 4; + i1 %59 = icmp sgt i32 %6, i32 %58; + cbr i1 %59(prob = 0.941176), ^while.body2, ^scalar.header; ^prebody1: - [1024 * i32]* %80 = getelementptr &([1024 * i32]* %10)[i32 %27]; + [1024 * i32]* %60 = getelementptr &([1024 * i32]* %10)[i32 %27]; ubr ^while.body3; ^scalar.header: - i1 %81 = icmp sle i32 %4, i32 %78; - cbr i1 %81(prob = 0.125), ^while.header, ^while.body4; + i1 %61 = icmp sle i32 %4, i32 %58; + cbr i1 %61(prob = 0.25), ^while.header, ^while.body4; ^while.body3 {scalar}: - i32 %82 = phi [^prebody1, i32 0] [^while.body3, i32 %88]; - i32* %83 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %82]; - i32 %84 = load i32* %83; - i32 %85 = mul i32 %32, i32 %84; - i32* %86 = getelementptr &([1024 * i32]* %80)[i64 0][i32 %82]; - atomicadd i32* %86, i32 %85; - i32 %88 = add i32 %82, i32 1; - i1 %89 = icmp sgt i32 %4, i32 %88; - cbr i1 %89(prob = 0.875), ^while.body3, ^while.header1; + i32 %62 = phi [^prebody1, i32 0] [^while.body3, i32 %68]; + i32* %63 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %62]; + i32 %64 = load i32* %63; + i32 %65 = mul i32 %32, i32 %64; + i32* %66 = getelementptr &([1024 * i32]* %60)[i64 0][i32 %62]; + atomicadd i32* %66, i32 %65; + i32 %68 = add i32 %62, i32 1; + i1 %69 = icmp sgt i32 %4, i32 %68; + cbr i1 %69(prob = 0.75), ^while.body3, ^while.header1; ^while.body4 {scalar}: - i32 %90 = phi [^scalar.header, i32 %78] [^while.body4, i32 %96]; - i32* %91 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %90]; - i32 %92 = load i32* %91; - i32 %93 = mul i32 %22, i32 %92; - i32* %94 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %90]; - atomicadd i32* %94, i32 %93; - i32 %96 = add i32 %90, i32 1; - i1 %97 = icmp sgt i32 %4, i32 %96; - cbr i1 %97(prob = 0.875), ^while.body4, ^while.header; + i32 %70 = phi [^scalar.header, i32 %58] [^while.body4, i32 %76]; + i32* %71 = getelementptr &([1024 * 
i32]* %14)[i64 0][i32 %70]; + i32 %72 = load i32* %71; + i32 %73 = mul i32 %22, i32 %72; + i32* %74 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %70]; + atomicadd i32* %74, i32 %73; + i32 %76 = add i32 %70, i32 1; + i1 %77 = icmp sgt i32 %4, i32 %76; + cbr i1 %77(prob = 0.75), ^while.body4, ^while.header; } internal [4 * i8]* @cmmc_parallel_body_payload_2, align 8; internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -402,70 +541,138 @@ internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_3 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; - [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; - [1024 * i32]* %8 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; - cbr i1 %5(prob = 0.5), ^b1, ^cond; - ^b1: - i32 %9 = phi [^b, i32 %0] [^b4, i32 %34]; - [1024 * i32]* %10 = getelementptr &([1024 * i32]* %8)[i32 %9]; - ubr ^while.body; + i1 %5 = icmp sgt i32 %4, i32 3; + [1024 * [1024 * i32]]* %6 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; + [1024 * i32]* %7 = getelementptr &([1024 * [1024 * i32]]* %6)[i64 0][i64 0]; + cbr i1 %5(prob = 0.5), ^cond, ^cond1; ^cond: + i32 %8 = add i32 %4, i32 -3; + i1 %9 = icmp sgt i32 %8, i32 15; + i32 %10 = add i32 %4, i32 -18; + cbr i1 %9(prob = 0.5), ^b1, ^b3; + ^cond1: i1 %11 = icmp sgt i32 %4, i32 0; - cbr i1 %11(prob = 0.5), ^b2, ^b3; + cbr i1 %11(prob = 0.5), ^b2, ^b4; + ^b1: + i32 %12 = phi [^cond, i32 %0] [^b7, i32 %71]; + [1024 * i32]* %13 = getelementptr &([1024 * i32]* %7)[i32 %12]; + ubr ^while.body; ^b2: - i32 %12 = phi [^cond, i32 %0] [^b5, i32 %36]; - [1024 * i32]* %13 = getelementptr &([1024 * i32]* %8)[i32 %12]; + i32 %14 = phi [^cond1, i32 %0] [^b5, i32 %49]; + [1024 * i32]* %15 = getelementptr &([1024 * i32]* %7)[i32 %14]; ubr ^while.body1; + ^b3: + i32 %16 = phi [^cond, i32 %0] [^b6, i32 %65]; + [1024 * i32]* %17 = getelementptr &([1024 * i32]* %7)[i32 %16]; + ubr ^while.body2; + ^b4: + ret; ^while.body: - i32 %14 = phi [^b1, i32 0] [^while.body, i32 %23]; - i32* %15 = getelementptr &([1024 * i32]* %10)[i64 0][i32 %14]; - store i32* %15 with i32 0; - i32* %16 = getelementptr &(i32* %15)[i64 1]; - store i32* %16 with i32 0; - i32* %17 = getelementptr &(i32* %15)[i64 2]; - store i32* %17 with i32 0; - i32* %18 = getelementptr &(i32* %15)[i64 3]; - store i32* %18 with i32 0; - i32* %19 = getelementptr &(i32* %15)[i64 4]; + i32 %18 = phi [^b1, i32 0] [^while.body, i32 %35]; + i32* %19 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %18]; store i32* %19 with i32 0; - i32* %20 = getelementptr &(i32* %15)[i64 5]; + i32* %20 = getelementptr &(i32* %19)[i64 1]; store i32* %20 with i32 0; - i32* %21 = getelementptr &(i32* %15)[i64 6]; + i32* %21 = getelementptr &(i32* %19)[i64 2]; store i32* %21 with i32 0; - i32* %22 = getelementptr &(i32* %15)[i64 7]; + i32* %22 = getelementptr &(i32* %19)[i64 3]; store i32* %22 with i32 0; - i32 %23 = add i32 %14, i32 8; - i1 %24 = icmp sgt i32 %6, i32 %23; - cbr i1 %24(prob = 0.888889), ^while.body, ^scalar.header; - ^b3: - ret; - ^scalar.header: - i1 %25 = icmp sgt i32 %4, i32 %23; - cbr i1 %25(prob = 0.875), ^while.body2, ^b4; - ^while.body1 {scalar}: - i32 %26 = phi [^b2, i32 0] [^while.body1, i32 %28]; - i32* %27 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %26]; + i32* %23 = getelementptr &(i32* %19)[i64 4]; 
+ store i32* %23 with i32 0; + i32* %24 = getelementptr &(i32* %19)[i64 5]; + store i32* %24 with i32 0; + i32* %25 = getelementptr &(i32* %19)[i64 6]; + store i32* %25 with i32 0; + i32* %26 = getelementptr &(i32* %19)[i64 7]; + store i32* %26 with i32 0; + i32* %27 = getelementptr &(i32* %19)[i64 8]; store i32* %27 with i32 0; - i32 %28 = add i32 %26, i32 1; - i1 %29 = icmp sgt i32 %4, i32 %28; - cbr i1 %29(prob = 0.875), ^while.body1, ^b5; - ^while.body2 {scalar}: - i32 %30 = phi [^scalar.header, i32 %23] [^while.body2, i32 %32]; - i32* %31 = getelementptr &([1024 * i32]* %10)[i64 0][i32 %30]; + i32* %28 = getelementptr &(i32* %19)[i64 9]; + store i32* %28 with i32 0; + i32* %29 = getelementptr &(i32* %19)[i64 10]; + store i32* %29 with i32 0; + i32* %30 = getelementptr &(i32* %19)[i64 11]; + store i32* %30 with i32 0; + i32* %31 = getelementptr &(i32* %19)[i64 12]; store i32* %31 with i32 0; - i32 %32 = add i32 %30, i32 1; - i1 %33 = icmp sgt i32 %4, i32 %32; - cbr i1 %33(prob = 0.875), ^while.body2, ^b4; - ^b4: - i32 %34 = add i32 %9, i32 1; - i1 %35 = icmp sgt i32 %1, i32 %34; - cbr i1 %35(prob = 0.984615), ^b1, ^b3; + i32* %32 = getelementptr &(i32* %19)[i64 13]; + store i32* %32 with i32 0; + i32* %33 = getelementptr &(i32* %19)[i64 14]; + store i32* %33 with i32 0; + i32* %34 = getelementptr &(i32* %19)[i64 15]; + store i32* %34 with i32 0; + i32 %35 = add i32 %18, i32 16; + i1 %36 = icmp sgt i32 %10, i32 %35; + cbr i1 %36(prob = 0.941176), ^while.body, ^scalar.header; + ^while.body1 {scalar}: + i32 %37 = phi [^b2, i32 0] [^while.body1, i32 %39]; + i32* %38 = getelementptr &([1024 * i32]* %15)[i64 0][i32 %37]; + store i32* %38 with i32 0; + i32 %39 = add i32 %37, i32 1; + i1 %40 = icmp sgt i32 %4, i32 %39; + cbr i1 %40(prob = 0.75), ^while.body1, ^b5; + ^while.body2 {scalar}: + i32 %41 = phi [^b3, i32 0] [^while.body2, i32 %46]; + i32* %42 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %41]; + store i32* %42 with i32 0; + i32* %43 = getelementptr &(i32* %42)[i64 1]; + store i32* %43 with i32 0; + i32* %44 = getelementptr &(i32* %42)[i64 2]; + store i32* %44 with i32 0; + i32* %45 = getelementptr &(i32* %42)[i64 3]; + store i32* %45 with i32 0; + i32 %46 = add i32 %41, i32 4; + i1 %47 = icmp sgt i32 %8, i32 %46; + cbr i1 %47(prob = 0.75), ^while.body2, ^scalar.final; + ^scalar.header: + i1 %48 = icmp sgt i32 %8, i32 %35; + cbr i1 %48(prob = 0.75), ^while.body3, ^scalar.final1; ^b5: - i32 %36 = add i32 %12, i32 1; - i1 %37 = icmp sgt i32 %1, i32 %36; - cbr i1 %37(prob = 0.984615), ^b2, ^b3; + i32 %49 = add i32 %14, i32 1; + i1 %50 = icmp sgt i32 %1, i32 %49; + cbr i1 %50(prob = 0.984615), ^b2, ^b4; + ^scalar.final: + i1 %51 = icmp sgt i32 %4, i32 %46; + cbr i1 %51(prob = 0.75), ^while.body4, ^b6; + ^while.body3 {scalar}: + i32 %52 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i32* %53 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %52]; + store i32* %53 with i32 0; + i32* %54 = getelementptr &(i32* %53)[i64 1]; + store i32* %54 with i32 0; + i32* %55 = getelementptr &(i32* %53)[i64 2]; + store i32* %55 with i32 0; + i32* %56 = getelementptr &(i32* %53)[i64 3]; + store i32* %56 with i32 0; + i32 %57 = add i32 %52, i32 4; + i1 %58 = icmp sgt i32 %8, i32 %57; + cbr i1 %58(prob = 0.75), ^while.body3, ^scalar.final1; + ^scalar.final1: + i32 %59 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i1 %60 = icmp sgt i32 %4, i32 %59; + cbr i1 %60(prob = 0.75), ^while.body5, ^b7; + ^while.body4 {scalar}: + i32 %61 = phi [^scalar.final, i32 %46] [^while.body4, i32 %63]; + 
i32* %62 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %61]; + store i32* %62 with i32 0; + i32 %63 = add i32 %61, i32 1; + i1 %64 = icmp sgt i32 %4, i32 %63; + cbr i1 %64(prob = 0.75), ^while.body4, ^b6; + ^b6: + i32 %65 = add i32 %16, i32 1; + i1 %66 = icmp sgt i32 %1, i32 %65; + cbr i1 %66(prob = 0.984615), ^b3, ^b4; + ^while.body5 {scalar}: + i32 %67 = phi [^scalar.final1, i32 %59] [^while.body5, i32 %69]; + i32* %68 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %67]; + store i32* %68 with i32 0; + i32 %69 = add i32 %67, i32 1; + i1 %70 = icmp sgt i32 %4, i32 %69; + cbr i1 %70(prob = 0.75), ^while.body5, ^b7; + ^b7: + i32 %71 = add i32 %12, i32 1; + i1 %72 = icmp sgt i32 %1, i32 %71; + cbr i1 %72(prob = 0.984615), ^b1, ^b4; } internal [4 * i8]* @cmmc_parallel_body_payload_3, align 8; internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -473,8 +680,8 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_4 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; + i1 %5 = icmp sgt i32 %4, i32 3; + i32 %6 = add i32 %4, i32 -3; [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @A to [1024 * [1024 * i32]]*; [1024 * i32]* %8 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; [1024 * [1024 * i32]]* %9 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; @@ -529,7 +736,7 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %36 = icmp sgt i32 %1, i32 %35; cbr i1 %36(prob = 0.984615), ^b2, ^b3; ^while.body2: - i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %78]; + i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %58]; i32* %38 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %37]; i32 %39 = load i32* %38; i32 %40 = mul i32 %22, i32 %39; @@ -550,54 +757,34 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %55 = mul i32 %22, i32 %54; i32* %56 = getelementptr &(i32* %41)[i64 3]; atomicadd i32* %56, i32 %55; - i32* %58 = getelementptr &(i32* %38)[i64 4]; - i32 %59 = load i32* %58; - i32 %60 = mul i32 %22, i32 %59; - i32* %61 = getelementptr &(i32* %41)[i64 4]; - atomicadd i32* %61, i32 %60; - i32* %63 = getelementptr &(i32* %38)[i64 5]; - i32 %64 = load i32* %63; - i32 %65 = mul i32 %22, i32 %64; - i32* %66 = getelementptr &(i32* %41)[i64 5]; - atomicadd i32* %66, i32 %65; - i32* %68 = getelementptr &(i32* %38)[i64 6]; - i32 %69 = load i32* %68; - i32 %70 = mul i32 %22, i32 %69; - i32* %71 = getelementptr &(i32* %41)[i64 6]; - atomicadd i32* %71, i32 %70; - i32* %73 = getelementptr &(i32* %38)[i64 7]; - i32 %74 = load i32* %73; - i32 %75 = mul i32 %22, i32 %74; - i32* %76 = getelementptr &(i32* %41)[i64 7]; - atomicadd i32* %76, i32 %75; - i32 %78 = add i32 %37, i32 8; - i1 %79 = icmp sgt i32 %6, i32 %78; - cbr i1 %79(prob = 0.888889), ^while.body2, ^scalar.header; + i32 %58 = add i32 %37, i32 4; + i1 %59 = icmp sgt i32 %6, i32 %58; + cbr i1 %59(prob = 0.941176), ^while.body2, ^scalar.header; ^prebody1: - [1024 * i32]* %80 = getelementptr &([1024 * i32]* %10)[i32 %27]; + [1024 * i32]* %60 = getelementptr &([1024 * i32]* %10)[i32 %27]; ubr ^while.body3; ^scalar.header: - i1 %81 = icmp sle i32 %4, i32 %78; - cbr i1 %81(prob = 0.125), ^while.header, ^while.body4; + i1 %61 = icmp sle i32 %4, i32 %58; + cbr i1 %61(prob = 0.25), ^while.header, ^while.body4; ^while.body3 
{scalar}: - i32 %82 = phi [^prebody1, i32 0] [^while.body3, i32 %88]; - i32* %83 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %82]; - i32 %84 = load i32* %83; - i32 %85 = mul i32 %32, i32 %84; - i32* %86 = getelementptr &([1024 * i32]* %80)[i64 0][i32 %82]; - atomicadd i32* %86, i32 %85; - i32 %88 = add i32 %82, i32 1; - i1 %89 = icmp sgt i32 %4, i32 %88; - cbr i1 %89(prob = 0.875), ^while.body3, ^while.header1; + i32 %62 = phi [^prebody1, i32 0] [^while.body3, i32 %68]; + i32* %63 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %62]; + i32 %64 = load i32* %63; + i32 %65 = mul i32 %32, i32 %64; + i32* %66 = getelementptr &([1024 * i32]* %60)[i64 0][i32 %62]; + atomicadd i32* %66, i32 %65; + i32 %68 = add i32 %62, i32 1; + i1 %69 = icmp sgt i32 %4, i32 %68; + cbr i1 %69(prob = 0.75), ^while.body3, ^while.header1; ^while.body4 {scalar}: - i32 %90 = phi [^scalar.header, i32 %78] [^while.body4, i32 %96]; - i32* %91 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %90]; - i32 %92 = load i32* %91; - i32 %93 = mul i32 %22, i32 %92; - i32* %94 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %90]; - atomicadd i32* %94, i32 %93; - i32 %96 = add i32 %90, i32 1; - i1 %97 = icmp sgt i32 %4, i32 %96; - cbr i1 %97(prob = 0.875), ^while.body4, ^while.header; + i32 %70 = phi [^scalar.header, i32 %58] [^while.body4, i32 %76]; + i32* %71 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %70]; + i32 %72 = load i32* %71; + i32 %73 = mul i32 %22, i32 %72; + i32* %74 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %70]; + atomicadd i32* %74, i32 %73; + i32 %76 = add i32 %70, i32 1; + i1 %77 = icmp sgt i32 %4, i32 %76; + cbr i1 %77(prob = 0.75), ^while.body4, ^while.header; } internal [4 * i8]* @cmmc_parallel_body_payload_4, align 8; diff --git a/tests/SysY2022/performance/01_mm3.arm.s b/tests/SysY2022/performance/01_mm3.arm.s index e424c6434..344f0aa0b 100644 --- a/tests/SysY2022/performance/01_mm3.arm.s +++ b/tests/SysY2022/performance/01_mm3.arm.s @@ -1,22 +1,22 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 A: .zero 4194304 -.align 8 +.p2align 3 B: .zero 4194304 -.align 8 +.p2align 3 C: .zero 4194304 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 .text diff --git a/tests/SysY2022/performance/01_mm3.riscv.s b/tests/SysY2022/performance/01_mm3.riscv.s index bb4605631..563a93453 100644 --- a/tests/SysY2022/performance/01_mm3.riscv.s +++ b/tests/SysY2022/performance/01_mm3.riscv.s @@ -1,28 +1,28 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 A: .zero 4194304 -.align 8 +.p2align 3 B: .zero 4194304 -.align 8 +.p2align 3 C: .zero 4194304 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_4: .zero 4 .text @@ -43,112 +43,112 @@ main: sd s9, 80(sp) sd s10, 88(sp) jal getint -pcrel1067: +pcrel1390: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel1068: +pcrel1391: auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - lui s6, 1 li s3, 5 -pcrel1069: + lui s6, 1 +pcrel1392: auipc a1, %pcrel_hi(cmmc_parallel_body_1) mv s0, a0 - addi s1, a1, %pcrel_lo(pcrel1069) -pcrel1070: + addi s1, a1, %pcrel_lo(pcrel1392) +pcrel1393: auipc a0, %pcrel_hi(cmmc_parallel_body_3) - addi s2, a0, 
%pcrel_lo(pcrel1070) - ble s0, zero, label907 -pcrel1071: + addi s2, a0, %pcrel_lo(pcrel1393) + ble s0, zero, label1250 +pcrel1394: auipc a0, %pcrel_hi(A) mv s8, zero - addi s7, a0, %pcrel_lo(pcrel1071) + addi s7, a0, %pcrel_lo(pcrel1394) mv s9, s7 mv s10, zero - j label923 + j label1234 .p2align 2 -label989: +label1273: addiw s8, s8, 1 - ble s0, s8, label1059 + ble s0, s8, label1381 add s7, s7, s6 mv s10, zero mv s9, s7 .p2align 2 -label923: +label1234: jal getint addiw s10, s10, 1 sw a0, 0(s9) - ble s0, s10, label989 + ble s0, s10, label1273 addi s9, s9, 4 - j label923 -label907: + j label1234 +label1250: li a0, 65 jal _sysy_starttime mv s6, zero - j label908 -label910: + j label1251 +label1253: addiw s6, s6, 1 - bge s6, s3, label911 + bge s6, s3, label1254 .p2align 2 -label908: - ble s0, zero, label910 -pcrel1072: +label1251: + ble s0, zero, label1253 +pcrel1395: auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - sw s0, %pcrel_lo(pcrel1072)(s4) + sw s0, %pcrel_lo(pcrel1395)(s4) mv a0, zero mv a1, s0 mv a2, s1 jal cmmcParallelFor - ble s0, zero, label917 -pcrel1073: + ble s0, zero, label1260 +pcrel1396: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel1074: +pcrel1397: auipc a3, %pcrel_hi(cmmc_parallel_body_2) - sw s0, %pcrel_lo(pcrel1073)(a0) - addi a2, a3, %pcrel_lo(pcrel1074) + sw s0, %pcrel_lo(pcrel1396)(a0) + addi a2, a3, %pcrel_lo(pcrel1397) mv a1, s0 mv a0, zero jal cmmcParallelFor .p2align 2 -label917: +label1260: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) - sw s0, %pcrel_lo(label917)(s5) + sw s0, %pcrel_lo(label1260)(s5) mv a0, zero mv a1, s0 mv a2, s2 jal cmmcParallelFor - ble s0, zero, label910 -pcrel1075: + ble s0, zero, label1253 +pcrel1398: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel1076: +pcrel1399: auipc a3, %pcrel_hi(cmmc_parallel_body_4) - sw s0, %pcrel_lo(pcrel1075)(a0) - addi a2, a3, %pcrel_lo(pcrel1076) + sw s0, %pcrel_lo(pcrel1398)(a0) + addi a2, a3, %pcrel_lo(pcrel1399) mv a1, s0 mv a0, zero jal cmmcParallelFor addiw s6, s6, 1 - blt s6, s3, label908 -label911: - ble s0, zero, label953 -pcrel1077: + blt s6, s3, label1251 +label1254: + ble s0, zero, label1306 +pcrel1400: auipc s1, %pcrel_hi(cmmc_parallel_body_payload_0) slli a0, s0, 32 -pcrel1078: +pcrel1401: auipc a3, %pcrel_hi(cmmc_parallel_body_0) - sd a0, %pcrel_lo(pcrel1077)(s1) - addi a2, a3, %pcrel_lo(pcrel1078) + sd a0, %pcrel_lo(pcrel1400)(s1) + addi a2, a3, %pcrel_lo(pcrel1401) mv a1, s0 mv a0, zero jal cmmcParallelFor - lw s0, %pcrel_lo(pcrel1077)(s1) -label912: + lw s0, %pcrel_lo(pcrel1400)(s1) +label1255: li a0, 84 jal _sysy_stoptime mv a0, s0 jal putint li a0, 10 jal putch - ld ra, 0(sp) mv a0, zero + ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) ld s1, 24(sp) @@ -162,713 +162,941 @@ label912: ld s10, 88(sp) addi sp, sp, 96 ret -label1059: +label1381: auipc a0, %pcrel_hi(B) mv s8, zero mv s10, zero - addi s7, a0, %pcrel_lo(label1059) + addi s7, a0, %pcrel_lo(label1381) mv s9, s7 - j label933 + j label1244 .p2align 2 -label938: +label1249: addi s9, s9, 4 .p2align 2 -label933: +label1244: jal getint addiw s10, s10, 1 sw a0, 0(s9) - bgt s0, s10, label938 + bgt s0, s10, label1249 addiw s8, s8, 1 - ble s0, s8, label907 + ble s0, s8, label1250 add s7, s7, s6 mv s10, zero mv s9, s7 - j label933 -label953: + j label1244 +label1306: mv s0, zero - j label912 + j label1255 .p2align 2 cmmc_parallel_body_0: - mv t0, a0 -pcrel146: + addi sp, sp, -8 + mv t1, a0 +pcrel289: auipc a4, %pcrel_hi(cmmc_parallel_body_payload_0) lui a2, 1 - addi a3, a4, %pcrel_lo(pcrel146) + sd s0, 0(sp) + addi a3, a4, 
%pcrel_lo(pcrel289) lw a0, 4(a3) bgt a0, zero, label2 - mv t2, zero -label35: - amoadd.w.aqrl a0, t2, (a3) + mv t1, zero +label65: + amoadd.w.aqrl a0, t1, (a3) + ld s0, 0(sp) + addi sp, sp, 8 ret label2: - addiw a5, a0, -7 -pcrel147: - auipc t2, %pcrel_hi(B) - li t1, 7 - addi a4, t2, %pcrel_lo(pcrel147) - bgt a0, t1, label3 - slli t1, t0, 12 - mv a5, t0 + auipc t0, %pcrel_hi(B) + li a4, 3 + addi a5, t0, %pcrel_lo(label2) + ble a0, a4, label80 + addiw a4, a0, -3 + addiw t0, a0, -18 + li t2, 15 + bgt a4, t2, label24 + slli t2, t1, 12 + mv t0, t1 + mv t3, zero + add a5, a5, t2 + mv t1, a5 mv t2, zero - add a4, a4, t1 + j label9 +.p2align 2 +label124: + addiw t0, t0, 1 + ble a1, t0, label271 +.p2align 2 +label17: + add a5, a5, a2 + lw t6, 0(a5) + mv t1, a5 + lw a7, 4(a5) + addw t5, t2, t6 + lw a6, 8(a5) + li t2, 4 + addw t4, t5, a7 + lw t5, 12(a5) + addw t6, t4, a6 + addw t3, t6, t5 + ble a4, t2, label104 +.p2align 2 +label13: + addi t1, t1, 16 +.p2align 2 +label9: + lw t6, 0(t1) + addiw t2, t2, 4 + lw a7, 4(t1) + addw t5, t3, t6 + lw a6, 8(t1) + addw t4, t5, a7 + lw t5, 12(t1) + addw t6, t4, a6 + addw t3, t6, t5 + bgt a4, t2, label13 +.p2align 2 +label104: + ble a0, t2, label263 + sh2add t1, t2, a5 + mv t4, t2 + mv t2, t3 +.p2align 2 +label19: + lw t5, 0(t1) + addiw t4, t4, 1 + addw t2, t2, t5 + ble a0, t4, label124 + addi t1, t1, 4 + j label19 +label24: + slli t3, t1, 12 + mv t4, zero + add a5, a5, t3 + mv t2, a5 + mv t3, zero + j label29 +.p2align 2 +label203: + addiw t1, t1, 1 + ble a1, t1, label193 +.p2align 2 +label40: + add a5, a5, a2 + mv t4, t3 + mv t2, a5 + mv t3, zero +.p2align 2 +label29: + lw a6, 0(t2) + addiw t3, t3, 16 + lw a7, 4(t2) + addw t6, t4, a6 + lw s0, 8(t2) + addw t5, t6, a7 + lw a6, 12(t2) + addw t4, t5, s0 + lw a7, 16(t2) + addw t6, t4, a6 + lw t4, 20(t2) + addw t5, t6, a7 + lw a7, 24(t2) + addw a6, t5, t4 + lw t5, 28(t2) + addw t4, a6, a7 + lw a6, 32(t2) + addw t6, t4, t5 + lw a7, 36(t2) + addw t5, t6, a6 + lw t6, 40(t2) + addw t4, t5, a7 + lw a7, 44(t2) + addw a6, t4, t6 + lw t5, 48(t2) + addw t6, a6, a7 + lw a7, 52(t2) + addw t4, t6, t5 + lw a6, 56(t2) + addw t5, t4, a7 + lw a7, 60(t2) + addw t6, t5, a6 + addw t4, t6, a7 + ble t0, t3, label179 + addi t2, t2, 64 + j label29 +.p2align 2 +label179: + ble a4, t3, label265 + sh2add t2, t3, a5 +.p2align 2 +label48: + lw t6, 0(t2) + addiw t3, t3, 4 + lw a7, 4(t2) + addw t5, t4, t6 + lw t4, 8(t2) + addw a6, t5, a7 + lw t5, 12(t2) + addw t6, a6, t4 + addw t4, t6, t5 + ble a4, t3, label222 + addi t2, t2, 16 + j label48 +.p2align 2 +label222: + mv t5, t4 + ble a0, t3, label272 +.p2align 2 +label41: + sh2add t2, t3, a5 + mv t4, t3 + mv t3, t5 +.p2align 2 +label42: + lw t6, 0(t2) + addiw t4, t4, 1 + addw t3, t3, t6 + ble a0, t4, label203 + addi t2, t2, 4 + j label42 +label272: + mv t3, t5 + addiw t1, t1, 1 + bgt a1, t1, label40 +label193: + mv t1, t3 + j label65 +label271: + mv t1, t2 + j label65 +label263: + mv t2, t3 + addiw t0, t0, 1 + bgt a1, t0, label17 + j label271 +label265: + mv t5, t4 + bgt a0, t3, label41 + j label272 +label80: + slli t0, t1, 12 + mv t2, zero + add a4, a5, t0 mv t0, a4 + mv a5, t1 mv t1, zero - j label28 + j label58 .p2align 2 -label32: +label62: addi t0, t0, 4 .p2align 2 -label28: +label58: lw t3, 0(t0) addiw t2, t2, 1 addw t1, t1, t3 - bgt a0, t2, label32 + bgt a0, t2, label62 addiw a5, a5, 1 - ble a1, a5, label137 + ble a1, a5, label65 .p2align 2 -label34: +label64: add a4, a4, a2 li t2, 1 lw t3, 0(a4) mv t0, a4 addw t1, t1, t3 - bgt a0, t2, label32 + bgt a0, t2, label62 addiw a5, a5, 1 - bgt a1, 
a5, label34 -label137: - mv t2, t1 - j label35 -label3: - slli t1, t0, 12 - mv t3, zero - add a4, a4, t1 - mv t1, a4 - mv t2, zero - j label8 -.p2align 2 -label95: - addiw t0, t0, 1 - ble a1, t0, label35 + bgt a1, a5, label64 + j label65 .p2align 2 -label22: - add a4, a4, a2 - mv t3, t2 - mv t1, a4 - mv t2, zero +cmmc_parallel_body_1: + mv t0, a0 +pcrel529: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) +pcrel530: + auipc a5, %pcrel_hi(C) + li a3, 3 + lw a0, %pcrel_lo(pcrel529)(a2) + addi a4, a5, %pcrel_lo(pcrel530) + lui a2, 1 + bgt a0, a3, label291 + bgt a0, zero, label333 +label331: + ret +label333: + slliw a5, t0, 12 + add a3, a4, a5 + mv a4, t0 + mv a5, a3 + mv t0, zero + j label337 .p2align 2 -label8: - lw t5, 0(t1) - addiw t2, t2, 8 - lw a6, 4(t1) - addw t4, t3, t5 - lw t3, 8(t1) - addw t6, t4, a6 - lw a6, 12(t1) - addw t5, t6, t3 - lw t6, 16(t1) - addw t4, t5, a6 - lw a6, 20(t1) - addw t3, t4, t6 - lw t6, 24(t1) - addw t5, t3, a6 - lw a6, 28(t1) - addw t4, t5, t6 - addw t3, t4, a6 - ble a5, t2, label81 - addi t1, t1, 32 - j label8 -.p2align 2 -label81: - ble a0, t2, label134 - sh2add t1, t2, a4 - mv t4, t2 - mv t2, t3 +label340: + addi a5, a5, 4 .p2align 2 -label15: - lw t5, 0(t1) - addiw t4, t4, 1 - addw t2, t2, t5 - ble a0, t4, label95 - addi t1, t1, 4 - j label15 -label134: - mv t2, t3 +label337: addiw t0, t0, 1 - bgt a1, t0, label22 - j label35 + sw zero, 0(a5) + bgt a0, t0, label340 + addiw a4, a4, 1 + ble a1, a4, label331 .p2align 2 -cmmc_parallel_body_1: - mv a5, a0 -pcrel273: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) -pcrel274: - auipc t1, %pcrel_hi(C) - li t0, 7 - lw a0, %pcrel_lo(pcrel273)(a2) - addi a4, t1, %pcrel_lo(pcrel274) - lui a2, 1 - addi a3, a0, -7 - bgt a0, t0, label149 - bgt a0, zero, label166 - j label176 -label149: - slliw t0, a5, 12 - add a4, a4, t0 - mv t0, a4 +label342: + add a3, a3, a2 + li t0, 1 + sw zero, 0(a3) + mv a5, a3 + bgt a0, t0, label340 + addiw a4, a4, 1 + bgt a1, a4, label342 + j label331 +label291: + addiw a3, a0, -3 + addiw a5, a0, -18 + li t1, 15 + bgt a3, t1, label292 + slliw t1, t0, 12 + mv a5, t0 + add a4, a4, t1 mv t1, zero - j label153 -.p2align 2 -label162: - addi t0, t0, 4 + mv t0, a4 + j label319 .p2align 2 -label159: - addiw t2, t2, 1 - sw zero, 0(t0) - bgt a0, t2, label162 +label435: addiw a5, a5, 1 - ble a1, a5, label176 + ble a1, a5, label331 .p2align 2 -label164: +label329: add a4, a4, a2 - li t1, 8 + li t1, 4 sd zero, 0(a4) mv t0, a4 sd zero, 8(a4) - sd zero, 16(a4) - sd zero, 24(a4) - ble a3, t1, label265 + ble a3, t1, label515 .p2align 2 -label156: - addi t0, t0, 32 +label330: + addi t0, t0, 16 .p2align 2 -label153: - addiw t1, t1, 8 +label319: + addiw t1, t1, 4 sd zero, 0(t0) sd zero, 8(t0) - sd zero, 16(t0) - sd zero, 24(t0) - bgt a3, t1, label156 - ble a0, t1, label261 + bgt a3, t1, label330 + ble a0, t1, label428 .p2align 2 -label158: +label323: sh2add t0, t1, a4 mv t2, t1 - j label159 -label265: - bgt a0, t1, label158 .p2align 2 -label261: - addiw a5, a5, 1 - bgt a1, a5, label164 -label176: - ret -label166: - slliw t0, a5, 12 - add a3, a4, t0 - mv a4, a5 - mv a5, a3 - mv t0, zero - j label170 +label324: + addiw t2, t2, 1 + sw zero, 0(t0) + ble a0, t2, label435 + addi t0, t0, 4 + j label324 +label515: + bgt a0, t1, label323 .p2align 2 -label175: - addi a5, a5, 4 +label428: + addiw a5, a5, 1 + bgt a1, a5, label329 + j label331 +label292: + slliw t1, t0, 12 + add a4, a4, t1 + mv t1, a4 + mv t2, zero + j label296 .p2align 2 -label170: +label412: addiw t0, t0, 1 - sw zero, 0(a5) - bgt a0, t0, label175 - addiw 
a4, a4, 1 - ble a1, a4, label176 + ble a1, t0, label331 .p2align 2 -label174: - add a3, a3, a2 - li t0, 1 - sw zero, 0(a3) - mv a5, a3 - bgt a0, t0, label175 - addiw a4, a4, 1 - bgt a1, a4, label174 - j label176 +label308: + add a4, a4, a2 + li t2, 16 + sd zero, 0(a4) + mv t1, a4 + sd zero, 8(a4) + sd zero, 16(a4) + sd zero, 24(a4) + sd zero, 32(a4) + sd zero, 40(a4) + sd zero, 48(a4) + sd zero, 56(a4) + ble a5, t2, label299 +.p2align 2 +label314: + addi t1, t1, 64 +.p2align 2 +label296: + addiw t2, t2, 16 + sd zero, 0(t1) + sd zero, 8(t1) + sd zero, 16(t1) + sd zero, 24(t1) + sd zero, 32(t1) + sd zero, 40(t1) + sd zero, 48(t1) + sd zero, 56(t1) + bgt a5, t2, label314 +.p2align 2 +label299: + ble a3, t2, label384 + sh2add t1, t2, a4 + mv t3, t2 + j label301 +.p2align 2 +label304: + addi t1, t1, 16 +.p2align 2 +label301: + addiw t3, t3, 4 + sd zero, 0(t1) + sd zero, 8(t1) + bgt a3, t3, label304 + ble a0, t3, label508 +.p2align 2 +label309: + sh2add t1, t3, a4 + mv t2, t3 +.p2align 2 +label310: + addiw t2, t2, 1 + sw zero, 0(t1) + ble a0, t2, label412 + addi t1, t1, 4 + j label310 +.p2align 2 +label384: + mv t3, t2 + bgt a0, t2, label309 + addiw t0, t0, 1 + bgt a1, t0, label308 + j label331 +label508: + addiw t0, t0, 1 + bgt a1, t0, label308 + j label331 .p2align 2 cmmc_parallel_body_2: addi sp, sp, -48 mv t2, a0 mv a4, a1 -pcrel522: +pcrel753: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel523: - auipc a5, %pcrel_hi(A) -pcrel524: +pcrel754: + auipc t1, %pcrel_hi(C) +pcrel755: auipc t3, %pcrel_hi(B) - li t0, 7 -pcrel525: - auipc a1, %pcrel_hi(C) - addi a3, a5, %pcrel_lo(pcrel523) sd s0, 0(sp) - addi a5, t3, %pcrel_lo(pcrel524) +pcrel756: + auipc a1, %pcrel_hi(A) + addi a5, t3, %pcrel_lo(pcrel755) + addi a3, a1, %pcrel_lo(pcrel756) sd s5, 8(sp) + lui a1, 1 sd s1, 16(sp) sd s3, 24(sp) sd s2, 32(sp) sd s4, 40(sp) - lw a0, %pcrel_lo(pcrel522)(a2) - addi a2, a1, %pcrel_lo(pcrel525) - addi t1, a0, -7 - lui a1, 1 - bgt a0, t0, label276 - bgt a0, zero, label302 -label300: - ld s0, 0(sp) - ld s5, 8(sp) - ld s1, 16(sp) - ld s3, 24(sp) - ld s2, 32(sp) - ld s4, 40(sp) - addi sp, sp, 48 - ret -label276: + lw a0, %pcrel_lo(pcrel753)(a2) + addi a2, t1, %pcrel_lo(pcrel754) + addi t0, a0, -3 + li t1, 3 + bgt a0, t1, label532 + ble a0, zero, label556 + slliw t1, t2, 12 + add t0, a5, t1 + mv a5, t2 + mv t1, a3 + mv t4, zero + bgt a0, zero, label567 + j label565 +.p2align 2 +label572: + add t1, t1, a1 + mv t4, t2 + ble a0, t2, label735 +.p2align 2 +label567: + sh2add t5, a5, t1 + addiw t2, t4, 1 + lw t3, 0(t5) + beq t3, zero, label574 + slliw a6, t4, 12 + mv t5, t0 + mv t6, zero + add t4, a2, a6 +.p2align 2 +label569: + lw s0, 0(t5) + sh2add a7, t6, t4 + addiw t6, t6, 1 + mulw a6, t3, s0 + amoadd.w.aqrl s1, a6, (a7) + ble a0, t6, label572 + addi t5, t5, 4 + j label569 +.p2align 2 +label574: + add t1, t1, a1 + mv t4, t2 + bgt a0, t2, label567 + addiw a5, a5, 1 + ble a4, a5, label556 +.p2align 2 +label566: + add t0, t0, a1 + mv t1, a3 + mv t4, zero + bgt a0, zero, label567 +label565: + addiw a5, a5, 1 + bgt a4, a5, label566 + j label556 +label532: slliw t3, t2, 12 - add t0, a5, t3 + add t1, a5, t3 mv t3, a3 mv t5, zero - bgt a0, zero, label283 - j label339 + bgt a0, zero, label541 + j label539 +.p2align 2 +label553: + addi t6, t6, 4 .p2align 2 -label284: +label549: + lw s1, 0(t6) + sh2add s0, a6, t5 + addiw a6, a6, 1 + mulw a7, a5, s1 + amoadd.w.aqrl s2, a7, (s0) + bgt a0, a6, label553 add t3, t3, a1 mv t5, t4 - ble a0, t4, label498 + ble a0, t4, label733 .p2align 2 -label283: +label541: sh2add t6, t2, 
t3 addiw t4, t5, 1 lw a5, 0(t6) - beq a5, zero, label284 + bne a5, zero, label609 + add t3, t3, a1 + mv t5, t4 + bgt a0, t4, label541 + addiw t2, t2, 1 + ble a4, t2, label556 +.p2align 2 +label540: + add t1, t1, a1 + mv t3, a3 + mv t5, zero + bgt a0, zero, label541 +label539: + addiw t2, t2, 1 + bgt a4, t2, label540 + j label556 +.p2align 2 +label609: slliw a7, t5, 12 + mv t6, t1 mv a6, zero add t5, a2, a7 - mv t6, t5 - j label286 -.p2align 2 -label289: - addi t6, t6, 32 .p2align 2 -label286: - sh2add a7, a6, t0 - addi s4, t6, 4 - addiw a6, a6, 8 - lw s0, 0(a7) +label544: + lw s0, 0(t6) + sh2add a7, a6, t5 + addiw a6, a6, 4 mulw s1, a5, s0 - amoadd.w.aqrl s3, s1, (t6) - lw s2, 4(a7) + amoadd.w.aqrl s3, s1, (a7) + addi s3, a7, 4 + lw s2, 4(t6) mulw s0, a5, s2 - addi s2, t6, 8 - amoadd.w.aqrl s5, s0, (s4) - lw s3, 8(a7) - mulw s1, a5, s3 - amoadd.w.aqrl s4, s1, (s2) - addi s2, t6, 12 - lw s3, 12(a7) - mulw s0, a5, s3 - amoadd.w.aqrl s4, s0, (s2) - addi s2, t6, 16 - lw s3, 16(a7) - mulw s1, a5, s3 - amoadd.w.aqrl s4, s1, (s2) - addi s2, t6, 20 - lw s3, 20(a7) - mulw s0, a5, s3 - amoadd.w.aqrl s4, s0, (s2) - addi s2, t6, 24 - lw s3, 24(a7) - mulw s1, a5, s3 - amoadd.w.aqrl s4, s1, (s2) - lw s3, 28(a7) - addi a7, t6, 28 + addi s2, a7, 8 + amoadd.w.aqrl s5, s0, (s3) + lw s4, 8(t6) + mulw s1, a5, s4 + amoadd.w.aqrl s5, s1, (s2) + addi s1, a7, 12 + lw s3, 12(t6) mulw s0, a5, s3 - amoadd.w.aqrl s1, s0, (a7) - bgt t1, a6, label289 - ble a0, a6, label499 - sh2add t6, a6, t0 - j label292 -.p2align 2 -label295: - addi t6, t6, 4 + amoadd.w.aqrl s2, s0, (s1) + ble t0, a6, label547 + addi t6, t6, 16 + j label544 +.p2align 2 +label547: + ble a0, a6, label641 + sh2add t6, a6, t1 + j label549 .p2align 2 -label292: - lw s1, 0(t6) - sh2add s0, a6, t5 - addiw a6, a6, 1 - mulw a7, a5, s1 - amoadd.w.aqrl s2, a7, (s0) - bgt a0, a6, label295 +label641: add t3, t3, a1 mv t5, t4 - bgt a0, t4, label283 + bgt a0, t4, label541 addiw t2, t2, 1 - bgt a4, t2, label299 - j label300 + bgt a4, t2, label540 +label556: + ld s0, 0(sp) + ld s5, 8(sp) + ld s1, 16(sp) + ld s3, 24(sp) + ld s2, 32(sp) + ld s4, 40(sp) + addi sp, sp, 48 + ret .p2align 2 -label498: +label733: addiw t2, t2, 1 - ble a4, t2, label300 + bgt a4, t2, label540 + j label556 .p2align 2 -label299: - add t0, t0, a1 - mv t3, a3 - mv t5, zero - bgt a0, zero, label283 -label339: - addiw t2, t2, 1 - bgt a4, t2, label299 - j label300 +label735: + addiw a5, a5, 1 + bgt a4, a5, label566 + j label556 .p2align 2 -label499: - add t3, t3, a1 - mv t5, t4 - bgt a0, t4, label283 - addiw t2, t2, 1 - bgt a4, t2, label299 - j label300 -label302: - slliw t1, t2, 12 - add t0, a5, t1 - mv a5, t2 - mv t1, a3 - mv t4, zero - bgt a0, zero, label311 - j label309 +cmmc_parallel_body_3: + mv t0, a0 +pcrel999: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_3) +pcrel1000: + auipc a5, %pcrel_hi(B) + li a3, 3 + lw a0, %pcrel_lo(pcrel999)(a2) + addi a4, a5, %pcrel_lo(pcrel1000) + lui a2, 1 + bgt a0, a3, label758 + ble a0, zero, label798 + slliw a5, t0, 12 + add a3, a4, a5 + mv a4, t0 + mv a5, a3 + mv t0, zero + j label804 .p2align 2 -label317: - addi t5, t5, 4 +label807: + addi a5, a5, 4 .p2align 2 -label313: - lw a7, 0(t5) - sh2add s0, t6, t4 - addiw t6, t6, 1 - mulw a6, t3, a7 - amoadd.w.aqrl a7, a6, (s0) - bgt a0, t6, label317 - add t1, t1, a1 - mv t4, t2 - ble a0, t2, label502 +label804: + addiw t0, t0, 1 + sw zero, 0(a5) + bgt a0, t0, label807 + addiw a4, a4, 1 + ble a1, a4, label798 .p2align 2 -label311: - sh2add t5, a5, t1 - addiw t2, t4, 1 - lw t3, 0(t5) - bne t3, zero, label312 - add 
t1, t1, a1 - mv t4, t2 - bgt a0, t2, label311 - addiw a5, a5, 1 - bgt a4, a5, label310 - j label300 +label809: + add a3, a3, a2 + li t0, 1 + sw zero, 0(a3) + mv a5, a3 + bgt a0, t0, label807 + addiw a4, a4, 1 + bgt a1, a4, label809 + j label798 +label758: + addiw a3, a0, -3 + addiw a5, a0, -18 + li t1, 15 + ble a3, t1, label825 + slliw t1, t0, 12 + add a4, a4, t1 + mv t1, a4 + mv t2, zero + j label763 .p2align 2 -label312: - slliw a6, t4, 12 - mv t5, t0 - mv t6, zero - add t4, a2, a6 - j label313 +label879: + addiw t0, t0, 1 + ble a1, t0, label798 .p2align 2 -label502: - addiw a5, a5, 1 - ble a4, a5, label300 +label775: + add a4, a4, a2 + li t2, 16 + sd zero, 0(a4) + mv t1, a4 + sd zero, 8(a4) + sd zero, 16(a4) + sd zero, 24(a4) + sd zero, 32(a4) + sd zero, 40(a4) + sd zero, 48(a4) + sd zero, 56(a4) + ble a5, t2, label766 +.p2align 2 +label781: + addi t1, t1, 64 +.p2align 2 +label763: + addiw t2, t2, 16 + sd zero, 0(t1) + sd zero, 8(t1) + sd zero, 16(t1) + sd zero, 24(t1) + sd zero, 32(t1) + sd zero, 40(t1) + sd zero, 48(t1) + sd zero, 56(t1) + bgt a5, t2, label781 +.p2align 2 +label766: + ble a3, t2, label851 + sh2add t1, t2, a4 + mv t3, t2 + j label768 .p2align 2 -label310: - add t0, t0, a1 - mv t1, a3 - mv t4, zero - bgt a0, zero, label311 -label309: - addiw a5, a5, 1 - bgt a4, a5, label310 - j label300 +label771: + addi t1, t1, 16 .p2align 2 -cmmc_parallel_body_3: - mv a5, a0 -pcrel651: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel652: - auipc t1, %pcrel_hi(B) - li t0, 7 - lw a0, %pcrel_lo(pcrel651)(a2) - addi a4, t1, %pcrel_lo(pcrel652) - lui a2, 1 - addi a3, a0, -7 - bgt a0, t0, label527 - bgt a0, zero, label545 - j label543 -label527: - slliw t0, a5, 12 - add a4, a4, t0 - mv t0, a4 - mv t1, zero - j label531 +label768: + addiw t3, t3, 4 + sd zero, 0(t1) + sd zero, 8(t1) + bgt a3, t3, label771 + ble a0, t3, label975 .p2align 2 -label540: - addi t0, t0, 4 +label776: + sh2add t1, t3, a4 + mv t2, t3 .p2align 2 -label537: +label777: addiw t2, t2, 1 - sw zero, 0(t0) - bgt a0, t2, label540 + sw zero, 0(t1) + ble a0, t2, label879 + addi t1, t1, 4 + j label777 +.p2align 2 +label851: + mv t3, t2 + bgt a0, t2, label776 + addiw t0, t0, 1 + bgt a1, t0, label775 +label798: + ret +label975: + addiw t0, t0, 1 + bgt a1, t0, label775 + j label798 +label825: + slliw t1, t0, 12 + mv a5, t0 + add a4, a4, t1 + mv t1, zero + mv t0, a4 + j label786 +.p2align 2 +label909: addiw a5, a5, 1 - ble a1, a5, label543 + ble a1, a5, label798 .p2align 2 -label542: +label792: add a4, a4, a2 - li t1, 8 + li t1, 4 sd zero, 0(a4) mv t0, a4 sd zero, 8(a4) - sd zero, 16(a4) - sd zero, 24(a4) - ble a3, t1, label643 + ble a3, t1, label983 .p2align 2 -label534: - addi t0, t0, 32 +label789: + addi t0, t0, 16 .p2align 2 -label531: - addiw t1, t1, 8 +label786: + addiw t1, t1, 4 sd zero, 0(t0) sd zero, 8(t0) - sd zero, 16(t0) - sd zero, 24(t0) - bgt a3, t1, label534 - ble a0, t1, label639 + bgt a3, t1, label789 + ble a0, t1, label977 .p2align 2 -label536: +label793: sh2add t0, t1, a4 mv t2, t1 - j label537 -label643: - bgt a0, t1, label536 -.p2align 2 -label639: - addiw a5, a5, 1 - bgt a1, a5, label542 -label543: - ret -label545: - slliw t0, a5, 12 - add a3, a4, t0 - mv a4, a5 - mv a5, a3 - mv t0, zero - j label549 .p2align 2 -label554: - addi a5, a5, 4 -.p2align 2 -label549: - addiw t0, t0, 1 - sw zero, 0(a5) - bgt a0, t0, label554 - addiw a4, a4, 1 - ble a1, a4, label543 +label794: + addiw t2, t2, 1 + sw zero, 0(t0) + ble a0, t2, label909 + addi t0, t0, 4 + j label794 +label983: + bgt a0, t1, label793 .p2align 2 
-label553: - add a3, a3, a2 - li t0, 1 - sw zero, 0(a3) - mv a5, a3 - bgt a0, t0, label554 - addiw a4, a4, 1 - bgt a1, a4, label553 - j label543 +label977: + addiw a5, a5, 1 + bgt a1, a5, label792 + j label798 .p2align 2 cmmc_parallel_body_4: - addi sp, sp, -48 + addi sp, sp, -40 mv t2, a0 mv a4, a1 -pcrel902: +pcrel1225: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel903: +pcrel1226: auipc a5, %pcrel_hi(A) -pcrel904: +pcrel1227: auipc t3, %pcrel_hi(C) - li t0, 7 -pcrel905: + li t1, 3 +pcrel1228: auipc a1, %pcrel_hi(B) - addi a3, a5, %pcrel_lo(pcrel903) - sd s0, 0(sp) - addi a5, t3, %pcrel_lo(pcrel904) - sd s5, 8(sp) - sd s2, 16(sp) - sd s3, 24(sp) - sd s1, 32(sp) - sd s4, 40(sp) - lw a0, %pcrel_lo(pcrel902)(a2) - addi a2, a1, %pcrel_lo(pcrel905) - addi t1, a0, -7 + addi a3, a5, %pcrel_lo(pcrel1226) + sd s1, 0(sp) + addi a5, t3, %pcrel_lo(pcrel1227) + sd s0, 8(sp) + sd s3, 16(sp) + sd s2, 24(sp) + sd s4, 32(sp) + lw a0, %pcrel_lo(pcrel1225)(a2) + addi a2, a1, %pcrel_lo(pcrel1228) + addi t0, a0, -3 lui a1, 1 - ble a0, t0, label679 + bgt a0, t1, label1002 + bgt a0, zero, label1028 +label1026: + ld s1, 0(sp) + ld s0, 8(sp) + ld s3, 16(sp) + ld s2, 24(sp) + ld s4, 32(sp) + addi sp, sp, 40 + ret +label1002: slliw t3, t2, 12 - add t0, a5, t3 + add t1, a5, t3 mv t3, a3 mv t5, zero - bgt a0, zero, label661 - j label717 + bgt a0, zero, label1011 + j label1009 .p2align 2 -label675: +label1023: + addi t6, t6, 4 +.p2align 2 +label1020: + lw s1, 0(t6) + sh2add s0, a6, t5 + addiw a6, a6, 1 + mulw a7, a5, s1 + amoadd.w.aqrl s2, a7, (s0) + bgt a0, a6, label1023 add t3, t3, a1 mv t5, t4 - ble a0, t4, label879 + ble a0, t4, label1210 .p2align 2 -label661: +label1011: sh2add t6, t2, t3 addiw t4, t5, 1 lw a5, 0(t6) - beq a5, zero, label675 + bne a5, zero, label1012 + add t3, t3, a1 + mv t5, t4 + bgt a0, t4, label1011 + addiw t2, t2, 1 + ble a4, t2, label1026 +.p2align 2 +label1010: + add t1, t1, a1 + mv t3, a3 + mv t5, zero + bgt a0, zero, label1011 +label1009: + addiw t2, t2, 1 + bgt a4, t2, label1010 + j label1026 +.p2align 2 +label1012: slliw a7, t5, 12 + mv t6, t1 mv a6, zero add t5, a2, a7 - mv t6, t5 - j label663 -.p2align 2 -label666: - addi t6, t6, 32 -.p2align 2 -label663: - sh2add a7, a6, t0 - addiw a6, a6, 8 - lw s0, 0(a7) - mulw s2, a5, s0 - amoadd.w.aqrl s3, s2, (t6) - addi s2, t6, 4 - lw s1, 4(a7) + j label1013 +.p2align 2 +label1016: + addi t6, t6, 16 +.p2align 2 +label1013: + lw s1, 0(t6) + sh2add a7, a6, t5 + addiw a6, a6, 4 mulw s0, a5, s1 - amoadd.w.aqrl s3, s0, (s2) - addi s3, t6, 8 - lw s4, 8(a7) - mulw s1, a5, s4 - amoadd.w.aqrl s5, s1, (s3) - addi s3, t6, 12 - lw s2, 12(a7) - mulw s0, a5, s2 - amoadd.w.aqrl s4, s0, (s3) - addi s3, t6, 16 - lw s2, 16(a7) + amoadd.w.aqrl s3, s0, (a7) + addi s3, a7, 4 + lw s2, 4(t6) mulw s1, a5, s2 amoadd.w.aqrl s4, s1, (s3) - addi s3, t6, 20 - lw s2, 20(a7) + addi s3, a7, 8 + lw s2, 8(t6) mulw s0, a5, s2 - addi s2, t6, 24 - amoadd.w.aqrl s5, s0, (s3) - lw s4, 24(a7) - mulw s1, a5, s4 - amoadd.w.aqrl s5, s1, (s2) - lw s3, 28(a7) - addi a7, t6, 28 - mulw s0, a5, s3 - amoadd.w.aqrl s1, s0, (a7) - bgt t1, a6, label666 - ble a0, a6, label876 - sh2add t6, a6, t0 -.p2align 2 -label669: - lw s1, 0(t6) - sh2add s0, a6, t5 - addiw a6, a6, 1 - mulw a7, a5, s1 - amoadd.w.aqrl s2, a7, (s0) - ble a0, a6, label789 - addi t6, t6, 4 - j label669 -.p2align 2 -label879: - addiw t2, t2, 1 - ble a4, t2, label678 -.p2align 2 -label677: - add t0, t0, a1 - mv t3, a3 - mv t5, zero - bgt a0, zero, label661 - j label717 -.p2align 2 -label789: - add t3, t3, a1 - mv t5, t4 - 
bgt a0, t4, label661 - addiw t2, t2, 1 - bgt a4, t2, label677 - j label678 + amoadd.w.aqrl s4, s0, (s3) + addi s0, a7, 12 + lw s2, 12(t6) + mulw s1, a5, s2 + amoadd.w.aqrl s3, s1, (s0) + bgt t0, a6, label1016 + ble a0, a6, label1201 + sh2add t6, a6, t1 + j label1020 .p2align 2 -label876: +label1201: add t3, t3, a1 mv t5, t4 - bgt a0, t4, label661 -label717: + bgt a0, t4, label1011 addiw t2, t2, 1 - bgt a4, t2, label677 - j label678 -label679: - bgt a0, zero, label680 -label678: - ld s0, 0(sp) - ld s5, 8(sp) - ld s2, 16(sp) - ld s3, 24(sp) - ld s1, 32(sp) - ld s4, 40(sp) - addi sp, sp, 48 - ret -label680: + bgt a4, t2, label1010 + j label1026 +label1028: slliw t1, t2, 12 add t0, a5, t1 mv a5, t2 mv t1, a3 mv t4, zero - bgt a0, zero, label689 - j label687 + bgt a0, zero, label1037 + j label1035 .p2align 2 -label694: +label1042: add t1, t1, a1 mv t4, t2 - ble a0, t2, label880 + ble a0, t2, label1205 .p2align 2 -label689: +label1037: sh2add t5, a5, t1 addiw t2, t4, 1 lw t3, 0(t5) - bne t3, zero, label690 + beq t3, zero, label1044 + slliw a6, t4, 12 + mv t5, t0 + mv t6, zero + add t4, a2, a6 +.p2align 2 +label1039: + lw s0, 0(t5) + sh2add a7, t6, t4 + addiw t6, t6, 1 + mulw a6, t3, s0 + amoadd.w.aqrl s1, a6, (a7) + ble a0, t6, label1042 + addi t5, t5, 4 + j label1039 +.p2align 2 +label1044: add t1, t1, a1 mv t4, t2 - bgt a0, t2, label689 + bgt a0, t2, label1037 addiw a5, a5, 1 - ble a4, a5, label678 + ble a4, a5, label1026 .p2align 2 -label688: +label1036: add t0, t0, a1 mv t1, a3 mv t4, zero - bgt a0, zero, label689 -label687: + bgt a0, zero, label1037 +label1035: addiw a5, a5, 1 - bgt a4, a5, label688 - j label678 -.p2align 2 -label690: - slliw a6, t4, 12 - mv t5, t0 - mv t6, zero - add t4, a2, a6 + bgt a4, a5, label1036 + j label1026 .p2align 2 -label691: - lw a7, 0(t5) - sh2add s0, t6, t4 - addiw t6, t6, 1 - mulw a6, t3, a7 - amoadd.w.aqrl a7, a6, (s0) - ble a0, t6, label694 - addi t5, t5, 4 - j label691 +label1210: + addiw t2, t2, 1 + bgt a4, t2, label1010 + j label1026 .p2align 2 -label880: +label1205: addiw a5, a5, 1 - bgt a4, a5, label688 - j label678 + bgt a4, a5, label1036 + j label1026 diff --git a/tests/SysY2022/performance/01_mm3.sy.ir b/tests/SysY2022/performance/01_mm3.sy.ir index ff3d45414..17e4897a1 100644 --- a/tests/SysY2022/performance/01_mm3.sy.ir +++ b/tests/SysY2022/performance/01_mm3.sy.ir @@ -112,85 +112,176 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %5 = icmp sgt i32 %4, i32 0; cbr i1 %5(prob = 0.5), ^cond, ^b1; ^cond: - i32 %6 = add i32 %4, i32 -7; - i1 %7 = icmp sgt i32 %4, i32 7; - [1024 * [1024 * i32]]* %8 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; - cbr i1 %7(prob = 0.5), ^b2, ^b3; + i1 %6 = icmp sgt i32 %4, i32 3; + [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; + cbr i1 %6(prob = 0.5), ^cond1, ^b2; ^b1: - i32 %9 = phi [^b, i32 0] [^scalar.final, i32 %50] [^scalar.final1, i32 %63]; - i32* %10 = ptradd [8 * i8]* %2, i32 0; - atomicadd i32* %10, i32 %9; + i32 %8 = phi [^b, i32 0] [^scalar.final, i32 %27] [^scalar.final3, i32 %128] [^scalar.final4, i32 %138]; + i32* %9 = ptradd [8 * i8]* %2, i32 0; + atomicadd i32* %9, i32 %8; ret; ^b2: - i32 %12 = phi [^cond, i32 %0] [^scalar.final1, i32 %64]; - i32 %13 = phi [^cond, i32 0] [^scalar.final1, i32 %63]; - [1024 * i32]* %14 = getelementptr &([1024 * [1024 * i32]]* %8)[i64 0][i32 %12]; + i32 %11 = phi [^cond, i32 %0] [^scalar.final, i32 %98]; + i32 %12 = phi [^cond, i32 0] [^scalar.final, i32 %27]; + [1024 * i32]* 
%13 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i32 %11]; ubr ^while.body; + ^cond1: + i32 %14 = add i32 %4, i32 -3; + i1 %15 = icmp sgt i32 %14, i32 15; + i32 %16 = add i32 %4, i32 -18; + cbr i1 %15(prob = 0.5), ^b3, ^b4; ^b3: - i32 %15 = phi [^cond, i32 %0] [^scalar.final, i32 %54]; - i32 %16 = phi [^cond, i32 0] [^scalar.final, i32 %50]; - [1024 * i32]* %17 = getelementptr &([1024 * [1024 * i32]]* %8)[i64 0][i32 %15]; + i32 %17 = phi [^cond1, i32 0] [^scalar.final4, i32 %138]; + i32 %18 = phi [^cond1, i32 %0] [^scalar.final4, i32 %139]; + [1024 * i32]* %19 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i32 %18]; ubr ^while.body1; - ^while.body: - i32 %18 = phi [^b2, i32 %13] [^while.body, i32 %43]; - i32 %19 = phi [^b2, i32 0] [^while.body, i32 %44]; - i32* %20 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %19]; - i32 %21 = load i32* %20; - i32 %22 = add i32 %18, i32 %21; - i32* %23 = getelementptr &(i32* %20)[i64 1]; - i32 %24 = load i32* %23; - i32 %25 = add i32 %22, i32 %24; - i32* %26 = getelementptr &(i32* %20)[i64 2]; - i32 %27 = load i32* %26; - i32 %28 = add i32 %25, i32 %27; - i32* %29 = getelementptr &(i32* %20)[i64 3]; - i32 %30 = load i32* %29; - i32 %31 = add i32 %28, i32 %30; - i32* %32 = getelementptr &(i32* %20)[i64 4]; + ^b4: + i32 %20 = phi [^cond1, i32 %0] [^scalar.final3, i32 %129]; + i32 %21 = phi [^cond1, i32 0] [^scalar.final3, i32 %128]; + [1024 * i32]* %22 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i32 %20]; + ubr ^while.body2; + ^while.body {scalar}: + i32 %23 = phi [^b2, i32 0] [^while.body, i32 %28]; + i32 %24 = phi [^b2, i32 %12] [^while.body, i32 %27]; + i32* %25 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %23]; + i32 %26 = load i32* %25; + i32 %27 = add i32 %24, i32 %26; + i32 %28 = add i32 %23, i32 1; + i1 %29 = icmp sgt i32 %4, i32 %28; + cbr i1 %29(prob = 0.75), ^while.body, ^scalar.final; + ^while.body1: + i32 %30 = phi [^b3, i32 0] [^while.body1, i32 %80]; + i32 %31 = phi [^b3, i32 %17] [^while.body1, i32 %79]; + i32* %32 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %30]; i32 %33 = load i32* %32; i32 %34 = add i32 %31, i32 %33; - i32* %35 = getelementptr &(i32* %20)[i64 5]; + i32* %35 = getelementptr &(i32* %32)[i64 1]; i32 %36 = load i32* %35; i32 %37 = add i32 %34, i32 %36; - i32* %38 = getelementptr &(i32* %20)[i64 6]; + i32* %38 = getelementptr &(i32* %32)[i64 2]; i32 %39 = load i32* %38; i32 %40 = add i32 %37, i32 %39; - i32* %41 = getelementptr &(i32* %20)[i64 7]; + i32* %41 = getelementptr &(i32* %32)[i64 3]; i32 %42 = load i32* %41; i32 %43 = add i32 %40, i32 %42; - i32 %44 = add i32 %19, i32 8; - i1 %45 = icmp sgt i32 %6, i32 %44; - cbr i1 %45(prob = 0.888889), ^while.body, ^scalar.header; - ^while.body1 {scalar}: - i32 %46 = phi [^b3, i32 0] [^while.body1, i32 %51]; - i32 %47 = phi [^b3, i32 %16] [^while.body1, i32 %50]; - i32* %48 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %46]; - i32 %49 = load i32* %48; - i32 %50 = add i32 %47, i32 %49; - i32 %51 = add i32 %46, i32 1; - i1 %52 = icmp sgt i32 %4, i32 %51; - cbr i1 %52(prob = 0.875), ^while.body1, ^scalar.final; - ^scalar.header: - i1 %53 = icmp sgt i32 %4, i32 %44; - cbr i1 %53(prob = 0.875), ^while.body2, ^scalar.final1; - ^scalar.final: - i32 %54 = add i32 %15, i32 1; - i1 %55 = icmp sgt i32 %1, i32 %54; - cbr i1 %55(prob = 0.984615), ^b3, ^b1; + i32* %44 = getelementptr &(i32* %32)[i64 4]; + i32 %45 = load i32* %44; + i32 %46 = add i32 %43, i32 %45; + i32* %47 = getelementptr &(i32* %32)[i64 5]; + i32 %48 = load i32* %47; + i32 %49 = add i32 %46, i32 
%48; + i32* %50 = getelementptr &(i32* %32)[i64 6]; + i32 %51 = load i32* %50; + i32 %52 = add i32 %49, i32 %51; + i32* %53 = getelementptr &(i32* %32)[i64 7]; + i32 %54 = load i32* %53; + i32 %55 = add i32 %52, i32 %54; + i32* %56 = getelementptr &(i32* %32)[i64 8]; + i32 %57 = load i32* %56; + i32 %58 = add i32 %55, i32 %57; + i32* %59 = getelementptr &(i32* %32)[i64 9]; + i32 %60 = load i32* %59; + i32 %61 = add i32 %58, i32 %60; + i32* %62 = getelementptr &(i32* %32)[i64 10]; + i32 %63 = load i32* %62; + i32 %64 = add i32 %61, i32 %63; + i32* %65 = getelementptr &(i32* %32)[i64 11]; + i32 %66 = load i32* %65; + i32 %67 = add i32 %64, i32 %66; + i32* %68 = getelementptr &(i32* %32)[i64 12]; + i32 %69 = load i32* %68; + i32 %70 = add i32 %67, i32 %69; + i32* %71 = getelementptr &(i32* %32)[i64 13]; + i32 %72 = load i32* %71; + i32 %73 = add i32 %70, i32 %72; + i32* %74 = getelementptr &(i32* %32)[i64 14]; + i32 %75 = load i32* %74; + i32 %76 = add i32 %73, i32 %75; + i32* %77 = getelementptr &(i32* %32)[i64 15]; + i32 %78 = load i32* %77; + i32 %79 = add i32 %76, i32 %78; + i32 %80 = add i32 %30, i32 16; + i1 %81 = icmp sgt i32 %16, i32 %80; + cbr i1 %81(prob = 0.941176), ^while.body1, ^scalar.header; ^while.body2 {scalar}: - i32 %56 = phi [^scalar.header, i32 %44] [^while.body2, i32 %61]; - i32 %57 = phi [^scalar.header, i32 %43] [^while.body2, i32 %60]; - i32* %58 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %56]; - i32 %59 = load i32* %58; - i32 %60 = add i32 %57, i32 %59; - i32 %61 = add i32 %56, i32 1; - i1 %62 = icmp sgt i32 %4, i32 %61; - cbr i1 %62(prob = 0.875), ^while.body2, ^scalar.final1; + i32 %82 = phi [^b4, i32 %21] [^while.body2, i32 %95]; + i32 %83 = phi [^b4, i32 0] [^while.body2, i32 %96]; + i32* %84 = getelementptr &([1024 * i32]* %22)[i64 0][i32 %83]; + i32 %85 = load i32* %84; + i32 %86 = add i32 %82, i32 %85; + i32* %87 = getelementptr &(i32* %84)[i64 1]; + i32 %88 = load i32* %87; + i32 %89 = add i32 %86, i32 %88; + i32* %90 = getelementptr &(i32* %84)[i64 2]; + i32 %91 = load i32* %90; + i32 %92 = add i32 %89, i32 %91; + i32* %93 = getelementptr &(i32* %84)[i64 3]; + i32 %94 = load i32* %93; + i32 %95 = add i32 %92, i32 %94; + i32 %96 = add i32 %83, i32 4; + i1 %97 = icmp sgt i32 %14, i32 %96; + cbr i1 %97(prob = 0.75), ^while.body2, ^scalar.final1; + ^scalar.final: + i32 %98 = add i32 %11, i32 1; + i1 %99 = icmp sgt i32 %1, i32 %98; + cbr i1 %99(prob = 0.984615), ^b2, ^b1; + ^scalar.header: + i1 %100 = icmp sgt i32 %14, i32 %80; + cbr i1 %100(prob = 0.75), ^while.body3, ^scalar.final2; ^scalar.final1: - i32 %63 = phi [^scalar.header, i32 %43] [^while.body2, i32 %60]; - i32 %64 = add i32 %12, i32 1; - i1 %65 = icmp sgt i32 %1, i32 %64; - cbr i1 %65(prob = 0.984615), ^b2, ^b1; + i1 %101 = icmp sgt i32 %4, i32 %96; + cbr i1 %101(prob = 0.75), ^while.body4, ^scalar.final3; + ^while.body3 {scalar}: + i32 %102 = phi [^scalar.header, i32 %79] [^while.body3, i32 %115]; + i32 %103 = phi [^scalar.header, i32 %80] [^while.body3, i32 %116]; + i32* %104 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %103]; + i32 %105 = load i32* %104; + i32 %106 = add i32 %102, i32 %105; + i32* %107 = getelementptr &(i32* %104)[i64 1]; + i32 %108 = load i32* %107; + i32 %109 = add i32 %106, i32 %108; + i32* %110 = getelementptr &(i32* %104)[i64 2]; + i32 %111 = load i32* %110; + i32 %112 = add i32 %109, i32 %111; + i32* %113 = getelementptr &(i32* %104)[i64 3]; + i32 %114 = load i32* %113; + i32 %115 = add i32 %112, i32 %114; + i32 %116 = add i32 %103, i32 4; + i1 %117 = icmp sgt i32 
%14, i32 %116; + cbr i1 %117(prob = 0.75), ^while.body3, ^scalar.final2; + ^scalar.final2: + i32 %118 = phi [^scalar.header, i32 %79] [^while.body3, i32 %115]; + i32 %119 = phi [^scalar.header, i32 %80] [^while.body3, i32 %116]; + i1 %120 = icmp sgt i32 %4, i32 %119; + cbr i1 %120(prob = 0.75), ^while.body5, ^scalar.final4; + ^while.body4 {scalar}: + i32 %121 = phi [^scalar.final1, i32 %96] [^while.body4, i32 %126]; + i32 %122 = phi [^scalar.final1, i32 %95] [^while.body4, i32 %125]; + i32* %123 = getelementptr &([1024 * i32]* %22)[i64 0][i32 %121]; + i32 %124 = load i32* %123; + i32 %125 = add i32 %122, i32 %124; + i32 %126 = add i32 %121, i32 1; + i1 %127 = icmp sgt i32 %4, i32 %126; + cbr i1 %127(prob = 0.75), ^while.body4, ^scalar.final3; + ^scalar.final3: + i32 %128 = phi [^scalar.final1, i32 %95] [^while.body4, i32 %125]; + i32 %129 = add i32 %20, i32 1; + i1 %130 = icmp sgt i32 %1, i32 %129; + cbr i1 %130(prob = 0.984615), ^b4, ^b1; + ^while.body5 {scalar}: + i32 %131 = phi [^scalar.final2, i32 %119] [^while.body5, i32 %136]; + i32 %132 = phi [^scalar.final2, i32 %118] [^while.body5, i32 %135]; + i32* %133 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %131]; + i32 %134 = load i32* %133; + i32 %135 = add i32 %132, i32 %134; + i32 %136 = add i32 %131, i32 1; + i1 %137 = icmp sgt i32 %4, i32 %136; + cbr i1 %137(prob = 0.75), ^while.body5, ^scalar.final4; + ^scalar.final4: + i32 %138 = phi [^scalar.final2, i32 %118] [^while.body5, i32 %135]; + i32 %139 = add i32 %18, i32 1; + i1 %140 = icmp sgt i32 %1, i32 %139; + cbr i1 %140(prob = 0.984615), ^b3, ^b1; } internal [8 * i8]* @cmmc_parallel_body_payload_0, align 8; internal func @cmmc_parallel_body_1(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -198,70 +289,138 @@ internal func @cmmc_parallel_body_1(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_1 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; - [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @C to [1024 * [1024 * i32]]*; - [1024 * i32]* %8 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; - cbr i1 %5(prob = 0.5), ^b1, ^cond; - ^b1: - i32 %9 = phi [^b, i32 %0] [^b4, i32 %34]; - [1024 * i32]* %10 = getelementptr &([1024 * i32]* %8)[i32 %9]; - ubr ^while.body; + i1 %5 = icmp sgt i32 %4, i32 3; + [1024 * [1024 * i32]]* %6 = ptrcast [1024 * [1024 * i32]]* @C to [1024 * [1024 * i32]]*; + [1024 * i32]* %7 = getelementptr &([1024 * [1024 * i32]]* %6)[i64 0][i64 0]; + cbr i1 %5(prob = 0.5), ^cond, ^cond1; ^cond: + i32 %8 = add i32 %4, i32 -3; + i1 %9 = icmp sgt i32 %8, i32 15; + i32 %10 = add i32 %4, i32 -18; + cbr i1 %9(prob = 0.5), ^b1, ^b3; + ^cond1: i1 %11 = icmp sgt i32 %4, i32 0; - cbr i1 %11(prob = 0.5), ^b2, ^b3; + cbr i1 %11(prob = 0.5), ^b2, ^b4; + ^b1: + i32 %12 = phi [^cond, i32 %0] [^b7, i32 %71]; + [1024 * i32]* %13 = getelementptr &([1024 * i32]* %7)[i32 %12]; + ubr ^while.body; ^b2: - i32 %12 = phi [^cond, i32 %0] [^b5, i32 %36]; - [1024 * i32]* %13 = getelementptr &([1024 * i32]* %8)[i32 %12]; + i32 %14 = phi [^cond1, i32 %0] [^b5, i32 %49]; + [1024 * i32]* %15 = getelementptr &([1024 * i32]* %7)[i32 %14]; ubr ^while.body1; + ^b3: + i32 %16 = phi [^cond, i32 %0] [^b6, i32 %65]; + [1024 * i32]* %17 = getelementptr &([1024 * i32]* %7)[i32 %16]; + ubr ^while.body2; + ^b4: + ret; ^while.body: - i32 %14 = phi [^b1, i32 0] [^while.body, i32 %23]; - i32* %15 = getelementptr &([1024 * 
i32]* %10)[i64 0][i32 %14]; - store i32* %15 with i32 0; - i32* %16 = getelementptr &(i32* %15)[i64 1]; - store i32* %16 with i32 0; - i32* %17 = getelementptr &(i32* %15)[i64 2]; - store i32* %17 with i32 0; - i32* %18 = getelementptr &(i32* %15)[i64 3]; - store i32* %18 with i32 0; - i32* %19 = getelementptr &(i32* %15)[i64 4]; + i32 %18 = phi [^b1, i32 0] [^while.body, i32 %35]; + i32* %19 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %18]; store i32* %19 with i32 0; - i32* %20 = getelementptr &(i32* %15)[i64 5]; + i32* %20 = getelementptr &(i32* %19)[i64 1]; store i32* %20 with i32 0; - i32* %21 = getelementptr &(i32* %15)[i64 6]; + i32* %21 = getelementptr &(i32* %19)[i64 2]; store i32* %21 with i32 0; - i32* %22 = getelementptr &(i32* %15)[i64 7]; + i32* %22 = getelementptr &(i32* %19)[i64 3]; store i32* %22 with i32 0; - i32 %23 = add i32 %14, i32 8; - i1 %24 = icmp sgt i32 %6, i32 %23; - cbr i1 %24(prob = 0.888889), ^while.body, ^scalar.header; - ^b3: - ret; - ^scalar.header: - i1 %25 = icmp sgt i32 %4, i32 %23; - cbr i1 %25(prob = 0.875), ^while.body2, ^b4; - ^while.body1 {scalar}: - i32 %26 = phi [^b2, i32 0] [^while.body1, i32 %28]; - i32* %27 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %26]; + i32* %23 = getelementptr &(i32* %19)[i64 4]; + store i32* %23 with i32 0; + i32* %24 = getelementptr &(i32* %19)[i64 5]; + store i32* %24 with i32 0; + i32* %25 = getelementptr &(i32* %19)[i64 6]; + store i32* %25 with i32 0; + i32* %26 = getelementptr &(i32* %19)[i64 7]; + store i32* %26 with i32 0; + i32* %27 = getelementptr &(i32* %19)[i64 8]; store i32* %27 with i32 0; - i32 %28 = add i32 %26, i32 1; - i1 %29 = icmp sgt i32 %4, i32 %28; - cbr i1 %29(prob = 0.875), ^while.body1, ^b5; - ^while.body2 {scalar}: - i32 %30 = phi [^scalar.header, i32 %23] [^while.body2, i32 %32]; - i32* %31 = getelementptr &([1024 * i32]* %10)[i64 0][i32 %30]; + i32* %28 = getelementptr &(i32* %19)[i64 9]; + store i32* %28 with i32 0; + i32* %29 = getelementptr &(i32* %19)[i64 10]; + store i32* %29 with i32 0; + i32* %30 = getelementptr &(i32* %19)[i64 11]; + store i32* %30 with i32 0; + i32* %31 = getelementptr &(i32* %19)[i64 12]; store i32* %31 with i32 0; - i32 %32 = add i32 %30, i32 1; - i1 %33 = icmp sgt i32 %4, i32 %32; - cbr i1 %33(prob = 0.875), ^while.body2, ^b4; - ^b4: - i32 %34 = add i32 %9, i32 1; - i1 %35 = icmp sgt i32 %1, i32 %34; - cbr i1 %35(prob = 0.984615), ^b1, ^b3; + i32* %32 = getelementptr &(i32* %19)[i64 13]; + store i32* %32 with i32 0; + i32* %33 = getelementptr &(i32* %19)[i64 14]; + store i32* %33 with i32 0; + i32* %34 = getelementptr &(i32* %19)[i64 15]; + store i32* %34 with i32 0; + i32 %35 = add i32 %18, i32 16; + i1 %36 = icmp sgt i32 %10, i32 %35; + cbr i1 %36(prob = 0.941176), ^while.body, ^scalar.header; + ^while.body1 {scalar}: + i32 %37 = phi [^b2, i32 0] [^while.body1, i32 %39]; + i32* %38 = getelementptr &([1024 * i32]* %15)[i64 0][i32 %37]; + store i32* %38 with i32 0; + i32 %39 = add i32 %37, i32 1; + i1 %40 = icmp sgt i32 %4, i32 %39; + cbr i1 %40(prob = 0.75), ^while.body1, ^b5; + ^while.body2 {scalar}: + i32 %41 = phi [^b3, i32 0] [^while.body2, i32 %46]; + i32* %42 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %41]; + store i32* %42 with i32 0; + i32* %43 = getelementptr &(i32* %42)[i64 1]; + store i32* %43 with i32 0; + i32* %44 = getelementptr &(i32* %42)[i64 2]; + store i32* %44 with i32 0; + i32* %45 = getelementptr &(i32* %42)[i64 3]; + store i32* %45 with i32 0; + i32 %46 = add i32 %41, i32 4; + i1 %47 = icmp sgt i32 %8, i32 %46; + cbr i1 %47(prob 
= 0.75), ^while.body2, ^scalar.final; + ^scalar.header: + i1 %48 = icmp sgt i32 %8, i32 %35; + cbr i1 %48(prob = 0.75), ^while.body3, ^scalar.final1; ^b5: - i32 %36 = add i32 %12, i32 1; - i1 %37 = icmp sgt i32 %1, i32 %36; - cbr i1 %37(prob = 0.984615), ^b2, ^b3; + i32 %49 = add i32 %14, i32 1; + i1 %50 = icmp sgt i32 %1, i32 %49; + cbr i1 %50(prob = 0.984615), ^b2, ^b4; + ^scalar.final: + i1 %51 = icmp sgt i32 %4, i32 %46; + cbr i1 %51(prob = 0.75), ^while.body4, ^b6; + ^while.body3 {scalar}: + i32 %52 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i32* %53 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %52]; + store i32* %53 with i32 0; + i32* %54 = getelementptr &(i32* %53)[i64 1]; + store i32* %54 with i32 0; + i32* %55 = getelementptr &(i32* %53)[i64 2]; + store i32* %55 with i32 0; + i32* %56 = getelementptr &(i32* %53)[i64 3]; + store i32* %56 with i32 0; + i32 %57 = add i32 %52, i32 4; + i1 %58 = icmp sgt i32 %8, i32 %57; + cbr i1 %58(prob = 0.75), ^while.body3, ^scalar.final1; + ^scalar.final1: + i32 %59 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i1 %60 = icmp sgt i32 %4, i32 %59; + cbr i1 %60(prob = 0.75), ^while.body5, ^b7; + ^while.body4 {scalar}: + i32 %61 = phi [^scalar.final, i32 %46] [^while.body4, i32 %63]; + i32* %62 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %61]; + store i32* %62 with i32 0; + i32 %63 = add i32 %61, i32 1; + i1 %64 = icmp sgt i32 %4, i32 %63; + cbr i1 %64(prob = 0.75), ^while.body4, ^b6; + ^b6: + i32 %65 = add i32 %16, i32 1; + i1 %66 = icmp sgt i32 %1, i32 %65; + cbr i1 %66(prob = 0.984615), ^b3, ^b4; + ^while.body5 {scalar}: + i32 %67 = phi [^scalar.final1, i32 %59] [^while.body5, i32 %69]; + i32* %68 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %67]; + store i32* %68 with i32 0; + i32 %69 = add i32 %67, i32 1; + i1 %70 = icmp sgt i32 %4, i32 %69; + cbr i1 %70(prob = 0.75), ^while.body5, ^b7; + ^b7: + i32 %71 = add i32 %12, i32 1; + i1 %72 = icmp sgt i32 %1, i32 %71; + cbr i1 %72(prob = 0.984615), ^b1, ^b4; } internal [4 * i8]* @cmmc_parallel_body_payload_1, align 8; internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -269,8 +428,8 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_2 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; + i1 %5 = icmp sgt i32 %4, i32 3; + i32 %6 = add i32 %4, i32 -3; [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @A to [1024 * [1024 * i32]]*; [1024 * i32]* %8 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; [1024 * [1024 * i32]]* %9 = ptrcast [1024 * [1024 * i32]]* @C to [1024 * [1024 * i32]]*; @@ -325,7 +484,7 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %36 = icmp sgt i32 %1, i32 %35; cbr i1 %36(prob = 0.984615), ^b2, ^b3; ^while.body2: - i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %78]; + i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %58]; i32* %38 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %37]; i32 %39 = load i32* %38; i32 %40 = mul i32 %22, i32 %39; @@ -346,55 +505,35 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %55 = mul i32 %22, i32 %54; i32* %56 = getelementptr &(i32* %41)[i64 3]; atomicadd i32* %56, i32 %55; - i32* %58 = getelementptr &(i32* %38)[i64 4]; - i32 %59 = load i32* %58; - i32 %60 = mul i32 %22, i32 
%59; - i32* %61 = getelementptr &(i32* %41)[i64 4]; - atomicadd i32* %61, i32 %60; - i32* %63 = getelementptr &(i32* %38)[i64 5]; - i32 %64 = load i32* %63; - i32 %65 = mul i32 %22, i32 %64; - i32* %66 = getelementptr &(i32* %41)[i64 5]; - atomicadd i32* %66, i32 %65; - i32* %68 = getelementptr &(i32* %38)[i64 6]; - i32 %69 = load i32* %68; - i32 %70 = mul i32 %22, i32 %69; - i32* %71 = getelementptr &(i32* %41)[i64 6]; - atomicadd i32* %71, i32 %70; - i32* %73 = getelementptr &(i32* %38)[i64 7]; - i32 %74 = load i32* %73; - i32 %75 = mul i32 %22, i32 %74; - i32* %76 = getelementptr &(i32* %41)[i64 7]; - atomicadd i32* %76, i32 %75; - i32 %78 = add i32 %37, i32 8; - i1 %79 = icmp sgt i32 %6, i32 %78; - cbr i1 %79(prob = 0.888889), ^while.body2, ^scalar.header; + i32 %58 = add i32 %37, i32 4; + i1 %59 = icmp sgt i32 %6, i32 %58; + cbr i1 %59(prob = 0.941176), ^while.body2, ^scalar.header; ^prebody1: - [1024 * i32]* %80 = getelementptr &([1024 * i32]* %10)[i32 %27]; + [1024 * i32]* %60 = getelementptr &([1024 * i32]* %10)[i32 %27]; ubr ^while.body3; ^scalar.header: - i1 %81 = icmp sle i32 %4, i32 %78; - cbr i1 %81(prob = 0.125), ^while.header, ^while.body4; + i1 %61 = icmp sle i32 %4, i32 %58; + cbr i1 %61(prob = 0.25), ^while.header, ^while.body4; ^while.body3 {scalar}: - i32 %82 = phi [^prebody1, i32 0] [^while.body3, i32 %88]; - i32* %83 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %82]; - i32 %84 = load i32* %83; - i32 %85 = mul i32 %32, i32 %84; - i32* %86 = getelementptr &([1024 * i32]* %80)[i64 0][i32 %82]; - atomicadd i32* %86, i32 %85; - i32 %88 = add i32 %82, i32 1; - i1 %89 = icmp sgt i32 %4, i32 %88; - cbr i1 %89(prob = 0.875), ^while.body3, ^while.header1; + i32 %62 = phi [^prebody1, i32 0] [^while.body3, i32 %68]; + i32* %63 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %62]; + i32 %64 = load i32* %63; + i32 %65 = mul i32 %32, i32 %64; + i32* %66 = getelementptr &([1024 * i32]* %60)[i64 0][i32 %62]; + atomicadd i32* %66, i32 %65; + i32 %68 = add i32 %62, i32 1; + i1 %69 = icmp sgt i32 %4, i32 %68; + cbr i1 %69(prob = 0.75), ^while.body3, ^while.header1; ^while.body4 {scalar}: - i32 %90 = phi [^scalar.header, i32 %78] [^while.body4, i32 %96]; - i32* %91 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %90]; - i32 %92 = load i32* %91; - i32 %93 = mul i32 %22, i32 %92; - i32* %94 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %90]; - atomicadd i32* %94, i32 %93; - i32 %96 = add i32 %90, i32 1; - i1 %97 = icmp sgt i32 %4, i32 %96; - cbr i1 %97(prob = 0.875), ^while.body4, ^while.header; + i32 %70 = phi [^scalar.header, i32 %58] [^while.body4, i32 %76]; + i32* %71 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %70]; + i32 %72 = load i32* %71; + i32 %73 = mul i32 %22, i32 %72; + i32* %74 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %70]; + atomicadd i32* %74, i32 %73; + i32 %76 = add i32 %70, i32 1; + i1 %77 = icmp sgt i32 %4, i32 %76; + cbr i1 %77(prob = 0.75), ^while.body4, ^while.header; } internal [4 * i8]* @cmmc_parallel_body_payload_2, align 8; internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -402,70 +541,138 @@ internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_3 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; - [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; - [1024 * i32]* %8 = 
getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; - cbr i1 %5(prob = 0.5), ^b1, ^cond; - ^b1: - i32 %9 = phi [^b, i32 %0] [^b4, i32 %34]; - [1024 * i32]* %10 = getelementptr &([1024 * i32]* %8)[i32 %9]; - ubr ^while.body; + i1 %5 = icmp sgt i32 %4, i32 3; + [1024 * [1024 * i32]]* %6 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; + [1024 * i32]* %7 = getelementptr &([1024 * [1024 * i32]]* %6)[i64 0][i64 0]; + cbr i1 %5(prob = 0.5), ^cond, ^cond1; ^cond: + i32 %8 = add i32 %4, i32 -3; + i1 %9 = icmp sgt i32 %8, i32 15; + i32 %10 = add i32 %4, i32 -18; + cbr i1 %9(prob = 0.5), ^b1, ^b3; + ^cond1: i1 %11 = icmp sgt i32 %4, i32 0; - cbr i1 %11(prob = 0.5), ^b2, ^b3; + cbr i1 %11(prob = 0.5), ^b2, ^b4; + ^b1: + i32 %12 = phi [^cond, i32 %0] [^b7, i32 %71]; + [1024 * i32]* %13 = getelementptr &([1024 * i32]* %7)[i32 %12]; + ubr ^while.body; ^b2: - i32 %12 = phi [^cond, i32 %0] [^b5, i32 %36]; - [1024 * i32]* %13 = getelementptr &([1024 * i32]* %8)[i32 %12]; + i32 %14 = phi [^cond1, i32 %0] [^b5, i32 %49]; + [1024 * i32]* %15 = getelementptr &([1024 * i32]* %7)[i32 %14]; ubr ^while.body1; + ^b3: + i32 %16 = phi [^cond, i32 %0] [^b6, i32 %65]; + [1024 * i32]* %17 = getelementptr &([1024 * i32]* %7)[i32 %16]; + ubr ^while.body2; + ^b4: + ret; ^while.body: - i32 %14 = phi [^b1, i32 0] [^while.body, i32 %23]; - i32* %15 = getelementptr &([1024 * i32]* %10)[i64 0][i32 %14]; - store i32* %15 with i32 0; - i32* %16 = getelementptr &(i32* %15)[i64 1]; - store i32* %16 with i32 0; - i32* %17 = getelementptr &(i32* %15)[i64 2]; - store i32* %17 with i32 0; - i32* %18 = getelementptr &(i32* %15)[i64 3]; - store i32* %18 with i32 0; - i32* %19 = getelementptr &(i32* %15)[i64 4]; + i32 %18 = phi [^b1, i32 0] [^while.body, i32 %35]; + i32* %19 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %18]; store i32* %19 with i32 0; - i32* %20 = getelementptr &(i32* %15)[i64 5]; + i32* %20 = getelementptr &(i32* %19)[i64 1]; store i32* %20 with i32 0; - i32* %21 = getelementptr &(i32* %15)[i64 6]; + i32* %21 = getelementptr &(i32* %19)[i64 2]; store i32* %21 with i32 0; - i32* %22 = getelementptr &(i32* %15)[i64 7]; + i32* %22 = getelementptr &(i32* %19)[i64 3]; store i32* %22 with i32 0; - i32 %23 = add i32 %14, i32 8; - i1 %24 = icmp sgt i32 %6, i32 %23; - cbr i1 %24(prob = 0.888889), ^while.body, ^scalar.header; - ^b3: - ret; - ^scalar.header: - i1 %25 = icmp sgt i32 %4, i32 %23; - cbr i1 %25(prob = 0.875), ^while.body2, ^b4; - ^while.body1 {scalar}: - i32 %26 = phi [^b2, i32 0] [^while.body1, i32 %28]; - i32* %27 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %26]; + i32* %23 = getelementptr &(i32* %19)[i64 4]; + store i32* %23 with i32 0; + i32* %24 = getelementptr &(i32* %19)[i64 5]; + store i32* %24 with i32 0; + i32* %25 = getelementptr &(i32* %19)[i64 6]; + store i32* %25 with i32 0; + i32* %26 = getelementptr &(i32* %19)[i64 7]; + store i32* %26 with i32 0; + i32* %27 = getelementptr &(i32* %19)[i64 8]; store i32* %27 with i32 0; - i32 %28 = add i32 %26, i32 1; - i1 %29 = icmp sgt i32 %4, i32 %28; - cbr i1 %29(prob = 0.875), ^while.body1, ^b5; - ^while.body2 {scalar}: - i32 %30 = phi [^scalar.header, i32 %23] [^while.body2, i32 %32]; - i32* %31 = getelementptr &([1024 * i32]* %10)[i64 0][i32 %30]; + i32* %28 = getelementptr &(i32* %19)[i64 9]; + store i32* %28 with i32 0; + i32* %29 = getelementptr &(i32* %19)[i64 10]; + store i32* %29 with i32 0; + i32* %30 = getelementptr &(i32* %19)[i64 11]; + store i32* %30 with i32 0; + i32* %31 = getelementptr &(i32* %19)[i64 12]; store i32* %31 
with i32 0; - i32 %32 = add i32 %30, i32 1; - i1 %33 = icmp sgt i32 %4, i32 %32; - cbr i1 %33(prob = 0.875), ^while.body2, ^b4; - ^b4: - i32 %34 = add i32 %9, i32 1; - i1 %35 = icmp sgt i32 %1, i32 %34; - cbr i1 %35(prob = 0.984615), ^b1, ^b3; + i32* %32 = getelementptr &(i32* %19)[i64 13]; + store i32* %32 with i32 0; + i32* %33 = getelementptr &(i32* %19)[i64 14]; + store i32* %33 with i32 0; + i32* %34 = getelementptr &(i32* %19)[i64 15]; + store i32* %34 with i32 0; + i32 %35 = add i32 %18, i32 16; + i1 %36 = icmp sgt i32 %10, i32 %35; + cbr i1 %36(prob = 0.941176), ^while.body, ^scalar.header; + ^while.body1 {scalar}: + i32 %37 = phi [^b2, i32 0] [^while.body1, i32 %39]; + i32* %38 = getelementptr &([1024 * i32]* %15)[i64 0][i32 %37]; + store i32* %38 with i32 0; + i32 %39 = add i32 %37, i32 1; + i1 %40 = icmp sgt i32 %4, i32 %39; + cbr i1 %40(prob = 0.75), ^while.body1, ^b5; + ^while.body2 {scalar}: + i32 %41 = phi [^b3, i32 0] [^while.body2, i32 %46]; + i32* %42 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %41]; + store i32* %42 with i32 0; + i32* %43 = getelementptr &(i32* %42)[i64 1]; + store i32* %43 with i32 0; + i32* %44 = getelementptr &(i32* %42)[i64 2]; + store i32* %44 with i32 0; + i32* %45 = getelementptr &(i32* %42)[i64 3]; + store i32* %45 with i32 0; + i32 %46 = add i32 %41, i32 4; + i1 %47 = icmp sgt i32 %8, i32 %46; + cbr i1 %47(prob = 0.75), ^while.body2, ^scalar.final; + ^scalar.header: + i1 %48 = icmp sgt i32 %8, i32 %35; + cbr i1 %48(prob = 0.75), ^while.body3, ^scalar.final1; ^b5: - i32 %36 = add i32 %12, i32 1; - i1 %37 = icmp sgt i32 %1, i32 %36; - cbr i1 %37(prob = 0.984615), ^b2, ^b3; + i32 %49 = add i32 %14, i32 1; + i1 %50 = icmp sgt i32 %1, i32 %49; + cbr i1 %50(prob = 0.984615), ^b2, ^b4; + ^scalar.final: + i1 %51 = icmp sgt i32 %4, i32 %46; + cbr i1 %51(prob = 0.75), ^while.body4, ^b6; + ^while.body3 {scalar}: + i32 %52 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i32* %53 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %52]; + store i32* %53 with i32 0; + i32* %54 = getelementptr &(i32* %53)[i64 1]; + store i32* %54 with i32 0; + i32* %55 = getelementptr &(i32* %53)[i64 2]; + store i32* %55 with i32 0; + i32* %56 = getelementptr &(i32* %53)[i64 3]; + store i32* %56 with i32 0; + i32 %57 = add i32 %52, i32 4; + i1 %58 = icmp sgt i32 %8, i32 %57; + cbr i1 %58(prob = 0.75), ^while.body3, ^scalar.final1; + ^scalar.final1: + i32 %59 = phi [^scalar.header, i32 %35] [^while.body3, i32 %57]; + i1 %60 = icmp sgt i32 %4, i32 %59; + cbr i1 %60(prob = 0.75), ^while.body5, ^b7; + ^while.body4 {scalar}: + i32 %61 = phi [^scalar.final, i32 %46] [^while.body4, i32 %63]; + i32* %62 = getelementptr &([1024 * i32]* %17)[i64 0][i32 %61]; + store i32* %62 with i32 0; + i32 %63 = add i32 %61, i32 1; + i1 %64 = icmp sgt i32 %4, i32 %63; + cbr i1 %64(prob = 0.75), ^while.body4, ^b6; + ^b6: + i32 %65 = add i32 %16, i32 1; + i1 %66 = icmp sgt i32 %1, i32 %65; + cbr i1 %66(prob = 0.984615), ^b3, ^b4; + ^while.body5 {scalar}: + i32 %67 = phi [^scalar.final1, i32 %59] [^while.body5, i32 %69]; + i32* %68 = getelementptr &([1024 * i32]* %13)[i64 0][i32 %67]; + store i32* %68 with i32 0; + i32 %69 = add i32 %67, i32 1; + i1 %70 = icmp sgt i32 %4, i32 %69; + cbr i1 %70(prob = 0.75), ^while.body5, ^b7; + ^b7: + i32 %71 = add i32 %12, i32 1; + i1 %72 = icmp sgt i32 %1, i32 %71; + cbr i1 %72(prob = 0.984615), ^b1, ^b4; } internal [4 * i8]* @cmmc_parallel_body_payload_3, align 8; internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse ParallelBody 
AlignedParallelBody } { @@ -473,8 +680,8 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel [4 * i8]* %2 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_4 to [4 * i8]*; i32* %3 = ptradd [4 * i8]* %2, i32 0; i32 %4 = load i32* %3; - i1 %5 = icmp sgt i32 %4, i32 7; - i32 %6 = add i32 %4, i32 -7; + i1 %5 = icmp sgt i32 %4, i32 3; + i32 %6 = add i32 %4, i32 -3; [1024 * [1024 * i32]]* %7 = ptrcast [1024 * [1024 * i32]]* @A to [1024 * [1024 * i32]]*; [1024 * i32]* %8 = getelementptr &([1024 * [1024 * i32]]* %7)[i64 0][i64 0]; [1024 * [1024 * i32]]* %9 = ptrcast [1024 * [1024 * i32]]* @B to [1024 * [1024 * i32]]*; @@ -529,7 +736,7 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %36 = icmp sgt i32 %1, i32 %35; cbr i1 %36(prob = 0.984615), ^b2, ^b3; ^while.body2: - i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %78]; + i32 %37 = phi [^prebody, i32 0] [^while.body2, i32 %58]; i32* %38 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %37]; i32 %39 = load i32* %38; i32 %40 = mul i32 %22, i32 %39; @@ -550,54 +757,34 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %55 = mul i32 %22, i32 %54; i32* %56 = getelementptr &(i32* %41)[i64 3]; atomicadd i32* %56, i32 %55; - i32* %58 = getelementptr &(i32* %38)[i64 4]; - i32 %59 = load i32* %58; - i32 %60 = mul i32 %22, i32 %59; - i32* %61 = getelementptr &(i32* %41)[i64 4]; - atomicadd i32* %61, i32 %60; - i32* %63 = getelementptr &(i32* %38)[i64 5]; - i32 %64 = load i32* %63; - i32 %65 = mul i32 %22, i32 %64; - i32* %66 = getelementptr &(i32* %41)[i64 5]; - atomicadd i32* %66, i32 %65; - i32* %68 = getelementptr &(i32* %38)[i64 6]; - i32 %69 = load i32* %68; - i32 %70 = mul i32 %22, i32 %69; - i32* %71 = getelementptr &(i32* %41)[i64 6]; - atomicadd i32* %71, i32 %70; - i32* %73 = getelementptr &(i32* %38)[i64 7]; - i32 %74 = load i32* %73; - i32 %75 = mul i32 %22, i32 %74; - i32* %76 = getelementptr &(i32* %41)[i64 7]; - atomicadd i32* %76, i32 %75; - i32 %78 = add i32 %37, i32 8; - i1 %79 = icmp sgt i32 %6, i32 %78; - cbr i1 %79(prob = 0.888889), ^while.body2, ^scalar.header; + i32 %58 = add i32 %37, i32 4; + i1 %59 = icmp sgt i32 %6, i32 %58; + cbr i1 %59(prob = 0.941176), ^while.body2, ^scalar.header; ^prebody1: - [1024 * i32]* %80 = getelementptr &([1024 * i32]* %10)[i32 %27]; + [1024 * i32]* %60 = getelementptr &([1024 * i32]* %10)[i32 %27]; ubr ^while.body3; ^scalar.header: - i1 %81 = icmp sle i32 %4, i32 %78; - cbr i1 %81(prob = 0.125), ^while.header, ^while.body4; + i1 %61 = icmp sle i32 %4, i32 %58; + cbr i1 %61(prob = 0.25), ^while.header, ^while.body4; ^while.body3 {scalar}: - i32 %82 = phi [^prebody1, i32 0] [^while.body3, i32 %88]; - i32* %83 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %82]; - i32 %84 = load i32* %83; - i32 %85 = mul i32 %32, i32 %84; - i32* %86 = getelementptr &([1024 * i32]* %80)[i64 0][i32 %82]; - atomicadd i32* %86, i32 %85; - i32 %88 = add i32 %82, i32 1; - i1 %89 = icmp sgt i32 %4, i32 %88; - cbr i1 %89(prob = 0.875), ^while.body3, ^while.header1; + i32 %62 = phi [^prebody1, i32 0] [^while.body3, i32 %68]; + i32* %63 = getelementptr &([1024 * i32]* %19)[i64 0][i32 %62]; + i32 %64 = load i32* %63; + i32 %65 = mul i32 %32, i32 %64; + i32* %66 = getelementptr &([1024 * i32]* %60)[i64 0][i32 %62]; + atomicadd i32* %66, i32 %65; + i32 %68 = add i32 %62, i32 1; + i1 %69 = icmp sgt i32 %4, i32 %68; + cbr i1 %69(prob = 0.75), ^while.body3, ^while.header1; ^while.body4 {scalar}: - i32 %90 = phi [^scalar.header, i32 
%78] [^while.body4, i32 %96]; - i32* %91 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %90]; - i32 %92 = load i32* %91; - i32 %93 = mul i32 %22, i32 %92; - i32* %94 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %90]; - atomicadd i32* %94, i32 %93; - i32 %96 = add i32 %90, i32 1; - i1 %97 = icmp sgt i32 %4, i32 %96; - cbr i1 %97(prob = 0.875), ^while.body4, ^while.header; + i32 %70 = phi [^scalar.header, i32 %58] [^while.body4, i32 %76]; + i32* %71 = getelementptr &([1024 * i32]* %14)[i64 0][i32 %70]; + i32 %72 = load i32* %71; + i32 %73 = mul i32 %22, i32 %72; + i32* %74 = getelementptr &([1024 * i32]* %29)[i64 0][i32 %70]; + atomicadd i32* %74, i32 %73; + i32 %76 = add i32 %70, i32 1; + i1 %77 = icmp sgt i32 %4, i32 %76; + cbr i1 %77(prob = 0.75), ^while.body4, ^while.header; } internal [4 * i8]* @cmmc_parallel_body_payload_4, align 8; diff --git a/tests/SysY2022/performance/02_mv1.arm.s b/tests/SysY2022/performance/02_mv1.arm.s index 537f06715..63b5810d2 100644 --- a/tests/SysY2022/performance/02_mv1.arm.s +++ b/tests/SysY2022/performance/02_mv1.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 A: .zero 16160400 -.align 8 +.p2align 3 B: .zero 8040 -.align 8 +.p2align 3 C: .zero 8040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/02_mv1.riscv.s b/tests/SysY2022/performance/02_mv1.riscv.s index e090a7b6a..0a09ecafa 100644 --- a/tests/SysY2022/performance/02_mv1.riscv.s +++ b/tests/SysY2022/performance/02_mv1.riscv.s @@ -1,79 +1,73 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 A: .zero 16160400 -.align 8 +.p2align 3 B: .zero 8040 -.align 8 +.p2align 3 C: .zero 8040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 4 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[24] CalleeSaved[104] - addi sp, sp, -128 + # stack usage: CalleeArg[0] Local[0] RegSpill[8] CalleeSaved[104] + addi sp, sp, -112 sd ra, 0(sp) sd s11, 8(sp) sd s0, 16(sp) sd s5, 24(sp) sd s1, 32(sp) sd s6, 40(sp) - sd s8, 48(sp) - sd s2, 56(sp) - sd s3, 64(sp) + sd s2, 48(sp) + sd s3, 56(sp) + sd s8, 64(sp) sd s4, 72(sp) sd s7, 80(sp) sd s9, 88(sp) sd s10, 96(sp) jal getint -pcrel524: - auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - li s2, 50 +pcrel520: + auipc s6, %pcrel_hi(cmmc_parallel_body_payload_1) mv s11, a0 -pcrel525: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel526: - auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel527: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel528: - auipc a3, %pcrel_hi(cmmc_parallel_body_0) -pcrel529: - auipc a1, %pcrel_hi(cmmc_parallel_body_1) sd a0, 104(sp) - addi a2, a3, %pcrel_lo(pcrel528) -pcrel530: - auipc a0, %pcrel_hi(cmmc_parallel_body_2) - sd a2, 112(sp) - addi s0, a0, %pcrel_lo(pcrel530) - addi a2, a1, %pcrel_lo(pcrel529) +pcrel521: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) + li s4, 50 +pcrel522: + auipc a1, %pcrel_hi(cmmc_parallel_body_0) +pcrel523: + auipc a2, %pcrel_hi(cmmc_parallel_body_3) +pcrel524: + auipc a0, %pcrel_hi(cmmc_parallel_body_1) + addi s0, a1, %pcrel_lo(pcrel522) + addi s3, a2, %pcrel_lo(pcrel523) + addi s1, a0, %pcrel_lo(pcrel524) +pcrel525: + auipc a1, %pcrel_hi(cmmc_parallel_body_2) li 
a0, 1005 -pcrel531: - auipc a1, %pcrel_hi(cmmc_parallel_body_3) + addi s2, a1, %pcrel_lo(pcrel525) slli s8, a0, 3 - sd a2, 120(sp) - addi s1, a1, %pcrel_lo(pcrel531) ble s11, zero, label389 -pcrel532: +pcrel526: auipc a0, %pcrel_hi(A) mv s9, zero - addi s7, a0, %pcrel_lo(pcrel532) + addi s7, a0, %pcrel_lo(pcrel526) mv s10, s7 mv s11, zero j label405 @@ -98,10 +92,10 @@ label405: label389: ld s11, 104(sp) ble s11, zero, label390 -pcrel533: +pcrel527: auipc a1, %pcrel_hi(B) mv s8, zero - addi s7, a1, %pcrel_lo(pcrel533) + addi s7, a1, %pcrel_lo(pcrel527) .p2align 2 label397: jal getint @@ -116,45 +110,45 @@ label390: li a0, 59 jal _sysy_starttime mv s8, zero -pcrel534: +pcrel528: auipc a1, %pcrel_hi(C) - addi s7, a1, %pcrel_lo(pcrel534) + addi s7, a1, %pcrel_lo(pcrel528) .p2align 2 label391: ld s11, 104(sp) ble s11, zero, label394 mv a0, zero -pcrel535: - auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) - sw s11, %pcrel_lo(pcrel535)(s3) - ld a2, 112(sp) +pcrel529: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) + sw s11, %pcrel_lo(pcrel529)(s5) mv a1, s11 + mv a2, s0 jal cmmcParallelFor mv a0, zero -pcrel536: - auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - sw s11, %pcrel_lo(pcrel536)(s4) - ld a2, 120(sp) +pcrel530: + auipc s6, %pcrel_hi(cmmc_parallel_body_payload_1) + sw s11, %pcrel_lo(pcrel530)(s6) mv a1, s11 + mv a2, s1 jal cmmcParallelFor mv a0, zero -pcrel537: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) - sw s11, %pcrel_lo(pcrel537)(s5) +pcrel531: + auipc a3, %pcrel_hi(cmmc_parallel_body_payload_2) + sw s11, %pcrel_lo(pcrel531)(a3) mv a1, s11 - mv a2, s0 + mv a2, s2 jal cmmcParallelFor mv a0, zero -pcrel538: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_3) - sw s11, %pcrel_lo(pcrel538)(s6) +pcrel532: + auipc a3, %pcrel_hi(cmmc_parallel_body_payload_3) + sw s11, %pcrel_lo(pcrel532)(a3) mv a1, s11 - mv a2, s1 + mv a2, s3 jal cmmcParallelFor .p2align 2 label394: addiw s8, s8, 1 - blt s8, s2, label391 + blt s8, s4, label391 li a0, 67 jal _sysy_stoptime ld s11, 104(sp) @@ -168,14 +162,14 @@ label394: ld s5, 24(sp) ld s1, 32(sp) ld s6, 40(sp) - ld s8, 48(sp) - ld s2, 56(sp) - ld s3, 64(sp) + ld s2, 48(sp) + ld s3, 56(sp) + ld s8, 64(sp) ld s4, 72(sp) ld s7, 80(sp) ld s9, 88(sp) ld s10, 96(sp) - addi sp, sp, 128 + addi sp, sp, 112 ret .p2align 2 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/02_mv2.arm.s b/tests/SysY2022/performance/02_mv2.arm.s index 537f06715..63b5810d2 100644 --- a/tests/SysY2022/performance/02_mv2.arm.s +++ b/tests/SysY2022/performance/02_mv2.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 A: .zero 16160400 -.align 8 +.p2align 3 B: .zero 8040 -.align 8 +.p2align 3 C: .zero 8040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/02_mv2.riscv.s b/tests/SysY2022/performance/02_mv2.riscv.s index e090a7b6a..0a09ecafa 100644 --- a/tests/SysY2022/performance/02_mv2.riscv.s +++ b/tests/SysY2022/performance/02_mv2.riscv.s @@ -1,79 +1,73 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 A: .zero 16160400 -.align 8 +.p2align 3 B: .zero 8040 -.align 8 +.p2align 3 C: .zero 8040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 4 .text .p2align 2 .globl main 
main: - # stack usage: CalleeArg[0] Local[0] RegSpill[24] CalleeSaved[104] - addi sp, sp, -128 + # stack usage: CalleeArg[0] Local[0] RegSpill[8] CalleeSaved[104] + addi sp, sp, -112 sd ra, 0(sp) sd s11, 8(sp) sd s0, 16(sp) sd s5, 24(sp) sd s1, 32(sp) sd s6, 40(sp) - sd s8, 48(sp) - sd s2, 56(sp) - sd s3, 64(sp) + sd s2, 48(sp) + sd s3, 56(sp) + sd s8, 64(sp) sd s4, 72(sp) sd s7, 80(sp) sd s9, 88(sp) sd s10, 96(sp) jal getint -pcrel524: - auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - li s2, 50 +pcrel520: + auipc s6, %pcrel_hi(cmmc_parallel_body_payload_1) mv s11, a0 -pcrel525: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel526: - auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel527: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel528: - auipc a3, %pcrel_hi(cmmc_parallel_body_0) -pcrel529: - auipc a1, %pcrel_hi(cmmc_parallel_body_1) sd a0, 104(sp) - addi a2, a3, %pcrel_lo(pcrel528) -pcrel530: - auipc a0, %pcrel_hi(cmmc_parallel_body_2) - sd a2, 112(sp) - addi s0, a0, %pcrel_lo(pcrel530) - addi a2, a1, %pcrel_lo(pcrel529) +pcrel521: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) + li s4, 50 +pcrel522: + auipc a1, %pcrel_hi(cmmc_parallel_body_0) +pcrel523: + auipc a2, %pcrel_hi(cmmc_parallel_body_3) +pcrel524: + auipc a0, %pcrel_hi(cmmc_parallel_body_1) + addi s0, a1, %pcrel_lo(pcrel522) + addi s3, a2, %pcrel_lo(pcrel523) + addi s1, a0, %pcrel_lo(pcrel524) +pcrel525: + auipc a1, %pcrel_hi(cmmc_parallel_body_2) li a0, 1005 -pcrel531: - auipc a1, %pcrel_hi(cmmc_parallel_body_3) + addi s2, a1, %pcrel_lo(pcrel525) slli s8, a0, 3 - sd a2, 120(sp) - addi s1, a1, %pcrel_lo(pcrel531) ble s11, zero, label389 -pcrel532: +pcrel526: auipc a0, %pcrel_hi(A) mv s9, zero - addi s7, a0, %pcrel_lo(pcrel532) + addi s7, a0, %pcrel_lo(pcrel526) mv s10, s7 mv s11, zero j label405 @@ -98,10 +92,10 @@ label405: label389: ld s11, 104(sp) ble s11, zero, label390 -pcrel533: +pcrel527: auipc a1, %pcrel_hi(B) mv s8, zero - addi s7, a1, %pcrel_lo(pcrel533) + addi s7, a1, %pcrel_lo(pcrel527) .p2align 2 label397: jal getint @@ -116,45 +110,45 @@ label390: li a0, 59 jal _sysy_starttime mv s8, zero -pcrel534: +pcrel528: auipc a1, %pcrel_hi(C) - addi s7, a1, %pcrel_lo(pcrel534) + addi s7, a1, %pcrel_lo(pcrel528) .p2align 2 label391: ld s11, 104(sp) ble s11, zero, label394 mv a0, zero -pcrel535: - auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) - sw s11, %pcrel_lo(pcrel535)(s3) - ld a2, 112(sp) +pcrel529: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) + sw s11, %pcrel_lo(pcrel529)(s5) mv a1, s11 + mv a2, s0 jal cmmcParallelFor mv a0, zero -pcrel536: - auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - sw s11, %pcrel_lo(pcrel536)(s4) - ld a2, 120(sp) +pcrel530: + auipc s6, %pcrel_hi(cmmc_parallel_body_payload_1) + sw s11, %pcrel_lo(pcrel530)(s6) mv a1, s11 + mv a2, s1 jal cmmcParallelFor mv a0, zero -pcrel537: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) - sw s11, %pcrel_lo(pcrel537)(s5) +pcrel531: + auipc a3, %pcrel_hi(cmmc_parallel_body_payload_2) + sw s11, %pcrel_lo(pcrel531)(a3) mv a1, s11 - mv a2, s0 + mv a2, s2 jal cmmcParallelFor mv a0, zero -pcrel538: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_3) - sw s11, %pcrel_lo(pcrel538)(s6) +pcrel532: + auipc a3, %pcrel_hi(cmmc_parallel_body_payload_3) + sw s11, %pcrel_lo(pcrel532)(a3) mv a1, s11 - mv a2, s1 + mv a2, s3 jal cmmcParallelFor .p2align 2 label394: addiw s8, s8, 1 - blt s8, s2, label391 + blt s8, s4, label391 li a0, 67 jal _sysy_stoptime ld s11, 104(sp) @@ -168,14 +162,14 @@ label394: ld s5, 24(sp) ld s1, 
32(sp) ld s6, 40(sp) - ld s8, 48(sp) - ld s2, 56(sp) - ld s3, 64(sp) + ld s2, 48(sp) + ld s3, 56(sp) + ld s8, 64(sp) ld s4, 72(sp) ld s7, 80(sp) ld s9, 88(sp) ld s10, 96(sp) - addi sp, sp, 128 + addi sp, sp, 112 ret .p2align 2 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/02_mv3.arm.s b/tests/SysY2022/performance/02_mv3.arm.s index 537f06715..63b5810d2 100644 --- a/tests/SysY2022/performance/02_mv3.arm.s +++ b/tests/SysY2022/performance/02_mv3.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 A: .zero 16160400 -.align 8 +.p2align 3 B: .zero 8040 -.align 8 +.p2align 3 C: .zero 8040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/02_mv3.riscv.s b/tests/SysY2022/performance/02_mv3.riscv.s index e090a7b6a..0a09ecafa 100644 --- a/tests/SysY2022/performance/02_mv3.riscv.s +++ b/tests/SysY2022/performance/02_mv3.riscv.s @@ -1,79 +1,73 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 A: .zero 16160400 -.align 8 +.p2align 3 B: .zero 8040 -.align 8 +.p2align 3 C: .zero 8040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 4 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[24] CalleeSaved[104] - addi sp, sp, -128 + # stack usage: CalleeArg[0] Local[0] RegSpill[8] CalleeSaved[104] + addi sp, sp, -112 sd ra, 0(sp) sd s11, 8(sp) sd s0, 16(sp) sd s5, 24(sp) sd s1, 32(sp) sd s6, 40(sp) - sd s8, 48(sp) - sd s2, 56(sp) - sd s3, 64(sp) + sd s2, 48(sp) + sd s3, 56(sp) + sd s8, 64(sp) sd s4, 72(sp) sd s7, 80(sp) sd s9, 88(sp) sd s10, 96(sp) jal getint -pcrel524: - auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - li s2, 50 +pcrel520: + auipc s6, %pcrel_hi(cmmc_parallel_body_payload_1) mv s11, a0 -pcrel525: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel526: - auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel527: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel528: - auipc a3, %pcrel_hi(cmmc_parallel_body_0) -pcrel529: - auipc a1, %pcrel_hi(cmmc_parallel_body_1) sd a0, 104(sp) - addi a2, a3, %pcrel_lo(pcrel528) -pcrel530: - auipc a0, %pcrel_hi(cmmc_parallel_body_2) - sd a2, 112(sp) - addi s0, a0, %pcrel_lo(pcrel530) - addi a2, a1, %pcrel_lo(pcrel529) +pcrel521: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) + li s4, 50 +pcrel522: + auipc a1, %pcrel_hi(cmmc_parallel_body_0) +pcrel523: + auipc a2, %pcrel_hi(cmmc_parallel_body_3) +pcrel524: + auipc a0, %pcrel_hi(cmmc_parallel_body_1) + addi s0, a1, %pcrel_lo(pcrel522) + addi s3, a2, %pcrel_lo(pcrel523) + addi s1, a0, %pcrel_lo(pcrel524) +pcrel525: + auipc a1, %pcrel_hi(cmmc_parallel_body_2) li a0, 1005 -pcrel531: - auipc a1, %pcrel_hi(cmmc_parallel_body_3) + addi s2, a1, %pcrel_lo(pcrel525) slli s8, a0, 3 - sd a2, 120(sp) - addi s1, a1, %pcrel_lo(pcrel531) ble s11, zero, label389 -pcrel532: +pcrel526: auipc a0, %pcrel_hi(A) mv s9, zero - addi s7, a0, %pcrel_lo(pcrel532) + addi s7, a0, %pcrel_lo(pcrel526) mv s10, s7 mv s11, zero j label405 @@ -98,10 +92,10 @@ label405: label389: ld s11, 104(sp) ble s11, zero, label390 -pcrel533: +pcrel527: auipc a1, %pcrel_hi(B) mv s8, zero - addi s7, a1, %pcrel_lo(pcrel533) + addi s7, a1, %pcrel_lo(pcrel527) .p2align 2 label397: jal getint @@ -116,45 
+110,45 @@ label390: li a0, 59 jal _sysy_starttime mv s8, zero -pcrel534: +pcrel528: auipc a1, %pcrel_hi(C) - addi s7, a1, %pcrel_lo(pcrel534) + addi s7, a1, %pcrel_lo(pcrel528) .p2align 2 label391: ld s11, 104(sp) ble s11, zero, label394 mv a0, zero -pcrel535: - auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) - sw s11, %pcrel_lo(pcrel535)(s3) - ld a2, 112(sp) +pcrel529: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) + sw s11, %pcrel_lo(pcrel529)(s5) mv a1, s11 + mv a2, s0 jal cmmcParallelFor mv a0, zero -pcrel536: - auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) - sw s11, %pcrel_lo(pcrel536)(s4) - ld a2, 120(sp) +pcrel530: + auipc s6, %pcrel_hi(cmmc_parallel_body_payload_1) + sw s11, %pcrel_lo(pcrel530)(s6) mv a1, s11 + mv a2, s1 jal cmmcParallelFor mv a0, zero -pcrel537: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) - sw s11, %pcrel_lo(pcrel537)(s5) +pcrel531: + auipc a3, %pcrel_hi(cmmc_parallel_body_payload_2) + sw s11, %pcrel_lo(pcrel531)(a3) mv a1, s11 - mv a2, s0 + mv a2, s2 jal cmmcParallelFor mv a0, zero -pcrel538: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_3) - sw s11, %pcrel_lo(pcrel538)(s6) +pcrel532: + auipc a3, %pcrel_hi(cmmc_parallel_body_payload_3) + sw s11, %pcrel_lo(pcrel532)(a3) mv a1, s11 - mv a2, s1 + mv a2, s3 jal cmmcParallelFor .p2align 2 label394: addiw s8, s8, 1 - blt s8, s2, label391 + blt s8, s4, label391 li a0, 67 jal _sysy_stoptime ld s11, 104(sp) @@ -168,14 +162,14 @@ label394: ld s5, 24(sp) ld s1, 32(sp) ld s6, 40(sp) - ld s8, 48(sp) - ld s2, 56(sp) - ld s3, 64(sp) + ld s2, 48(sp) + ld s3, 56(sp) + ld s8, 64(sp) ld s4, 72(sp) ld s7, 80(sp) ld s9, 88(sp) ld s10, 96(sp) - addi sp, sp, 128 + addi sp, sp, 112 ret .p2align 2 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/03_sort1.arm.s b/tests/SysY2022/performance/03_sort1.arm.s index 74ceb3be3..71936b33d 100644 --- a/tests/SysY2022/performance/03_sort1.arm.s +++ b/tests/SysY2022/performance/03_sort1.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 120000040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 12 .text diff --git a/tests/SysY2022/performance/03_sort1.riscv.s b/tests/SysY2022/performance/03_sort1.riscv.s index 7534a81ec..beb487351 100644 --- a/tests/SysY2022/performance/03_sort1.riscv.s +++ b/tests/SysY2022/performance/03_sort1.riscv.s @@ -1,10 +1,10 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 120000040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 12 .text diff --git a/tests/SysY2022/performance/03_sort2.arm.s b/tests/SysY2022/performance/03_sort2.arm.s index 74ceb3be3..71936b33d 100644 --- a/tests/SysY2022/performance/03_sort2.arm.s +++ b/tests/SysY2022/performance/03_sort2.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 120000040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 12 .text diff --git a/tests/SysY2022/performance/03_sort2.riscv.s b/tests/SysY2022/performance/03_sort2.riscv.s index 7534a81ec..beb487351 100644 --- a/tests/SysY2022/performance/03_sort2.riscv.s +++ b/tests/SysY2022/performance/03_sort2.riscv.s @@ -1,10 +1,10 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 120000040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 12 .text diff --git a/tests/SysY2022/performance/03_sort3.arm.s b/tests/SysY2022/performance/03_sort3.arm.s index 74ceb3be3..71936b33d 100644 --- 
a/tests/SysY2022/performance/03_sort3.arm.s +++ b/tests/SysY2022/performance/03_sort3.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 120000040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 12 .text diff --git a/tests/SysY2022/performance/03_sort3.riscv.s b/tests/SysY2022/performance/03_sort3.riscv.s index 7534a81ec..beb487351 100644 --- a/tests/SysY2022/performance/03_sort3.riscv.s +++ b/tests/SysY2022/performance/03_sort3.riscv.s @@ -1,10 +1,10 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 120000040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 12 .text diff --git a/tests/SysY2022/performance/04_spmv1.arm.s b/tests/SysY2022/performance/04_spmv1.arm.s index 2c859f215..554764091 100644 --- a/tests/SysY2022/performance/04_spmv1.arm.s +++ b/tests/SysY2022/performance/04_spmv1.arm.s @@ -1,25 +1,25 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 x: .zero 400040 -.align 8 +.p2align 3 y: .zero 12000000 -.align 8 +.p2align 3 v: .zero 12000000 -.align 8 +.p2align 3 a: .zero 400040 -.align 8 +.p2align 3 b: .zero 400040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/04_spmv1.riscv.s b/tests/SysY2022/performance/04_spmv1.riscv.s index e54d39066..e32eb5504 100644 --- a/tests/SysY2022/performance/04_spmv1.riscv.s +++ b/tests/SysY2022/performance/04_spmv1.riscv.s @@ -1,124 +1,114 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 x: .zero 400040 -.align 8 +.p2align 3 y: .zero 12000000 -.align 8 +.p2align 3 v: .zero 12000000 -.align 8 +.p2align 3 a: .zero 400040 -.align 8 +.p2align 3 b: .zero 400040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 4 .text .p2align 2 .globl main main: - addi sp, sp, -88 -pcrel805: + addi sp, sp, -56 +pcrel795: auipc a1, %pcrel_hi(x) sd ra, 0(sp) - addi a0, a1, %pcrel_lo(pcrel805) + addi a0, a1, %pcrel_lo(pcrel795) sd s1, 8(sp) - sd s6, 16(sp) - sd s0, 24(sp) - sd s5, 32(sp) - sd s4, 40(sp) - sd s9, 48(sp) - sd s2, 56(sp) - sd s3, 64(sp) - sd s7, 72(sp) - sd s8, 80(sp) + sd s0, 16(sp) + sd s5, 24(sp) + sd s2, 32(sp) + sd s3, 40(sp) + sd s4, 48(sp) jal getarray -pcrel806: +pcrel796: auipc a1, %pcrel_hi(y) mv s1, a0 - addi a0, a1, %pcrel_lo(pcrel806) + addi a0, a1, %pcrel_lo(pcrel796) jal getarray -pcrel807: +pcrel797: auipc a1, %pcrel_hi(v) - addi a0, a1, %pcrel_lo(pcrel807) + addi a0, a1, %pcrel_lo(pcrel797) jal getarray -pcrel808: +pcrel798: auipc a1, %pcrel_hi(a) - addi a0, a1, %pcrel_lo(pcrel808) + addi a0, a1, %pcrel_lo(pcrel798) jal getarray li a0, 39 jal _sysy_starttime -pcrel809: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel810: - auipc a0, %pcrel_hi(b) -pcrel811: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) - mv s9, zero -pcrel812: +pcrel799: auipc a1, %pcrel_hi(cmmc_parallel_body_0) - li s5, 100 + mv s5, zero +pcrel800: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_0) + li s3, 100 +pcrel801: + auipc a0, %pcrel_hi(b) addiw s0, s1, -1 -pcrel813: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_0) - addi s4, a0, %pcrel_lo(pcrel810) - addi s1, a1, %pcrel_lo(pcrel812) -pcrel814: - auipc a0, %pcrel_hi(cmmc_parallel_body_1) 
-pcrel815: - auipc a1, %pcrel_hi(cmmc_parallel_body_2) - addi s2, a0, %pcrel_lo(pcrel814) - addi s3, a1, %pcrel_lo(pcrel815) + addi s2, a0, %pcrel_lo(pcrel801) + addi s1, a1, %pcrel_lo(pcrel799) j label719 .p2align 2 label722: - addiw s9, s9, 1 - bge s9, s5, label723 + addiw s5, s5, 1 + bge s5, s3, label723 .p2align 2 label719: ble s0, zero, label722 -pcrel816: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_0) - sw s0, %pcrel_lo(pcrel816)(s6) +pcrel802: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_0) + sw s0, %pcrel_lo(pcrel802)(s4) mv a0, zero mv a1, s0 mv a2, s1 jal cmmcParallelFor - mv a0, zero -pcrel817: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) - sw s0, %pcrel_lo(pcrel817)(s7) +pcrel803: + auipc a3, %pcrel_hi(cmmc_parallel_body_1) +pcrel804: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_1) + addi a2, a3, %pcrel_lo(pcrel803) + sw s0, %pcrel_lo(pcrel804)(a0) mv a1, s0 - mv a2, s2 - jal cmmcParallelFor mv a0, zero -pcrel818: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_2) - sw s0, %pcrel_lo(pcrel818)(s8) + jal cmmcParallelFor +pcrel805: + auipc a3, %pcrel_hi(cmmc_parallel_body_2) +pcrel806: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) + addi a2, a3, %pcrel_lo(pcrel805) + sw s0, %pcrel_lo(pcrel806)(a0) mv a1, s0 - mv a2, s3 + mv a0, zero jal cmmcParallelFor -pcrel819: +pcrel807: auipc a3, %pcrel_hi(cmmc_parallel_body_3) -pcrel820: +pcrel808: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - addi a2, a3, %pcrel_lo(pcrel819) - sw s0, %pcrel_lo(pcrel820)(a0) + addi a2, a3, %pcrel_lo(pcrel807) + sw s0, %pcrel_lo(pcrel808)(a0) mv a1, s0 mv a0, zero jal cmmcParallelFor @@ -127,21 +117,17 @@ label723: li a0, 47 jal _sysy_stoptime mv a0, s0 - mv a1, s4 + mv a1, s2 jal putarray - mv a0, zero ld ra, 0(sp) + mv a0, zero ld s1, 8(sp) - ld s6, 16(sp) - ld s0, 24(sp) - ld s5, 32(sp) - ld s4, 40(sp) - ld s9, 48(sp) - ld s2, 56(sp) - ld s3, 64(sp) - ld s7, 72(sp) - ld s8, 80(sp) - addi sp, sp, 88 + ld s0, 16(sp) + ld s5, 24(sp) + ld s2, 32(sp) + ld s3, 40(sp) + ld s4, 48(sp) + addi sp, sp, 56 ret .p2align 2 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/04_spmv2.arm.s b/tests/SysY2022/performance/04_spmv2.arm.s index 2c859f215..554764091 100644 --- a/tests/SysY2022/performance/04_spmv2.arm.s +++ b/tests/SysY2022/performance/04_spmv2.arm.s @@ -1,25 +1,25 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 x: .zero 400040 -.align 8 +.p2align 3 y: .zero 12000000 -.align 8 +.p2align 3 v: .zero 12000000 -.align 8 +.p2align 3 a: .zero 400040 -.align 8 +.p2align 3 b: .zero 400040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/04_spmv2.riscv.s b/tests/SysY2022/performance/04_spmv2.riscv.s index e54d39066..e32eb5504 100644 --- a/tests/SysY2022/performance/04_spmv2.riscv.s +++ b/tests/SysY2022/performance/04_spmv2.riscv.s @@ -1,124 +1,114 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 x: .zero 400040 -.align 8 +.p2align 3 y: .zero 12000000 -.align 8 +.p2align 3 v: .zero 12000000 -.align 8 +.p2align 3 a: .zero 400040 -.align 8 +.p2align 3 b: .zero 400040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 4 .text .p2align 2 .globl main main: - addi sp, sp, -88 -pcrel805: + addi sp, sp, -56 
+pcrel795: auipc a1, %pcrel_hi(x) sd ra, 0(sp) - addi a0, a1, %pcrel_lo(pcrel805) + addi a0, a1, %pcrel_lo(pcrel795) sd s1, 8(sp) - sd s6, 16(sp) - sd s0, 24(sp) - sd s5, 32(sp) - sd s4, 40(sp) - sd s9, 48(sp) - sd s2, 56(sp) - sd s3, 64(sp) - sd s7, 72(sp) - sd s8, 80(sp) + sd s0, 16(sp) + sd s5, 24(sp) + sd s2, 32(sp) + sd s3, 40(sp) + sd s4, 48(sp) jal getarray -pcrel806: +pcrel796: auipc a1, %pcrel_hi(y) mv s1, a0 - addi a0, a1, %pcrel_lo(pcrel806) + addi a0, a1, %pcrel_lo(pcrel796) jal getarray -pcrel807: +pcrel797: auipc a1, %pcrel_hi(v) - addi a0, a1, %pcrel_lo(pcrel807) + addi a0, a1, %pcrel_lo(pcrel797) jal getarray -pcrel808: +pcrel798: auipc a1, %pcrel_hi(a) - addi a0, a1, %pcrel_lo(pcrel808) + addi a0, a1, %pcrel_lo(pcrel798) jal getarray li a0, 39 jal _sysy_starttime -pcrel809: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel810: - auipc a0, %pcrel_hi(b) -pcrel811: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) - mv s9, zero -pcrel812: +pcrel799: auipc a1, %pcrel_hi(cmmc_parallel_body_0) - li s5, 100 + mv s5, zero +pcrel800: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_0) + li s3, 100 +pcrel801: + auipc a0, %pcrel_hi(b) addiw s0, s1, -1 -pcrel813: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_0) - addi s4, a0, %pcrel_lo(pcrel810) - addi s1, a1, %pcrel_lo(pcrel812) -pcrel814: - auipc a0, %pcrel_hi(cmmc_parallel_body_1) -pcrel815: - auipc a1, %pcrel_hi(cmmc_parallel_body_2) - addi s2, a0, %pcrel_lo(pcrel814) - addi s3, a1, %pcrel_lo(pcrel815) + addi s2, a0, %pcrel_lo(pcrel801) + addi s1, a1, %pcrel_lo(pcrel799) j label719 .p2align 2 label722: - addiw s9, s9, 1 - bge s9, s5, label723 + addiw s5, s5, 1 + bge s5, s3, label723 .p2align 2 label719: ble s0, zero, label722 -pcrel816: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_0) - sw s0, %pcrel_lo(pcrel816)(s6) +pcrel802: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_0) + sw s0, %pcrel_lo(pcrel802)(s4) mv a0, zero mv a1, s0 mv a2, s1 jal cmmcParallelFor - mv a0, zero -pcrel817: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) - sw s0, %pcrel_lo(pcrel817)(s7) +pcrel803: + auipc a3, %pcrel_hi(cmmc_parallel_body_1) +pcrel804: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_1) + addi a2, a3, %pcrel_lo(pcrel803) + sw s0, %pcrel_lo(pcrel804)(a0) mv a1, s0 - mv a2, s2 - jal cmmcParallelFor mv a0, zero -pcrel818: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_2) - sw s0, %pcrel_lo(pcrel818)(s8) + jal cmmcParallelFor +pcrel805: + auipc a3, %pcrel_hi(cmmc_parallel_body_2) +pcrel806: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) + addi a2, a3, %pcrel_lo(pcrel805) + sw s0, %pcrel_lo(pcrel806)(a0) mv a1, s0 - mv a2, s3 + mv a0, zero jal cmmcParallelFor -pcrel819: +pcrel807: auipc a3, %pcrel_hi(cmmc_parallel_body_3) -pcrel820: +pcrel808: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - addi a2, a3, %pcrel_lo(pcrel819) - sw s0, %pcrel_lo(pcrel820)(a0) + addi a2, a3, %pcrel_lo(pcrel807) + sw s0, %pcrel_lo(pcrel808)(a0) mv a1, s0 mv a0, zero jal cmmcParallelFor @@ -127,21 +117,17 @@ label723: li a0, 47 jal _sysy_stoptime mv a0, s0 - mv a1, s4 + mv a1, s2 jal putarray - mv a0, zero ld ra, 0(sp) + mv a0, zero ld s1, 8(sp) - ld s6, 16(sp) - ld s0, 24(sp) - ld s5, 32(sp) - ld s4, 40(sp) - ld s9, 48(sp) - ld s2, 56(sp) - ld s3, 64(sp) - ld s7, 72(sp) - ld s8, 80(sp) - addi sp, sp, 88 + ld s0, 16(sp) + ld s5, 24(sp) + ld s2, 32(sp) + ld s3, 40(sp) + ld s4, 48(sp) + addi sp, sp, 56 ret .p2align 2 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/04_spmv3.arm.s 
b/tests/SysY2022/performance/04_spmv3.arm.s index 2c859f215..554764091 100644 --- a/tests/SysY2022/performance/04_spmv3.arm.s +++ b/tests/SysY2022/performance/04_spmv3.arm.s @@ -1,25 +1,25 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 x: .zero 400040 -.align 8 +.p2align 3 y: .zero 12000000 -.align 8 +.p2align 3 v: .zero 12000000 -.align 8 +.p2align 3 a: .zero 400040 -.align 8 +.p2align 3 b: .zero 400040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/04_spmv3.riscv.s b/tests/SysY2022/performance/04_spmv3.riscv.s index e54d39066..e32eb5504 100644 --- a/tests/SysY2022/performance/04_spmv3.riscv.s +++ b/tests/SysY2022/performance/04_spmv3.riscv.s @@ -1,124 +1,114 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 x: .zero 400040 -.align 8 +.p2align 3 y: .zero 12000000 -.align 8 +.p2align 3 v: .zero 12000000 -.align 8 +.p2align 3 a: .zero 400040 -.align 8 +.p2align 3 b: .zero 400040 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 4 .text .p2align 2 .globl main main: - addi sp, sp, -88 -pcrel805: + addi sp, sp, -56 +pcrel795: auipc a1, %pcrel_hi(x) sd ra, 0(sp) - addi a0, a1, %pcrel_lo(pcrel805) + addi a0, a1, %pcrel_lo(pcrel795) sd s1, 8(sp) - sd s6, 16(sp) - sd s0, 24(sp) - sd s5, 32(sp) - sd s4, 40(sp) - sd s9, 48(sp) - sd s2, 56(sp) - sd s3, 64(sp) - sd s7, 72(sp) - sd s8, 80(sp) + sd s0, 16(sp) + sd s5, 24(sp) + sd s2, 32(sp) + sd s3, 40(sp) + sd s4, 48(sp) jal getarray -pcrel806: +pcrel796: auipc a1, %pcrel_hi(y) mv s1, a0 - addi a0, a1, %pcrel_lo(pcrel806) + addi a0, a1, %pcrel_lo(pcrel796) jal getarray -pcrel807: +pcrel797: auipc a1, %pcrel_hi(v) - addi a0, a1, %pcrel_lo(pcrel807) + addi a0, a1, %pcrel_lo(pcrel797) jal getarray -pcrel808: +pcrel798: auipc a1, %pcrel_hi(a) - addi a0, a1, %pcrel_lo(pcrel808) + addi a0, a1, %pcrel_lo(pcrel798) jal getarray li a0, 39 jal _sysy_starttime -pcrel809: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel810: - auipc a0, %pcrel_hi(b) -pcrel811: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) - mv s9, zero -pcrel812: +pcrel799: auipc a1, %pcrel_hi(cmmc_parallel_body_0) - li s5, 100 + mv s5, zero +pcrel800: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_0) + li s3, 100 +pcrel801: + auipc a0, %pcrel_hi(b) addiw s0, s1, -1 -pcrel813: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_0) - addi s4, a0, %pcrel_lo(pcrel810) - addi s1, a1, %pcrel_lo(pcrel812) -pcrel814: - auipc a0, %pcrel_hi(cmmc_parallel_body_1) -pcrel815: - auipc a1, %pcrel_hi(cmmc_parallel_body_2) - addi s2, a0, %pcrel_lo(pcrel814) - addi s3, a1, %pcrel_lo(pcrel815) + addi s2, a0, %pcrel_lo(pcrel801) + addi s1, a1, %pcrel_lo(pcrel799) j label719 .p2align 2 label722: - addiw s9, s9, 1 - bge s9, s5, label723 + addiw s5, s5, 1 + bge s5, s3, label723 .p2align 2 label719: ble s0, zero, label722 -pcrel816: - auipc s6, %pcrel_hi(cmmc_parallel_body_payload_0) - sw s0, %pcrel_lo(pcrel816)(s6) +pcrel802: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_0) + sw s0, %pcrel_lo(pcrel802)(s4) mv a0, zero mv a1, s0 mv a2, s1 jal cmmcParallelFor - mv a0, zero -pcrel817: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) - sw s0, %pcrel_lo(pcrel817)(s7) +pcrel803: + auipc a3, %pcrel_hi(cmmc_parallel_body_1) 
+pcrel804: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_1) + addi a2, a3, %pcrel_lo(pcrel803) + sw s0, %pcrel_lo(pcrel804)(a0) mv a1, s0 - mv a2, s2 - jal cmmcParallelFor mv a0, zero -pcrel818: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_2) - sw s0, %pcrel_lo(pcrel818)(s8) + jal cmmcParallelFor +pcrel805: + auipc a3, %pcrel_hi(cmmc_parallel_body_2) +pcrel806: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) + addi a2, a3, %pcrel_lo(pcrel805) + sw s0, %pcrel_lo(pcrel806)(a0) mv a1, s0 - mv a2, s3 + mv a0, zero jal cmmcParallelFor -pcrel819: +pcrel807: auipc a3, %pcrel_hi(cmmc_parallel_body_3) -pcrel820: +pcrel808: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - addi a2, a3, %pcrel_lo(pcrel819) - sw s0, %pcrel_lo(pcrel820)(a0) + addi a2, a3, %pcrel_lo(pcrel807) + sw s0, %pcrel_lo(pcrel808)(a0) mv a1, s0 mv a0, zero jal cmmcParallelFor @@ -127,21 +117,17 @@ label723: li a0, 47 jal _sysy_stoptime mv a0, s0 - mv a1, s4 + mv a1, s2 jal putarray - mv a0, zero ld ra, 0(sp) + mv a0, zero ld s1, 8(sp) - ld s6, 16(sp) - ld s0, 24(sp) - ld s5, 32(sp) - ld s4, 40(sp) - ld s9, 48(sp) - ld s2, 56(sp) - ld s3, 64(sp) - ld s7, 72(sp) - ld s8, 80(sp) - addi sp, sp, 88 + ld s0, 16(sp) + ld s5, 24(sp) + ld s2, 32(sp) + ld s3, 40(sp) + ld s4, 48(sp) + addi sp, sp, 56 ret .p2align 2 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/brainfuck-bootstrap.arm.s b/tests/SysY2022/performance/brainfuck-bootstrap.arm.s index d3f9c2fb3..fd9d3afce 100644 --- a/tests/SysY2022/performance/brainfuck-bootstrap.arm.s +++ b/tests/SysY2022/performance/brainfuck-bootstrap.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 program: .zero 262144 -.align 8 +.p2align 3 tape: .zero 262144 -.align 8 +.p2align 3 input: .zero 262144 -.align 8 +.p2align 3 output: .zero 262144 -.align 8 +.p2align 3 return_a: .zero 2048 .text diff --git a/tests/SysY2022/performance/brainfuck-bootstrap.riscv.s b/tests/SysY2022/performance/brainfuck-bootstrap.riscv.s index a00802beb..2cf74a791 100644 --- a/tests/SysY2022/performance/brainfuck-bootstrap.riscv.s +++ b/tests/SysY2022/performance/brainfuck-bootstrap.riscv.s @@ -1,273 +1,311 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 program: .zero 262144 -.align 8 +.p2align 3 tape: .zero 262144 -.align 8 +.p2align 3 input: .zero 262144 -.align 8 +.p2align 3 output: .zero 262144 -.align 8 +.p2align 3 return_a: .zero 2048 .text .p2align 2 .globl main main: - addi sp, sp, -80 + addi sp, sp, -104 sd ra, 0(sp) sd s0, 8(sp) li s0, 1 sd s5, 16(sp) - li s5, 360287970357415681 - sd s1, 24(sp) + li s5, 45 + sd s8, 24(sp) + li s8, 360287970357415681 + sd s1, 32(sp) li s1, 93 - sd s6, 32(sp) - sd s4, 40(sp) - li s4, 512 - sd s3, 48(sp) + sd s6, 40(sp) + sd s9, 48(sp) + li s9, 35 sd s2, 56(sp) - sd s7, 64(sp) - sd s8, 72(sp) + li s2, 62 + sd s3, 64(sp) + li s3, 60 + sd s4, 72(sp) + li s4, 43 + sd s7, 80(sp) + sd s10, 88(sp) + sd s11, 96(sp) .p2align 2 label2: jal getch addiw a1, a0, -35 - slti a4, a1, 0 - sll a3, s0, a1 - and a5, a3, s5 - sltiu a2, a5, 1 - slt a5, s1, a0 - or a3, a2, a4 - or a1, a3, a5 + slti a5, a1, 0 + sll a2, s0, a1 + and a4, a2, s8 + sltiu a3, a4, 1 + slt a4, s1, a0 + or a2, a3, a5 + or a1, a2, a4 bne a1, zero, label2 -pcrel340: +pcrel479: auipc a1, %pcrel_hi(input) -pcrel341: +pcrel480: auipc a2, %pcrel_hi(program) - addi s3, a1, %pcrel_lo(pcrel340) - addi s2, a2, %pcrel_lo(pcrel341) - li a1, 35 - beq a0, a1, label87 - mv s6, s2 - mv s7, zero - sw a0, 0(s2) + addi s7, a1, 
%pcrel_lo(pcrel479) + addi s6, a2, %pcrel_lo(pcrel480) + beq a0, s9, label87 + mv s10, s6 + mv s11, zero + sw a0, 0(s6) j label66 .p2align 2 label68: - addi s6, s6, 4 - sw a0, 0(s6) + addi s10, s10, 4 + sw a0, 0(s10) .p2align 2 label66: jal getch addiw a1, a0, -35 slti a5, a1, 0 sll a2, s0, a1 - and a4, a2, s5 + and a4, a2, s8 sltiu a3, a4, 1 slt a4, s1, a0 or a2, a3, a5 or a1, a2, a4 bne a1, zero, label66 - addiw s7, s7, 1 - li a1, 35 - bne a0, a1, label68 - mv s5, s7 + addiw s11, s11, 1 + bne a0, s9, label68 + mv s8, s11 label4: jal getch li a1, 105 - bne a0, a1, label93 - jal getint - mv s6, a0 - jal getch - ble s6, zero, label12 - mv s7, s3 - mv s8, zero -.p2align 2 -label8: - jal getch - addiw s8, s8, 1 - sw a0, 0(s7) - ble s6, s8, label12 - addi s7, s7, 4 - j label8 -label93: - mv s6, zero + beq a0, a1, label6 + mv s9, zero label12: li a0, 116 jal _sysy_starttime mv a2, zero -pcrel342: +pcrel481: auipc a3, %pcrel_hi(return_a) - addi a0, a3, %pcrel_lo(pcrel342) + addi a0, a3, %pcrel_lo(pcrel481) mv a1, a0 - j label14 -.p2align 2 -label17: - addi a1, a1, 4 .p2align 2 label14: - addiw a2, a2, 1 - sw zero, 0(a1) - blt a2, s4, label17 -pcrel343: - auipc a3, %pcrel_hi(tape) -pcrel344: - auipc a2, %pcrel_hi(output) - addi a1, a3, %pcrel_lo(pcrel343) - addi s4, a2, %pcrel_lo(pcrel344) - ble s5, zero, label121 + sd zero, 0(a1) + addiw a2, a2, 64 + li a3, 512 + sd zero, 8(a1) + sd zero, 16(a1) + sd zero, 24(a1) + sd zero, 32(a1) + sd zero, 40(a1) + sd zero, 48(a1) + sd zero, 56(a1) + sd zero, 64(a1) + sd zero, 72(a1) + sd zero, 80(a1) + sd zero, 88(a1) + sd zero, 96(a1) + sd zero, 104(a1) + sd zero, 112(a1) + sd zero, 120(a1) + sd zero, 128(a1) + sd zero, 136(a1) + sd zero, 144(a1) + sd zero, 152(a1) + sd zero, 160(a1) + sd zero, 168(a1) + sd zero, 176(a1) + sd zero, 184(a1) + sd zero, 192(a1) + sd zero, 200(a1) + sd zero, 208(a1) + sd zero, 216(a1) + sd zero, 224(a1) + sd zero, 232(a1) + sd zero, 240(a1) + sd zero, 248(a1) + bge a2, a3, label17 + addi a1, a1, 256 + j label14 +label17: + auipc a2, %pcrel_hi(tape) +pcrel482: + auipc a3, %pcrel_hi(output) + addi a1, a2, %pcrel_lo(label17) + addi s10, a3, %pcrel_lo(pcrel482) + ble s8, zero, label183 mv a2, zero mv t0, zero mv a4, zero mv a3, zero mv a5, zero - j label19 + j label18 .p2align 2 -label25: - li t2, 60 - bne t1, t2, label26 +label45: addiw a4, a4, -1 .p2align 2 label46: addiw t0, t0, 1 - ble s5, t0, label323 + ble s8, t0, label461 .p2align 2 -label19: - sh2add t2, t0, s2 - li t3, 62 +label18: + sh2add t2, t0, s6 lw t1, 0(t2) - bne t1, t3, label25 + bne t1, s2, label190 addiw a4, a4, 1 addiw t0, t0, 1 - bgt s5, t0, label19 -label323: + bgt s8, t0, label18 +label461: mv s0, a2 -label53: +label52: li a0, 118 jal _sysy_stoptime - ble s0, zero, label55 + ble s0, zero, label59 mv s1, zero - j label57 + j label55 .p2align 2 -label60: - addi s4, s4, 4 +label58: + addi s10, s10, 4 .p2align 2 -label57: - lw a0, 0(s4) +label55: + lw a0, 0(s10) jal putch addiw s1, s1, 1 - bgt s0, s1, label60 -label55: + bgt s0, s1, label58 +label59: mv a0, zero ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) - ld s1, 24(sp) - ld s6, 32(sp) - ld s4, 40(sp) - ld s3, 48(sp) + ld s8, 24(sp) + ld s1, 32(sp) + ld s6, 40(sp) + ld s9, 48(sp) ld s2, 56(sp) - ld s7, 64(sp) - ld s8, 72(sp) - addi sp, sp, 80 + ld s3, 64(sp) + ld s4, 72(sp) + ld s7, 80(sp) + ld s10, 88(sp) + ld s11, 96(sp) + addi sp, sp, 104 ret .p2align 2 -label26: - li t2, 43 - bne t1, t2, label27 - sh2add t1, a4, a1 - addiw t0, t0, 1 - lw t2, 0(t1) - addi t3, t2, 1 - sw t3, 0(t1) - bgt s5, t0, label19 - j label323 
-.p2align 2 -label27: - li t2, 45 - bne t1, t2, label28 +label190: + beq t1, s3, label45 + beq t1, s4, label44 + bne t1, s5, label465 sh2add t2, a4, a1 addiw t0, t0, 1 - lw t3, 0(t2) - addi t1, t3, -1 - sw t1, 0(t2) - bgt s5, t0, label19 - j label323 -label28: + lw t1, 0(t2) + addi t3, t1, -1 + sw t3, 0(t2) + bgt s8, t0, label18 + j label461 +.p2align 2 +label465: li t2, 91 - bne t1, t2, label29 + beq t1, t2, label38 + beq t1, s1, label30 + li t2, 46 + beq t1, t2, label37 + li t2, 44 + beq t1, t2, label34 + j label46 +label38: sh2add t1, a4, a1 lw t3, 0(t1) - bne t3, zero, label39 + beq t3, zero, label256 + sh2add t1, a3, a0 + addiw a3, a3, 1 + sw t0, 0(t1) + j label46 +label256: mv t2, s0 mv t1, t0 .p2align 2 label40: addiw t1, t1, 1 - sh2add t3, t1, s2 + sh2add t3, t1, s6 lw t0, 0(t3) - xori t4, t0, 93 - sltiu t6, t4, 1 - xori t4, t0, 91 - subw t5, t2, t6 - sltu t6, zero, t4 - addi t3, t5, 1 + xori t5, t0, 93 + sltiu t6, t5, 1 + xori t5, t0, 91 + subw t4, t2, t6 + sltu t6, zero, t5 + addi t3, t4, 1 subw t2, t3, t6 bgt t2, zero, label40 mv t0, t1 j label46 -label29: - beq t1, s1, label36 - li t2, 46 - bne t1, t2, label32 - sh2add t2, a4, a1 - sh2add t3, a2, s4 - lw t1, 0(t2) - addiw a2, a2, 1 - sw t1, 0(t3) - j label46 -label32: - li t2, 44 - beq t1, t2, label33 - j label46 -label36: +label30: sh2add t3, a4, a1 addiw t1, a3, -1 lw t2, 0(t3) - beq t2, zero, label183 + beq t2, zero, label220 sh2add t1, a3, a0 lw t0, -4(t1) j label46 -label33: - bgt s6, a5, label166 +label6: + jal getint + mv s9, a0 + jal getch + ble s9, zero, label12 + mv s10, s7 + mv s11, zero + j label8 +.p2align 2 +label11: + addi s10, s10, 4 +.p2align 2 +label8: + jal getch + addiw s11, s11, 1 + sw a0, 0(s10) + bgt s9, s11, label11 + j label12 +label34: + bgt s9, a5, label235 sh2add t1, a4, a1 sw zero, 0(t1) j label46 -label39: - sh2add t1, a3, a0 - addiw a3, a3, 1 - sw t0, 0(t1) - j label46 -label183: - mv a3, t1 - j label46 -label166: - sh2add t3, a5, s3 +.p2align 2 +label44: + sh2add t1, a4, a1 + addiw t0, t0, 1 + lw t3, 0(t1) + addi t2, t3, 1 + sw t2, 0(t1) + bgt s8, t0, label18 + j label461 +label37: sh2add t2, a4, a1 + sh2add t3, a2, s10 + lw t1, 0(t2) + addiw a2, a2, 1 + sw t1, 0(t3) + j label46 +label235: + sh2add t2, a5, s7 + sh2add t3, a4, a1 addiw a5, a5, 1 - lw t1, 0(t3) - sw t1, 0(t2) + lw t1, 0(t2) + sw t1, 0(t3) + j label46 +label220: + mv a3, t1 j label46 +label183: + mv s0, zero + j label52 label87: - mv s5, zero + mv s8, zero j label4 -label121: - mv s0, zero - j label53 diff --git a/tests/SysY2022/performance/brainfuck-bootstrap.sy.ir b/tests/SysY2022/performance/brainfuck-bootstrap.sy.ir index 2650111ba..f059b0344 100644 --- a/tests/SysY2022/performance/brainfuck-bootstrap.sy.ir +++ b/tests/SysY2022/performance/brainfuck-bootstrap.sy.ir @@ -28,13 +28,13 @@ func @main() -> i32 { NoRecurse Entry } { [65536 * i32]* %12 = ptrcast [65536 * i32]* @program to [65536 * i32]*; cbr i1 %10(prob = 0.984615), ^while.body1, ^b1; ^while.body1: - i32 %13 = phi [^b, i32 0] [^b4, i32 %39]; + i32 %13 = phi [^b, i32 0] [^b4, i32 %102]; i32 %14 = phi [^b, i32 %0] [^b4, i32 %22]; i32* %15 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %13]; store i32* %15 with i32 %14; ubr ^while.body2; ^b1: - i32 %16 = phi [^b, i32 0] [^b4, i32 %39]; + i32 %16 = phi [^b, i32 0] [^b4, i32 %102]; i32 %17 = call () -> i32 @getch(); i1 %18 = icmp neq i32 %17, i32 105; cbr i1 %18(prob = 0.5), ^if.then, ^b2; @@ -62,152 +62,278 @@ func @main() -> i32 { NoRecurse Entry } { i32 %34 = call () -> i32 @getch(); cbr i1 %33(prob = 0.984615), 
^while.body3, ^if.then; ^b3: - i32 %35 = phi [^if.then, i32 0] [^b3, i32 %37]; + i32 %35 = phi [^if.then, i32 0] [^b3, i32 %100]; i32* %36 = getelementptr &(i32* %21)[i32 %35]; store i32* %36 with i32 0; - i32 %37 = add i32 %35, i32 1; - i1 %38 = icmp slt i32 %37, i32 512; - cbr i1 %38(prob = 0.998047), ^b3, ^b5; + i32* %37 = getelementptr &(i32* %36)[i64 1]; + store i32* %37 with i32 0; + i32* %38 = getelementptr &(i32* %36)[i64 2]; + store i32* %38 with i32 0; + i32* %39 = getelementptr &(i32* %36)[i64 3]; + store i32* %39 with i32 0; + i32* %40 = getelementptr &(i32* %36)[i64 4]; + store i32* %40 with i32 0; + i32* %41 = getelementptr &(i32* %36)[i64 5]; + store i32* %41 with i32 0; + i32* %42 = getelementptr &(i32* %36)[i64 6]; + store i32* %42 with i32 0; + i32* %43 = getelementptr &(i32* %36)[i64 7]; + store i32* %43 with i32 0; + i32* %44 = getelementptr &(i32* %36)[i64 8]; + store i32* %44 with i32 0; + i32* %45 = getelementptr &(i32* %36)[i64 9]; + store i32* %45 with i32 0; + i32* %46 = getelementptr &(i32* %36)[i64 10]; + store i32* %46 with i32 0; + i32* %47 = getelementptr &(i32* %36)[i64 11]; + store i32* %47 with i32 0; + i32* %48 = getelementptr &(i32* %36)[i64 12]; + store i32* %48 with i32 0; + i32* %49 = getelementptr &(i32* %36)[i64 13]; + store i32* %49 with i32 0; + i32* %50 = getelementptr &(i32* %36)[i64 14]; + store i32* %50 with i32 0; + i32* %51 = getelementptr &(i32* %36)[i64 15]; + store i32* %51 with i32 0; + i32* %52 = getelementptr &(i32* %36)[i64 16]; + store i32* %52 with i32 0; + i32* %53 = getelementptr &(i32* %36)[i64 17]; + store i32* %53 with i32 0; + i32* %54 = getelementptr &(i32* %36)[i64 18]; + store i32* %54 with i32 0; + i32* %55 = getelementptr &(i32* %36)[i64 19]; + store i32* %55 with i32 0; + i32* %56 = getelementptr &(i32* %36)[i64 20]; + store i32* %56 with i32 0; + i32* %57 = getelementptr &(i32* %36)[i64 21]; + store i32* %57 with i32 0; + i32* %58 = getelementptr &(i32* %36)[i64 22]; + store i32* %58 with i32 0; + i32* %59 = getelementptr &(i32* %36)[i64 23]; + store i32* %59 with i32 0; + i32* %60 = getelementptr &(i32* %36)[i64 24]; + store i32* %60 with i32 0; + i32* %61 = getelementptr &(i32* %36)[i64 25]; + store i32* %61 with i32 0; + i32* %62 = getelementptr &(i32* %36)[i64 26]; + store i32* %62 with i32 0; + i32* %63 = getelementptr &(i32* %36)[i64 27]; + store i32* %63 with i32 0; + i32* %64 = getelementptr &(i32* %36)[i64 28]; + store i32* %64 with i32 0; + i32* %65 = getelementptr &(i32* %36)[i64 29]; + store i32* %65 with i32 0; + i32* %66 = getelementptr &(i32* %36)[i64 30]; + store i32* %66 with i32 0; + i32* %67 = getelementptr &(i32* %36)[i64 31]; + store i32* %67 with i32 0; + i32* %68 = getelementptr &(i32* %36)[i64 32]; + store i32* %68 with i32 0; + i32* %69 = getelementptr &(i32* %36)[i64 33]; + store i32* %69 with i32 0; + i32* %70 = getelementptr &(i32* %36)[i64 34]; + store i32* %70 with i32 0; + i32* %71 = getelementptr &(i32* %36)[i64 35]; + store i32* %71 with i32 0; + i32* %72 = getelementptr &(i32* %36)[i64 36]; + store i32* %72 with i32 0; + i32* %73 = getelementptr &(i32* %36)[i64 37]; + store i32* %73 with i32 0; + i32* %74 = getelementptr &(i32* %36)[i64 38]; + store i32* %74 with i32 0; + i32* %75 = getelementptr &(i32* %36)[i64 39]; + store i32* %75 with i32 0; + i32* %76 = getelementptr &(i32* %36)[i64 40]; + store i32* %76 with i32 0; + i32* %77 = getelementptr &(i32* %36)[i64 41]; + store i32* %77 with i32 0; + i32* %78 = getelementptr &(i32* %36)[i64 42]; + store i32* %78 with i32 0; + i32* %79 = 
getelementptr &(i32* %36)[i64 43]; + store i32* %79 with i32 0; + i32* %80 = getelementptr &(i32* %36)[i64 44]; + store i32* %80 with i32 0; + i32* %81 = getelementptr &(i32* %36)[i64 45]; + store i32* %81 with i32 0; + i32* %82 = getelementptr &(i32* %36)[i64 46]; + store i32* %82 with i32 0; + i32* %83 = getelementptr &(i32* %36)[i64 47]; + store i32* %83 with i32 0; + i32* %84 = getelementptr &(i32* %36)[i64 48]; + store i32* %84 with i32 0; + i32* %85 = getelementptr &(i32* %36)[i64 49]; + store i32* %85 with i32 0; + i32* %86 = getelementptr &(i32* %36)[i64 50]; + store i32* %86 with i32 0; + i32* %87 = getelementptr &(i32* %36)[i64 51]; + store i32* %87 with i32 0; + i32* %88 = getelementptr &(i32* %36)[i64 52]; + store i32* %88 with i32 0; + i32* %89 = getelementptr &(i32* %36)[i64 53]; + store i32* %89 with i32 0; + i32* %90 = getelementptr &(i32* %36)[i64 54]; + store i32* %90 with i32 0; + i32* %91 = getelementptr &(i32* %36)[i64 55]; + store i32* %91 with i32 0; + i32* %92 = getelementptr &(i32* %36)[i64 56]; + store i32* %92 with i32 0; + i32* %93 = getelementptr &(i32* %36)[i64 57]; + store i32* %93 with i32 0; + i32* %94 = getelementptr &(i32* %36)[i64 58]; + store i32* %94 with i32 0; + i32* %95 = getelementptr &(i32* %36)[i64 59]; + store i32* %95 with i32 0; + i32* %96 = getelementptr &(i32* %36)[i64 60]; + store i32* %96 with i32 0; + i32* %97 = getelementptr &(i32* %36)[i64 61]; + store i32* %97 with i32 0; + i32* %98 = getelementptr &(i32* %36)[i64 62]; + store i32* %98 with i32 0; + i32* %99 = getelementptr &(i32* %36)[i64 63]; + store i32* %99 with i32 0; + i32 %100 = add i32 %35, i32 64; + i1 %101 = icmp slt i32 %100, i32 512; + cbr i1 %101(prob = 0.875), ^b3, ^b5; ^b4: - i32 %39 = add i32 %13, i32 1; - i1 %40 = icmp neq i32 %22, i32 35; - cbr i1 %40(prob = 0.984615), ^while.body1, ^b1; + i32 %102 = add i32 %13, i32 1; + i1 %103 = icmp neq i32 %22, i32 35; + cbr i1 %103(prob = 0.984615), ^while.body1, ^b1; ^while.body3: - i32 %41 = phi [^b2, i32 0] [^while.body3, i32 %44]; - i32 %42 = call () -> i32 @getch(); - i32* %43 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %41]; - store i32* %43 with i32 %42; - i32 %44 = add i32 %41, i32 1; - i1 %45 = icmp sgt i32 %32, i32 %44; - cbr i1 %45(prob = 0.984615), ^while.body3, ^if.then; + i32 %104 = phi [^b2, i32 0] [^while.body3, i32 %107]; + i32 %105 = call () -> i32 @getch(); + i32* %106 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %104]; + store i32* %106 with i32 %105; + i32 %107 = add i32 %104, i32 1; + i1 %108 = icmp sgt i32 %32, i32 %107; + cbr i1 %108(prob = 0.984615), ^while.body3, ^if.then; ^b5: - i1 %46 = icmp sgt i32 %16, i32 0; - [65536 * i32]* %47 = ptrcast [65536 * i32]* @tape to [65536 * i32]*; - [65536 * i32]* %48 = ptrcast [65536 * i32]* @output to [65536 * i32]*; - cbr i1 %46(prob = 0.984615), ^while.body4, ^b6; + i1 %109 = icmp sgt i32 %16, i32 0; + [65536 * i32]* %110 = ptrcast [65536 * i32]* @tape to [65536 * i32]*; + [65536 * i32]* %111 = ptrcast [65536 * i32]* @output to [65536 * i32]*; + cbr i1 %109(prob = 0.984615), ^while.body4, ^b6; ^while.body4: - i32 %49 = phi [^b5, i32 0] [^b8, i32 %67]; - i32 %50 = phi [^b5, i32 0] [^b8, i32 %72]; - i32 %51 = phi [^b5, i32 0] [^b8, i32 %70]; - i32 %52 = phi [^b5, i32 0] [^b8, i32 %69]; - i32 %53 = phi [^b5, i32 0] [^b8, i32 %68]; - i32* %54 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %50]; - i32 %55 = load i32* %54; - i1 %56 = icmp eq i32 %55, i32 62; - cbr i1 %56(prob = 0.5), ^if.then1, ^if.else; + i32 %112 = phi [^b5, i32 0] [^b8, i32 %130]; + i32 
%113 = phi [^b5, i32 0] [^b8, i32 %135]; + i32 %114 = phi [^b5, i32 0] [^b8, i32 %133]; + i32 %115 = phi [^b5, i32 0] [^b8, i32 %132]; + i32 %116 = phi [^b5, i32 0] [^b8, i32 %131]; + i32* %117 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %113]; + i32 %118 = load i32* %117; + i1 %119 = icmp eq i32 %118, i32 62; + cbr i1 %119(prob = 0.5), ^if.then1, ^if.else; ^b6: - i32 %57 = phi [^b5, i32 0] [^b8, i32 %67]; - i1 %58 = icmp sgt i32 %57, i32 0; + i32 %120 = phi [^b5, i32 0] [^b8, i32 %130]; + i1 %121 = icmp sgt i32 %120, i32 0; call (i32) -> void @stoptime(i32 118); - cbr i1 %58(prob = 0.984615), ^while.body5, ^b7; + cbr i1 %121(prob = 0.984615), ^while.body5, ^b7; ^if.then1: - i32 %59 = add i32 %51, i32 1; + i32 %122 = add i32 %114, i32 1; ubr ^b8; ^if.else: - i1 %60 = icmp eq i32 %55, i32 60; - cbr i1 %60(prob = 0.5), ^if.then2, ^if.else1; + i1 %123 = icmp eq i32 %118, i32 60; + cbr i1 %123(prob = 0.5), ^if.then2, ^if.else1; ^while.body5: - i32 %61 = phi [^b6, i32 0] [^while.body5, i32 %64]; - i32* %62 = getelementptr &([65536 * i32]* %48)[i64 0][i32 %61]; - i32 %63 = load i32* %62; - call (i32) -> void @putch(i32 %63); - i32 %64 = add i32 %61, i32 1; - i1 %65 = icmp sgt i32 %57, i32 %64; - cbr i1 %65(prob = 0.984615), ^while.body5, ^b7; + i32 %124 = phi [^b6, i32 0] [^while.body5, i32 %127]; + i32* %125 = getelementptr &([65536 * i32]* %111)[i64 0][i32 %124]; + i32 %126 = load i32* %125; + call (i32) -> void @putch(i32 %126); + i32 %127 = add i32 %124, i32 1; + i1 %128 = icmp sgt i32 %120, i32 %127; + cbr i1 %128(prob = 0.984615), ^while.body5, ^b7; ^b7: ret i32 0; ^if.then2: - i32 %66 = add i32 %51, i32 -1; + i32 %129 = add i32 %114, i32 -1; ubr ^b8; ^b8: - i32 %67 = phi [^if.then1, i32 %49] [^if.then2, i32 %49] [^if.then3, i32 %49] [^if.then4, i32 %49] [^if.then6, i32 %49] [^if.then7, i32 %49] [^while.body6, i32 %49] [^if.else6, i32 %49] [^if.then8, i32 %109] [^if.else7, i32 %49] [^if.then10, i32 %49] [^if.else8, i32 %49]; - i32 %68 = phi [^if.then1, i32 %53] [^if.then2, i32 %53] [^if.then3, i32 %53] [^if.then4, i32 %53] [^if.then6, i32 %53] [^if.then7, i32 %53] [^while.body6, i32 %53] [^if.else6, i32 %53] [^if.then8, i32 %53] [^if.else7, i32 %53] [^if.then10, i32 %53] [^if.else8, i32 %114]; - i32 %69 = phi [^if.then1, i32 %52] [^if.then2, i32 %52] [^if.then3, i32 %52] [^if.then4, i32 %52] [^if.then6, i32 %86] [^if.then7, i32 %89] [^while.body6, i32 %52] [^if.else6, i32 %52] [^if.then8, i32 %52] [^if.else7, i32 %52] [^if.then10, i32 %52] [^if.else8, i32 %52]; - i32 %70 = phi [^if.then1, i32 %59] [^if.then2, i32 %66] [^if.then3, i32 %51] [^if.then4, i32 %51] [^if.then6, i32 %51] [^if.then7, i32 %51] [^while.body6, i32 %51] [^if.else6, i32 %51] [^if.then8, i32 %51] [^if.else7, i32 %51] [^if.then10, i32 %51] [^if.else8, i32 %51]; - i32 %71 = phi [^if.then1, i32 %50] [^if.then2, i32 %50] [^if.then3, i32 %50] [^if.then4, i32 %50] [^if.then6, i32 %50] [^if.then7, i32 %50] [^while.body6, i32 %92] [^if.else6, i32 %106] [^if.then8, i32 %50] [^if.else7, i32 %50] [^if.then10, i32 %50] [^if.else8, i32 %50]; - i32 %72 = add i32 %71, i32 1; - i1 %73 = icmp sgt i32 %16, i32 %72; - cbr i1 %73(prob = 0.984615), ^while.body4, ^b6; + i32 %130 = phi [^if.then1, i32 %112] [^if.then2, i32 %112] [^if.then3, i32 %112] [^if.then4, i32 %112] [^if.then6, i32 %112] [^if.then7, i32 %112] [^while.body6, i32 %112] [^if.else6, i32 %112] [^if.then8, i32 %172] [^if.else7, i32 %112] [^if.then10, i32 %112] [^if.else8, i32 %112]; + i32 %131 = phi [^if.then1, i32 %116] [^if.then2, i32 %116] [^if.then3, i32 %116] 
[^if.then4, i32 %116] [^if.then6, i32 %116] [^if.then7, i32 %116] [^while.body6, i32 %116] [^if.else6, i32 %116] [^if.then8, i32 %116] [^if.else7, i32 %116] [^if.then10, i32 %116] [^if.else8, i32 %177]; + i32 %132 = phi [^if.then1, i32 %115] [^if.then2, i32 %115] [^if.then3, i32 %115] [^if.then4, i32 %115] [^if.then6, i32 %149] [^if.then7, i32 %152] [^while.body6, i32 %115] [^if.else6, i32 %115] [^if.then8, i32 %115] [^if.else7, i32 %115] [^if.then10, i32 %115] [^if.else8, i32 %115]; + i32 %133 = phi [^if.then1, i32 %122] [^if.then2, i32 %129] [^if.then3, i32 %114] [^if.then4, i32 %114] [^if.then6, i32 %114] [^if.then7, i32 %114] [^while.body6, i32 %114] [^if.else6, i32 %114] [^if.then8, i32 %114] [^if.else7, i32 %114] [^if.then10, i32 %114] [^if.else8, i32 %114]; + i32 %134 = phi [^if.then1, i32 %113] [^if.then2, i32 %113] [^if.then3, i32 %113] [^if.then4, i32 %113] [^if.then6, i32 %113] [^if.then7, i32 %113] [^while.body6, i32 %155] [^if.else6, i32 %169] [^if.then8, i32 %113] [^if.else7, i32 %113] [^if.then10, i32 %113] [^if.else8, i32 %113]; + i32 %135 = add i32 %134, i32 1; + i1 %136 = icmp sgt i32 %16, i32 %135; + cbr i1 %136(prob = 0.984615), ^while.body4, ^b6; ^if.else1: - i32* %74 = getelementptr &([65536 * i32]* %47)[i64 0][i32 %51]; - i1 %75 = icmp eq i32 %55, i32 43; - cbr i1 %75(prob = 0.5), ^if.then3, ^if.else2; + i32* %137 = getelementptr &([65536 * i32]* %110)[i64 0][i32 %114]; + i1 %138 = icmp eq i32 %118, i32 43; + cbr i1 %138(prob = 0.5), ^if.then3, ^if.else2; ^if.then3: - i32 %76 = load i32* %74; - i32 %77 = add i32 %76, i32 1; - store i32* %74 with i32 %77; + i32 %139 = load i32* %137; + i32 %140 = add i32 %139, i32 1; + store i32* %137 with i32 %140; ubr ^b8; ^if.else2: - i1 %78 = icmp eq i32 %55, i32 45; - cbr i1 %78(prob = 0.5), ^if.then4, ^if.else3; + i1 %141 = icmp eq i32 %118, i32 45; + cbr i1 %141(prob = 0.5), ^if.then4, ^if.else3; ^if.then4: - i32 %79 = load i32* %74; - i32 %80 = add i32 %79, i32 -1; - store i32* %74 with i32 %80; + i32 %142 = load i32* %137; + i32 %143 = add i32 %142, i32 -1; + store i32* %137 with i32 %143; ubr ^b8; ^if.else3: - i1 %81 = icmp eq i32 %55, i32 91; - cbr i1 %81(prob = 0.5), ^if.then5, ^if.else4; + i1 %144 = icmp eq i32 %118, i32 91; + cbr i1 %144(prob = 0.5), ^if.then5, ^if.else4; ^if.then5: - i32 %82 = load i32* %74; - i1 %83 = icmp neq i32 %82, i32 0; - cbr i1 %83(prob = 0.5), ^if.then6, ^while.body6; + i32 %145 = load i32* %137; + i1 %146 = icmp neq i32 %145, i32 0; + cbr i1 %146(prob = 0.5), ^if.then6, ^while.body6; ^if.else4: - i1 %84 = icmp eq i32 %55, i32 93; - cbr i1 %84(prob = 0.5), ^if.then7, ^if.else5; + i1 %147 = icmp eq i32 %118, i32 93; + cbr i1 %147(prob = 0.5), ^if.then7, ^if.else5; ^if.then6: - i32* %85 = getelementptr &([512 * i32]* %20)[i64 0][i32 %52]; - store i32* %85 with i32 %50; - i32 %86 = add i32 %52, i32 1; + i32* %148 = getelementptr &([512 * i32]* %20)[i64 0][i32 %115]; + store i32* %148 with i32 %113; + i32 %149 = add i32 %115, i32 1; ubr ^b8; ^if.then7: - i32 %87 = load i32* %74; - i1 %88 = icmp eq i32 %87, i32 0; - i32 %89 = add i32 %52, i32 -1; - cbr i1 %88(prob = 0.5), ^b8, ^if.else6; + i32 %150 = load i32* %137; + i1 %151 = icmp eq i32 %150, i32 0; + i32 %152 = add i32 %115, i32 -1; + cbr i1 %151(prob = 0.5), ^b8, ^if.else6; ^while.body6: - i32 %90 = phi [^if.then5, i32 1] [^while.body6, i32 %101]; - i32 %91 = phi [^if.then5, i32 %50] [^while.body6, i32 %92]; - i32 %92 = add i32 %91, i32 1; - i32* %93 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %92]; - i32 %94 = load i32* %93; - i1 %95 = 
icmp eq i32 %94, i32 93; - i32 %96 = zext i1 %95 to i32; - i32 %97 = sub i32 %90, i32 %96; - i32 %98 = add i32 %97, i32 1; - i1 %99 = icmp neq i32 %94, i32 91; - i32 %100 = zext i1 %99 to i32; - i32 %101 = sub i32 %98, i32 %100; - i1 %102 = icmp sgt i32 %101, i32 0; - cbr i1 %102(prob = 0.984615), ^while.body6, ^b8; + i32 %153 = phi [^if.then5, i32 1] [^while.body6, i32 %164]; + i32 %154 = phi [^if.then5, i32 %113] [^while.body6, i32 %155]; + i32 %155 = add i32 %154, i32 1; + i32* %156 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %155]; + i32 %157 = load i32* %156; + i1 %158 = icmp eq i32 %157, i32 93; + i32 %159 = zext i1 %158 to i32; + i32 %160 = sub i32 %153, i32 %159; + i32 %161 = add i32 %160, i32 1; + i1 %162 = icmp neq i32 %157, i32 91; + i32 %163 = zext i1 %162 to i32; + i32 %164 = sub i32 %161, i32 %163; + i1 %165 = icmp sgt i32 %164, i32 0; + cbr i1 %165(prob = 0.984615), ^while.body6, ^b8; ^if.else5: - i1 %103 = icmp eq i32 %55, i32 46; - cbr i1 %103(prob = 0.5), ^if.then8, ^if.else7; + i1 %166 = icmp eq i32 %118, i32 46; + cbr i1 %166(prob = 0.5), ^if.then8, ^if.else7; ^if.else6: - i32* %104 = getelementptr &([512 * i32]* %20)[i64 0][i32 %52]; - i32* %105 = getelementptr &(i32* %104)[i64 -1]; - i32 %106 = load i32* %105; + i32* %167 = getelementptr &([512 * i32]* %20)[i64 0][i32 %115]; + i32* %168 = getelementptr &(i32* %167)[i64 -1]; + i32 %169 = load i32* %168; ubr ^b8; ^if.then8: - i32* %107 = getelementptr &([65536 * i32]* %48)[i64 0][i32 %49]; - i32 %108 = load i32* %74; - store i32* %107 with i32 %108; - i32 %109 = add i32 %49, i32 1; + i32* %170 = getelementptr &([65536 * i32]* %111)[i64 0][i32 %112]; + i32 %171 = load i32* %137; + store i32* %170 with i32 %171; + i32 %172 = add i32 %112, i32 1; ubr ^b8; ^if.else7: - i1 %110 = icmp eq i32 %55, i32 44; - cbr i1 %110(prob = 0.5), ^if.then9, ^b8; + i1 %173 = icmp eq i32 %118, i32 44; + cbr i1 %173(prob = 0.5), ^if.then9, ^b8; ^if.then9: - i1 %111 = icmp sle i32 %19, i32 %53; - cbr i1 %111(prob = 0.5), ^if.then10, ^if.else8; + i1 %174 = icmp sle i32 %19, i32 %116; + cbr i1 %174(prob = 0.5), ^if.then10, ^if.else8; ^if.then10: - store i32* %74 with i32 0; + store i32* %137 with i32 0; ubr ^b8; ^if.else8: - i32* %112 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %53]; - i32 %113 = load i32* %112; - store i32* %74 with i32 %113; - i32 %114 = add i32 %53, i32 1; + i32* %175 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %116]; + i32 %176 = load i32* %175; + store i32* %137 with i32 %176; + i32 %177 = add i32 %116, i32 1; ubr ^b8; } internal [512 * i32]* @return_a, align 8 { Flexible }; diff --git a/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.arm.s b/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.arm.s index d3f9c2fb3..fd9d3afce 100644 --- a/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.arm.s +++ b/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 program: .zero 262144 -.align 8 +.p2align 3 tape: .zero 262144 -.align 8 +.p2align 3 input: .zero 262144 -.align 8 +.p2align 3 output: .zero 262144 -.align 8 +.p2align 3 return_a: .zero 2048 .text diff --git a/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.riscv.s b/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.riscv.s index a00802beb..2cf74a791 100644 --- a/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.riscv.s +++ b/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.riscv.s @@ -1,273 +1,311 @@ .attribute arch, 
"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 program: .zero 262144 -.align 8 +.p2align 3 tape: .zero 262144 -.align 8 +.p2align 3 input: .zero 262144 -.align 8 +.p2align 3 output: .zero 262144 -.align 8 +.p2align 3 return_a: .zero 2048 .text .p2align 2 .globl main main: - addi sp, sp, -80 + addi sp, sp, -104 sd ra, 0(sp) sd s0, 8(sp) li s0, 1 sd s5, 16(sp) - li s5, 360287970357415681 - sd s1, 24(sp) + li s5, 45 + sd s8, 24(sp) + li s8, 360287970357415681 + sd s1, 32(sp) li s1, 93 - sd s6, 32(sp) - sd s4, 40(sp) - li s4, 512 - sd s3, 48(sp) + sd s6, 40(sp) + sd s9, 48(sp) + li s9, 35 sd s2, 56(sp) - sd s7, 64(sp) - sd s8, 72(sp) + li s2, 62 + sd s3, 64(sp) + li s3, 60 + sd s4, 72(sp) + li s4, 43 + sd s7, 80(sp) + sd s10, 88(sp) + sd s11, 96(sp) .p2align 2 label2: jal getch addiw a1, a0, -35 - slti a4, a1, 0 - sll a3, s0, a1 - and a5, a3, s5 - sltiu a2, a5, 1 - slt a5, s1, a0 - or a3, a2, a4 - or a1, a3, a5 + slti a5, a1, 0 + sll a2, s0, a1 + and a4, a2, s8 + sltiu a3, a4, 1 + slt a4, s1, a0 + or a2, a3, a5 + or a1, a2, a4 bne a1, zero, label2 -pcrel340: +pcrel479: auipc a1, %pcrel_hi(input) -pcrel341: +pcrel480: auipc a2, %pcrel_hi(program) - addi s3, a1, %pcrel_lo(pcrel340) - addi s2, a2, %pcrel_lo(pcrel341) - li a1, 35 - beq a0, a1, label87 - mv s6, s2 - mv s7, zero - sw a0, 0(s2) + addi s7, a1, %pcrel_lo(pcrel479) + addi s6, a2, %pcrel_lo(pcrel480) + beq a0, s9, label87 + mv s10, s6 + mv s11, zero + sw a0, 0(s6) j label66 .p2align 2 label68: - addi s6, s6, 4 - sw a0, 0(s6) + addi s10, s10, 4 + sw a0, 0(s10) .p2align 2 label66: jal getch addiw a1, a0, -35 slti a5, a1, 0 sll a2, s0, a1 - and a4, a2, s5 + and a4, a2, s8 sltiu a3, a4, 1 slt a4, s1, a0 or a2, a3, a5 or a1, a2, a4 bne a1, zero, label66 - addiw s7, s7, 1 - li a1, 35 - bne a0, a1, label68 - mv s5, s7 + addiw s11, s11, 1 + bne a0, s9, label68 + mv s8, s11 label4: jal getch li a1, 105 - bne a0, a1, label93 - jal getint - mv s6, a0 - jal getch - ble s6, zero, label12 - mv s7, s3 - mv s8, zero -.p2align 2 -label8: - jal getch - addiw s8, s8, 1 - sw a0, 0(s7) - ble s6, s8, label12 - addi s7, s7, 4 - j label8 -label93: - mv s6, zero + beq a0, a1, label6 + mv s9, zero label12: li a0, 116 jal _sysy_starttime mv a2, zero -pcrel342: +pcrel481: auipc a3, %pcrel_hi(return_a) - addi a0, a3, %pcrel_lo(pcrel342) + addi a0, a3, %pcrel_lo(pcrel481) mv a1, a0 - j label14 -.p2align 2 -label17: - addi a1, a1, 4 .p2align 2 label14: - addiw a2, a2, 1 - sw zero, 0(a1) - blt a2, s4, label17 -pcrel343: - auipc a3, %pcrel_hi(tape) -pcrel344: - auipc a2, %pcrel_hi(output) - addi a1, a3, %pcrel_lo(pcrel343) - addi s4, a2, %pcrel_lo(pcrel344) - ble s5, zero, label121 + sd zero, 0(a1) + addiw a2, a2, 64 + li a3, 512 + sd zero, 8(a1) + sd zero, 16(a1) + sd zero, 24(a1) + sd zero, 32(a1) + sd zero, 40(a1) + sd zero, 48(a1) + sd zero, 56(a1) + sd zero, 64(a1) + sd zero, 72(a1) + sd zero, 80(a1) + sd zero, 88(a1) + sd zero, 96(a1) + sd zero, 104(a1) + sd zero, 112(a1) + sd zero, 120(a1) + sd zero, 128(a1) + sd zero, 136(a1) + sd zero, 144(a1) + sd zero, 152(a1) + sd zero, 160(a1) + sd zero, 168(a1) + sd zero, 176(a1) + sd zero, 184(a1) + sd zero, 192(a1) + sd zero, 200(a1) + sd zero, 208(a1) + sd zero, 216(a1) + sd zero, 224(a1) + sd zero, 232(a1) + sd zero, 240(a1) + sd zero, 248(a1) + bge a2, a3, label17 + addi a1, a1, 256 + j label14 +label17: + auipc a2, %pcrel_hi(tape) +pcrel482: + auipc a3, %pcrel_hi(output) + addi a1, a2, %pcrel_lo(label17) + addi s10, a3, %pcrel_lo(pcrel482) + ble s8, zero, 
label183 mv a2, zero mv t0, zero mv a4, zero mv a3, zero mv a5, zero - j label19 + j label18 .p2align 2 -label25: - li t2, 60 - bne t1, t2, label26 +label45: addiw a4, a4, -1 .p2align 2 label46: addiw t0, t0, 1 - ble s5, t0, label323 + ble s8, t0, label461 .p2align 2 -label19: - sh2add t2, t0, s2 - li t3, 62 +label18: + sh2add t2, t0, s6 lw t1, 0(t2) - bne t1, t3, label25 + bne t1, s2, label190 addiw a4, a4, 1 addiw t0, t0, 1 - bgt s5, t0, label19 -label323: + bgt s8, t0, label18 +label461: mv s0, a2 -label53: +label52: li a0, 118 jal _sysy_stoptime - ble s0, zero, label55 + ble s0, zero, label59 mv s1, zero - j label57 + j label55 .p2align 2 -label60: - addi s4, s4, 4 +label58: + addi s10, s10, 4 .p2align 2 -label57: - lw a0, 0(s4) +label55: + lw a0, 0(s10) jal putch addiw s1, s1, 1 - bgt s0, s1, label60 -label55: + bgt s0, s1, label58 +label59: mv a0, zero ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) - ld s1, 24(sp) - ld s6, 32(sp) - ld s4, 40(sp) - ld s3, 48(sp) + ld s8, 24(sp) + ld s1, 32(sp) + ld s6, 40(sp) + ld s9, 48(sp) ld s2, 56(sp) - ld s7, 64(sp) - ld s8, 72(sp) - addi sp, sp, 80 + ld s3, 64(sp) + ld s4, 72(sp) + ld s7, 80(sp) + ld s10, 88(sp) + ld s11, 96(sp) + addi sp, sp, 104 ret .p2align 2 -label26: - li t2, 43 - bne t1, t2, label27 - sh2add t1, a4, a1 - addiw t0, t0, 1 - lw t2, 0(t1) - addi t3, t2, 1 - sw t3, 0(t1) - bgt s5, t0, label19 - j label323 -.p2align 2 -label27: - li t2, 45 - bne t1, t2, label28 +label190: + beq t1, s3, label45 + beq t1, s4, label44 + bne t1, s5, label465 sh2add t2, a4, a1 addiw t0, t0, 1 - lw t3, 0(t2) - addi t1, t3, -1 - sw t1, 0(t2) - bgt s5, t0, label19 - j label323 -label28: + lw t1, 0(t2) + addi t3, t1, -1 + sw t3, 0(t2) + bgt s8, t0, label18 + j label461 +.p2align 2 +label465: li t2, 91 - bne t1, t2, label29 + beq t1, t2, label38 + beq t1, s1, label30 + li t2, 46 + beq t1, t2, label37 + li t2, 44 + beq t1, t2, label34 + j label46 +label38: sh2add t1, a4, a1 lw t3, 0(t1) - bne t3, zero, label39 + beq t3, zero, label256 + sh2add t1, a3, a0 + addiw a3, a3, 1 + sw t0, 0(t1) + j label46 +label256: mv t2, s0 mv t1, t0 .p2align 2 label40: addiw t1, t1, 1 - sh2add t3, t1, s2 + sh2add t3, t1, s6 lw t0, 0(t3) - xori t4, t0, 93 - sltiu t6, t4, 1 - xori t4, t0, 91 - subw t5, t2, t6 - sltu t6, zero, t4 - addi t3, t5, 1 + xori t5, t0, 93 + sltiu t6, t5, 1 + xori t5, t0, 91 + subw t4, t2, t6 + sltu t6, zero, t5 + addi t3, t4, 1 subw t2, t3, t6 bgt t2, zero, label40 mv t0, t1 j label46 -label29: - beq t1, s1, label36 - li t2, 46 - bne t1, t2, label32 - sh2add t2, a4, a1 - sh2add t3, a2, s4 - lw t1, 0(t2) - addiw a2, a2, 1 - sw t1, 0(t3) - j label46 -label32: - li t2, 44 - beq t1, t2, label33 - j label46 -label36: +label30: sh2add t3, a4, a1 addiw t1, a3, -1 lw t2, 0(t3) - beq t2, zero, label183 + beq t2, zero, label220 sh2add t1, a3, a0 lw t0, -4(t1) j label46 -label33: - bgt s6, a5, label166 +label6: + jal getint + mv s9, a0 + jal getch + ble s9, zero, label12 + mv s10, s7 + mv s11, zero + j label8 +.p2align 2 +label11: + addi s10, s10, 4 +.p2align 2 +label8: + jal getch + addiw s11, s11, 1 + sw a0, 0(s10) + bgt s9, s11, label11 + j label12 +label34: + bgt s9, a5, label235 sh2add t1, a4, a1 sw zero, 0(t1) j label46 -label39: - sh2add t1, a3, a0 - addiw a3, a3, 1 - sw t0, 0(t1) - j label46 -label183: - mv a3, t1 - j label46 -label166: - sh2add t3, a5, s3 +.p2align 2 +label44: + sh2add t1, a4, a1 + addiw t0, t0, 1 + lw t3, 0(t1) + addi t2, t3, 1 + sw t2, 0(t1) + bgt s8, t0, label18 + j label461 +label37: sh2add t2, a4, a1 + sh2add t3, a2, s10 + lw t1, 0(t2) + 
addiw a2, a2, 1 + sw t1, 0(t3) + j label46 +label235: + sh2add t2, a5, s7 + sh2add t3, a4, a1 addiw a5, a5, 1 - lw t1, 0(t3) - sw t1, 0(t2) + lw t1, 0(t2) + sw t1, 0(t3) + j label46 +label220: + mv a3, t1 j label46 +label183: + mv s0, zero + j label52 label87: - mv s5, zero + mv s8, zero j label4 -label121: - mv s0, zero - j label53 diff --git a/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.sy.ir b/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.sy.ir index 2650111ba..f059b0344 100644 --- a/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.sy.ir +++ b/tests/SysY2022/performance/brainfuck-mandelbrot-nerf.sy.ir @@ -28,13 +28,13 @@ func @main() -> i32 { NoRecurse Entry } { [65536 * i32]* %12 = ptrcast [65536 * i32]* @program to [65536 * i32]*; cbr i1 %10(prob = 0.984615), ^while.body1, ^b1; ^while.body1: - i32 %13 = phi [^b, i32 0] [^b4, i32 %39]; + i32 %13 = phi [^b, i32 0] [^b4, i32 %102]; i32 %14 = phi [^b, i32 %0] [^b4, i32 %22]; i32* %15 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %13]; store i32* %15 with i32 %14; ubr ^while.body2; ^b1: - i32 %16 = phi [^b, i32 0] [^b4, i32 %39]; + i32 %16 = phi [^b, i32 0] [^b4, i32 %102]; i32 %17 = call () -> i32 @getch(); i1 %18 = icmp neq i32 %17, i32 105; cbr i1 %18(prob = 0.5), ^if.then, ^b2; @@ -62,152 +62,278 @@ func @main() -> i32 { NoRecurse Entry } { i32 %34 = call () -> i32 @getch(); cbr i1 %33(prob = 0.984615), ^while.body3, ^if.then; ^b3: - i32 %35 = phi [^if.then, i32 0] [^b3, i32 %37]; + i32 %35 = phi [^if.then, i32 0] [^b3, i32 %100]; i32* %36 = getelementptr &(i32* %21)[i32 %35]; store i32* %36 with i32 0; - i32 %37 = add i32 %35, i32 1; - i1 %38 = icmp slt i32 %37, i32 512; - cbr i1 %38(prob = 0.998047), ^b3, ^b5; + i32* %37 = getelementptr &(i32* %36)[i64 1]; + store i32* %37 with i32 0; + i32* %38 = getelementptr &(i32* %36)[i64 2]; + store i32* %38 with i32 0; + i32* %39 = getelementptr &(i32* %36)[i64 3]; + store i32* %39 with i32 0; + i32* %40 = getelementptr &(i32* %36)[i64 4]; + store i32* %40 with i32 0; + i32* %41 = getelementptr &(i32* %36)[i64 5]; + store i32* %41 with i32 0; + i32* %42 = getelementptr &(i32* %36)[i64 6]; + store i32* %42 with i32 0; + i32* %43 = getelementptr &(i32* %36)[i64 7]; + store i32* %43 with i32 0; + i32* %44 = getelementptr &(i32* %36)[i64 8]; + store i32* %44 with i32 0; + i32* %45 = getelementptr &(i32* %36)[i64 9]; + store i32* %45 with i32 0; + i32* %46 = getelementptr &(i32* %36)[i64 10]; + store i32* %46 with i32 0; + i32* %47 = getelementptr &(i32* %36)[i64 11]; + store i32* %47 with i32 0; + i32* %48 = getelementptr &(i32* %36)[i64 12]; + store i32* %48 with i32 0; + i32* %49 = getelementptr &(i32* %36)[i64 13]; + store i32* %49 with i32 0; + i32* %50 = getelementptr &(i32* %36)[i64 14]; + store i32* %50 with i32 0; + i32* %51 = getelementptr &(i32* %36)[i64 15]; + store i32* %51 with i32 0; + i32* %52 = getelementptr &(i32* %36)[i64 16]; + store i32* %52 with i32 0; + i32* %53 = getelementptr &(i32* %36)[i64 17]; + store i32* %53 with i32 0; + i32* %54 = getelementptr &(i32* %36)[i64 18]; + store i32* %54 with i32 0; + i32* %55 = getelementptr &(i32* %36)[i64 19]; + store i32* %55 with i32 0; + i32* %56 = getelementptr &(i32* %36)[i64 20]; + store i32* %56 with i32 0; + i32* %57 = getelementptr &(i32* %36)[i64 21]; + store i32* %57 with i32 0; + i32* %58 = getelementptr &(i32* %36)[i64 22]; + store i32* %58 with i32 0; + i32* %59 = getelementptr &(i32* %36)[i64 23]; + store i32* %59 with i32 0; + i32* %60 = getelementptr &(i32* %36)[i64 24]; + store i32* %60 with i32 
0; + i32* %61 = getelementptr &(i32* %36)[i64 25]; + store i32* %61 with i32 0; + i32* %62 = getelementptr &(i32* %36)[i64 26]; + store i32* %62 with i32 0; + i32* %63 = getelementptr &(i32* %36)[i64 27]; + store i32* %63 with i32 0; + i32* %64 = getelementptr &(i32* %36)[i64 28]; + store i32* %64 with i32 0; + i32* %65 = getelementptr &(i32* %36)[i64 29]; + store i32* %65 with i32 0; + i32* %66 = getelementptr &(i32* %36)[i64 30]; + store i32* %66 with i32 0; + i32* %67 = getelementptr &(i32* %36)[i64 31]; + store i32* %67 with i32 0; + i32* %68 = getelementptr &(i32* %36)[i64 32]; + store i32* %68 with i32 0; + i32* %69 = getelementptr &(i32* %36)[i64 33]; + store i32* %69 with i32 0; + i32* %70 = getelementptr &(i32* %36)[i64 34]; + store i32* %70 with i32 0; + i32* %71 = getelementptr &(i32* %36)[i64 35]; + store i32* %71 with i32 0; + i32* %72 = getelementptr &(i32* %36)[i64 36]; + store i32* %72 with i32 0; + i32* %73 = getelementptr &(i32* %36)[i64 37]; + store i32* %73 with i32 0; + i32* %74 = getelementptr &(i32* %36)[i64 38]; + store i32* %74 with i32 0; + i32* %75 = getelementptr &(i32* %36)[i64 39]; + store i32* %75 with i32 0; + i32* %76 = getelementptr &(i32* %36)[i64 40]; + store i32* %76 with i32 0; + i32* %77 = getelementptr &(i32* %36)[i64 41]; + store i32* %77 with i32 0; + i32* %78 = getelementptr &(i32* %36)[i64 42]; + store i32* %78 with i32 0; + i32* %79 = getelementptr &(i32* %36)[i64 43]; + store i32* %79 with i32 0; + i32* %80 = getelementptr &(i32* %36)[i64 44]; + store i32* %80 with i32 0; + i32* %81 = getelementptr &(i32* %36)[i64 45]; + store i32* %81 with i32 0; + i32* %82 = getelementptr &(i32* %36)[i64 46]; + store i32* %82 with i32 0; + i32* %83 = getelementptr &(i32* %36)[i64 47]; + store i32* %83 with i32 0; + i32* %84 = getelementptr &(i32* %36)[i64 48]; + store i32* %84 with i32 0; + i32* %85 = getelementptr &(i32* %36)[i64 49]; + store i32* %85 with i32 0; + i32* %86 = getelementptr &(i32* %36)[i64 50]; + store i32* %86 with i32 0; + i32* %87 = getelementptr &(i32* %36)[i64 51]; + store i32* %87 with i32 0; + i32* %88 = getelementptr &(i32* %36)[i64 52]; + store i32* %88 with i32 0; + i32* %89 = getelementptr &(i32* %36)[i64 53]; + store i32* %89 with i32 0; + i32* %90 = getelementptr &(i32* %36)[i64 54]; + store i32* %90 with i32 0; + i32* %91 = getelementptr &(i32* %36)[i64 55]; + store i32* %91 with i32 0; + i32* %92 = getelementptr &(i32* %36)[i64 56]; + store i32* %92 with i32 0; + i32* %93 = getelementptr &(i32* %36)[i64 57]; + store i32* %93 with i32 0; + i32* %94 = getelementptr &(i32* %36)[i64 58]; + store i32* %94 with i32 0; + i32* %95 = getelementptr &(i32* %36)[i64 59]; + store i32* %95 with i32 0; + i32* %96 = getelementptr &(i32* %36)[i64 60]; + store i32* %96 with i32 0; + i32* %97 = getelementptr &(i32* %36)[i64 61]; + store i32* %97 with i32 0; + i32* %98 = getelementptr &(i32* %36)[i64 62]; + store i32* %98 with i32 0; + i32* %99 = getelementptr &(i32* %36)[i64 63]; + store i32* %99 with i32 0; + i32 %100 = add i32 %35, i32 64; + i1 %101 = icmp slt i32 %100, i32 512; + cbr i1 %101(prob = 0.875), ^b3, ^b5; ^b4: - i32 %39 = add i32 %13, i32 1; - i1 %40 = icmp neq i32 %22, i32 35; - cbr i1 %40(prob = 0.984615), ^while.body1, ^b1; + i32 %102 = add i32 %13, i32 1; + i1 %103 = icmp neq i32 %22, i32 35; + cbr i1 %103(prob = 0.984615), ^while.body1, ^b1; ^while.body3: - i32 %41 = phi [^b2, i32 0] [^while.body3, i32 %44]; - i32 %42 = call () -> i32 @getch(); - i32* %43 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %41]; - store i32* %43 
with i32 %42; - i32 %44 = add i32 %41, i32 1; - i1 %45 = icmp sgt i32 %32, i32 %44; - cbr i1 %45(prob = 0.984615), ^while.body3, ^if.then; + i32 %104 = phi [^b2, i32 0] [^while.body3, i32 %107]; + i32 %105 = call () -> i32 @getch(); + i32* %106 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %104]; + store i32* %106 with i32 %105; + i32 %107 = add i32 %104, i32 1; + i1 %108 = icmp sgt i32 %32, i32 %107; + cbr i1 %108(prob = 0.984615), ^while.body3, ^if.then; ^b5: - i1 %46 = icmp sgt i32 %16, i32 0; - [65536 * i32]* %47 = ptrcast [65536 * i32]* @tape to [65536 * i32]*; - [65536 * i32]* %48 = ptrcast [65536 * i32]* @output to [65536 * i32]*; - cbr i1 %46(prob = 0.984615), ^while.body4, ^b6; + i1 %109 = icmp sgt i32 %16, i32 0; + [65536 * i32]* %110 = ptrcast [65536 * i32]* @tape to [65536 * i32]*; + [65536 * i32]* %111 = ptrcast [65536 * i32]* @output to [65536 * i32]*; + cbr i1 %109(prob = 0.984615), ^while.body4, ^b6; ^while.body4: - i32 %49 = phi [^b5, i32 0] [^b8, i32 %67]; - i32 %50 = phi [^b5, i32 0] [^b8, i32 %72]; - i32 %51 = phi [^b5, i32 0] [^b8, i32 %70]; - i32 %52 = phi [^b5, i32 0] [^b8, i32 %69]; - i32 %53 = phi [^b5, i32 0] [^b8, i32 %68]; - i32* %54 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %50]; - i32 %55 = load i32* %54; - i1 %56 = icmp eq i32 %55, i32 62; - cbr i1 %56(prob = 0.5), ^if.then1, ^if.else; + i32 %112 = phi [^b5, i32 0] [^b8, i32 %130]; + i32 %113 = phi [^b5, i32 0] [^b8, i32 %135]; + i32 %114 = phi [^b5, i32 0] [^b8, i32 %133]; + i32 %115 = phi [^b5, i32 0] [^b8, i32 %132]; + i32 %116 = phi [^b5, i32 0] [^b8, i32 %131]; + i32* %117 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %113]; + i32 %118 = load i32* %117; + i1 %119 = icmp eq i32 %118, i32 62; + cbr i1 %119(prob = 0.5), ^if.then1, ^if.else; ^b6: - i32 %57 = phi [^b5, i32 0] [^b8, i32 %67]; - i1 %58 = icmp sgt i32 %57, i32 0; + i32 %120 = phi [^b5, i32 0] [^b8, i32 %130]; + i1 %121 = icmp sgt i32 %120, i32 0; call (i32) -> void @stoptime(i32 118); - cbr i1 %58(prob = 0.984615), ^while.body5, ^b7; + cbr i1 %121(prob = 0.984615), ^while.body5, ^b7; ^if.then1: - i32 %59 = add i32 %51, i32 1; + i32 %122 = add i32 %114, i32 1; ubr ^b8; ^if.else: - i1 %60 = icmp eq i32 %55, i32 60; - cbr i1 %60(prob = 0.5), ^if.then2, ^if.else1; + i1 %123 = icmp eq i32 %118, i32 60; + cbr i1 %123(prob = 0.5), ^if.then2, ^if.else1; ^while.body5: - i32 %61 = phi [^b6, i32 0] [^while.body5, i32 %64]; - i32* %62 = getelementptr &([65536 * i32]* %48)[i64 0][i32 %61]; - i32 %63 = load i32* %62; - call (i32) -> void @putch(i32 %63); - i32 %64 = add i32 %61, i32 1; - i1 %65 = icmp sgt i32 %57, i32 %64; - cbr i1 %65(prob = 0.984615), ^while.body5, ^b7; + i32 %124 = phi [^b6, i32 0] [^while.body5, i32 %127]; + i32* %125 = getelementptr &([65536 * i32]* %111)[i64 0][i32 %124]; + i32 %126 = load i32* %125; + call (i32) -> void @putch(i32 %126); + i32 %127 = add i32 %124, i32 1; + i1 %128 = icmp sgt i32 %120, i32 %127; + cbr i1 %128(prob = 0.984615), ^while.body5, ^b7; ^b7: ret i32 0; ^if.then2: - i32 %66 = add i32 %51, i32 -1; + i32 %129 = add i32 %114, i32 -1; ubr ^b8; ^b8: - i32 %67 = phi [^if.then1, i32 %49] [^if.then2, i32 %49] [^if.then3, i32 %49] [^if.then4, i32 %49] [^if.then6, i32 %49] [^if.then7, i32 %49] [^while.body6, i32 %49] [^if.else6, i32 %49] [^if.then8, i32 %109] [^if.else7, i32 %49] [^if.then10, i32 %49] [^if.else8, i32 %49]; - i32 %68 = phi [^if.then1, i32 %53] [^if.then2, i32 %53] [^if.then3, i32 %53] [^if.then4, i32 %53] [^if.then6, i32 %53] [^if.then7, i32 %53] [^while.body6, i32 %53] [^if.else6, i32 %53] 
[^if.then8, i32 %53] [^if.else7, i32 %53] [^if.then10, i32 %53] [^if.else8, i32 %114]; - i32 %69 = phi [^if.then1, i32 %52] [^if.then2, i32 %52] [^if.then3, i32 %52] [^if.then4, i32 %52] [^if.then6, i32 %86] [^if.then7, i32 %89] [^while.body6, i32 %52] [^if.else6, i32 %52] [^if.then8, i32 %52] [^if.else7, i32 %52] [^if.then10, i32 %52] [^if.else8, i32 %52]; - i32 %70 = phi [^if.then1, i32 %59] [^if.then2, i32 %66] [^if.then3, i32 %51] [^if.then4, i32 %51] [^if.then6, i32 %51] [^if.then7, i32 %51] [^while.body6, i32 %51] [^if.else6, i32 %51] [^if.then8, i32 %51] [^if.else7, i32 %51] [^if.then10, i32 %51] [^if.else8, i32 %51]; - i32 %71 = phi [^if.then1, i32 %50] [^if.then2, i32 %50] [^if.then3, i32 %50] [^if.then4, i32 %50] [^if.then6, i32 %50] [^if.then7, i32 %50] [^while.body6, i32 %92] [^if.else6, i32 %106] [^if.then8, i32 %50] [^if.else7, i32 %50] [^if.then10, i32 %50] [^if.else8, i32 %50]; - i32 %72 = add i32 %71, i32 1; - i1 %73 = icmp sgt i32 %16, i32 %72; - cbr i1 %73(prob = 0.984615), ^while.body4, ^b6; + i32 %130 = phi [^if.then1, i32 %112] [^if.then2, i32 %112] [^if.then3, i32 %112] [^if.then4, i32 %112] [^if.then6, i32 %112] [^if.then7, i32 %112] [^while.body6, i32 %112] [^if.else6, i32 %112] [^if.then8, i32 %172] [^if.else7, i32 %112] [^if.then10, i32 %112] [^if.else8, i32 %112]; + i32 %131 = phi [^if.then1, i32 %116] [^if.then2, i32 %116] [^if.then3, i32 %116] [^if.then4, i32 %116] [^if.then6, i32 %116] [^if.then7, i32 %116] [^while.body6, i32 %116] [^if.else6, i32 %116] [^if.then8, i32 %116] [^if.else7, i32 %116] [^if.then10, i32 %116] [^if.else8, i32 %177]; + i32 %132 = phi [^if.then1, i32 %115] [^if.then2, i32 %115] [^if.then3, i32 %115] [^if.then4, i32 %115] [^if.then6, i32 %149] [^if.then7, i32 %152] [^while.body6, i32 %115] [^if.else6, i32 %115] [^if.then8, i32 %115] [^if.else7, i32 %115] [^if.then10, i32 %115] [^if.else8, i32 %115]; + i32 %133 = phi [^if.then1, i32 %122] [^if.then2, i32 %129] [^if.then3, i32 %114] [^if.then4, i32 %114] [^if.then6, i32 %114] [^if.then7, i32 %114] [^while.body6, i32 %114] [^if.else6, i32 %114] [^if.then8, i32 %114] [^if.else7, i32 %114] [^if.then10, i32 %114] [^if.else8, i32 %114]; + i32 %134 = phi [^if.then1, i32 %113] [^if.then2, i32 %113] [^if.then3, i32 %113] [^if.then4, i32 %113] [^if.then6, i32 %113] [^if.then7, i32 %113] [^while.body6, i32 %155] [^if.else6, i32 %169] [^if.then8, i32 %113] [^if.else7, i32 %113] [^if.then10, i32 %113] [^if.else8, i32 %113]; + i32 %135 = add i32 %134, i32 1; + i1 %136 = icmp sgt i32 %16, i32 %135; + cbr i1 %136(prob = 0.984615), ^while.body4, ^b6; ^if.else1: - i32* %74 = getelementptr &([65536 * i32]* %47)[i64 0][i32 %51]; - i1 %75 = icmp eq i32 %55, i32 43; - cbr i1 %75(prob = 0.5), ^if.then3, ^if.else2; + i32* %137 = getelementptr &([65536 * i32]* %110)[i64 0][i32 %114]; + i1 %138 = icmp eq i32 %118, i32 43; + cbr i1 %138(prob = 0.5), ^if.then3, ^if.else2; ^if.then3: - i32 %76 = load i32* %74; - i32 %77 = add i32 %76, i32 1; - store i32* %74 with i32 %77; + i32 %139 = load i32* %137; + i32 %140 = add i32 %139, i32 1; + store i32* %137 with i32 %140; ubr ^b8; ^if.else2: - i1 %78 = icmp eq i32 %55, i32 45; - cbr i1 %78(prob = 0.5), ^if.then4, ^if.else3; + i1 %141 = icmp eq i32 %118, i32 45; + cbr i1 %141(prob = 0.5), ^if.then4, ^if.else3; ^if.then4: - i32 %79 = load i32* %74; - i32 %80 = add i32 %79, i32 -1; - store i32* %74 with i32 %80; + i32 %142 = load i32* %137; + i32 %143 = add i32 %142, i32 -1; + store i32* %137 with i32 %143; ubr ^b8; ^if.else3: - i1 %81 = icmp eq i32 %55, i32 91; - cbr i1 
%81(prob = 0.5), ^if.then5, ^if.else4; + i1 %144 = icmp eq i32 %118, i32 91; + cbr i1 %144(prob = 0.5), ^if.then5, ^if.else4; ^if.then5: - i32 %82 = load i32* %74; - i1 %83 = icmp neq i32 %82, i32 0; - cbr i1 %83(prob = 0.5), ^if.then6, ^while.body6; + i32 %145 = load i32* %137; + i1 %146 = icmp neq i32 %145, i32 0; + cbr i1 %146(prob = 0.5), ^if.then6, ^while.body6; ^if.else4: - i1 %84 = icmp eq i32 %55, i32 93; - cbr i1 %84(prob = 0.5), ^if.then7, ^if.else5; + i1 %147 = icmp eq i32 %118, i32 93; + cbr i1 %147(prob = 0.5), ^if.then7, ^if.else5; ^if.then6: - i32* %85 = getelementptr &([512 * i32]* %20)[i64 0][i32 %52]; - store i32* %85 with i32 %50; - i32 %86 = add i32 %52, i32 1; + i32* %148 = getelementptr &([512 * i32]* %20)[i64 0][i32 %115]; + store i32* %148 with i32 %113; + i32 %149 = add i32 %115, i32 1; ubr ^b8; ^if.then7: - i32 %87 = load i32* %74; - i1 %88 = icmp eq i32 %87, i32 0; - i32 %89 = add i32 %52, i32 -1; - cbr i1 %88(prob = 0.5), ^b8, ^if.else6; + i32 %150 = load i32* %137; + i1 %151 = icmp eq i32 %150, i32 0; + i32 %152 = add i32 %115, i32 -1; + cbr i1 %151(prob = 0.5), ^b8, ^if.else6; ^while.body6: - i32 %90 = phi [^if.then5, i32 1] [^while.body6, i32 %101]; - i32 %91 = phi [^if.then5, i32 %50] [^while.body6, i32 %92]; - i32 %92 = add i32 %91, i32 1; - i32* %93 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %92]; - i32 %94 = load i32* %93; - i1 %95 = icmp eq i32 %94, i32 93; - i32 %96 = zext i1 %95 to i32; - i32 %97 = sub i32 %90, i32 %96; - i32 %98 = add i32 %97, i32 1; - i1 %99 = icmp neq i32 %94, i32 91; - i32 %100 = zext i1 %99 to i32; - i32 %101 = sub i32 %98, i32 %100; - i1 %102 = icmp sgt i32 %101, i32 0; - cbr i1 %102(prob = 0.984615), ^while.body6, ^b8; + i32 %153 = phi [^if.then5, i32 1] [^while.body6, i32 %164]; + i32 %154 = phi [^if.then5, i32 %113] [^while.body6, i32 %155]; + i32 %155 = add i32 %154, i32 1; + i32* %156 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %155]; + i32 %157 = load i32* %156; + i1 %158 = icmp eq i32 %157, i32 93; + i32 %159 = zext i1 %158 to i32; + i32 %160 = sub i32 %153, i32 %159; + i32 %161 = add i32 %160, i32 1; + i1 %162 = icmp neq i32 %157, i32 91; + i32 %163 = zext i1 %162 to i32; + i32 %164 = sub i32 %161, i32 %163; + i1 %165 = icmp sgt i32 %164, i32 0; + cbr i1 %165(prob = 0.984615), ^while.body6, ^b8; ^if.else5: - i1 %103 = icmp eq i32 %55, i32 46; - cbr i1 %103(prob = 0.5), ^if.then8, ^if.else7; + i1 %166 = icmp eq i32 %118, i32 46; + cbr i1 %166(prob = 0.5), ^if.then8, ^if.else7; ^if.else6: - i32* %104 = getelementptr &([512 * i32]* %20)[i64 0][i32 %52]; - i32* %105 = getelementptr &(i32* %104)[i64 -1]; - i32 %106 = load i32* %105; + i32* %167 = getelementptr &([512 * i32]* %20)[i64 0][i32 %115]; + i32* %168 = getelementptr &(i32* %167)[i64 -1]; + i32 %169 = load i32* %168; ubr ^b8; ^if.then8: - i32* %107 = getelementptr &([65536 * i32]* %48)[i64 0][i32 %49]; - i32 %108 = load i32* %74; - store i32* %107 with i32 %108; - i32 %109 = add i32 %49, i32 1; + i32* %170 = getelementptr &([65536 * i32]* %111)[i64 0][i32 %112]; + i32 %171 = load i32* %137; + store i32* %170 with i32 %171; + i32 %172 = add i32 %112, i32 1; ubr ^b8; ^if.else7: - i1 %110 = icmp eq i32 %55, i32 44; - cbr i1 %110(prob = 0.5), ^if.then9, ^b8; + i1 %173 = icmp eq i32 %118, i32 44; + cbr i1 %173(prob = 0.5), ^if.then9, ^b8; ^if.then9: - i1 %111 = icmp sle i32 %19, i32 %53; - cbr i1 %111(prob = 0.5), ^if.then10, ^if.else8; + i1 %174 = icmp sle i32 %19, i32 %116; + cbr i1 %174(prob = 0.5), ^if.then10, ^if.else8; ^if.then10: - store i32* %74 with i32 
0; + store i32* %137 with i32 0; ubr ^b8; ^if.else8: - i32* %112 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %53]; - i32 %113 = load i32* %112; - store i32* %74 with i32 %113; - i32 %114 = add i32 %53, i32 1; + i32* %175 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %116]; + i32 %176 = load i32* %175; + store i32* %137 with i32 %176; + i32 %177 = add i32 %116, i32 1; ubr ^b8; } internal [512 * i32]* @return_a, align 8 { Flexible }; diff --git a/tests/SysY2022/performance/brainfuck-pi-nerf.arm.s b/tests/SysY2022/performance/brainfuck-pi-nerf.arm.s index d3f9c2fb3..fd9d3afce 100644 --- a/tests/SysY2022/performance/brainfuck-pi-nerf.arm.s +++ b/tests/SysY2022/performance/brainfuck-pi-nerf.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 program: .zero 262144 -.align 8 +.p2align 3 tape: .zero 262144 -.align 8 +.p2align 3 input: .zero 262144 -.align 8 +.p2align 3 output: .zero 262144 -.align 8 +.p2align 3 return_a: .zero 2048 .text diff --git a/tests/SysY2022/performance/brainfuck-pi-nerf.riscv.s b/tests/SysY2022/performance/brainfuck-pi-nerf.riscv.s index a00802beb..2cf74a791 100644 --- a/tests/SysY2022/performance/brainfuck-pi-nerf.riscv.s +++ b/tests/SysY2022/performance/brainfuck-pi-nerf.riscv.s @@ -1,273 +1,311 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 program: .zero 262144 -.align 8 +.p2align 3 tape: .zero 262144 -.align 8 +.p2align 3 input: .zero 262144 -.align 8 +.p2align 3 output: .zero 262144 -.align 8 +.p2align 3 return_a: .zero 2048 .text .p2align 2 .globl main main: - addi sp, sp, -80 + addi sp, sp, -104 sd ra, 0(sp) sd s0, 8(sp) li s0, 1 sd s5, 16(sp) - li s5, 360287970357415681 - sd s1, 24(sp) + li s5, 45 + sd s8, 24(sp) + li s8, 360287970357415681 + sd s1, 32(sp) li s1, 93 - sd s6, 32(sp) - sd s4, 40(sp) - li s4, 512 - sd s3, 48(sp) + sd s6, 40(sp) + sd s9, 48(sp) + li s9, 35 sd s2, 56(sp) - sd s7, 64(sp) - sd s8, 72(sp) + li s2, 62 + sd s3, 64(sp) + li s3, 60 + sd s4, 72(sp) + li s4, 43 + sd s7, 80(sp) + sd s10, 88(sp) + sd s11, 96(sp) .p2align 2 label2: jal getch addiw a1, a0, -35 - slti a4, a1, 0 - sll a3, s0, a1 - and a5, a3, s5 - sltiu a2, a5, 1 - slt a5, s1, a0 - or a3, a2, a4 - or a1, a3, a5 + slti a5, a1, 0 + sll a2, s0, a1 + and a4, a2, s8 + sltiu a3, a4, 1 + slt a4, s1, a0 + or a2, a3, a5 + or a1, a2, a4 bne a1, zero, label2 -pcrel340: +pcrel479: auipc a1, %pcrel_hi(input) -pcrel341: +pcrel480: auipc a2, %pcrel_hi(program) - addi s3, a1, %pcrel_lo(pcrel340) - addi s2, a2, %pcrel_lo(pcrel341) - li a1, 35 - beq a0, a1, label87 - mv s6, s2 - mv s7, zero - sw a0, 0(s2) + addi s7, a1, %pcrel_lo(pcrel479) + addi s6, a2, %pcrel_lo(pcrel480) + beq a0, s9, label87 + mv s10, s6 + mv s11, zero + sw a0, 0(s6) j label66 .p2align 2 label68: - addi s6, s6, 4 - sw a0, 0(s6) + addi s10, s10, 4 + sw a0, 0(s10) .p2align 2 label66: jal getch addiw a1, a0, -35 slti a5, a1, 0 sll a2, s0, a1 - and a4, a2, s5 + and a4, a2, s8 sltiu a3, a4, 1 slt a4, s1, a0 or a2, a3, a5 or a1, a2, a4 bne a1, zero, label66 - addiw s7, s7, 1 - li a1, 35 - bne a0, a1, label68 - mv s5, s7 + addiw s11, s11, 1 + bne a0, s9, label68 + mv s8, s11 label4: jal getch li a1, 105 - bne a0, a1, label93 - jal getint - mv s6, a0 - jal getch - ble s6, zero, label12 - mv s7, s3 - mv s8, zero -.p2align 2 -label8: - jal getch - addiw s8, s8, 1 - sw a0, 0(s7) - ble s6, s8, label12 - addi s7, s7, 4 - j label8 -label93: - mv s6, zero + beq a0, a1, label6 + mv s9, zero label12: li a0, 116 jal _sysy_starttime mv a2, zero 
-pcrel342: +pcrel481: auipc a3, %pcrel_hi(return_a) - addi a0, a3, %pcrel_lo(pcrel342) + addi a0, a3, %pcrel_lo(pcrel481) mv a1, a0 - j label14 -.p2align 2 -label17: - addi a1, a1, 4 .p2align 2 label14: - addiw a2, a2, 1 - sw zero, 0(a1) - blt a2, s4, label17 -pcrel343: - auipc a3, %pcrel_hi(tape) -pcrel344: - auipc a2, %pcrel_hi(output) - addi a1, a3, %pcrel_lo(pcrel343) - addi s4, a2, %pcrel_lo(pcrel344) - ble s5, zero, label121 + sd zero, 0(a1) + addiw a2, a2, 64 + li a3, 512 + sd zero, 8(a1) + sd zero, 16(a1) + sd zero, 24(a1) + sd zero, 32(a1) + sd zero, 40(a1) + sd zero, 48(a1) + sd zero, 56(a1) + sd zero, 64(a1) + sd zero, 72(a1) + sd zero, 80(a1) + sd zero, 88(a1) + sd zero, 96(a1) + sd zero, 104(a1) + sd zero, 112(a1) + sd zero, 120(a1) + sd zero, 128(a1) + sd zero, 136(a1) + sd zero, 144(a1) + sd zero, 152(a1) + sd zero, 160(a1) + sd zero, 168(a1) + sd zero, 176(a1) + sd zero, 184(a1) + sd zero, 192(a1) + sd zero, 200(a1) + sd zero, 208(a1) + sd zero, 216(a1) + sd zero, 224(a1) + sd zero, 232(a1) + sd zero, 240(a1) + sd zero, 248(a1) + bge a2, a3, label17 + addi a1, a1, 256 + j label14 +label17: + auipc a2, %pcrel_hi(tape) +pcrel482: + auipc a3, %pcrel_hi(output) + addi a1, a2, %pcrel_lo(label17) + addi s10, a3, %pcrel_lo(pcrel482) + ble s8, zero, label183 mv a2, zero mv t0, zero mv a4, zero mv a3, zero mv a5, zero - j label19 + j label18 .p2align 2 -label25: - li t2, 60 - bne t1, t2, label26 +label45: addiw a4, a4, -1 .p2align 2 label46: addiw t0, t0, 1 - ble s5, t0, label323 + ble s8, t0, label461 .p2align 2 -label19: - sh2add t2, t0, s2 - li t3, 62 +label18: + sh2add t2, t0, s6 lw t1, 0(t2) - bne t1, t3, label25 + bne t1, s2, label190 addiw a4, a4, 1 addiw t0, t0, 1 - bgt s5, t0, label19 -label323: + bgt s8, t0, label18 +label461: mv s0, a2 -label53: +label52: li a0, 118 jal _sysy_stoptime - ble s0, zero, label55 + ble s0, zero, label59 mv s1, zero - j label57 + j label55 .p2align 2 -label60: - addi s4, s4, 4 +label58: + addi s10, s10, 4 .p2align 2 -label57: - lw a0, 0(s4) +label55: + lw a0, 0(s10) jal putch addiw s1, s1, 1 - bgt s0, s1, label60 -label55: + bgt s0, s1, label58 +label59: mv a0, zero ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) - ld s1, 24(sp) - ld s6, 32(sp) - ld s4, 40(sp) - ld s3, 48(sp) + ld s8, 24(sp) + ld s1, 32(sp) + ld s6, 40(sp) + ld s9, 48(sp) ld s2, 56(sp) - ld s7, 64(sp) - ld s8, 72(sp) - addi sp, sp, 80 + ld s3, 64(sp) + ld s4, 72(sp) + ld s7, 80(sp) + ld s10, 88(sp) + ld s11, 96(sp) + addi sp, sp, 104 ret .p2align 2 -label26: - li t2, 43 - bne t1, t2, label27 - sh2add t1, a4, a1 - addiw t0, t0, 1 - lw t2, 0(t1) - addi t3, t2, 1 - sw t3, 0(t1) - bgt s5, t0, label19 - j label323 -.p2align 2 -label27: - li t2, 45 - bne t1, t2, label28 +label190: + beq t1, s3, label45 + beq t1, s4, label44 + bne t1, s5, label465 sh2add t2, a4, a1 addiw t0, t0, 1 - lw t3, 0(t2) - addi t1, t3, -1 - sw t1, 0(t2) - bgt s5, t0, label19 - j label323 -label28: + lw t1, 0(t2) + addi t3, t1, -1 + sw t3, 0(t2) + bgt s8, t0, label18 + j label461 +.p2align 2 +label465: li t2, 91 - bne t1, t2, label29 + beq t1, t2, label38 + beq t1, s1, label30 + li t2, 46 + beq t1, t2, label37 + li t2, 44 + beq t1, t2, label34 + j label46 +label38: sh2add t1, a4, a1 lw t3, 0(t1) - bne t3, zero, label39 + beq t3, zero, label256 + sh2add t1, a3, a0 + addiw a3, a3, 1 + sw t0, 0(t1) + j label46 +label256: mv t2, s0 mv t1, t0 .p2align 2 label40: addiw t1, t1, 1 - sh2add t3, t1, s2 + sh2add t3, t1, s6 lw t0, 0(t3) - xori t4, t0, 93 - sltiu t6, t4, 1 - xori t4, t0, 91 - subw t5, t2, t6 - sltu t6, zero, t4 - 
addi t3, t5, 1 + xori t5, t0, 93 + sltiu t6, t5, 1 + xori t5, t0, 91 + subw t4, t2, t6 + sltu t6, zero, t5 + addi t3, t4, 1 subw t2, t3, t6 bgt t2, zero, label40 mv t0, t1 j label46 -label29: - beq t1, s1, label36 - li t2, 46 - bne t1, t2, label32 - sh2add t2, a4, a1 - sh2add t3, a2, s4 - lw t1, 0(t2) - addiw a2, a2, 1 - sw t1, 0(t3) - j label46 -label32: - li t2, 44 - beq t1, t2, label33 - j label46 -label36: +label30: sh2add t3, a4, a1 addiw t1, a3, -1 lw t2, 0(t3) - beq t2, zero, label183 + beq t2, zero, label220 sh2add t1, a3, a0 lw t0, -4(t1) j label46 -label33: - bgt s6, a5, label166 +label6: + jal getint + mv s9, a0 + jal getch + ble s9, zero, label12 + mv s10, s7 + mv s11, zero + j label8 +.p2align 2 +label11: + addi s10, s10, 4 +.p2align 2 +label8: + jal getch + addiw s11, s11, 1 + sw a0, 0(s10) + bgt s9, s11, label11 + j label12 +label34: + bgt s9, a5, label235 sh2add t1, a4, a1 sw zero, 0(t1) j label46 -label39: - sh2add t1, a3, a0 - addiw a3, a3, 1 - sw t0, 0(t1) - j label46 -label183: - mv a3, t1 - j label46 -label166: - sh2add t3, a5, s3 +.p2align 2 +label44: + sh2add t1, a4, a1 + addiw t0, t0, 1 + lw t3, 0(t1) + addi t2, t3, 1 + sw t2, 0(t1) + bgt s8, t0, label18 + j label461 +label37: sh2add t2, a4, a1 + sh2add t3, a2, s10 + lw t1, 0(t2) + addiw a2, a2, 1 + sw t1, 0(t3) + j label46 +label235: + sh2add t2, a5, s7 + sh2add t3, a4, a1 addiw a5, a5, 1 - lw t1, 0(t3) - sw t1, 0(t2) + lw t1, 0(t2) + sw t1, 0(t3) + j label46 +label220: + mv a3, t1 j label46 +label183: + mv s0, zero + j label52 label87: - mv s5, zero + mv s8, zero j label4 -label121: - mv s0, zero - j label53 diff --git a/tests/SysY2022/performance/brainfuck-pi-nerf.sy.ir b/tests/SysY2022/performance/brainfuck-pi-nerf.sy.ir index 2650111ba..f059b0344 100644 --- a/tests/SysY2022/performance/brainfuck-pi-nerf.sy.ir +++ b/tests/SysY2022/performance/brainfuck-pi-nerf.sy.ir @@ -28,13 +28,13 @@ func @main() -> i32 { NoRecurse Entry } { [65536 * i32]* %12 = ptrcast [65536 * i32]* @program to [65536 * i32]*; cbr i1 %10(prob = 0.984615), ^while.body1, ^b1; ^while.body1: - i32 %13 = phi [^b, i32 0] [^b4, i32 %39]; + i32 %13 = phi [^b, i32 0] [^b4, i32 %102]; i32 %14 = phi [^b, i32 %0] [^b4, i32 %22]; i32* %15 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %13]; store i32* %15 with i32 %14; ubr ^while.body2; ^b1: - i32 %16 = phi [^b, i32 0] [^b4, i32 %39]; + i32 %16 = phi [^b, i32 0] [^b4, i32 %102]; i32 %17 = call () -> i32 @getch(); i1 %18 = icmp neq i32 %17, i32 105; cbr i1 %18(prob = 0.5), ^if.then, ^b2; @@ -62,152 +62,278 @@ func @main() -> i32 { NoRecurse Entry } { i32 %34 = call () -> i32 @getch(); cbr i1 %33(prob = 0.984615), ^while.body3, ^if.then; ^b3: - i32 %35 = phi [^if.then, i32 0] [^b3, i32 %37]; + i32 %35 = phi [^if.then, i32 0] [^b3, i32 %100]; i32* %36 = getelementptr &(i32* %21)[i32 %35]; store i32* %36 with i32 0; - i32 %37 = add i32 %35, i32 1; - i1 %38 = icmp slt i32 %37, i32 512; - cbr i1 %38(prob = 0.998047), ^b3, ^b5; + i32* %37 = getelementptr &(i32* %36)[i64 1]; + store i32* %37 with i32 0; + i32* %38 = getelementptr &(i32* %36)[i64 2]; + store i32* %38 with i32 0; + i32* %39 = getelementptr &(i32* %36)[i64 3]; + store i32* %39 with i32 0; + i32* %40 = getelementptr &(i32* %36)[i64 4]; + store i32* %40 with i32 0; + i32* %41 = getelementptr &(i32* %36)[i64 5]; + store i32* %41 with i32 0; + i32* %42 = getelementptr &(i32* %36)[i64 6]; + store i32* %42 with i32 0; + i32* %43 = getelementptr &(i32* %36)[i64 7]; + store i32* %43 with i32 0; + i32* %44 = getelementptr &(i32* %36)[i64 8]; + store i32* 
%44 with i32 0; + i32* %45 = getelementptr &(i32* %36)[i64 9]; + store i32* %45 with i32 0; + i32* %46 = getelementptr &(i32* %36)[i64 10]; + store i32* %46 with i32 0; + i32* %47 = getelementptr &(i32* %36)[i64 11]; + store i32* %47 with i32 0; + i32* %48 = getelementptr &(i32* %36)[i64 12]; + store i32* %48 with i32 0; + i32* %49 = getelementptr &(i32* %36)[i64 13]; + store i32* %49 with i32 0; + i32* %50 = getelementptr &(i32* %36)[i64 14]; + store i32* %50 with i32 0; + i32* %51 = getelementptr &(i32* %36)[i64 15]; + store i32* %51 with i32 0; + i32* %52 = getelementptr &(i32* %36)[i64 16]; + store i32* %52 with i32 0; + i32* %53 = getelementptr &(i32* %36)[i64 17]; + store i32* %53 with i32 0; + i32* %54 = getelementptr &(i32* %36)[i64 18]; + store i32* %54 with i32 0; + i32* %55 = getelementptr &(i32* %36)[i64 19]; + store i32* %55 with i32 0; + i32* %56 = getelementptr &(i32* %36)[i64 20]; + store i32* %56 with i32 0; + i32* %57 = getelementptr &(i32* %36)[i64 21]; + store i32* %57 with i32 0; + i32* %58 = getelementptr &(i32* %36)[i64 22]; + store i32* %58 with i32 0; + i32* %59 = getelementptr &(i32* %36)[i64 23]; + store i32* %59 with i32 0; + i32* %60 = getelementptr &(i32* %36)[i64 24]; + store i32* %60 with i32 0; + i32* %61 = getelementptr &(i32* %36)[i64 25]; + store i32* %61 with i32 0; + i32* %62 = getelementptr &(i32* %36)[i64 26]; + store i32* %62 with i32 0; + i32* %63 = getelementptr &(i32* %36)[i64 27]; + store i32* %63 with i32 0; + i32* %64 = getelementptr &(i32* %36)[i64 28]; + store i32* %64 with i32 0; + i32* %65 = getelementptr &(i32* %36)[i64 29]; + store i32* %65 with i32 0; + i32* %66 = getelementptr &(i32* %36)[i64 30]; + store i32* %66 with i32 0; + i32* %67 = getelementptr &(i32* %36)[i64 31]; + store i32* %67 with i32 0; + i32* %68 = getelementptr &(i32* %36)[i64 32]; + store i32* %68 with i32 0; + i32* %69 = getelementptr &(i32* %36)[i64 33]; + store i32* %69 with i32 0; + i32* %70 = getelementptr &(i32* %36)[i64 34]; + store i32* %70 with i32 0; + i32* %71 = getelementptr &(i32* %36)[i64 35]; + store i32* %71 with i32 0; + i32* %72 = getelementptr &(i32* %36)[i64 36]; + store i32* %72 with i32 0; + i32* %73 = getelementptr &(i32* %36)[i64 37]; + store i32* %73 with i32 0; + i32* %74 = getelementptr &(i32* %36)[i64 38]; + store i32* %74 with i32 0; + i32* %75 = getelementptr &(i32* %36)[i64 39]; + store i32* %75 with i32 0; + i32* %76 = getelementptr &(i32* %36)[i64 40]; + store i32* %76 with i32 0; + i32* %77 = getelementptr &(i32* %36)[i64 41]; + store i32* %77 with i32 0; + i32* %78 = getelementptr &(i32* %36)[i64 42]; + store i32* %78 with i32 0; + i32* %79 = getelementptr &(i32* %36)[i64 43]; + store i32* %79 with i32 0; + i32* %80 = getelementptr &(i32* %36)[i64 44]; + store i32* %80 with i32 0; + i32* %81 = getelementptr &(i32* %36)[i64 45]; + store i32* %81 with i32 0; + i32* %82 = getelementptr &(i32* %36)[i64 46]; + store i32* %82 with i32 0; + i32* %83 = getelementptr &(i32* %36)[i64 47]; + store i32* %83 with i32 0; + i32* %84 = getelementptr &(i32* %36)[i64 48]; + store i32* %84 with i32 0; + i32* %85 = getelementptr &(i32* %36)[i64 49]; + store i32* %85 with i32 0; + i32* %86 = getelementptr &(i32* %36)[i64 50]; + store i32* %86 with i32 0; + i32* %87 = getelementptr &(i32* %36)[i64 51]; + store i32* %87 with i32 0; + i32* %88 = getelementptr &(i32* %36)[i64 52]; + store i32* %88 with i32 0; + i32* %89 = getelementptr &(i32* %36)[i64 53]; + store i32* %89 with i32 0; + i32* %90 = getelementptr &(i32* %36)[i64 54]; + store i32* %90 with i32 
0; + i32* %91 = getelementptr &(i32* %36)[i64 55]; + store i32* %91 with i32 0; + i32* %92 = getelementptr &(i32* %36)[i64 56]; + store i32* %92 with i32 0; + i32* %93 = getelementptr &(i32* %36)[i64 57]; + store i32* %93 with i32 0; + i32* %94 = getelementptr &(i32* %36)[i64 58]; + store i32* %94 with i32 0; + i32* %95 = getelementptr &(i32* %36)[i64 59]; + store i32* %95 with i32 0; + i32* %96 = getelementptr &(i32* %36)[i64 60]; + store i32* %96 with i32 0; + i32* %97 = getelementptr &(i32* %36)[i64 61]; + store i32* %97 with i32 0; + i32* %98 = getelementptr &(i32* %36)[i64 62]; + store i32* %98 with i32 0; + i32* %99 = getelementptr &(i32* %36)[i64 63]; + store i32* %99 with i32 0; + i32 %100 = add i32 %35, i32 64; + i1 %101 = icmp slt i32 %100, i32 512; + cbr i1 %101(prob = 0.875), ^b3, ^b5; ^b4: - i32 %39 = add i32 %13, i32 1; - i1 %40 = icmp neq i32 %22, i32 35; - cbr i1 %40(prob = 0.984615), ^while.body1, ^b1; + i32 %102 = add i32 %13, i32 1; + i1 %103 = icmp neq i32 %22, i32 35; + cbr i1 %103(prob = 0.984615), ^while.body1, ^b1; ^while.body3: - i32 %41 = phi [^b2, i32 0] [^while.body3, i32 %44]; - i32 %42 = call () -> i32 @getch(); - i32* %43 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %41]; - store i32* %43 with i32 %42; - i32 %44 = add i32 %41, i32 1; - i1 %45 = icmp sgt i32 %32, i32 %44; - cbr i1 %45(prob = 0.984615), ^while.body3, ^if.then; + i32 %104 = phi [^b2, i32 0] [^while.body3, i32 %107]; + i32 %105 = call () -> i32 @getch(); + i32* %106 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %104]; + store i32* %106 with i32 %105; + i32 %107 = add i32 %104, i32 1; + i1 %108 = icmp sgt i32 %32, i32 %107; + cbr i1 %108(prob = 0.984615), ^while.body3, ^if.then; ^b5: - i1 %46 = icmp sgt i32 %16, i32 0; - [65536 * i32]* %47 = ptrcast [65536 * i32]* @tape to [65536 * i32]*; - [65536 * i32]* %48 = ptrcast [65536 * i32]* @output to [65536 * i32]*; - cbr i1 %46(prob = 0.984615), ^while.body4, ^b6; + i1 %109 = icmp sgt i32 %16, i32 0; + [65536 * i32]* %110 = ptrcast [65536 * i32]* @tape to [65536 * i32]*; + [65536 * i32]* %111 = ptrcast [65536 * i32]* @output to [65536 * i32]*; + cbr i1 %109(prob = 0.984615), ^while.body4, ^b6; ^while.body4: - i32 %49 = phi [^b5, i32 0] [^b8, i32 %67]; - i32 %50 = phi [^b5, i32 0] [^b8, i32 %72]; - i32 %51 = phi [^b5, i32 0] [^b8, i32 %70]; - i32 %52 = phi [^b5, i32 0] [^b8, i32 %69]; - i32 %53 = phi [^b5, i32 0] [^b8, i32 %68]; - i32* %54 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %50]; - i32 %55 = load i32* %54; - i1 %56 = icmp eq i32 %55, i32 62; - cbr i1 %56(prob = 0.5), ^if.then1, ^if.else; + i32 %112 = phi [^b5, i32 0] [^b8, i32 %130]; + i32 %113 = phi [^b5, i32 0] [^b8, i32 %135]; + i32 %114 = phi [^b5, i32 0] [^b8, i32 %133]; + i32 %115 = phi [^b5, i32 0] [^b8, i32 %132]; + i32 %116 = phi [^b5, i32 0] [^b8, i32 %131]; + i32* %117 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %113]; + i32 %118 = load i32* %117; + i1 %119 = icmp eq i32 %118, i32 62; + cbr i1 %119(prob = 0.5), ^if.then1, ^if.else; ^b6: - i32 %57 = phi [^b5, i32 0] [^b8, i32 %67]; - i1 %58 = icmp sgt i32 %57, i32 0; + i32 %120 = phi [^b5, i32 0] [^b8, i32 %130]; + i1 %121 = icmp sgt i32 %120, i32 0; call (i32) -> void @stoptime(i32 118); - cbr i1 %58(prob = 0.984615), ^while.body5, ^b7; + cbr i1 %121(prob = 0.984615), ^while.body5, ^b7; ^if.then1: - i32 %59 = add i32 %51, i32 1; + i32 %122 = add i32 %114, i32 1; ubr ^b8; ^if.else: - i1 %60 = icmp eq i32 %55, i32 60; - cbr i1 %60(prob = 0.5), ^if.then2, ^if.else1; + i1 %123 = icmp eq i32 %118, i32 60; + cbr i1 
%123(prob = 0.5), ^if.then2, ^if.else1; ^while.body5: - i32 %61 = phi [^b6, i32 0] [^while.body5, i32 %64]; - i32* %62 = getelementptr &([65536 * i32]* %48)[i64 0][i32 %61]; - i32 %63 = load i32* %62; - call (i32) -> void @putch(i32 %63); - i32 %64 = add i32 %61, i32 1; - i1 %65 = icmp sgt i32 %57, i32 %64; - cbr i1 %65(prob = 0.984615), ^while.body5, ^b7; + i32 %124 = phi [^b6, i32 0] [^while.body5, i32 %127]; + i32* %125 = getelementptr &([65536 * i32]* %111)[i64 0][i32 %124]; + i32 %126 = load i32* %125; + call (i32) -> void @putch(i32 %126); + i32 %127 = add i32 %124, i32 1; + i1 %128 = icmp sgt i32 %120, i32 %127; + cbr i1 %128(prob = 0.984615), ^while.body5, ^b7; ^b7: ret i32 0; ^if.then2: - i32 %66 = add i32 %51, i32 -1; + i32 %129 = add i32 %114, i32 -1; ubr ^b8; ^b8: - i32 %67 = phi [^if.then1, i32 %49] [^if.then2, i32 %49] [^if.then3, i32 %49] [^if.then4, i32 %49] [^if.then6, i32 %49] [^if.then7, i32 %49] [^while.body6, i32 %49] [^if.else6, i32 %49] [^if.then8, i32 %109] [^if.else7, i32 %49] [^if.then10, i32 %49] [^if.else8, i32 %49]; - i32 %68 = phi [^if.then1, i32 %53] [^if.then2, i32 %53] [^if.then3, i32 %53] [^if.then4, i32 %53] [^if.then6, i32 %53] [^if.then7, i32 %53] [^while.body6, i32 %53] [^if.else6, i32 %53] [^if.then8, i32 %53] [^if.else7, i32 %53] [^if.then10, i32 %53] [^if.else8, i32 %114]; - i32 %69 = phi [^if.then1, i32 %52] [^if.then2, i32 %52] [^if.then3, i32 %52] [^if.then4, i32 %52] [^if.then6, i32 %86] [^if.then7, i32 %89] [^while.body6, i32 %52] [^if.else6, i32 %52] [^if.then8, i32 %52] [^if.else7, i32 %52] [^if.then10, i32 %52] [^if.else8, i32 %52]; - i32 %70 = phi [^if.then1, i32 %59] [^if.then2, i32 %66] [^if.then3, i32 %51] [^if.then4, i32 %51] [^if.then6, i32 %51] [^if.then7, i32 %51] [^while.body6, i32 %51] [^if.else6, i32 %51] [^if.then8, i32 %51] [^if.else7, i32 %51] [^if.then10, i32 %51] [^if.else8, i32 %51]; - i32 %71 = phi [^if.then1, i32 %50] [^if.then2, i32 %50] [^if.then3, i32 %50] [^if.then4, i32 %50] [^if.then6, i32 %50] [^if.then7, i32 %50] [^while.body6, i32 %92] [^if.else6, i32 %106] [^if.then8, i32 %50] [^if.else7, i32 %50] [^if.then10, i32 %50] [^if.else8, i32 %50]; - i32 %72 = add i32 %71, i32 1; - i1 %73 = icmp sgt i32 %16, i32 %72; - cbr i1 %73(prob = 0.984615), ^while.body4, ^b6; + i32 %130 = phi [^if.then1, i32 %112] [^if.then2, i32 %112] [^if.then3, i32 %112] [^if.then4, i32 %112] [^if.then6, i32 %112] [^if.then7, i32 %112] [^while.body6, i32 %112] [^if.else6, i32 %112] [^if.then8, i32 %172] [^if.else7, i32 %112] [^if.then10, i32 %112] [^if.else8, i32 %112]; + i32 %131 = phi [^if.then1, i32 %116] [^if.then2, i32 %116] [^if.then3, i32 %116] [^if.then4, i32 %116] [^if.then6, i32 %116] [^if.then7, i32 %116] [^while.body6, i32 %116] [^if.else6, i32 %116] [^if.then8, i32 %116] [^if.else7, i32 %116] [^if.then10, i32 %116] [^if.else8, i32 %177]; + i32 %132 = phi [^if.then1, i32 %115] [^if.then2, i32 %115] [^if.then3, i32 %115] [^if.then4, i32 %115] [^if.then6, i32 %149] [^if.then7, i32 %152] [^while.body6, i32 %115] [^if.else6, i32 %115] [^if.then8, i32 %115] [^if.else7, i32 %115] [^if.then10, i32 %115] [^if.else8, i32 %115]; + i32 %133 = phi [^if.then1, i32 %122] [^if.then2, i32 %129] [^if.then3, i32 %114] [^if.then4, i32 %114] [^if.then6, i32 %114] [^if.then7, i32 %114] [^while.body6, i32 %114] [^if.else6, i32 %114] [^if.then8, i32 %114] [^if.else7, i32 %114] [^if.then10, i32 %114] [^if.else8, i32 %114]; + i32 %134 = phi [^if.then1, i32 %113] [^if.then2, i32 %113] [^if.then3, i32 %113] [^if.then4, i32 %113] [^if.then6, i32 %113] 
[^if.then7, i32 %113] [^while.body6, i32 %155] [^if.else6, i32 %169] [^if.then8, i32 %113] [^if.else7, i32 %113] [^if.then10, i32 %113] [^if.else8, i32 %113]; + i32 %135 = add i32 %134, i32 1; + i1 %136 = icmp sgt i32 %16, i32 %135; + cbr i1 %136(prob = 0.984615), ^while.body4, ^b6; ^if.else1: - i32* %74 = getelementptr &([65536 * i32]* %47)[i64 0][i32 %51]; - i1 %75 = icmp eq i32 %55, i32 43; - cbr i1 %75(prob = 0.5), ^if.then3, ^if.else2; + i32* %137 = getelementptr &([65536 * i32]* %110)[i64 0][i32 %114]; + i1 %138 = icmp eq i32 %118, i32 43; + cbr i1 %138(prob = 0.5), ^if.then3, ^if.else2; ^if.then3: - i32 %76 = load i32* %74; - i32 %77 = add i32 %76, i32 1; - store i32* %74 with i32 %77; + i32 %139 = load i32* %137; + i32 %140 = add i32 %139, i32 1; + store i32* %137 with i32 %140; ubr ^b8; ^if.else2: - i1 %78 = icmp eq i32 %55, i32 45; - cbr i1 %78(prob = 0.5), ^if.then4, ^if.else3; + i1 %141 = icmp eq i32 %118, i32 45; + cbr i1 %141(prob = 0.5), ^if.then4, ^if.else3; ^if.then4: - i32 %79 = load i32* %74; - i32 %80 = add i32 %79, i32 -1; - store i32* %74 with i32 %80; + i32 %142 = load i32* %137; + i32 %143 = add i32 %142, i32 -1; + store i32* %137 with i32 %143; ubr ^b8; ^if.else3: - i1 %81 = icmp eq i32 %55, i32 91; - cbr i1 %81(prob = 0.5), ^if.then5, ^if.else4; + i1 %144 = icmp eq i32 %118, i32 91; + cbr i1 %144(prob = 0.5), ^if.then5, ^if.else4; ^if.then5: - i32 %82 = load i32* %74; - i1 %83 = icmp neq i32 %82, i32 0; - cbr i1 %83(prob = 0.5), ^if.then6, ^while.body6; + i32 %145 = load i32* %137; + i1 %146 = icmp neq i32 %145, i32 0; + cbr i1 %146(prob = 0.5), ^if.then6, ^while.body6; ^if.else4: - i1 %84 = icmp eq i32 %55, i32 93; - cbr i1 %84(prob = 0.5), ^if.then7, ^if.else5; + i1 %147 = icmp eq i32 %118, i32 93; + cbr i1 %147(prob = 0.5), ^if.then7, ^if.else5; ^if.then6: - i32* %85 = getelementptr &([512 * i32]* %20)[i64 0][i32 %52]; - store i32* %85 with i32 %50; - i32 %86 = add i32 %52, i32 1; + i32* %148 = getelementptr &([512 * i32]* %20)[i64 0][i32 %115]; + store i32* %148 with i32 %113; + i32 %149 = add i32 %115, i32 1; ubr ^b8; ^if.then7: - i32 %87 = load i32* %74; - i1 %88 = icmp eq i32 %87, i32 0; - i32 %89 = add i32 %52, i32 -1; - cbr i1 %88(prob = 0.5), ^b8, ^if.else6; + i32 %150 = load i32* %137; + i1 %151 = icmp eq i32 %150, i32 0; + i32 %152 = add i32 %115, i32 -1; + cbr i1 %151(prob = 0.5), ^b8, ^if.else6; ^while.body6: - i32 %90 = phi [^if.then5, i32 1] [^while.body6, i32 %101]; - i32 %91 = phi [^if.then5, i32 %50] [^while.body6, i32 %92]; - i32 %92 = add i32 %91, i32 1; - i32* %93 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %92]; - i32 %94 = load i32* %93; - i1 %95 = icmp eq i32 %94, i32 93; - i32 %96 = zext i1 %95 to i32; - i32 %97 = sub i32 %90, i32 %96; - i32 %98 = add i32 %97, i32 1; - i1 %99 = icmp neq i32 %94, i32 91; - i32 %100 = zext i1 %99 to i32; - i32 %101 = sub i32 %98, i32 %100; - i1 %102 = icmp sgt i32 %101, i32 0; - cbr i1 %102(prob = 0.984615), ^while.body6, ^b8; + i32 %153 = phi [^if.then5, i32 1] [^while.body6, i32 %164]; + i32 %154 = phi [^if.then5, i32 %113] [^while.body6, i32 %155]; + i32 %155 = add i32 %154, i32 1; + i32* %156 = getelementptr &([65536 * i32]* %12)[i64 0][i32 %155]; + i32 %157 = load i32* %156; + i1 %158 = icmp eq i32 %157, i32 93; + i32 %159 = zext i1 %158 to i32; + i32 %160 = sub i32 %153, i32 %159; + i32 %161 = add i32 %160, i32 1; + i1 %162 = icmp neq i32 %157, i32 91; + i32 %163 = zext i1 %162 to i32; + i32 %164 = sub i32 %161, i32 %163; + i1 %165 = icmp sgt i32 %164, i32 0; + cbr i1 %165(prob = 0.984615), 
^while.body6, ^b8; ^if.else5: - i1 %103 = icmp eq i32 %55, i32 46; - cbr i1 %103(prob = 0.5), ^if.then8, ^if.else7; + i1 %166 = icmp eq i32 %118, i32 46; + cbr i1 %166(prob = 0.5), ^if.then8, ^if.else7; ^if.else6: - i32* %104 = getelementptr &([512 * i32]* %20)[i64 0][i32 %52]; - i32* %105 = getelementptr &(i32* %104)[i64 -1]; - i32 %106 = load i32* %105; + i32* %167 = getelementptr &([512 * i32]* %20)[i64 0][i32 %115]; + i32* %168 = getelementptr &(i32* %167)[i64 -1]; + i32 %169 = load i32* %168; ubr ^b8; ^if.then8: - i32* %107 = getelementptr &([65536 * i32]* %48)[i64 0][i32 %49]; - i32 %108 = load i32* %74; - store i32* %107 with i32 %108; - i32 %109 = add i32 %49, i32 1; + i32* %170 = getelementptr &([65536 * i32]* %111)[i64 0][i32 %112]; + i32 %171 = load i32* %137; + store i32* %170 with i32 %171; + i32 %172 = add i32 %112, i32 1; ubr ^b8; ^if.else7: - i1 %110 = icmp eq i32 %55, i32 44; - cbr i1 %110(prob = 0.5), ^if.then9, ^b8; + i1 %173 = icmp eq i32 %118, i32 44; + cbr i1 %173(prob = 0.5), ^if.then9, ^b8; ^if.then9: - i1 %111 = icmp sle i32 %19, i32 %53; - cbr i1 %111(prob = 0.5), ^if.then10, ^if.else8; + i1 %174 = icmp sle i32 %19, i32 %116; + cbr i1 %174(prob = 0.5), ^if.then10, ^if.else8; ^if.then10: - store i32* %74 with i32 0; + store i32* %137 with i32 0; ubr ^b8; ^if.else8: - i32* %112 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %53]; - i32 %113 = load i32* %112; - store i32* %74 with i32 %113; - i32 %114 = add i32 %53, i32 1; + i32* %175 = getelementptr &([65536 * i32]* %11)[i64 0][i32 %116]; + i32 %176 = load i32* %175; + store i32* %137 with i32 %176; + i32 %177 = add i32 %116, i32 1; ubr ^b8; } internal [512 * i32]* @return_a, align 8 { Flexible }; diff --git a/tests/SysY2022/performance/conv0.arm.s b/tests/SysY2022/performance/conv0.arm.s index da64b8f36..94f9cc027 100644 --- a/tests/SysY2022/performance/conv0.arm.s +++ b/tests/SysY2022/performance/conv0.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .section .rodata -.align 8 +.p2align 3 __cmmc_jumptable1312: .word label1287-__cmmc_jumptable1312 .word label1286-__cmmc_jumptable1312 @@ -9,34 +9,34 @@ __cmmc_jumptable1312: .word label1284-__cmmc_jumptable1312 .word label1283-__cmmc_jumptable1312 .bss -.align 8 +.p2align 3 a: .zero 40000000 -.align 8 +.p2align 3 b: .zero 40000000 -.align 8 +.p2align 3 kernelid: .zero 40000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_4: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_5: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_6: .zero 16 .text diff --git a/tests/SysY2022/performance/conv0.riscv.s b/tests/SysY2022/performance/conv0.riscv.s index e74ffefcd..028c63550 100644 --- a/tests/SysY2022/performance/conv0.riscv.s +++ b/tests/SysY2022/performance/conv0.riscv.s @@ -1,204 +1,198 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 8 -__cmmc_jumptable1415: - .word label1390-__cmmc_jumptable1415 - .word label1389-__cmmc_jumptable1415 - .word label1388-__cmmc_jumptable1415 - .word label1387-__cmmc_jumptable1415 - .word label1386-__cmmc_jumptable1415 +.p2align 3 +__cmmc_jumptable1433: + .word label1408-__cmmc_jumptable1433 + .word label1407-__cmmc_jumptable1433 + .word label1406-__cmmc_jumptable1433 + .word label1405-__cmmc_jumptable1433 + .word 
label1404-__cmmc_jumptable1433 .bss -.align 8 +.p2align 3 a: .zero 40000000 -.align 8 +.p2align 3 b: .zero 40000000 -.align 8 +.p2align 3 kernelid: .zero 40000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_4: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_5: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_6: .zero 16 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[64] CalleeSaved[104] - addi sp, sp, -168 + # stack usage: CalleeArg[0] Local[0] RegSpill[48] CalleeSaved[104] + addi sp, sp, -152 sd ra, 0(sp) - sd s0, 8(sp) - sd s5, 16(sp) - sd s1, 24(sp) - sd s6, 32(sp) - sd s2, 40(sp) - sd s4, 48(sp) - sd s7, 56(sp) - sd s8, 64(sp) - sd s11, 72(sp) - sd s3, 80(sp) - sd s9, 88(sp) - sd s10, 96(sp) + sd s2, 8(sp) + sd s0, 16(sp) + sd s5, 24(sp) + sd s1, 32(sp) + sd s6, 40(sp) + sd s3, 48(sp) + sd s4, 56(sp) + sd s7, 64(sp) + sd s9, 72(sp) + sd s8, 80(sp) + sd s10, 88(sp) + sd s11, 96(sp) jal getint - mv s0, a0 + mv s2, a0 jal getint sd a0, 112(sp) mv a1, a0 jal getint -pcrel1546: +pcrel1555: auipc a1, %pcrel_hi(a) sd a0, 104(sp) - addi a2, a1, %pcrel_lo(pcrel1546) + addi a2, a1, %pcrel_lo(pcrel1555) sd a2, 120(sp) mv a0, a2 jal getarray -pcrel1547: +pcrel1556: auipc a1, %pcrel_hi(kernelid) - addi s1, a1, %pcrel_lo(pcrel1547) - mv a0, s1 + addi s0, a1, %pcrel_lo(pcrel1556) + mv a0, s0 jal getarray sd a0, 136(sp) li a0, 109 jal _sysy_starttime -pcrel1548: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_3) - li s9, 5 -pcrel1549: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) - mv s11, zero -pcrel1550: - auipc s10, %pcrel_hi(cmmc_parallel_body_payload_1) - srliw a2, s0, 31 - addi s6, s7, %pcrel_lo(pcrel1548) - addi s4, s5, %pcrel_lo(pcrel1549) ld a0, 104(sp) +pcrel1557: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_5) +pcrel1558: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_4) + srliw a2, s2, 31 +pcrel1559: + auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) + mv s11, zero +pcrel1560: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) + addi s8, s9, %pcrel_lo(pcrel1557) + addi s6, s7, %pcrel_lo(pcrel1558) + addi s4, s5, %pcrel_lo(pcrel1560) ld a1, 112(sp) mulw a1, a1, a0 - add a0, s0, a2 - sraiw s2, a0, 1 + add a0, s2, a2 + addi s2, s3, %pcrel_lo(pcrel1559) + sraiw s1, a0, 1 sd a1, 128(sp) -pcrel1551: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel1552: - auipc a1, %pcrel_hi(__cmmc_jumptable1415) - addi s8, a0, %pcrel_lo(pcrel1551) - addi s3, a1, %pcrel_lo(pcrel1552) -pcrel1553: +pcrel1561: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) - addi a3, a0, %pcrel_lo(pcrel1553) -pcrel1554: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_5) - sd a3, 160(sp) - addi a3, a0, %pcrel_lo(pcrel1554) -pcrel1555: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_6) - sd a3, 152(sp) - addi a2, a0, %pcrel_lo(pcrel1555) -pcrel1556: - auipc a0, %pcrel_hi(cmmc_parallel_body_1) +pcrel1562: + auipc a1, %pcrel_hi(cmmc_parallel_body_payload_6) + addi a2, a0, %pcrel_lo(pcrel1561) + addi s10, a1, %pcrel_lo(pcrel1562) sd a2, 144(sp) - addi s0, a0, %pcrel_lo(pcrel1556) - lw a0, 0(s1) - mv a1, a0 - bltu a0, s9, label1419 + lw a1, 0(s0) + li a3, 5 + mv a0, a1 + bltu a1, a3, label1437 .p2align 2 -label1385: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_6) -pcrel1557: - auipc a5, 
%pcrel_hi(cmmc_parallel_body_6) - sw zero, %pcrel_lo(label1385)(a0) +label1398: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) +pcrel1563: + auipc a5, %pcrel_hi(cmmc_parallel_body_2) + sw zero, %pcrel_lo(label1398)(a0) ld a2, 144(sp) - sw s2, 4(a2) + sw s1, 4(a2) ld a1, 112(sp) ld a0, 104(sp) slli a3, a1, 32 add.uw a4, a0, a3 mv a0, zero sd a4, 8(a2) - addi a2, a5, %pcrel_lo(pcrel1557) + addi a2, a5, %pcrel_lo(pcrel1563) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 -.p2align 2 -label1387: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel1558: - auipc a4, %pcrel_hi(cmmc_parallel_body_4) - sw zero, %pcrel_lo(label1387)(a0) - sw s2, 4(s8) - ld a1, 112(sp) + bgt a1, zero, label1400 + j label1401 +.p2align 2 +label1404: + auipc a1, %pcrel_hi(cmmc_parallel_body_payload_6) + slli a2, s1, 32 +pcrel1564: + auipc a3, %pcrel_hi(cmmc_parallel_body_6) + sw zero, %pcrel_lo(label1404)(a1) ld a0, 104(sp) - slli a2, a1, 32 - add.uw a3, a0, a2 + sw a0, 4(s10) + ld a1, 112(sp) + add.uw a0, a1, a2 + addi a2, a3, %pcrel_lo(pcrel1564) + sd a0, 8(s10) mv a0, zero - addi a2, a4, %pcrel_lo(pcrel1558) - sd a3, 8(s8) jal cmmcParallelFor ld a1, 128(sp) - ble a1, zero, label1381 + ble a1, zero, label1401 .p2align 2 -label1384: +label1400: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_1) +pcrel1565: + auipc a3, %pcrel_hi(cmmc_parallel_body_1) ld a1, 128(sp) + addi a2, a3, %pcrel_lo(pcrel1565) + sw a1, %pcrel_lo(label1400)(a0) mv a0, zero -pcrel1559: - auipc s10, %pcrel_hi(cmmc_parallel_body_payload_1) - sw a1, %pcrel_lo(pcrel1559)(s10) - mv a2, s0 jal cmmcParallelFor - ld a0, 136(sp) - addiw s11, s11, 1 - ble a0, s11, label1383 .p2align 2 -label1382: - addi s1, s1, 4 - lw a0, 0(s1) - mv a1, a0 - bgeu a0, s9, label1385 -.p2align 2 -label1419: - sh2add a3, a1, s3 - lw a2, 0(a3) - add a0, s3, a2 - jr a0 -.p2align 2 -label1390: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) - sw zero, %pcrel_lo(label1390)(s5) - slli a2, s2, 32 -pcrel1560: +label1401: + addiw s11, s11, 1 + ld a0, 136(sp) + ble a0, s11, label1403 + addi s0, s0, 4 + li a3, 5 + lw a1, 0(s0) + mv a0, a1 + bgeu a1, a3, label1398 +.p2align 2 +label1437: + auipc a4, %pcrel_hi(__cmmc_jumptable1433) + addi a2, a4, %pcrel_lo(label1437) + sh2add a1, a0, a2 + lw a3, 0(a1) + add a4, a2, a3 + jr a4 +.p2align 2 +label1408: + auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) + sw zero, %pcrel_lo(label1408)(s3) +pcrel1566: auipc a3, %pcrel_hi(cmmc_parallel_body_0) ld a0, 104(sp) - sw a0, 4(s4) + sw a0, 4(s2) ld a1, 112(sp) - add.uw a0, a1, a2 - addi a2, a3, %pcrel_lo(pcrel1560) - sd a0, 8(s4) + slli a2, a1, 32 + add.uw a0, s1, a2 + addi a2, a3, %pcrel_lo(pcrel1566) + sd a0, 8(s2) mv a0, zero jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 -label1383: + bgt a1, zero, label1400 + j label1401 +label1403: li a0, 116 jal _sysy_stoptime ld a2, 120(sp) @@ -206,93 +200,87 @@ label1383: mv a0, a1 mv a1, a2 jal putarray - mv a0, zero ld ra, 0(sp) - ld s0, 8(sp) - ld s5, 16(sp) - ld s1, 24(sp) - ld s6, 32(sp) - ld s2, 40(sp) - ld s4, 48(sp) - ld s7, 56(sp) - ld s8, 64(sp) - ld s11, 72(sp) - ld s3, 80(sp) - ld s9, 88(sp) - ld s10, 96(sp) - addi sp, sp, 168 + mv a0, zero + ld s2, 8(sp) + ld s0, 16(sp) + ld s5, 24(sp) + ld s1, 32(sp) + ld s6, 40(sp) + ld s3, 48(sp) + ld s4, 56(sp) + ld s7, 64(sp) + ld s9, 72(sp) + ld s8, 80(sp) + ld s10, 88(sp) + ld s11, 96(sp) + addi sp, sp, 152 ret .p2align 2 -label1389: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_3) - sw zero, %pcrel_lo(label1389)(s7) - slli a2, s2, 32 
-pcrel1561: +label1407: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) + sw zero, %pcrel_lo(label1407)(s5) +pcrel1567: auipc a4, %pcrel_hi(cmmc_parallel_body_3) - ld a1, 112(sp) - sw a1, 4(s6) + sw s1, 4(s4) ld a0, 104(sp) - add.uw a3, a0, a2 + ld a1, 112(sp) + slli a2, a0, 32 mv a0, zero - addi a2, a4, %pcrel_lo(pcrel1561) - sd a3, 8(s6) + add.uw a3, a1, a2 + addi a2, a4, %pcrel_lo(pcrel1567) + sd a3, 8(s4) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 -label1381: - addiw s11, s11, 1 - ld a0, 136(sp) - bgt a0, s11, label1382 - j label1383 + bgt a1, zero, label1400 + j label1401 .p2align 2 -label1386: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel1562: - auipc a5, %pcrel_hi(cmmc_parallel_body_2) - sw zero, %pcrel_lo(label1386)(a0) - ld a3, 160(sp) - sw s2, 4(a3) - ld a0, 104(sp) +label1405: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_4) + sw zero, %pcrel_lo(label1405)(s7) +pcrel1568: + auipc a4, %pcrel_hi(cmmc_parallel_body_4) + sw s1, 4(s6) ld a1, 112(sp) - slli a2, a0, 32 + ld a0, 104(sp) + slli a2, a1, 32 + add.uw a3, a0, a2 mv a0, zero - add.uw a4, a1, a2 - addi a2, a5, %pcrel_lo(pcrel1562) - sd a4, 8(a3) + addi a2, a4, %pcrel_lo(pcrel1568) + sd a3, 8(s6) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 -.p2align 2 -label1388: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_5) -pcrel1563: - auipc a5, %pcrel_hi(cmmc_parallel_body_5) - sw zero, %pcrel_lo(label1388)(a0) - ld a1, 112(sp) - ld a3, 152(sp) - sw a1, 4(a3) + bgt a1, zero, label1400 + j label1401 +.p2align 2 +label1406: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_5) + sw zero, %pcrel_lo(label1406)(s9) +pcrel1569: + auipc a4, %pcrel_hi(cmmc_parallel_body_5) + sw s1, 4(s8) ld a0, 104(sp) + ld a1, 112(sp) slli a2, a0, 32 mv a0, zero - add.uw a4, s2, a2 - addi a2, a5, %pcrel_lo(pcrel1563) - sd a4, 8(a3) + add.uw a3, a1, a2 + addi a2, a4, %pcrel_lo(pcrel1569) + sd a3, 8(s8) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 + bgt a1, zero, label1400 + j label1401 .p2align 2 cmmc_parallel_body_0: addi sp, sp, -80 mv t1, a1 -pcrel168: +pcrel172: auipc a5, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel169: +pcrel173: auipc t5, %pcrel_hi(b) sd s0, 0(sp) - addi t0, a5, %pcrel_lo(pcrel168) - addi t4, t5, %pcrel_lo(pcrel169) + addi t0, a5, %pcrel_lo(pcrel172) + addi t4, t5, %pcrel_lo(pcrel173) sd s5, 8(sp) sd s4, 16(sp) sd s3, 24(sp) @@ -300,18 +288,18 @@ pcrel169: sd s6, 40(sp) sd s2, 48(sp) sd s7, 56(sp) - sd s9, 64(sp) - sd s8, 72(sp) + sd s8, 64(sp) + sd s9, 72(sp) lw a2, 4(t0) - lw a4, 8(t0) - lw a3, 12(t0) + lw a3, 8(t0) + lw a4, 12(t0) mulw a1, a0, a2 - lw t3, %pcrel_lo(pcrel168)(a5) + lw t3, %pcrel_lo(pcrel172)(a5) addw t2, a1, t3 -pcrel170: +pcrel174: auipc a1, %pcrel_hi(a) sh2add t0, t2, t4 - addi a5, a1, %pcrel_lo(pcrel170) + addi a5, a1, %pcrel_lo(pcrel174) mv t2, a0 lui a1, 786432 lui a0, 262144 @@ -325,9 +313,39 @@ pcrel170: mv s4, zero j label8 .p2align 2 -label94: +label150: + addiw s3, s3, 1 + ble a7, s3, label154 +.p2align 2 +label28: + addi s2, s2, 4 + or s7, s0, s3 + srliw s8, s7, 31 + slt s7, s3, a2 + andi s6, s8, 1 + xori s8, s7, 1 + or s5, s1, s6 + or s9, s5, s8 + beq s9, zero, label148 +.p2align 2 +label67: + mv s6, zero + sext.w s5, s4 + ble s5, a0, label146 +.p2align 2 +label31: + addw s5, s5, a1 + bgt s5, a0, label31 + mv s4, s5 + bge s5, zero, label150 +.p2align 2 +label29: + addw s4, s4, a0 + blt s4, zero, label29 + addiw s3, s3, 1 + bgt a7, s3, label28 addiw s0, s0, 1 - ble t4, s0, label149 + ble t4, s0, label153 
.p2align 2 label8: slt s3, s0, a4 @@ -335,88 +353,58 @@ label8: xori s1, s3, 1 mv s3, a6 or s7, s0, a6 - slt s8, a6, a2 - srliw s9, s7, 31 - xori s7, s8, 1 - andi s6, s9, 1 + srliw s8, s7, 31 + slt s7, a6, a2 + andi s6, s8, 1 + xori s8, s7, 1 or s5, s1, s6 - or s6, s5, s7 - bne s6, zero, label67 - mulw s7, a2, s0 - sh2add s6, s7, s2 - lw s5, 0(s6) - addw s4, s4, s5 - bgt s4, a0, label18 - j label75 -label146: - blt s4, zero, label22 -.p2align 2 -label84: - addiw s3, s3, 1 - bgt a7, s3, label26 - addiw s0, s0, 1 - bgt t4, s0, label8 + or s9, s5, s8 + bne s9, zero, label67 + mulw s5, a2, s0 + sh2add s7, s5, s2 + lw s6, 0(s7) + addw s5, s4, s6 + bgt s5, a0, label31 +label151: + mv s4, s5 + blt s5, zero, label29 + j label20 .p2align 2 -label149: +label153: addiw t6, t6, 1 sw s4, 0(t5) - ble a2, t6, label154 + ble a2, t6, label157 +.p2align 2 +label24: addi t5, t5, 4 subw a6, t6, a3 addw a7, a3, t6 mv s0, t3 mv s4, zero slt s3, t3, a4 - slt s8, a6, a2 or s7, t3, a6 sh2add s2, a6, a5 xori s1, s3, 1 - srliw s9, s7, 31 + srliw s8, s7, 31 mv s3, a6 - xori s7, s8, 1 - andi s6, s9, 1 + slt s7, a6, a2 + andi s6, s8, 1 + xori s8, s7, 1 or s5, s1, s6 - or s6, s5, s7 - beq s6, zero, label157 -.p2align 2 -label67: - mv s5, zero - sext.w s4, s4 - ble s4, a0, label146 -.p2align 2 -label18: - addw s4, s4, a1 - bgt s4, a0, label18 - bge s4, zero, label84 -.p2align 2 -label22: - addw s4, s4, a0 - blt s4, zero, label22 - addiw s3, s3, 1 - ble a7, s3, label94 -.p2align 2 -label26: - addi s2, s2, 4 - or s7, s0, s3 - slt s8, s3, a2 - srliw s9, s7, 31 - xori s7, s8, 1 - andi s6, s9, 1 - or s5, s1, s6 - or s6, s5, s7 - bne s6, zero, label67 - mulw s7, a2, s0 - sh2add s6, s7, s2 - lw s5, 0(s6) - addw s4, s4, s5 - bgt s4, a0, label18 -label75: - blt s4, zero, label22 - j label84 + or s9, s5, s8 + bne s9, zero, label67 + mulw s5, a2, t3 + sh2add s7, s5, s2 + lw s6, 0(s7) + mv s5, s6 + bgt s6, a0, label31 + j label151 .p2align 2 -label154: +label157: addiw t2, t2, 1 - ble t1, t2, label30 + ble t1, t2, label26 +.p2align 2 +label27: sh2add t0, a2, t0 subw t3, t2, a3 addw t4, a3, t2 @@ -427,24 +415,43 @@ label154: mv t5, t0 slt s3, t3, a4 mv s0, t3 - slt s8, a6, a2 or s7, t3, a6 sh2add s2, a6, a5 xori s1, s3, 1 - srliw s9, s7, 31 + srliw s8, s7, 31 mv s3, a6 - xori s7, s8, 1 - andi s6, s9, 1 + slt s7, a6, a2 + andi s6, s8, 1 + xori s8, s7, 1 or s5, s1, s6 - or s6, s5, s7 - bne s6, zero, label67 - mulw s7, a2, t3 - sh2add s6, s7, s2 - lw s5, 0(s6) + or s9, s5, s8 + bne s9, zero, label67 + mulw s5, a2, t3 + sh2add s7, s5, s2 + lw s6, 0(s7) + mv s5, s6 + bgt s6, a0, label31 + j label151 +.p2align 2 +label148: + mulw s5, a2, s0 + sh2add s7, s5, s2 + lw s6, 0(s7) + addw s5, s4, s6 + bgt s5, a0, label31 mv s4, s5 - bgt s5, a0, label18 - j label75 -label30: + blt s5, zero, label29 + j label20 +label154: + addiw s0, s0, 1 + bgt t4, s0, label8 +label23: + addiw t6, t6, 1 + sw s4, 0(t5) + bgt a2, t6, label24 + addiw t2, t2, 1 + bgt t1, t2, label27 +label26: ld s0, 0(sp) ld s5, 8(sp) ld s4, 16(sp) @@ -453,40 +460,42 @@ label30: ld s6, 40(sp) ld s2, 48(sp) ld s7, 56(sp) - ld s9, 64(sp) - ld s8, 72(sp) + ld s8, 64(sp) + ld s9, 72(sp) addi sp, sp, 80 ret .p2align 2 -label157: - mulw s7, a2, s0 - sh2add s6, s7, s2 - lw s5, 0(s6) - addw s4, s4, s5 - bgt s4, a0, label18 - j label75 +label146: + mv s4, s5 + blt s5, zero, label29 +label20: + addiw s3, s3, 1 + bgt a7, s3, label28 + addiw s0, s0, 1 + bgt t4, s0, label8 + j label23 .p2align 2 cmmc_parallel_body_1: mv t0, a0 addiw a4, a0, 3 -pcrel329: +pcrel333: auipc a5, %pcrel_hi(b) -pcrel330: 
+pcrel334: auipc a0, %pcrel_hi(a) - addi a3, a5, %pcrel_lo(pcrel329) - addi a2, a0, %pcrel_lo(pcrel330) - ble a1, a4, label186 + addi a3, a5, %pcrel_lo(pcrel333) + addi a2, a0, %pcrel_lo(pcrel334) + ble a1, a4, label190 addiw t1, t0, 15 addiw a4, a1, -3 addiw a5, a1, -18 - bge t1, a4, label209 + bge t1, a4, label213 sh2add a0, t0, a2 - j label182 + j label186 .p2align 2 -label185: +label189: addi a0, a0, 64 .p2align 2 -label182: +label186: sh2add t1, t0, a3 addiw t0, t0, 16 ld t3, 0(t1) @@ -505,345 +514,445 @@ label182: sd t2, 48(a0) ld t3, 56(t1) sd t3, 56(a0) - bgt a5, t0, label185 + bgt a5, t0, label189 mv a0, t0 -label173: - ble a4, a0, label186 +label177: + ble a4, a0, label190 sh2add a5, a0, a3 - j label177 -label180: + j label181 +label184: addi a5, a5, 16 -label177: +label181: sh2add t0, a0, a2 ld t2, 0(a5) addiw a0, a0, 4 sd t2, 0(t0) ld t1, 8(a5) sd t1, 8(t0) - bgt a4, a0, label180 + bgt a4, a0, label184 mv t0, a0 -label186: - ble a1, t0, label188 +label190: + ble a1, t0, label197 sh2add a0, t0, a3 - j label190 -label193: + j label193 +label196: addi a0, a0, 4 -label190: +label193: sh2add a3, t0, a2 lw a4, 0(a0) addiw t0, t0, 1 sw a4, 0(a3) - bgt a1, t0, label193 -label188: + bgt a1, t0, label196 +label197: ret -label209: +label213: mv a0, t0 mv t0, zero - j label173 + j label177 .p2align 2 cmmc_parallel_body_2: - addi sp, sp, -96 - mv t3, a1 -pcrel515: + # stack usage: CalleeArg[0] Local[0] RegSpill[40] CalleeSaved[96] + addi sp, sp, -136 +pcrel722: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel516: - auipc t5, %pcrel_hi(a) +pcrel723: + auipc a5, %pcrel_hi(b) mv t4, a0 - addi a3, a2, %pcrel_lo(pcrel515) - sd s0, 0(sp) - sd s5, 8(sp) - sd s1, 16(sp) + sd s4, 0(sp) + sd s0, 8(sp) + sd s5, 16(sp) sd s6, 24(sp) - sd s2, 32(sp) - sd s3, 40(sp) - sd s4, 48(sp) - sd s7, 56(sp) - sd s8, 64(sp) - sd s9, 72(sp) - sd s10, 80(sp) + sd s1, 32(sp) + sd s10, 40(sp) + sd s9, 48(sp) + sd s2, 56(sp) + sd s3, 64(sp) + sd s7, 72(sp) + sd s8, 80(sp) sd s11, 88(sp) - lw a5, 4(a3) - lw t0, 8(a3) - lw a4, 12(a3) -pcrel517: - auipc a3, %pcrel_hi(b) - lw t2, %pcrel_lo(pcrel515)(a2) - mulw t1, a0, a4 - addi a2, a3, %pcrel_lo(pcrel517) - addw a1, t1, t2 - lui a3, 524288 - addi t1, t5, %pcrel_lo(pcrel516) - sh2add t2, a1, a2 - addiw a0, a3, 1 - li a1, 1 - lui a2, 262144 - subw t5, t4, a5 - addw t6, a5, t4 - mv a6, t2 + sd a1, 96(sp) + addi a1, a2, %pcrel_lo(pcrel722) + lw s4, 4(a1) + addi s6, s4, -336 + addi s10, s4, -1359 + addi t0, s4, -18 + addi t1, s4, -81 + sd s4, 104(sp) + lw s0, 8(a1) + mulw a3, a0, s0 + sd s0, 112(sp) + lw a4, %pcrel_lo(pcrel722)(a2) + addi a2, a5, %pcrel_lo(pcrel723) + addw a1, a3, a4 + sd s6, 128(sp) + addi a5, s4, -3 + lui a3, 1048571 + sh2add a4, a1, a2 + sd s10, 120(sp) + lui a2, 1048575 + addiw a1, a2, -1358 + addiw a2, a3, -1357 + addw t2, s4, a1 + lui a3, 32 + addw t3, s4, a2 + lui a1, 16 + addiw a0, a3, -1 + lui a2, 1 + lui a3, 4 + mv a6, a4 mv a7, zero - subw s0, zero, a5 - mv s1, a5 - mv s2, t5 - mv a3, zero - slt s5, t5, t0 - sh2add s4, s0, t1 - xori s3, s5, 1 - mv s5, s0 - or s7, t5, s0 - slt s10, s0, a4 - srliw s8, s7, 31 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, t5 - divw s9, zero, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 -.p2align 2 -label405: - divw s11, s6, s7 - and s10, s11, 
a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, s7, 1 - sh1add s8, s8, s10 - mv s7, s9 - bge s9, a2, label485 + addw t6, s4, t4 + subw t5, t4, s4 + j label339 .p2align 2 -label348: - divw s9, a3, s7 - and s11, s9, a0 - beq s11, a1, label405 -.p2align 2 -label406: - mv s10, zero - sh1add s8, s8, zero - slliw s9, s7, 1 - mv s7, s9 - blt s9, a2, label348 - addiw s5, s5, 1 - bgt s1, s5, label354 +label639: addiw s2, s2, 1 - ble t6, s2, label479 + ble t6, s2, label662 .p2align 2 -label425: - mv a3, s8 - slt s5, s2, t0 - sh2add s4, s0, t1 - or s7, s2, s0 - slt s10, s0, a4 - xori s3, s5, 1 - srliw s8, s7, 31 - mv s5, s0 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, s2 - divw s9, a3, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 +label409: + blt s0, s1, label586 + addiw s2, s2, 1 + bgt t6, s2, label409 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + ble s0, a7, label676 .p2align 2 -label485: - addiw s5, s5, 1 - ble s1, s5, label491 +label343: + addi a6, a6, 4 .p2align 2 -label354: - addi s4, s4, 4 - mv a3, s8 - or s7, s2, s5 - slt s10, s5, a4 - srliw s8, s7, 31 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - beq s7, zero, label480 +label339: + ld s4, 104(sp) + addw s2, a5, a7 + addw s3, t0, a7 + addw s8, t2, a7 + ld s6, 128(sp) + addw s1, s4, a7 + subw s0, a7, s4 + addw s5, s6, a7 + ld s10, 120(sp) + addw s4, t1, a7 + addiw s9, s0, 3 + addw s6, t3, a7 + addw s7, s10, a7 + blt s9, s1, label451 + mv s2, t5 + bge s0, s1, label639 .p2align 2 -label396: - mv s6, zero - mv s7, a1 - mv s8, zero - divw s9, a3, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, zero, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 +label586: + mv s3, s0 .p2align 2 -label491: +label411: + addiw s3, s3, 1 + bgt s1, s3, label411 addiw s2, s2, 1 - bgt t6, s2, label425 + bgt t6, s2, label409 addiw a7, a7, 1 - sw s8, 0(a6) - ble a4, a7, label502 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + addiw t4, t4, 1 + ld t5, 96(sp) + ble t5, t4, label345 +.p2align 2 +label346: + ld s0, 112(sp) + mv a7, zero + ld s4, 104(sp) + sh2add a4, s0, a4 + addw t6, s4, t4 + subw t5, t4, s4 + mv a6, a4 + j label339 +.p2align 2 +label451: + mv s9, t5 + addiw s11, s0, 15 + ble s2, s11, label640 +.p2align 2 +label359: + addiw s11, s0, 63 + ble s3, s11, label496 + addiw s11, s0, 255 + ble s4, s11, label501 + addiw s11, s0, 1023 + ble s5, s11, label506 + lui s11, 1 + addiw s10, s11, -1 + addw s11, s0, s10 + ble s7, s11, label511 + lui s10, 4 + addiw s11, s10, -1 + addw s10, s0, s11 + ble s8, s10, label525 + lui s11, 16 + addiw s10, s11, -1 + addw s11, s0, s10 + ble s6, s11, label530 + addw s11, s0, a0 + addw s10, s0, a1 + ble s6, s11, label651 +.p2align 2 +label379: + addw s11, s10, a0 + addw s10, s10, a1 + bgt s6, s11, label379 + mv s11, s10 + ble s6, s10, label655 +.p2align 2 +label535: + addw s10, s11, a3 + ble s6, s10, label652 +.p2align 2 +label377: + addw s10, s10, a3 + bgt s6, s10, label377 + mv s11, s10 + ble s8, s10, label654 +.p2align 2 +label384: + addw s10, s10, a2 + bgt s8, s10, label384 + mv s11, s10 + ble s7, s10, label656 +.p2align 2 +label366: + addiw s10, s10, 1024 + bgt s7, s10, label366 + mv s11, s10 + ble s5, s10, label648 +.p2align 2 
+label393: + addiw s10, s10, 256 + bgt s5, s10, label393 + mv s11, s10 + ble s4, s10, label567 +.p2align 2 +label398: + addiw s10, s10, 64 + bgt s4, s10, label398 + mv s11, s10 +.p2align 2 +label646: + mv s10, s11 + ble s3, s11, label576 +.p2align 2 +label405: + addiw s10, s10, 16 + bgt s3, s10, label405 + mv s11, s10 + ble s2, s10, label659 .p2align 2 label357: - addi a6, a6, 4 - subw s0, a7, a5 - addw s1, a5, a7 - mv s2, t5 - mv a3, zero - slt s5, t5, t0 - slt s10, s0, a4 - or s7, t5, s0 - sh2add s4, s0, t1 - xori s3, s5, 1 - srliw s8, s7, 31 - mv s5, s0 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, t5 - divw s9, zero, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 + addiw s10, s10, 4 + bgt s2, s10, label357 + ble s1, s10, label644 .p2align 2 -label480: - mulw s8, a4, s2 - divw s9, a3, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 +label354: + addiw s10, s10, 1 + bgt s1, s10, label354 + addiw s9, s9, 1 + ble t6, s9, label486 +.p2align 2 +label347: + addiw s11, s0, 15 + bgt s2, s11, label359 + mv s10, s0 + mv s11, zero +.p2align 2 +label349: + bgt s2, s10, label357 +label679: + mv s10, s11 + bgt s1, s11, label354 + j label476 .p2align 2 -label479: +label486: addiw a7, a7, 1 - sw s8, 0(a6) - bgt a4, a7, label357 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 addiw t4, t4, 1 - ble t3, t4, label359 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 .p2align 2 -label360: - sh2add t2, a4, t2 - subw t5, t4, a5 - addw t6, a5, t4 - mv a7, zero - subw s0, zero, a5 - sext.w s1, a5 - mv a3, zero - mv a6, t2 - slt s5, t5, t0 - mv s2, t5 - slt s10, s0, a4 - or s7, t5, s0 - sh2add s4, s0, t1 - xori s3, s5, 1 - srliw s8, s7, 31 - mv s5, s0 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, t5 - divw s9, zero, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 -label359: - ld s0, 0(sp) - ld s5, 8(sp) - ld s1, 16(sp) +label567: + mv s10, s11 + bgt s3, s11, label405 + bgt s2, s11, label357 + j label679 +.p2align 2 +label644: + addiw s9, s9, 1 + bgt t6, s9, label347 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + addiw t4, t4, 1 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label511: + mv s10, s0 + mv s11, zero +.p2align 2 +label363: + bgt s7, s10, label366 +.p2align 2 +label515: + mv s10, s11 + bgt s5, s11, label393 + j label391 +.p2align 2 +label648: + mv s10, s11 + bgt s4, s11, label398 + bgt s3, s11, label405 +.p2align 2 +label576: + mv s10, s11 + bgt s2, s11, label357 + bgt s1, s11, label354 +label476: + addiw s9, s9, 1 + bgt t6, s9, label347 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + j label663 +.p2align 2 +label659: + mv s10, s11 + bgt s1, s11, label354 + addiw s9, s9, 1 + bgt t6, s9, label347 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 
+label663: + addiw t4, t4, 1 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label651: + mv s11, s10 + bgt s6, s10, label535 +.p2align 2 +label655: + mv s11, s10 + bgt s8, s10, label384 + bgt s7, s10, label366 + j label515 +.p2align 2 +label656: + mv s10, s11 + bgt s5, s11, label393 +.p2align 2 +label391: + mv s10, s11 + bgt s4, s11, label398 + bgt s3, s11, label405 + j label576 +.p2align 2 +label652: + mv s11, s10 + bgt s8, s10, label384 + bgt s7, s10, label366 + j label515 +.p2align 2 +label662: + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + addiw t4, t4, 1 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label654: + mv s10, s11 + bgt s7, s11, label366 + bgt s5, s11, label393 + j label391 +.p2align 2 +label496: + mv s10, s0 + mv s11, zero + bgt s3, s0, label405 + mv s10, zero + j label349 +.p2align 2 +label506: + mv s10, s0 + mv s11, zero + bgt s5, s0, label393 + j label391 +.p2align 2 +label525: + mv s10, s0 + mv s11, zero + bgt s8, s0, label384 + mv s10, zero + j label363 +.p2align 2 +label530: + mv s11, s0 + mv s10, zero + bgt s6, s0, label535 + mv s11, zero + bgt s8, zero, label384 + bgt s7, zero, label366 + j label515 +label345: + ld s4, 0(sp) + ld s0, 8(sp) + ld s5, 16(sp) ld s6, 24(sp) - ld s2, 32(sp) - ld s3, 40(sp) - ld s4, 48(sp) - ld s7, 56(sp) - ld s8, 64(sp) - ld s9, 72(sp) - ld s10, 80(sp) + ld s1, 32(sp) + ld s10, 40(sp) + ld s9, 48(sp) + ld s2, 56(sp) + ld s3, 64(sp) + ld s7, 72(sp) + ld s8, 80(sp) ld s11, 88(sp) - addi sp, sp, 96 + addi sp, sp, 136 ret .p2align 2 -label502: +label676: addiw t4, t4, 1 - bgt t3, t4, label360 - j label359 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label501: + mv s10, s0 + mv s11, zero + bgt s4, s0, label398 + j label646 +label640: + mv s10, s0 + mv s11, zero + bgt s2, s0, label357 + j label679 .p2align 2 cmmc_parallel_body_3: addi sp, sp, -96 mv t1, a1 -pcrel673: +pcrel883: auipc a5, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel674: +pcrel884: auipc t5, %pcrel_hi(b) sd s0, 0(sp) - addi t0, a5, %pcrel_lo(pcrel673) - addi t3, t5, %pcrel_lo(pcrel674) + addi t0, a5, %pcrel_lo(pcrel883) + addi t3, t5, %pcrel_lo(pcrel884) sd s5, 8(sp) sd s2, 16(sp) sd s1, 24(sp) @@ -853,20 +962,20 @@ pcrel674: sd s7, 56(sp) sd s8, 64(sp) sd s9, 72(sp) - sd s10, 80(sp) - sd s11, 88(sp) - lw a4, 4(t0) - lw a2, 8(t0) - lw a3, 12(t0) - lw t4, %pcrel_lo(pcrel673)(a5) + sd s11, 80(sp) + sd s10, 88(sp) + lw a3, 4(t0) + lw a4, 8(t0) + lw a2, 12(t0) + lw t4, %pcrel_lo(pcrel883)(a5) mulw t2, a0, a2 addw a1, t2, t4 mv t2, a0 -pcrel675: +pcrel885: auipc t4, %pcrel_hi(a) sh2add t0, a1, t3 lui a0, 262144 - addi a5, t4, %pcrel_lo(pcrel675) + addi a5, t4, %pcrel_lo(pcrel885) subw t3, t2, a3 addw t4, a3, t2 mv t5, t0 @@ -886,50 +995,56 @@ pcrel675: xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - bne s6, zero, label580 + bne s6, zero, label786 + j label785 +.p2align 2 +label745: + addi s3, s3, 4 + mv a1, s6 + or s5, s0, s4 + srliw s7, s5, 31 + slt s5, s4, a2 + andi s6, s7, 1 + xori s7, s5, 1 + or s2, s1, s6 + or s6, s2, s7 + bne s6, zero, label786 .p2align 2 -label579: +label785: mulw s6, a2, s0 sh2add s5, s6, s3 mv s6, zero lw s2, 0(s5) li s5, 1 .p2align 2 -label535: +label741: divw s8, a1, s5 srliw s9, s8, 31 - add s10, s8, s9 + add s11, s8, s9 divw s9, s2, s5 - andi s11, s10, -2 - subw s7, s8, s11 - srliw s10, s9, 31 - add s8, s9, s10 - andi s11, s8, -2 - subw s10, s9, s11 - xor s11, s7, s10 - slliw s7, s6, 1 - sltiu s8, s11, 1 - addi s9, s7, 1 + andi s10, s11, -2 + subw s7, s8, s10 + srliw s11, 
s9, 31 + add s10, s9, s11 + slliw s11, s6, 1 + andi s8, s10, -2 + subw s10, s9, s8 + xor s9, s7, s10 slliw s7, s5, 1 - subw s6, s9, s8 + sltiu s8, s9, 1 mv s5, s7 - blt s7, a0, label535 + addi s9, s11, 1 + subw s6, s9, s8 + blt s7, a0, label741 addiw s4, s4, 1 - bgt a7, s4, label545 + bgt a7, s4, label745 addiw s0, s0, 1 - bgt t4, s0, label608 - addiw t6, t6, 1 - sw s6, 0(t5) - ble a2, t6, label662 - addi t5, t5, 4 - subw a6, t6, a3 - addw a7, a3, t6 - mv s0, t3 - mv a1, zero - slt s2, t3, a4 - or s5, t3, a6 - mv s4, a6 + ble t4, s0, label868 + mv a1, s6 + slt s2, s0, a4 sh2add s3, a6, a5 + mv s4, a6 + or s5, s0, a6 xori s1, s2, 1 srliw s7, s5, 31 slt s5, a6, a2 @@ -937,46 +1052,40 @@ label535: xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - bne s6, zero, label580 - j label579 + bne s6, zero, label786 + j label785 .p2align 2 -label545: - addi s3, s3, 4 - mv a1, s6 - or s5, s0, s4 +label868: + addiw t6, t6, 1 + sw s6, 0(t5) + ble a2, t6, label872 + addi t5, t5, 4 + subw a6, t6, a3 + addw a7, a3, t6 + mv s0, t3 + mv a1, zero + slt s2, t3, a4 + mv s4, a6 + or s5, t3, a6 + sh2add s3, a6, a5 + xori s1, s2, 1 srliw s7, s5, 31 - slt s5, s4, a2 + slt s5, a6, a2 andi s6, s7, 1 xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - beq s6, zero, label579 + beq s6, zero, label785 .p2align 2 -label580: +label786: mv s2, zero li s5, 1 mv s6, zero - j label535 + j label741 .p2align 2 -label608: - mv a1, s6 - slt s2, s0, a4 - sh2add s3, a6, a5 - mv s4, a6 - or s5, s0, a6 - xori s1, s2, 1 - srliw s7, s5, 31 - slt s5, a6, a2 - andi s6, s7, 1 - xori s7, s5, 1 - or s2, s1, s6 - or s6, s2, s7 - bne s6, zero, label580 - j label579 -.p2align 2 -label662: +label872: addiw t2, t2, 1 - ble t1, t2, label543 + ble t1, t2, label750 sh2add t0, a2, t0 subw t3, t2, a3 addw t4, a3, t2 @@ -987,8 +1096,8 @@ label662: mv t5, t0 slt s2, t3, a4 mv s0, t3 - mv s4, a6 or s5, t3, a6 + mv s4, a6 sh2add s3, a6, a5 xori s1, s2, 1 srliw s7, s5, 31 @@ -997,9 +1106,9 @@ label662: xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - bne s6, zero, label580 - j label579 -label543: + bne s6, zero, label786 + j label785 +label750: ld s0, 0(sp) ld s5, 8(sp) ld s2, 16(sp) @@ -1010,18 +1119,18 @@ label543: ld s7, 56(sp) ld s8, 64(sp) ld s9, 72(sp) - ld s10, 80(sp) - ld s11, 88(sp) + ld s11, 80(sp) + ld s10, 88(sp) addi sp, sp, 96 ret .p2align 2 cmmc_parallel_body_4: addi sp, sp, -96 mv t3, a1 -pcrel856: +pcrel1069: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_4) sd s0, 0(sp) - addi a3, a2, %pcrel_lo(pcrel856) + addi a3, a2, %pcrel_lo(pcrel1069) sd s5, 8(sp) sd s1, 16(sp) sd s6, 24(sp) @@ -1036,17 +1145,17 @@ pcrel856: lw a5, 4(a3) lw a4, 8(a3) lw t0, 12(a3) -pcrel857: +pcrel1070: auipc a3, %pcrel_hi(b) - lw t1, %pcrel_lo(pcrel856)(a2) + lw t1, %pcrel_lo(pcrel1069)(a2) mulw a1, a0, a4 - addi a2, a3, %pcrel_lo(pcrel857) + addi a2, a3, %pcrel_lo(pcrel1070) addw t4, a1, t1 lui a3, 524288 -pcrel858: +pcrel1071: auipc a1, %pcrel_hi(a) sh2add t2, t4, a2 - addi t1, a1, %pcrel_lo(pcrel858) + addi t1, a1, %pcrel_lo(pcrel1071) lui a2, 262144 mv t4, a0 addiw a1, a3, 1 @@ -1066,208 +1175,237 @@ pcrel858: or s7, t5, s0 slt s10, s0, a4 srliw s8, s7, 31 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 + or s7, s6, s8 + bne s7, zero, label951 mulw s8, a4, t5 - divw s11, zero, a0 + divw s10, zero, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, 
s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label750: +label916: + addi s4, s4, 4 + mv a3, s8 + or s7, s2, s5 + slt s10, s5, a4 + srliw s8, s7, 31 + andi s9, s8, 1 + xori s8, s10, 1 + or s6, s3, s9 + or s7, s6, s8 + beq s7, zero, label1036 +.p2align 2 +label951: + mv s6, zero + mv s7, a0 + mv s8, zero + divw s10, a3, a0 + and s9, s10, a1 + bne s9, a0, label1037 +.p2align 2 +label961: mv s10, a0 sh1add s8, s8, a0 slliw s9, s7, 1 mv s7, s9 - bge s9, a2, label824 -.p2align 2 -label693: - divw s11, a3, s7 - and s9, s11, a1 - beq s9, a0, label750 -.p2align 2 -label696: - divw s11, s6, s7 - and s10, s11, a1 - xori s9, s10, 1 - sltiu s10, s9, 1 - sh1add s8, s8, s10 + bge s9, a2, label1034 +.p2align 2 +label903: + divw s10, a3, s7 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, s7 + and s10, s9, a1 slliw s9, s7, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 + sltiu s10, s11, 1 + sh1add s8, s8, s10 + blt s9, a2, label903 +.p2align 2 +label1038: addiw s5, s5, 1 - bgt s1, s5, label700 + bgt s1, s5, label916 +.p2align 2 +label1044: addiw s2, s2, 1 - bgt t6, s2, label773 - addiw a7, a7, 1 - sw s8, 0(a6) - bgt a4, a7, label706 - addiw t4, t4, 1 - ble t3, t4, label704 + ble t6, s2, label1051 .p2align 2 -label705: - sh2add t2, a4, t2 - subw t5, t4, a5 - addw t6, a5, t4 - mv a7, zero - subw s0, zero, a5 - sext.w s1, a5 - mv a3, zero - mv a6, t2 - slt s5, t5, t0 - mv s2, t5 - slt s10, s0, a4 - or s7, t5, s0 +label982: + mv a3, s8 + slt s5, s2, t0 sh2add s4, s0, t1 + or s7, s2, s0 + slt s10, s0, a4 xori s3, s5, 1 srliw s8, s7, 31 mv s5, s0 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 - mulw s8, a4, t5 - divw s11, zero, a0 + or s7, s6, s8 + bne s7, zero, label951 + mulw s8, a4, s2 + divw s10, a3, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 - slliw s9, a0, 1 - mv s7, s9 - blt s9, a2, label693 - j label824 -.p2align 2 -label700: - addi s4, s4, 4 - mv a3, s8 - or s7, s2, s5 - slt s10, s5, a4 - srliw s8, s7, 31 - xori s7, s10, 1 - andi s9, s8, 1 - or s6, s3, s9 - or s8, s6, s7 - beq s8, zero, label826 -.p2align 2 -label741: - mv s6, zero - mv s7, a0 - mv s8, zero - divw s11, a3, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label824: +label1034: addiw s5, s5, 1 - bgt s1, s5, label700 + bgt s1, s5, label916 addiw s2, s2, 1 - ble t6, s2, label835 + bgt t6, s2, label982 + addiw a7, a7, 1 + sw s8, 0(a6) + ble a4, a7, label1052 .p2align 2 -label773: - mv a3, s8 - slt s5, s2, t0 - sh2add s4, s0, t1 - or s7, s2, s0 +label915: + addi a6, a6, 4 + subw s0, a7, a5 + addw s1, a5, a7 + mv s2, t5 + mv a3, zero + slt s5, t5, t0 slt s10, s0, a4 + or s7, t5, s0 + sh2add s4, s0, t1 xori s3, s5, 1 srliw s8, s7, 31 mv s5, s0 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 - mulw s8, a4, s2 - divw s11, a3, a0 + or s7, s6, s8 + bne s7, zero, label951 + mulw s8, a4, t5 + divw s10, zero, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, 
a0, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label826: +label1036: mulw s8, a4, s2 - divw s11, a3, a0 + divw s10, a3, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label835: +label1051: addiw a7, a7, 1 sw s8, 0(a6) - ble a4, a7, label841 + bgt a4, a7, label915 + addiw t4, t4, 1 + ble t3, t4, label914 .p2align 2 -label706: - addi a6, a6, 4 - subw s0, a7, a5 - addw s1, a5, a7 - mv s2, t5 +label913: + sh2add t2, a4, t2 + subw t5, t4, a5 + addw t6, a5, t4 + mv a7, zero + subw s0, zero, a5 + sext.w s1, a5 mv a3, zero + mv a6, t2 slt s5, t5, t0 + mv s2, t5 slt s10, s0, a4 or s7, t5, s0 sh2add s4, s0, t1 xori s3, s5, 1 srliw s8, s7, 31 mv s5, s0 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 + or s7, s6, s8 + bne s7, zero, label951 mulw s8, a4, t5 - divw s11, zero, a0 + divw s10, zero, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 + mv s7, s9 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 +.p2align 2 +label1037: + divw s9, s6, s7 + and s10, s9, a1 + slliw s9, s7, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 -label704: + sltiu s10, s11, 1 + sh1add s8, s8, s10 + blt s9, a2, label903 + addiw s5, s5, 1 + bgt s1, s5, label916 + j label1044 +.p2align 2 +label1052: + addiw t4, t4, 1 + bgt t3, t4, label913 +label914: ld s0, 0(sp) ld s5, 8(sp) ld s1, 16(sp) @@ -1283,22 +1421,17 @@ label704: addi sp, sp, 96 ret .p2align 2 -label841: - addiw t4, t4, 1 - bgt t3, t4, label705 - j label704 -.p2align 2 cmmc_parallel_body_5: addi sp, sp, -64 mv t0, a0 mv a5, a1 -pcrel995: +pcrel1208: auipc a3, %pcrel_hi(cmmc_parallel_body_payload_5) -pcrel996: +pcrel1209: auipc t5, %pcrel_hi(b) - addi a4, a3, %pcrel_lo(pcrel995) + addi a4, a3, %pcrel_lo(pcrel1208) sd s2, 0(sp) - addi t4, t5, %pcrel_lo(pcrel996) + addi t4, t5, %pcrel_lo(pcrel1209) sd s1, 8(sp) sd s6, 16(sp) sd s0, 24(sp) @@ -1306,16 +1439,16 @@ pcrel996: sd s4, 40(sp) sd s7, 48(sp) sd s3, 56(sp) - lw a2, 4(a4) - lw a1, 8(a4) + lw a1, 4(a4) + lw a2, 8(a4) lw a0, 12(a4) - lw t3, %pcrel_lo(pcrel995)(a3) + lw t3, %pcrel_lo(pcrel1208)(a3) mulw t2, t0, a0 addw t1, t2, t3 -pcrel997: +pcrel1210: auipc t2, %pcrel_hi(a) sh2add a4, t1, t4 - addi a3, t2, %pcrel_lo(pcrel997) + addi a3, t2, %pcrel_lo(pcrel1210) subw t1, t0, a1 addw t2, a1, t0 mv t3, a4 @@ -1324,13 +1457,13 @@ pcrel997: mv t6, a1 mv a6, t1 mv s2, zero - j label866 + j label1079 .p2align 2 -label971: +label1184: addiw t0, t0, 1 - ble a5, t0, label881 + ble a5, t0, label1094 .p2align 2 -label880: +label1093: sh2add a4, a0, a4 subw t1, t0, a1 addw t2, a1, t0 @@ -1351,15 +1484,15 @@ label880: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - bne s4, zero, label917 + bne s4, zero, label1130 mulw s4, a0, t1 addiw s1, t5, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, zero, s3 - ble t6, s1, label984 + ble t6, s1, label1197 .p2align 2 -label875: +label1088: addi s0, s0, 4 or s5, a6, s1 slt s6, s1, a0 @@ -1368,17 +1501,17 @@ 
label875: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - beq s4, zero, label967 + beq s4, zero, label1180 .p2align 2 -label917: +label1130: mv s3, zero addiw s1, s1, 1 max s2, s2, zero - bgt t6, s1, label875 + bgt t6, s1, label1088 addiw a6, a6, 1 - ble t2, a6, label966 + ble t2, a6, label1179 .p2align 2 -label866: +label1079: slt s1, a6, a2 sh2add s0, t5, a3 xori a7, s1, 1 @@ -1390,44 +1523,44 @@ label866: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - bne s4, zero, label917 + bne s4, zero, label1130 mulw s4, a0, a6 addiw s1, t5, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, s2, s3 - bgt t6, s1, label875 + bgt t6, s1, label1088 addiw a6, a6, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 - j label881 + bgt a5, t0, label1093 + j label1094 .p2align 2 -label967: +label1180: mulw s4, a0, a6 addiw s1, s1, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, s2, s3 - bgt t6, s1, label875 + bgt t6, s1, label1088 addiw a6, a6, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 - j label881 + bgt a5, t0, label1093 + j label1094 .p2align 2 -label966: +label1179: addiw t4, t4, 1 sw s2, 0(t3) - ble a0, t4, label971 + ble a0, t4, label1184 .p2align 2 -label878: +label1091: addi t3, t3, 4 subw t5, t4, a1 addw t6, a1, t4 @@ -1444,31 +1577,31 @@ label878: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - bne s4, zero, label917 + bne s4, zero, label1130 mulw s4, a0, t1 addiw s1, t5, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, zero, s3 - bgt t6, s1, label875 + bgt t6, s1, label1088 addiw a6, t1, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 - j label881 + bgt a5, t0, label1093 + j label1094 .p2align 2 -label984: +label1197: addiw a6, a6, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 -label881: + bgt a5, t0, label1093 +label1094: ld s2, 0(sp) ld s1, 8(sp) ld s6, 16(sp) @@ -1481,373 +1614,278 @@ label881: ret .p2align 2 cmmc_parallel_body_6: - # stack usage: CalleeArg[0] Local[0] RegSpill[64] CalleeSaved[96] - addi sp, sp, -160 -pcrel1374: + addi sp, sp, -96 + mv t3, a1 +pcrel1391: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_6) -pcrel1375: - auipc a5, %pcrel_hi(b) - mv t4, a0 - sd s3, 0(sp) - sd s0, 8(sp) - sd s5, 16(sp) - sd s4, 24(sp) - sd s6, 32(sp) - sd s1, 40(sp) - sd s8, 48(sp) - sd s10, 56(sp) - sd s9, 64(sp) - sd s2, 72(sp) - sd s7, 80(sp) + sd s0, 0(sp) + addi a3, a2, %pcrel_lo(pcrel1391) + sd s5, 8(sp) + sd s1, 16(sp) + sd s6, 24(sp) + sd s2, 32(sp) + sd s3, 40(sp) + sd s4, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + sd s9, 72(sp) + sd s10, 80(sp) sd s11, 88(sp) - sd a1, 96(sp) - addi a1, a2, %pcrel_lo(pcrel1374) - lw s3, 4(a1) - addi s10, s3, -1359 - addi s8, s3, -336 - addi s6, s3, -81 - addi s4, s3, -3 - addi s5, s3, -18 - sd s3, 104(sp) - lw s0, 8(a1) - mulw a3, a0, s0 - sd s0, 112(sp) - lw a4, %pcrel_lo(pcrel1374)(a2) - sd s4, 152(sp) - addi a2, a5, %pcrel_lo(pcrel1375) - addw a1, a3, a4 - sd s5, 144(sp) - lui a3, 1048571 - sh2add a4, a1, a2 - sd s6, 136(sp) - lui a1, 1048575 - sd s8, 128(sp) - addiw a2, a1, -1358 - sd s10, 120(sp) - addiw a1, a3, -1357 - addw a5, s3, a2 - lui a3, 32 - addw t0, s3, a1 - lui a2, 1 - addiw a0, a3, -1 - lui a1, 16 - srli t3, a0, 1 - 
srli t2, a0, 3 - srli t1, a0, 5 - lui a3, 4 - mv a6, a4 - mv a7, zero - addw t6, s3, t4 - subw t5, t4, s3 - j label1002 -.p2align 2 -label1221: - addiw s9, s9, 1 - bgt t6, s9, label1005 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - ble s0, a7, label1340 -.p2align 2 -label1071: - addi a6, a6, 4 -.p2align 2 -label1002: - ld s3, 104(sp) - ld s4, 152(sp) - addw s1, s3, a7 - subw s0, a7, s3 - addw s2, s4, a7 - ld s5, 144(sp) - addiw s9, s0, 3 - ld s6, 136(sp) - addw s3, s5, a7 - ld s8, 128(sp) - addw s4, s6, a7 - ld s10, 120(sp) - addw s5, s8, a7 - addw s6, t0, a7 - addw s7, s10, a7 - addw s8, a5, a7 - bge s9, s1, label1113 - mv s9, t5 -.p2align 2 -label1005: - addiw s11, s0, 15 - ble s2, s11, label1118 - addiw s11, s0, 63 - ble s3, s11, label1123 - addiw s11, s0, 255 - ble s4, s11, label1128 - addiw s11, s0, 1023 - ble s5, s11, label1133 - addw s11, s0, t1 - ble s7, s11, label1138 - addw s11, s0, t2 - ble s8, s11, label1143 - addw s10, s0, t3 - ble s6, s10, label1157 - addw s11, s0, a0 - addw s10, s0, a1 - ble s6, s11, label1313 -.p2align 2 -label1027: - addw s11, s10, a0 - addw s10, s10, a1 - bgt s6, s11, label1027 - mv s11, s10 - ble s6, s10, label1315 -.p2align 2 -label1162: - mv s10, s11 -.p2align 2 -label1023: - addw s10, s10, a3 - bgt s6, s10, label1023 - mv s11, s10 - ble s8, s10, label1314 -.p2align 2 -label1015: - addw s10, s10, a2 - bgt s8, s10, label1015 - mv s11, s10 - ble s7, s10, label1311 -.p2align 2 -label1034: - addiw s10, s10, 1024 - bgt s7, s10, label1034 - mv s11, s10 - ble s5, s10, label1039 -.p2align 2 -label1041: - addiw s10, s10, 256 - bgt s5, s10, label1041 - mv s11, s10 - ble s4, s10, label1046 -.p2align 2 -label1048: - addiw s10, s10, 64 - bgt s4, s10, label1048 - mv s11, s10 - ble s3, s10, label1053 -.p2align 2 -label1055: - addiw s10, s10, 16 - bgt s3, s10, label1055 - mv s11, s10 - ble s2, s10, label1212 -.p2align 2 -label1060: - addiw s10, s10, 4 - bgt s2, s10, label1060 - ble s1, s10, label1221 -.p2align 2 -label1064: - addiw s10, s10, 1 - bgt s1, s10, label1064 - addiw s9, s9, 1 - bgt t6, s9, label1005 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - ble t5, t4, label1069 -.p2align 2 -label1070: - ld s0, 112(sp) + lw a4, 4(a3) + lw t0, 8(a3) + lw a5, 12(a3) + mulw a1, a0, a4 +pcrel1392: + auipc a3, %pcrel_hi(b) + lw t1, %pcrel_lo(pcrel1391)(a2) + addi a2, a3, %pcrel_lo(pcrel1392) + addw t4, a1, t1 + lui a3, 524288 +pcrel1393: + auipc a1, %pcrel_hi(a) + sh2add t2, t4, a2 + addi t1, a1, %pcrel_lo(pcrel1393) + lui a2, 262144 + mv t4, a0 + li a1, 1 + addiw a0, a3, 1 + subw t5, t4, a5 + addw t6, a5, t4 + mv a6, t2 mv a7, zero - ld s3, 104(sp) - sh2add a4, s0, a4 - addw t6, s3, t4 - subw t5, t4, s3 - mv a6, a4 - j label1002 -.p2align 2 -label1113: + subw s0, zero, a5 + mv s1, a5 mv s2, t5 - blt s0, s1, label1249 - addiw s2, t5, 1 - ble t6, s2, label1328 -.p2align 2 -label1072: - bge s0, s1, label1248 -.p2align 2 -label1249: - mv s3, s0 -.p2align 2 -label1074: - addiw s3, s3, 1 - bgt s1, s3, label1074 - addiw s2, s2, 1 - bgt t6, s2, label1072 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1212: - mv s10, s11 - bgt s1, s11, label1064 -.p2align 2 -label1324: - addiw s9, s9, 1 - bgt t6, s9, label1005 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1053: - mv s10, s11 - 
bgt s2, s11, label1060 + mv a3, zero + slt s5, t5, t0 + sh2add s4, s0, t1 + xori s3, s5, 1 + mv s5, s0 + or s7, t5, s0 + slt s10, s0, a4 + srliw s8, s7, 31 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, t5 + divw s11, zero, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1322: - mv s10, s11 - bgt s1, s11, label1064 - j label1324 +label1362: + addiw s5, s5, 1 + ble s1, s5, label1366 .p2align 2 -label1046: - mv s10, s11 - bgt s3, s11, label1055 +label1240: + addi s4, s4, 4 + mv a3, s8 + or s7, s2, s5 + slt s10, s5, a4 + srliw s8, s7, 31 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + beq s8, zero, label1361 .p2align 2 -label1320: - mv s10, s11 - bgt s2, s11, label1060 - j label1322 +label1276: + mv s6, zero + mv s7, a1 + mv s8, zero + divw s11, a3, a1 + and s9, s11, a0 + bne s9, a1, label1363 +.p2align 2 +label1241: + divw s10, s6, s7 + and s9, s10, a0 + xori s11, s9, 1 + slliw s9, s7, 1 + sltiu s10, s11, 1 + mv s7, s9 + sh1add s8, s8, s10 + bge s9, a2, label1362 .p2align 2 -label1039: - mv s10, s11 - bgt s4, s11, label1048 +label1228: + divw s11, a3, s7 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + sh1add s8, s8, zero + slliw s9, s7, 1 + mv s7, s9 + blt s9, a2, label1228 .p2align 2 -label1318: - mv s10, s11 - bgt s3, s11, label1055 - j label1320 +label1233: + addiw s5, s5, 1 + bgt s1, s5, label1240 .p2align 2 -label1248: +label1234: addiw s2, s2, 1 - bgt t6, s2, label1072 + bgt t6, s2, label1304 addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 + sw s8, 0(a6) + bgt a4, a7, label1236 addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1143: - mv s10, s0 - mv s11, zero -.p2align 2 -label1012: - bgt s8, s10, label1015 - mv s10, s11 - bgt s7, s11, label1034 - j label1032 + bgt t3, t4, label1239 + j label1238 .p2align 2 -label1313: - mv s11, s10 - bgt s6, s10, label1162 -.p2align 2 -label1315: - mv s11, s10 - bgt s8, s10, label1015 - bgt s7, s10, label1034 - j label1032 +label1361: + mulw s8, a4, s2 + divw s11, a3, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1311: - mv s10, s11 - bgt s5, s11, label1041 - bgt s4, s11, label1048 - j label1318 +label1366: + addiw s2, s2, 1 + ble t6, s2, label1371 .p2align 2 -label1314: - mv s10, s11 - bgt s7, s11, label1034 - bgt s5, s11, label1041 -label1316: - mv s10, s11 - bgt s4, s11, label1048 - j label1318 +label1304: + mv a3, s8 + slt s5, s2, t0 + sh2add s4, s0, t1 + or s7, s2, s0 + slt s10, s0, a4 + xori s3, s5, 1 + srliw s8, s7, 31 + mv s5, s0 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, s2 + divw s11, a3, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1328: +label1371: addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1118: - mv s10, s0 - mv s11, zero - bgt s2, s0, label1060 - j label1212 -.p2align 2 -label1123: - mv s10, s0 - mv s11, zero - bgt s3, s0, 
label1055 - j label1053 + sw s8, 0(a6) + ble a4, a7, label1375 .p2align 2 -label1128: - mv s10, s0 - mv s11, zero - bgt s4, s0, label1048 - j label1046 +label1236: + addi a6, a6, 4 + subw s0, a7, a5 + addw s1, a5, a7 + mv s2, t5 + mv a3, zero + slt s5, t5, t0 + slt s10, s0, a4 + or s7, t5, s0 + sh2add s4, s0, t1 + xori s3, s5, 1 + srliw s8, s7, 31 + mv s5, s0 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, t5 + divw s11, zero, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1133: - mv s10, s0 - mv s11, zero - bgt s5, s0, label1041 - j label1039 +label1375: + addiw t4, t4, 1 + ble t3, t4, label1238 .p2align 2 -label1138: - mv s10, s0 - mv s11, zero - bgt s7, s0, label1034 -label1032: - mv s10, s11 - bgt s5, s11, label1041 - j label1316 +label1239: + sh2add t2, a4, t2 + subw t5, t4, a5 + addw t6, a5, t4 + mv a7, zero + subw s0, zero, a5 + sext.w s1, a5 + mv a3, zero + mv a6, t2 + slt s5, t5, t0 + mv s2, t5 + slt s10, s0, a4 + or s7, t5, s0 + sh2add s4, s0, t1 + xori s3, s5, 1 + srliw s8, s7, 31 + mv s5, s0 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, t5 + divw s11, zero, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1157: - mv s11, s0 +label1363: mv s10, zero - bgt s6, s0, label1162 - mv s11, zero - j label1012 -label1069: - ld s3, 0(sp) - ld s0, 8(sp) - ld s5, 16(sp) - ld s4, 24(sp) - ld s6, 32(sp) - ld s1, 40(sp) - ld s8, 48(sp) - ld s10, 56(sp) - ld s9, 64(sp) - ld s2, 72(sp) - ld s7, 80(sp) + sh1add s8, s8, zero + slliw s9, s7, 1 + mv s7, s9 + blt s9, a2, label1228 + addiw s5, s5, 1 + bgt s1, s5, label1240 + j label1234 +label1238: + ld s0, 0(sp) + ld s5, 8(sp) + ld s1, 16(sp) + ld s6, 24(sp) + ld s2, 32(sp) + ld s3, 40(sp) + ld s4, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + ld s9, 72(sp) + ld s10, 80(sp) ld s11, 88(sp) - addi sp, sp, 160 + addi sp, sp, 96 ret -.p2align 2 -label1340: - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 diff --git a/tests/SysY2022/performance/conv0.sy.ir b/tests/SysY2022/performance/conv0.sy.ir index 4fc305de1..9678ee05d 100644 --- a/tests/SysY2022/performance/conv0.sy.ir +++ b/tests/SysY2022/performance/conv0.sy.ir @@ -38,17 +38,17 @@ func @main() -> i32 { NoRecurse Entry } { i32* %26 = ptradd [16 * i8]* %24, i32 4; i32* %27 = ptradd [16 * i8]* %24, i32 8; i32* %28 = ptradd [16 * i8]* %24, i32 12; - [16 * i8]* %29 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_2 to [16 * i8]*; + [16 * i8]* %29 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_5 to [16 * i8]*; i32* %30 = ptradd [16 * i8]* %29, i32 0; i32* %31 = ptradd [16 * i8]* %29, i32 4; i32* %32 = ptradd [16 * i8]* %29, i32 8; i32* %33 = ptradd [16 * i8]* %29, i32 12; - [16 * i8]* %34 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_5 to [16 * i8]*; + [16 * i8]* %34 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_6 to [16 * i8]*; i32* %35 = ptradd [16 * i8]* %34, i32 0; i32* %36 = ptradd [16 * i8]* %34, i32 4; i32* %37 = ptradd [16 * i8]* %34, i32 8; i32* %38 = ptradd [16 * i8]* %34, i32 12; - [16 * i8]* %39 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_6 to [16 * i8]*; + [16 * i8]* %39 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_2 to [16 * 
i8]*; i32* %40 = ptradd [16 * i8]* %39, i32 0; i32* %41 = ptradd [16 * i8]* %39, i32 4; i32* %42 = ptradd [16 * i8]* %39, i32 8; @@ -57,30 +57,30 @@ func @main() -> i32 { NoRecurse Entry } { i8* %45 = functionptr () -> void @cmmc_parallel_body_3 as i8*; i8* %46 = functionptr () -> void @cmmc_parallel_body_1 as i8*; i8* %47 = functionptr () -> void @cmmc_parallel_body_4 as i8*; - i8* %48 = functionptr () -> void @cmmc_parallel_body_2 as i8*; - i8* %49 = functionptr () -> void @cmmc_parallel_body_5 as i8*; - i8* %50 = functionptr () -> void @cmmc_parallel_body_6 as i8*; + i8* %48 = functionptr () -> void @cmmc_parallel_body_5 as i8*; + i8* %49 = functionptr () -> void @cmmc_parallel_body_6 as i8*; + i8* %50 = functionptr () -> void @cmmc_parallel_body_2 as i8*; ubr ^while.body; ^b: store i32* %13 with i32 0; store i32* %14 with i32 %2; - store i32* %15 with i32 %1; - store i32* %16 with i32 %11; + store i32* %15 with i32 %11; + store i32* %16 with i32 %1; call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %44); ubr ^b6; ^b1: store i32* %18 with i32 0; - store i32* %19 with i32 %1; - store i32* %20 with i32 %2; - store i32* %21 with i32 %11; + store i32* %19 with i32 %11; + store i32* %20 with i32 %1; + store i32* %21 with i32 %2; call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %45); ubr ^b6; ^b2: - store i32* %35 with i32 0; - store i32* %36 with i32 %1; - store i32* %37 with i32 %11; - store i32* %38 with i32 %2; - call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %49); + store i32* %30 with i32 0; + store i32* %31 with i32 %11; + store i32* %32 with i32 %1; + store i32* %33 with i32 %2; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %48); ubr ^b6; ^while.body: i32 %51 = phi [^entry, i32 0] [^b8, i32 %54]; @@ -95,11 +95,11 @@ func @main() -> i32 { NoRecurse Entry } { call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %47); ubr ^b6; ^b4: - store i32* %30 with i32 0; - store i32* %31 with i32 %11; - store i32* %32 with i32 %1; - store i32* %33 with i32 %2; - call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %48); + store i32* %35 with i32 0; + store i32* %36 with i32 %2; + store i32* %37 with i32 %1; + store i32* %38 with i32 %11; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %49); ubr ^b6; ^b5: store i32* %40 with i32 0; @@ -145,19 +145,19 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel ^b1: i32 %17 = phi [^b, i32 %0] [^b7, i32 %60]; i32 %18 = phi [^b, i32 %12] [^b7, i32 %62]; - i32 %19 = sub i32 %17, i32 %9; - i32 %20 = add i32 %9, i32 %17; + i32 %19 = sub i32 %17, i32 %7; + i32 %20 = add i32 %7, i32 %17; i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: i32 %22 = phi [^b1, i32 0] [^b6, i32 %58]; - i32 %23 = sub i32 %22, i32 %9; - i32 %24 = add i32 %9, i32 %22; + i32 %23 = sub i32 %22, i32 %7; + i32 %24 = add i32 %7, i32 %22; ubr ^while.body1; ^while.body1: i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %55]; i32 %26 = phi [^while.body, i32 0] [^b5, i32 %52]; - i1 %27 = icmp sle i32 %7, i32 %25; + i1 %27 = icmp sle i32 %9, i32 %25; ubr ^while.body2; ^while.body2: i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %53]; @@ -352,6 +352,212 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %4 = load i32* %3; i32* %5 = ptradd [16 * i8]* %2, i32 8; i32 %6 = load i32* %5; + i32 %7 = mul i32 %0, i32 %6; + i32* %8 = ptradd [16 * i8]* %2, i32 0; + i32 %9 = load i32* %8; + i32 %10 = add i32 
%7, i32 %9; + i32 %11 = add i32 %4, i32 -3; + i32 %12 = add i32 %4, i32 -18; + i32 %13 = add i32 %4, i32 -81; + i32 %14 = add i32 %4, i32 -336; + i32 %15 = add i32 %4, i32 -1359; + i32 %16 = add i32 %4, i32 -5454; + i32 %17 = add i32 %4, i32 -21837; + [10000000 * i32]* %18 = ptrcast [10000000 * i32]* @b to [10000000 * i32]*; + i32* %19 = getelementptr &([10000000 * i32]* %18)[i64 0][i64 0]; + ubr ^b1; + ^b1: + i32 %20 = phi [^b, i32 %0] [^b7, i32 %79]; + i32 %21 = phi [^b, i32 %10] [^b7, i32 %81]; + i32 %22 = sub i32 %20, i32 %4; + i32 %23 = add i32 %4, i32 %20; + i32* %24 = getelementptr &(i32* %19)[i32 %21]; + ubr ^while.body; + ^while.body: + i32 %25 = phi [^b1, i32 0] [^b5, i32 %63]; + i32 %26 = sub i32 %25, i32 %4; + i32 %27 = add i32 %26, i32 3; + i32 %28 = add i32 %4, i32 %25; + i1 %29 = icmp slt i32 %27, i32 %28; + i32 %30 = add i32 %11, i32 %25; + i32 %31 = add i32 %12, i32 %25; + i32 %32 = add i32 %13, i32 %25; + i32 %33 = add i32 %14, i32 %25; + i32 %34 = add i32 %15, i32 %25; + i32 %35 = add i32 %16, i32 %25; + i32 %36 = add i32 %17, i32 %25; + cbr i1 %29(prob = 0.5), ^b2, ^b3; + ^b2: + i32 %37 = phi [^while.body, i32 %22] [^b6, i32 %77]; + i32 %38 = add i32 %26, i32 15; + i1 %39 = icmp sgt i32 %30, i32 %38; + cbr i1 %39(prob = 0.941176), ^super.header, ^scalar.header; + ^b3: + i32 %40 = phi [^while.body, i32 %22] [^b4, i32 %50]; + i1 %41 = icmp slt i32 %26, i32 %28; + cbr i1 %41(prob = 0.75), ^while.body1, ^b4; + ^super.header: + i32 %42 = add i32 %26, i32 63; + i1 %43 = icmp sgt i32 %31, i32 %42; + cbr i1 %43(prob = 0.941176), ^super.header1, ^scalar.header1; + ^while.body1 {scalar}: + i32 %44 = phi [^b3, i32 %26] [^while.body1, i32 %45]; + i32 %45 = add i32 %44, i32 1; + i1 %46 = icmp sgt i32 %28, i32 %45; + cbr i1 %46(prob = 0.75), ^while.body1, ^b4; + ^scalar.header: + i32 %47 = phi [^b2, i32 %26] [^scalar.final1, i32 %67]; + i32 %48 = phi [^b2, i32 undef] [^scalar.final1, i32 %67]; + i1 %49 = icmp sgt i32 %30, i32 %47; + cbr i1 %49(prob = 0.75), ^while.body2, ^scalar.final; + ^b4: + i32 %50 = add i32 %40, i32 1; + i1 %51 = icmp sgt i32 %23, i32 %50; + cbr i1 %51(prob = 0.5), ^b3, ^b5; + ^super.header1: + i32 %52 = add i32 %26, i32 255; + i1 %53 = icmp sgt i32 %32, i32 %52; + cbr i1 %53(prob = 0.941176), ^super.header2, ^scalar.header2; + ^scalar.header1: + i32 %54 = phi [^super.header, i32 %26] [^scalar.final2, i32 %84]; + i32 %55 = phi [^super.header, i32 undef] [^scalar.final2, i32 %84]; + i1 %56 = icmp sgt i32 %31, i32 %54; + cbr i1 %56(prob = 0.75), ^while.body3, ^scalar.final1; + ^while.body2 {scalar}: + i32 %57 = phi [^scalar.header, i32 %47] [^while.body2, i32 %58]; + i32 %58 = add i32 %57, i32 4; + i1 %59 = icmp sgt i32 %30, i32 %58; + cbr i1 %59(prob = 0.75), ^while.body2, ^scalar.final; + ^scalar.final: + i32 %60 = phi [^scalar.header, i32 %48] [^while.body2, i32 %58]; + i1 %61 = icmp sgt i32 %28, i32 %60; + cbr i1 %61(prob = 0.75), ^while.body4, ^b6; + ^b5: + i32* %62 = getelementptr &(i32* %24)[i32 %25]; + store i32* %62 with i32 0; + i32 %63 = add i32 %25, i32 1; + i1 %64 = icmp sgt i32 %6, i32 %63; + cbr i1 %64(prob = 0.5), ^while.body, ^b7; + ^super.header2: + i32 %65 = add i32 %26, i32 1023; + i1 %66 = icmp sgt i32 %33, i32 %65; + cbr i1 %66(prob = 0.941176), ^super.header3, ^scalar.header3; + ^scalar.final1: + i32 %67 = phi [^scalar.header1, i32 %55] [^while.body3, i32 %72]; + ubr ^scalar.header; + ^scalar.header2: + i32 %68 = phi [^super.header1, i32 %26] [^scalar.final3, i32 %93]; + i32 %69 = phi [^super.header1, i32 undef] [^scalar.final3, i32 %93]; + 
i1 %70 = icmp sgt i32 %32, i32 %68; + cbr i1 %70(prob = 0.75), ^while.body5, ^scalar.final2; + ^while.body3 {scalar}: + i32 %71 = phi [^scalar.header1, i32 %54] [^while.body3, i32 %72]; + i32 %72 = add i32 %71, i32 16; + i1 %73 = icmp sgt i32 %31, i32 %72; + cbr i1 %73(prob = 0.75), ^while.body3, ^scalar.final1; + ^while.body4 {scalar}: + i32 %74 = phi [^scalar.final, i32 %60] [^while.body4, i32 %75]; + i32 %75 = add i32 %74, i32 1; + i1 %76 = icmp sgt i32 %28, i32 %75; + cbr i1 %76(prob = 0.75), ^while.body4, ^b6; + ^b6: + i32 %77 = add i32 %37, i32 1; + i1 %78 = icmp sgt i32 %23, i32 %77; + cbr i1 %78(prob = 0.5), ^b2, ^b5; + ^b7: + i32 %79 = add i32 %20, i32 1; + i1 %80 = icmp sgt i32 %1, i32 %79; + i32 %81 = add i32 %6, i32 %21; + cbr i1 %80(prob = 0.984615), ^b1, ^b8; + ^super.header3: + i32 %82 = add i32 %26, i32 4095; + i1 %83 = icmp sgt i32 %34, i32 %82; + cbr i1 %83(prob = 0.941176), ^super.header4, ^scalar.header4; + ^scalar.final2: + i32 %84 = phi [^scalar.header2, i32 %69] [^while.body5, i32 %89]; + ubr ^scalar.header1; + ^scalar.header3: + i32 %85 = phi [^super.header2, i32 %26] [^scalar.final4, i32 %102]; + i32 %86 = phi [^super.header2, i32 undef] [^scalar.final4, i32 %102]; + i1 %87 = icmp sgt i32 %33, i32 %85; + cbr i1 %87(prob = 0.75), ^while.body6, ^scalar.final3; + ^while.body5 {scalar}: + i32 %88 = phi [^scalar.header2, i32 %68] [^while.body5, i32 %89]; + i32 %89 = add i32 %88, i32 64; + i1 %90 = icmp sgt i32 %32, i32 %89; + cbr i1 %90(prob = 0.75), ^while.body5, ^scalar.final2; + ^b8: + ret; + ^super.header4: + i32 %91 = add i32 %26, i32 16383; + i1 %92 = icmp sgt i32 %35, i32 %91; + cbr i1 %92(prob = 0.941176), ^super.header5, ^scalar.header5; + ^scalar.final3: + i32 %93 = phi [^scalar.header3, i32 %86] [^while.body6, i32 %98]; + ubr ^scalar.header2; + ^scalar.header4: + i32 %94 = phi [^super.header3, i32 %26] [^scalar.final5, i32 %113]; + i32 %95 = phi [^super.header3, i32 undef] [^scalar.final5, i32 %113]; + i1 %96 = icmp sgt i32 %34, i32 %94; + cbr i1 %96(prob = 0.75), ^while.body7, ^scalar.final4; + ^while.body6 {scalar}: + i32 %97 = phi [^scalar.header3, i32 %85] [^while.body6, i32 %98]; + i32 %98 = add i32 %97, i32 256; + i1 %99 = icmp sgt i32 %33, i32 %98; + cbr i1 %99(prob = 0.75), ^while.body6, ^scalar.final3; + ^super.header5: + i32 %100 = add i32 %26, i32 65535; + i1 %101 = icmp sgt i32 %36, i32 %100; + cbr i1 %101(prob = 0.941176), ^while.body8, ^scalar.header6; + ^scalar.final4: + i32 %102 = phi [^scalar.header4, i32 %95] [^while.body7, i32 %107]; + ubr ^scalar.header3; + ^scalar.header5: + i32 %103 = phi [^super.header4, i32 %26] [^scalar.final6, i32 %120]; + i32 %104 = phi [^super.header4, i32 undef] [^scalar.final6, i32 %120]; + i1 %105 = icmp sgt i32 %35, i32 %103; + cbr i1 %105(prob = 0.75), ^while.body9, ^scalar.final5; + ^while.body7 {scalar}: + i32 %106 = phi [^scalar.header4, i32 %94] [^while.body7, i32 %107]; + i32 %107 = add i32 %106, i32 1024; + i1 %108 = icmp sgt i32 %34, i32 %107; + cbr i1 %108(prob = 0.75), ^while.body7, ^scalar.final4; + ^while.body8: + i32 %109 = phi [^super.header5, i32 %26] [^while.body8, i32 %112]; + i32 %110 = add i32 %109, i32 131071; + i1 %111 = icmp sgt i32 %36, i32 %110; + i32 %112 = add i32 %109, i32 65536; + cbr i1 %111(prob = 0.941176), ^while.body8, ^scalar.header6; + ^scalar.final5: + i32 %113 = phi [^scalar.header5, i32 %104] [^while.body9, i32 %118]; + ubr ^scalar.header4; + ^scalar.header6: + i32 %114 = phi [^super.header5, i32 %26] [^while.body8, i32 %112]; + i32 %115 = phi [^super.header5, i32 undef] 
[^while.body8, i32 %112]; + i1 %116 = icmp sgt i32 %36, i32 %114; + cbr i1 %116(prob = 0.75), ^while.body10, ^scalar.final6; + ^while.body9 {scalar}: + i32 %117 = phi [^scalar.header5, i32 %103] [^while.body9, i32 %118]; + i32 %118 = add i32 %117, i32 4096; + i1 %119 = icmp sgt i32 %35, i32 %118; + cbr i1 %119(prob = 0.75), ^while.body9, ^scalar.final5; + ^scalar.final6: + i32 %120 = phi [^scalar.header6, i32 %115] [^while.body10, i32 %122]; + ubr ^scalar.header5; + ^while.body10 {scalar}: + i32 %121 = phi [^scalar.header6, i32 %114] [^while.body10, i32 %122]; + i32 %122 = add i32 %121, i32 16384; + i1 %123 = icmp sgt i32 %36, i32 %122; + cbr i1 %123(prob = 0.75), ^while.body10, ^scalar.final6; +} +internal [16 * i8]* @cmmc_parallel_body_payload_2, align 8; +internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { + ^b: + [16 * i8]* %2 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_3 to [16 * i8]*; + i32* %3 = ptradd [16 * i8]* %2, i32 4; + i32 %4 = load i32* %3; + i32* %5 = ptradd [16 * i8]* %2, i32 8; + i32 %6 = load i32* %5; i32* %7 = ptradd [16 * i8]* %2, i32 12; i32 %8 = load i32* %7; i32 %9 = mul i32 %0, i32 %8; @@ -364,25 +570,25 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32* %16 = getelementptr &([10000000 * i32]* %15)[i64 0][i64 0]; ubr ^b1; ^b1: - i32 %17 = phi [^b, i32 %0] [^b9, i32 %62]; - i32 %18 = phi [^b, i32 %12] [^b9, i32 %64]; + i32 %17 = phi [^b, i32 %0] [^b7, i32 %61]; + i32 %18 = phi [^b, i32 %12] [^b7, i32 %63]; i32 %19 = sub i32 %17, i32 %4; i32 %20 = add i32 %4, i32 %17; i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: - i32 %22 = phi [^b1, i32 0] [^b8, i32 %60]; + i32 %22 = phi [^b1, i32 0] [^b6, i32 %59]; i32 %23 = sub i32 %22, i32 %4; i32 %24 = add i32 %4, i32 %22; ubr ^while.body1; ^while.body1: - i32 %25 = phi [^while.body, i32 %19] [^b7, i32 %57]; - i32 %26 = phi [^while.body, i32 0] [^b7, i32 %52]; + i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %56]; + i32 %26 = phi [^while.body, i32 0] [^b5, i32 %51]; i1 %27 = icmp sle i32 %6, i32 %25; ubr ^while.body2; ^while.body2: - i32 %28 = phi [^while.body1, i32 %23] [^b6, i32 %55]; - i32 %29 = phi [^while.body1, i32 %26] [^b6, i32 %52]; + i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %54]; + i32 %29 = phi [^while.body1, i32 %26] [^b4, i32 %51]; i32 %30 = or i32 %25, i32 %28; i32 %31 = lshr i32 %30, i32 31; i1 %32 = ztrunc i32 %31 to i1; @@ -400,105 +606,8 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %40 = load i32* %39; ubr ^b2; ^while.body3: - i32 %41 = phi [^b2, i32 1] [^b5, i32 %53]; - i32 %42 = phi [^b2, i32 0] [^b5, i32 %52]; - i32 %43 = sdiv i32 %29, i32 %41; - i32 %44 = and i32 %43, i32 -2147483647; - i1 %45 = icmp eq i32 %44, i32 1; - cbr i1 %45(prob = 0.49), ^b4, ^b5; - ^b4: - i32 %46 = sdiv i32 %36, i32 %41; - i32 %47 = and i32 %46, i32 -2147483647; - i1 %48 = icmp eq i32 %47, i32 1; - ubr ^b5; - ^b5: - i1 %49 = phi [^while.body3, i1 false] [^b4, i1 %48]; - i32 %50 = mul i32 %42, i32 2; - i32 %51 = zext i1 %49 to i32; - i32 %52 = add i32 %50, i32 %51; - i32 %53 = mul i32 %41, i32 2; - i1 %54 = icmp slt i32 %53, i32 1073741824; - cbr i1 %54(prob = 0.984615), ^while.body3, ^b6; - ^b6: - i32 %55 = add i32 %28, i32 1; - i1 %56 = icmp sgt i32 %24, i32 %55; - cbr i1 %56(prob = 0.5), ^while.body2, ^b7; - ^b7: - i32 %57 = add i32 %25, i32 1; - i1 %58 = icmp sgt i32 %20, i32 %57; - cbr i1 %58(prob = 0.5), ^while.body1, ^b8; - ^b8: - i32* %59 = 
getelementptr &(i32* %21)[i32 %22]; - store i32* %59 with i32 %52; - i32 %60 = add i32 %22, i32 1; - i1 %61 = icmp sgt i32 %8, i32 %60; - cbr i1 %61(prob = 0.5), ^while.body, ^b9; - ^b9: - i32 %62 = add i32 %17, i32 1; - i1 %63 = icmp sgt i32 %1, i32 %62; - i32 %64 = add i32 %8, i32 %18; - cbr i1 %63(prob = 0.984615), ^b1, ^b10; - ^b10: - ret; -} -internal [16 * i8]* @cmmc_parallel_body_payload_2, align 8; -internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { - ^b: - [16 * i8]* %2 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_3 to [16 * i8]*; - i32* %3 = ptradd [16 * i8]* %2, i32 4; - i32 %4 = load i32* %3; - i32* %5 = ptradd [16 * i8]* %2, i32 8; - i32 %6 = load i32* %5; - i32 %7 = mul i32 %0, i32 %6; - i32* %8 = ptradd [16 * i8]* %2, i32 12; - i32 %9 = load i32* %8; - i32* %10 = ptradd [16 * i8]* %2, i32 0; - i32 %11 = load i32* %10; - i32 %12 = add i32 %7, i32 %11; - [10000000 * i32]* %13 = ptrcast [10000000 * i32]* @a to [10000000 * i32]*; - i32* %14 = getelementptr &([10000000 * i32]* %13)[i64 0][i64 0]; - [10000000 * i32]* %15 = ptrcast [10000000 * i32]* @b to [10000000 * i32]*; - i32* %16 = getelementptr &([10000000 * i32]* %15)[i64 0][i64 0]; - ubr ^b1; - ^b1: - i32 %17 = phi [^b, i32 %0] [^b7, i32 %61]; - i32 %18 = phi [^b, i32 %12] [^b7, i32 %63]; - i32 %19 = sub i32 %17, i32 %9; - i32 %20 = add i32 %9, i32 %17; - i32* %21 = getelementptr &(i32* %16)[i32 %18]; - ubr ^while.body; - ^while.body: - i32 %22 = phi [^b1, i32 0] [^b6, i32 %59]; - i32 %23 = sub i32 %22, i32 %9; - i32 %24 = add i32 %9, i32 %22; - ubr ^while.body1; - ^while.body1: - i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %56]; - i32 %26 = phi [^while.body, i32 0] [^b5, i32 %51]; - i1 %27 = icmp sle i32 %4, i32 %25; - ubr ^while.body2; - ^while.body2: - i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %54]; - i32 %29 = phi [^while.body1, i32 %26] [^b4, i32 %51]; - i32 %30 = or i32 %25, i32 %28; - i32 %31 = lshr i32 %30, i32 31; - i1 %32 = ztrunc i32 %31 to i1; - i1 %33 = or i1 %27, i1 %32; - i1 %34 = icmp sle i32 %6, i32 %28; - i1 %35 = or i1 %33, i1 %34; - cbr i1 %35(prob = 0.5), ^b2, ^b3; - ^b2: - i32 %36 = phi [^while.body2, i32 0] [^b3, i32 %40]; - ubr ^while.body3; - ^b3: - i32 %37 = mul i32 %6, i32 %25; - i32* %38 = getelementptr &(i32* %14)[i32 %28]; - i32* %39 = getelementptr &(i32* %38)[i32 %37]; - i32 %40 = load i32* %39; - ubr ^b2; - ^while.body3: - i32 %41 = phi [^b2, i32 1] [^while.body3, i32 %52]; - i32 %42 = phi [^b2, i32 0] [^while.body3, i32 %51]; + i32 %41 = phi [^b2, i32 1] [^while.body3, i32 %52]; + i32 %42 = phi [^b2, i32 0] [^while.body3, i32 %51]; i32 %43 = sdiv i32 %29, i32 %41; i32 %44 = srem i32 %43, i32 2; i32 %45 = sdiv i32 %36, i32 %41; @@ -523,12 +632,12 @@ internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse Parallel i32* %58 = getelementptr &(i32* %21)[i32 %22]; store i32* %58 with i32 %51; i32 %59 = add i32 %22, i32 1; - i1 %60 = icmp sgt i32 %6, i32 %59; + i1 %60 = icmp sgt i32 %8, i32 %59; cbr i1 %60(prob = 0.5), ^while.body, ^b7; ^b7: i32 %61 = add i32 %17, i32 1; i1 %62 = icmp sgt i32 %1, i32 %61; - i32 %63 = add i32 %6, i32 %18; + i32 %63 = add i32 %8, i32 %18; cbr i1 %62(prob = 0.984615), ^b1, ^b8; ^b8: ret; @@ -594,7 +703,7 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %43 = sdiv i32 %29, i32 %41; i32 %44 = and i32 %43, i32 -2147483647; i1 %45 = icmp eq i32 %44, i32 1; - cbr i1 %45(prob = 0.49), ^b5, ^b4; + cbr i1 %45(prob = 0.5), ^b5, ^b4; ^b4: i32 %46 = sdiv 
i32 %36, i32 %41; i32 %47 = and i32 %46, i32 -2147483647; @@ -652,19 +761,19 @@ internal func @cmmc_parallel_body_5(i32 %0, i32 %1) -> void { NoRecurse Parallel ^b1: i32 %17 = phi [^b, i32 %0] [^b7, i32 %49]; i32 %18 = phi [^b, i32 %12] [^b7, i32 %51]; - i32 %19 = sub i32 %17, i32 %6; - i32 %20 = add i32 %6, i32 %17; + i32 %19 = sub i32 %17, i32 %4; + i32 %20 = add i32 %4, i32 %17; i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: i32 %22 = phi [^b1, i32 0] [^b6, i32 %47]; - i32 %23 = sub i32 %22, i32 %6; - i32 %24 = add i32 %6, i32 %22; + i32 %23 = sub i32 %22, i32 %4; + i32 %24 = add i32 %4, i32 %22; ubr ^while.body1; ^while.body1: i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %44]; i32 %26 = phi [^while.body, i32 0] [^b5, i32 %43]; - i1 %27 = icmp sle i32 %4, i32 %25; + i1 %27 = icmp sle i32 %6, i32 %25; ubr ^b2; ^b2: i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %41]; @@ -712,204 +821,95 @@ internal func @cmmc_parallel_body_6(i32 %0, i32 %1) -> void { NoRecurse Parallel [16 * i8]* %2 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_6 to [16 * i8]*; i32* %3 = ptradd [16 * i8]* %2, i32 4; i32 %4 = load i32* %3; - i32* %5 = ptradd [16 * i8]* %2, i32 8; - i32 %6 = load i32* %5; - i32 %7 = mul i32 %0, i32 %6; - i32* %8 = ptradd [16 * i8]* %2, i32 0; + i32 %5 = mul i32 %0, i32 %4; + i32* %6 = ptradd [16 * i8]* %2, i32 8; + i32 %7 = load i32* %6; + i32* %8 = ptradd [16 * i8]* %2, i32 12; i32 %9 = load i32* %8; - i32 %10 = add i32 %7, i32 %9; - i32 %11 = add i32 %4, i32 -3; - i32 %12 = add i32 %4, i32 -18; - i32 %13 = add i32 %4, i32 -81; - i32 %14 = add i32 %4, i32 -336; - i32 %15 = add i32 %4, i32 -1359; - i32 %16 = add i32 %4, i32 -5454; - i32 %17 = add i32 %4, i32 -21837; - [10000000 * i32]* %18 = ptrcast [10000000 * i32]* @b to [10000000 * i32]*; - i32* %19 = getelementptr &([10000000 * i32]* %18)[i64 0][i64 0]; + i32* %10 = ptradd [16 * i8]* %2, i32 0; + i32 %11 = load i32* %10; + i32 %12 = add i32 %5, i32 %11; + [10000000 * i32]* %13 = ptrcast [10000000 * i32]* @a to [10000000 * i32]*; + i32* %14 = getelementptr &([10000000 * i32]* %13)[i64 0][i64 0]; + [10000000 * i32]* %15 = ptrcast [10000000 * i32]* @b to [10000000 * i32]*; + i32* %16 = getelementptr &([10000000 * i32]* %15)[i64 0][i64 0]; ubr ^b1; ^b1: - i32 %20 = phi [^b, i32 %0] [^b7, i32 %79]; - i32 %21 = phi [^b, i32 %10] [^b7, i32 %81]; - i32 %22 = sub i32 %20, i32 %4; - i32 %23 = add i32 %4, i32 %20; - i32* %24 = getelementptr &(i32* %19)[i32 %21]; + i32 %17 = phi [^b, i32 %0] [^b9, i32 %62]; + i32 %18 = phi [^b, i32 %12] [^b9, i32 %64]; + i32 %19 = sub i32 %17, i32 %9; + i32 %20 = add i32 %9, i32 %17; + i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: - i32 %25 = phi [^b1, i32 0] [^b5, i32 %63]; - i32 %26 = sub i32 %25, i32 %4; - i32 %27 = add i32 %26, i32 3; - i32 %28 = add i32 %4, i32 %25; - i1 %29 = icmp slt i32 %27, i32 %28; - i32 %30 = add i32 %11, i32 %25; - i32 %31 = add i32 %12, i32 %25; - i32 %32 = add i32 %13, i32 %25; - i32 %33 = add i32 %14, i32 %25; - i32 %34 = add i32 %15, i32 %25; - i32 %35 = add i32 %16, i32 %25; - i32 %36 = add i32 %17, i32 %25; - cbr i1 %29(prob = 0.5), ^b2, ^b3; + i32 %22 = phi [^b1, i32 0] [^b8, i32 %60]; + i32 %23 = sub i32 %22, i32 %9; + i32 %24 = add i32 %9, i32 %22; + ubr ^while.body1; + ^while.body1: + i32 %25 = phi [^while.body, i32 %19] [^b7, i32 %57]; + i32 %26 = phi [^while.body, i32 0] [^b7, i32 %52]; + i1 %27 = icmp sle i32 %7, i32 %25; + ubr ^while.body2; + ^while.body2: + i32 %28 = phi [^while.body1, i32 %23] [^b6, i32 
%55]; + i32 %29 = phi [^while.body1, i32 %26] [^b6, i32 %52]; + i32 %30 = or i32 %25, i32 %28; + i32 %31 = lshr i32 %30, i32 31; + i1 %32 = ztrunc i32 %31 to i1; + i1 %33 = or i1 %27, i1 %32; + i1 %34 = icmp sle i32 %4, i32 %28; + i1 %35 = or i1 %33, i1 %34; + cbr i1 %35(prob = 0.5), ^b2, ^b3; ^b2: - i32 %37 = phi [^while.body, i32 %22] [^b6, i32 %77]; - i32 %38 = add i32 %26, i32 15; - i1 %39 = icmp sgt i32 %30, i32 %38; - cbr i1 %39(prob = 0.941176), ^super.header, ^scalar.header; + i32 %36 = phi [^while.body2, i32 0] [^b3, i32 %40]; + ubr ^while.body3; ^b3: - i32 %40 = phi [^while.body, i32 %22] [^b4, i32 %50]; - i1 %41 = icmp slt i32 %26, i32 %28; - cbr i1 %41(prob = 0.75), ^while.body1, ^b4; - ^super.header: - i32 %42 = add i32 %26, i32 63; - i1 %43 = icmp sgt i32 %31, i32 %42; - cbr i1 %43(prob = 0.941176), ^super.header1, ^scalar.header1; - ^while.body1 {scalar}: - i32 %44 = phi [^b3, i32 %26] [^while.body1, i32 %45]; - i32 %45 = add i32 %44, i32 1; - i1 %46 = icmp sgt i32 %28, i32 %45; - cbr i1 %46(prob = 0.75), ^while.body1, ^b4; - ^scalar.header: - i32 %47 = phi [^b2, i32 %26] [^scalar.final1, i32 %67]; - i32 %48 = phi [^b2, i32 undef] [^scalar.final1, i32 %67]; - i1 %49 = icmp sgt i32 %30, i32 %47; - cbr i1 %49(prob = 0.75), ^while.body2, ^scalar.final; + i32 %37 = mul i32 %4, i32 %25; + i32* %38 = getelementptr &(i32* %14)[i32 %28]; + i32* %39 = getelementptr &(i32* %38)[i32 %37]; + i32 %40 = load i32* %39; + ubr ^b2; + ^while.body3: + i32 %41 = phi [^b2, i32 1] [^b5, i32 %53]; + i32 %42 = phi [^b2, i32 0] [^b5, i32 %52]; + i32 %43 = sdiv i32 %29, i32 %41; + i32 %44 = and i32 %43, i32 -2147483647; + i1 %45 = icmp eq i32 %44, i32 1; + cbr i1 %45(prob = 0.5), ^b4, ^b5; ^b4: - i32 %50 = add i32 %40, i32 1; - i1 %51 = icmp sgt i32 %23, i32 %50; - cbr i1 %51(prob = 0.5), ^b3, ^b5; - ^super.header1: - i32 %52 = add i32 %26, i32 255; - i1 %53 = icmp sgt i32 %32, i32 %52; - cbr i1 %53(prob = 0.941176), ^super.header2, ^scalar.header2; - ^scalar.header1: - i32 %54 = phi [^super.header, i32 %26] [^scalar.final2, i32 %84]; - i32 %55 = phi [^super.header, i32 undef] [^scalar.final2, i32 %84]; - i1 %56 = icmp sgt i32 %31, i32 %54; - cbr i1 %56(prob = 0.75), ^while.body3, ^scalar.final1; - ^while.body2 {scalar}: - i32 %57 = phi [^scalar.header, i32 %47] [^while.body2, i32 %58]; - i32 %58 = add i32 %57, i32 4; - i1 %59 = icmp sgt i32 %30, i32 %58; - cbr i1 %59(prob = 0.75), ^while.body2, ^scalar.final; - ^scalar.final: - i32 %60 = phi [^scalar.header, i32 %48] [^while.body2, i32 %58]; - i1 %61 = icmp sgt i32 %28, i32 %60; - cbr i1 %61(prob = 0.75), ^while.body4, ^b6; + i32 %46 = sdiv i32 %36, i32 %41; + i32 %47 = and i32 %46, i32 -2147483647; + i1 %48 = icmp eq i32 %47, i32 1; + ubr ^b5; ^b5: - i32* %62 = getelementptr &(i32* %24)[i32 %25]; - store i32* %62 with i32 0; - i32 %63 = add i32 %25, i32 1; - i1 %64 = icmp sgt i32 %6, i32 %63; - cbr i1 %64(prob = 0.5), ^while.body, ^b7; - ^super.header2: - i32 %65 = add i32 %26, i32 1023; - i1 %66 = icmp sgt i32 %33, i32 %65; - cbr i1 %66(prob = 0.941176), ^super.header3, ^scalar.header3; - ^scalar.final1: - i32 %67 = phi [^scalar.header1, i32 %55] [^while.body3, i32 %72]; - ubr ^scalar.header; - ^scalar.header2: - i32 %68 = phi [^super.header1, i32 %26] [^scalar.final3, i32 %93]; - i32 %69 = phi [^super.header1, i32 undef] [^scalar.final3, i32 %93]; - i1 %70 = icmp sgt i32 %32, i32 %68; - cbr i1 %70(prob = 0.75), ^while.body5, ^scalar.final2; - ^while.body3 {scalar}: - i32 %71 = phi [^scalar.header1, i32 %54] [^while.body3, i32 %72]; - i32 %72 = 
add i32 %71, i32 16; - i1 %73 = icmp sgt i32 %31, i32 %72; - cbr i1 %73(prob = 0.75), ^while.body3, ^scalar.final1; - ^while.body4 {scalar}: - i32 %74 = phi [^scalar.final, i32 %60] [^while.body4, i32 %75]; - i32 %75 = add i32 %74, i32 1; - i1 %76 = icmp sgt i32 %28, i32 %75; - cbr i1 %76(prob = 0.75), ^while.body4, ^b6; + i1 %49 = phi [^while.body3, i1 false] [^b4, i1 %48]; + i32 %50 = mul i32 %42, i32 2; + i32 %51 = zext i1 %49 to i32; + i32 %52 = add i32 %50, i32 %51; + i32 %53 = mul i32 %41, i32 2; + i1 %54 = icmp slt i32 %53, i32 1073741824; + cbr i1 %54(prob = 0.984615), ^while.body3, ^b6; ^b6: - i32 %77 = add i32 %37, i32 1; - i1 %78 = icmp sgt i32 %23, i32 %77; - cbr i1 %78(prob = 0.5), ^b2, ^b5; + i32 %55 = add i32 %28, i32 1; + i1 %56 = icmp sgt i32 %24, i32 %55; + cbr i1 %56(prob = 0.5), ^while.body2, ^b7; ^b7: - i32 %79 = add i32 %20, i32 1; - i1 %80 = icmp sgt i32 %1, i32 %79; - i32 %81 = add i32 %6, i32 %21; - cbr i1 %80(prob = 0.984615), ^b1, ^b8; - ^super.header3: - i32 %82 = add i32 %26, i32 4095; - i1 %83 = icmp sgt i32 %34, i32 %82; - cbr i1 %83(prob = 0.941176), ^super.header4, ^scalar.header4; - ^scalar.final2: - i32 %84 = phi [^scalar.header2, i32 %69] [^while.body5, i32 %89]; - ubr ^scalar.header1; - ^scalar.header3: - i32 %85 = phi [^super.header2, i32 %26] [^scalar.final4, i32 %102]; - i32 %86 = phi [^super.header2, i32 undef] [^scalar.final4, i32 %102]; - i1 %87 = icmp sgt i32 %33, i32 %85; - cbr i1 %87(prob = 0.75), ^while.body6, ^scalar.final3; - ^while.body5 {scalar}: - i32 %88 = phi [^scalar.header2, i32 %68] [^while.body5, i32 %89]; - i32 %89 = add i32 %88, i32 64; - i1 %90 = icmp sgt i32 %32, i32 %89; - cbr i1 %90(prob = 0.75), ^while.body5, ^scalar.final2; + i32 %57 = add i32 %25, i32 1; + i1 %58 = icmp sgt i32 %20, i32 %57; + cbr i1 %58(prob = 0.5), ^while.body1, ^b8; ^b8: + i32* %59 = getelementptr &(i32* %21)[i32 %22]; + store i32* %59 with i32 %52; + i32 %60 = add i32 %22, i32 1; + i1 %61 = icmp sgt i32 %4, i32 %60; + cbr i1 %61(prob = 0.5), ^while.body, ^b9; + ^b9: + i32 %62 = add i32 %17, i32 1; + i1 %63 = icmp sgt i32 %1, i32 %62; + i32 %64 = add i32 %4, i32 %18; + cbr i1 %63(prob = 0.984615), ^b1, ^b10; + ^b10: ret; - ^super.header4: - i32 %91 = add i32 %26, i32 16383; - i1 %92 = icmp sgt i32 %35, i32 %91; - cbr i1 %92(prob = 0.941176), ^super.header5, ^scalar.header5; - ^scalar.final3: - i32 %93 = phi [^scalar.header3, i32 %86] [^while.body6, i32 %98]; - ubr ^scalar.header2; - ^scalar.header4: - i32 %94 = phi [^super.header3, i32 %26] [^scalar.final5, i32 %113]; - i32 %95 = phi [^super.header3, i32 undef] [^scalar.final5, i32 %113]; - i1 %96 = icmp sgt i32 %34, i32 %94; - cbr i1 %96(prob = 0.75), ^while.body7, ^scalar.final4; - ^while.body6 {scalar}: - i32 %97 = phi [^scalar.header3, i32 %85] [^while.body6, i32 %98]; - i32 %98 = add i32 %97, i32 256; - i1 %99 = icmp sgt i32 %33, i32 %98; - cbr i1 %99(prob = 0.75), ^while.body6, ^scalar.final3; - ^super.header5: - i32 %100 = add i32 %26, i32 65535; - i1 %101 = icmp sgt i32 %36, i32 %100; - cbr i1 %101(prob = 0.941176), ^while.body8, ^scalar.header6; - ^scalar.final4: - i32 %102 = phi [^scalar.header4, i32 %95] [^while.body7, i32 %107]; - ubr ^scalar.header3; - ^scalar.header5: - i32 %103 = phi [^super.header4, i32 %26] [^scalar.final6, i32 %120]; - i32 %104 = phi [^super.header4, i32 undef] [^scalar.final6, i32 %120]; - i1 %105 = icmp sgt i32 %35, i32 %103; - cbr i1 %105(prob = 0.75), ^while.body9, ^scalar.final5; - ^while.body7 {scalar}: - i32 %106 = phi [^scalar.header4, i32 %94] [^while.body7, 
i32 %107]; - i32 %107 = add i32 %106, i32 1024; - i1 %108 = icmp sgt i32 %34, i32 %107; - cbr i1 %108(prob = 0.75), ^while.body7, ^scalar.final4; - ^while.body8: - i32 %109 = phi [^super.header5, i32 %26] [^while.body8, i32 %112]; - i32 %110 = add i32 %109, i32 131071; - i1 %111 = icmp sgt i32 %36, i32 %110; - i32 %112 = add i32 %109, i32 65536; - cbr i1 %111(prob = 0.941176), ^while.body8, ^scalar.header6; - ^scalar.final5: - i32 %113 = phi [^scalar.header5, i32 %104] [^while.body9, i32 %118]; - ubr ^scalar.header4; - ^scalar.header6: - i32 %114 = phi [^super.header5, i32 %26] [^while.body8, i32 %112]; - i32 %115 = phi [^super.header5, i32 undef] [^while.body8, i32 %112]; - i1 %116 = icmp sgt i32 %36, i32 %114; - cbr i1 %116(prob = 0.75), ^while.body10, ^scalar.final6; - ^while.body9 {scalar}: - i32 %117 = phi [^scalar.header5, i32 %103] [^while.body9, i32 %118]; - i32 %118 = add i32 %117, i32 4096; - i1 %119 = icmp sgt i32 %35, i32 %118; - cbr i1 %119(prob = 0.75), ^while.body9, ^scalar.final5; - ^scalar.final6: - i32 %120 = phi [^scalar.header6, i32 %115] [^while.body10, i32 %122]; - ubr ^scalar.header5; - ^while.body10 {scalar}: - i32 %121 = phi [^scalar.header6, i32 %114] [^while.body10, i32 %122]; - i32 %122 = add i32 %121, i32 16384; - i1 %123 = icmp sgt i32 %36, i32 %122; - cbr i1 %123(prob = 0.75), ^while.body10, ^scalar.final6; } internal [16 * i8]* @cmmc_parallel_body_payload_6, align 8; diff --git a/tests/SysY2022/performance/conv1.arm.s b/tests/SysY2022/performance/conv1.arm.s index da64b8f36..94f9cc027 100644 --- a/tests/SysY2022/performance/conv1.arm.s +++ b/tests/SysY2022/performance/conv1.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .section .rodata -.align 8 +.p2align 3 __cmmc_jumptable1312: .word label1287-__cmmc_jumptable1312 .word label1286-__cmmc_jumptable1312 @@ -9,34 +9,34 @@ __cmmc_jumptable1312: .word label1284-__cmmc_jumptable1312 .word label1283-__cmmc_jumptable1312 .bss -.align 8 +.p2align 3 a: .zero 40000000 -.align 8 +.p2align 3 b: .zero 40000000 -.align 8 +.p2align 3 kernelid: .zero 40000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_4: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_5: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_6: .zero 16 .text diff --git a/tests/SysY2022/performance/conv1.riscv.s b/tests/SysY2022/performance/conv1.riscv.s index e74ffefcd..028c63550 100644 --- a/tests/SysY2022/performance/conv1.riscv.s +++ b/tests/SysY2022/performance/conv1.riscv.s @@ -1,204 +1,198 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 8 -__cmmc_jumptable1415: - .word label1390-__cmmc_jumptable1415 - .word label1389-__cmmc_jumptable1415 - .word label1388-__cmmc_jumptable1415 - .word label1387-__cmmc_jumptable1415 - .word label1386-__cmmc_jumptable1415 +.p2align 3 +__cmmc_jumptable1433: + .word label1408-__cmmc_jumptable1433 + .word label1407-__cmmc_jumptable1433 + .word label1406-__cmmc_jumptable1433 + .word label1405-__cmmc_jumptable1433 + .word label1404-__cmmc_jumptable1433 .bss -.align 8 +.p2align 3 a: .zero 40000000 -.align 8 +.p2align 3 b: .zero 40000000 -.align 8 +.p2align 3 kernelid: .zero 40000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 
-.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_4: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_5: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_6: .zero 16 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[64] CalleeSaved[104] - addi sp, sp, -168 + # stack usage: CalleeArg[0] Local[0] RegSpill[48] CalleeSaved[104] + addi sp, sp, -152 sd ra, 0(sp) - sd s0, 8(sp) - sd s5, 16(sp) - sd s1, 24(sp) - sd s6, 32(sp) - sd s2, 40(sp) - sd s4, 48(sp) - sd s7, 56(sp) - sd s8, 64(sp) - sd s11, 72(sp) - sd s3, 80(sp) - sd s9, 88(sp) - sd s10, 96(sp) + sd s2, 8(sp) + sd s0, 16(sp) + sd s5, 24(sp) + sd s1, 32(sp) + sd s6, 40(sp) + sd s3, 48(sp) + sd s4, 56(sp) + sd s7, 64(sp) + sd s9, 72(sp) + sd s8, 80(sp) + sd s10, 88(sp) + sd s11, 96(sp) jal getint - mv s0, a0 + mv s2, a0 jal getint sd a0, 112(sp) mv a1, a0 jal getint -pcrel1546: +pcrel1555: auipc a1, %pcrel_hi(a) sd a0, 104(sp) - addi a2, a1, %pcrel_lo(pcrel1546) + addi a2, a1, %pcrel_lo(pcrel1555) sd a2, 120(sp) mv a0, a2 jal getarray -pcrel1547: +pcrel1556: auipc a1, %pcrel_hi(kernelid) - addi s1, a1, %pcrel_lo(pcrel1547) - mv a0, s1 + addi s0, a1, %pcrel_lo(pcrel1556) + mv a0, s0 jal getarray sd a0, 136(sp) li a0, 109 jal _sysy_starttime -pcrel1548: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_3) - li s9, 5 -pcrel1549: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) - mv s11, zero -pcrel1550: - auipc s10, %pcrel_hi(cmmc_parallel_body_payload_1) - srliw a2, s0, 31 - addi s6, s7, %pcrel_lo(pcrel1548) - addi s4, s5, %pcrel_lo(pcrel1549) ld a0, 104(sp) +pcrel1557: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_5) +pcrel1558: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_4) + srliw a2, s2, 31 +pcrel1559: + auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) + mv s11, zero +pcrel1560: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) + addi s8, s9, %pcrel_lo(pcrel1557) + addi s6, s7, %pcrel_lo(pcrel1558) + addi s4, s5, %pcrel_lo(pcrel1560) ld a1, 112(sp) mulw a1, a1, a0 - add a0, s0, a2 - sraiw s2, a0, 1 + add a0, s2, a2 + addi s2, s3, %pcrel_lo(pcrel1559) + sraiw s1, a0, 1 sd a1, 128(sp) -pcrel1551: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel1552: - auipc a1, %pcrel_hi(__cmmc_jumptable1415) - addi s8, a0, %pcrel_lo(pcrel1551) - addi s3, a1, %pcrel_lo(pcrel1552) -pcrel1553: +pcrel1561: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) - addi a3, a0, %pcrel_lo(pcrel1553) -pcrel1554: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_5) - sd a3, 160(sp) - addi a3, a0, %pcrel_lo(pcrel1554) -pcrel1555: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_6) - sd a3, 152(sp) - addi a2, a0, %pcrel_lo(pcrel1555) -pcrel1556: - auipc a0, %pcrel_hi(cmmc_parallel_body_1) +pcrel1562: + auipc a1, %pcrel_hi(cmmc_parallel_body_payload_6) + addi a2, a0, %pcrel_lo(pcrel1561) + addi s10, a1, %pcrel_lo(pcrel1562) sd a2, 144(sp) - addi s0, a0, %pcrel_lo(pcrel1556) - lw a0, 0(s1) - mv a1, a0 - bltu a0, s9, label1419 + lw a1, 0(s0) + li a3, 5 + mv a0, a1 + bltu a1, a3, label1437 .p2align 2 -label1385: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_6) -pcrel1557: - auipc a5, %pcrel_hi(cmmc_parallel_body_6) - sw zero, %pcrel_lo(label1385)(a0) +label1398: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) +pcrel1563: + auipc a5, %pcrel_hi(cmmc_parallel_body_2) + sw zero, %pcrel_lo(label1398)(a0) ld a2, 144(sp) - sw s2, 4(a2) + sw s1, 4(a2) ld a1, 112(sp) ld a0, 104(sp) slli 
a3, a1, 32 add.uw a4, a0, a3 mv a0, zero sd a4, 8(a2) - addi a2, a5, %pcrel_lo(pcrel1557) + addi a2, a5, %pcrel_lo(pcrel1563) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 -.p2align 2 -label1387: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel1558: - auipc a4, %pcrel_hi(cmmc_parallel_body_4) - sw zero, %pcrel_lo(label1387)(a0) - sw s2, 4(s8) - ld a1, 112(sp) + bgt a1, zero, label1400 + j label1401 +.p2align 2 +label1404: + auipc a1, %pcrel_hi(cmmc_parallel_body_payload_6) + slli a2, s1, 32 +pcrel1564: + auipc a3, %pcrel_hi(cmmc_parallel_body_6) + sw zero, %pcrel_lo(label1404)(a1) ld a0, 104(sp) - slli a2, a1, 32 - add.uw a3, a0, a2 + sw a0, 4(s10) + ld a1, 112(sp) + add.uw a0, a1, a2 + addi a2, a3, %pcrel_lo(pcrel1564) + sd a0, 8(s10) mv a0, zero - addi a2, a4, %pcrel_lo(pcrel1558) - sd a3, 8(s8) jal cmmcParallelFor ld a1, 128(sp) - ble a1, zero, label1381 + ble a1, zero, label1401 .p2align 2 -label1384: +label1400: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_1) +pcrel1565: + auipc a3, %pcrel_hi(cmmc_parallel_body_1) ld a1, 128(sp) + addi a2, a3, %pcrel_lo(pcrel1565) + sw a1, %pcrel_lo(label1400)(a0) mv a0, zero -pcrel1559: - auipc s10, %pcrel_hi(cmmc_parallel_body_payload_1) - sw a1, %pcrel_lo(pcrel1559)(s10) - mv a2, s0 jal cmmcParallelFor - ld a0, 136(sp) - addiw s11, s11, 1 - ble a0, s11, label1383 .p2align 2 -label1382: - addi s1, s1, 4 - lw a0, 0(s1) - mv a1, a0 - bgeu a0, s9, label1385 -.p2align 2 -label1419: - sh2add a3, a1, s3 - lw a2, 0(a3) - add a0, s3, a2 - jr a0 -.p2align 2 -label1390: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) - sw zero, %pcrel_lo(label1390)(s5) - slli a2, s2, 32 -pcrel1560: +label1401: + addiw s11, s11, 1 + ld a0, 136(sp) + ble a0, s11, label1403 + addi s0, s0, 4 + li a3, 5 + lw a1, 0(s0) + mv a0, a1 + bgeu a1, a3, label1398 +.p2align 2 +label1437: + auipc a4, %pcrel_hi(__cmmc_jumptable1433) + addi a2, a4, %pcrel_lo(label1437) + sh2add a1, a0, a2 + lw a3, 0(a1) + add a4, a2, a3 + jr a4 +.p2align 2 +label1408: + auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) + sw zero, %pcrel_lo(label1408)(s3) +pcrel1566: auipc a3, %pcrel_hi(cmmc_parallel_body_0) ld a0, 104(sp) - sw a0, 4(s4) + sw a0, 4(s2) ld a1, 112(sp) - add.uw a0, a1, a2 - addi a2, a3, %pcrel_lo(pcrel1560) - sd a0, 8(s4) + slli a2, a1, 32 + add.uw a0, s1, a2 + addi a2, a3, %pcrel_lo(pcrel1566) + sd a0, 8(s2) mv a0, zero jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 -label1383: + bgt a1, zero, label1400 + j label1401 +label1403: li a0, 116 jal _sysy_stoptime ld a2, 120(sp) @@ -206,93 +200,87 @@ label1383: mv a0, a1 mv a1, a2 jal putarray - mv a0, zero ld ra, 0(sp) - ld s0, 8(sp) - ld s5, 16(sp) - ld s1, 24(sp) - ld s6, 32(sp) - ld s2, 40(sp) - ld s4, 48(sp) - ld s7, 56(sp) - ld s8, 64(sp) - ld s11, 72(sp) - ld s3, 80(sp) - ld s9, 88(sp) - ld s10, 96(sp) - addi sp, sp, 168 + mv a0, zero + ld s2, 8(sp) + ld s0, 16(sp) + ld s5, 24(sp) + ld s1, 32(sp) + ld s6, 40(sp) + ld s3, 48(sp) + ld s4, 56(sp) + ld s7, 64(sp) + ld s9, 72(sp) + ld s8, 80(sp) + ld s10, 88(sp) + ld s11, 96(sp) + addi sp, sp, 152 ret .p2align 2 -label1389: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_3) - sw zero, %pcrel_lo(label1389)(s7) - slli a2, s2, 32 -pcrel1561: +label1407: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) + sw zero, %pcrel_lo(label1407)(s5) +pcrel1567: auipc a4, %pcrel_hi(cmmc_parallel_body_3) - ld a1, 112(sp) - sw a1, 4(s6) + sw s1, 4(s4) ld a0, 104(sp) - add.uw a3, a0, a2 + ld a1, 112(sp) + slli a2, a0, 32 mv a0, zero - addi 
a2, a4, %pcrel_lo(pcrel1561) - sd a3, 8(s6) + add.uw a3, a1, a2 + addi a2, a4, %pcrel_lo(pcrel1567) + sd a3, 8(s4) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 -label1381: - addiw s11, s11, 1 - ld a0, 136(sp) - bgt a0, s11, label1382 - j label1383 + bgt a1, zero, label1400 + j label1401 .p2align 2 -label1386: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel1562: - auipc a5, %pcrel_hi(cmmc_parallel_body_2) - sw zero, %pcrel_lo(label1386)(a0) - ld a3, 160(sp) - sw s2, 4(a3) - ld a0, 104(sp) +label1405: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_4) + sw zero, %pcrel_lo(label1405)(s7) +pcrel1568: + auipc a4, %pcrel_hi(cmmc_parallel_body_4) + sw s1, 4(s6) ld a1, 112(sp) - slli a2, a0, 32 + ld a0, 104(sp) + slli a2, a1, 32 + add.uw a3, a0, a2 mv a0, zero - add.uw a4, a1, a2 - addi a2, a5, %pcrel_lo(pcrel1562) - sd a4, 8(a3) + addi a2, a4, %pcrel_lo(pcrel1568) + sd a3, 8(s6) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 -.p2align 2 -label1388: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_5) -pcrel1563: - auipc a5, %pcrel_hi(cmmc_parallel_body_5) - sw zero, %pcrel_lo(label1388)(a0) - ld a1, 112(sp) - ld a3, 152(sp) - sw a1, 4(a3) + bgt a1, zero, label1400 + j label1401 +.p2align 2 +label1406: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_5) + sw zero, %pcrel_lo(label1406)(s9) +pcrel1569: + auipc a4, %pcrel_hi(cmmc_parallel_body_5) + sw s1, 4(s8) ld a0, 104(sp) + ld a1, 112(sp) slli a2, a0, 32 mv a0, zero - add.uw a4, s2, a2 - addi a2, a5, %pcrel_lo(pcrel1563) - sd a4, 8(a3) + add.uw a3, a1, a2 + addi a2, a4, %pcrel_lo(pcrel1569) + sd a3, 8(s8) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 + bgt a1, zero, label1400 + j label1401 .p2align 2 cmmc_parallel_body_0: addi sp, sp, -80 mv t1, a1 -pcrel168: +pcrel172: auipc a5, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel169: +pcrel173: auipc t5, %pcrel_hi(b) sd s0, 0(sp) - addi t0, a5, %pcrel_lo(pcrel168) - addi t4, t5, %pcrel_lo(pcrel169) + addi t0, a5, %pcrel_lo(pcrel172) + addi t4, t5, %pcrel_lo(pcrel173) sd s5, 8(sp) sd s4, 16(sp) sd s3, 24(sp) @@ -300,18 +288,18 @@ pcrel169: sd s6, 40(sp) sd s2, 48(sp) sd s7, 56(sp) - sd s9, 64(sp) - sd s8, 72(sp) + sd s8, 64(sp) + sd s9, 72(sp) lw a2, 4(t0) - lw a4, 8(t0) - lw a3, 12(t0) + lw a3, 8(t0) + lw a4, 12(t0) mulw a1, a0, a2 - lw t3, %pcrel_lo(pcrel168)(a5) + lw t3, %pcrel_lo(pcrel172)(a5) addw t2, a1, t3 -pcrel170: +pcrel174: auipc a1, %pcrel_hi(a) sh2add t0, t2, t4 - addi a5, a1, %pcrel_lo(pcrel170) + addi a5, a1, %pcrel_lo(pcrel174) mv t2, a0 lui a1, 786432 lui a0, 262144 @@ -325,9 +313,39 @@ pcrel170: mv s4, zero j label8 .p2align 2 -label94: +label150: + addiw s3, s3, 1 + ble a7, s3, label154 +.p2align 2 +label28: + addi s2, s2, 4 + or s7, s0, s3 + srliw s8, s7, 31 + slt s7, s3, a2 + andi s6, s8, 1 + xori s8, s7, 1 + or s5, s1, s6 + or s9, s5, s8 + beq s9, zero, label148 +.p2align 2 +label67: + mv s6, zero + sext.w s5, s4 + ble s5, a0, label146 +.p2align 2 +label31: + addw s5, s5, a1 + bgt s5, a0, label31 + mv s4, s5 + bge s5, zero, label150 +.p2align 2 +label29: + addw s4, s4, a0 + blt s4, zero, label29 + addiw s3, s3, 1 + bgt a7, s3, label28 addiw s0, s0, 1 - ble t4, s0, label149 + ble t4, s0, label153 .p2align 2 label8: slt s3, s0, a4 @@ -335,88 +353,58 @@ label8: xori s1, s3, 1 mv s3, a6 or s7, s0, a6 - slt s8, a6, a2 - srliw s9, s7, 31 - xori s7, s8, 1 - andi s6, s9, 1 + srliw s8, s7, 31 + slt s7, a6, a2 + andi s6, s8, 1 + xori s8, s7, 1 or s5, s1, s6 - or s6, s5, s7 - bne s6, zero, label67 - mulw s7, 
a2, s0 - sh2add s6, s7, s2 - lw s5, 0(s6) - addw s4, s4, s5 - bgt s4, a0, label18 - j label75 -label146: - blt s4, zero, label22 -.p2align 2 -label84: - addiw s3, s3, 1 - bgt a7, s3, label26 - addiw s0, s0, 1 - bgt t4, s0, label8 + or s9, s5, s8 + bne s9, zero, label67 + mulw s5, a2, s0 + sh2add s7, s5, s2 + lw s6, 0(s7) + addw s5, s4, s6 + bgt s5, a0, label31 +label151: + mv s4, s5 + blt s5, zero, label29 + j label20 .p2align 2 -label149: +label153: addiw t6, t6, 1 sw s4, 0(t5) - ble a2, t6, label154 + ble a2, t6, label157 +.p2align 2 +label24: addi t5, t5, 4 subw a6, t6, a3 addw a7, a3, t6 mv s0, t3 mv s4, zero slt s3, t3, a4 - slt s8, a6, a2 or s7, t3, a6 sh2add s2, a6, a5 xori s1, s3, 1 - srliw s9, s7, 31 + srliw s8, s7, 31 mv s3, a6 - xori s7, s8, 1 - andi s6, s9, 1 + slt s7, a6, a2 + andi s6, s8, 1 + xori s8, s7, 1 or s5, s1, s6 - or s6, s5, s7 - beq s6, zero, label157 -.p2align 2 -label67: - mv s5, zero - sext.w s4, s4 - ble s4, a0, label146 -.p2align 2 -label18: - addw s4, s4, a1 - bgt s4, a0, label18 - bge s4, zero, label84 -.p2align 2 -label22: - addw s4, s4, a0 - blt s4, zero, label22 - addiw s3, s3, 1 - ble a7, s3, label94 -.p2align 2 -label26: - addi s2, s2, 4 - or s7, s0, s3 - slt s8, s3, a2 - srliw s9, s7, 31 - xori s7, s8, 1 - andi s6, s9, 1 - or s5, s1, s6 - or s6, s5, s7 - bne s6, zero, label67 - mulw s7, a2, s0 - sh2add s6, s7, s2 - lw s5, 0(s6) - addw s4, s4, s5 - bgt s4, a0, label18 -label75: - blt s4, zero, label22 - j label84 + or s9, s5, s8 + bne s9, zero, label67 + mulw s5, a2, t3 + sh2add s7, s5, s2 + lw s6, 0(s7) + mv s5, s6 + bgt s6, a0, label31 + j label151 .p2align 2 -label154: +label157: addiw t2, t2, 1 - ble t1, t2, label30 + ble t1, t2, label26 +.p2align 2 +label27: sh2add t0, a2, t0 subw t3, t2, a3 addw t4, a3, t2 @@ -427,24 +415,43 @@ label154: mv t5, t0 slt s3, t3, a4 mv s0, t3 - slt s8, a6, a2 or s7, t3, a6 sh2add s2, a6, a5 xori s1, s3, 1 - srliw s9, s7, 31 + srliw s8, s7, 31 mv s3, a6 - xori s7, s8, 1 - andi s6, s9, 1 + slt s7, a6, a2 + andi s6, s8, 1 + xori s8, s7, 1 or s5, s1, s6 - or s6, s5, s7 - bne s6, zero, label67 - mulw s7, a2, t3 - sh2add s6, s7, s2 - lw s5, 0(s6) + or s9, s5, s8 + bne s9, zero, label67 + mulw s5, a2, t3 + sh2add s7, s5, s2 + lw s6, 0(s7) + mv s5, s6 + bgt s6, a0, label31 + j label151 +.p2align 2 +label148: + mulw s5, a2, s0 + sh2add s7, s5, s2 + lw s6, 0(s7) + addw s5, s4, s6 + bgt s5, a0, label31 mv s4, s5 - bgt s5, a0, label18 - j label75 -label30: + blt s5, zero, label29 + j label20 +label154: + addiw s0, s0, 1 + bgt t4, s0, label8 +label23: + addiw t6, t6, 1 + sw s4, 0(t5) + bgt a2, t6, label24 + addiw t2, t2, 1 + bgt t1, t2, label27 +label26: ld s0, 0(sp) ld s5, 8(sp) ld s4, 16(sp) @@ -453,40 +460,42 @@ label30: ld s6, 40(sp) ld s2, 48(sp) ld s7, 56(sp) - ld s9, 64(sp) - ld s8, 72(sp) + ld s8, 64(sp) + ld s9, 72(sp) addi sp, sp, 80 ret .p2align 2 -label157: - mulw s7, a2, s0 - sh2add s6, s7, s2 - lw s5, 0(s6) - addw s4, s4, s5 - bgt s4, a0, label18 - j label75 +label146: + mv s4, s5 + blt s5, zero, label29 +label20: + addiw s3, s3, 1 + bgt a7, s3, label28 + addiw s0, s0, 1 + bgt t4, s0, label8 + j label23 .p2align 2 cmmc_parallel_body_1: mv t0, a0 addiw a4, a0, 3 -pcrel329: +pcrel333: auipc a5, %pcrel_hi(b) -pcrel330: +pcrel334: auipc a0, %pcrel_hi(a) - addi a3, a5, %pcrel_lo(pcrel329) - addi a2, a0, %pcrel_lo(pcrel330) - ble a1, a4, label186 + addi a3, a5, %pcrel_lo(pcrel333) + addi a2, a0, %pcrel_lo(pcrel334) + ble a1, a4, label190 addiw t1, t0, 15 addiw a4, a1, -3 addiw a5, a1, -18 - bge t1, a4, label209 + bge t1, 
a4, label213 sh2add a0, t0, a2 - j label182 + j label186 .p2align 2 -label185: +label189: addi a0, a0, 64 .p2align 2 -label182: +label186: sh2add t1, t0, a3 addiw t0, t0, 16 ld t3, 0(t1) @@ -505,345 +514,445 @@ label182: sd t2, 48(a0) ld t3, 56(t1) sd t3, 56(a0) - bgt a5, t0, label185 + bgt a5, t0, label189 mv a0, t0 -label173: - ble a4, a0, label186 +label177: + ble a4, a0, label190 sh2add a5, a0, a3 - j label177 -label180: + j label181 +label184: addi a5, a5, 16 -label177: +label181: sh2add t0, a0, a2 ld t2, 0(a5) addiw a0, a0, 4 sd t2, 0(t0) ld t1, 8(a5) sd t1, 8(t0) - bgt a4, a0, label180 + bgt a4, a0, label184 mv t0, a0 -label186: - ble a1, t0, label188 +label190: + ble a1, t0, label197 sh2add a0, t0, a3 - j label190 -label193: + j label193 +label196: addi a0, a0, 4 -label190: +label193: sh2add a3, t0, a2 lw a4, 0(a0) addiw t0, t0, 1 sw a4, 0(a3) - bgt a1, t0, label193 -label188: + bgt a1, t0, label196 +label197: ret -label209: +label213: mv a0, t0 mv t0, zero - j label173 + j label177 .p2align 2 cmmc_parallel_body_2: - addi sp, sp, -96 - mv t3, a1 -pcrel515: + # stack usage: CalleeArg[0] Local[0] RegSpill[40] CalleeSaved[96] + addi sp, sp, -136 +pcrel722: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel516: - auipc t5, %pcrel_hi(a) +pcrel723: + auipc a5, %pcrel_hi(b) mv t4, a0 - addi a3, a2, %pcrel_lo(pcrel515) - sd s0, 0(sp) - sd s5, 8(sp) - sd s1, 16(sp) + sd s4, 0(sp) + sd s0, 8(sp) + sd s5, 16(sp) sd s6, 24(sp) - sd s2, 32(sp) - sd s3, 40(sp) - sd s4, 48(sp) - sd s7, 56(sp) - sd s8, 64(sp) - sd s9, 72(sp) - sd s10, 80(sp) + sd s1, 32(sp) + sd s10, 40(sp) + sd s9, 48(sp) + sd s2, 56(sp) + sd s3, 64(sp) + sd s7, 72(sp) + sd s8, 80(sp) sd s11, 88(sp) - lw a5, 4(a3) - lw t0, 8(a3) - lw a4, 12(a3) -pcrel517: - auipc a3, %pcrel_hi(b) - lw t2, %pcrel_lo(pcrel515)(a2) - mulw t1, a0, a4 - addi a2, a3, %pcrel_lo(pcrel517) - addw a1, t1, t2 - lui a3, 524288 - addi t1, t5, %pcrel_lo(pcrel516) - sh2add t2, a1, a2 - addiw a0, a3, 1 - li a1, 1 - lui a2, 262144 - subw t5, t4, a5 - addw t6, a5, t4 - mv a6, t2 + sd a1, 96(sp) + addi a1, a2, %pcrel_lo(pcrel722) + lw s4, 4(a1) + addi s6, s4, -336 + addi s10, s4, -1359 + addi t0, s4, -18 + addi t1, s4, -81 + sd s4, 104(sp) + lw s0, 8(a1) + mulw a3, a0, s0 + sd s0, 112(sp) + lw a4, %pcrel_lo(pcrel722)(a2) + addi a2, a5, %pcrel_lo(pcrel723) + addw a1, a3, a4 + sd s6, 128(sp) + addi a5, s4, -3 + lui a3, 1048571 + sh2add a4, a1, a2 + sd s10, 120(sp) + lui a2, 1048575 + addiw a1, a2, -1358 + addiw a2, a3, -1357 + addw t2, s4, a1 + lui a3, 32 + addw t3, s4, a2 + lui a1, 16 + addiw a0, a3, -1 + lui a2, 1 + lui a3, 4 + mv a6, a4 mv a7, zero - subw s0, zero, a5 - mv s1, a5 - mv s2, t5 - mv a3, zero - slt s5, t5, t0 - sh2add s4, s0, t1 - xori s3, s5, 1 - mv s5, s0 - or s7, t5, s0 - slt s10, s0, a4 - srliw s8, s7, 31 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, t5 - divw s9, zero, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 -.p2align 2 -label405: - divw s11, s6, s7 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, s7, 1 - sh1add s8, s8, s10 - mv s7, s9 - bge s9, a2, label485 + addw t6, s4, t4 + subw t5, t4, s4 + j label339 .p2align 2 -label348: - divw s9, a3, s7 - and s11, s9, a0 - beq s11, a1, label405 -.p2align 2 -label406: - mv s10, zero - sh1add s8, s8, zero 
- slliw s9, s7, 1 - mv s7, s9 - blt s9, a2, label348 - addiw s5, s5, 1 - bgt s1, s5, label354 +label639: addiw s2, s2, 1 - ble t6, s2, label479 + ble t6, s2, label662 .p2align 2 -label425: - mv a3, s8 - slt s5, s2, t0 - sh2add s4, s0, t1 - or s7, s2, s0 - slt s10, s0, a4 - xori s3, s5, 1 - srliw s8, s7, 31 - mv s5, s0 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, s2 - divw s9, a3, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 +label409: + blt s0, s1, label586 + addiw s2, s2, 1 + bgt t6, s2, label409 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + ble s0, a7, label676 .p2align 2 -label485: - addiw s5, s5, 1 - ble s1, s5, label491 +label343: + addi a6, a6, 4 .p2align 2 -label354: - addi s4, s4, 4 - mv a3, s8 - or s7, s2, s5 - slt s10, s5, a4 - srliw s8, s7, 31 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - beq s7, zero, label480 +label339: + ld s4, 104(sp) + addw s2, a5, a7 + addw s3, t0, a7 + addw s8, t2, a7 + ld s6, 128(sp) + addw s1, s4, a7 + subw s0, a7, s4 + addw s5, s6, a7 + ld s10, 120(sp) + addw s4, t1, a7 + addiw s9, s0, 3 + addw s6, t3, a7 + addw s7, s10, a7 + blt s9, s1, label451 + mv s2, t5 + bge s0, s1, label639 .p2align 2 -label396: - mv s6, zero - mv s7, a1 - mv s8, zero - divw s9, a3, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, zero, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 +label586: + mv s3, s0 .p2align 2 -label491: +label411: + addiw s3, s3, 1 + bgt s1, s3, label411 addiw s2, s2, 1 - bgt t6, s2, label425 + bgt t6, s2, label409 addiw a7, a7, 1 - sw s8, 0(a6) - ble a4, a7, label502 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + addiw t4, t4, 1 + ld t5, 96(sp) + ble t5, t4, label345 +.p2align 2 +label346: + ld s0, 112(sp) + mv a7, zero + ld s4, 104(sp) + sh2add a4, s0, a4 + addw t6, s4, t4 + subw t5, t4, s4 + mv a6, a4 + j label339 +.p2align 2 +label451: + mv s9, t5 + addiw s11, s0, 15 + ble s2, s11, label640 +.p2align 2 +label359: + addiw s11, s0, 63 + ble s3, s11, label496 + addiw s11, s0, 255 + ble s4, s11, label501 + addiw s11, s0, 1023 + ble s5, s11, label506 + lui s11, 1 + addiw s10, s11, -1 + addw s11, s0, s10 + ble s7, s11, label511 + lui s10, 4 + addiw s11, s10, -1 + addw s10, s0, s11 + ble s8, s10, label525 + lui s11, 16 + addiw s10, s11, -1 + addw s11, s0, s10 + ble s6, s11, label530 + addw s11, s0, a0 + addw s10, s0, a1 + ble s6, s11, label651 +.p2align 2 +label379: + addw s11, s10, a0 + addw s10, s10, a1 + bgt s6, s11, label379 + mv s11, s10 + ble s6, s10, label655 +.p2align 2 +label535: + addw s10, s11, a3 + ble s6, s10, label652 +.p2align 2 +label377: + addw s10, s10, a3 + bgt s6, s10, label377 + mv s11, s10 + ble s8, s10, label654 +.p2align 2 +label384: + addw s10, s10, a2 + bgt s8, s10, label384 + mv s11, s10 + ble s7, s10, label656 +.p2align 2 +label366: + addiw s10, s10, 1024 + bgt s7, s10, label366 + mv s11, s10 + ble s5, s10, label648 +.p2align 2 +label393: + addiw s10, s10, 256 + bgt s5, s10, label393 + mv s11, s10 + ble s4, s10, label567 +.p2align 2 +label398: + addiw s10, s10, 64 + bgt s4, s10, label398 + mv s11, s10 +.p2align 2 +label646: + mv s10, s11 + ble s3, s11, label576 +.p2align 2 +label405: + addiw s10, s10, 16 + bgt s3, s10, label405 + mv 
s11, s10 + ble s2, s10, label659 .p2align 2 label357: - addi a6, a6, 4 - subw s0, a7, a5 - addw s1, a5, a7 - mv s2, t5 - mv a3, zero - slt s5, t5, t0 - slt s10, s0, a4 - or s7, t5, s0 - sh2add s4, s0, t1 - xori s3, s5, 1 - srliw s8, s7, 31 - mv s5, s0 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, t5 - divw s9, zero, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 + addiw s10, s10, 4 + bgt s2, s10, label357 + ble s1, s10, label644 .p2align 2 -label480: - mulw s8, a4, s2 - divw s9, a3, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 +label354: + addiw s10, s10, 1 + bgt s1, s10, label354 + addiw s9, s9, 1 + ble t6, s9, label486 +.p2align 2 +label347: + addiw s11, s0, 15 + bgt s2, s11, label359 + mv s10, s0 + mv s11, zero +.p2align 2 +label349: + bgt s2, s10, label357 +label679: + mv s10, s11 + bgt s1, s11, label354 + j label476 .p2align 2 -label479: +label486: addiw a7, a7, 1 - sw s8, 0(a6) - bgt a4, a7, label357 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 addiw t4, t4, 1 - ble t3, t4, label359 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 .p2align 2 -label360: - sh2add t2, a4, t2 - subw t5, t4, a5 - addw t6, a5, t4 - mv a7, zero - subw s0, zero, a5 - sext.w s1, a5 - mv a3, zero - mv a6, t2 - slt s5, t5, t0 - mv s2, t5 - slt s10, s0, a4 - or s7, t5, s0 - sh2add s4, s0, t1 - xori s3, s5, 1 - srliw s8, s7, 31 - mv s5, s0 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, t5 - divw s9, zero, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 -label359: - ld s0, 0(sp) - ld s5, 8(sp) - ld s1, 16(sp) +label567: + mv s10, s11 + bgt s3, s11, label405 + bgt s2, s11, label357 + j label679 +.p2align 2 +label644: + addiw s9, s9, 1 + bgt t6, s9, label347 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + addiw t4, t4, 1 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label511: + mv s10, s0 + mv s11, zero +.p2align 2 +label363: + bgt s7, s10, label366 +.p2align 2 +label515: + mv s10, s11 + bgt s5, s11, label393 + j label391 +.p2align 2 +label648: + mv s10, s11 + bgt s4, s11, label398 + bgt s3, s11, label405 +.p2align 2 +label576: + mv s10, s11 + bgt s2, s11, label357 + bgt s1, s11, label354 +label476: + addiw s9, s9, 1 + bgt t6, s9, label347 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + j label663 +.p2align 2 +label659: + mv s10, s11 + bgt s1, s11, label354 + addiw s9, s9, 1 + bgt t6, s9, label347 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 +label663: + addiw t4, t4, 1 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label651: + mv s11, s10 + bgt s6, s10, label535 +.p2align 2 +label655: + mv s11, s10 + bgt s8, s10, label384 + bgt s7, s10, label366 + j label515 +.p2align 2 +label656: + mv s10, s11 + bgt s5, s11, label393 +.p2align 2 
+label391: + mv s10, s11 + bgt s4, s11, label398 + bgt s3, s11, label405 + j label576 +.p2align 2 +label652: + mv s11, s10 + bgt s8, s10, label384 + bgt s7, s10, label366 + j label515 +.p2align 2 +label662: + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + addiw t4, t4, 1 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label654: + mv s10, s11 + bgt s7, s11, label366 + bgt s5, s11, label393 + j label391 +.p2align 2 +label496: + mv s10, s0 + mv s11, zero + bgt s3, s0, label405 + mv s10, zero + j label349 +.p2align 2 +label506: + mv s10, s0 + mv s11, zero + bgt s5, s0, label393 + j label391 +.p2align 2 +label525: + mv s10, s0 + mv s11, zero + bgt s8, s0, label384 + mv s10, zero + j label363 +.p2align 2 +label530: + mv s11, s0 + mv s10, zero + bgt s6, s0, label535 + mv s11, zero + bgt s8, zero, label384 + bgt s7, zero, label366 + j label515 +label345: + ld s4, 0(sp) + ld s0, 8(sp) + ld s5, 16(sp) ld s6, 24(sp) - ld s2, 32(sp) - ld s3, 40(sp) - ld s4, 48(sp) - ld s7, 56(sp) - ld s8, 64(sp) - ld s9, 72(sp) - ld s10, 80(sp) + ld s1, 32(sp) + ld s10, 40(sp) + ld s9, 48(sp) + ld s2, 56(sp) + ld s3, 64(sp) + ld s7, 72(sp) + ld s8, 80(sp) ld s11, 88(sp) - addi sp, sp, 96 + addi sp, sp, 136 ret .p2align 2 -label502: +label676: addiw t4, t4, 1 - bgt t3, t4, label360 - j label359 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label501: + mv s10, s0 + mv s11, zero + bgt s4, s0, label398 + j label646 +label640: + mv s10, s0 + mv s11, zero + bgt s2, s0, label357 + j label679 .p2align 2 cmmc_parallel_body_3: addi sp, sp, -96 mv t1, a1 -pcrel673: +pcrel883: auipc a5, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel674: +pcrel884: auipc t5, %pcrel_hi(b) sd s0, 0(sp) - addi t0, a5, %pcrel_lo(pcrel673) - addi t3, t5, %pcrel_lo(pcrel674) + addi t0, a5, %pcrel_lo(pcrel883) + addi t3, t5, %pcrel_lo(pcrel884) sd s5, 8(sp) sd s2, 16(sp) sd s1, 24(sp) @@ -853,20 +962,20 @@ pcrel674: sd s7, 56(sp) sd s8, 64(sp) sd s9, 72(sp) - sd s10, 80(sp) - sd s11, 88(sp) - lw a4, 4(t0) - lw a2, 8(t0) - lw a3, 12(t0) - lw t4, %pcrel_lo(pcrel673)(a5) + sd s11, 80(sp) + sd s10, 88(sp) + lw a3, 4(t0) + lw a4, 8(t0) + lw a2, 12(t0) + lw t4, %pcrel_lo(pcrel883)(a5) mulw t2, a0, a2 addw a1, t2, t4 mv t2, a0 -pcrel675: +pcrel885: auipc t4, %pcrel_hi(a) sh2add t0, a1, t3 lui a0, 262144 - addi a5, t4, %pcrel_lo(pcrel675) + addi a5, t4, %pcrel_lo(pcrel885) subw t3, t2, a3 addw t4, a3, t2 mv t5, t0 @@ -886,50 +995,56 @@ pcrel675: xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - bne s6, zero, label580 + bne s6, zero, label786 + j label785 +.p2align 2 +label745: + addi s3, s3, 4 + mv a1, s6 + or s5, s0, s4 + srliw s7, s5, 31 + slt s5, s4, a2 + andi s6, s7, 1 + xori s7, s5, 1 + or s2, s1, s6 + or s6, s2, s7 + bne s6, zero, label786 .p2align 2 -label579: +label785: mulw s6, a2, s0 sh2add s5, s6, s3 mv s6, zero lw s2, 0(s5) li s5, 1 .p2align 2 -label535: +label741: divw s8, a1, s5 srliw s9, s8, 31 - add s10, s8, s9 + add s11, s8, s9 divw s9, s2, s5 - andi s11, s10, -2 - subw s7, s8, s11 - srliw s10, s9, 31 - add s8, s9, s10 - andi s11, s8, -2 - subw s10, s9, s11 - xor s11, s7, s10 - slliw s7, s6, 1 - sltiu s8, s11, 1 - addi s9, s7, 1 + andi s10, s11, -2 + subw s7, s8, s10 + srliw s11, s9, 31 + add s10, s9, s11 + slliw s11, s6, 1 + andi s8, s10, -2 + subw s10, s9, s8 + xor s9, s7, s10 slliw s7, s5, 1 - subw s6, s9, s8 + sltiu s8, s9, 1 mv s5, s7 - blt s7, a0, label535 + addi s9, s11, 1 + subw s6, s9, s8 + blt s7, a0, label741 addiw s4, s4, 1 - bgt a7, s4, label545 + bgt a7, s4, label745 addiw 
s0, s0, 1 - bgt t4, s0, label608 - addiw t6, t6, 1 - sw s6, 0(t5) - ble a2, t6, label662 - addi t5, t5, 4 - subw a6, t6, a3 - addw a7, a3, t6 - mv s0, t3 - mv a1, zero - slt s2, t3, a4 - or s5, t3, a6 - mv s4, a6 + ble t4, s0, label868 + mv a1, s6 + slt s2, s0, a4 sh2add s3, a6, a5 + mv s4, a6 + or s5, s0, a6 xori s1, s2, 1 srliw s7, s5, 31 slt s5, a6, a2 @@ -937,46 +1052,40 @@ label535: xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - bne s6, zero, label580 - j label579 + bne s6, zero, label786 + j label785 .p2align 2 -label545: - addi s3, s3, 4 - mv a1, s6 - or s5, s0, s4 +label868: + addiw t6, t6, 1 + sw s6, 0(t5) + ble a2, t6, label872 + addi t5, t5, 4 + subw a6, t6, a3 + addw a7, a3, t6 + mv s0, t3 + mv a1, zero + slt s2, t3, a4 + mv s4, a6 + or s5, t3, a6 + sh2add s3, a6, a5 + xori s1, s2, 1 srliw s7, s5, 31 - slt s5, s4, a2 + slt s5, a6, a2 andi s6, s7, 1 xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - beq s6, zero, label579 + beq s6, zero, label785 .p2align 2 -label580: +label786: mv s2, zero li s5, 1 mv s6, zero - j label535 + j label741 .p2align 2 -label608: - mv a1, s6 - slt s2, s0, a4 - sh2add s3, a6, a5 - mv s4, a6 - or s5, s0, a6 - xori s1, s2, 1 - srliw s7, s5, 31 - slt s5, a6, a2 - andi s6, s7, 1 - xori s7, s5, 1 - or s2, s1, s6 - or s6, s2, s7 - bne s6, zero, label580 - j label579 -.p2align 2 -label662: +label872: addiw t2, t2, 1 - ble t1, t2, label543 + ble t1, t2, label750 sh2add t0, a2, t0 subw t3, t2, a3 addw t4, a3, t2 @@ -987,8 +1096,8 @@ label662: mv t5, t0 slt s2, t3, a4 mv s0, t3 - mv s4, a6 or s5, t3, a6 + mv s4, a6 sh2add s3, a6, a5 xori s1, s2, 1 srliw s7, s5, 31 @@ -997,9 +1106,9 @@ label662: xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - bne s6, zero, label580 - j label579 -label543: + bne s6, zero, label786 + j label785 +label750: ld s0, 0(sp) ld s5, 8(sp) ld s2, 16(sp) @@ -1010,18 +1119,18 @@ label543: ld s7, 56(sp) ld s8, 64(sp) ld s9, 72(sp) - ld s10, 80(sp) - ld s11, 88(sp) + ld s11, 80(sp) + ld s10, 88(sp) addi sp, sp, 96 ret .p2align 2 cmmc_parallel_body_4: addi sp, sp, -96 mv t3, a1 -pcrel856: +pcrel1069: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_4) sd s0, 0(sp) - addi a3, a2, %pcrel_lo(pcrel856) + addi a3, a2, %pcrel_lo(pcrel1069) sd s5, 8(sp) sd s1, 16(sp) sd s6, 24(sp) @@ -1036,17 +1145,17 @@ pcrel856: lw a5, 4(a3) lw a4, 8(a3) lw t0, 12(a3) -pcrel857: +pcrel1070: auipc a3, %pcrel_hi(b) - lw t1, %pcrel_lo(pcrel856)(a2) + lw t1, %pcrel_lo(pcrel1069)(a2) mulw a1, a0, a4 - addi a2, a3, %pcrel_lo(pcrel857) + addi a2, a3, %pcrel_lo(pcrel1070) addw t4, a1, t1 lui a3, 524288 -pcrel858: +pcrel1071: auipc a1, %pcrel_hi(a) sh2add t2, t4, a2 - addi t1, a1, %pcrel_lo(pcrel858) + addi t1, a1, %pcrel_lo(pcrel1071) lui a2, 262144 mv t4, a0 addiw a1, a3, 1 @@ -1066,208 +1175,237 @@ pcrel858: or s7, t5, s0 slt s10, s0, a4 srliw s8, s7, 31 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 + or s7, s6, s8 + bne s7, zero, label951 mulw s8, a4, t5 - divw s11, zero, a0 + divw s10, zero, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label750: +label916: + addi s4, s4, 4 + mv a3, s8 + or s7, s2, s5 + slt s10, s5, a4 + srliw s8, s7, 31 + andi s9, s8, 1 + xori s8, s10, 1 + or s6, s3, s9 + or s7, s6, s8 + beq s7, 
zero, label1036 +.p2align 2 +label951: + mv s6, zero + mv s7, a0 + mv s8, zero + divw s10, a3, a0 + and s9, s10, a1 + bne s9, a0, label1037 +.p2align 2 +label961: mv s10, a0 sh1add s8, s8, a0 slliw s9, s7, 1 mv s7, s9 - bge s9, a2, label824 -.p2align 2 -label693: - divw s11, a3, s7 - and s9, s11, a1 - beq s9, a0, label750 -.p2align 2 -label696: - divw s11, s6, s7 - and s10, s11, a1 - xori s9, s10, 1 - sltiu s10, s9, 1 - sh1add s8, s8, s10 + bge s9, a2, label1034 +.p2align 2 +label903: + divw s10, a3, s7 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, s7 + and s10, s9, a1 slliw s9, s7, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 + sltiu s10, s11, 1 + sh1add s8, s8, s10 + blt s9, a2, label903 +.p2align 2 +label1038: addiw s5, s5, 1 - bgt s1, s5, label700 + bgt s1, s5, label916 +.p2align 2 +label1044: addiw s2, s2, 1 - bgt t6, s2, label773 - addiw a7, a7, 1 - sw s8, 0(a6) - bgt a4, a7, label706 - addiw t4, t4, 1 - ble t3, t4, label704 + ble t6, s2, label1051 .p2align 2 -label705: - sh2add t2, a4, t2 - subw t5, t4, a5 - addw t6, a5, t4 - mv a7, zero - subw s0, zero, a5 - sext.w s1, a5 - mv a3, zero - mv a6, t2 - slt s5, t5, t0 - mv s2, t5 - slt s10, s0, a4 - or s7, t5, s0 +label982: + mv a3, s8 + slt s5, s2, t0 sh2add s4, s0, t1 + or s7, s2, s0 + slt s10, s0, a4 xori s3, s5, 1 srliw s8, s7, 31 mv s5, s0 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 - mulw s8, a4, t5 - divw s11, zero, a0 + or s7, s6, s8 + bne s7, zero, label951 + mulw s8, a4, s2 + divw s10, a3, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 - slliw s9, a0, 1 - mv s7, s9 - blt s9, a2, label693 - j label824 -.p2align 2 -label700: - addi s4, s4, 4 - mv a3, s8 - or s7, s2, s5 - slt s10, s5, a4 - srliw s8, s7, 31 - xori s7, s10, 1 - andi s9, s8, 1 - or s6, s3, s9 - or s8, s6, s7 - beq s8, zero, label826 -.p2align 2 -label741: - mv s6, zero - mv s7, a0 - mv s8, zero - divw s11, a3, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label824: +label1034: addiw s5, s5, 1 - bgt s1, s5, label700 + bgt s1, s5, label916 addiw s2, s2, 1 - ble t6, s2, label835 + bgt t6, s2, label982 + addiw a7, a7, 1 + sw s8, 0(a6) + ble a4, a7, label1052 .p2align 2 -label773: - mv a3, s8 - slt s5, s2, t0 - sh2add s4, s0, t1 - or s7, s2, s0 +label915: + addi a6, a6, 4 + subw s0, a7, a5 + addw s1, a5, a7 + mv s2, t5 + mv a3, zero + slt s5, t5, t0 slt s10, s0, a4 + or s7, t5, s0 + sh2add s4, s0, t1 xori s3, s5, 1 srliw s8, s7, 31 mv s5, s0 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 - mulw s8, a4, s2 - divw s11, a3, a0 + or s7, s6, s8 + bne s7, zero, label951 + mulw s8, a4, t5 + divw s10, zero, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label826: +label1036: mulw s8, a4, s2 - divw s11, a3, a0 + divw s10, a3, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, 
label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label835: +label1051: addiw a7, a7, 1 sw s8, 0(a6) - ble a4, a7, label841 + bgt a4, a7, label915 + addiw t4, t4, 1 + ble t3, t4, label914 .p2align 2 -label706: - addi a6, a6, 4 - subw s0, a7, a5 - addw s1, a5, a7 - mv s2, t5 +label913: + sh2add t2, a4, t2 + subw t5, t4, a5 + addw t6, a5, t4 + mv a7, zero + subw s0, zero, a5 + sext.w s1, a5 mv a3, zero + mv a6, t2 slt s5, t5, t0 + mv s2, t5 slt s10, s0, a4 or s7, t5, s0 sh2add s4, s0, t1 xori s3, s5, 1 srliw s8, s7, 31 mv s5, s0 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 + or s7, s6, s8 + bne s7, zero, label951 mulw s8, a4, t5 - divw s11, zero, a0 + divw s10, zero, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 + mv s7, s9 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 +.p2align 2 +label1037: + divw s9, s6, s7 + and s10, s9, a1 + slliw s9, s7, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 -label704: + sltiu s10, s11, 1 + sh1add s8, s8, s10 + blt s9, a2, label903 + addiw s5, s5, 1 + bgt s1, s5, label916 + j label1044 +.p2align 2 +label1052: + addiw t4, t4, 1 + bgt t3, t4, label913 +label914: ld s0, 0(sp) ld s5, 8(sp) ld s1, 16(sp) @@ -1283,22 +1421,17 @@ label704: addi sp, sp, 96 ret .p2align 2 -label841: - addiw t4, t4, 1 - bgt t3, t4, label705 - j label704 -.p2align 2 cmmc_parallel_body_5: addi sp, sp, -64 mv t0, a0 mv a5, a1 -pcrel995: +pcrel1208: auipc a3, %pcrel_hi(cmmc_parallel_body_payload_5) -pcrel996: +pcrel1209: auipc t5, %pcrel_hi(b) - addi a4, a3, %pcrel_lo(pcrel995) + addi a4, a3, %pcrel_lo(pcrel1208) sd s2, 0(sp) - addi t4, t5, %pcrel_lo(pcrel996) + addi t4, t5, %pcrel_lo(pcrel1209) sd s1, 8(sp) sd s6, 16(sp) sd s0, 24(sp) @@ -1306,16 +1439,16 @@ pcrel996: sd s4, 40(sp) sd s7, 48(sp) sd s3, 56(sp) - lw a2, 4(a4) - lw a1, 8(a4) + lw a1, 4(a4) + lw a2, 8(a4) lw a0, 12(a4) - lw t3, %pcrel_lo(pcrel995)(a3) + lw t3, %pcrel_lo(pcrel1208)(a3) mulw t2, t0, a0 addw t1, t2, t3 -pcrel997: +pcrel1210: auipc t2, %pcrel_hi(a) sh2add a4, t1, t4 - addi a3, t2, %pcrel_lo(pcrel997) + addi a3, t2, %pcrel_lo(pcrel1210) subw t1, t0, a1 addw t2, a1, t0 mv t3, a4 @@ -1324,13 +1457,13 @@ pcrel997: mv t6, a1 mv a6, t1 mv s2, zero - j label866 + j label1079 .p2align 2 -label971: +label1184: addiw t0, t0, 1 - ble a5, t0, label881 + ble a5, t0, label1094 .p2align 2 -label880: +label1093: sh2add a4, a0, a4 subw t1, t0, a1 addw t2, a1, t0 @@ -1351,15 +1484,15 @@ label880: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - bne s4, zero, label917 + bne s4, zero, label1130 mulw s4, a0, t1 addiw s1, t5, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, zero, s3 - ble t6, s1, label984 + ble t6, s1, label1197 .p2align 2 -label875: +label1088: addi s0, s0, 4 or s5, a6, s1 slt s6, s1, a0 @@ -1368,17 +1501,17 @@ label875: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - beq s4, zero, label967 + beq s4, zero, label1180 .p2align 2 -label917: +label1130: mv s3, zero addiw s1, s1, 1 max s2, s2, zero - bgt t6, s1, label875 + bgt t6, s1, label1088 addiw a6, a6, 1 - ble t2, a6, label966 + ble t2, a6, label1179 .p2align 2 -label866: 
+label1079: slt s1, a6, a2 sh2add s0, t5, a3 xori a7, s1, 1 @@ -1390,44 +1523,44 @@ label866: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - bne s4, zero, label917 + bne s4, zero, label1130 mulw s4, a0, a6 addiw s1, t5, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, s2, s3 - bgt t6, s1, label875 + bgt t6, s1, label1088 addiw a6, a6, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 - j label881 + bgt a5, t0, label1093 + j label1094 .p2align 2 -label967: +label1180: mulw s4, a0, a6 addiw s1, s1, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, s2, s3 - bgt t6, s1, label875 + bgt t6, s1, label1088 addiw a6, a6, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 - j label881 + bgt a5, t0, label1093 + j label1094 .p2align 2 -label966: +label1179: addiw t4, t4, 1 sw s2, 0(t3) - ble a0, t4, label971 + ble a0, t4, label1184 .p2align 2 -label878: +label1091: addi t3, t3, 4 subw t5, t4, a1 addw t6, a1, t4 @@ -1444,31 +1577,31 @@ label878: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - bne s4, zero, label917 + bne s4, zero, label1130 mulw s4, a0, t1 addiw s1, t5, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, zero, s3 - bgt t6, s1, label875 + bgt t6, s1, label1088 addiw a6, t1, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 - j label881 + bgt a5, t0, label1093 + j label1094 .p2align 2 -label984: +label1197: addiw a6, a6, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 -label881: + bgt a5, t0, label1093 +label1094: ld s2, 0(sp) ld s1, 8(sp) ld s6, 16(sp) @@ -1481,373 +1614,278 @@ label881: ret .p2align 2 cmmc_parallel_body_6: - # stack usage: CalleeArg[0] Local[0] RegSpill[64] CalleeSaved[96] - addi sp, sp, -160 -pcrel1374: + addi sp, sp, -96 + mv t3, a1 +pcrel1391: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_6) -pcrel1375: - auipc a5, %pcrel_hi(b) - mv t4, a0 - sd s3, 0(sp) - sd s0, 8(sp) - sd s5, 16(sp) - sd s4, 24(sp) - sd s6, 32(sp) - sd s1, 40(sp) - sd s8, 48(sp) - sd s10, 56(sp) - sd s9, 64(sp) - sd s2, 72(sp) - sd s7, 80(sp) + sd s0, 0(sp) + addi a3, a2, %pcrel_lo(pcrel1391) + sd s5, 8(sp) + sd s1, 16(sp) + sd s6, 24(sp) + sd s2, 32(sp) + sd s3, 40(sp) + sd s4, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + sd s9, 72(sp) + sd s10, 80(sp) sd s11, 88(sp) - sd a1, 96(sp) - addi a1, a2, %pcrel_lo(pcrel1374) - lw s3, 4(a1) - addi s10, s3, -1359 - addi s8, s3, -336 - addi s6, s3, -81 - addi s4, s3, -3 - addi s5, s3, -18 - sd s3, 104(sp) - lw s0, 8(a1) - mulw a3, a0, s0 - sd s0, 112(sp) - lw a4, %pcrel_lo(pcrel1374)(a2) - sd s4, 152(sp) - addi a2, a5, %pcrel_lo(pcrel1375) - addw a1, a3, a4 - sd s5, 144(sp) - lui a3, 1048571 - sh2add a4, a1, a2 - sd s6, 136(sp) - lui a1, 1048575 - sd s8, 128(sp) - addiw a2, a1, -1358 - sd s10, 120(sp) - addiw a1, a3, -1357 - addw a5, s3, a2 - lui a3, 32 - addw t0, s3, a1 - lui a2, 1 - addiw a0, a3, -1 - lui a1, 16 - srli t3, a0, 1 - srli t2, a0, 3 - srli t1, a0, 5 - lui a3, 4 - mv a6, a4 - mv a7, zero - addw t6, s3, t4 - subw t5, t4, s3 - j label1002 -.p2align 2 -label1221: - addiw s9, s9, 1 - bgt t6, s9, label1005 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - ble s0, a7, label1340 -.p2align 2 -label1071: - addi a6, a6, 4 -.p2align 2 
-label1002: - ld s3, 104(sp) - ld s4, 152(sp) - addw s1, s3, a7 - subw s0, a7, s3 - addw s2, s4, a7 - ld s5, 144(sp) - addiw s9, s0, 3 - ld s6, 136(sp) - addw s3, s5, a7 - ld s8, 128(sp) - addw s4, s6, a7 - ld s10, 120(sp) - addw s5, s8, a7 - addw s6, t0, a7 - addw s7, s10, a7 - addw s8, a5, a7 - bge s9, s1, label1113 - mv s9, t5 -.p2align 2 -label1005: - addiw s11, s0, 15 - ble s2, s11, label1118 - addiw s11, s0, 63 - ble s3, s11, label1123 - addiw s11, s0, 255 - ble s4, s11, label1128 - addiw s11, s0, 1023 - ble s5, s11, label1133 - addw s11, s0, t1 - ble s7, s11, label1138 - addw s11, s0, t2 - ble s8, s11, label1143 - addw s10, s0, t3 - ble s6, s10, label1157 - addw s11, s0, a0 - addw s10, s0, a1 - ble s6, s11, label1313 -.p2align 2 -label1027: - addw s11, s10, a0 - addw s10, s10, a1 - bgt s6, s11, label1027 - mv s11, s10 - ble s6, s10, label1315 -.p2align 2 -label1162: - mv s10, s11 -.p2align 2 -label1023: - addw s10, s10, a3 - bgt s6, s10, label1023 - mv s11, s10 - ble s8, s10, label1314 -.p2align 2 -label1015: - addw s10, s10, a2 - bgt s8, s10, label1015 - mv s11, s10 - ble s7, s10, label1311 -.p2align 2 -label1034: - addiw s10, s10, 1024 - bgt s7, s10, label1034 - mv s11, s10 - ble s5, s10, label1039 -.p2align 2 -label1041: - addiw s10, s10, 256 - bgt s5, s10, label1041 - mv s11, s10 - ble s4, s10, label1046 -.p2align 2 -label1048: - addiw s10, s10, 64 - bgt s4, s10, label1048 - mv s11, s10 - ble s3, s10, label1053 -.p2align 2 -label1055: - addiw s10, s10, 16 - bgt s3, s10, label1055 - mv s11, s10 - ble s2, s10, label1212 -.p2align 2 -label1060: - addiw s10, s10, 4 - bgt s2, s10, label1060 - ble s1, s10, label1221 -.p2align 2 -label1064: - addiw s10, s10, 1 - bgt s1, s10, label1064 - addiw s9, s9, 1 - bgt t6, s9, label1005 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - ble t5, t4, label1069 -.p2align 2 -label1070: - ld s0, 112(sp) + lw a4, 4(a3) + lw t0, 8(a3) + lw a5, 12(a3) + mulw a1, a0, a4 +pcrel1392: + auipc a3, %pcrel_hi(b) + lw t1, %pcrel_lo(pcrel1391)(a2) + addi a2, a3, %pcrel_lo(pcrel1392) + addw t4, a1, t1 + lui a3, 524288 +pcrel1393: + auipc a1, %pcrel_hi(a) + sh2add t2, t4, a2 + addi t1, a1, %pcrel_lo(pcrel1393) + lui a2, 262144 + mv t4, a0 + li a1, 1 + addiw a0, a3, 1 + subw t5, t4, a5 + addw t6, a5, t4 + mv a6, t2 mv a7, zero - ld s3, 104(sp) - sh2add a4, s0, a4 - addw t6, s3, t4 - subw t5, t4, s3 - mv a6, a4 - j label1002 -.p2align 2 -label1113: + subw s0, zero, a5 + mv s1, a5 mv s2, t5 - blt s0, s1, label1249 - addiw s2, t5, 1 - ble t6, s2, label1328 -.p2align 2 -label1072: - bge s0, s1, label1248 -.p2align 2 -label1249: - mv s3, s0 -.p2align 2 -label1074: - addiw s3, s3, 1 - bgt s1, s3, label1074 - addiw s2, s2, 1 - bgt t6, s2, label1072 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1212: - mv s10, s11 - bgt s1, s11, label1064 -.p2align 2 -label1324: - addiw s9, s9, 1 - bgt t6, s9, label1005 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1053: - mv s10, s11 - bgt s2, s11, label1060 + mv a3, zero + slt s5, t5, t0 + sh2add s4, s0, t1 + xori s3, s5, 1 + mv s5, s0 + or s7, t5, s0 + slt s10, s0, a4 + srliw s8, s7, 31 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, t5 + divw s11, zero, a1 + sh2add s7, s8, s4 + mv 
s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1322: - mv s10, s11 - bgt s1, s11, label1064 - j label1324 +label1362: + addiw s5, s5, 1 + ble s1, s5, label1366 .p2align 2 -label1046: - mv s10, s11 - bgt s3, s11, label1055 +label1240: + addi s4, s4, 4 + mv a3, s8 + or s7, s2, s5 + slt s10, s5, a4 + srliw s8, s7, 31 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + beq s8, zero, label1361 .p2align 2 -label1320: - mv s10, s11 - bgt s2, s11, label1060 - j label1322 +label1276: + mv s6, zero + mv s7, a1 + mv s8, zero + divw s11, a3, a1 + and s9, s11, a0 + bne s9, a1, label1363 +.p2align 2 +label1241: + divw s10, s6, s7 + and s9, s10, a0 + xori s11, s9, 1 + slliw s9, s7, 1 + sltiu s10, s11, 1 + mv s7, s9 + sh1add s8, s8, s10 + bge s9, a2, label1362 .p2align 2 -label1039: - mv s10, s11 - bgt s4, s11, label1048 +label1228: + divw s11, a3, s7 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + sh1add s8, s8, zero + slliw s9, s7, 1 + mv s7, s9 + blt s9, a2, label1228 .p2align 2 -label1318: - mv s10, s11 - bgt s3, s11, label1055 - j label1320 +label1233: + addiw s5, s5, 1 + bgt s1, s5, label1240 .p2align 2 -label1248: +label1234: addiw s2, s2, 1 - bgt t6, s2, label1072 + bgt t6, s2, label1304 addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 + sw s8, 0(a6) + bgt a4, a7, label1236 addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1143: - mv s10, s0 - mv s11, zero -.p2align 2 -label1012: - bgt s8, s10, label1015 - mv s10, s11 - bgt s7, s11, label1034 - j label1032 + bgt t3, t4, label1239 + j label1238 .p2align 2 -label1313: - mv s11, s10 - bgt s6, s10, label1162 -.p2align 2 -label1315: - mv s11, s10 - bgt s8, s10, label1015 - bgt s7, s10, label1034 - j label1032 +label1361: + mulw s8, a4, s2 + divw s11, a3, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1311: - mv s10, s11 - bgt s5, s11, label1041 - bgt s4, s11, label1048 - j label1318 +label1366: + addiw s2, s2, 1 + ble t6, s2, label1371 .p2align 2 -label1314: - mv s10, s11 - bgt s7, s11, label1034 - bgt s5, s11, label1041 -label1316: - mv s10, s11 - bgt s4, s11, label1048 - j label1318 +label1304: + mv a3, s8 + slt s5, s2, t0 + sh2add s4, s0, t1 + or s7, s2, s0 + slt s10, s0, a4 + xori s3, s5, 1 + srliw s8, s7, 31 + mv s5, s0 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, s2 + divw s11, a3, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1328: +label1371: addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1118: - mv s10, s0 - mv s11, zero - bgt s2, s0, label1060 - j label1212 -.p2align 2 -label1123: - mv s10, s0 - mv s11, zero - bgt s3, s0, label1055 - j label1053 + sw s8, 0(a6) + ble a4, a7, label1375 .p2align 2 -label1128: - mv s10, s0 - mv s11, zero - bgt s4, s0, label1048 - j label1046 +label1236: + addi a6, a6, 4 + subw s0, a7, a5 + addw s1, a5, a7 + mv s2, t5 + mv a3, zero + slt s5, t5, t0 + slt s10, s0, a4 + or s7, t5, s0 + sh2add s4, s0, t1 + 
xori s3, s5, 1 + srliw s8, s7, 31 + mv s5, s0 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, t5 + divw s11, zero, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1133: - mv s10, s0 - mv s11, zero - bgt s5, s0, label1041 - j label1039 +label1375: + addiw t4, t4, 1 + ble t3, t4, label1238 .p2align 2 -label1138: - mv s10, s0 - mv s11, zero - bgt s7, s0, label1034 -label1032: - mv s10, s11 - bgt s5, s11, label1041 - j label1316 +label1239: + sh2add t2, a4, t2 + subw t5, t4, a5 + addw t6, a5, t4 + mv a7, zero + subw s0, zero, a5 + sext.w s1, a5 + mv a3, zero + mv a6, t2 + slt s5, t5, t0 + mv s2, t5 + slt s10, s0, a4 + or s7, t5, s0 + sh2add s4, s0, t1 + xori s3, s5, 1 + srliw s8, s7, 31 + mv s5, s0 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, t5 + divw s11, zero, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1157: - mv s11, s0 +label1363: mv s10, zero - bgt s6, s0, label1162 - mv s11, zero - j label1012 -label1069: - ld s3, 0(sp) - ld s0, 8(sp) - ld s5, 16(sp) - ld s4, 24(sp) - ld s6, 32(sp) - ld s1, 40(sp) - ld s8, 48(sp) - ld s10, 56(sp) - ld s9, 64(sp) - ld s2, 72(sp) - ld s7, 80(sp) + sh1add s8, s8, zero + slliw s9, s7, 1 + mv s7, s9 + blt s9, a2, label1228 + addiw s5, s5, 1 + bgt s1, s5, label1240 + j label1234 +label1238: + ld s0, 0(sp) + ld s5, 8(sp) + ld s1, 16(sp) + ld s6, 24(sp) + ld s2, 32(sp) + ld s3, 40(sp) + ld s4, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + ld s9, 72(sp) + ld s10, 80(sp) ld s11, 88(sp) - addi sp, sp, 160 + addi sp, sp, 96 ret -.p2align 2 -label1340: - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 diff --git a/tests/SysY2022/performance/conv1.sy.ir b/tests/SysY2022/performance/conv1.sy.ir index 4fc305de1..9678ee05d 100644 --- a/tests/SysY2022/performance/conv1.sy.ir +++ b/tests/SysY2022/performance/conv1.sy.ir @@ -38,17 +38,17 @@ func @main() -> i32 { NoRecurse Entry } { i32* %26 = ptradd [16 * i8]* %24, i32 4; i32* %27 = ptradd [16 * i8]* %24, i32 8; i32* %28 = ptradd [16 * i8]* %24, i32 12; - [16 * i8]* %29 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_2 to [16 * i8]*; + [16 * i8]* %29 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_5 to [16 * i8]*; i32* %30 = ptradd [16 * i8]* %29, i32 0; i32* %31 = ptradd [16 * i8]* %29, i32 4; i32* %32 = ptradd [16 * i8]* %29, i32 8; i32* %33 = ptradd [16 * i8]* %29, i32 12; - [16 * i8]* %34 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_5 to [16 * i8]*; + [16 * i8]* %34 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_6 to [16 * i8]*; i32* %35 = ptradd [16 * i8]* %34, i32 0; i32* %36 = ptradd [16 * i8]* %34, i32 4; i32* %37 = ptradd [16 * i8]* %34, i32 8; i32* %38 = ptradd [16 * i8]* %34, i32 12; - [16 * i8]* %39 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_6 to [16 * i8]*; + [16 * i8]* %39 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_2 to [16 * i8]*; i32* %40 = ptradd [16 * i8]* %39, i32 0; i32* %41 = ptradd [16 * i8]* %39, i32 4; i32* %42 = ptradd [16 * i8]* %39, i32 8; @@ -57,30 +57,30 @@ func @main() -> i32 { NoRecurse Entry } { i8* %45 = functionptr () -> void @cmmc_parallel_body_3 as i8*; i8* %46 = functionptr () -> void @cmmc_parallel_body_1 as i8*; 
i8* %47 = functionptr () -> void @cmmc_parallel_body_4 as i8*; - i8* %48 = functionptr () -> void @cmmc_parallel_body_2 as i8*; - i8* %49 = functionptr () -> void @cmmc_parallel_body_5 as i8*; - i8* %50 = functionptr () -> void @cmmc_parallel_body_6 as i8*; + i8* %48 = functionptr () -> void @cmmc_parallel_body_5 as i8*; + i8* %49 = functionptr () -> void @cmmc_parallel_body_6 as i8*; + i8* %50 = functionptr () -> void @cmmc_parallel_body_2 as i8*; ubr ^while.body; ^b: store i32* %13 with i32 0; store i32* %14 with i32 %2; - store i32* %15 with i32 %1; - store i32* %16 with i32 %11; + store i32* %15 with i32 %11; + store i32* %16 with i32 %1; call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %44); ubr ^b6; ^b1: store i32* %18 with i32 0; - store i32* %19 with i32 %1; - store i32* %20 with i32 %2; - store i32* %21 with i32 %11; + store i32* %19 with i32 %11; + store i32* %20 with i32 %1; + store i32* %21 with i32 %2; call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %45); ubr ^b6; ^b2: - store i32* %35 with i32 0; - store i32* %36 with i32 %1; - store i32* %37 with i32 %11; - store i32* %38 with i32 %2; - call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %49); + store i32* %30 with i32 0; + store i32* %31 with i32 %11; + store i32* %32 with i32 %1; + store i32* %33 with i32 %2; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %48); ubr ^b6; ^while.body: i32 %51 = phi [^entry, i32 0] [^b8, i32 %54]; @@ -95,11 +95,11 @@ func @main() -> i32 { NoRecurse Entry } { call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %47); ubr ^b6; ^b4: - store i32* %30 with i32 0; - store i32* %31 with i32 %11; - store i32* %32 with i32 %1; - store i32* %33 with i32 %2; - call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %48); + store i32* %35 with i32 0; + store i32* %36 with i32 %2; + store i32* %37 with i32 %1; + store i32* %38 with i32 %11; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %49); ubr ^b6; ^b5: store i32* %40 with i32 0; @@ -145,19 +145,19 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel ^b1: i32 %17 = phi [^b, i32 %0] [^b7, i32 %60]; i32 %18 = phi [^b, i32 %12] [^b7, i32 %62]; - i32 %19 = sub i32 %17, i32 %9; - i32 %20 = add i32 %9, i32 %17; + i32 %19 = sub i32 %17, i32 %7; + i32 %20 = add i32 %7, i32 %17; i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: i32 %22 = phi [^b1, i32 0] [^b6, i32 %58]; - i32 %23 = sub i32 %22, i32 %9; - i32 %24 = add i32 %9, i32 %22; + i32 %23 = sub i32 %22, i32 %7; + i32 %24 = add i32 %7, i32 %22; ubr ^while.body1; ^while.body1: i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %55]; i32 %26 = phi [^while.body, i32 0] [^b5, i32 %52]; - i1 %27 = icmp sle i32 %7, i32 %25; + i1 %27 = icmp sle i32 %9, i32 %25; ubr ^while.body2; ^while.body2: i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %53]; @@ -352,6 +352,212 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %4 = load i32* %3; i32* %5 = ptradd [16 * i8]* %2, i32 8; i32 %6 = load i32* %5; + i32 %7 = mul i32 %0, i32 %6; + i32* %8 = ptradd [16 * i8]* %2, i32 0; + i32 %9 = load i32* %8; + i32 %10 = add i32 %7, i32 %9; + i32 %11 = add i32 %4, i32 -3; + i32 %12 = add i32 %4, i32 -18; + i32 %13 = add i32 %4, i32 -81; + i32 %14 = add i32 %4, i32 -336; + i32 %15 = add i32 %4, i32 -1359; + i32 %16 = add i32 %4, i32 -5454; + i32 %17 = add i32 %4, i32 -21837; + [10000000 * i32]* %18 = ptrcast [10000000 * i32]* @b to 
[10000000 * i32]*; + i32* %19 = getelementptr &([10000000 * i32]* %18)[i64 0][i64 0]; + ubr ^b1; + ^b1: + i32 %20 = phi [^b, i32 %0] [^b7, i32 %79]; + i32 %21 = phi [^b, i32 %10] [^b7, i32 %81]; + i32 %22 = sub i32 %20, i32 %4; + i32 %23 = add i32 %4, i32 %20; + i32* %24 = getelementptr &(i32* %19)[i32 %21]; + ubr ^while.body; + ^while.body: + i32 %25 = phi [^b1, i32 0] [^b5, i32 %63]; + i32 %26 = sub i32 %25, i32 %4; + i32 %27 = add i32 %26, i32 3; + i32 %28 = add i32 %4, i32 %25; + i1 %29 = icmp slt i32 %27, i32 %28; + i32 %30 = add i32 %11, i32 %25; + i32 %31 = add i32 %12, i32 %25; + i32 %32 = add i32 %13, i32 %25; + i32 %33 = add i32 %14, i32 %25; + i32 %34 = add i32 %15, i32 %25; + i32 %35 = add i32 %16, i32 %25; + i32 %36 = add i32 %17, i32 %25; + cbr i1 %29(prob = 0.5), ^b2, ^b3; + ^b2: + i32 %37 = phi [^while.body, i32 %22] [^b6, i32 %77]; + i32 %38 = add i32 %26, i32 15; + i1 %39 = icmp sgt i32 %30, i32 %38; + cbr i1 %39(prob = 0.941176), ^super.header, ^scalar.header; + ^b3: + i32 %40 = phi [^while.body, i32 %22] [^b4, i32 %50]; + i1 %41 = icmp slt i32 %26, i32 %28; + cbr i1 %41(prob = 0.75), ^while.body1, ^b4; + ^super.header: + i32 %42 = add i32 %26, i32 63; + i1 %43 = icmp sgt i32 %31, i32 %42; + cbr i1 %43(prob = 0.941176), ^super.header1, ^scalar.header1; + ^while.body1 {scalar}: + i32 %44 = phi [^b3, i32 %26] [^while.body1, i32 %45]; + i32 %45 = add i32 %44, i32 1; + i1 %46 = icmp sgt i32 %28, i32 %45; + cbr i1 %46(prob = 0.75), ^while.body1, ^b4; + ^scalar.header: + i32 %47 = phi [^b2, i32 %26] [^scalar.final1, i32 %67]; + i32 %48 = phi [^b2, i32 undef] [^scalar.final1, i32 %67]; + i1 %49 = icmp sgt i32 %30, i32 %47; + cbr i1 %49(prob = 0.75), ^while.body2, ^scalar.final; + ^b4: + i32 %50 = add i32 %40, i32 1; + i1 %51 = icmp sgt i32 %23, i32 %50; + cbr i1 %51(prob = 0.5), ^b3, ^b5; + ^super.header1: + i32 %52 = add i32 %26, i32 255; + i1 %53 = icmp sgt i32 %32, i32 %52; + cbr i1 %53(prob = 0.941176), ^super.header2, ^scalar.header2; + ^scalar.header1: + i32 %54 = phi [^super.header, i32 %26] [^scalar.final2, i32 %84]; + i32 %55 = phi [^super.header, i32 undef] [^scalar.final2, i32 %84]; + i1 %56 = icmp sgt i32 %31, i32 %54; + cbr i1 %56(prob = 0.75), ^while.body3, ^scalar.final1; + ^while.body2 {scalar}: + i32 %57 = phi [^scalar.header, i32 %47] [^while.body2, i32 %58]; + i32 %58 = add i32 %57, i32 4; + i1 %59 = icmp sgt i32 %30, i32 %58; + cbr i1 %59(prob = 0.75), ^while.body2, ^scalar.final; + ^scalar.final: + i32 %60 = phi [^scalar.header, i32 %48] [^while.body2, i32 %58]; + i1 %61 = icmp sgt i32 %28, i32 %60; + cbr i1 %61(prob = 0.75), ^while.body4, ^b6; + ^b5: + i32* %62 = getelementptr &(i32* %24)[i32 %25]; + store i32* %62 with i32 0; + i32 %63 = add i32 %25, i32 1; + i1 %64 = icmp sgt i32 %6, i32 %63; + cbr i1 %64(prob = 0.5), ^while.body, ^b7; + ^super.header2: + i32 %65 = add i32 %26, i32 1023; + i1 %66 = icmp sgt i32 %33, i32 %65; + cbr i1 %66(prob = 0.941176), ^super.header3, ^scalar.header3; + ^scalar.final1: + i32 %67 = phi [^scalar.header1, i32 %55] [^while.body3, i32 %72]; + ubr ^scalar.header; + ^scalar.header2: + i32 %68 = phi [^super.header1, i32 %26] [^scalar.final3, i32 %93]; + i32 %69 = phi [^super.header1, i32 undef] [^scalar.final3, i32 %93]; + i1 %70 = icmp sgt i32 %32, i32 %68; + cbr i1 %70(prob = 0.75), ^while.body5, ^scalar.final2; + ^while.body3 {scalar}: + i32 %71 = phi [^scalar.header1, i32 %54] [^while.body3, i32 %72]; + i32 %72 = add i32 %71, i32 16; + i1 %73 = icmp sgt i32 %31, i32 %72; + cbr i1 %73(prob = 0.75), ^while.body3, 
^scalar.final1; + ^while.body4 {scalar}: + i32 %74 = phi [^scalar.final, i32 %60] [^while.body4, i32 %75]; + i32 %75 = add i32 %74, i32 1; + i1 %76 = icmp sgt i32 %28, i32 %75; + cbr i1 %76(prob = 0.75), ^while.body4, ^b6; + ^b6: + i32 %77 = add i32 %37, i32 1; + i1 %78 = icmp sgt i32 %23, i32 %77; + cbr i1 %78(prob = 0.5), ^b2, ^b5; + ^b7: + i32 %79 = add i32 %20, i32 1; + i1 %80 = icmp sgt i32 %1, i32 %79; + i32 %81 = add i32 %6, i32 %21; + cbr i1 %80(prob = 0.984615), ^b1, ^b8; + ^super.header3: + i32 %82 = add i32 %26, i32 4095; + i1 %83 = icmp sgt i32 %34, i32 %82; + cbr i1 %83(prob = 0.941176), ^super.header4, ^scalar.header4; + ^scalar.final2: + i32 %84 = phi [^scalar.header2, i32 %69] [^while.body5, i32 %89]; + ubr ^scalar.header1; + ^scalar.header3: + i32 %85 = phi [^super.header2, i32 %26] [^scalar.final4, i32 %102]; + i32 %86 = phi [^super.header2, i32 undef] [^scalar.final4, i32 %102]; + i1 %87 = icmp sgt i32 %33, i32 %85; + cbr i1 %87(prob = 0.75), ^while.body6, ^scalar.final3; + ^while.body5 {scalar}: + i32 %88 = phi [^scalar.header2, i32 %68] [^while.body5, i32 %89]; + i32 %89 = add i32 %88, i32 64; + i1 %90 = icmp sgt i32 %32, i32 %89; + cbr i1 %90(prob = 0.75), ^while.body5, ^scalar.final2; + ^b8: + ret; + ^super.header4: + i32 %91 = add i32 %26, i32 16383; + i1 %92 = icmp sgt i32 %35, i32 %91; + cbr i1 %92(prob = 0.941176), ^super.header5, ^scalar.header5; + ^scalar.final3: + i32 %93 = phi [^scalar.header3, i32 %86] [^while.body6, i32 %98]; + ubr ^scalar.header2; + ^scalar.header4: + i32 %94 = phi [^super.header3, i32 %26] [^scalar.final5, i32 %113]; + i32 %95 = phi [^super.header3, i32 undef] [^scalar.final5, i32 %113]; + i1 %96 = icmp sgt i32 %34, i32 %94; + cbr i1 %96(prob = 0.75), ^while.body7, ^scalar.final4; + ^while.body6 {scalar}: + i32 %97 = phi [^scalar.header3, i32 %85] [^while.body6, i32 %98]; + i32 %98 = add i32 %97, i32 256; + i1 %99 = icmp sgt i32 %33, i32 %98; + cbr i1 %99(prob = 0.75), ^while.body6, ^scalar.final3; + ^super.header5: + i32 %100 = add i32 %26, i32 65535; + i1 %101 = icmp sgt i32 %36, i32 %100; + cbr i1 %101(prob = 0.941176), ^while.body8, ^scalar.header6; + ^scalar.final4: + i32 %102 = phi [^scalar.header4, i32 %95] [^while.body7, i32 %107]; + ubr ^scalar.header3; + ^scalar.header5: + i32 %103 = phi [^super.header4, i32 %26] [^scalar.final6, i32 %120]; + i32 %104 = phi [^super.header4, i32 undef] [^scalar.final6, i32 %120]; + i1 %105 = icmp sgt i32 %35, i32 %103; + cbr i1 %105(prob = 0.75), ^while.body9, ^scalar.final5; + ^while.body7 {scalar}: + i32 %106 = phi [^scalar.header4, i32 %94] [^while.body7, i32 %107]; + i32 %107 = add i32 %106, i32 1024; + i1 %108 = icmp sgt i32 %34, i32 %107; + cbr i1 %108(prob = 0.75), ^while.body7, ^scalar.final4; + ^while.body8: + i32 %109 = phi [^super.header5, i32 %26] [^while.body8, i32 %112]; + i32 %110 = add i32 %109, i32 131071; + i1 %111 = icmp sgt i32 %36, i32 %110; + i32 %112 = add i32 %109, i32 65536; + cbr i1 %111(prob = 0.941176), ^while.body8, ^scalar.header6; + ^scalar.final5: + i32 %113 = phi [^scalar.header5, i32 %104] [^while.body9, i32 %118]; + ubr ^scalar.header4; + ^scalar.header6: + i32 %114 = phi [^super.header5, i32 %26] [^while.body8, i32 %112]; + i32 %115 = phi [^super.header5, i32 undef] [^while.body8, i32 %112]; + i1 %116 = icmp sgt i32 %36, i32 %114; + cbr i1 %116(prob = 0.75), ^while.body10, ^scalar.final6; + ^while.body9 {scalar}: + i32 %117 = phi [^scalar.header5, i32 %103] [^while.body9, i32 %118]; + i32 %118 = add i32 %117, i32 4096; + i1 %119 = icmp sgt i32 %35, i32 %118; + 
cbr i1 %119(prob = 0.75), ^while.body9, ^scalar.final5; + ^scalar.final6: + i32 %120 = phi [^scalar.header6, i32 %115] [^while.body10, i32 %122]; + ubr ^scalar.header5; + ^while.body10 {scalar}: + i32 %121 = phi [^scalar.header6, i32 %114] [^while.body10, i32 %122]; + i32 %122 = add i32 %121, i32 16384; + i1 %123 = icmp sgt i32 %36, i32 %122; + cbr i1 %123(prob = 0.75), ^while.body10, ^scalar.final6; +} +internal [16 * i8]* @cmmc_parallel_body_payload_2, align 8; +internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { + ^b: + [16 * i8]* %2 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_3 to [16 * i8]*; + i32* %3 = ptradd [16 * i8]* %2, i32 4; + i32 %4 = load i32* %3; + i32* %5 = ptradd [16 * i8]* %2, i32 8; + i32 %6 = load i32* %5; i32* %7 = ptradd [16 * i8]* %2, i32 12; i32 %8 = load i32* %7; i32 %9 = mul i32 %0, i32 %8; @@ -364,25 +570,25 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32* %16 = getelementptr &([10000000 * i32]* %15)[i64 0][i64 0]; ubr ^b1; ^b1: - i32 %17 = phi [^b, i32 %0] [^b9, i32 %62]; - i32 %18 = phi [^b, i32 %12] [^b9, i32 %64]; + i32 %17 = phi [^b, i32 %0] [^b7, i32 %61]; + i32 %18 = phi [^b, i32 %12] [^b7, i32 %63]; i32 %19 = sub i32 %17, i32 %4; i32 %20 = add i32 %4, i32 %17; i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: - i32 %22 = phi [^b1, i32 0] [^b8, i32 %60]; + i32 %22 = phi [^b1, i32 0] [^b6, i32 %59]; i32 %23 = sub i32 %22, i32 %4; i32 %24 = add i32 %4, i32 %22; ubr ^while.body1; ^while.body1: - i32 %25 = phi [^while.body, i32 %19] [^b7, i32 %57]; - i32 %26 = phi [^while.body, i32 0] [^b7, i32 %52]; + i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %56]; + i32 %26 = phi [^while.body, i32 0] [^b5, i32 %51]; i1 %27 = icmp sle i32 %6, i32 %25; ubr ^while.body2; ^while.body2: - i32 %28 = phi [^while.body1, i32 %23] [^b6, i32 %55]; - i32 %29 = phi [^while.body1, i32 %26] [^b6, i32 %52]; + i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %54]; + i32 %29 = phi [^while.body1, i32 %26] [^b4, i32 %51]; i32 %30 = or i32 %25, i32 %28; i32 %31 = lshr i32 %30, i32 31; i1 %32 = ztrunc i32 %31 to i1; @@ -400,105 +606,8 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %40 = load i32* %39; ubr ^b2; ^while.body3: - i32 %41 = phi [^b2, i32 1] [^b5, i32 %53]; - i32 %42 = phi [^b2, i32 0] [^b5, i32 %52]; - i32 %43 = sdiv i32 %29, i32 %41; - i32 %44 = and i32 %43, i32 -2147483647; - i1 %45 = icmp eq i32 %44, i32 1; - cbr i1 %45(prob = 0.49), ^b4, ^b5; - ^b4: - i32 %46 = sdiv i32 %36, i32 %41; - i32 %47 = and i32 %46, i32 -2147483647; - i1 %48 = icmp eq i32 %47, i32 1; - ubr ^b5; - ^b5: - i1 %49 = phi [^while.body3, i1 false] [^b4, i1 %48]; - i32 %50 = mul i32 %42, i32 2; - i32 %51 = zext i1 %49 to i32; - i32 %52 = add i32 %50, i32 %51; - i32 %53 = mul i32 %41, i32 2; - i1 %54 = icmp slt i32 %53, i32 1073741824; - cbr i1 %54(prob = 0.984615), ^while.body3, ^b6; - ^b6: - i32 %55 = add i32 %28, i32 1; - i1 %56 = icmp sgt i32 %24, i32 %55; - cbr i1 %56(prob = 0.5), ^while.body2, ^b7; - ^b7: - i32 %57 = add i32 %25, i32 1; - i1 %58 = icmp sgt i32 %20, i32 %57; - cbr i1 %58(prob = 0.5), ^while.body1, ^b8; - ^b8: - i32* %59 = getelementptr &(i32* %21)[i32 %22]; - store i32* %59 with i32 %52; - i32 %60 = add i32 %22, i32 1; - i1 %61 = icmp sgt i32 %8, i32 %60; - cbr i1 %61(prob = 0.5), ^while.body, ^b9; - ^b9: - i32 %62 = add i32 %17, i32 1; - i1 %63 = icmp sgt i32 %1, i32 %62; - i32 %64 = add i32 %8, i32 %18; - cbr i1 
%63(prob = 0.984615), ^b1, ^b10; - ^b10: - ret; -} -internal [16 * i8]* @cmmc_parallel_body_payload_2, align 8; -internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { - ^b: - [16 * i8]* %2 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_3 to [16 * i8]*; - i32* %3 = ptradd [16 * i8]* %2, i32 4; - i32 %4 = load i32* %3; - i32* %5 = ptradd [16 * i8]* %2, i32 8; - i32 %6 = load i32* %5; - i32 %7 = mul i32 %0, i32 %6; - i32* %8 = ptradd [16 * i8]* %2, i32 12; - i32 %9 = load i32* %8; - i32* %10 = ptradd [16 * i8]* %2, i32 0; - i32 %11 = load i32* %10; - i32 %12 = add i32 %7, i32 %11; - [10000000 * i32]* %13 = ptrcast [10000000 * i32]* @a to [10000000 * i32]*; - i32* %14 = getelementptr &([10000000 * i32]* %13)[i64 0][i64 0]; - [10000000 * i32]* %15 = ptrcast [10000000 * i32]* @b to [10000000 * i32]*; - i32* %16 = getelementptr &([10000000 * i32]* %15)[i64 0][i64 0]; - ubr ^b1; - ^b1: - i32 %17 = phi [^b, i32 %0] [^b7, i32 %61]; - i32 %18 = phi [^b, i32 %12] [^b7, i32 %63]; - i32 %19 = sub i32 %17, i32 %9; - i32 %20 = add i32 %9, i32 %17; - i32* %21 = getelementptr &(i32* %16)[i32 %18]; - ubr ^while.body; - ^while.body: - i32 %22 = phi [^b1, i32 0] [^b6, i32 %59]; - i32 %23 = sub i32 %22, i32 %9; - i32 %24 = add i32 %9, i32 %22; - ubr ^while.body1; - ^while.body1: - i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %56]; - i32 %26 = phi [^while.body, i32 0] [^b5, i32 %51]; - i1 %27 = icmp sle i32 %4, i32 %25; - ubr ^while.body2; - ^while.body2: - i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %54]; - i32 %29 = phi [^while.body1, i32 %26] [^b4, i32 %51]; - i32 %30 = or i32 %25, i32 %28; - i32 %31 = lshr i32 %30, i32 31; - i1 %32 = ztrunc i32 %31 to i1; - i1 %33 = or i1 %27, i1 %32; - i1 %34 = icmp sle i32 %6, i32 %28; - i1 %35 = or i1 %33, i1 %34; - cbr i1 %35(prob = 0.5), ^b2, ^b3; - ^b2: - i32 %36 = phi [^while.body2, i32 0] [^b3, i32 %40]; - ubr ^while.body3; - ^b3: - i32 %37 = mul i32 %6, i32 %25; - i32* %38 = getelementptr &(i32* %14)[i32 %28]; - i32* %39 = getelementptr &(i32* %38)[i32 %37]; - i32 %40 = load i32* %39; - ubr ^b2; - ^while.body3: - i32 %41 = phi [^b2, i32 1] [^while.body3, i32 %52]; - i32 %42 = phi [^b2, i32 0] [^while.body3, i32 %51]; + i32 %41 = phi [^b2, i32 1] [^while.body3, i32 %52]; + i32 %42 = phi [^b2, i32 0] [^while.body3, i32 %51]; i32 %43 = sdiv i32 %29, i32 %41; i32 %44 = srem i32 %43, i32 2; i32 %45 = sdiv i32 %36, i32 %41; @@ -523,12 +632,12 @@ internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse Parallel i32* %58 = getelementptr &(i32* %21)[i32 %22]; store i32* %58 with i32 %51; i32 %59 = add i32 %22, i32 1; - i1 %60 = icmp sgt i32 %6, i32 %59; + i1 %60 = icmp sgt i32 %8, i32 %59; cbr i1 %60(prob = 0.5), ^while.body, ^b7; ^b7: i32 %61 = add i32 %17, i32 1; i1 %62 = icmp sgt i32 %1, i32 %61; - i32 %63 = add i32 %6, i32 %18; + i32 %63 = add i32 %8, i32 %18; cbr i1 %62(prob = 0.984615), ^b1, ^b8; ^b8: ret; @@ -594,7 +703,7 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %43 = sdiv i32 %29, i32 %41; i32 %44 = and i32 %43, i32 -2147483647; i1 %45 = icmp eq i32 %44, i32 1; - cbr i1 %45(prob = 0.49), ^b5, ^b4; + cbr i1 %45(prob = 0.5), ^b5, ^b4; ^b4: i32 %46 = sdiv i32 %36, i32 %41; i32 %47 = and i32 %46, i32 -2147483647; @@ -652,19 +761,19 @@ internal func @cmmc_parallel_body_5(i32 %0, i32 %1) -> void { NoRecurse Parallel ^b1: i32 %17 = phi [^b, i32 %0] [^b7, i32 %49]; i32 %18 = phi [^b, i32 %12] [^b7, i32 %51]; - i32 %19 = sub i32 %17, i32 %6; - i32 %20 = 
add i32 %6, i32 %17; + i32 %19 = sub i32 %17, i32 %4; + i32 %20 = add i32 %4, i32 %17; i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: i32 %22 = phi [^b1, i32 0] [^b6, i32 %47]; - i32 %23 = sub i32 %22, i32 %6; - i32 %24 = add i32 %6, i32 %22; + i32 %23 = sub i32 %22, i32 %4; + i32 %24 = add i32 %4, i32 %22; ubr ^while.body1; ^while.body1: i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %44]; i32 %26 = phi [^while.body, i32 0] [^b5, i32 %43]; - i1 %27 = icmp sle i32 %4, i32 %25; + i1 %27 = icmp sle i32 %6, i32 %25; ubr ^b2; ^b2: i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %41]; @@ -712,204 +821,95 @@ internal func @cmmc_parallel_body_6(i32 %0, i32 %1) -> void { NoRecurse Parallel [16 * i8]* %2 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_6 to [16 * i8]*; i32* %3 = ptradd [16 * i8]* %2, i32 4; i32 %4 = load i32* %3; - i32* %5 = ptradd [16 * i8]* %2, i32 8; - i32 %6 = load i32* %5; - i32 %7 = mul i32 %0, i32 %6; - i32* %8 = ptradd [16 * i8]* %2, i32 0; + i32 %5 = mul i32 %0, i32 %4; + i32* %6 = ptradd [16 * i8]* %2, i32 8; + i32 %7 = load i32* %6; + i32* %8 = ptradd [16 * i8]* %2, i32 12; i32 %9 = load i32* %8; - i32 %10 = add i32 %7, i32 %9; - i32 %11 = add i32 %4, i32 -3; - i32 %12 = add i32 %4, i32 -18; - i32 %13 = add i32 %4, i32 -81; - i32 %14 = add i32 %4, i32 -336; - i32 %15 = add i32 %4, i32 -1359; - i32 %16 = add i32 %4, i32 -5454; - i32 %17 = add i32 %4, i32 -21837; - [10000000 * i32]* %18 = ptrcast [10000000 * i32]* @b to [10000000 * i32]*; - i32* %19 = getelementptr &([10000000 * i32]* %18)[i64 0][i64 0]; + i32* %10 = ptradd [16 * i8]* %2, i32 0; + i32 %11 = load i32* %10; + i32 %12 = add i32 %5, i32 %11; + [10000000 * i32]* %13 = ptrcast [10000000 * i32]* @a to [10000000 * i32]*; + i32* %14 = getelementptr &([10000000 * i32]* %13)[i64 0][i64 0]; + [10000000 * i32]* %15 = ptrcast [10000000 * i32]* @b to [10000000 * i32]*; + i32* %16 = getelementptr &([10000000 * i32]* %15)[i64 0][i64 0]; ubr ^b1; ^b1: - i32 %20 = phi [^b, i32 %0] [^b7, i32 %79]; - i32 %21 = phi [^b, i32 %10] [^b7, i32 %81]; - i32 %22 = sub i32 %20, i32 %4; - i32 %23 = add i32 %4, i32 %20; - i32* %24 = getelementptr &(i32* %19)[i32 %21]; + i32 %17 = phi [^b, i32 %0] [^b9, i32 %62]; + i32 %18 = phi [^b, i32 %12] [^b9, i32 %64]; + i32 %19 = sub i32 %17, i32 %9; + i32 %20 = add i32 %9, i32 %17; + i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: - i32 %25 = phi [^b1, i32 0] [^b5, i32 %63]; - i32 %26 = sub i32 %25, i32 %4; - i32 %27 = add i32 %26, i32 3; - i32 %28 = add i32 %4, i32 %25; - i1 %29 = icmp slt i32 %27, i32 %28; - i32 %30 = add i32 %11, i32 %25; - i32 %31 = add i32 %12, i32 %25; - i32 %32 = add i32 %13, i32 %25; - i32 %33 = add i32 %14, i32 %25; - i32 %34 = add i32 %15, i32 %25; - i32 %35 = add i32 %16, i32 %25; - i32 %36 = add i32 %17, i32 %25; - cbr i1 %29(prob = 0.5), ^b2, ^b3; + i32 %22 = phi [^b1, i32 0] [^b8, i32 %60]; + i32 %23 = sub i32 %22, i32 %9; + i32 %24 = add i32 %9, i32 %22; + ubr ^while.body1; + ^while.body1: + i32 %25 = phi [^while.body, i32 %19] [^b7, i32 %57]; + i32 %26 = phi [^while.body, i32 0] [^b7, i32 %52]; + i1 %27 = icmp sle i32 %7, i32 %25; + ubr ^while.body2; + ^while.body2: + i32 %28 = phi [^while.body1, i32 %23] [^b6, i32 %55]; + i32 %29 = phi [^while.body1, i32 %26] [^b6, i32 %52]; + i32 %30 = or i32 %25, i32 %28; + i32 %31 = lshr i32 %30, i32 31; + i1 %32 = ztrunc i32 %31 to i1; + i1 %33 = or i1 %27, i1 %32; + i1 %34 = icmp sle i32 %4, i32 %28; + i1 %35 = or i1 %33, i1 %34; + cbr i1 %35(prob = 0.5), ^b2, ^b3; 
^b2: - i32 %37 = phi [^while.body, i32 %22] [^b6, i32 %77]; - i32 %38 = add i32 %26, i32 15; - i1 %39 = icmp sgt i32 %30, i32 %38; - cbr i1 %39(prob = 0.941176), ^super.header, ^scalar.header; + i32 %36 = phi [^while.body2, i32 0] [^b3, i32 %40]; + ubr ^while.body3; ^b3: - i32 %40 = phi [^while.body, i32 %22] [^b4, i32 %50]; - i1 %41 = icmp slt i32 %26, i32 %28; - cbr i1 %41(prob = 0.75), ^while.body1, ^b4; - ^super.header: - i32 %42 = add i32 %26, i32 63; - i1 %43 = icmp sgt i32 %31, i32 %42; - cbr i1 %43(prob = 0.941176), ^super.header1, ^scalar.header1; - ^while.body1 {scalar}: - i32 %44 = phi [^b3, i32 %26] [^while.body1, i32 %45]; - i32 %45 = add i32 %44, i32 1; - i1 %46 = icmp sgt i32 %28, i32 %45; - cbr i1 %46(prob = 0.75), ^while.body1, ^b4; - ^scalar.header: - i32 %47 = phi [^b2, i32 %26] [^scalar.final1, i32 %67]; - i32 %48 = phi [^b2, i32 undef] [^scalar.final1, i32 %67]; - i1 %49 = icmp sgt i32 %30, i32 %47; - cbr i1 %49(prob = 0.75), ^while.body2, ^scalar.final; + i32 %37 = mul i32 %4, i32 %25; + i32* %38 = getelementptr &(i32* %14)[i32 %28]; + i32* %39 = getelementptr &(i32* %38)[i32 %37]; + i32 %40 = load i32* %39; + ubr ^b2; + ^while.body3: + i32 %41 = phi [^b2, i32 1] [^b5, i32 %53]; + i32 %42 = phi [^b2, i32 0] [^b5, i32 %52]; + i32 %43 = sdiv i32 %29, i32 %41; + i32 %44 = and i32 %43, i32 -2147483647; + i1 %45 = icmp eq i32 %44, i32 1; + cbr i1 %45(prob = 0.5), ^b4, ^b5; ^b4: - i32 %50 = add i32 %40, i32 1; - i1 %51 = icmp sgt i32 %23, i32 %50; - cbr i1 %51(prob = 0.5), ^b3, ^b5; - ^super.header1: - i32 %52 = add i32 %26, i32 255; - i1 %53 = icmp sgt i32 %32, i32 %52; - cbr i1 %53(prob = 0.941176), ^super.header2, ^scalar.header2; - ^scalar.header1: - i32 %54 = phi [^super.header, i32 %26] [^scalar.final2, i32 %84]; - i32 %55 = phi [^super.header, i32 undef] [^scalar.final2, i32 %84]; - i1 %56 = icmp sgt i32 %31, i32 %54; - cbr i1 %56(prob = 0.75), ^while.body3, ^scalar.final1; - ^while.body2 {scalar}: - i32 %57 = phi [^scalar.header, i32 %47] [^while.body2, i32 %58]; - i32 %58 = add i32 %57, i32 4; - i1 %59 = icmp sgt i32 %30, i32 %58; - cbr i1 %59(prob = 0.75), ^while.body2, ^scalar.final; - ^scalar.final: - i32 %60 = phi [^scalar.header, i32 %48] [^while.body2, i32 %58]; - i1 %61 = icmp sgt i32 %28, i32 %60; - cbr i1 %61(prob = 0.75), ^while.body4, ^b6; + i32 %46 = sdiv i32 %36, i32 %41; + i32 %47 = and i32 %46, i32 -2147483647; + i1 %48 = icmp eq i32 %47, i32 1; + ubr ^b5; ^b5: - i32* %62 = getelementptr &(i32* %24)[i32 %25]; - store i32* %62 with i32 0; - i32 %63 = add i32 %25, i32 1; - i1 %64 = icmp sgt i32 %6, i32 %63; - cbr i1 %64(prob = 0.5), ^while.body, ^b7; - ^super.header2: - i32 %65 = add i32 %26, i32 1023; - i1 %66 = icmp sgt i32 %33, i32 %65; - cbr i1 %66(prob = 0.941176), ^super.header3, ^scalar.header3; - ^scalar.final1: - i32 %67 = phi [^scalar.header1, i32 %55] [^while.body3, i32 %72]; - ubr ^scalar.header; - ^scalar.header2: - i32 %68 = phi [^super.header1, i32 %26] [^scalar.final3, i32 %93]; - i32 %69 = phi [^super.header1, i32 undef] [^scalar.final3, i32 %93]; - i1 %70 = icmp sgt i32 %32, i32 %68; - cbr i1 %70(prob = 0.75), ^while.body5, ^scalar.final2; - ^while.body3 {scalar}: - i32 %71 = phi [^scalar.header1, i32 %54] [^while.body3, i32 %72]; - i32 %72 = add i32 %71, i32 16; - i1 %73 = icmp sgt i32 %31, i32 %72; - cbr i1 %73(prob = 0.75), ^while.body3, ^scalar.final1; - ^while.body4 {scalar}: - i32 %74 = phi [^scalar.final, i32 %60] [^while.body4, i32 %75]; - i32 %75 = add i32 %74, i32 1; - i1 %76 = icmp sgt i32 %28, i32 %75; - cbr i1 %76(prob = 
0.75), ^while.body4, ^b6; + i1 %49 = phi [^while.body3, i1 false] [^b4, i1 %48]; + i32 %50 = mul i32 %42, i32 2; + i32 %51 = zext i1 %49 to i32; + i32 %52 = add i32 %50, i32 %51; + i32 %53 = mul i32 %41, i32 2; + i1 %54 = icmp slt i32 %53, i32 1073741824; + cbr i1 %54(prob = 0.984615), ^while.body3, ^b6; ^b6: - i32 %77 = add i32 %37, i32 1; - i1 %78 = icmp sgt i32 %23, i32 %77; - cbr i1 %78(prob = 0.5), ^b2, ^b5; + i32 %55 = add i32 %28, i32 1; + i1 %56 = icmp sgt i32 %24, i32 %55; + cbr i1 %56(prob = 0.5), ^while.body2, ^b7; ^b7: - i32 %79 = add i32 %20, i32 1; - i1 %80 = icmp sgt i32 %1, i32 %79; - i32 %81 = add i32 %6, i32 %21; - cbr i1 %80(prob = 0.984615), ^b1, ^b8; - ^super.header3: - i32 %82 = add i32 %26, i32 4095; - i1 %83 = icmp sgt i32 %34, i32 %82; - cbr i1 %83(prob = 0.941176), ^super.header4, ^scalar.header4; - ^scalar.final2: - i32 %84 = phi [^scalar.header2, i32 %69] [^while.body5, i32 %89]; - ubr ^scalar.header1; - ^scalar.header3: - i32 %85 = phi [^super.header2, i32 %26] [^scalar.final4, i32 %102]; - i32 %86 = phi [^super.header2, i32 undef] [^scalar.final4, i32 %102]; - i1 %87 = icmp sgt i32 %33, i32 %85; - cbr i1 %87(prob = 0.75), ^while.body6, ^scalar.final3; - ^while.body5 {scalar}: - i32 %88 = phi [^scalar.header2, i32 %68] [^while.body5, i32 %89]; - i32 %89 = add i32 %88, i32 64; - i1 %90 = icmp sgt i32 %32, i32 %89; - cbr i1 %90(prob = 0.75), ^while.body5, ^scalar.final2; + i32 %57 = add i32 %25, i32 1; + i1 %58 = icmp sgt i32 %20, i32 %57; + cbr i1 %58(prob = 0.5), ^while.body1, ^b8; ^b8: + i32* %59 = getelementptr &(i32* %21)[i32 %22]; + store i32* %59 with i32 %52; + i32 %60 = add i32 %22, i32 1; + i1 %61 = icmp sgt i32 %4, i32 %60; + cbr i1 %61(prob = 0.5), ^while.body, ^b9; + ^b9: + i32 %62 = add i32 %17, i32 1; + i1 %63 = icmp sgt i32 %1, i32 %62; + i32 %64 = add i32 %4, i32 %18; + cbr i1 %63(prob = 0.984615), ^b1, ^b10; + ^b10: ret; - ^super.header4: - i32 %91 = add i32 %26, i32 16383; - i1 %92 = icmp sgt i32 %35, i32 %91; - cbr i1 %92(prob = 0.941176), ^super.header5, ^scalar.header5; - ^scalar.final3: - i32 %93 = phi [^scalar.header3, i32 %86] [^while.body6, i32 %98]; - ubr ^scalar.header2; - ^scalar.header4: - i32 %94 = phi [^super.header3, i32 %26] [^scalar.final5, i32 %113]; - i32 %95 = phi [^super.header3, i32 undef] [^scalar.final5, i32 %113]; - i1 %96 = icmp sgt i32 %34, i32 %94; - cbr i1 %96(prob = 0.75), ^while.body7, ^scalar.final4; - ^while.body6 {scalar}: - i32 %97 = phi [^scalar.header3, i32 %85] [^while.body6, i32 %98]; - i32 %98 = add i32 %97, i32 256; - i1 %99 = icmp sgt i32 %33, i32 %98; - cbr i1 %99(prob = 0.75), ^while.body6, ^scalar.final3; - ^super.header5: - i32 %100 = add i32 %26, i32 65535; - i1 %101 = icmp sgt i32 %36, i32 %100; - cbr i1 %101(prob = 0.941176), ^while.body8, ^scalar.header6; - ^scalar.final4: - i32 %102 = phi [^scalar.header4, i32 %95] [^while.body7, i32 %107]; - ubr ^scalar.header3; - ^scalar.header5: - i32 %103 = phi [^super.header4, i32 %26] [^scalar.final6, i32 %120]; - i32 %104 = phi [^super.header4, i32 undef] [^scalar.final6, i32 %120]; - i1 %105 = icmp sgt i32 %35, i32 %103; - cbr i1 %105(prob = 0.75), ^while.body9, ^scalar.final5; - ^while.body7 {scalar}: - i32 %106 = phi [^scalar.header4, i32 %94] [^while.body7, i32 %107]; - i32 %107 = add i32 %106, i32 1024; - i1 %108 = icmp sgt i32 %34, i32 %107; - cbr i1 %108(prob = 0.75), ^while.body7, ^scalar.final4; - ^while.body8: - i32 %109 = phi [^super.header5, i32 %26] [^while.body8, i32 %112]; - i32 %110 = add i32 %109, i32 131071; - i1 %111 = icmp sgt i32 %36, 
i32 %110; - i32 %112 = add i32 %109, i32 65536; - cbr i1 %111(prob = 0.941176), ^while.body8, ^scalar.header6; - ^scalar.final5: - i32 %113 = phi [^scalar.header5, i32 %104] [^while.body9, i32 %118]; - ubr ^scalar.header4; - ^scalar.header6: - i32 %114 = phi [^super.header5, i32 %26] [^while.body8, i32 %112]; - i32 %115 = phi [^super.header5, i32 undef] [^while.body8, i32 %112]; - i1 %116 = icmp sgt i32 %36, i32 %114; - cbr i1 %116(prob = 0.75), ^while.body10, ^scalar.final6; - ^while.body9 {scalar}: - i32 %117 = phi [^scalar.header5, i32 %103] [^while.body9, i32 %118]; - i32 %118 = add i32 %117, i32 4096; - i1 %119 = icmp sgt i32 %35, i32 %118; - cbr i1 %119(prob = 0.75), ^while.body9, ^scalar.final5; - ^scalar.final6: - i32 %120 = phi [^scalar.header6, i32 %115] [^while.body10, i32 %122]; - ubr ^scalar.header5; - ^while.body10 {scalar}: - i32 %121 = phi [^scalar.header6, i32 %114] [^while.body10, i32 %122]; - i32 %122 = add i32 %121, i32 16384; - i1 %123 = icmp sgt i32 %36, i32 %122; - cbr i1 %123(prob = 0.75), ^while.body10, ^scalar.final6; } internal [16 * i8]* @cmmc_parallel_body_payload_6, align 8; diff --git a/tests/SysY2022/performance/conv2.arm.s b/tests/SysY2022/performance/conv2.arm.s index da64b8f36..94f9cc027 100644 --- a/tests/SysY2022/performance/conv2.arm.s +++ b/tests/SysY2022/performance/conv2.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .section .rodata -.align 8 +.p2align 3 __cmmc_jumptable1312: .word label1287-__cmmc_jumptable1312 .word label1286-__cmmc_jumptable1312 @@ -9,34 +9,34 @@ __cmmc_jumptable1312: .word label1284-__cmmc_jumptable1312 .word label1283-__cmmc_jumptable1312 .bss -.align 8 +.p2align 3 a: .zero 40000000 -.align 8 +.p2align 3 b: .zero 40000000 -.align 8 +.p2align 3 kernelid: .zero 40000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_4: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_5: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_6: .zero 16 .text diff --git a/tests/SysY2022/performance/conv2.riscv.s b/tests/SysY2022/performance/conv2.riscv.s index e74ffefcd..028c63550 100644 --- a/tests/SysY2022/performance/conv2.riscv.s +++ b/tests/SysY2022/performance/conv2.riscv.s @@ -1,204 +1,198 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 8 -__cmmc_jumptable1415: - .word label1390-__cmmc_jumptable1415 - .word label1389-__cmmc_jumptable1415 - .word label1388-__cmmc_jumptable1415 - .word label1387-__cmmc_jumptable1415 - .word label1386-__cmmc_jumptable1415 +.p2align 3 +__cmmc_jumptable1433: + .word label1408-__cmmc_jumptable1433 + .word label1407-__cmmc_jumptable1433 + .word label1406-__cmmc_jumptable1433 + .word label1405-__cmmc_jumptable1433 + .word label1404-__cmmc_jumptable1433 .bss -.align 8 +.p2align 3 a: .zero 40000000 -.align 8 +.p2align 3 b: .zero 40000000 -.align 8 +.p2align 3 kernelid: .zero 40000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_4: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_5: .zero 16 -.align 8 +.p2align 3 cmmc_parallel_body_payload_6: .zero 16 
.text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[64] CalleeSaved[104] - addi sp, sp, -168 + # stack usage: CalleeArg[0] Local[0] RegSpill[48] CalleeSaved[104] + addi sp, sp, -152 sd ra, 0(sp) - sd s0, 8(sp) - sd s5, 16(sp) - sd s1, 24(sp) - sd s6, 32(sp) - sd s2, 40(sp) - sd s4, 48(sp) - sd s7, 56(sp) - sd s8, 64(sp) - sd s11, 72(sp) - sd s3, 80(sp) - sd s9, 88(sp) - sd s10, 96(sp) + sd s2, 8(sp) + sd s0, 16(sp) + sd s5, 24(sp) + sd s1, 32(sp) + sd s6, 40(sp) + sd s3, 48(sp) + sd s4, 56(sp) + sd s7, 64(sp) + sd s9, 72(sp) + sd s8, 80(sp) + sd s10, 88(sp) + sd s11, 96(sp) jal getint - mv s0, a0 + mv s2, a0 jal getint sd a0, 112(sp) mv a1, a0 jal getint -pcrel1546: +pcrel1555: auipc a1, %pcrel_hi(a) sd a0, 104(sp) - addi a2, a1, %pcrel_lo(pcrel1546) + addi a2, a1, %pcrel_lo(pcrel1555) sd a2, 120(sp) mv a0, a2 jal getarray -pcrel1547: +pcrel1556: auipc a1, %pcrel_hi(kernelid) - addi s1, a1, %pcrel_lo(pcrel1547) - mv a0, s1 + addi s0, a1, %pcrel_lo(pcrel1556) + mv a0, s0 jal getarray sd a0, 136(sp) li a0, 109 jal _sysy_starttime -pcrel1548: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_3) - li s9, 5 -pcrel1549: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) - mv s11, zero -pcrel1550: - auipc s10, %pcrel_hi(cmmc_parallel_body_payload_1) - srliw a2, s0, 31 - addi s6, s7, %pcrel_lo(pcrel1548) - addi s4, s5, %pcrel_lo(pcrel1549) ld a0, 104(sp) +pcrel1557: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_5) +pcrel1558: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_4) + srliw a2, s2, 31 +pcrel1559: + auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) + mv s11, zero +pcrel1560: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) + addi s8, s9, %pcrel_lo(pcrel1557) + addi s6, s7, %pcrel_lo(pcrel1558) + addi s4, s5, %pcrel_lo(pcrel1560) ld a1, 112(sp) mulw a1, a1, a0 - add a0, s0, a2 - sraiw s2, a0, 1 + add a0, s2, a2 + addi s2, s3, %pcrel_lo(pcrel1559) + sraiw s1, a0, 1 sd a1, 128(sp) -pcrel1551: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel1552: - auipc a1, %pcrel_hi(__cmmc_jumptable1415) - addi s8, a0, %pcrel_lo(pcrel1551) - addi s3, a1, %pcrel_lo(pcrel1552) -pcrel1553: +pcrel1561: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) - addi a3, a0, %pcrel_lo(pcrel1553) -pcrel1554: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_5) - sd a3, 160(sp) - addi a3, a0, %pcrel_lo(pcrel1554) -pcrel1555: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_6) - sd a3, 152(sp) - addi a2, a0, %pcrel_lo(pcrel1555) -pcrel1556: - auipc a0, %pcrel_hi(cmmc_parallel_body_1) +pcrel1562: + auipc a1, %pcrel_hi(cmmc_parallel_body_payload_6) + addi a2, a0, %pcrel_lo(pcrel1561) + addi s10, a1, %pcrel_lo(pcrel1562) sd a2, 144(sp) - addi s0, a0, %pcrel_lo(pcrel1556) - lw a0, 0(s1) - mv a1, a0 - bltu a0, s9, label1419 + lw a1, 0(s0) + li a3, 5 + mv a0, a1 + bltu a1, a3, label1437 .p2align 2 -label1385: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_6) -pcrel1557: - auipc a5, %pcrel_hi(cmmc_parallel_body_6) - sw zero, %pcrel_lo(label1385)(a0) +label1398: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) +pcrel1563: + auipc a5, %pcrel_hi(cmmc_parallel_body_2) + sw zero, %pcrel_lo(label1398)(a0) ld a2, 144(sp) - sw s2, 4(a2) + sw s1, 4(a2) ld a1, 112(sp) ld a0, 104(sp) slli a3, a1, 32 add.uw a4, a0, a3 mv a0, zero sd a4, 8(a2) - addi a2, a5, %pcrel_lo(pcrel1557) + addi a2, a5, %pcrel_lo(pcrel1563) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 -.p2align 2 -label1387: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_4) -pcrel1558: - auipc a4, 
%pcrel_hi(cmmc_parallel_body_4) - sw zero, %pcrel_lo(label1387)(a0) - sw s2, 4(s8) - ld a1, 112(sp) + bgt a1, zero, label1400 + j label1401 +.p2align 2 +label1404: + auipc a1, %pcrel_hi(cmmc_parallel_body_payload_6) + slli a2, s1, 32 +pcrel1564: + auipc a3, %pcrel_hi(cmmc_parallel_body_6) + sw zero, %pcrel_lo(label1404)(a1) ld a0, 104(sp) - slli a2, a1, 32 - add.uw a3, a0, a2 + sw a0, 4(s10) + ld a1, 112(sp) + add.uw a0, a1, a2 + addi a2, a3, %pcrel_lo(pcrel1564) + sd a0, 8(s10) mv a0, zero - addi a2, a4, %pcrel_lo(pcrel1558) - sd a3, 8(s8) jal cmmcParallelFor ld a1, 128(sp) - ble a1, zero, label1381 + ble a1, zero, label1401 .p2align 2 -label1384: +label1400: + auipc a0, %pcrel_hi(cmmc_parallel_body_payload_1) +pcrel1565: + auipc a3, %pcrel_hi(cmmc_parallel_body_1) ld a1, 128(sp) + addi a2, a3, %pcrel_lo(pcrel1565) + sw a1, %pcrel_lo(label1400)(a0) mv a0, zero -pcrel1559: - auipc s10, %pcrel_hi(cmmc_parallel_body_payload_1) - sw a1, %pcrel_lo(pcrel1559)(s10) - mv a2, s0 jal cmmcParallelFor - ld a0, 136(sp) - addiw s11, s11, 1 - ble a0, s11, label1383 .p2align 2 -label1382: - addi s1, s1, 4 - lw a0, 0(s1) - mv a1, a0 - bgeu a0, s9, label1385 -.p2align 2 -label1419: - sh2add a3, a1, s3 - lw a2, 0(a3) - add a0, s3, a2 - jr a0 -.p2align 2 -label1390: - auipc s5, %pcrel_hi(cmmc_parallel_body_payload_0) - sw zero, %pcrel_lo(label1390)(s5) - slli a2, s2, 32 -pcrel1560: +label1401: + addiw s11, s11, 1 + ld a0, 136(sp) + ble a0, s11, label1403 + addi s0, s0, 4 + li a3, 5 + lw a1, 0(s0) + mv a0, a1 + bgeu a1, a3, label1398 +.p2align 2 +label1437: + auipc a4, %pcrel_hi(__cmmc_jumptable1433) + addi a2, a4, %pcrel_lo(label1437) + sh2add a1, a0, a2 + lw a3, 0(a1) + add a4, a2, a3 + jr a4 +.p2align 2 +label1408: + auipc s3, %pcrel_hi(cmmc_parallel_body_payload_0) + sw zero, %pcrel_lo(label1408)(s3) +pcrel1566: auipc a3, %pcrel_hi(cmmc_parallel_body_0) ld a0, 104(sp) - sw a0, 4(s4) + sw a0, 4(s2) ld a1, 112(sp) - add.uw a0, a1, a2 - addi a2, a3, %pcrel_lo(pcrel1560) - sd a0, 8(s4) + slli a2, a1, 32 + add.uw a0, s1, a2 + addi a2, a3, %pcrel_lo(pcrel1566) + sd a0, 8(s2) mv a0, zero jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 -label1383: + bgt a1, zero, label1400 + j label1401 +label1403: li a0, 116 jal _sysy_stoptime ld a2, 120(sp) @@ -206,93 +200,87 @@ label1383: mv a0, a1 mv a1, a2 jal putarray - mv a0, zero ld ra, 0(sp) - ld s0, 8(sp) - ld s5, 16(sp) - ld s1, 24(sp) - ld s6, 32(sp) - ld s2, 40(sp) - ld s4, 48(sp) - ld s7, 56(sp) - ld s8, 64(sp) - ld s11, 72(sp) - ld s3, 80(sp) - ld s9, 88(sp) - ld s10, 96(sp) - addi sp, sp, 168 + mv a0, zero + ld s2, 8(sp) + ld s0, 16(sp) + ld s5, 24(sp) + ld s1, 32(sp) + ld s6, 40(sp) + ld s3, 48(sp) + ld s4, 56(sp) + ld s7, 64(sp) + ld s9, 72(sp) + ld s8, 80(sp) + ld s10, 88(sp) + ld s11, 96(sp) + addi sp, sp, 152 ret .p2align 2 -label1389: - auipc s7, %pcrel_hi(cmmc_parallel_body_payload_3) - sw zero, %pcrel_lo(label1389)(s7) - slli a2, s2, 32 -pcrel1561: +label1407: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) + sw zero, %pcrel_lo(label1407)(s5) +pcrel1567: auipc a4, %pcrel_hi(cmmc_parallel_body_3) - ld a1, 112(sp) - sw a1, 4(s6) + sw s1, 4(s4) ld a0, 104(sp) - add.uw a3, a0, a2 + ld a1, 112(sp) + slli a2, a0, 32 mv a0, zero - addi a2, a4, %pcrel_lo(pcrel1561) - sd a3, 8(s6) + add.uw a3, a1, a2 + addi a2, a4, %pcrel_lo(pcrel1567) + sd a3, 8(s4) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 -label1381: - addiw s11, s11, 1 - ld a0, 136(sp) - bgt a0, s11, label1382 - j label1383 + bgt a1, zero, label1400 + j 
label1401 .p2align 2 -label1386: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel1562: - auipc a5, %pcrel_hi(cmmc_parallel_body_2) - sw zero, %pcrel_lo(label1386)(a0) - ld a3, 160(sp) - sw s2, 4(a3) - ld a0, 104(sp) +label1405: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_4) + sw zero, %pcrel_lo(label1405)(s7) +pcrel1568: + auipc a4, %pcrel_hi(cmmc_parallel_body_4) + sw s1, 4(s6) ld a1, 112(sp) - slli a2, a0, 32 + ld a0, 104(sp) + slli a2, a1, 32 + add.uw a3, a0, a2 mv a0, zero - add.uw a4, a1, a2 - addi a2, a5, %pcrel_lo(pcrel1562) - sd a4, 8(a3) + addi a2, a4, %pcrel_lo(pcrel1568) + sd a3, 8(s6) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 -.p2align 2 -label1388: - auipc a0, %pcrel_hi(cmmc_parallel_body_payload_5) -pcrel1563: - auipc a5, %pcrel_hi(cmmc_parallel_body_5) - sw zero, %pcrel_lo(label1388)(a0) - ld a1, 112(sp) - ld a3, 152(sp) - sw a1, 4(a3) + bgt a1, zero, label1400 + j label1401 +.p2align 2 +label1406: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_5) + sw zero, %pcrel_lo(label1406)(s9) +pcrel1569: + auipc a4, %pcrel_hi(cmmc_parallel_body_5) + sw s1, 4(s8) ld a0, 104(sp) + ld a1, 112(sp) slli a2, a0, 32 mv a0, zero - add.uw a4, s2, a2 - addi a2, a5, %pcrel_lo(pcrel1563) - sd a4, 8(a3) + add.uw a3, a1, a2 + addi a2, a4, %pcrel_lo(pcrel1569) + sd a3, 8(s8) jal cmmcParallelFor ld a1, 128(sp) - bgt a1, zero, label1384 - j label1381 + bgt a1, zero, label1400 + j label1401 .p2align 2 cmmc_parallel_body_0: addi sp, sp, -80 mv t1, a1 -pcrel168: +pcrel172: auipc a5, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel169: +pcrel173: auipc t5, %pcrel_hi(b) sd s0, 0(sp) - addi t0, a5, %pcrel_lo(pcrel168) - addi t4, t5, %pcrel_lo(pcrel169) + addi t0, a5, %pcrel_lo(pcrel172) + addi t4, t5, %pcrel_lo(pcrel173) sd s5, 8(sp) sd s4, 16(sp) sd s3, 24(sp) @@ -300,18 +288,18 @@ pcrel169: sd s6, 40(sp) sd s2, 48(sp) sd s7, 56(sp) - sd s9, 64(sp) - sd s8, 72(sp) + sd s8, 64(sp) + sd s9, 72(sp) lw a2, 4(t0) - lw a4, 8(t0) - lw a3, 12(t0) + lw a3, 8(t0) + lw a4, 12(t0) mulw a1, a0, a2 - lw t3, %pcrel_lo(pcrel168)(a5) + lw t3, %pcrel_lo(pcrel172)(a5) addw t2, a1, t3 -pcrel170: +pcrel174: auipc a1, %pcrel_hi(a) sh2add t0, t2, t4 - addi a5, a1, %pcrel_lo(pcrel170) + addi a5, a1, %pcrel_lo(pcrel174) mv t2, a0 lui a1, 786432 lui a0, 262144 @@ -325,9 +313,39 @@ pcrel170: mv s4, zero j label8 .p2align 2 -label94: +label150: + addiw s3, s3, 1 + ble a7, s3, label154 +.p2align 2 +label28: + addi s2, s2, 4 + or s7, s0, s3 + srliw s8, s7, 31 + slt s7, s3, a2 + andi s6, s8, 1 + xori s8, s7, 1 + or s5, s1, s6 + or s9, s5, s8 + beq s9, zero, label148 +.p2align 2 +label67: + mv s6, zero + sext.w s5, s4 + ble s5, a0, label146 +.p2align 2 +label31: + addw s5, s5, a1 + bgt s5, a0, label31 + mv s4, s5 + bge s5, zero, label150 +.p2align 2 +label29: + addw s4, s4, a0 + blt s4, zero, label29 + addiw s3, s3, 1 + bgt a7, s3, label28 addiw s0, s0, 1 - ble t4, s0, label149 + ble t4, s0, label153 .p2align 2 label8: slt s3, s0, a4 @@ -335,88 +353,58 @@ label8: xori s1, s3, 1 mv s3, a6 or s7, s0, a6 - slt s8, a6, a2 - srliw s9, s7, 31 - xori s7, s8, 1 - andi s6, s9, 1 + srliw s8, s7, 31 + slt s7, a6, a2 + andi s6, s8, 1 + xori s8, s7, 1 or s5, s1, s6 - or s6, s5, s7 - bne s6, zero, label67 - mulw s7, a2, s0 - sh2add s6, s7, s2 - lw s5, 0(s6) - addw s4, s4, s5 - bgt s4, a0, label18 - j label75 -label146: - blt s4, zero, label22 -.p2align 2 -label84: - addiw s3, s3, 1 - bgt a7, s3, label26 - addiw s0, s0, 1 - bgt t4, s0, label8 + or s9, s5, s8 + bne s9, zero, label67 + mulw s5, a2, s0 + 
sh2add s7, s5, s2 + lw s6, 0(s7) + addw s5, s4, s6 + bgt s5, a0, label31 +label151: + mv s4, s5 + blt s5, zero, label29 + j label20 .p2align 2 -label149: +label153: addiw t6, t6, 1 sw s4, 0(t5) - ble a2, t6, label154 + ble a2, t6, label157 +.p2align 2 +label24: addi t5, t5, 4 subw a6, t6, a3 addw a7, a3, t6 mv s0, t3 mv s4, zero slt s3, t3, a4 - slt s8, a6, a2 or s7, t3, a6 sh2add s2, a6, a5 xori s1, s3, 1 - srliw s9, s7, 31 + srliw s8, s7, 31 mv s3, a6 - xori s7, s8, 1 - andi s6, s9, 1 + slt s7, a6, a2 + andi s6, s8, 1 + xori s8, s7, 1 or s5, s1, s6 - or s6, s5, s7 - beq s6, zero, label157 -.p2align 2 -label67: - mv s5, zero - sext.w s4, s4 - ble s4, a0, label146 -.p2align 2 -label18: - addw s4, s4, a1 - bgt s4, a0, label18 - bge s4, zero, label84 -.p2align 2 -label22: - addw s4, s4, a0 - blt s4, zero, label22 - addiw s3, s3, 1 - ble a7, s3, label94 -.p2align 2 -label26: - addi s2, s2, 4 - or s7, s0, s3 - slt s8, s3, a2 - srliw s9, s7, 31 - xori s7, s8, 1 - andi s6, s9, 1 - or s5, s1, s6 - or s6, s5, s7 - bne s6, zero, label67 - mulw s7, a2, s0 - sh2add s6, s7, s2 - lw s5, 0(s6) - addw s4, s4, s5 - bgt s4, a0, label18 -label75: - blt s4, zero, label22 - j label84 + or s9, s5, s8 + bne s9, zero, label67 + mulw s5, a2, t3 + sh2add s7, s5, s2 + lw s6, 0(s7) + mv s5, s6 + bgt s6, a0, label31 + j label151 .p2align 2 -label154: +label157: addiw t2, t2, 1 - ble t1, t2, label30 + ble t1, t2, label26 +.p2align 2 +label27: sh2add t0, a2, t0 subw t3, t2, a3 addw t4, a3, t2 @@ -427,24 +415,43 @@ label154: mv t5, t0 slt s3, t3, a4 mv s0, t3 - slt s8, a6, a2 or s7, t3, a6 sh2add s2, a6, a5 xori s1, s3, 1 - srliw s9, s7, 31 + srliw s8, s7, 31 mv s3, a6 - xori s7, s8, 1 - andi s6, s9, 1 + slt s7, a6, a2 + andi s6, s8, 1 + xori s8, s7, 1 or s5, s1, s6 - or s6, s5, s7 - bne s6, zero, label67 - mulw s7, a2, t3 - sh2add s6, s7, s2 - lw s5, 0(s6) + or s9, s5, s8 + bne s9, zero, label67 + mulw s5, a2, t3 + sh2add s7, s5, s2 + lw s6, 0(s7) + mv s5, s6 + bgt s6, a0, label31 + j label151 +.p2align 2 +label148: + mulw s5, a2, s0 + sh2add s7, s5, s2 + lw s6, 0(s7) + addw s5, s4, s6 + bgt s5, a0, label31 mv s4, s5 - bgt s5, a0, label18 - j label75 -label30: + blt s5, zero, label29 + j label20 +label154: + addiw s0, s0, 1 + bgt t4, s0, label8 +label23: + addiw t6, t6, 1 + sw s4, 0(t5) + bgt a2, t6, label24 + addiw t2, t2, 1 + bgt t1, t2, label27 +label26: ld s0, 0(sp) ld s5, 8(sp) ld s4, 16(sp) @@ -453,40 +460,42 @@ label30: ld s6, 40(sp) ld s2, 48(sp) ld s7, 56(sp) - ld s9, 64(sp) - ld s8, 72(sp) + ld s8, 64(sp) + ld s9, 72(sp) addi sp, sp, 80 ret .p2align 2 -label157: - mulw s7, a2, s0 - sh2add s6, s7, s2 - lw s5, 0(s6) - addw s4, s4, s5 - bgt s4, a0, label18 - j label75 +label146: + mv s4, s5 + blt s5, zero, label29 +label20: + addiw s3, s3, 1 + bgt a7, s3, label28 + addiw s0, s0, 1 + bgt t4, s0, label8 + j label23 .p2align 2 cmmc_parallel_body_1: mv t0, a0 addiw a4, a0, 3 -pcrel329: +pcrel333: auipc a5, %pcrel_hi(b) -pcrel330: +pcrel334: auipc a0, %pcrel_hi(a) - addi a3, a5, %pcrel_lo(pcrel329) - addi a2, a0, %pcrel_lo(pcrel330) - ble a1, a4, label186 + addi a3, a5, %pcrel_lo(pcrel333) + addi a2, a0, %pcrel_lo(pcrel334) + ble a1, a4, label190 addiw t1, t0, 15 addiw a4, a1, -3 addiw a5, a1, -18 - bge t1, a4, label209 + bge t1, a4, label213 sh2add a0, t0, a2 - j label182 + j label186 .p2align 2 -label185: +label189: addi a0, a0, 64 .p2align 2 -label182: +label186: sh2add t1, t0, a3 addiw t0, t0, 16 ld t3, 0(t1) @@ -505,345 +514,445 @@ label182: sd t2, 48(a0) ld t3, 56(t1) sd t3, 56(a0) - bgt a5, t0, label185 + bgt 
a5, t0, label189 mv a0, t0 -label173: - ble a4, a0, label186 +label177: + ble a4, a0, label190 sh2add a5, a0, a3 - j label177 -label180: + j label181 +label184: addi a5, a5, 16 -label177: +label181: sh2add t0, a0, a2 ld t2, 0(a5) addiw a0, a0, 4 sd t2, 0(t0) ld t1, 8(a5) sd t1, 8(t0) - bgt a4, a0, label180 + bgt a4, a0, label184 mv t0, a0 -label186: - ble a1, t0, label188 +label190: + ble a1, t0, label197 sh2add a0, t0, a3 - j label190 -label193: + j label193 +label196: addi a0, a0, 4 -label190: +label193: sh2add a3, t0, a2 lw a4, 0(a0) addiw t0, t0, 1 sw a4, 0(a3) - bgt a1, t0, label193 -label188: + bgt a1, t0, label196 +label197: ret -label209: +label213: mv a0, t0 mv t0, zero - j label173 + j label177 .p2align 2 cmmc_parallel_body_2: - addi sp, sp, -96 - mv t3, a1 -pcrel515: + # stack usage: CalleeArg[0] Local[0] RegSpill[40] CalleeSaved[96] + addi sp, sp, -136 +pcrel722: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel516: - auipc t5, %pcrel_hi(a) +pcrel723: + auipc a5, %pcrel_hi(b) mv t4, a0 - addi a3, a2, %pcrel_lo(pcrel515) - sd s0, 0(sp) - sd s5, 8(sp) - sd s1, 16(sp) + sd s4, 0(sp) + sd s0, 8(sp) + sd s5, 16(sp) sd s6, 24(sp) - sd s2, 32(sp) - sd s3, 40(sp) - sd s4, 48(sp) - sd s7, 56(sp) - sd s8, 64(sp) - sd s9, 72(sp) - sd s10, 80(sp) + sd s1, 32(sp) + sd s10, 40(sp) + sd s9, 48(sp) + sd s2, 56(sp) + sd s3, 64(sp) + sd s7, 72(sp) + sd s8, 80(sp) sd s11, 88(sp) - lw a5, 4(a3) - lw t0, 8(a3) - lw a4, 12(a3) -pcrel517: - auipc a3, %pcrel_hi(b) - lw t2, %pcrel_lo(pcrel515)(a2) - mulw t1, a0, a4 - addi a2, a3, %pcrel_lo(pcrel517) - addw a1, t1, t2 - lui a3, 524288 - addi t1, t5, %pcrel_lo(pcrel516) - sh2add t2, a1, a2 - addiw a0, a3, 1 - li a1, 1 - lui a2, 262144 - subw t5, t4, a5 - addw t6, a5, t4 - mv a6, t2 + sd a1, 96(sp) + addi a1, a2, %pcrel_lo(pcrel722) + lw s4, 4(a1) + addi s6, s4, -336 + addi s10, s4, -1359 + addi t0, s4, -18 + addi t1, s4, -81 + sd s4, 104(sp) + lw s0, 8(a1) + mulw a3, a0, s0 + sd s0, 112(sp) + lw a4, %pcrel_lo(pcrel722)(a2) + addi a2, a5, %pcrel_lo(pcrel723) + addw a1, a3, a4 + sd s6, 128(sp) + addi a5, s4, -3 + lui a3, 1048571 + sh2add a4, a1, a2 + sd s10, 120(sp) + lui a2, 1048575 + addiw a1, a2, -1358 + addiw a2, a3, -1357 + addw t2, s4, a1 + lui a3, 32 + addw t3, s4, a2 + lui a1, 16 + addiw a0, a3, -1 + lui a2, 1 + lui a3, 4 + mv a6, a4 mv a7, zero - subw s0, zero, a5 - mv s1, a5 - mv s2, t5 - mv a3, zero - slt s5, t5, t0 - sh2add s4, s0, t1 - xori s3, s5, 1 - mv s5, s0 - or s7, t5, s0 - slt s10, s0, a4 - srliw s8, s7, 31 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, t5 - divw s9, zero, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 -.p2align 2 -label405: - divw s11, s6, s7 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, s7, 1 - sh1add s8, s8, s10 - mv s7, s9 - bge s9, a2, label485 + addw t6, s4, t4 + subw t5, t4, s4 + j label339 .p2align 2 -label348: - divw s9, a3, s7 - and s11, s9, a0 - beq s11, a1, label405 -.p2align 2 -label406: - mv s10, zero - sh1add s8, s8, zero - slliw s9, s7, 1 - mv s7, s9 - blt s9, a2, label348 - addiw s5, s5, 1 - bgt s1, s5, label354 +label639: addiw s2, s2, 1 - ble t6, s2, label479 + ble t6, s2, label662 .p2align 2 -label425: - mv a3, s8 - slt s5, s2, t0 - sh2add s4, s0, t1 - or s7, s2, s0 - slt s10, s0, a4 - xori s3, s5, 1 - 
srliw s8, s7, 31 - mv s5, s0 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, s2 - divw s9, a3, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 +label409: + blt s0, s1, label586 + addiw s2, s2, 1 + bgt t6, s2, label409 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + ble s0, a7, label676 .p2align 2 -label485: - addiw s5, s5, 1 - ble s1, s5, label491 +label343: + addi a6, a6, 4 .p2align 2 -label354: - addi s4, s4, 4 - mv a3, s8 - or s7, s2, s5 - slt s10, s5, a4 - srliw s8, s7, 31 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - beq s7, zero, label480 +label339: + ld s4, 104(sp) + addw s2, a5, a7 + addw s3, t0, a7 + addw s8, t2, a7 + ld s6, 128(sp) + addw s1, s4, a7 + subw s0, a7, s4 + addw s5, s6, a7 + ld s10, 120(sp) + addw s4, t1, a7 + addiw s9, s0, 3 + addw s6, t3, a7 + addw s7, s10, a7 + blt s9, s1, label451 + mv s2, t5 + bge s0, s1, label639 .p2align 2 -label396: - mv s6, zero - mv s7, a1 - mv s8, zero - divw s9, a3, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, zero, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 +label586: + mv s3, s0 .p2align 2 -label491: +label411: + addiw s3, s3, 1 + bgt s1, s3, label411 addiw s2, s2, 1 - bgt t6, s2, label425 + bgt t6, s2, label409 addiw a7, a7, 1 - sw s8, 0(a6) - ble a4, a7, label502 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + addiw t4, t4, 1 + ld t5, 96(sp) + ble t5, t4, label345 +.p2align 2 +label346: + ld s0, 112(sp) + mv a7, zero + ld s4, 104(sp) + sh2add a4, s0, a4 + addw t6, s4, t4 + subw t5, t4, s4 + mv a6, a4 + j label339 +.p2align 2 +label451: + mv s9, t5 + addiw s11, s0, 15 + ble s2, s11, label640 +.p2align 2 +label359: + addiw s11, s0, 63 + ble s3, s11, label496 + addiw s11, s0, 255 + ble s4, s11, label501 + addiw s11, s0, 1023 + ble s5, s11, label506 + lui s11, 1 + addiw s10, s11, -1 + addw s11, s0, s10 + ble s7, s11, label511 + lui s10, 4 + addiw s11, s10, -1 + addw s10, s0, s11 + ble s8, s10, label525 + lui s11, 16 + addiw s10, s11, -1 + addw s11, s0, s10 + ble s6, s11, label530 + addw s11, s0, a0 + addw s10, s0, a1 + ble s6, s11, label651 +.p2align 2 +label379: + addw s11, s10, a0 + addw s10, s10, a1 + bgt s6, s11, label379 + mv s11, s10 + ble s6, s10, label655 +.p2align 2 +label535: + addw s10, s11, a3 + ble s6, s10, label652 +.p2align 2 +label377: + addw s10, s10, a3 + bgt s6, s10, label377 + mv s11, s10 + ble s8, s10, label654 +.p2align 2 +label384: + addw s10, s10, a2 + bgt s8, s10, label384 + mv s11, s10 + ble s7, s10, label656 +.p2align 2 +label366: + addiw s10, s10, 1024 + bgt s7, s10, label366 + mv s11, s10 + ble s5, s10, label648 +.p2align 2 +label393: + addiw s10, s10, 256 + bgt s5, s10, label393 + mv s11, s10 + ble s4, s10, label567 +.p2align 2 +label398: + addiw s10, s10, 64 + bgt s4, s10, label398 + mv s11, s10 +.p2align 2 +label646: + mv s10, s11 + ble s3, s11, label576 +.p2align 2 +label405: + addiw s10, s10, 16 + bgt s3, s10, label405 + mv s11, s10 + ble s2, s10, label659 .p2align 2 label357: - addi a6, a6, 4 - subw s0, a7, a5 - addw s1, a5, a7 - mv s2, t5 - mv a3, zero - slt s5, t5, t0 - slt s10, s0, a4 - or s7, t5, s0 - sh2add s4, s0, t1 - xori s3, s5, 1 - srliw s8, s7, 31 - mv s5, s0 - andi s9, s8, 1 - xori s8, s10, 1 - 
or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, t5 - divw s9, zero, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 + addiw s10, s10, 4 + bgt s2, s10, label357 + ble s1, s10, label644 .p2align 2 -label480: - mulw s8, a4, s2 - divw s9, a3, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 +label354: + addiw s10, s10, 1 + bgt s1, s10, label354 + addiw s9, s9, 1 + ble t6, s9, label486 +.p2align 2 +label347: + addiw s11, s0, 15 + bgt s2, s11, label359 + mv s10, s0 + mv s11, zero +.p2align 2 +label349: + bgt s2, s10, label357 +label679: + mv s10, s11 + bgt s1, s11, label354 + j label476 .p2align 2 -label479: +label486: addiw a7, a7, 1 - sw s8, 0(a6) - bgt a4, a7, label357 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 addiw t4, t4, 1 - ble t3, t4, label359 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 .p2align 2 -label360: - sh2add t2, a4, t2 - subw t5, t4, a5 - addw t6, a5, t4 - mv a7, zero - subw s0, zero, a5 - sext.w s1, a5 - mv a3, zero - mv a6, t2 - slt s5, t5, t0 - mv s2, t5 - slt s10, s0, a4 - or s7, t5, s0 - sh2add s4, s0, t1 - xori s3, s5, 1 - srliw s8, s7, 31 - mv s5, s0 - andi s9, s8, 1 - xori s8, s10, 1 - or s6, s3, s9 - or s7, s6, s8 - bne s7, zero, label396 - mulw s8, a4, t5 - divw s9, zero, a1 - sh2add s7, s8, s4 - mv s8, zero - lw s6, 0(s7) - mv s7, a1 - and s11, s9, a0 - bne s11, a1, label406 - divw s11, s6, a1 - and s10, s11, a0 - xori s9, s10, 1 - sltiu s10, s9, 1 - slliw s9, a1, 1 - mv s8, s10 - mv s7, s9 - blt s9, a2, label348 - j label485 -label359: - ld s0, 0(sp) - ld s5, 8(sp) - ld s1, 16(sp) +label567: + mv s10, s11 + bgt s3, s11, label405 + bgt s2, s11, label357 + j label679 +.p2align 2 +label644: + addiw s9, s9, 1 + bgt t6, s9, label347 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + addiw t4, t4, 1 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label511: + mv s10, s0 + mv s11, zero +.p2align 2 +label363: + bgt s7, s10, label366 +.p2align 2 +label515: + mv s10, s11 + bgt s5, s11, label393 + j label391 +.p2align 2 +label648: + mv s10, s11 + bgt s4, s11, label398 + bgt s3, s11, label405 +.p2align 2 +label576: + mv s10, s11 + bgt s2, s11, label357 + bgt s1, s11, label354 +label476: + addiw s9, s9, 1 + bgt t6, s9, label347 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + j label663 +.p2align 2 +label659: + mv s10, s11 + bgt s1, s11, label354 + addiw s9, s9, 1 + bgt t6, s9, label347 + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 +label663: + addiw t4, t4, 1 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label651: + mv s11, s10 + bgt s6, s10, label535 +.p2align 2 +label655: + mv s11, s10 + bgt s8, s10, label384 + bgt s7, s10, label366 + j label515 +.p2align 2 +label656: + mv s10, s11 + bgt s5, s11, label393 +.p2align 2 +label391: + mv s10, s11 + bgt s4, s11, label398 + bgt s3, s11, label405 + j label576 +.p2align 2 +label652: + mv s11, s10 + bgt s8, s10, label384 + bgt s7, s10, label366 + j label515 +.p2align 2 +label662: + addiw a7, a7, 1 + sw zero, 0(a6) + ld s0, 112(sp) + bgt s0, a7, label343 + addiw 
t4, t4, 1 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label654: + mv s10, s11 + bgt s7, s11, label366 + bgt s5, s11, label393 + j label391 +.p2align 2 +label496: + mv s10, s0 + mv s11, zero + bgt s3, s0, label405 + mv s10, zero + j label349 +.p2align 2 +label506: + mv s10, s0 + mv s11, zero + bgt s5, s0, label393 + j label391 +.p2align 2 +label525: + mv s10, s0 + mv s11, zero + bgt s8, s0, label384 + mv s10, zero + j label363 +.p2align 2 +label530: + mv s11, s0 + mv s10, zero + bgt s6, s0, label535 + mv s11, zero + bgt s8, zero, label384 + bgt s7, zero, label366 + j label515 +label345: + ld s4, 0(sp) + ld s0, 8(sp) + ld s5, 16(sp) ld s6, 24(sp) - ld s2, 32(sp) - ld s3, 40(sp) - ld s4, 48(sp) - ld s7, 56(sp) - ld s8, 64(sp) - ld s9, 72(sp) - ld s10, 80(sp) + ld s1, 32(sp) + ld s10, 40(sp) + ld s9, 48(sp) + ld s2, 56(sp) + ld s3, 64(sp) + ld s7, 72(sp) + ld s8, 80(sp) ld s11, 88(sp) - addi sp, sp, 96 + addi sp, sp, 136 ret .p2align 2 -label502: +label676: addiw t4, t4, 1 - bgt t3, t4, label360 - j label359 + ld t5, 96(sp) + bgt t5, t4, label346 + j label345 +.p2align 2 +label501: + mv s10, s0 + mv s11, zero + bgt s4, s0, label398 + j label646 +label640: + mv s10, s0 + mv s11, zero + bgt s2, s0, label357 + j label679 .p2align 2 cmmc_parallel_body_3: addi sp, sp, -96 mv t1, a1 -pcrel673: +pcrel883: auipc a5, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel674: +pcrel884: auipc t5, %pcrel_hi(b) sd s0, 0(sp) - addi t0, a5, %pcrel_lo(pcrel673) - addi t3, t5, %pcrel_lo(pcrel674) + addi t0, a5, %pcrel_lo(pcrel883) + addi t3, t5, %pcrel_lo(pcrel884) sd s5, 8(sp) sd s2, 16(sp) sd s1, 24(sp) @@ -853,20 +962,20 @@ pcrel674: sd s7, 56(sp) sd s8, 64(sp) sd s9, 72(sp) - sd s10, 80(sp) - sd s11, 88(sp) - lw a4, 4(t0) - lw a2, 8(t0) - lw a3, 12(t0) - lw t4, %pcrel_lo(pcrel673)(a5) + sd s11, 80(sp) + sd s10, 88(sp) + lw a3, 4(t0) + lw a4, 8(t0) + lw a2, 12(t0) + lw t4, %pcrel_lo(pcrel883)(a5) mulw t2, a0, a2 addw a1, t2, t4 mv t2, a0 -pcrel675: +pcrel885: auipc t4, %pcrel_hi(a) sh2add t0, a1, t3 lui a0, 262144 - addi a5, t4, %pcrel_lo(pcrel675) + addi a5, t4, %pcrel_lo(pcrel885) subw t3, t2, a3 addw t4, a3, t2 mv t5, t0 @@ -886,50 +995,56 @@ pcrel675: xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - bne s6, zero, label580 + bne s6, zero, label786 + j label785 +.p2align 2 +label745: + addi s3, s3, 4 + mv a1, s6 + or s5, s0, s4 + srliw s7, s5, 31 + slt s5, s4, a2 + andi s6, s7, 1 + xori s7, s5, 1 + or s2, s1, s6 + or s6, s2, s7 + bne s6, zero, label786 .p2align 2 -label579: +label785: mulw s6, a2, s0 sh2add s5, s6, s3 mv s6, zero lw s2, 0(s5) li s5, 1 .p2align 2 -label535: +label741: divw s8, a1, s5 srliw s9, s8, 31 - add s10, s8, s9 + add s11, s8, s9 divw s9, s2, s5 - andi s11, s10, -2 - subw s7, s8, s11 - srliw s10, s9, 31 - add s8, s9, s10 - andi s11, s8, -2 - subw s10, s9, s11 - xor s11, s7, s10 - slliw s7, s6, 1 - sltiu s8, s11, 1 - addi s9, s7, 1 + andi s10, s11, -2 + subw s7, s8, s10 + srliw s11, s9, 31 + add s10, s9, s11 + slliw s11, s6, 1 + andi s8, s10, -2 + subw s10, s9, s8 + xor s9, s7, s10 slliw s7, s5, 1 - subw s6, s9, s8 + sltiu s8, s9, 1 mv s5, s7 - blt s7, a0, label535 + addi s9, s11, 1 + subw s6, s9, s8 + blt s7, a0, label741 addiw s4, s4, 1 - bgt a7, s4, label545 + bgt a7, s4, label745 addiw s0, s0, 1 - bgt t4, s0, label608 - addiw t6, t6, 1 - sw s6, 0(t5) - ble a2, t6, label662 - addi t5, t5, 4 - subw a6, t6, a3 - addw a7, a3, t6 - mv s0, t3 - mv a1, zero - slt s2, t3, a4 - or s5, t3, a6 - mv s4, a6 + ble t4, s0, label868 + mv a1, s6 + slt s2, s0, a4 sh2add s3, a6, a5 + mv 
s4, a6 + or s5, s0, a6 xori s1, s2, 1 srliw s7, s5, 31 slt s5, a6, a2 @@ -937,46 +1052,40 @@ label535: xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - bne s6, zero, label580 - j label579 + bne s6, zero, label786 + j label785 .p2align 2 -label545: - addi s3, s3, 4 - mv a1, s6 - or s5, s0, s4 +label868: + addiw t6, t6, 1 + sw s6, 0(t5) + ble a2, t6, label872 + addi t5, t5, 4 + subw a6, t6, a3 + addw a7, a3, t6 + mv s0, t3 + mv a1, zero + slt s2, t3, a4 + mv s4, a6 + or s5, t3, a6 + sh2add s3, a6, a5 + xori s1, s2, 1 srliw s7, s5, 31 - slt s5, s4, a2 + slt s5, a6, a2 andi s6, s7, 1 xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - beq s6, zero, label579 + beq s6, zero, label785 .p2align 2 -label580: +label786: mv s2, zero li s5, 1 mv s6, zero - j label535 + j label741 .p2align 2 -label608: - mv a1, s6 - slt s2, s0, a4 - sh2add s3, a6, a5 - mv s4, a6 - or s5, s0, a6 - xori s1, s2, 1 - srliw s7, s5, 31 - slt s5, a6, a2 - andi s6, s7, 1 - xori s7, s5, 1 - or s2, s1, s6 - or s6, s2, s7 - bne s6, zero, label580 - j label579 -.p2align 2 -label662: +label872: addiw t2, t2, 1 - ble t1, t2, label543 + ble t1, t2, label750 sh2add t0, a2, t0 subw t3, t2, a3 addw t4, a3, t2 @@ -987,8 +1096,8 @@ label662: mv t5, t0 slt s2, t3, a4 mv s0, t3 - mv s4, a6 or s5, t3, a6 + mv s4, a6 sh2add s3, a6, a5 xori s1, s2, 1 srliw s7, s5, 31 @@ -997,9 +1106,9 @@ label662: xori s7, s5, 1 or s2, s1, s6 or s6, s2, s7 - bne s6, zero, label580 - j label579 -label543: + bne s6, zero, label786 + j label785 +label750: ld s0, 0(sp) ld s5, 8(sp) ld s2, 16(sp) @@ -1010,18 +1119,18 @@ label543: ld s7, 56(sp) ld s8, 64(sp) ld s9, 72(sp) - ld s10, 80(sp) - ld s11, 88(sp) + ld s11, 80(sp) + ld s10, 88(sp) addi sp, sp, 96 ret .p2align 2 cmmc_parallel_body_4: addi sp, sp, -96 mv t3, a1 -pcrel856: +pcrel1069: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_4) sd s0, 0(sp) - addi a3, a2, %pcrel_lo(pcrel856) + addi a3, a2, %pcrel_lo(pcrel1069) sd s5, 8(sp) sd s1, 16(sp) sd s6, 24(sp) @@ -1036,17 +1145,17 @@ pcrel856: lw a5, 4(a3) lw a4, 8(a3) lw t0, 12(a3) -pcrel857: +pcrel1070: auipc a3, %pcrel_hi(b) - lw t1, %pcrel_lo(pcrel856)(a2) + lw t1, %pcrel_lo(pcrel1069)(a2) mulw a1, a0, a4 - addi a2, a3, %pcrel_lo(pcrel857) + addi a2, a3, %pcrel_lo(pcrel1070) addw t4, a1, t1 lui a3, 524288 -pcrel858: +pcrel1071: auipc a1, %pcrel_hi(a) sh2add t2, t4, a2 - addi t1, a1, %pcrel_lo(pcrel858) + addi t1, a1, %pcrel_lo(pcrel1071) lui a2, 262144 mv t4, a0 addiw a1, a3, 1 @@ -1066,208 +1175,237 @@ pcrel858: or s7, t5, s0 slt s10, s0, a4 srliw s8, s7, 31 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 + or s7, s6, s8 + bne s7, zero, label951 mulw s8, a4, t5 - divw s11, zero, a0 + divw s10, zero, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label750: +label916: + addi s4, s4, 4 + mv a3, s8 + or s7, s2, s5 + slt s10, s5, a4 + srliw s8, s7, 31 + andi s9, s8, 1 + xori s8, s10, 1 + or s6, s3, s9 + or s7, s6, s8 + beq s7, zero, label1036 +.p2align 2 +label951: + mv s6, zero + mv s7, a0 + mv s8, zero + divw s10, a3, a0 + and s9, s10, a1 + bne s9, a0, label1037 +.p2align 2 +label961: mv s10, a0 sh1add s8, s8, a0 slliw s9, s7, 1 mv s7, s9 - bge s9, a2, label824 -.p2align 2 -label693: - divw s11, a3, s7 - and 
s9, s11, a1 - beq s9, a0, label750 -.p2align 2 -label696: - divw s11, s6, s7 - and s10, s11, a1 - xori s9, s10, 1 - sltiu s10, s9, 1 - sh1add s8, s8, s10 + bge s9, a2, label1034 +.p2align 2 +label903: + divw s10, a3, s7 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, s7 + and s10, s9, a1 slliw s9, s7, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 + sltiu s10, s11, 1 + sh1add s8, s8, s10 + blt s9, a2, label903 +.p2align 2 +label1038: addiw s5, s5, 1 - bgt s1, s5, label700 + bgt s1, s5, label916 +.p2align 2 +label1044: addiw s2, s2, 1 - bgt t6, s2, label773 - addiw a7, a7, 1 - sw s8, 0(a6) - bgt a4, a7, label706 - addiw t4, t4, 1 - ble t3, t4, label704 + ble t6, s2, label1051 .p2align 2 -label705: - sh2add t2, a4, t2 - subw t5, t4, a5 - addw t6, a5, t4 - mv a7, zero - subw s0, zero, a5 - sext.w s1, a5 - mv a3, zero - mv a6, t2 - slt s5, t5, t0 - mv s2, t5 - slt s10, s0, a4 - or s7, t5, s0 +label982: + mv a3, s8 + slt s5, s2, t0 sh2add s4, s0, t1 + or s7, s2, s0 + slt s10, s0, a4 xori s3, s5, 1 srliw s8, s7, 31 mv s5, s0 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 - mulw s8, a4, t5 - divw s11, zero, a0 + or s7, s6, s8 + bne s7, zero, label951 + mulw s8, a4, s2 + divw s10, a3, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 - slliw s9, a0, 1 - mv s7, s9 - blt s9, a2, label693 - j label824 -.p2align 2 -label700: - addi s4, s4, 4 - mv a3, s8 - or s7, s2, s5 - slt s10, s5, a4 - srliw s8, s7, 31 - xori s7, s10, 1 - andi s9, s8, 1 - or s6, s3, s9 - or s8, s6, s7 - beq s8, zero, label826 -.p2align 2 -label741: - mv s6, zero - mv s7, a0 - mv s8, zero - divw s11, a3, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label824: +label1034: addiw s5, s5, 1 - bgt s1, s5, label700 + bgt s1, s5, label916 addiw s2, s2, 1 - ble t6, s2, label835 + bgt t6, s2, label982 + addiw a7, a7, 1 + sw s8, 0(a6) + ble a4, a7, label1052 .p2align 2 -label773: - mv a3, s8 - slt s5, s2, t0 - sh2add s4, s0, t1 - or s7, s2, s0 +label915: + addi a6, a6, 4 + subw s0, a7, a5 + addw s1, a5, a7 + mv s2, t5 + mv a3, zero + slt s5, t5, t0 slt s10, s0, a4 + or s7, t5, s0 + sh2add s4, s0, t1 xori s3, s5, 1 srliw s8, s7, 31 mv s5, s0 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 - mulw s8, a4, s2 - divw s11, a3, a0 + or s7, s6, s8 + bne s7, zero, label951 + mulw s8, a4, t5 + divw s10, zero, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label826: +label1036: mulw s8, a4, s2 - divw s11, a3, a0 + divw s10, a3, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 .p2align 2 -label835: 
+label1051: addiw a7, a7, 1 sw s8, 0(a6) - ble a4, a7, label841 + bgt a4, a7, label915 + addiw t4, t4, 1 + ble t3, t4, label914 .p2align 2 -label706: - addi a6, a6, 4 - subw s0, a7, a5 - addw s1, a5, a7 - mv s2, t5 +label913: + sh2add t2, a4, t2 + subw t5, t4, a5 + addw t6, a5, t4 + mv a7, zero + subw s0, zero, a5 + sext.w s1, a5 mv a3, zero + mv a6, t2 slt s5, t5, t0 + mv s2, t5 slt s10, s0, a4 or s7, t5, s0 sh2add s4, s0, t1 xori s3, s5, 1 srliw s8, s7, 31 mv s5, s0 - xori s7, s10, 1 andi s9, s8, 1 + xori s8, s10, 1 or s6, s3, s9 - or s8, s6, s7 - bne s8, zero, label741 + or s7, s6, s8 + bne s7, zero, label951 mulw s8, a4, t5 - divw s11, zero, a0 + divw s10, zero, a0 sh2add s7, s8, s4 mv s8, zero lw s6, 0(s7) mv s7, a0 - and s9, s11, a1 - bne s9, a0, label696 - mv s10, a0 - mv s8, a0 + and s9, s10, a1 + beq s9, a0, label961 + divw s9, s6, a0 + and s10, s9, a1 slliw s9, a0, 1 + xori s11, s10, 1 + mv s7, s9 + sltiu s10, s11, 1 + mv s8, s10 + blt s9, a2, label903 + j label1038 +.p2align 2 +label1037: + divw s9, s6, s7 + and s10, s9, a1 + slliw s9, s7, 1 + xori s11, s10, 1 mv s7, s9 - blt s9, a2, label693 - j label824 -label704: + sltiu s10, s11, 1 + sh1add s8, s8, s10 + blt s9, a2, label903 + addiw s5, s5, 1 + bgt s1, s5, label916 + j label1044 +.p2align 2 +label1052: + addiw t4, t4, 1 + bgt t3, t4, label913 +label914: ld s0, 0(sp) ld s5, 8(sp) ld s1, 16(sp) @@ -1283,22 +1421,17 @@ label704: addi sp, sp, 96 ret .p2align 2 -label841: - addiw t4, t4, 1 - bgt t3, t4, label705 - j label704 -.p2align 2 cmmc_parallel_body_5: addi sp, sp, -64 mv t0, a0 mv a5, a1 -pcrel995: +pcrel1208: auipc a3, %pcrel_hi(cmmc_parallel_body_payload_5) -pcrel996: +pcrel1209: auipc t5, %pcrel_hi(b) - addi a4, a3, %pcrel_lo(pcrel995) + addi a4, a3, %pcrel_lo(pcrel1208) sd s2, 0(sp) - addi t4, t5, %pcrel_lo(pcrel996) + addi t4, t5, %pcrel_lo(pcrel1209) sd s1, 8(sp) sd s6, 16(sp) sd s0, 24(sp) @@ -1306,16 +1439,16 @@ pcrel996: sd s4, 40(sp) sd s7, 48(sp) sd s3, 56(sp) - lw a2, 4(a4) - lw a1, 8(a4) + lw a1, 4(a4) + lw a2, 8(a4) lw a0, 12(a4) - lw t3, %pcrel_lo(pcrel995)(a3) + lw t3, %pcrel_lo(pcrel1208)(a3) mulw t2, t0, a0 addw t1, t2, t3 -pcrel997: +pcrel1210: auipc t2, %pcrel_hi(a) sh2add a4, t1, t4 - addi a3, t2, %pcrel_lo(pcrel997) + addi a3, t2, %pcrel_lo(pcrel1210) subw t1, t0, a1 addw t2, a1, t0 mv t3, a4 @@ -1324,13 +1457,13 @@ pcrel997: mv t6, a1 mv a6, t1 mv s2, zero - j label866 + j label1079 .p2align 2 -label971: +label1184: addiw t0, t0, 1 - ble a5, t0, label881 + ble a5, t0, label1094 .p2align 2 -label880: +label1093: sh2add a4, a0, a4 subw t1, t0, a1 addw t2, a1, t0 @@ -1351,15 +1484,15 @@ label880: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - bne s4, zero, label917 + bne s4, zero, label1130 mulw s4, a0, t1 addiw s1, t5, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, zero, s3 - ble t6, s1, label984 + ble t6, s1, label1197 .p2align 2 -label875: +label1088: addi s0, s0, 4 or s5, a6, s1 slt s6, s1, a0 @@ -1368,17 +1501,17 @@ label875: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - beq s4, zero, label967 + beq s4, zero, label1180 .p2align 2 -label917: +label1130: mv s3, zero addiw s1, s1, 1 max s2, s2, zero - bgt t6, s1, label875 + bgt t6, s1, label1088 addiw a6, a6, 1 - ble t2, a6, label966 + ble t2, a6, label1179 .p2align 2 -label866: +label1079: slt s1, a6, a2 sh2add s0, t5, a3 xori a7, s1, 1 @@ -1390,44 +1523,44 @@ label866: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - bne s4, zero, label917 + bne s4, zero, label1130 mulw s4, a0, a6 addiw s1, t5, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, s2, s3 - bgt t6, s1, 
label875 + bgt t6, s1, label1088 addiw a6, a6, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 - j label881 + bgt a5, t0, label1093 + j label1094 .p2align 2 -label967: +label1180: mulw s4, a0, a6 addiw s1, s1, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, s2, s3 - bgt t6, s1, label875 + bgt t6, s1, label1088 addiw a6, a6, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 - j label881 + bgt a5, t0, label1093 + j label1094 .p2align 2 -label966: +label1179: addiw t4, t4, 1 sw s2, 0(t3) - ble a0, t4, label971 + ble a0, t4, label1184 .p2align 2 -label878: +label1091: addi t3, t3, 4 subw t5, t4, a1 addw t6, a1, t4 @@ -1444,31 +1577,31 @@ label878: andi s7, s4, 1 or s3, a7, s7 or s4, s3, s5 - bne s4, zero, label917 + bne s4, zero, label1130 mulw s4, a0, t1 addiw s1, t5, 1 sh2add s5, s4, s0 lw s3, 0(s5) max s2, zero, s3 - bgt t6, s1, label875 + bgt t6, s1, label1088 addiw a6, t1, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 - j label881 + bgt a5, t0, label1093 + j label1094 .p2align 2 -label984: +label1197: addiw a6, a6, 1 - bgt t2, a6, label866 + bgt t2, a6, label1079 addiw t4, t4, 1 sw s2, 0(t3) - bgt a0, t4, label878 + bgt a0, t4, label1091 addiw t0, t0, 1 - bgt a5, t0, label880 -label881: + bgt a5, t0, label1093 +label1094: ld s2, 0(sp) ld s1, 8(sp) ld s6, 16(sp) @@ -1481,373 +1614,278 @@ label881: ret .p2align 2 cmmc_parallel_body_6: - # stack usage: CalleeArg[0] Local[0] RegSpill[64] CalleeSaved[96] - addi sp, sp, -160 -pcrel1374: + addi sp, sp, -96 + mv t3, a1 +pcrel1391: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_6) -pcrel1375: - auipc a5, %pcrel_hi(b) - mv t4, a0 - sd s3, 0(sp) - sd s0, 8(sp) - sd s5, 16(sp) - sd s4, 24(sp) - sd s6, 32(sp) - sd s1, 40(sp) - sd s8, 48(sp) - sd s10, 56(sp) - sd s9, 64(sp) - sd s2, 72(sp) - sd s7, 80(sp) + sd s0, 0(sp) + addi a3, a2, %pcrel_lo(pcrel1391) + sd s5, 8(sp) + sd s1, 16(sp) + sd s6, 24(sp) + sd s2, 32(sp) + sd s3, 40(sp) + sd s4, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + sd s9, 72(sp) + sd s10, 80(sp) sd s11, 88(sp) - sd a1, 96(sp) - addi a1, a2, %pcrel_lo(pcrel1374) - lw s3, 4(a1) - addi s10, s3, -1359 - addi s8, s3, -336 - addi s6, s3, -81 - addi s4, s3, -3 - addi s5, s3, -18 - sd s3, 104(sp) - lw s0, 8(a1) - mulw a3, a0, s0 - sd s0, 112(sp) - lw a4, %pcrel_lo(pcrel1374)(a2) - sd s4, 152(sp) - addi a2, a5, %pcrel_lo(pcrel1375) - addw a1, a3, a4 - sd s5, 144(sp) - lui a3, 1048571 - sh2add a4, a1, a2 - sd s6, 136(sp) - lui a1, 1048575 - sd s8, 128(sp) - addiw a2, a1, -1358 - sd s10, 120(sp) - addiw a1, a3, -1357 - addw a5, s3, a2 - lui a3, 32 - addw t0, s3, a1 - lui a2, 1 - addiw a0, a3, -1 - lui a1, 16 - srli t3, a0, 1 - srli t2, a0, 3 - srli t1, a0, 5 - lui a3, 4 - mv a6, a4 - mv a7, zero - addw t6, s3, t4 - subw t5, t4, s3 - j label1002 -.p2align 2 -label1221: - addiw s9, s9, 1 - bgt t6, s9, label1005 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - ble s0, a7, label1340 -.p2align 2 -label1071: - addi a6, a6, 4 -.p2align 2 -label1002: - ld s3, 104(sp) - ld s4, 152(sp) - addw s1, s3, a7 - subw s0, a7, s3 - addw s2, s4, a7 - ld s5, 144(sp) - addiw s9, s0, 3 - ld s6, 136(sp) - addw s3, s5, a7 - ld s8, 128(sp) - addw s4, s6, a7 - ld s10, 120(sp) - addw s5, s8, a7 - addw s6, t0, a7 - addw s7, s10, a7 - 
addw s8, a5, a7 - bge s9, s1, label1113 - mv s9, t5 -.p2align 2 -label1005: - addiw s11, s0, 15 - ble s2, s11, label1118 - addiw s11, s0, 63 - ble s3, s11, label1123 - addiw s11, s0, 255 - ble s4, s11, label1128 - addiw s11, s0, 1023 - ble s5, s11, label1133 - addw s11, s0, t1 - ble s7, s11, label1138 - addw s11, s0, t2 - ble s8, s11, label1143 - addw s10, s0, t3 - ble s6, s10, label1157 - addw s11, s0, a0 - addw s10, s0, a1 - ble s6, s11, label1313 -.p2align 2 -label1027: - addw s11, s10, a0 - addw s10, s10, a1 - bgt s6, s11, label1027 - mv s11, s10 - ble s6, s10, label1315 -.p2align 2 -label1162: - mv s10, s11 -.p2align 2 -label1023: - addw s10, s10, a3 - bgt s6, s10, label1023 - mv s11, s10 - ble s8, s10, label1314 -.p2align 2 -label1015: - addw s10, s10, a2 - bgt s8, s10, label1015 - mv s11, s10 - ble s7, s10, label1311 -.p2align 2 -label1034: - addiw s10, s10, 1024 - bgt s7, s10, label1034 - mv s11, s10 - ble s5, s10, label1039 -.p2align 2 -label1041: - addiw s10, s10, 256 - bgt s5, s10, label1041 - mv s11, s10 - ble s4, s10, label1046 -.p2align 2 -label1048: - addiw s10, s10, 64 - bgt s4, s10, label1048 - mv s11, s10 - ble s3, s10, label1053 -.p2align 2 -label1055: - addiw s10, s10, 16 - bgt s3, s10, label1055 - mv s11, s10 - ble s2, s10, label1212 -.p2align 2 -label1060: - addiw s10, s10, 4 - bgt s2, s10, label1060 - ble s1, s10, label1221 -.p2align 2 -label1064: - addiw s10, s10, 1 - bgt s1, s10, label1064 - addiw s9, s9, 1 - bgt t6, s9, label1005 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - ble t5, t4, label1069 -.p2align 2 -label1070: - ld s0, 112(sp) + lw a4, 4(a3) + lw t0, 8(a3) + lw a5, 12(a3) + mulw a1, a0, a4 +pcrel1392: + auipc a3, %pcrel_hi(b) + lw t1, %pcrel_lo(pcrel1391)(a2) + addi a2, a3, %pcrel_lo(pcrel1392) + addw t4, a1, t1 + lui a3, 524288 +pcrel1393: + auipc a1, %pcrel_hi(a) + sh2add t2, t4, a2 + addi t1, a1, %pcrel_lo(pcrel1393) + lui a2, 262144 + mv t4, a0 + li a1, 1 + addiw a0, a3, 1 + subw t5, t4, a5 + addw t6, a5, t4 + mv a6, t2 mv a7, zero - ld s3, 104(sp) - sh2add a4, s0, a4 - addw t6, s3, t4 - subw t5, t4, s3 - mv a6, a4 - j label1002 -.p2align 2 -label1113: + subw s0, zero, a5 + mv s1, a5 mv s2, t5 - blt s0, s1, label1249 - addiw s2, t5, 1 - ble t6, s2, label1328 -.p2align 2 -label1072: - bge s0, s1, label1248 -.p2align 2 -label1249: - mv s3, s0 -.p2align 2 -label1074: - addiw s3, s3, 1 - bgt s1, s3, label1074 - addiw s2, s2, 1 - bgt t6, s2, label1072 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1212: - mv s10, s11 - bgt s1, s11, label1064 -.p2align 2 -label1324: - addiw s9, s9, 1 - bgt t6, s9, label1005 - addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1053: - mv s10, s11 - bgt s2, s11, label1060 + mv a3, zero + slt s5, t5, t0 + sh2add s4, s0, t1 + xori s3, s5, 1 + mv s5, s0 + or s7, t5, s0 + slt s10, s0, a4 + srliw s8, s7, 31 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, t5 + divw s11, zero, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1322: - mv s10, s11 - bgt s1, s11, label1064 - j label1324 +label1362: + addiw s5, s5, 1 + ble s1, s5, 
label1366 .p2align 2 -label1046: - mv s10, s11 - bgt s3, s11, label1055 +label1240: + addi s4, s4, 4 + mv a3, s8 + or s7, s2, s5 + slt s10, s5, a4 + srliw s8, s7, 31 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + beq s8, zero, label1361 .p2align 2 -label1320: - mv s10, s11 - bgt s2, s11, label1060 - j label1322 +label1276: + mv s6, zero + mv s7, a1 + mv s8, zero + divw s11, a3, a1 + and s9, s11, a0 + bne s9, a1, label1363 +.p2align 2 +label1241: + divw s10, s6, s7 + and s9, s10, a0 + xori s11, s9, 1 + slliw s9, s7, 1 + sltiu s10, s11, 1 + mv s7, s9 + sh1add s8, s8, s10 + bge s9, a2, label1362 .p2align 2 -label1039: - mv s10, s11 - bgt s4, s11, label1048 +label1228: + divw s11, a3, s7 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + sh1add s8, s8, zero + slliw s9, s7, 1 + mv s7, s9 + blt s9, a2, label1228 .p2align 2 -label1318: - mv s10, s11 - bgt s3, s11, label1055 - j label1320 +label1233: + addiw s5, s5, 1 + bgt s1, s5, label1240 .p2align 2 -label1248: +label1234: addiw s2, s2, 1 - bgt t6, s2, label1072 + bgt t6, s2, label1304 addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 + sw s8, 0(a6) + bgt a4, a7, label1236 addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1143: - mv s10, s0 - mv s11, zero -.p2align 2 -label1012: - bgt s8, s10, label1015 - mv s10, s11 - bgt s7, s11, label1034 - j label1032 + bgt t3, t4, label1239 + j label1238 .p2align 2 -label1313: - mv s11, s10 - bgt s6, s10, label1162 -.p2align 2 -label1315: - mv s11, s10 - bgt s8, s10, label1015 - bgt s7, s10, label1034 - j label1032 +label1361: + mulw s8, a4, s2 + divw s11, a3, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1311: - mv s10, s11 - bgt s5, s11, label1041 - bgt s4, s11, label1048 - j label1318 +label1366: + addiw s2, s2, 1 + ble t6, s2, label1371 .p2align 2 -label1314: - mv s10, s11 - bgt s7, s11, label1034 - bgt s5, s11, label1041 -label1316: - mv s10, s11 - bgt s4, s11, label1048 - j label1318 +label1304: + mv a3, s8 + slt s5, s2, t0 + sh2add s4, s0, t1 + or s7, s2, s0 + slt s10, s0, a4 + xori s3, s5, 1 + srliw s8, s7, 31 + mv s5, s0 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, s2 + divw s11, a3, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1328: +label1371: addiw a7, a7, 1 - sw zero, 0(a6) - ld s0, 112(sp) - bgt s0, a7, label1071 - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 -.p2align 2 -label1118: - mv s10, s0 - mv s11, zero - bgt s2, s0, label1060 - j label1212 -.p2align 2 -label1123: - mv s10, s0 - mv s11, zero - bgt s3, s0, label1055 - j label1053 + sw s8, 0(a6) + ble a4, a7, label1375 .p2align 2 -label1128: - mv s10, s0 - mv s11, zero - bgt s4, s0, label1048 - j label1046 +label1236: + addi a6, a6, 4 + subw s0, a7, a5 + addw s1, a5, a7 + mv s2, t5 + mv a3, zero + slt s5, t5, t0 + slt s10, s0, a4 + or s7, t5, s0 + sh2add s4, s0, t1 + xori s3, s5, 1 + srliw s8, s7, 31 + mv s5, s0 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, t5 + divw s11, zero, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + 
mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1133: - mv s10, s0 - mv s11, zero - bgt s5, s0, label1041 - j label1039 +label1375: + addiw t4, t4, 1 + ble t3, t4, label1238 .p2align 2 -label1138: - mv s10, s0 - mv s11, zero - bgt s7, s0, label1034 -label1032: - mv s10, s11 - bgt s5, s11, label1041 - j label1316 +label1239: + sh2add t2, a4, t2 + subw t5, t4, a5 + addw t6, a5, t4 + mv a7, zero + subw s0, zero, a5 + sext.w s1, a5 + mv a3, zero + mv a6, t2 + slt s5, t5, t0 + mv s2, t5 + slt s10, s0, a4 + or s7, t5, s0 + sh2add s4, s0, t1 + xori s3, s5, 1 + srliw s8, s7, 31 + mv s5, s0 + xori s7, s10, 1 + andi s9, s8, 1 + or s6, s3, s9 + or s8, s6, s7 + bne s8, zero, label1276 + mulw s8, a4, t5 + divw s11, zero, a1 + sh2add s7, s8, s4 + mv s8, zero + lw s6, 0(s7) + mv s7, a1 + and s9, s11, a0 + beq s9, a1, label1241 + mv s10, zero + slliw s9, a1, 1 + mv s7, s9 + blt s9, a2, label1228 + j label1233 .p2align 2 -label1157: - mv s11, s0 +label1363: mv s10, zero - bgt s6, s0, label1162 - mv s11, zero - j label1012 -label1069: - ld s3, 0(sp) - ld s0, 8(sp) - ld s5, 16(sp) - ld s4, 24(sp) - ld s6, 32(sp) - ld s1, 40(sp) - ld s8, 48(sp) - ld s10, 56(sp) - ld s9, 64(sp) - ld s2, 72(sp) - ld s7, 80(sp) + sh1add s8, s8, zero + slliw s9, s7, 1 + mv s7, s9 + blt s9, a2, label1228 + addiw s5, s5, 1 + bgt s1, s5, label1240 + j label1234 +label1238: + ld s0, 0(sp) + ld s5, 8(sp) + ld s1, 16(sp) + ld s6, 24(sp) + ld s2, 32(sp) + ld s3, 40(sp) + ld s4, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + ld s9, 72(sp) + ld s10, 80(sp) ld s11, 88(sp) - addi sp, sp, 160 + addi sp, sp, 96 ret -.p2align 2 -label1340: - addiw t4, t4, 1 - ld t5, 96(sp) - bgt t5, t4, label1070 - j label1069 diff --git a/tests/SysY2022/performance/conv2.sy.ir b/tests/SysY2022/performance/conv2.sy.ir index 4fc305de1..9678ee05d 100644 --- a/tests/SysY2022/performance/conv2.sy.ir +++ b/tests/SysY2022/performance/conv2.sy.ir @@ -38,17 +38,17 @@ func @main() -> i32 { NoRecurse Entry } { i32* %26 = ptradd [16 * i8]* %24, i32 4; i32* %27 = ptradd [16 * i8]* %24, i32 8; i32* %28 = ptradd [16 * i8]* %24, i32 12; - [16 * i8]* %29 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_2 to [16 * i8]*; + [16 * i8]* %29 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_5 to [16 * i8]*; i32* %30 = ptradd [16 * i8]* %29, i32 0; i32* %31 = ptradd [16 * i8]* %29, i32 4; i32* %32 = ptradd [16 * i8]* %29, i32 8; i32* %33 = ptradd [16 * i8]* %29, i32 12; - [16 * i8]* %34 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_5 to [16 * i8]*; + [16 * i8]* %34 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_6 to [16 * i8]*; i32* %35 = ptradd [16 * i8]* %34, i32 0; i32* %36 = ptradd [16 * i8]* %34, i32 4; i32* %37 = ptradd [16 * i8]* %34, i32 8; i32* %38 = ptradd [16 * i8]* %34, i32 12; - [16 * i8]* %39 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_6 to [16 * i8]*; + [16 * i8]* %39 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_2 to [16 * i8]*; i32* %40 = ptradd [16 * i8]* %39, i32 0; i32* %41 = ptradd [16 * i8]* %39, i32 4; i32* %42 = ptradd [16 * i8]* %39, i32 8; @@ -57,30 +57,30 @@ func @main() -> i32 { NoRecurse Entry } { i8* %45 = functionptr () -> void @cmmc_parallel_body_3 as i8*; i8* %46 = functionptr () -> void @cmmc_parallel_body_1 as i8*; i8* %47 = functionptr () -> void @cmmc_parallel_body_4 as i8*; - i8* %48 = functionptr () -> void @cmmc_parallel_body_2 as i8*; - i8* %49 = functionptr () -> void @cmmc_parallel_body_5 as i8*; - i8* %50 = functionptr () -> void @cmmc_parallel_body_6 as i8*; + i8* %48 = functionptr 
() -> void @cmmc_parallel_body_5 as i8*; + i8* %49 = functionptr () -> void @cmmc_parallel_body_6 as i8*; + i8* %50 = functionptr () -> void @cmmc_parallel_body_2 as i8*; ubr ^while.body; ^b: store i32* %13 with i32 0; store i32* %14 with i32 %2; - store i32* %15 with i32 %1; - store i32* %16 with i32 %11; + store i32* %15 with i32 %11; + store i32* %16 with i32 %1; call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %44); ubr ^b6; ^b1: store i32* %18 with i32 0; - store i32* %19 with i32 %1; - store i32* %20 with i32 %2; - store i32* %21 with i32 %11; + store i32* %19 with i32 %11; + store i32* %20 with i32 %1; + store i32* %21 with i32 %2; call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %45); ubr ^b6; ^b2: - store i32* %35 with i32 0; - store i32* %36 with i32 %1; - store i32* %37 with i32 %11; - store i32* %38 with i32 %2; - call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %49); + store i32* %30 with i32 0; + store i32* %31 with i32 %11; + store i32* %32 with i32 %1; + store i32* %33 with i32 %2; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %48); ubr ^b6; ^while.body: i32 %51 = phi [^entry, i32 0] [^b8, i32 %54]; @@ -95,11 +95,11 @@ func @main() -> i32 { NoRecurse Entry } { call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %47); ubr ^b6; ^b4: - store i32* %30 with i32 0; - store i32* %31 with i32 %11; - store i32* %32 with i32 %1; - store i32* %33 with i32 %2; - call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %48); + store i32* %35 with i32 0; + store i32* %36 with i32 %2; + store i32* %37 with i32 %1; + store i32* %38 with i32 %11; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %1, i8* %49); ubr ^b6; ^b5: store i32* %40 with i32 0; @@ -145,19 +145,19 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel ^b1: i32 %17 = phi [^b, i32 %0] [^b7, i32 %60]; i32 %18 = phi [^b, i32 %12] [^b7, i32 %62]; - i32 %19 = sub i32 %17, i32 %9; - i32 %20 = add i32 %9, i32 %17; + i32 %19 = sub i32 %17, i32 %7; + i32 %20 = add i32 %7, i32 %17; i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: i32 %22 = phi [^b1, i32 0] [^b6, i32 %58]; - i32 %23 = sub i32 %22, i32 %9; - i32 %24 = add i32 %9, i32 %22; + i32 %23 = sub i32 %22, i32 %7; + i32 %24 = add i32 %7, i32 %22; ubr ^while.body1; ^while.body1: i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %55]; i32 %26 = phi [^while.body, i32 0] [^b5, i32 %52]; - i1 %27 = icmp sle i32 %7, i32 %25; + i1 %27 = icmp sle i32 %9, i32 %25; ubr ^while.body2; ^while.body2: i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %53]; @@ -352,6 +352,212 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %4 = load i32* %3; i32* %5 = ptradd [16 * i8]* %2, i32 8; i32 %6 = load i32* %5; + i32 %7 = mul i32 %0, i32 %6; + i32* %8 = ptradd [16 * i8]* %2, i32 0; + i32 %9 = load i32* %8; + i32 %10 = add i32 %7, i32 %9; + i32 %11 = add i32 %4, i32 -3; + i32 %12 = add i32 %4, i32 -18; + i32 %13 = add i32 %4, i32 -81; + i32 %14 = add i32 %4, i32 -336; + i32 %15 = add i32 %4, i32 -1359; + i32 %16 = add i32 %4, i32 -5454; + i32 %17 = add i32 %4, i32 -21837; + [10000000 * i32]* %18 = ptrcast [10000000 * i32]* @b to [10000000 * i32]*; + i32* %19 = getelementptr &([10000000 * i32]* %18)[i64 0][i64 0]; + ubr ^b1; + ^b1: + i32 %20 = phi [^b, i32 %0] [^b7, i32 %79]; + i32 %21 = phi [^b, i32 %10] [^b7, i32 %81]; + i32 %22 = sub i32 %20, i32 %4; + i32 %23 = add i32 %4, i32 %20; + i32* %24 = getelementptr 
&(i32* %19)[i32 %21]; + ubr ^while.body; + ^while.body: + i32 %25 = phi [^b1, i32 0] [^b5, i32 %63]; + i32 %26 = sub i32 %25, i32 %4; + i32 %27 = add i32 %26, i32 3; + i32 %28 = add i32 %4, i32 %25; + i1 %29 = icmp slt i32 %27, i32 %28; + i32 %30 = add i32 %11, i32 %25; + i32 %31 = add i32 %12, i32 %25; + i32 %32 = add i32 %13, i32 %25; + i32 %33 = add i32 %14, i32 %25; + i32 %34 = add i32 %15, i32 %25; + i32 %35 = add i32 %16, i32 %25; + i32 %36 = add i32 %17, i32 %25; + cbr i1 %29(prob = 0.5), ^b2, ^b3; + ^b2: + i32 %37 = phi [^while.body, i32 %22] [^b6, i32 %77]; + i32 %38 = add i32 %26, i32 15; + i1 %39 = icmp sgt i32 %30, i32 %38; + cbr i1 %39(prob = 0.941176), ^super.header, ^scalar.header; + ^b3: + i32 %40 = phi [^while.body, i32 %22] [^b4, i32 %50]; + i1 %41 = icmp slt i32 %26, i32 %28; + cbr i1 %41(prob = 0.75), ^while.body1, ^b4; + ^super.header: + i32 %42 = add i32 %26, i32 63; + i1 %43 = icmp sgt i32 %31, i32 %42; + cbr i1 %43(prob = 0.941176), ^super.header1, ^scalar.header1; + ^while.body1 {scalar}: + i32 %44 = phi [^b3, i32 %26] [^while.body1, i32 %45]; + i32 %45 = add i32 %44, i32 1; + i1 %46 = icmp sgt i32 %28, i32 %45; + cbr i1 %46(prob = 0.75), ^while.body1, ^b4; + ^scalar.header: + i32 %47 = phi [^b2, i32 %26] [^scalar.final1, i32 %67]; + i32 %48 = phi [^b2, i32 undef] [^scalar.final1, i32 %67]; + i1 %49 = icmp sgt i32 %30, i32 %47; + cbr i1 %49(prob = 0.75), ^while.body2, ^scalar.final; + ^b4: + i32 %50 = add i32 %40, i32 1; + i1 %51 = icmp sgt i32 %23, i32 %50; + cbr i1 %51(prob = 0.5), ^b3, ^b5; + ^super.header1: + i32 %52 = add i32 %26, i32 255; + i1 %53 = icmp sgt i32 %32, i32 %52; + cbr i1 %53(prob = 0.941176), ^super.header2, ^scalar.header2; + ^scalar.header1: + i32 %54 = phi [^super.header, i32 %26] [^scalar.final2, i32 %84]; + i32 %55 = phi [^super.header, i32 undef] [^scalar.final2, i32 %84]; + i1 %56 = icmp sgt i32 %31, i32 %54; + cbr i1 %56(prob = 0.75), ^while.body3, ^scalar.final1; + ^while.body2 {scalar}: + i32 %57 = phi [^scalar.header, i32 %47] [^while.body2, i32 %58]; + i32 %58 = add i32 %57, i32 4; + i1 %59 = icmp sgt i32 %30, i32 %58; + cbr i1 %59(prob = 0.75), ^while.body2, ^scalar.final; + ^scalar.final: + i32 %60 = phi [^scalar.header, i32 %48] [^while.body2, i32 %58]; + i1 %61 = icmp sgt i32 %28, i32 %60; + cbr i1 %61(prob = 0.75), ^while.body4, ^b6; + ^b5: + i32* %62 = getelementptr &(i32* %24)[i32 %25]; + store i32* %62 with i32 0; + i32 %63 = add i32 %25, i32 1; + i1 %64 = icmp sgt i32 %6, i32 %63; + cbr i1 %64(prob = 0.5), ^while.body, ^b7; + ^super.header2: + i32 %65 = add i32 %26, i32 1023; + i1 %66 = icmp sgt i32 %33, i32 %65; + cbr i1 %66(prob = 0.941176), ^super.header3, ^scalar.header3; + ^scalar.final1: + i32 %67 = phi [^scalar.header1, i32 %55] [^while.body3, i32 %72]; + ubr ^scalar.header; + ^scalar.header2: + i32 %68 = phi [^super.header1, i32 %26] [^scalar.final3, i32 %93]; + i32 %69 = phi [^super.header1, i32 undef] [^scalar.final3, i32 %93]; + i1 %70 = icmp sgt i32 %32, i32 %68; + cbr i1 %70(prob = 0.75), ^while.body5, ^scalar.final2; + ^while.body3 {scalar}: + i32 %71 = phi [^scalar.header1, i32 %54] [^while.body3, i32 %72]; + i32 %72 = add i32 %71, i32 16; + i1 %73 = icmp sgt i32 %31, i32 %72; + cbr i1 %73(prob = 0.75), ^while.body3, ^scalar.final1; + ^while.body4 {scalar}: + i32 %74 = phi [^scalar.final, i32 %60] [^while.body4, i32 %75]; + i32 %75 = add i32 %74, i32 1; + i1 %76 = icmp sgt i32 %28, i32 %75; + cbr i1 %76(prob = 0.75), ^while.body4, ^b6; + ^b6: + i32 %77 = add i32 %37, i32 1; + i1 %78 = icmp sgt i32 %23, i32 
%77; + cbr i1 %78(prob = 0.5), ^b2, ^b5; + ^b7: + i32 %79 = add i32 %20, i32 1; + i1 %80 = icmp sgt i32 %1, i32 %79; + i32 %81 = add i32 %6, i32 %21; + cbr i1 %80(prob = 0.984615), ^b1, ^b8; + ^super.header3: + i32 %82 = add i32 %26, i32 4095; + i1 %83 = icmp sgt i32 %34, i32 %82; + cbr i1 %83(prob = 0.941176), ^super.header4, ^scalar.header4; + ^scalar.final2: + i32 %84 = phi [^scalar.header2, i32 %69] [^while.body5, i32 %89]; + ubr ^scalar.header1; + ^scalar.header3: + i32 %85 = phi [^super.header2, i32 %26] [^scalar.final4, i32 %102]; + i32 %86 = phi [^super.header2, i32 undef] [^scalar.final4, i32 %102]; + i1 %87 = icmp sgt i32 %33, i32 %85; + cbr i1 %87(prob = 0.75), ^while.body6, ^scalar.final3; + ^while.body5 {scalar}: + i32 %88 = phi [^scalar.header2, i32 %68] [^while.body5, i32 %89]; + i32 %89 = add i32 %88, i32 64; + i1 %90 = icmp sgt i32 %32, i32 %89; + cbr i1 %90(prob = 0.75), ^while.body5, ^scalar.final2; + ^b8: + ret; + ^super.header4: + i32 %91 = add i32 %26, i32 16383; + i1 %92 = icmp sgt i32 %35, i32 %91; + cbr i1 %92(prob = 0.941176), ^super.header5, ^scalar.header5; + ^scalar.final3: + i32 %93 = phi [^scalar.header3, i32 %86] [^while.body6, i32 %98]; + ubr ^scalar.header2; + ^scalar.header4: + i32 %94 = phi [^super.header3, i32 %26] [^scalar.final5, i32 %113]; + i32 %95 = phi [^super.header3, i32 undef] [^scalar.final5, i32 %113]; + i1 %96 = icmp sgt i32 %34, i32 %94; + cbr i1 %96(prob = 0.75), ^while.body7, ^scalar.final4; + ^while.body6 {scalar}: + i32 %97 = phi [^scalar.header3, i32 %85] [^while.body6, i32 %98]; + i32 %98 = add i32 %97, i32 256; + i1 %99 = icmp sgt i32 %33, i32 %98; + cbr i1 %99(prob = 0.75), ^while.body6, ^scalar.final3; + ^super.header5: + i32 %100 = add i32 %26, i32 65535; + i1 %101 = icmp sgt i32 %36, i32 %100; + cbr i1 %101(prob = 0.941176), ^while.body8, ^scalar.header6; + ^scalar.final4: + i32 %102 = phi [^scalar.header4, i32 %95] [^while.body7, i32 %107]; + ubr ^scalar.header3; + ^scalar.header5: + i32 %103 = phi [^super.header4, i32 %26] [^scalar.final6, i32 %120]; + i32 %104 = phi [^super.header4, i32 undef] [^scalar.final6, i32 %120]; + i1 %105 = icmp sgt i32 %35, i32 %103; + cbr i1 %105(prob = 0.75), ^while.body9, ^scalar.final5; + ^while.body7 {scalar}: + i32 %106 = phi [^scalar.header4, i32 %94] [^while.body7, i32 %107]; + i32 %107 = add i32 %106, i32 1024; + i1 %108 = icmp sgt i32 %34, i32 %107; + cbr i1 %108(prob = 0.75), ^while.body7, ^scalar.final4; + ^while.body8: + i32 %109 = phi [^super.header5, i32 %26] [^while.body8, i32 %112]; + i32 %110 = add i32 %109, i32 131071; + i1 %111 = icmp sgt i32 %36, i32 %110; + i32 %112 = add i32 %109, i32 65536; + cbr i1 %111(prob = 0.941176), ^while.body8, ^scalar.header6; + ^scalar.final5: + i32 %113 = phi [^scalar.header5, i32 %104] [^while.body9, i32 %118]; + ubr ^scalar.header4; + ^scalar.header6: + i32 %114 = phi [^super.header5, i32 %26] [^while.body8, i32 %112]; + i32 %115 = phi [^super.header5, i32 undef] [^while.body8, i32 %112]; + i1 %116 = icmp sgt i32 %36, i32 %114; + cbr i1 %116(prob = 0.75), ^while.body10, ^scalar.final6; + ^while.body9 {scalar}: + i32 %117 = phi [^scalar.header5, i32 %103] [^while.body9, i32 %118]; + i32 %118 = add i32 %117, i32 4096; + i1 %119 = icmp sgt i32 %35, i32 %118; + cbr i1 %119(prob = 0.75), ^while.body9, ^scalar.final5; + ^scalar.final6: + i32 %120 = phi [^scalar.header6, i32 %115] [^while.body10, i32 %122]; + ubr ^scalar.header5; + ^while.body10 {scalar}: + i32 %121 = phi [^scalar.header6, i32 %114] [^while.body10, i32 %122]; + i32 %122 = add i32 %121, 
i32 16384; + i1 %123 = icmp sgt i32 %36, i32 %122; + cbr i1 %123(prob = 0.75), ^while.body10, ^scalar.final6; +} +internal [16 * i8]* @cmmc_parallel_body_payload_2, align 8; +internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { + ^b: + [16 * i8]* %2 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_3 to [16 * i8]*; + i32* %3 = ptradd [16 * i8]* %2, i32 4; + i32 %4 = load i32* %3; + i32* %5 = ptradd [16 * i8]* %2, i32 8; + i32 %6 = load i32* %5; i32* %7 = ptradd [16 * i8]* %2, i32 12; i32 %8 = load i32* %7; i32 %9 = mul i32 %0, i32 %8; @@ -364,25 +570,25 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32* %16 = getelementptr &([10000000 * i32]* %15)[i64 0][i64 0]; ubr ^b1; ^b1: - i32 %17 = phi [^b, i32 %0] [^b9, i32 %62]; - i32 %18 = phi [^b, i32 %12] [^b9, i32 %64]; + i32 %17 = phi [^b, i32 %0] [^b7, i32 %61]; + i32 %18 = phi [^b, i32 %12] [^b7, i32 %63]; i32 %19 = sub i32 %17, i32 %4; i32 %20 = add i32 %4, i32 %17; i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: - i32 %22 = phi [^b1, i32 0] [^b8, i32 %60]; + i32 %22 = phi [^b1, i32 0] [^b6, i32 %59]; i32 %23 = sub i32 %22, i32 %4; i32 %24 = add i32 %4, i32 %22; ubr ^while.body1; ^while.body1: - i32 %25 = phi [^while.body, i32 %19] [^b7, i32 %57]; - i32 %26 = phi [^while.body, i32 0] [^b7, i32 %52]; + i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %56]; + i32 %26 = phi [^while.body, i32 0] [^b5, i32 %51]; i1 %27 = icmp sle i32 %6, i32 %25; ubr ^while.body2; ^while.body2: - i32 %28 = phi [^while.body1, i32 %23] [^b6, i32 %55]; - i32 %29 = phi [^while.body1, i32 %26] [^b6, i32 %52]; + i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %54]; + i32 %29 = phi [^while.body1, i32 %26] [^b4, i32 %51]; i32 %30 = or i32 %25, i32 %28; i32 %31 = lshr i32 %30, i32 31; i1 %32 = ztrunc i32 %31 to i1; @@ -400,105 +606,8 @@ internal func @cmmc_parallel_body_2(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %40 = load i32* %39; ubr ^b2; ^while.body3: - i32 %41 = phi [^b2, i32 1] [^b5, i32 %53]; - i32 %42 = phi [^b2, i32 0] [^b5, i32 %52]; - i32 %43 = sdiv i32 %29, i32 %41; - i32 %44 = and i32 %43, i32 -2147483647; - i1 %45 = icmp eq i32 %44, i32 1; - cbr i1 %45(prob = 0.49), ^b4, ^b5; - ^b4: - i32 %46 = sdiv i32 %36, i32 %41; - i32 %47 = and i32 %46, i32 -2147483647; - i1 %48 = icmp eq i32 %47, i32 1; - ubr ^b5; - ^b5: - i1 %49 = phi [^while.body3, i1 false] [^b4, i1 %48]; - i32 %50 = mul i32 %42, i32 2; - i32 %51 = zext i1 %49 to i32; - i32 %52 = add i32 %50, i32 %51; - i32 %53 = mul i32 %41, i32 2; - i1 %54 = icmp slt i32 %53, i32 1073741824; - cbr i1 %54(prob = 0.984615), ^while.body3, ^b6; - ^b6: - i32 %55 = add i32 %28, i32 1; - i1 %56 = icmp sgt i32 %24, i32 %55; - cbr i1 %56(prob = 0.5), ^while.body2, ^b7; - ^b7: - i32 %57 = add i32 %25, i32 1; - i1 %58 = icmp sgt i32 %20, i32 %57; - cbr i1 %58(prob = 0.5), ^while.body1, ^b8; - ^b8: - i32* %59 = getelementptr &(i32* %21)[i32 %22]; - store i32* %59 with i32 %52; - i32 %60 = add i32 %22, i32 1; - i1 %61 = icmp sgt i32 %8, i32 %60; - cbr i1 %61(prob = 0.5), ^while.body, ^b9; - ^b9: - i32 %62 = add i32 %17, i32 1; - i1 %63 = icmp sgt i32 %1, i32 %62; - i32 %64 = add i32 %8, i32 %18; - cbr i1 %63(prob = 0.984615), ^b1, ^b10; - ^b10: - ret; -} -internal [16 * i8]* @cmmc_parallel_body_payload_2, align 8; -internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { - ^b: - [16 * i8]* %2 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_3 to [16 
* i8]*; - i32* %3 = ptradd [16 * i8]* %2, i32 4; - i32 %4 = load i32* %3; - i32* %5 = ptradd [16 * i8]* %2, i32 8; - i32 %6 = load i32* %5; - i32 %7 = mul i32 %0, i32 %6; - i32* %8 = ptradd [16 * i8]* %2, i32 12; - i32 %9 = load i32* %8; - i32* %10 = ptradd [16 * i8]* %2, i32 0; - i32 %11 = load i32* %10; - i32 %12 = add i32 %7, i32 %11; - [10000000 * i32]* %13 = ptrcast [10000000 * i32]* @a to [10000000 * i32]*; - i32* %14 = getelementptr &([10000000 * i32]* %13)[i64 0][i64 0]; - [10000000 * i32]* %15 = ptrcast [10000000 * i32]* @b to [10000000 * i32]*; - i32* %16 = getelementptr &([10000000 * i32]* %15)[i64 0][i64 0]; - ubr ^b1; - ^b1: - i32 %17 = phi [^b, i32 %0] [^b7, i32 %61]; - i32 %18 = phi [^b, i32 %12] [^b7, i32 %63]; - i32 %19 = sub i32 %17, i32 %9; - i32 %20 = add i32 %9, i32 %17; - i32* %21 = getelementptr &(i32* %16)[i32 %18]; - ubr ^while.body; - ^while.body: - i32 %22 = phi [^b1, i32 0] [^b6, i32 %59]; - i32 %23 = sub i32 %22, i32 %9; - i32 %24 = add i32 %9, i32 %22; - ubr ^while.body1; - ^while.body1: - i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %56]; - i32 %26 = phi [^while.body, i32 0] [^b5, i32 %51]; - i1 %27 = icmp sle i32 %4, i32 %25; - ubr ^while.body2; - ^while.body2: - i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %54]; - i32 %29 = phi [^while.body1, i32 %26] [^b4, i32 %51]; - i32 %30 = or i32 %25, i32 %28; - i32 %31 = lshr i32 %30, i32 31; - i1 %32 = ztrunc i32 %31 to i1; - i1 %33 = or i1 %27, i1 %32; - i1 %34 = icmp sle i32 %6, i32 %28; - i1 %35 = or i1 %33, i1 %34; - cbr i1 %35(prob = 0.5), ^b2, ^b3; - ^b2: - i32 %36 = phi [^while.body2, i32 0] [^b3, i32 %40]; - ubr ^while.body3; - ^b3: - i32 %37 = mul i32 %6, i32 %25; - i32* %38 = getelementptr &(i32* %14)[i32 %28]; - i32* %39 = getelementptr &(i32* %38)[i32 %37]; - i32 %40 = load i32* %39; - ubr ^b2; - ^while.body3: - i32 %41 = phi [^b2, i32 1] [^while.body3, i32 %52]; - i32 %42 = phi [^b2, i32 0] [^while.body3, i32 %51]; + i32 %41 = phi [^b2, i32 1] [^while.body3, i32 %52]; + i32 %42 = phi [^b2, i32 0] [^while.body3, i32 %51]; i32 %43 = sdiv i32 %29, i32 %41; i32 %44 = srem i32 %43, i32 2; i32 %45 = sdiv i32 %36, i32 %41; @@ -523,12 +632,12 @@ internal func @cmmc_parallel_body_3(i32 %0, i32 %1) -> void { NoRecurse Parallel i32* %58 = getelementptr &(i32* %21)[i32 %22]; store i32* %58 with i32 %51; i32 %59 = add i32 %22, i32 1; - i1 %60 = icmp sgt i32 %6, i32 %59; + i1 %60 = icmp sgt i32 %8, i32 %59; cbr i1 %60(prob = 0.5), ^while.body, ^b7; ^b7: i32 %61 = add i32 %17, i32 1; i1 %62 = icmp sgt i32 %1, i32 %61; - i32 %63 = add i32 %6, i32 %18; + i32 %63 = add i32 %8, i32 %18; cbr i1 %62(prob = 0.984615), ^b1, ^b8; ^b8: ret; @@ -594,7 +703,7 @@ internal func @cmmc_parallel_body_4(i32 %0, i32 %1) -> void { NoRecurse Parallel i32 %43 = sdiv i32 %29, i32 %41; i32 %44 = and i32 %43, i32 -2147483647; i1 %45 = icmp eq i32 %44, i32 1; - cbr i1 %45(prob = 0.49), ^b5, ^b4; + cbr i1 %45(prob = 0.5), ^b5, ^b4; ^b4: i32 %46 = sdiv i32 %36, i32 %41; i32 %47 = and i32 %46, i32 -2147483647; @@ -652,19 +761,19 @@ internal func @cmmc_parallel_body_5(i32 %0, i32 %1) -> void { NoRecurse Parallel ^b1: i32 %17 = phi [^b, i32 %0] [^b7, i32 %49]; i32 %18 = phi [^b, i32 %12] [^b7, i32 %51]; - i32 %19 = sub i32 %17, i32 %6; - i32 %20 = add i32 %6, i32 %17; + i32 %19 = sub i32 %17, i32 %4; + i32 %20 = add i32 %4, i32 %17; i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: i32 %22 = phi [^b1, i32 0] [^b6, i32 %47]; - i32 %23 = sub i32 %22, i32 %6; - i32 %24 = add i32 %6, i32 %22; + i32 %23 = sub i32 %22, i32 
%4; + i32 %24 = add i32 %4, i32 %22; ubr ^while.body1; ^while.body1: i32 %25 = phi [^while.body, i32 %19] [^b5, i32 %44]; i32 %26 = phi [^while.body, i32 0] [^b5, i32 %43]; - i1 %27 = icmp sle i32 %4, i32 %25; + i1 %27 = icmp sle i32 %6, i32 %25; ubr ^b2; ^b2: i32 %28 = phi [^while.body1, i32 %23] [^b4, i32 %41]; @@ -712,204 +821,95 @@ internal func @cmmc_parallel_body_6(i32 %0, i32 %1) -> void { NoRecurse Parallel [16 * i8]* %2 = ptrcast [16 * i8]* @cmmc_parallel_body_payload_6 to [16 * i8]*; i32* %3 = ptradd [16 * i8]* %2, i32 4; i32 %4 = load i32* %3; - i32* %5 = ptradd [16 * i8]* %2, i32 8; - i32 %6 = load i32* %5; - i32 %7 = mul i32 %0, i32 %6; - i32* %8 = ptradd [16 * i8]* %2, i32 0; + i32 %5 = mul i32 %0, i32 %4; + i32* %6 = ptradd [16 * i8]* %2, i32 8; + i32 %7 = load i32* %6; + i32* %8 = ptradd [16 * i8]* %2, i32 12; i32 %9 = load i32* %8; - i32 %10 = add i32 %7, i32 %9; - i32 %11 = add i32 %4, i32 -3; - i32 %12 = add i32 %4, i32 -18; - i32 %13 = add i32 %4, i32 -81; - i32 %14 = add i32 %4, i32 -336; - i32 %15 = add i32 %4, i32 -1359; - i32 %16 = add i32 %4, i32 -5454; - i32 %17 = add i32 %4, i32 -21837; - [10000000 * i32]* %18 = ptrcast [10000000 * i32]* @b to [10000000 * i32]*; - i32* %19 = getelementptr &([10000000 * i32]* %18)[i64 0][i64 0]; + i32* %10 = ptradd [16 * i8]* %2, i32 0; + i32 %11 = load i32* %10; + i32 %12 = add i32 %5, i32 %11; + [10000000 * i32]* %13 = ptrcast [10000000 * i32]* @a to [10000000 * i32]*; + i32* %14 = getelementptr &([10000000 * i32]* %13)[i64 0][i64 0]; + [10000000 * i32]* %15 = ptrcast [10000000 * i32]* @b to [10000000 * i32]*; + i32* %16 = getelementptr &([10000000 * i32]* %15)[i64 0][i64 0]; ubr ^b1; ^b1: - i32 %20 = phi [^b, i32 %0] [^b7, i32 %79]; - i32 %21 = phi [^b, i32 %10] [^b7, i32 %81]; - i32 %22 = sub i32 %20, i32 %4; - i32 %23 = add i32 %4, i32 %20; - i32* %24 = getelementptr &(i32* %19)[i32 %21]; + i32 %17 = phi [^b, i32 %0] [^b9, i32 %62]; + i32 %18 = phi [^b, i32 %12] [^b9, i32 %64]; + i32 %19 = sub i32 %17, i32 %9; + i32 %20 = add i32 %9, i32 %17; + i32* %21 = getelementptr &(i32* %16)[i32 %18]; ubr ^while.body; ^while.body: - i32 %25 = phi [^b1, i32 0] [^b5, i32 %63]; - i32 %26 = sub i32 %25, i32 %4; - i32 %27 = add i32 %26, i32 3; - i32 %28 = add i32 %4, i32 %25; - i1 %29 = icmp slt i32 %27, i32 %28; - i32 %30 = add i32 %11, i32 %25; - i32 %31 = add i32 %12, i32 %25; - i32 %32 = add i32 %13, i32 %25; - i32 %33 = add i32 %14, i32 %25; - i32 %34 = add i32 %15, i32 %25; - i32 %35 = add i32 %16, i32 %25; - i32 %36 = add i32 %17, i32 %25; - cbr i1 %29(prob = 0.5), ^b2, ^b3; + i32 %22 = phi [^b1, i32 0] [^b8, i32 %60]; + i32 %23 = sub i32 %22, i32 %9; + i32 %24 = add i32 %9, i32 %22; + ubr ^while.body1; + ^while.body1: + i32 %25 = phi [^while.body, i32 %19] [^b7, i32 %57]; + i32 %26 = phi [^while.body, i32 0] [^b7, i32 %52]; + i1 %27 = icmp sle i32 %7, i32 %25; + ubr ^while.body2; + ^while.body2: + i32 %28 = phi [^while.body1, i32 %23] [^b6, i32 %55]; + i32 %29 = phi [^while.body1, i32 %26] [^b6, i32 %52]; + i32 %30 = or i32 %25, i32 %28; + i32 %31 = lshr i32 %30, i32 31; + i1 %32 = ztrunc i32 %31 to i1; + i1 %33 = or i1 %27, i1 %32; + i1 %34 = icmp sle i32 %4, i32 %28; + i1 %35 = or i1 %33, i1 %34; + cbr i1 %35(prob = 0.5), ^b2, ^b3; ^b2: - i32 %37 = phi [^while.body, i32 %22] [^b6, i32 %77]; - i32 %38 = add i32 %26, i32 15; - i1 %39 = icmp sgt i32 %30, i32 %38; - cbr i1 %39(prob = 0.941176), ^super.header, ^scalar.header; + i32 %36 = phi [^while.body2, i32 0] [^b3, i32 %40]; + ubr ^while.body3; ^b3: - i32 %40 = phi [^while.body, i32 
%22] [^b4, i32 %50]; - i1 %41 = icmp slt i32 %26, i32 %28; - cbr i1 %41(prob = 0.75), ^while.body1, ^b4; - ^super.header: - i32 %42 = add i32 %26, i32 63; - i1 %43 = icmp sgt i32 %31, i32 %42; - cbr i1 %43(prob = 0.941176), ^super.header1, ^scalar.header1; - ^while.body1 {scalar}: - i32 %44 = phi [^b3, i32 %26] [^while.body1, i32 %45]; - i32 %45 = add i32 %44, i32 1; - i1 %46 = icmp sgt i32 %28, i32 %45; - cbr i1 %46(prob = 0.75), ^while.body1, ^b4; - ^scalar.header: - i32 %47 = phi [^b2, i32 %26] [^scalar.final1, i32 %67]; - i32 %48 = phi [^b2, i32 undef] [^scalar.final1, i32 %67]; - i1 %49 = icmp sgt i32 %30, i32 %47; - cbr i1 %49(prob = 0.75), ^while.body2, ^scalar.final; + i32 %37 = mul i32 %4, i32 %25; + i32* %38 = getelementptr &(i32* %14)[i32 %28]; + i32* %39 = getelementptr &(i32* %38)[i32 %37]; + i32 %40 = load i32* %39; + ubr ^b2; + ^while.body3: + i32 %41 = phi [^b2, i32 1] [^b5, i32 %53]; + i32 %42 = phi [^b2, i32 0] [^b5, i32 %52]; + i32 %43 = sdiv i32 %29, i32 %41; + i32 %44 = and i32 %43, i32 -2147483647; + i1 %45 = icmp eq i32 %44, i32 1; + cbr i1 %45(prob = 0.5), ^b4, ^b5; ^b4: - i32 %50 = add i32 %40, i32 1; - i1 %51 = icmp sgt i32 %23, i32 %50; - cbr i1 %51(prob = 0.5), ^b3, ^b5; - ^super.header1: - i32 %52 = add i32 %26, i32 255; - i1 %53 = icmp sgt i32 %32, i32 %52; - cbr i1 %53(prob = 0.941176), ^super.header2, ^scalar.header2; - ^scalar.header1: - i32 %54 = phi [^super.header, i32 %26] [^scalar.final2, i32 %84]; - i32 %55 = phi [^super.header, i32 undef] [^scalar.final2, i32 %84]; - i1 %56 = icmp sgt i32 %31, i32 %54; - cbr i1 %56(prob = 0.75), ^while.body3, ^scalar.final1; - ^while.body2 {scalar}: - i32 %57 = phi [^scalar.header, i32 %47] [^while.body2, i32 %58]; - i32 %58 = add i32 %57, i32 4; - i1 %59 = icmp sgt i32 %30, i32 %58; - cbr i1 %59(prob = 0.75), ^while.body2, ^scalar.final; - ^scalar.final: - i32 %60 = phi [^scalar.header, i32 %48] [^while.body2, i32 %58]; - i1 %61 = icmp sgt i32 %28, i32 %60; - cbr i1 %61(prob = 0.75), ^while.body4, ^b6; + i32 %46 = sdiv i32 %36, i32 %41; + i32 %47 = and i32 %46, i32 -2147483647; + i1 %48 = icmp eq i32 %47, i32 1; + ubr ^b5; ^b5: - i32* %62 = getelementptr &(i32* %24)[i32 %25]; - store i32* %62 with i32 0; - i32 %63 = add i32 %25, i32 1; - i1 %64 = icmp sgt i32 %6, i32 %63; - cbr i1 %64(prob = 0.5), ^while.body, ^b7; - ^super.header2: - i32 %65 = add i32 %26, i32 1023; - i1 %66 = icmp sgt i32 %33, i32 %65; - cbr i1 %66(prob = 0.941176), ^super.header3, ^scalar.header3; - ^scalar.final1: - i32 %67 = phi [^scalar.header1, i32 %55] [^while.body3, i32 %72]; - ubr ^scalar.header; - ^scalar.header2: - i32 %68 = phi [^super.header1, i32 %26] [^scalar.final3, i32 %93]; - i32 %69 = phi [^super.header1, i32 undef] [^scalar.final3, i32 %93]; - i1 %70 = icmp sgt i32 %32, i32 %68; - cbr i1 %70(prob = 0.75), ^while.body5, ^scalar.final2; - ^while.body3 {scalar}: - i32 %71 = phi [^scalar.header1, i32 %54] [^while.body3, i32 %72]; - i32 %72 = add i32 %71, i32 16; - i1 %73 = icmp sgt i32 %31, i32 %72; - cbr i1 %73(prob = 0.75), ^while.body3, ^scalar.final1; - ^while.body4 {scalar}: - i32 %74 = phi [^scalar.final, i32 %60] [^while.body4, i32 %75]; - i32 %75 = add i32 %74, i32 1; - i1 %76 = icmp sgt i32 %28, i32 %75; - cbr i1 %76(prob = 0.75), ^while.body4, ^b6; + i1 %49 = phi [^while.body3, i1 false] [^b4, i1 %48]; + i32 %50 = mul i32 %42, i32 2; + i32 %51 = zext i1 %49 to i32; + i32 %52 = add i32 %50, i32 %51; + i32 %53 = mul i32 %41, i32 2; + i1 %54 = icmp slt i32 %53, i32 1073741824; + cbr i1 %54(prob = 0.984615), ^while.body3, ^b6; 
^b6: - i32 %77 = add i32 %37, i32 1; - i1 %78 = icmp sgt i32 %23, i32 %77; - cbr i1 %78(prob = 0.5), ^b2, ^b5; + i32 %55 = add i32 %28, i32 1; + i1 %56 = icmp sgt i32 %24, i32 %55; + cbr i1 %56(prob = 0.5), ^while.body2, ^b7; ^b7: - i32 %79 = add i32 %20, i32 1; - i1 %80 = icmp sgt i32 %1, i32 %79; - i32 %81 = add i32 %6, i32 %21; - cbr i1 %80(prob = 0.984615), ^b1, ^b8; - ^super.header3: - i32 %82 = add i32 %26, i32 4095; - i1 %83 = icmp sgt i32 %34, i32 %82; - cbr i1 %83(prob = 0.941176), ^super.header4, ^scalar.header4; - ^scalar.final2: - i32 %84 = phi [^scalar.header2, i32 %69] [^while.body5, i32 %89]; - ubr ^scalar.header1; - ^scalar.header3: - i32 %85 = phi [^super.header2, i32 %26] [^scalar.final4, i32 %102]; - i32 %86 = phi [^super.header2, i32 undef] [^scalar.final4, i32 %102]; - i1 %87 = icmp sgt i32 %33, i32 %85; - cbr i1 %87(prob = 0.75), ^while.body6, ^scalar.final3; - ^while.body5 {scalar}: - i32 %88 = phi [^scalar.header2, i32 %68] [^while.body5, i32 %89]; - i32 %89 = add i32 %88, i32 64; - i1 %90 = icmp sgt i32 %32, i32 %89; - cbr i1 %90(prob = 0.75), ^while.body5, ^scalar.final2; + i32 %57 = add i32 %25, i32 1; + i1 %58 = icmp sgt i32 %20, i32 %57; + cbr i1 %58(prob = 0.5), ^while.body1, ^b8; ^b8: + i32* %59 = getelementptr &(i32* %21)[i32 %22]; + store i32* %59 with i32 %52; + i32 %60 = add i32 %22, i32 1; + i1 %61 = icmp sgt i32 %4, i32 %60; + cbr i1 %61(prob = 0.5), ^while.body, ^b9; + ^b9: + i32 %62 = add i32 %17, i32 1; + i1 %63 = icmp sgt i32 %1, i32 %62; + i32 %64 = add i32 %4, i32 %18; + cbr i1 %63(prob = 0.984615), ^b1, ^b10; + ^b10: ret; - ^super.header4: - i32 %91 = add i32 %26, i32 16383; - i1 %92 = icmp sgt i32 %35, i32 %91; - cbr i1 %92(prob = 0.941176), ^super.header5, ^scalar.header5; - ^scalar.final3: - i32 %93 = phi [^scalar.header3, i32 %86] [^while.body6, i32 %98]; - ubr ^scalar.header2; - ^scalar.header4: - i32 %94 = phi [^super.header3, i32 %26] [^scalar.final5, i32 %113]; - i32 %95 = phi [^super.header3, i32 undef] [^scalar.final5, i32 %113]; - i1 %96 = icmp sgt i32 %34, i32 %94; - cbr i1 %96(prob = 0.75), ^while.body7, ^scalar.final4; - ^while.body6 {scalar}: - i32 %97 = phi [^scalar.header3, i32 %85] [^while.body6, i32 %98]; - i32 %98 = add i32 %97, i32 256; - i1 %99 = icmp sgt i32 %33, i32 %98; - cbr i1 %99(prob = 0.75), ^while.body6, ^scalar.final3; - ^super.header5: - i32 %100 = add i32 %26, i32 65535; - i1 %101 = icmp sgt i32 %36, i32 %100; - cbr i1 %101(prob = 0.941176), ^while.body8, ^scalar.header6; - ^scalar.final4: - i32 %102 = phi [^scalar.header4, i32 %95] [^while.body7, i32 %107]; - ubr ^scalar.header3; - ^scalar.header5: - i32 %103 = phi [^super.header4, i32 %26] [^scalar.final6, i32 %120]; - i32 %104 = phi [^super.header4, i32 undef] [^scalar.final6, i32 %120]; - i1 %105 = icmp sgt i32 %35, i32 %103; - cbr i1 %105(prob = 0.75), ^while.body9, ^scalar.final5; - ^while.body7 {scalar}: - i32 %106 = phi [^scalar.header4, i32 %94] [^while.body7, i32 %107]; - i32 %107 = add i32 %106, i32 1024; - i1 %108 = icmp sgt i32 %34, i32 %107; - cbr i1 %108(prob = 0.75), ^while.body7, ^scalar.final4; - ^while.body8: - i32 %109 = phi [^super.header5, i32 %26] [^while.body8, i32 %112]; - i32 %110 = add i32 %109, i32 131071; - i1 %111 = icmp sgt i32 %36, i32 %110; - i32 %112 = add i32 %109, i32 65536; - cbr i1 %111(prob = 0.941176), ^while.body8, ^scalar.header6; - ^scalar.final5: - i32 %113 = phi [^scalar.header5, i32 %104] [^while.body9, i32 %118]; - ubr ^scalar.header4; - ^scalar.header6: - i32 %114 = phi [^super.header5, i32 %26] [^while.body8, i32 
%112]; - i32 %115 = phi [^super.header5, i32 undef] [^while.body8, i32 %112]; - i1 %116 = icmp sgt i32 %36, i32 %114; - cbr i1 %116(prob = 0.75), ^while.body10, ^scalar.final6; - ^while.body9 {scalar}: - i32 %117 = phi [^scalar.header5, i32 %103] [^while.body9, i32 %118]; - i32 %118 = add i32 %117, i32 4096; - i1 %119 = icmp sgt i32 %35, i32 %118; - cbr i1 %119(prob = 0.75), ^while.body9, ^scalar.final5; - ^scalar.final6: - i32 %120 = phi [^scalar.header6, i32 %115] [^while.body10, i32 %122]; - ubr ^scalar.header5; - ^while.body10 {scalar}: - i32 %121 = phi [^scalar.header6, i32 %114] [^while.body10, i32 %122]; - i32 %122 = add i32 %121, i32 16384; - i1 %123 = icmp sgt i32 %36, i32 %122; - cbr i1 %123(prob = 0.75), ^while.body10, ^scalar.final6; } internal [16 * i8]* @cmmc_parallel_body_payload_6, align 8; diff --git a/tests/SysY2022/performance/crypto-1.arm.s b/tests/SysY2022/performance/crypto-1.arm.s index 32df7a74c..7a06355e8 100644 --- a/tests/SysY2022/performance/crypto-1.arm.s +++ b/tests/SysY2022/performance/crypto-1.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 buffer: .zero 131072 .text diff --git a/tests/SysY2022/performance/crypto-1.riscv.s b/tests/SysY2022/performance/crypto-1.riscv.s index 9c79548b6..fd1263e83 100644 --- a/tests/SysY2022/performance/crypto-1.riscv.s +++ b/tests/SysY2022/performance/crypto-1.riscv.s @@ -1,247 +1,193 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 buffer: .zero 131072 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[340] RegSpill[8] CalleeSaved[104] - addi sp, sp, -456 + # stack usage: CalleeArg[0] Local[340] RegSpill[0] CalleeSaved[104] + addi sp, sp, -448 sd ra, 0(sp) sd s0, 8(sp) sd s5, 16(sp) sd s1, 24(sp) sd s6, 32(sp) sd s2, 40(sp) - sd s3, 48(sp) - sd s4, 56(sp) + sd s4, 48(sp) + sd s3, 56(sp) sd s7, 64(sp) - sd s10, 72(sp) - sd s8, 80(sp) - sd s11, 88(sp) - sd s9, 96(sp) + sd s8, 72(sp) + sd s9, 80(sp) + sd s10, 88(sp) + sd s11, 96(sp) jal getint mv s0, a0 jal getint mv s1, a0 li a0, 161 jal _sysy_starttime -pcrel889: - auipc a1, %pcrel_hi(buffer) - lui a0, 31 + sd zero, 424(sp) + li t0, 125 +pcrel667: + auipc a0, %pcrel_hi(buffer) + lui a3, 31 + lui a2, 66 + addi a4, a0, %pcrel_lo(pcrel667) + addiw a1, a3, 1028 + addiw a0, a2, 33 sd zero, 432(sp) - addi a5, a1, %pcrel_lo(pcrel889) - addiw a3, a0, 1028 - li a1, 20 - sd zero, 440(sp) - li a0, 80 - add a2, a5, a3 - sw zero, 448(sp) - sd a5, 104(sp) - ble s1, zero, label51 - mv a4, s0 - mv a3, s1 - j label2 -.p2align 2 -label47: - lw t0, 432(sp) - addiw a3, a3, -1 - addw t6, t1, t0 - subw a5, zero, t6 - sw a5, 432(sp) - lw t0, 436(sp) - addw t6, t2, t0 - subw t1, zero, t6 - sw t1, 436(sp) - lw t0, 440(sp) - addw a5, t3, t0 - subw t1, zero, a5 - sw t1, 440(sp) - lw t0, 444(sp) - addw a5, t4, t0 - subw t1, zero, a5 - sw t1, 444(sp) - lw t2, 448(sp) - addw t0, t5, t2 - subw a5, zero, t0 - sw a5, 448(sp) - ble a3, zero, label51 -.p2align 2 -label2: - ld a5, 104(sp) - mv t0, zero - j label5 + li a3, 20 + add a5, a4, a1 + li a2, 80 + sw zero, 440(sp) + slli a1, t0, 8 + ble s1, zero, label2 + mv t1, s0 + mv t0, s1 + mv t2, a4 + mv t3, zero + j label6 .p2align 2 -label9: - addi a5, a5, 32 +label49: + lw s0, 424(sp) + addiw t0, t0, -1 + addw t2, t4, s0 + subw t3, zero, t2 + sw t3, 424(sp) + lw s0, 428(sp) + addw t2, t5, s0 + subw t4, zero, t2 + sw t4, 428(sp) + lw t3, 432(sp) + addw t2, a6, t3 + subw t4, zero, t2 + sw t4, 432(sp) + lw t3, 436(sp) + addw t2, t6, t3 + subw t4, 
zero, t2 + sw t4, 436(sp) + lw t5, 440(sp) + addw t2, a7, t5 + subw t3, zero, t2 + sw t3, 440(sp) + ble t0, zero, label2 + mv t2, a4 + mv t3, zero .p2align 2 -label5: - slliw t3, a4, 13 - addiw t0, t0, 8 - addw t2, t3, a4 - slli t1, t2, 1 - srli t4, t1, 47 - add a4, t2, t4 - lui t4, 66 - sraiw t3, a4, 17 - addiw a4, t4, 33 - addw t1, t2, t3 - mulw t3, t1, a4 - slli t2, t3, 1 - srli t4, t2, 47 - add a6, t3, t4 - sraiw t6, a6, 17 - addw t2, t3, t6 - mulw t5, t2, a4 - slli t4, t5, 1 - srli a6, t4, 47 - add a7, t5, a6 - sraiw t6, a7, 17 - addw t3, t5, t6 - mulw t4, t3, a4 - slli a6, t4, 1 - srli t6, a6, 47 - add a7, t4, t6 - sraiw a6, a7, 17 - addw t5, t4, a6 - mulw t6, t5, a4 - slli a7, t6, 1 - srli a6, a7, 47 - add t4, t6, a6 - sraiw a7, t4, 17 - addw a6, t6, a7 - mulw t4, a6, a4 - slli a7, t4, 1 - srli t6, a7, 47 - add s1, t4, t6 - sraiw s2, s1, 17 - addw a7, t4, s2 - slli s4, a7, 5 - mulw s0, a7, a4 - slli t6, s0, 1 - srli s2, t6, 47 - add t4, s0, s2 - sraiw s1, t4, 17 - addw t6, s0, s1 - mulw t4, t6, a4 - slli s2, t4, 1 - srli s0, s2, 47 - add s3, t4, s0 - sraiw a4, s3, 17 - addw s1, t4, a4 - slli s2, s1, 5 - addw s0, s2, s1 - slli s3, s0, 1 - mv a4, s0 - srli t4, s3, 56 - slli s3, t6, 5 - add s1, s0, t4 - andi s2, s1, -256 - addw s1, s3, t6 - subw t4, s0, s2 - slli s0, s1, 1 - srli t6, s0, 56 - addw s0, s4, a7 - add s2, s1, t6 - andi s3, s2, -256 - slli s2, s0, 1 - subw t6, s1, s3 - srli a7, s2, 56 - slli s2, a6, 5 - add s1, s0, a7 - andi s3, s1, -256 - addw s1, s2, a6 - subw a7, s0, s3 - slli s3, s1, 1 - srli a6, s3, 56 - slli s3, t5, 5 - add s0, s1, a6 +label6: + slliw t4, t1, 13 + addiw t3, t3, 4 + addw t5, t4, t1 + slli a6, t5, 1 + srli a7, a6, 47 + add t6, t5, a7 + sraiw a6, t6, 17 + addw t4, t5, a6 + mulw t1, t4, a0 + slli t6, t1, 1 + srli a7, t6, 47 + add s0, t1, a7 + sraiw a6, s0, 17 + addw t5, t1, a6 + mulw t6, t5, a0 + slli a6, t6, 1 + srli t1, a6, 47 + add a7, t6, t1 + sraiw s0, a7, 17 + addw a6, t6, s0 + mulw t1, a6, a0 + slli s1, t1, 1 + srli t6, s1, 47 + add a7, t1, t6 + sraiw s0, a7, 17 + addw t6, t1, s0 + slli s2, t6, 5 + addw a7, s2, t6 + slli s1, a7, 1 + mv t1, a7 + srli t6, s1, 56 + slli s1, a6, 5 + add s0, a7, t6 andi s2, s0, -256 - addw s0, s3, t5 - subw a6, s1, s2 - slli s1, s0, 1 + subw t6, a7, s2 + addw a7, s1, a6 + slli s0, a7, 1 + srli a6, s0, 56 + slli s0, t5, 5 + add s2, a7, a6 + andi s1, s2, -256 + subw a6, a7, s1 + addw a7, s0, t5 + slli s1, a7, 1 srli t5, s1, 56 - slli s1, t3, 5 - add s2, s0, t5 - andi s3, s2, -256 - addw s2, s1, t3 - subw t5, s0, s3 - slli s0, s2, 1 - srli t3, s0, 56 - slli s0, t2, 5 - add s1, s2, t3 - andi s3, s1, -256 - addw s1, s0, t2 - subw t3, s2, s3 - slli s2, s1, 1 - srli t2, s2, 56 - slli s2, t1, 5 - add s0, s1, t2 - andi s3, s0, -256 - addw s0, s2, t1 - subw t2, s1, s3 - slli s2, t2, 32 - slli s3, s0, 1 - slli t2, t5, 32 - srli t1, s3, 56 - add s4, s0, t1 - andi s1, s4, -256 - subw t1, s0, s1 - add.uw s0, t3, t2 - add.uw s3, t1, s2 - slli t2, t4, 32 - slli t1, a7, 32 - sd s3, 0(a5) - add.uw t3, a6, t1 - sd s0, 8(a5) - add.uw t1, t6, t2 - sd t3, 16(a5) - li t2, 125 - sd t1, 24(a5) - slli t1, t2, 8 - blt t0, t1, label9 - li t1, 125 - li t3, 128 - ld a5, 104(sp) - slli t0, t1, 10 - addi t1, t0, 8 - add t2, a5, t0 - srli t0, t1, 2 - add a5, a5, t1 - sw t3, 0(t2) - mv t3, a2 - j label11 -.p2align 2 -label50: - addi t1, a5, 4 - mv t0, t2 - mv t3, a5 - mv a5, t1 + slli s1, t4, 5 + add s2, a7, t5 + andi s0, s2, -256 + subw t5, a7, s0 + addw a7, s1, t4 + slli s0, a7, 1 + srli t4, s0, 56 + slli s0, t5, 32 + add s2, a7, t4 + slli t5, t6, 32 
+ andi s1, s2, -256 + subw t4, a7, s1 + add.uw a7, t4, s0 + add.uw t4, a6, t5 + sd a7, 0(t2) + sd t4, 8(t2) + bge t3, a1, label95 + addi t2, t2, 16 + j label6 .p2align 2 -label11: - andi t1, t0, 63 - addiw t2, t0, 1 - li t4, 60 - sw zero, 0(t3) - bne t1, t4, label50 +label95: + li t4, 125 + li t5, 128 + mv t6, a5 + li a6, 60 + slli t3, t4, 10 + addi t4, t3, 8 + add t2, a4, t3 + srli t3, t4, 2 + sw t5, 0(t2) + addiw t5, t3, 1 + add t2, a4, t4 sw zero, 0(a5) - li t1, 125 - addiw t0, t0, 4 - mv t6, zero - lui t3, 422994 - lui t5, 982235 - lui a7, 802094 - addiw t2, t5, -1143 - sw zero, 4(a5) - addiw t5, a7, 496 - xori t4, t2, -1 - sw t1, 8(a5) - addiw t1, t3, 769 - sw zero, 12(a5) - xori t3, t1, -1 + andi t4, t3, 63 + beq t4, a6, label16 +.p2align 2 +label51: + addi t4, t2, 4 + mv t3, t5 + mv t6, t2 + mv t2, t4 + andi t4, t5, 63 + addiw t5, t5, 1 + li a6, 60 + sw zero, 0(t6) + bne t4, a6, label51 +.p2align 2 +label16: + sw zero, 0(t2) + li t4, 125 + addiw t3, t3, 4 + mv s0, zero + lui t6, 422994 + lui a7, 982235 + lui s2, 802094 + addiw t5, a7, -1143 + sw zero, 4(t2) + addiw a7, s2, 496 + sw t4, 8(t2) + addiw t4, t6, 769 + sw zero, 12(t2) + xori t6, t5, -1 + xori a6, t4, -1 + mv t2, a4 + sd zero, 104(sp) sd zero, 112(sp) sd zero, 120(sp) sd zero, 128(sp) @@ -281,364 +227,258 @@ label11: sd zero, 400(sp) sd zero, 408(sp) sd zero, 416(sp) - sd zero, 424(sp) - ld a5, 104(sp) - addi a6, sp, 112 - mv a7, zero - j label24 + addi s1, sp, 104 + mv s2, zero + j label25 .p2align 2 -label27: - addi a6, a6, 32 +label352: + addiw s0, s0, 64 + addw t4, t4, s8 + addw t5, t5, s3 + addw a6, a6, s9 + addw t6, t6, s2 + addw a7, a7, s4 + ble t3, s0, label49 + addi t2, t2, 256 + addi s1, sp, 104 + mv s2, zero .p2align 2 -label24: - slliw s2, a7, 2 - sh2add s0, s2, a5 - lw s1, 0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s5 - slli s6, s4, 8 - lw s3, 12(s0) - addiw s4, a7, 1 - addw s1, s2, s6 - slliw s2, s4, 2 - addw s5, s1, s3 - sh2add s0, s2, a5 - sw s5, 0(a6) - lw s1, 0(s0) - slli s6, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s6 - slli s5, s4, 8 - lw s3, 12(s0) - addw s1, s2, s5 - addiw s2, a7, 2 - addw s4, s1, s3 - slliw s3, s2, 2 - sh2add s0, s3, a5 - sw s4, 4(a6) - lw s1, 0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s5 - slli s6, s4, 8 - lw s3, 12(s0) - addw s1, s2, s6 - addiw s2, a7, 3 - addw s4, s1, s3 - slliw s3, s2, 2 - sh2add s0, s3, a5 - sw s4, 8(a6) - lw s1, 0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - lw s6, 8(s0) - slli s3, s4, 16 - slli s4, s6, 8 - addw s2, s3, s5 - lw s3, 12(s0) - addw s1, s2, s4 - addiw s4, a7, 4 - addw s5, s1, s3 - slliw s2, s4, 2 - sh2add s0, s2, a5 - sw s5, 12(a6) - lw s1, 0(s0) - slli s6, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s6 - slli s5, s4, 8 - lw s3, 12(s0) - addw s1, s2, s5 - addiw s2, a7, 5 - addw s4, s1, s3 - slliw s3, s2, 2 - sh2add s0, s3, a5 - sw s4, 16(a6) - lw s1, 0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s5 - slli s6, s4, 8 - addiw s5, a7, 6 - lw s3, 12(s0) - addw s1, s2, s6 - addw s4, s1, s3 - slliw s1, s5, 2 - sh2add s0, s1, a5 - sw s4, 20(a6) - lw s2, 0(s0) - slli s5, s2, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s1, s3, s5 - slli s6, s4, 8 - lw s3, 12(s0) - addiw s4, a7, 7 - addw s2, s1, s6 - addiw a7, a7, 8 - addw s5, s2, s3 - slliw s2, s4, 2 - sh2add s0, s2, a5 - sw s5, 24(a6) - lw s1, 0(s0) - slli s6, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - 
addw s2, s3, s6 - slli s5, s4, 8 - lw s3, 12(s0) - addw s1, s2, s5 - li s0, 16 - addw s4, s1, s3 - sw s4, 28(a6) - blt a7, s0, label27 - addi a6, sp, 112 - li a7, 16 - addi a6, a6, 64 +label25: + slliw s4, s2, 2 + sh2add s3, s4, t2 + lw s5, 0(s3) + slli s8, s5, 24 + lw s7, 4(s3) + lw s9, 8(s3) + slli s6, s7, 16 + slli s7, s9, 8 + addw s4, s6, s8 + lw s8, 12(s3) + addw s5, s4, s7 + addiw s7, s2, 1 + addw s6, s5, s8 + slliw s5, s7, 2 + sh2add s3, s5, t2 + sw s6, 0(s1) + lw s4, 0(s3) + slli s9, s4, 24 + lw s7, 4(s3) + lw s8, 8(s3) + slli s6, s7, 16 + slli s7, s8, 8 + addw s5, s6, s9 + lw s6, 12(s3) + addw s4, s5, s7 + addiw s7, s2, 2 + addw s8, s4, s6 + slliw s5, s7, 2 + sh2add s3, s5, t2 + sw s8, 4(s1) + lw s4, 0(s3) + slli s8, s4, 24 + lw s7, 4(s3) + lw s9, 8(s3) + slli s6, s7, 16 + slli s7, s9, 8 + addw s5, s6, s8 + lw s6, 12(s3) + addw s4, s5, s7 + addiw s5, s2, 3 + addw s8, s4, s6 + addiw s2, s2, 4 + slliw s6, s5, 2 + sh2add s3, s6, t2 + sw s8, 8(s1) + lw s4, 0(s3) + slli s8, s4, 24 + lw s7, 4(s3) + slli s6, s7, 16 + lw s7, 8(s3) + addw s5, s6, s8 + slli s9, s7, 8 + lw s8, 12(s3) + addw s4, s5, s9 + li s3, 16 + addw s6, s4, s8 + sw s6, 12(s1) + bge s2, s3, label263 + addi s1, s1, 16 + j label25 .p2align 2 -label29: - lw s0, -12(a6) - addiw a7, a7, 8 - lw s2, -32(a6) - lw s4, -56(a6) - addw s3, s0, s2 - lw s7, -64(a6) - addw s6, s3, s7 - subw s1, s4, s6 - srliw s5, s1, 31 - add s3, s1, s5 - andi s7, s3, -2 - subw s6, s1, s7 - sh1add s5, s1, s6 - sw s5, 0(a6) - lw s7, -8(a6) - lw s1, -28(a6) - addw s8, s7, s1 - lw s3, -52(a6) - lw s11, -60(a6) - addw s10, s5, s3 - addw s9, s8, s11 - subw s6, s3, s9 - srliw s5, s6, 31 - add s11, s6, s5 - andi s3, s11, -2 - subw s8, s6, s3 - sh1add s5, s6, s8 - sw s5, 4(a6) - lw s3, -4(a6) - lw s11, -24(a6) - addw s6, s4, s3 - lw s9, -48(a6) - addw s8, s6, s11 - subw s4, s9, s8 - srliw s6, s4, 31 - add s11, s4, s6 - andi s8, s11, -2 - subw s6, s4, s8 - sh1add s11, s4, s6 - sw s11, 8(a6) - lw s8, -20(a6) - addw s11, s10, s8 - lw s8, -44(a6) - subw s10, s8, s11 - srliw s11, s10, 31 - sh1add s7, s10, s7 - add s11, s10, s11 - andi s11, s11, -2 - subw s11, s10, s11 - sh1add s10, s10, s11 - addw s7, s7, s11 - sw s10, 12(a6) - lw s11, -16(a6) - lw s10, -40(a6) - addw s7, s7, s10 - subw s7, s2, s7 - srliw s2, s7, 31 - add s2, s7, s2 - andi s2, s2, -2 - subw s2, s7, s2 - sh1add s2, s7, s2 - addw s7, s5, s9 - addw s5, s11, s7 - subw s7, s10, s5 - srliw s9, s7, 31 - add s11, s7, s9 - andi s10, s11, -2 - sh1add s11, s4, s0 - subw s5, s7, s10 - addw s10, s6, s11 - sh1add s9, s7, s5 - sw s9, 16(a6) - lw s0, -36(a6) - addw s9, s8, s10 - subw s4, s0, s9 - srliw s6, s4, 31 - add s8, s4, s6 - andi s10, s8, -2 - sh1add s8, s7, s3 - subw s9, s4, s10 - sh1add s6, s4, s9 - addw s4, s5, s8 - sw s6, 20(a6) - addw s6, s0, s4 - subw s3, s1, s6 - srliw s5, s3, 31 - add s4, s3, s5 - andi s0, s4, -2 - subw s6, s3, s0 - sh1add s5, s3, s6 - slli s1, s5, 32 - add.uw s0, s2, s1 - sd s0, 24(a6) - bge a7, a0, label32 - addi a6, a6, 32 - j label29 +label263: + addi s1, sp, 104 + li s2, 16 + addi s1, s1, 64 .p2align 2 -label32: - addi a6, sp, 112 - mv s2, zero - mv s0, t1 - mv s3, t2 - mv a7, t3 - mv s1, t4 - mv s4, t5 - blt zero, a1, label461 - addw s6, t2, t3 - li s7, 1 - mv s9, zero - subw s8, s6, t4 - bne s7, zero, label848 - mv s9, s8 - li s6, 1 - mv s5, s8 - bne s6, zero, label850 - lui s10, 586172 - lui s11, 828972 - lui s9, 454047 - addiw s8, s11, 262 - addiw s7, s9, -1151 - j label880 +label30: + lw s4, -12(s1) + addiw s2, s2, 4 + lw s7, -32(s1) + lw s3, -56(s1) + addw s5, s4, s7 + lw 
s6, -64(s1) + addw s7, s5, s6 + subw s4, s3, s7 + srliw s8, s4, 31 + add s6, s4, s8 + andi s7, s6, -2 + subw s9, s4, s7 + sh1add s5, s4, s9 + sw s5, 0(s1) + lw s7, -8(s1) + lw s8, -28(s1) + addw s9, s7, s8 + lw s6, -52(s1) + lw s10, -60(s1) + addw s4, s5, s6 + addw s11, s9, s10 + subw s5, s6, s11 + srliw s7, s5, 31 + add s8, s5, s7 + andi s6, s8, -2 + subw s9, s5, s6 + sh1add s10, s5, s9 + sw s10, 4(s1) + lw s7, -4(s1) + lw s9, -24(s1) + addw s5, s3, s7 + lw s8, -48(s1) + addw s6, s5, s9 + subw s3, s8, s6 + srliw s7, s3, 31 + add s9, s3, s7 + andi s6, s9, -2 + subw s5, s3, s6 + sh1add s7, s3, s5 + sw s7, 8(s1) + lw s6, -20(s1) + lw s7, -44(s1) + addw s5, s4, s6 + subw s3, s7, s5 + srliw s4, s3, 31 + add s6, s3, s4 + andi s5, s6, -2 + subw s7, s3, s5 + sh1add s4, s3, s7 + sw s4, 12(s1) + bge s2, a2, label323 + addi s1, s1, 16 + j label30 .p2align 2 -label461: - lui s6, 370728 +label323: + addi s1, sp, 104 mv s5, zero - addiw s7, s6, -1639 + mv s3, t4 + mv s6, t5 + mv s2, a6 + mv s4, t6 + mv s7, a7 + blt zero, a3, label329 + addw s9, t5, a6 + li s10, 1 + mv s11, zero + subw s8, s9, t6 + bne s10, zero, label619 + mv s11, s8 + li s9, 1 + bne s9, zero, label621 + lui s10, 828972 + addiw s11, s10, 262 + lui s10, 454047 + addiw s10, s10, -1151 + j label656 .p2align 2 -label42: - slliw s8, s0, 5 - addiw s2, s2, 1 - addw s10, s4, s8 - slli s8, s0, 1 - addw s9, s7, s10 - srli s4, s8, 59 - addw s6, s5, s9 - add s9, s0, s4 - slliw s4, s3, 30 - andi s5, s9, -32 - lw s9, 0(a6) - subw s8, s0, s5 - addw s7, s6, s8 - slli s8, s3, 1 - addw s5, s7, s9 - srli s10, s8, 34 - add s6, s3, s10 - sraiw s9, s6, 30 - slli s7, s9, 30 - subw s8, s3, s7 - addw s6, s4, s8 - bge s2, a0, label484 - addi a6, a6, 4 - mv s3, s0 - mv s4, s1 - mv s0, s5 - mv s1, a7 - mv a7, s6 - blt s2, a1, label461 - addw s6, s3, s6 - slti s7, s2, 60 - mv s9, zero - subw s8, s6, s1 - bne s7, zero, label848 - mv s9, s8 - slti s6, s2, 40 - mv s5, s8 - bne s6, zero, label850 - lui s10, 586172 - lui s11, 828972 - lui s9, 454047 - addiw s8, s11, 262 - addiw s7, s9, -1151 - j label880 +label329: + lui s9, 370728 + mv s8, zero + addiw s10, s9, -1639 .p2align 2 -label848: - slti s6, s2, 40 - mv s5, s8 - bne s6, zero, label850 - mv s5, s9 +label44: + slliw s9, s3, 5 + addiw s5, s5, 1 + addw s11, s7, s9 + addw s7, s10, s11 + slli s11, s3, 1 + addw s9, s8, s7 + srli s10, s11, 59 + lw s11, 0(s1) + add s8, s3, s10 + andi s7, s8, -32 + subw s8, s3, s7 + slliw s7, s6, 30 + addw s10, s9, s8 + addw s8, s10, s11 + slli s11, s6, 1 + srli s10, s11, 34 + add s9, s6, s10 + sraiw s11, s9, 30 + slli s9, s11, 30 + subw s10, s6, s9 + addw s9, s7, s10 + bge s5, a2, label352 + addi s1, s1, 4 + mv s6, s3 + mv s7, s4 + mv s3, s8 + mv s4, s2 + mv s2, s9 + blt s5, a3, label329 + addw s9, s6, s9 + slti s10, s5, 60 + mv s11, zero + subw s8, s9, s4 + bne s10, zero, label619 + mv s11, s8 + slti s9, s5, 40 + bne s9, zero, label621 + lui s10, 828972 + addiw s11, s10, 262 + lui s10, 454047 + addiw s10, s10, -1151 + j label656 .p2align 2 -label850: - lui s10, 586172 - addiw s8, s10, -804 - beq s7, zero, label851 +label619: + slti s9, s5, 40 + bne s9, zero, label621 + mv s8, s11 .p2align 2 -label852: - lui s9, 454047 - addiw s7, s9, -1151 - bne s6, zero, label42 +label621: + lui s11, 586172 + addiw s11, s11, -804 + beq s10, zero, label622 .p2align 2 -label880: - mv s7, s8 - j label42 +label623: + lui s10, 454047 + addiw s10, s10, -1151 + bne s9, zero, label44 .p2align 2 -label484: - addiw t6, t6, 64 - addw t1, t1, s5 - addw t2, t2, s0 - addw t3, t3, s6 - addw t4, t4, a7 - addw t5, 
t5, s1 - ble t0, t6, label47 - addi a5, a5, 256 - addi a6, sp, 112 - mv a7, zero - j label24 -label51: +label656: + mv s10, s11 + j label44 +label2: li a0, 184 jal _sysy_stoptime li a0, 5 - addi a1, sp, 432 + addi a1, sp, 424 jal putarray - ld ra, 0(sp) mv a0, zero + ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) ld s1, 24(sp) ld s6, 32(sp) ld s2, 40(sp) - ld s3, 48(sp) - ld s4, 56(sp) + ld s4, 48(sp) + ld s3, 56(sp) ld s7, 64(sp) - ld s10, 72(sp) - ld s8, 80(sp) - ld s11, 88(sp) - ld s9, 96(sp) - addi sp, sp, 456 + ld s8, 72(sp) + ld s9, 80(sp) + ld s10, 88(sp) + ld s11, 96(sp) + addi sp, sp, 448 ret .p2align 2 -label851: - lui s11, 828972 - addiw s8, s11, 262 - j label852 +label622: + lui s10, 828972 + addiw s11, s10, 262 + j label623 diff --git a/tests/SysY2022/performance/crypto-1.sy.ir b/tests/SysY2022/performance/crypto-1.sy.ir index 4840c8cb7..adad5665d 100644 --- a/tests/SysY2022/performance/crypto-1.sy.ir +++ b/tests/SysY2022/performance/crypto-1.sy.ir @@ -107,16 +107,16 @@ func @main() -> i32 { NoRecurse Entry } { i32* %91 = getelementptr &([32768 * i32]* %8)[i64 0][i64 32001]; cbr i1 %2(prob = 0.984615), ^while.body, ^b; ^while.body: - i32 %92 = phi [^entry, i32 %0] [^b4, i32 %120]; - i32 %93 = phi [^entry, i32 %1] [^b4, i32 %448]; + i32 %92 = phi [^entry, i32 %0] [^b4, i32 %108]; + i32 %93 = phi [^entry, i32 %1] [^b4, i32 %319]; ubr ^while.body1; ^b: call (i32) -> void @stoptime(i32 184); call (i32, i32*) -> void @putarray(i32 5, i32* %3); ret i32 0; ^while.body1: - i32 %94 = phi [^while.body, i32 %92] [^while.body1, i32 %120]; - i32 %95 = phi [^while.body, i32 0] [^while.body1, i32 %144]; + i32 %94 = phi [^while.body, i32 %92] [^while.body1, i32 %108]; + i32 %95 = phi [^while.body, i32 0] [^while.body1, i32 %120]; i32 %96 = mul i32 %94, i32 8193; i32 %97 = sdiv i32 %96, i32 131072; i32 %98 = add i32 %96, i32 %97; @@ -129,74 +129,46 @@ func @main() -> i32 { NoRecurse Entry } { i32 %105 = mul i32 %104, i32 270369; i32 %106 = sdiv i32 %105, i32 131072; i32 %107 = add i32 %105, i32 %106; - i32 %108 = mul i32 %107, i32 270369; - i32 %109 = sdiv i32 %108, i32 131072; - i32 %110 = add i32 %108, i32 %109; - i32 %111 = mul i32 %110, i32 270369; - i32 %112 = sdiv i32 %111, i32 131072; - i32 %113 = add i32 %111, i32 %112; - i32 %114 = mul i32 %113, i32 270369; - i32 %115 = sdiv i32 %114, i32 131072; - i32 %116 = add i32 %114, i32 %115; - i32 %117 = mul i32 %116, i32 270369; - i32 %118 = sdiv i32 %117, i32 131072; - i32 %119 = add i32 %117, i32 %118; - i32 %120 = mul i32 %119, i32 33; - i32 %121 = srem i32 %120, i32 256; - i32 %122 = mul i32 %116, i32 33; - i32 %123 = srem i32 %122, i32 256; - i32 %124 = mul i32 %113, i32 33; - i32 %125 = srem i32 %124, i32 256; - i32 %126 = mul i32 %110, i32 33; - i32 %127 = srem i32 %126, i32 256; - i32 %128 = mul i32 %107, i32 33; - i32 %129 = srem i32 %128, i32 256; - i32 %130 = mul i32 %104, i32 33; - i32 %131 = srem i32 %130, i32 256; - i32 %132 = mul i32 %101, i32 33; - i32 %133 = srem i32 %132, i32 256; - i32 %134 = mul i32 %98, i32 33; - i32 %135 = srem i32 %134, i32 256; - i32* %136 = getelementptr &([32768 * i32]* %8)[i64 0][i32 %95]; - store i32* %136 with i32 %135; - i32* %137 = getelementptr &(i32* %136)[i64 1]; - store i32* %137 with i32 %133; - i32* %138 = getelementptr &(i32* %136)[i64 2]; - store i32* %138 with i32 %131; - i32* %139 = getelementptr &(i32* %136)[i64 3]; - store i32* %139 with i32 %129; - i32* %140 = getelementptr &(i32* %136)[i64 4]; - store i32* %140 with i32 %127; - i32* %141 = getelementptr &(i32* %136)[i64 5]; - store i32* 
%141 with i32 %125; - i32* %142 = getelementptr &(i32* %136)[i64 6]; - store i32* %142 with i32 %123; - i32* %143 = getelementptr &(i32* %136)[i64 7]; - store i32* %143 with i32 %121; - i32 %144 = add i32 %95, i32 8; - i1 %145 = icmp slt i32 %144, i32 32000; - cbr i1 %145(prob = 0.99975), ^while.body1, ^postbody; + i32 %108 = mul i32 %107, i32 33; + i32 %109 = srem i32 %108, i32 256; + i32 %110 = mul i32 %104, i32 33; + i32 %111 = srem i32 %110, i32 256; + i32 %112 = mul i32 %101, i32 33; + i32 %113 = srem i32 %112, i32 256; + i32 %114 = mul i32 %98, i32 33; + i32 %115 = srem i32 %114, i32 256; + i32* %116 = getelementptr &([32768 * i32]* %8)[i64 0][i32 %95]; + store i32* %116 with i32 %115; + i32* %117 = getelementptr &(i32* %116)[i64 1]; + store i32* %117 with i32 %113; + i32* %118 = getelementptr &(i32* %116)[i64 2]; + store i32* %118 with i32 %111; + i32* %119 = getelementptr &(i32* %116)[i64 3]; + store i32* %119 with i32 %109; + i32 %120 = add i32 %95, i32 4; + i1 %121 = icmp slt i32 %120, i32 32000; + cbr i1 %121(prob = 0.999875), ^while.body1, ^postbody; ^postbody: store i32* %90 with i32 128; ubr ^while.body2; ^while.body2: - i32 %146 = phi [^postbody, i32 32002] [^while.body2, i32 %151]; - i32* %147 = phi [^postbody, i32* %91] [^while.body2, i32* %150]; - i32 %148 = and i32 %146, i32 63; - i1 %149 = icmp neq i32 %148, i32 60; - i32* %150 = getelementptr &(i32* %9)[i32 %146]; - store i32* %147 with i32 0; - i32 %151 = add i32 %146, i32 1; - cbr i1 %149(prob = 0.984615), ^while.body2, ^b1; + i32 %122 = phi [^postbody, i32 32002] [^while.body2, i32 %127]; + i32* %123 = phi [^postbody, i32* %91] [^while.body2, i32* %126]; + i32 %124 = and i32 %122, i32 63; + i1 %125 = icmp neq i32 %124, i32 60; + i32* %126 = getelementptr &(i32* %9)[i32 %122]; + store i32* %123 with i32 0; + i32 %127 = add i32 %122, i32 1; + cbr i1 %125(prob = 0.984615), ^while.body2, ^b1; ^b1: - store i32* %150 with i32 0; - i32* %152 = getelementptr &(i32* %150)[i64 1]; - store i32* %152 with i32 0; - i32* %153 = getelementptr &(i32* %150)[i64 2]; - store i32* %153 with i32 125; - i32* %154 = getelementptr &(i32* %150)[i64 3]; - store i32* %154 with i32 0; - i32 %155 = add i32 %146, i32 4; + store i32* %126 with i32 0; + i32* %128 = getelementptr &(i32* %126)[i64 1]; + store i32* %128 with i32 0; + i32* %129 = getelementptr &(i32* %126)[i64 2]; + store i32* %129 with i32 125; + i32* %130 = getelementptr &(i32* %126)[i64 3]; + store i32* %130 with i32 0; + i32 %131 = add i32 %122, i32 4; store i32* %10 with i32 0; store i32* %11 with i32 0; store i32* %12 with i32 0; @@ -279,334 +251,221 @@ func @main() -> i32 { NoRecurse Entry } { store i32* %89 with i32 0; ubr ^while.body3; ^while.body3: - i32 %156 = phi [^b1, i32 0] [^b3, i32 %426]; - i32 %157 = phi [^b1, i32 1732584193] [^b3, i32 %428]; - i32 %158 = phi [^b1, i32 -271733879] [^b3, i32 %429]; - i32 %159 = phi [^b1, i32 -1732584194] [^b3, i32 %430]; - i32 %160 = phi [^b1, i32 271733878] [^b3, i32 %431]; - i32 %161 = phi [^b1, i32 -1009589776] [^b3, i32 %432]; - i32* %162 = getelementptr &(i32* %9)[i32 %156]; + i32 %132 = phi [^b1, i32 0] [^b3, i32 %297]; + i32 %133 = phi [^b1, i32 1732584193] [^b3, i32 %299]; + i32 %134 = phi [^b1, i32 -271733879] [^b3, i32 %300]; + i32 %135 = phi [^b1, i32 -1732584194] [^b3, i32 %301]; + i32 %136 = phi [^b1, i32 271733878] [^b3, i32 %302]; + i32 %137 = phi [^b1, i32 -1009589776] [^b3, i32 %303]; + i32* %138 = getelementptr &(i32* %9)[i32 %132]; ubr ^while.body4; ^while.body4: - i32 %163 = phi [^while.body3, i32 0] [^while.body4, 
i32 %299]; - i32 %164 = mul i32 %163, i32 4; - i32* %165 = getelementptr &(i32* %162)[i32 %164]; + i32 %139 = phi [^while.body3, i32 0] [^while.body4, i32 %207]; + i32 %140 = mul i32 %139, i32 4; + i32* %141 = getelementptr &(i32* %138)[i32 %140]; + i32 %142 = load i32* %141; + i32* %143 = getelementptr &(i32* %141)[i64 1]; + i32 %144 = load i32* %143; + i32 %145 = mul i32 %144, i32 65536; + i32 %146 = mul i32 %142, i32 16777216; + i32 %147 = add i32 %145, i32 %146; + i32* %148 = getelementptr &(i32* %141)[i64 2]; + i32 %149 = load i32* %148; + i32 %150 = mul i32 %149, i32 256; + i32 %151 = add i32 %147, i32 %150; + i32* %152 = getelementptr &(i32* %141)[i64 3]; + i32 %153 = load i32* %152; + i32 %154 = add i32 %151, i32 %153; + i32* %155 = getelementptr &([80 * i32]* %words)[i64 0][i32 %139]; + store i32* %155 with i32 %154; + i32 %156 = add i32 %139, i32 1; + i32 %157 = mul i32 %156, i32 4; + i32* %158 = getelementptr &(i32* %138)[i32 %157]; + i32 %159 = load i32* %158; + i32* %160 = getelementptr &(i32* %158)[i64 1]; + i32 %161 = load i32* %160; + i32 %162 = mul i32 %161, i32 65536; + i32 %163 = mul i32 %159, i32 16777216; + i32 %164 = add i32 %162, i32 %163; + i32* %165 = getelementptr &(i32* %158)[i64 2]; i32 %166 = load i32* %165; - i32* %167 = getelementptr &(i32* %165)[i64 1]; - i32 %168 = load i32* %167; - i32 %169 = mul i32 %168, i32 65536; - i32 %170 = mul i32 %166, i32 16777216; - i32 %171 = add i32 %169, i32 %170; - i32* %172 = getelementptr &(i32* %165)[i64 2]; - i32 %173 = load i32* %172; - i32 %174 = mul i32 %173, i32 256; - i32 %175 = add i32 %171, i32 %174; - i32* %176 = getelementptr &(i32* %165)[i64 3]; - i32 %177 = load i32* %176; - i32 %178 = add i32 %175, i32 %177; - i32* %179 = getelementptr &([80 * i32]* %words)[i64 0][i32 %163]; - store i32* %179 with i32 %178; - i32 %180 = add i32 %163, i32 1; - i32 %181 = mul i32 %180, i32 4; - i32* %182 = getelementptr &(i32* %162)[i32 %181]; + i32 %167 = mul i32 %166, i32 256; + i32 %168 = add i32 %164, i32 %167; + i32* %169 = getelementptr &(i32* %158)[i64 3]; + i32 %170 = load i32* %169; + i32 %171 = add i32 %168, i32 %170; + i32* %172 = getelementptr &(i32* %155)[i64 1]; + store i32* %172 with i32 %171; + i32 %173 = add i32 %139, i32 2; + i32 %174 = mul i32 %173, i32 4; + i32* %175 = getelementptr &(i32* %138)[i32 %174]; + i32 %176 = load i32* %175; + i32* %177 = getelementptr &(i32* %175)[i64 1]; + i32 %178 = load i32* %177; + i32 %179 = mul i32 %178, i32 65536; + i32 %180 = mul i32 %176, i32 16777216; + i32 %181 = add i32 %179, i32 %180; + i32* %182 = getelementptr &(i32* %175)[i64 2]; i32 %183 = load i32* %182; - i32* %184 = getelementptr &(i32* %182)[i64 1]; - i32 %185 = load i32* %184; - i32 %186 = mul i32 %185, i32 65536; - i32 %187 = mul i32 %183, i32 16777216; - i32 %188 = add i32 %186, i32 %187; - i32* %189 = getelementptr &(i32* %182)[i64 2]; - i32 %190 = load i32* %189; - i32 %191 = mul i32 %190, i32 256; - i32 %192 = add i32 %188, i32 %191; - i32* %193 = getelementptr &(i32* %182)[i64 3]; - i32 %194 = load i32* %193; - i32 %195 = add i32 %192, i32 %194; - i32* %196 = getelementptr &(i32* %179)[i64 1]; - store i32* %196 with i32 %195; - i32 %197 = add i32 %163, i32 2; - i32 %198 = mul i32 %197, i32 4; - i32* %199 = getelementptr &(i32* %162)[i32 %198]; + i32 %184 = mul i32 %183, i32 256; + i32 %185 = add i32 %181, i32 %184; + i32* %186 = getelementptr &(i32* %175)[i64 3]; + i32 %187 = load i32* %186; + i32 %188 = add i32 %185, i32 %187; + i32* %189 = getelementptr &(i32* %155)[i64 2]; + store i32* %189 with i32 
%188; + i32 %190 = add i32 %139, i32 3; + i32 %191 = mul i32 %190, i32 4; + i32* %192 = getelementptr &(i32* %138)[i32 %191]; + i32 %193 = load i32* %192; + i32* %194 = getelementptr &(i32* %192)[i64 1]; + i32 %195 = load i32* %194; + i32 %196 = mul i32 %195, i32 65536; + i32 %197 = mul i32 %193, i32 16777216; + i32 %198 = add i32 %196, i32 %197; + i32* %199 = getelementptr &(i32* %192)[i64 2]; i32 %200 = load i32* %199; - i32* %201 = getelementptr &(i32* %199)[i64 1]; - i32 %202 = load i32* %201; - i32 %203 = mul i32 %202, i32 65536; - i32 %204 = mul i32 %200, i32 16777216; - i32 %205 = add i32 %203, i32 %204; - i32* %206 = getelementptr &(i32* %199)[i64 2]; - i32 %207 = load i32* %206; - i32 %208 = mul i32 %207, i32 256; - i32 %209 = add i32 %205, i32 %208; - i32* %210 = getelementptr &(i32* %199)[i64 3]; - i32 %211 = load i32* %210; - i32 %212 = add i32 %209, i32 %211; - i32* %213 = getelementptr &(i32* %179)[i64 2]; - store i32* %213 with i32 %212; - i32 %214 = add i32 %163, i32 3; - i32 %215 = mul i32 %214, i32 4; - i32* %216 = getelementptr &(i32* %162)[i32 %215]; + i32 %201 = mul i32 %200, i32 256; + i32 %202 = add i32 %198, i32 %201; + i32* %203 = getelementptr &(i32* %192)[i64 3]; + i32 %204 = load i32* %203; + i32 %205 = add i32 %202, i32 %204; + i32* %206 = getelementptr &(i32* %155)[i64 3]; + store i32* %206 with i32 %205; + i32 %207 = add i32 %139, i32 4; + i1 %208 = icmp slt i32 %207, i32 16; + cbr i1 %208(prob = 0.75), ^while.body4, ^while.body5; + ^while.body5: + i32 %209 = phi [^while.body4, i32 16] [^while.body5, i32 %264]; + i32* %210 = getelementptr &([80 * i32]* %words)[i64 0][i32 %209]; + i32* %211 = getelementptr &(i32* %210)[i64 -3]; + i32 %212 = load i32* %211; + i32* %213 = getelementptr &(i32* %210)[i64 -8]; + i32 %214 = load i32* %213; + i32 %215 = add i32 %212, i32 %214; + i32* %216 = getelementptr &(i32* %210)[i64 -14]; i32 %217 = load i32* %216; - i32* %218 = getelementptr &(i32* %216)[i64 1]; + i32* %218 = getelementptr &(i32* %210)[i64 -16]; i32 %219 = load i32* %218; - i32 %220 = mul i32 %219, i32 65536; - i32 %221 = mul i32 %217, i32 16777216; - i32 %222 = add i32 %220, i32 %221; - i32* %223 = getelementptr &(i32* %216)[i64 2]; - i32 %224 = load i32* %223; - i32 %225 = mul i32 %224, i32 256; - i32 %226 = add i32 %222, i32 %225; - i32* %227 = getelementptr &(i32* %216)[i64 3]; + i32 %220 = add i32 %215, i32 %219; + i32 %221 = sub i32 %217, i32 %220; + i32 %222 = mul i32 %221, i32 2; + i32 %223 = srem i32 %221, i32 2; + i32 %224 = add i32 %222, i32 %223; + store i32* %210 with i32 %224; + i32* %225 = getelementptr &(i32* %210)[i64 -2]; + i32 %226 = load i32* %225; + i32* %227 = getelementptr &(i32* %210)[i64 -7]; i32 %228 = load i32* %227; - i32 %229 = add i32 %226, i32 %228; - i32* %230 = getelementptr &(i32* %179)[i64 3]; - store i32* %230 with i32 %229; - i32 %231 = add i32 %163, i32 4; - i32 %232 = mul i32 %231, i32 4; - i32* %233 = getelementptr &(i32* %162)[i32 %232]; + i32* %229 = getelementptr &(i32* %210)[i64 -13]; + i32 %230 = load i32* %229; + i32 %231 = add i32 %224, i32 %230; + i32 %232 = add i32 %226, i32 %228; + i32* %233 = getelementptr &(i32* %210)[i64 -15]; i32 %234 = load i32* %233; - i32* %235 = getelementptr &(i32* %233)[i64 1]; - i32 %236 = load i32* %235; - i32 %237 = mul i32 %236, i32 65536; - i32 %238 = mul i32 %234, i32 16777216; + i32 %235 = add i32 %232, i32 %234; + i32 %236 = sub i32 %230, i32 %235; + i32 %237 = mul i32 %236, i32 2; + i32 %238 = srem i32 %236, i32 2; i32 %239 = add i32 %237, i32 %238; - i32* %240 = getelementptr 
&(i32* %233)[i64 2]; - i32 %241 = load i32* %240; - i32 %242 = mul i32 %241, i32 256; - i32 %243 = add i32 %239, i32 %242; - i32* %244 = getelementptr &(i32* %233)[i64 3]; + i32* %240 = getelementptr &(i32* %210)[i64 1]; + store i32* %240 with i32 %239; + i32* %241 = getelementptr &(i32* %210)[i64 -1]; + i32 %242 = load i32* %241; + i32 %243 = add i32 %217, i32 %242; + i32* %244 = getelementptr &(i32* %210)[i64 -6]; i32 %245 = load i32* %244; i32 %246 = add i32 %243, i32 %245; - i32* %247 = getelementptr &(i32* %179)[i64 4]; - store i32* %247 with i32 %246; - i32 %248 = add i32 %163, i32 5; - i32 %249 = mul i32 %248, i32 4; - i32* %250 = getelementptr &(i32* %162)[i32 %249]; - i32 %251 = load i32* %250; - i32* %252 = getelementptr &(i32* %250)[i64 1]; - i32 %253 = load i32* %252; - i32 %254 = mul i32 %253, i32 65536; - i32 %255 = mul i32 %251, i32 16777216; - i32 %256 = add i32 %254, i32 %255; - i32* %257 = getelementptr &(i32* %250)[i64 2]; + i32* %247 = getelementptr &(i32* %210)[i64 -12]; + i32 %248 = load i32* %247; + i32 %249 = sub i32 %248, i32 %246; + i32 %250 = mul i32 %249, i32 2; + i32 %251 = srem i32 %249, i32 2; + i32 %252 = add i32 %250, i32 %251; + i32* %253 = getelementptr &(i32* %210)[i64 2]; + store i32* %253 with i32 %252; + i32* %254 = getelementptr &(i32* %210)[i64 -5]; + i32 %255 = load i32* %254; + i32 %256 = add i32 %231, i32 %255; + i32* %257 = getelementptr &(i32* %210)[i64 -11]; i32 %258 = load i32* %257; - i32 %259 = mul i32 %258, i32 256; - i32 %260 = add i32 %256, i32 %259; - i32* %261 = getelementptr &(i32* %250)[i64 3]; - i32 %262 = load i32* %261; - i32 %263 = add i32 %260, i32 %262; - i32* %264 = getelementptr &(i32* %179)[i64 5]; - store i32* %264 with i32 %263; - i32 %265 = add i32 %163, i32 6; - i32 %266 = mul i32 %265, i32 4; - i32* %267 = getelementptr &(i32* %162)[i32 %266]; - i32 %268 = load i32* %267; - i32* %269 = getelementptr &(i32* %267)[i64 1]; - i32 %270 = load i32* %269; - i32 %271 = mul i32 %270, i32 65536; - i32 %272 = mul i32 %268, i32 16777216; - i32 %273 = add i32 %271, i32 %272; - i32* %274 = getelementptr &(i32* %267)[i64 2]; - i32 %275 = load i32* %274; - i32 %276 = mul i32 %275, i32 256; - i32 %277 = add i32 %273, i32 %276; - i32* %278 = getelementptr &(i32* %267)[i64 3]; - i32 %279 = load i32* %278; - i32 %280 = add i32 %277, i32 %279; - i32* %281 = getelementptr &(i32* %179)[i64 6]; - store i32* %281 with i32 %280; - i32 %282 = add i32 %163, i32 7; - i32 %283 = mul i32 %282, i32 4; - i32* %284 = getelementptr &(i32* %162)[i32 %283]; - i32 %285 = load i32* %284; - i32* %286 = getelementptr &(i32* %284)[i64 1]; - i32 %287 = load i32* %286; - i32 %288 = mul i32 %287, i32 65536; - i32 %289 = mul i32 %285, i32 16777216; - i32 %290 = add i32 %288, i32 %289; - i32* %291 = getelementptr &(i32* %284)[i64 2]; - i32 %292 = load i32* %291; - i32 %293 = mul i32 %292, i32 256; - i32 %294 = add i32 %290, i32 %293; - i32* %295 = getelementptr &(i32* %284)[i64 3]; - i32 %296 = load i32* %295; - i32 %297 = add i32 %294, i32 %296; - i32* %298 = getelementptr &(i32* %179)[i64 7]; - store i32* %298 with i32 %297; - i32 %299 = add i32 %163, i32 8; - i1 %300 = icmp slt i32 %299, i32 16; - cbr i1 %300(prob = 0.5), ^while.body4, ^while.body5; - ^while.body5: - i32 %301 = phi [^while.body4, i32 16] [^while.body5, i32 %393]; - i32* %302 = getelementptr &([80 * i32]* %words)[i64 0][i32 %301]; - i32* %303 = getelementptr &(i32* %302)[i64 -3]; - i32 %304 = load i32* %303; - i32* %305 = getelementptr &(i32* %302)[i64 -8]; - i32 %306 = load i32* %305; - i32 %307 
= add i32 %304, i32 %306; - i32* %308 = getelementptr &(i32* %302)[i64 -14]; - i32 %309 = load i32* %308; - i32* %310 = getelementptr &(i32* %302)[i64 -16]; - i32 %311 = load i32* %310; - i32 %312 = add i32 %307, i32 %311; - i32 %313 = sub i32 %309, i32 %312; - i32 %314 = mul i32 %313, i32 2; - i32 %315 = srem i32 %313, i32 2; - i32 %316 = add i32 %314, i32 %315; - store i32* %302 with i32 %316; - i32* %317 = getelementptr &(i32* %302)[i64 -2]; - i32 %318 = load i32* %317; - i32* %319 = getelementptr &(i32* %302)[i64 -7]; - i32 %320 = load i32* %319; - i32* %321 = getelementptr &(i32* %302)[i64 -13]; - i32 %322 = load i32* %321; - i32 %323 = add i32 %316, i32 %322; - i32 %324 = add i32 %318, i32 %320; - i32* %325 = getelementptr &(i32* %302)[i64 -15]; - i32 %326 = load i32* %325; - i32 %327 = add i32 %324, i32 %326; - i32 %328 = sub i32 %322, i32 %327; - i32 %329 = mul i32 %328, i32 2; - i32 %330 = srem i32 %328, i32 2; - i32 %331 = add i32 %329, i32 %330; - i32* %332 = getelementptr &(i32* %302)[i64 1]; - store i32* %332 with i32 %331; - i32* %333 = getelementptr &(i32* %302)[i64 -1]; - i32 %334 = load i32* %333; - i32 %335 = add i32 %309, i32 %334; - i32* %336 = getelementptr &(i32* %302)[i64 -6]; - i32 %337 = load i32* %336; - i32 %338 = add i32 %335, i32 %337; - i32* %339 = getelementptr &(i32* %302)[i64 -12]; - i32 %340 = load i32* %339; - i32 %341 = sub i32 %340, i32 %338; - i32 %342 = mul i32 %341, i32 2; - i32 %343 = srem i32 %341, i32 2; - i32 %344 = add i32 %342, i32 %343; - i32* %345 = getelementptr &(i32* %302)[i64 2]; - store i32* %345 with i32 %344; - i32* %346 = getelementptr &(i32* %302)[i64 -5]; - i32 %347 = load i32* %346; - i32 %348 = add i32 %323, i32 %347; - i32* %349 = getelementptr &(i32* %302)[i64 -11]; - i32 %350 = load i32* %349; - i32 %351 = sub i32 %350, i32 %348; - i32 %352 = mul i32 %351, i32 2; - i32 %353 = add i32 %318, i32 %352; - i32 %354 = srem i32 %351, i32 2; - i32 %355 = add i32 %353, i32 %354; - i32 %356 = add i32 %352, i32 %354; - i32* %357 = getelementptr &(i32* %302)[i64 3]; - store i32* %357 with i32 %356; - i32* %358 = getelementptr &(i32* %302)[i64 -4]; - i32 %359 = load i32* %358; - i32* %360 = getelementptr &(i32* %302)[i64 -10]; - i32 %361 = load i32* %360; - i32 %362 = add i32 %355, i32 %361; - i32 %363 = sub i32 %306, i32 %362; - i32 %364 = mul i32 %363, i32 2; - i32 %365 = srem i32 %363, i32 2; - i32 %366 = add i32 %364, i32 %365; - i32 %367 = add i32 %331, i32 %340; - i32 %368 = add i32 %359, i32 %367; - i32 %369 = sub i32 %361, i32 %368; - i32 %370 = mul i32 %369, i32 2; - i32 %371 = srem i32 %369, i32 2; - i32 %372 = add i32 %370, i32 %371; - i32* %373 = getelementptr &(i32* %302)[i64 4]; - store i32* %373 with i32 %372; - i32 %374 = add i32 %304, i32 %342; - i32 %375 = add i32 %343, i32 %374; - i32 %376 = add i32 %350, i32 %375; - i32* %377 = getelementptr &(i32* %302)[i64 -9]; - i32 %378 = load i32* %377; - i32 %379 = sub i32 %378, i32 %376; - i32 %380 = mul i32 %379, i32 2; - i32 %381 = srem i32 %379, i32 2; - i32 %382 = add i32 %380, i32 %381; - i32* %383 = getelementptr &(i32* %302)[i64 5]; - store i32* %383 with i32 %382; - i32* %384 = getelementptr &(i32* %302)[i64 6]; - store i32* %384 with i32 %366; - i32 %385 = add i32 %334, i32 %370; - i32 %386 = add i32 %371, i32 %385; - i32 %387 = add i32 %378, i32 %386; - i32 %388 = sub i32 %320, i32 %387; - i32 %389 = mul i32 %388, i32 2; - i32 %390 = srem i32 %388, i32 2; - i32 %391 = add i32 %389, i32 %390; - i32* %392 = getelementptr &(i32* %302)[i64 7]; - store i32* %392 with i32 
%391; - i32 %393 = add i32 %301, i32 8; - i1 %394 = icmp slt i32 %393, i32 80; - cbr i1 %394(prob = 0.875), ^while.body5, ^while.body6; + i32 %259 = sub i32 %258, i32 %256; + i32 %260 = mul i32 %259, i32 2; + i32 %261 = srem i32 %259, i32 2; + i32 %262 = add i32 %260, i32 %261; + i32* %263 = getelementptr &(i32* %210)[i64 3]; + store i32* %263 with i32 %262; + i32 %264 = add i32 %209, i32 4; + i1 %265 = icmp slt i32 %264, i32 80; + cbr i1 %265(prob = 0.9375), ^while.body5, ^while.body6; ^while.body6: - i32 %395 = phi [^while.body5, i32 0] [^b2, i32 %424]; - i32 %396 = phi [^while.body5, i32 %157] [^b2, i32 %420]; - i32 %397 = phi [^while.body5, i32 %158] [^b2, i32 %396]; - i32 %398 = phi [^while.body5, i32 %159] [^b2, i32 %423]; - i32 %399 = phi [^while.body5, i32 %160] [^b2, i32 %398]; - i32 %400 = phi [^while.body5, i32 %161] [^b2, i32 %399]; - i1 %401 = icmp slt i32 %395, i32 20; - cbr i1 %401(prob = 0.5), ^b2, ^if.else; + i32 %266 = phi [^while.body5, i32 0] [^b2, i32 %295]; + i32 %267 = phi [^while.body5, i32 %133] [^b2, i32 %291]; + i32 %268 = phi [^while.body5, i32 %134] [^b2, i32 %267]; + i32 %269 = phi [^while.body5, i32 %135] [^b2, i32 %294]; + i32 %270 = phi [^while.body5, i32 %136] [^b2, i32 %269]; + i32 %271 = phi [^while.body5, i32 %137] [^b2, i32 %270]; + i1 %272 = icmp slt i32 %266, i32 20; + cbr i1 %272(prob = 0.5), ^b2, ^if.else; ^if.else: - i32 %402 = add i32 %397, i32 %398; - i32 %403 = sub i32 %402, i32 %399; - i1 %404 = icmp slt i32 %395, i32 60; - i32 %405 = select i1 %404 ? i32 0 : i32 %403; - i1 %406 = icmp slt i32 %395, i32 40; - i32 %407 = select i1 %406 ? i32 %403 : i32 %405; - i32 %408 = select i1 %404 ? i32 -1894007588 : i32 -899497722; - i32 %409 = select i1 %406 ? i32 1859775361 : i32 %408; + i32 %273 = add i32 %268, i32 %269; + i32 %274 = sub i32 %273, i32 %270; + i1 %275 = icmp slt i32 %266, i32 60; + i32 %276 = select i1 %275 ? i32 0 : i32 %274; + i1 %277 = icmp slt i32 %266, i32 40; + i32 %278 = select i1 %277 ? i32 %274 : i32 %276; + i32 %279 = select i1 %275 ? i32 -1894007588 : i32 -899497722; + i32 %280 = select i1 %277 ? 
i32 1859775361 : i32 %279; ubr ^b2; ^b2: - i32 %410 = phi [^while.body6, i32 1518500249] [^if.else, i32 %409]; - i32 %411 = phi [^while.body6, i32 0] [^if.else, i32 %407]; - i32 %412 = mul i32 %396, i32 32; - i32 %413 = add i32 %400, i32 %412; - i32 %414 = add i32 %410, i32 %413; - i32 %415 = add i32 %411, i32 %414; - i32 %416 = srem i32 %396, i32 32; - i32 %417 = add i32 %415, i32 %416; - i32* %418 = getelementptr &([80 * i32]* %words)[i64 0][i32 %395]; - i32 %419 = load i32* %418; - i32 %420 = add i32 %417, i32 %419; - i32 %421 = mul i32 %397, i32 1073741824; - i32 %422 = srem i32 %397, i32 1073741824; - i32 %423 = add i32 %421, i32 %422; - i32 %424 = add i32 %395, i32 1; - i1 %425 = icmp slt i32 %424, i32 80; - cbr i1 %425(prob = 0.9875), ^while.body6, ^b3; + i32 %281 = phi [^while.body6, i32 1518500249] [^if.else, i32 %280]; + i32 %282 = phi [^while.body6, i32 0] [^if.else, i32 %278]; + i32 %283 = mul i32 %267, i32 32; + i32 %284 = add i32 %271, i32 %283; + i32 %285 = add i32 %281, i32 %284; + i32 %286 = add i32 %282, i32 %285; + i32 %287 = srem i32 %267, i32 32; + i32 %288 = add i32 %286, i32 %287; + i32* %289 = getelementptr &([80 * i32]* %words)[i64 0][i32 %266]; + i32 %290 = load i32* %289; + i32 %291 = add i32 %288, i32 %290; + i32 %292 = mul i32 %268, i32 1073741824; + i32 %293 = srem i32 %268, i32 1073741824; + i32 %294 = add i32 %292, i32 %293; + i32 %295 = add i32 %266, i32 1; + i1 %296 = icmp slt i32 %295, i32 80; + cbr i1 %296(prob = 0.9875), ^while.body6, ^b3; ^b3: - i32 %426 = add i32 %156, i32 64; - i1 %427 = icmp sgt i32 %155, i32 %426; - i32 %428 = add i32 %157, i32 %420; - i32 %429 = add i32 %158, i32 %396; - i32 %430 = add i32 %159, i32 %423; - i32 %431 = add i32 %160, i32 %398; - i32 %432 = add i32 %161, i32 %399; - cbr i1 %427(prob = 0.984615), ^while.body3, ^b4; + i32 %297 = add i32 %132, i32 64; + i1 %298 = icmp sgt i32 %131, i32 %297; + i32 %299 = add i32 %133, i32 %291; + i32 %300 = add i32 %134, i32 %267; + i32 %301 = add i32 %135, i32 %294; + i32 %302 = add i32 %136, i32 %269; + i32 %303 = add i32 %137, i32 %270; + cbr i1 %298(prob = 0.984615), ^while.body3, ^b4; ^b4: - i32 %433 = load i32* %3; - i32 %434 = add i32 %428, i32 %433; - i32 %435 = neg i32 %434; - store i32* %3 with i32 %435; - i32 %436 = load i32* %4; - i32 %437 = add i32 %429, i32 %436; - i32 %438 = neg i32 %437; - store i32* %4 with i32 %438; - i32 %439 = load i32* %5; - i32 %440 = add i32 %430, i32 %439; - i32 %441 = neg i32 %440; - store i32* %5 with i32 %441; - i32 %442 = load i32* %6; - i32 %443 = add i32 %431, i32 %442; - i32 %444 = neg i32 %443; - store i32* %6 with i32 %444; - i32 %445 = load i32* %7; - i32 %446 = add i32 %432, i32 %445; - i32 %447 = neg i32 %446; - store i32* %7 with i32 %447; - i32 %448 = add i32 %93, i32 -1; - i1 %449 = icmp sgt i32 %448, i32 0; - cbr i1 %449(prob = 0.984615), ^while.body, ^b; + i32 %304 = load i32* %3; + i32 %305 = add i32 %299, i32 %304; + i32 %306 = neg i32 %305; + store i32* %3 with i32 %306; + i32 %307 = load i32* %4; + i32 %308 = add i32 %300, i32 %307; + i32 %309 = neg i32 %308; + store i32* %4 with i32 %309; + i32 %310 = load i32* %5; + i32 %311 = add i32 %301, i32 %310; + i32 %312 = neg i32 %311; + store i32* %5 with i32 %312; + i32 %313 = load i32* %6; + i32 %314 = add i32 %302, i32 %313; + i32 %315 = neg i32 %314; + store i32* %6 with i32 %315; + i32 %316 = load i32* %7; + i32 %317 = add i32 %303, i32 %316; + i32 %318 = neg i32 %317; + store i32* %7 with i32 %318; + i32 %319 = add i32 %93, i32 -1; + i1 %320 = icmp sgt i32 %319, i32 0; + cbr 
i1 %320(prob = 0.984615), ^while.body, ^b; } diff --git a/tests/SysY2022/performance/crypto-2.arm.s b/tests/SysY2022/performance/crypto-2.arm.s index 32df7a74c..7a06355e8 100644 --- a/tests/SysY2022/performance/crypto-2.arm.s +++ b/tests/SysY2022/performance/crypto-2.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 buffer: .zero 131072 .text diff --git a/tests/SysY2022/performance/crypto-2.riscv.s b/tests/SysY2022/performance/crypto-2.riscv.s index 9c79548b6..fd1263e83 100644 --- a/tests/SysY2022/performance/crypto-2.riscv.s +++ b/tests/SysY2022/performance/crypto-2.riscv.s @@ -1,247 +1,193 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 buffer: .zero 131072 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[340] RegSpill[8] CalleeSaved[104] - addi sp, sp, -456 + # stack usage: CalleeArg[0] Local[340] RegSpill[0] CalleeSaved[104] + addi sp, sp, -448 sd ra, 0(sp) sd s0, 8(sp) sd s5, 16(sp) sd s1, 24(sp) sd s6, 32(sp) sd s2, 40(sp) - sd s3, 48(sp) - sd s4, 56(sp) + sd s4, 48(sp) + sd s3, 56(sp) sd s7, 64(sp) - sd s10, 72(sp) - sd s8, 80(sp) - sd s11, 88(sp) - sd s9, 96(sp) + sd s8, 72(sp) + sd s9, 80(sp) + sd s10, 88(sp) + sd s11, 96(sp) jal getint mv s0, a0 jal getint mv s1, a0 li a0, 161 jal _sysy_starttime -pcrel889: - auipc a1, %pcrel_hi(buffer) - lui a0, 31 + sd zero, 424(sp) + li t0, 125 +pcrel667: + auipc a0, %pcrel_hi(buffer) + lui a3, 31 + lui a2, 66 + addi a4, a0, %pcrel_lo(pcrel667) + addiw a1, a3, 1028 + addiw a0, a2, 33 sd zero, 432(sp) - addi a5, a1, %pcrel_lo(pcrel889) - addiw a3, a0, 1028 - li a1, 20 - sd zero, 440(sp) - li a0, 80 - add a2, a5, a3 - sw zero, 448(sp) - sd a5, 104(sp) - ble s1, zero, label51 - mv a4, s0 - mv a3, s1 - j label2 -.p2align 2 -label47: - lw t0, 432(sp) - addiw a3, a3, -1 - addw t6, t1, t0 - subw a5, zero, t6 - sw a5, 432(sp) - lw t0, 436(sp) - addw t6, t2, t0 - subw t1, zero, t6 - sw t1, 436(sp) - lw t0, 440(sp) - addw a5, t3, t0 - subw t1, zero, a5 - sw t1, 440(sp) - lw t0, 444(sp) - addw a5, t4, t0 - subw t1, zero, a5 - sw t1, 444(sp) - lw t2, 448(sp) - addw t0, t5, t2 - subw a5, zero, t0 - sw a5, 448(sp) - ble a3, zero, label51 -.p2align 2 -label2: - ld a5, 104(sp) - mv t0, zero - j label5 + li a3, 20 + add a5, a4, a1 + li a2, 80 + sw zero, 440(sp) + slli a1, t0, 8 + ble s1, zero, label2 + mv t1, s0 + mv t0, s1 + mv t2, a4 + mv t3, zero + j label6 .p2align 2 -label9: - addi a5, a5, 32 +label49: + lw s0, 424(sp) + addiw t0, t0, -1 + addw t2, t4, s0 + subw t3, zero, t2 + sw t3, 424(sp) + lw s0, 428(sp) + addw t2, t5, s0 + subw t4, zero, t2 + sw t4, 428(sp) + lw t3, 432(sp) + addw t2, a6, t3 + subw t4, zero, t2 + sw t4, 432(sp) + lw t3, 436(sp) + addw t2, t6, t3 + subw t4, zero, t2 + sw t4, 436(sp) + lw t5, 440(sp) + addw t2, a7, t5 + subw t3, zero, t2 + sw t3, 440(sp) + ble t0, zero, label2 + mv t2, a4 + mv t3, zero .p2align 2 -label5: - slliw t3, a4, 13 - addiw t0, t0, 8 - addw t2, t3, a4 - slli t1, t2, 1 - srli t4, t1, 47 - add a4, t2, t4 - lui t4, 66 - sraiw t3, a4, 17 - addiw a4, t4, 33 - addw t1, t2, t3 - mulw t3, t1, a4 - slli t2, t3, 1 - srli t4, t2, 47 - add a6, t3, t4 - sraiw t6, a6, 17 - addw t2, t3, t6 - mulw t5, t2, a4 - slli t4, t5, 1 - srli a6, t4, 47 - add a7, t5, a6 - sraiw t6, a7, 17 - addw t3, t5, t6 - mulw t4, t3, a4 - slli a6, t4, 1 - srli t6, a6, 47 - add a7, t4, t6 - sraiw a6, a7, 17 - addw t5, t4, a6 - mulw t6, t5, a4 - slli a7, t6, 1 - srli a6, a7, 47 - add t4, t6, a6 - sraiw a7, t4, 17 - addw a6, t6, a7 - mulw 
t4, a6, a4 - slli a7, t4, 1 - srli t6, a7, 47 - add s1, t4, t6 - sraiw s2, s1, 17 - addw a7, t4, s2 - slli s4, a7, 5 - mulw s0, a7, a4 - slli t6, s0, 1 - srli s2, t6, 47 - add t4, s0, s2 - sraiw s1, t4, 17 - addw t6, s0, s1 - mulw t4, t6, a4 - slli s2, t4, 1 - srli s0, s2, 47 - add s3, t4, s0 - sraiw a4, s3, 17 - addw s1, t4, a4 - slli s2, s1, 5 - addw s0, s2, s1 - slli s3, s0, 1 - mv a4, s0 - srli t4, s3, 56 - slli s3, t6, 5 - add s1, s0, t4 - andi s2, s1, -256 - addw s1, s3, t6 - subw t4, s0, s2 - slli s0, s1, 1 - srli t6, s0, 56 - addw s0, s4, a7 - add s2, s1, t6 - andi s3, s2, -256 - slli s2, s0, 1 - subw t6, s1, s3 - srli a7, s2, 56 - slli s2, a6, 5 - add s1, s0, a7 - andi s3, s1, -256 - addw s1, s2, a6 - subw a7, s0, s3 - slli s3, s1, 1 - srli a6, s3, 56 - slli s3, t5, 5 - add s0, s1, a6 +label6: + slliw t4, t1, 13 + addiw t3, t3, 4 + addw t5, t4, t1 + slli a6, t5, 1 + srli a7, a6, 47 + add t6, t5, a7 + sraiw a6, t6, 17 + addw t4, t5, a6 + mulw t1, t4, a0 + slli t6, t1, 1 + srli a7, t6, 47 + add s0, t1, a7 + sraiw a6, s0, 17 + addw t5, t1, a6 + mulw t6, t5, a0 + slli a6, t6, 1 + srli t1, a6, 47 + add a7, t6, t1 + sraiw s0, a7, 17 + addw a6, t6, s0 + mulw t1, a6, a0 + slli s1, t1, 1 + srli t6, s1, 47 + add a7, t1, t6 + sraiw s0, a7, 17 + addw t6, t1, s0 + slli s2, t6, 5 + addw a7, s2, t6 + slli s1, a7, 1 + mv t1, a7 + srli t6, s1, 56 + slli s1, a6, 5 + add s0, a7, t6 andi s2, s0, -256 - addw s0, s3, t5 - subw a6, s1, s2 - slli s1, s0, 1 + subw t6, a7, s2 + addw a7, s1, a6 + slli s0, a7, 1 + srli a6, s0, 56 + slli s0, t5, 5 + add s2, a7, a6 + andi s1, s2, -256 + subw a6, a7, s1 + addw a7, s0, t5 + slli s1, a7, 1 srli t5, s1, 56 - slli s1, t3, 5 - add s2, s0, t5 - andi s3, s2, -256 - addw s2, s1, t3 - subw t5, s0, s3 - slli s0, s2, 1 - srli t3, s0, 56 - slli s0, t2, 5 - add s1, s2, t3 - andi s3, s1, -256 - addw s1, s0, t2 - subw t3, s2, s3 - slli s2, s1, 1 - srli t2, s2, 56 - slli s2, t1, 5 - add s0, s1, t2 - andi s3, s0, -256 - addw s0, s2, t1 - subw t2, s1, s3 - slli s2, t2, 32 - slli s3, s0, 1 - slli t2, t5, 32 - srli t1, s3, 56 - add s4, s0, t1 - andi s1, s4, -256 - subw t1, s0, s1 - add.uw s0, t3, t2 - add.uw s3, t1, s2 - slli t2, t4, 32 - slli t1, a7, 32 - sd s3, 0(a5) - add.uw t3, a6, t1 - sd s0, 8(a5) - add.uw t1, t6, t2 - sd t3, 16(a5) - li t2, 125 - sd t1, 24(a5) - slli t1, t2, 8 - blt t0, t1, label9 - li t1, 125 - li t3, 128 - ld a5, 104(sp) - slli t0, t1, 10 - addi t1, t0, 8 - add t2, a5, t0 - srli t0, t1, 2 - add a5, a5, t1 - sw t3, 0(t2) - mv t3, a2 - j label11 -.p2align 2 -label50: - addi t1, a5, 4 - mv t0, t2 - mv t3, a5 - mv a5, t1 + slli s1, t4, 5 + add s2, a7, t5 + andi s0, s2, -256 + subw t5, a7, s0 + addw a7, s1, t4 + slli s0, a7, 1 + srli t4, s0, 56 + slli s0, t5, 32 + add s2, a7, t4 + slli t5, t6, 32 + andi s1, s2, -256 + subw t4, a7, s1 + add.uw a7, t4, s0 + add.uw t4, a6, t5 + sd a7, 0(t2) + sd t4, 8(t2) + bge t3, a1, label95 + addi t2, t2, 16 + j label6 .p2align 2 -label11: - andi t1, t0, 63 - addiw t2, t0, 1 - li t4, 60 - sw zero, 0(t3) - bne t1, t4, label50 +label95: + li t4, 125 + li t5, 128 + mv t6, a5 + li a6, 60 + slli t3, t4, 10 + addi t4, t3, 8 + add t2, a4, t3 + srli t3, t4, 2 + sw t5, 0(t2) + addiw t5, t3, 1 + add t2, a4, t4 sw zero, 0(a5) - li t1, 125 - addiw t0, t0, 4 - mv t6, zero - lui t3, 422994 - lui t5, 982235 - lui a7, 802094 - addiw t2, t5, -1143 - sw zero, 4(a5) - addiw t5, a7, 496 - xori t4, t2, -1 - sw t1, 8(a5) - addiw t1, t3, 769 - sw zero, 12(a5) - xori t3, t1, -1 + andi t4, t3, 63 + beq t4, a6, label16 +.p2align 2 +label51: + addi t4, 
t2, 4 + mv t3, t5 + mv t6, t2 + mv t2, t4 + andi t4, t5, 63 + addiw t5, t5, 1 + li a6, 60 + sw zero, 0(t6) + bne t4, a6, label51 +.p2align 2 +label16: + sw zero, 0(t2) + li t4, 125 + addiw t3, t3, 4 + mv s0, zero + lui t6, 422994 + lui a7, 982235 + lui s2, 802094 + addiw t5, a7, -1143 + sw zero, 4(t2) + addiw a7, s2, 496 + sw t4, 8(t2) + addiw t4, t6, 769 + sw zero, 12(t2) + xori t6, t5, -1 + xori a6, t4, -1 + mv t2, a4 + sd zero, 104(sp) sd zero, 112(sp) sd zero, 120(sp) sd zero, 128(sp) @@ -281,364 +227,258 @@ label11: sd zero, 400(sp) sd zero, 408(sp) sd zero, 416(sp) - sd zero, 424(sp) - ld a5, 104(sp) - addi a6, sp, 112 - mv a7, zero - j label24 + addi s1, sp, 104 + mv s2, zero + j label25 .p2align 2 -label27: - addi a6, a6, 32 +label352: + addiw s0, s0, 64 + addw t4, t4, s8 + addw t5, t5, s3 + addw a6, a6, s9 + addw t6, t6, s2 + addw a7, a7, s4 + ble t3, s0, label49 + addi t2, t2, 256 + addi s1, sp, 104 + mv s2, zero .p2align 2 -label24: - slliw s2, a7, 2 - sh2add s0, s2, a5 - lw s1, 0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s5 - slli s6, s4, 8 - lw s3, 12(s0) - addiw s4, a7, 1 - addw s1, s2, s6 - slliw s2, s4, 2 - addw s5, s1, s3 - sh2add s0, s2, a5 - sw s5, 0(a6) - lw s1, 0(s0) - slli s6, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s6 - slli s5, s4, 8 - lw s3, 12(s0) - addw s1, s2, s5 - addiw s2, a7, 2 - addw s4, s1, s3 - slliw s3, s2, 2 - sh2add s0, s3, a5 - sw s4, 4(a6) - lw s1, 0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s5 - slli s6, s4, 8 - lw s3, 12(s0) - addw s1, s2, s6 - addiw s2, a7, 3 - addw s4, s1, s3 - slliw s3, s2, 2 - sh2add s0, s3, a5 - sw s4, 8(a6) - lw s1, 0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - lw s6, 8(s0) - slli s3, s4, 16 - slli s4, s6, 8 - addw s2, s3, s5 - lw s3, 12(s0) - addw s1, s2, s4 - addiw s4, a7, 4 - addw s5, s1, s3 - slliw s2, s4, 2 - sh2add s0, s2, a5 - sw s5, 12(a6) - lw s1, 0(s0) - slli s6, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s6 - slli s5, s4, 8 - lw s3, 12(s0) - addw s1, s2, s5 - addiw s2, a7, 5 - addw s4, s1, s3 - slliw s3, s2, 2 - sh2add s0, s3, a5 - sw s4, 16(a6) - lw s1, 0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s5 - slli s6, s4, 8 - addiw s5, a7, 6 - lw s3, 12(s0) - addw s1, s2, s6 - addw s4, s1, s3 - slliw s1, s5, 2 - sh2add s0, s1, a5 - sw s4, 20(a6) - lw s2, 0(s0) - slli s5, s2, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s1, s3, s5 - slli s6, s4, 8 - lw s3, 12(s0) - addiw s4, a7, 7 - addw s2, s1, s6 - addiw a7, a7, 8 - addw s5, s2, s3 - slliw s2, s4, 2 - sh2add s0, s2, a5 - sw s5, 24(a6) - lw s1, 0(s0) - slli s6, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s6 - slli s5, s4, 8 - lw s3, 12(s0) - addw s1, s2, s5 - li s0, 16 - addw s4, s1, s3 - sw s4, 28(a6) - blt a7, s0, label27 - addi a6, sp, 112 - li a7, 16 - addi a6, a6, 64 +label25: + slliw s4, s2, 2 + sh2add s3, s4, t2 + lw s5, 0(s3) + slli s8, s5, 24 + lw s7, 4(s3) + lw s9, 8(s3) + slli s6, s7, 16 + slli s7, s9, 8 + addw s4, s6, s8 + lw s8, 12(s3) + addw s5, s4, s7 + addiw s7, s2, 1 + addw s6, s5, s8 + slliw s5, s7, 2 + sh2add s3, s5, t2 + sw s6, 0(s1) + lw s4, 0(s3) + slli s9, s4, 24 + lw s7, 4(s3) + lw s8, 8(s3) + slli s6, s7, 16 + slli s7, s8, 8 + addw s5, s6, s9 + lw s6, 12(s3) + addw s4, s5, s7 + addiw s7, s2, 2 + addw s8, s4, s6 + slliw s5, s7, 2 + sh2add s3, s5, t2 + sw s8, 4(s1) + lw s4, 0(s3) + slli s8, s4, 24 + lw s7, 4(s3) + lw s9, 8(s3) + slli 
s6, s7, 16 + slli s7, s9, 8 + addw s5, s6, s8 + lw s6, 12(s3) + addw s4, s5, s7 + addiw s5, s2, 3 + addw s8, s4, s6 + addiw s2, s2, 4 + slliw s6, s5, 2 + sh2add s3, s6, t2 + sw s8, 8(s1) + lw s4, 0(s3) + slli s8, s4, 24 + lw s7, 4(s3) + slli s6, s7, 16 + lw s7, 8(s3) + addw s5, s6, s8 + slli s9, s7, 8 + lw s8, 12(s3) + addw s4, s5, s9 + li s3, 16 + addw s6, s4, s8 + sw s6, 12(s1) + bge s2, s3, label263 + addi s1, s1, 16 + j label25 .p2align 2 -label29: - lw s0, -12(a6) - addiw a7, a7, 8 - lw s2, -32(a6) - lw s4, -56(a6) - addw s3, s0, s2 - lw s7, -64(a6) - addw s6, s3, s7 - subw s1, s4, s6 - srliw s5, s1, 31 - add s3, s1, s5 - andi s7, s3, -2 - subw s6, s1, s7 - sh1add s5, s1, s6 - sw s5, 0(a6) - lw s7, -8(a6) - lw s1, -28(a6) - addw s8, s7, s1 - lw s3, -52(a6) - lw s11, -60(a6) - addw s10, s5, s3 - addw s9, s8, s11 - subw s6, s3, s9 - srliw s5, s6, 31 - add s11, s6, s5 - andi s3, s11, -2 - subw s8, s6, s3 - sh1add s5, s6, s8 - sw s5, 4(a6) - lw s3, -4(a6) - lw s11, -24(a6) - addw s6, s4, s3 - lw s9, -48(a6) - addw s8, s6, s11 - subw s4, s9, s8 - srliw s6, s4, 31 - add s11, s4, s6 - andi s8, s11, -2 - subw s6, s4, s8 - sh1add s11, s4, s6 - sw s11, 8(a6) - lw s8, -20(a6) - addw s11, s10, s8 - lw s8, -44(a6) - subw s10, s8, s11 - srliw s11, s10, 31 - sh1add s7, s10, s7 - add s11, s10, s11 - andi s11, s11, -2 - subw s11, s10, s11 - sh1add s10, s10, s11 - addw s7, s7, s11 - sw s10, 12(a6) - lw s11, -16(a6) - lw s10, -40(a6) - addw s7, s7, s10 - subw s7, s2, s7 - srliw s2, s7, 31 - add s2, s7, s2 - andi s2, s2, -2 - subw s2, s7, s2 - sh1add s2, s7, s2 - addw s7, s5, s9 - addw s5, s11, s7 - subw s7, s10, s5 - srliw s9, s7, 31 - add s11, s7, s9 - andi s10, s11, -2 - sh1add s11, s4, s0 - subw s5, s7, s10 - addw s10, s6, s11 - sh1add s9, s7, s5 - sw s9, 16(a6) - lw s0, -36(a6) - addw s9, s8, s10 - subw s4, s0, s9 - srliw s6, s4, 31 - add s8, s4, s6 - andi s10, s8, -2 - sh1add s8, s7, s3 - subw s9, s4, s10 - sh1add s6, s4, s9 - addw s4, s5, s8 - sw s6, 20(a6) - addw s6, s0, s4 - subw s3, s1, s6 - srliw s5, s3, 31 - add s4, s3, s5 - andi s0, s4, -2 - subw s6, s3, s0 - sh1add s5, s3, s6 - slli s1, s5, 32 - add.uw s0, s2, s1 - sd s0, 24(a6) - bge a7, a0, label32 - addi a6, a6, 32 - j label29 +label263: + addi s1, sp, 104 + li s2, 16 + addi s1, s1, 64 .p2align 2 -label32: - addi a6, sp, 112 - mv s2, zero - mv s0, t1 - mv s3, t2 - mv a7, t3 - mv s1, t4 - mv s4, t5 - blt zero, a1, label461 - addw s6, t2, t3 - li s7, 1 - mv s9, zero - subw s8, s6, t4 - bne s7, zero, label848 - mv s9, s8 - li s6, 1 - mv s5, s8 - bne s6, zero, label850 - lui s10, 586172 - lui s11, 828972 - lui s9, 454047 - addiw s8, s11, 262 - addiw s7, s9, -1151 - j label880 +label30: + lw s4, -12(s1) + addiw s2, s2, 4 + lw s7, -32(s1) + lw s3, -56(s1) + addw s5, s4, s7 + lw s6, -64(s1) + addw s7, s5, s6 + subw s4, s3, s7 + srliw s8, s4, 31 + add s6, s4, s8 + andi s7, s6, -2 + subw s9, s4, s7 + sh1add s5, s4, s9 + sw s5, 0(s1) + lw s7, -8(s1) + lw s8, -28(s1) + addw s9, s7, s8 + lw s6, -52(s1) + lw s10, -60(s1) + addw s4, s5, s6 + addw s11, s9, s10 + subw s5, s6, s11 + srliw s7, s5, 31 + add s8, s5, s7 + andi s6, s8, -2 + subw s9, s5, s6 + sh1add s10, s5, s9 + sw s10, 4(s1) + lw s7, -4(s1) + lw s9, -24(s1) + addw s5, s3, s7 + lw s8, -48(s1) + addw s6, s5, s9 + subw s3, s8, s6 + srliw s7, s3, 31 + add s9, s3, s7 + andi s6, s9, -2 + subw s5, s3, s6 + sh1add s7, s3, s5 + sw s7, 8(s1) + lw s6, -20(s1) + lw s7, -44(s1) + addw s5, s4, s6 + subw s3, s7, s5 + srliw s4, s3, 31 + add s6, s3, s4 + andi s5, s6, -2 + subw s7, s3, s5 + sh1add s4, s3, s7 
+ sw s4, 12(s1) + bge s2, a2, label323 + addi s1, s1, 16 + j label30 .p2align 2 -label461: - lui s6, 370728 +label323: + addi s1, sp, 104 mv s5, zero - addiw s7, s6, -1639 + mv s3, t4 + mv s6, t5 + mv s2, a6 + mv s4, t6 + mv s7, a7 + blt zero, a3, label329 + addw s9, t5, a6 + li s10, 1 + mv s11, zero + subw s8, s9, t6 + bne s10, zero, label619 + mv s11, s8 + li s9, 1 + bne s9, zero, label621 + lui s10, 828972 + addiw s11, s10, 262 + lui s10, 454047 + addiw s10, s10, -1151 + j label656 .p2align 2 -label42: - slliw s8, s0, 5 - addiw s2, s2, 1 - addw s10, s4, s8 - slli s8, s0, 1 - addw s9, s7, s10 - srli s4, s8, 59 - addw s6, s5, s9 - add s9, s0, s4 - slliw s4, s3, 30 - andi s5, s9, -32 - lw s9, 0(a6) - subw s8, s0, s5 - addw s7, s6, s8 - slli s8, s3, 1 - addw s5, s7, s9 - srli s10, s8, 34 - add s6, s3, s10 - sraiw s9, s6, 30 - slli s7, s9, 30 - subw s8, s3, s7 - addw s6, s4, s8 - bge s2, a0, label484 - addi a6, a6, 4 - mv s3, s0 - mv s4, s1 - mv s0, s5 - mv s1, a7 - mv a7, s6 - blt s2, a1, label461 - addw s6, s3, s6 - slti s7, s2, 60 - mv s9, zero - subw s8, s6, s1 - bne s7, zero, label848 - mv s9, s8 - slti s6, s2, 40 - mv s5, s8 - bne s6, zero, label850 - lui s10, 586172 - lui s11, 828972 - lui s9, 454047 - addiw s8, s11, 262 - addiw s7, s9, -1151 - j label880 +label329: + lui s9, 370728 + mv s8, zero + addiw s10, s9, -1639 .p2align 2 -label848: - slti s6, s2, 40 - mv s5, s8 - bne s6, zero, label850 - mv s5, s9 +label44: + slliw s9, s3, 5 + addiw s5, s5, 1 + addw s11, s7, s9 + addw s7, s10, s11 + slli s11, s3, 1 + addw s9, s8, s7 + srli s10, s11, 59 + lw s11, 0(s1) + add s8, s3, s10 + andi s7, s8, -32 + subw s8, s3, s7 + slliw s7, s6, 30 + addw s10, s9, s8 + addw s8, s10, s11 + slli s11, s6, 1 + srli s10, s11, 34 + add s9, s6, s10 + sraiw s11, s9, 30 + slli s9, s11, 30 + subw s10, s6, s9 + addw s9, s7, s10 + bge s5, a2, label352 + addi s1, s1, 4 + mv s6, s3 + mv s7, s4 + mv s3, s8 + mv s4, s2 + mv s2, s9 + blt s5, a3, label329 + addw s9, s6, s9 + slti s10, s5, 60 + mv s11, zero + subw s8, s9, s4 + bne s10, zero, label619 + mv s11, s8 + slti s9, s5, 40 + bne s9, zero, label621 + lui s10, 828972 + addiw s11, s10, 262 + lui s10, 454047 + addiw s10, s10, -1151 + j label656 .p2align 2 -label850: - lui s10, 586172 - addiw s8, s10, -804 - beq s7, zero, label851 +label619: + slti s9, s5, 40 + bne s9, zero, label621 + mv s8, s11 .p2align 2 -label852: - lui s9, 454047 - addiw s7, s9, -1151 - bne s6, zero, label42 +label621: + lui s11, 586172 + addiw s11, s11, -804 + beq s10, zero, label622 .p2align 2 -label880: - mv s7, s8 - j label42 +label623: + lui s10, 454047 + addiw s10, s10, -1151 + bne s9, zero, label44 .p2align 2 -label484: - addiw t6, t6, 64 - addw t1, t1, s5 - addw t2, t2, s0 - addw t3, t3, s6 - addw t4, t4, a7 - addw t5, t5, s1 - ble t0, t6, label47 - addi a5, a5, 256 - addi a6, sp, 112 - mv a7, zero - j label24 -label51: +label656: + mv s10, s11 + j label44 +label2: li a0, 184 jal _sysy_stoptime li a0, 5 - addi a1, sp, 432 + addi a1, sp, 424 jal putarray - ld ra, 0(sp) mv a0, zero + ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) ld s1, 24(sp) ld s6, 32(sp) ld s2, 40(sp) - ld s3, 48(sp) - ld s4, 56(sp) + ld s4, 48(sp) + ld s3, 56(sp) ld s7, 64(sp) - ld s10, 72(sp) - ld s8, 80(sp) - ld s11, 88(sp) - ld s9, 96(sp) - addi sp, sp, 456 + ld s8, 72(sp) + ld s9, 80(sp) + ld s10, 88(sp) + ld s11, 96(sp) + addi sp, sp, 448 ret .p2align 2 -label851: - lui s11, 828972 - addiw s8, s11, 262 - j label852 +label622: + lui s10, 828972 + addiw s11, s10, 262 + j label623 diff --git 
a/tests/SysY2022/performance/crypto-2.sy.ir b/tests/SysY2022/performance/crypto-2.sy.ir index 4840c8cb7..adad5665d 100644 --- a/tests/SysY2022/performance/crypto-2.sy.ir +++ b/tests/SysY2022/performance/crypto-2.sy.ir @@ -107,16 +107,16 @@ func @main() -> i32 { NoRecurse Entry } { i32* %91 = getelementptr &([32768 * i32]* %8)[i64 0][i64 32001]; cbr i1 %2(prob = 0.984615), ^while.body, ^b; ^while.body: - i32 %92 = phi [^entry, i32 %0] [^b4, i32 %120]; - i32 %93 = phi [^entry, i32 %1] [^b4, i32 %448]; + i32 %92 = phi [^entry, i32 %0] [^b4, i32 %108]; + i32 %93 = phi [^entry, i32 %1] [^b4, i32 %319]; ubr ^while.body1; ^b: call (i32) -> void @stoptime(i32 184); call (i32, i32*) -> void @putarray(i32 5, i32* %3); ret i32 0; ^while.body1: - i32 %94 = phi [^while.body, i32 %92] [^while.body1, i32 %120]; - i32 %95 = phi [^while.body, i32 0] [^while.body1, i32 %144]; + i32 %94 = phi [^while.body, i32 %92] [^while.body1, i32 %108]; + i32 %95 = phi [^while.body, i32 0] [^while.body1, i32 %120]; i32 %96 = mul i32 %94, i32 8193; i32 %97 = sdiv i32 %96, i32 131072; i32 %98 = add i32 %96, i32 %97; @@ -129,74 +129,46 @@ func @main() -> i32 { NoRecurse Entry } { i32 %105 = mul i32 %104, i32 270369; i32 %106 = sdiv i32 %105, i32 131072; i32 %107 = add i32 %105, i32 %106; - i32 %108 = mul i32 %107, i32 270369; - i32 %109 = sdiv i32 %108, i32 131072; - i32 %110 = add i32 %108, i32 %109; - i32 %111 = mul i32 %110, i32 270369; - i32 %112 = sdiv i32 %111, i32 131072; - i32 %113 = add i32 %111, i32 %112; - i32 %114 = mul i32 %113, i32 270369; - i32 %115 = sdiv i32 %114, i32 131072; - i32 %116 = add i32 %114, i32 %115; - i32 %117 = mul i32 %116, i32 270369; - i32 %118 = sdiv i32 %117, i32 131072; - i32 %119 = add i32 %117, i32 %118; - i32 %120 = mul i32 %119, i32 33; - i32 %121 = srem i32 %120, i32 256; - i32 %122 = mul i32 %116, i32 33; - i32 %123 = srem i32 %122, i32 256; - i32 %124 = mul i32 %113, i32 33; - i32 %125 = srem i32 %124, i32 256; - i32 %126 = mul i32 %110, i32 33; - i32 %127 = srem i32 %126, i32 256; - i32 %128 = mul i32 %107, i32 33; - i32 %129 = srem i32 %128, i32 256; - i32 %130 = mul i32 %104, i32 33; - i32 %131 = srem i32 %130, i32 256; - i32 %132 = mul i32 %101, i32 33; - i32 %133 = srem i32 %132, i32 256; - i32 %134 = mul i32 %98, i32 33; - i32 %135 = srem i32 %134, i32 256; - i32* %136 = getelementptr &([32768 * i32]* %8)[i64 0][i32 %95]; - store i32* %136 with i32 %135; - i32* %137 = getelementptr &(i32* %136)[i64 1]; - store i32* %137 with i32 %133; - i32* %138 = getelementptr &(i32* %136)[i64 2]; - store i32* %138 with i32 %131; - i32* %139 = getelementptr &(i32* %136)[i64 3]; - store i32* %139 with i32 %129; - i32* %140 = getelementptr &(i32* %136)[i64 4]; - store i32* %140 with i32 %127; - i32* %141 = getelementptr &(i32* %136)[i64 5]; - store i32* %141 with i32 %125; - i32* %142 = getelementptr &(i32* %136)[i64 6]; - store i32* %142 with i32 %123; - i32* %143 = getelementptr &(i32* %136)[i64 7]; - store i32* %143 with i32 %121; - i32 %144 = add i32 %95, i32 8; - i1 %145 = icmp slt i32 %144, i32 32000; - cbr i1 %145(prob = 0.99975), ^while.body1, ^postbody; + i32 %108 = mul i32 %107, i32 33; + i32 %109 = srem i32 %108, i32 256; + i32 %110 = mul i32 %104, i32 33; + i32 %111 = srem i32 %110, i32 256; + i32 %112 = mul i32 %101, i32 33; + i32 %113 = srem i32 %112, i32 256; + i32 %114 = mul i32 %98, i32 33; + i32 %115 = srem i32 %114, i32 256; + i32* %116 = getelementptr &([32768 * i32]* %8)[i64 0][i32 %95]; + store i32* %116 with i32 %115; + i32* %117 = getelementptr &(i32* %116)[i64 1]; + 
store i32* %117 with i32 %113; + i32* %118 = getelementptr &(i32* %116)[i64 2]; + store i32* %118 with i32 %111; + i32* %119 = getelementptr &(i32* %116)[i64 3]; + store i32* %119 with i32 %109; + i32 %120 = add i32 %95, i32 4; + i1 %121 = icmp slt i32 %120, i32 32000; + cbr i1 %121(prob = 0.999875), ^while.body1, ^postbody; ^postbody: store i32* %90 with i32 128; ubr ^while.body2; ^while.body2: - i32 %146 = phi [^postbody, i32 32002] [^while.body2, i32 %151]; - i32* %147 = phi [^postbody, i32* %91] [^while.body2, i32* %150]; - i32 %148 = and i32 %146, i32 63; - i1 %149 = icmp neq i32 %148, i32 60; - i32* %150 = getelementptr &(i32* %9)[i32 %146]; - store i32* %147 with i32 0; - i32 %151 = add i32 %146, i32 1; - cbr i1 %149(prob = 0.984615), ^while.body2, ^b1; + i32 %122 = phi [^postbody, i32 32002] [^while.body2, i32 %127]; + i32* %123 = phi [^postbody, i32* %91] [^while.body2, i32* %126]; + i32 %124 = and i32 %122, i32 63; + i1 %125 = icmp neq i32 %124, i32 60; + i32* %126 = getelementptr &(i32* %9)[i32 %122]; + store i32* %123 with i32 0; + i32 %127 = add i32 %122, i32 1; + cbr i1 %125(prob = 0.984615), ^while.body2, ^b1; ^b1: - store i32* %150 with i32 0; - i32* %152 = getelementptr &(i32* %150)[i64 1]; - store i32* %152 with i32 0; - i32* %153 = getelementptr &(i32* %150)[i64 2]; - store i32* %153 with i32 125; - i32* %154 = getelementptr &(i32* %150)[i64 3]; - store i32* %154 with i32 0; - i32 %155 = add i32 %146, i32 4; + store i32* %126 with i32 0; + i32* %128 = getelementptr &(i32* %126)[i64 1]; + store i32* %128 with i32 0; + i32* %129 = getelementptr &(i32* %126)[i64 2]; + store i32* %129 with i32 125; + i32* %130 = getelementptr &(i32* %126)[i64 3]; + store i32* %130 with i32 0; + i32 %131 = add i32 %122, i32 4; store i32* %10 with i32 0; store i32* %11 with i32 0; store i32* %12 with i32 0; @@ -279,334 +251,221 @@ func @main() -> i32 { NoRecurse Entry } { store i32* %89 with i32 0; ubr ^while.body3; ^while.body3: - i32 %156 = phi [^b1, i32 0] [^b3, i32 %426]; - i32 %157 = phi [^b1, i32 1732584193] [^b3, i32 %428]; - i32 %158 = phi [^b1, i32 -271733879] [^b3, i32 %429]; - i32 %159 = phi [^b1, i32 -1732584194] [^b3, i32 %430]; - i32 %160 = phi [^b1, i32 271733878] [^b3, i32 %431]; - i32 %161 = phi [^b1, i32 -1009589776] [^b3, i32 %432]; - i32* %162 = getelementptr &(i32* %9)[i32 %156]; + i32 %132 = phi [^b1, i32 0] [^b3, i32 %297]; + i32 %133 = phi [^b1, i32 1732584193] [^b3, i32 %299]; + i32 %134 = phi [^b1, i32 -271733879] [^b3, i32 %300]; + i32 %135 = phi [^b1, i32 -1732584194] [^b3, i32 %301]; + i32 %136 = phi [^b1, i32 271733878] [^b3, i32 %302]; + i32 %137 = phi [^b1, i32 -1009589776] [^b3, i32 %303]; + i32* %138 = getelementptr &(i32* %9)[i32 %132]; ubr ^while.body4; ^while.body4: - i32 %163 = phi [^while.body3, i32 0] [^while.body4, i32 %299]; - i32 %164 = mul i32 %163, i32 4; - i32* %165 = getelementptr &(i32* %162)[i32 %164]; + i32 %139 = phi [^while.body3, i32 0] [^while.body4, i32 %207]; + i32 %140 = mul i32 %139, i32 4; + i32* %141 = getelementptr &(i32* %138)[i32 %140]; + i32 %142 = load i32* %141; + i32* %143 = getelementptr &(i32* %141)[i64 1]; + i32 %144 = load i32* %143; + i32 %145 = mul i32 %144, i32 65536; + i32 %146 = mul i32 %142, i32 16777216; + i32 %147 = add i32 %145, i32 %146; + i32* %148 = getelementptr &(i32* %141)[i64 2]; + i32 %149 = load i32* %148; + i32 %150 = mul i32 %149, i32 256; + i32 %151 = add i32 %147, i32 %150; + i32* %152 = getelementptr &(i32* %141)[i64 3]; + i32 %153 = load i32* %152; + i32 %154 = add i32 %151, i32 %153; + i32* %155 = 
getelementptr &([80 * i32]* %words)[i64 0][i32 %139]; + store i32* %155 with i32 %154; + i32 %156 = add i32 %139, i32 1; + i32 %157 = mul i32 %156, i32 4; + i32* %158 = getelementptr &(i32* %138)[i32 %157]; + i32 %159 = load i32* %158; + i32* %160 = getelementptr &(i32* %158)[i64 1]; + i32 %161 = load i32* %160; + i32 %162 = mul i32 %161, i32 65536; + i32 %163 = mul i32 %159, i32 16777216; + i32 %164 = add i32 %162, i32 %163; + i32* %165 = getelementptr &(i32* %158)[i64 2]; i32 %166 = load i32* %165; - i32* %167 = getelementptr &(i32* %165)[i64 1]; - i32 %168 = load i32* %167; - i32 %169 = mul i32 %168, i32 65536; - i32 %170 = mul i32 %166, i32 16777216; - i32 %171 = add i32 %169, i32 %170; - i32* %172 = getelementptr &(i32* %165)[i64 2]; - i32 %173 = load i32* %172; - i32 %174 = mul i32 %173, i32 256; - i32 %175 = add i32 %171, i32 %174; - i32* %176 = getelementptr &(i32* %165)[i64 3]; - i32 %177 = load i32* %176; - i32 %178 = add i32 %175, i32 %177; - i32* %179 = getelementptr &([80 * i32]* %words)[i64 0][i32 %163]; - store i32* %179 with i32 %178; - i32 %180 = add i32 %163, i32 1; - i32 %181 = mul i32 %180, i32 4; - i32* %182 = getelementptr &(i32* %162)[i32 %181]; + i32 %167 = mul i32 %166, i32 256; + i32 %168 = add i32 %164, i32 %167; + i32* %169 = getelementptr &(i32* %158)[i64 3]; + i32 %170 = load i32* %169; + i32 %171 = add i32 %168, i32 %170; + i32* %172 = getelementptr &(i32* %155)[i64 1]; + store i32* %172 with i32 %171; + i32 %173 = add i32 %139, i32 2; + i32 %174 = mul i32 %173, i32 4; + i32* %175 = getelementptr &(i32* %138)[i32 %174]; + i32 %176 = load i32* %175; + i32* %177 = getelementptr &(i32* %175)[i64 1]; + i32 %178 = load i32* %177; + i32 %179 = mul i32 %178, i32 65536; + i32 %180 = mul i32 %176, i32 16777216; + i32 %181 = add i32 %179, i32 %180; + i32* %182 = getelementptr &(i32* %175)[i64 2]; i32 %183 = load i32* %182; - i32* %184 = getelementptr &(i32* %182)[i64 1]; - i32 %185 = load i32* %184; - i32 %186 = mul i32 %185, i32 65536; - i32 %187 = mul i32 %183, i32 16777216; - i32 %188 = add i32 %186, i32 %187; - i32* %189 = getelementptr &(i32* %182)[i64 2]; - i32 %190 = load i32* %189; - i32 %191 = mul i32 %190, i32 256; - i32 %192 = add i32 %188, i32 %191; - i32* %193 = getelementptr &(i32* %182)[i64 3]; - i32 %194 = load i32* %193; - i32 %195 = add i32 %192, i32 %194; - i32* %196 = getelementptr &(i32* %179)[i64 1]; - store i32* %196 with i32 %195; - i32 %197 = add i32 %163, i32 2; - i32 %198 = mul i32 %197, i32 4; - i32* %199 = getelementptr &(i32* %162)[i32 %198]; + i32 %184 = mul i32 %183, i32 256; + i32 %185 = add i32 %181, i32 %184; + i32* %186 = getelementptr &(i32* %175)[i64 3]; + i32 %187 = load i32* %186; + i32 %188 = add i32 %185, i32 %187; + i32* %189 = getelementptr &(i32* %155)[i64 2]; + store i32* %189 with i32 %188; + i32 %190 = add i32 %139, i32 3; + i32 %191 = mul i32 %190, i32 4; + i32* %192 = getelementptr &(i32* %138)[i32 %191]; + i32 %193 = load i32* %192; + i32* %194 = getelementptr &(i32* %192)[i64 1]; + i32 %195 = load i32* %194; + i32 %196 = mul i32 %195, i32 65536; + i32 %197 = mul i32 %193, i32 16777216; + i32 %198 = add i32 %196, i32 %197; + i32* %199 = getelementptr &(i32* %192)[i64 2]; i32 %200 = load i32* %199; - i32* %201 = getelementptr &(i32* %199)[i64 1]; - i32 %202 = load i32* %201; - i32 %203 = mul i32 %202, i32 65536; - i32 %204 = mul i32 %200, i32 16777216; - i32 %205 = add i32 %203, i32 %204; - i32* %206 = getelementptr &(i32* %199)[i64 2]; - i32 %207 = load i32* %206; - i32 %208 = mul i32 %207, i32 256; - i32 %209 = add 
i32 %205, i32 %208; - i32* %210 = getelementptr &(i32* %199)[i64 3]; - i32 %211 = load i32* %210; - i32 %212 = add i32 %209, i32 %211; - i32* %213 = getelementptr &(i32* %179)[i64 2]; - store i32* %213 with i32 %212; - i32 %214 = add i32 %163, i32 3; - i32 %215 = mul i32 %214, i32 4; - i32* %216 = getelementptr &(i32* %162)[i32 %215]; + i32 %201 = mul i32 %200, i32 256; + i32 %202 = add i32 %198, i32 %201; + i32* %203 = getelementptr &(i32* %192)[i64 3]; + i32 %204 = load i32* %203; + i32 %205 = add i32 %202, i32 %204; + i32* %206 = getelementptr &(i32* %155)[i64 3]; + store i32* %206 with i32 %205; + i32 %207 = add i32 %139, i32 4; + i1 %208 = icmp slt i32 %207, i32 16; + cbr i1 %208(prob = 0.75), ^while.body4, ^while.body5; + ^while.body5: + i32 %209 = phi [^while.body4, i32 16] [^while.body5, i32 %264]; + i32* %210 = getelementptr &([80 * i32]* %words)[i64 0][i32 %209]; + i32* %211 = getelementptr &(i32* %210)[i64 -3]; + i32 %212 = load i32* %211; + i32* %213 = getelementptr &(i32* %210)[i64 -8]; + i32 %214 = load i32* %213; + i32 %215 = add i32 %212, i32 %214; + i32* %216 = getelementptr &(i32* %210)[i64 -14]; i32 %217 = load i32* %216; - i32* %218 = getelementptr &(i32* %216)[i64 1]; + i32* %218 = getelementptr &(i32* %210)[i64 -16]; i32 %219 = load i32* %218; - i32 %220 = mul i32 %219, i32 65536; - i32 %221 = mul i32 %217, i32 16777216; - i32 %222 = add i32 %220, i32 %221; - i32* %223 = getelementptr &(i32* %216)[i64 2]; - i32 %224 = load i32* %223; - i32 %225 = mul i32 %224, i32 256; - i32 %226 = add i32 %222, i32 %225; - i32* %227 = getelementptr &(i32* %216)[i64 3]; + i32 %220 = add i32 %215, i32 %219; + i32 %221 = sub i32 %217, i32 %220; + i32 %222 = mul i32 %221, i32 2; + i32 %223 = srem i32 %221, i32 2; + i32 %224 = add i32 %222, i32 %223; + store i32* %210 with i32 %224; + i32* %225 = getelementptr &(i32* %210)[i64 -2]; + i32 %226 = load i32* %225; + i32* %227 = getelementptr &(i32* %210)[i64 -7]; i32 %228 = load i32* %227; - i32 %229 = add i32 %226, i32 %228; - i32* %230 = getelementptr &(i32* %179)[i64 3]; - store i32* %230 with i32 %229; - i32 %231 = add i32 %163, i32 4; - i32 %232 = mul i32 %231, i32 4; - i32* %233 = getelementptr &(i32* %162)[i32 %232]; + i32* %229 = getelementptr &(i32* %210)[i64 -13]; + i32 %230 = load i32* %229; + i32 %231 = add i32 %224, i32 %230; + i32 %232 = add i32 %226, i32 %228; + i32* %233 = getelementptr &(i32* %210)[i64 -15]; i32 %234 = load i32* %233; - i32* %235 = getelementptr &(i32* %233)[i64 1]; - i32 %236 = load i32* %235; - i32 %237 = mul i32 %236, i32 65536; - i32 %238 = mul i32 %234, i32 16777216; + i32 %235 = add i32 %232, i32 %234; + i32 %236 = sub i32 %230, i32 %235; + i32 %237 = mul i32 %236, i32 2; + i32 %238 = srem i32 %236, i32 2; i32 %239 = add i32 %237, i32 %238; - i32* %240 = getelementptr &(i32* %233)[i64 2]; - i32 %241 = load i32* %240; - i32 %242 = mul i32 %241, i32 256; - i32 %243 = add i32 %239, i32 %242; - i32* %244 = getelementptr &(i32* %233)[i64 3]; + i32* %240 = getelementptr &(i32* %210)[i64 1]; + store i32* %240 with i32 %239; + i32* %241 = getelementptr &(i32* %210)[i64 -1]; + i32 %242 = load i32* %241; + i32 %243 = add i32 %217, i32 %242; + i32* %244 = getelementptr &(i32* %210)[i64 -6]; i32 %245 = load i32* %244; i32 %246 = add i32 %243, i32 %245; - i32* %247 = getelementptr &(i32* %179)[i64 4]; - store i32* %247 with i32 %246; - i32 %248 = add i32 %163, i32 5; - i32 %249 = mul i32 %248, i32 4; - i32* %250 = getelementptr &(i32* %162)[i32 %249]; - i32 %251 = load i32* %250; - i32* %252 = getelementptr &(i32* 
%250)[i64 1]; - i32 %253 = load i32* %252; - i32 %254 = mul i32 %253, i32 65536; - i32 %255 = mul i32 %251, i32 16777216; - i32 %256 = add i32 %254, i32 %255; - i32* %257 = getelementptr &(i32* %250)[i64 2]; + i32* %247 = getelementptr &(i32* %210)[i64 -12]; + i32 %248 = load i32* %247; + i32 %249 = sub i32 %248, i32 %246; + i32 %250 = mul i32 %249, i32 2; + i32 %251 = srem i32 %249, i32 2; + i32 %252 = add i32 %250, i32 %251; + i32* %253 = getelementptr &(i32* %210)[i64 2]; + store i32* %253 with i32 %252; + i32* %254 = getelementptr &(i32* %210)[i64 -5]; + i32 %255 = load i32* %254; + i32 %256 = add i32 %231, i32 %255; + i32* %257 = getelementptr &(i32* %210)[i64 -11]; i32 %258 = load i32* %257; - i32 %259 = mul i32 %258, i32 256; - i32 %260 = add i32 %256, i32 %259; - i32* %261 = getelementptr &(i32* %250)[i64 3]; - i32 %262 = load i32* %261; - i32 %263 = add i32 %260, i32 %262; - i32* %264 = getelementptr &(i32* %179)[i64 5]; - store i32* %264 with i32 %263; - i32 %265 = add i32 %163, i32 6; - i32 %266 = mul i32 %265, i32 4; - i32* %267 = getelementptr &(i32* %162)[i32 %266]; - i32 %268 = load i32* %267; - i32* %269 = getelementptr &(i32* %267)[i64 1]; - i32 %270 = load i32* %269; - i32 %271 = mul i32 %270, i32 65536; - i32 %272 = mul i32 %268, i32 16777216; - i32 %273 = add i32 %271, i32 %272; - i32* %274 = getelementptr &(i32* %267)[i64 2]; - i32 %275 = load i32* %274; - i32 %276 = mul i32 %275, i32 256; - i32 %277 = add i32 %273, i32 %276; - i32* %278 = getelementptr &(i32* %267)[i64 3]; - i32 %279 = load i32* %278; - i32 %280 = add i32 %277, i32 %279; - i32* %281 = getelementptr &(i32* %179)[i64 6]; - store i32* %281 with i32 %280; - i32 %282 = add i32 %163, i32 7; - i32 %283 = mul i32 %282, i32 4; - i32* %284 = getelementptr &(i32* %162)[i32 %283]; - i32 %285 = load i32* %284; - i32* %286 = getelementptr &(i32* %284)[i64 1]; - i32 %287 = load i32* %286; - i32 %288 = mul i32 %287, i32 65536; - i32 %289 = mul i32 %285, i32 16777216; - i32 %290 = add i32 %288, i32 %289; - i32* %291 = getelementptr &(i32* %284)[i64 2]; - i32 %292 = load i32* %291; - i32 %293 = mul i32 %292, i32 256; - i32 %294 = add i32 %290, i32 %293; - i32* %295 = getelementptr &(i32* %284)[i64 3]; - i32 %296 = load i32* %295; - i32 %297 = add i32 %294, i32 %296; - i32* %298 = getelementptr &(i32* %179)[i64 7]; - store i32* %298 with i32 %297; - i32 %299 = add i32 %163, i32 8; - i1 %300 = icmp slt i32 %299, i32 16; - cbr i1 %300(prob = 0.5), ^while.body4, ^while.body5; - ^while.body5: - i32 %301 = phi [^while.body4, i32 16] [^while.body5, i32 %393]; - i32* %302 = getelementptr &([80 * i32]* %words)[i64 0][i32 %301]; - i32* %303 = getelementptr &(i32* %302)[i64 -3]; - i32 %304 = load i32* %303; - i32* %305 = getelementptr &(i32* %302)[i64 -8]; - i32 %306 = load i32* %305; - i32 %307 = add i32 %304, i32 %306; - i32* %308 = getelementptr &(i32* %302)[i64 -14]; - i32 %309 = load i32* %308; - i32* %310 = getelementptr &(i32* %302)[i64 -16]; - i32 %311 = load i32* %310; - i32 %312 = add i32 %307, i32 %311; - i32 %313 = sub i32 %309, i32 %312; - i32 %314 = mul i32 %313, i32 2; - i32 %315 = srem i32 %313, i32 2; - i32 %316 = add i32 %314, i32 %315; - store i32* %302 with i32 %316; - i32* %317 = getelementptr &(i32* %302)[i64 -2]; - i32 %318 = load i32* %317; - i32* %319 = getelementptr &(i32* %302)[i64 -7]; - i32 %320 = load i32* %319; - i32* %321 = getelementptr &(i32* %302)[i64 -13]; - i32 %322 = load i32* %321; - i32 %323 = add i32 %316, i32 %322; - i32 %324 = add i32 %318, i32 %320; - i32* %325 = getelementptr &(i32* 
%302)[i64 -15]; - i32 %326 = load i32* %325; - i32 %327 = add i32 %324, i32 %326; - i32 %328 = sub i32 %322, i32 %327; - i32 %329 = mul i32 %328, i32 2; - i32 %330 = srem i32 %328, i32 2; - i32 %331 = add i32 %329, i32 %330; - i32* %332 = getelementptr &(i32* %302)[i64 1]; - store i32* %332 with i32 %331; - i32* %333 = getelementptr &(i32* %302)[i64 -1]; - i32 %334 = load i32* %333; - i32 %335 = add i32 %309, i32 %334; - i32* %336 = getelementptr &(i32* %302)[i64 -6]; - i32 %337 = load i32* %336; - i32 %338 = add i32 %335, i32 %337; - i32* %339 = getelementptr &(i32* %302)[i64 -12]; - i32 %340 = load i32* %339; - i32 %341 = sub i32 %340, i32 %338; - i32 %342 = mul i32 %341, i32 2; - i32 %343 = srem i32 %341, i32 2; - i32 %344 = add i32 %342, i32 %343; - i32* %345 = getelementptr &(i32* %302)[i64 2]; - store i32* %345 with i32 %344; - i32* %346 = getelementptr &(i32* %302)[i64 -5]; - i32 %347 = load i32* %346; - i32 %348 = add i32 %323, i32 %347; - i32* %349 = getelementptr &(i32* %302)[i64 -11]; - i32 %350 = load i32* %349; - i32 %351 = sub i32 %350, i32 %348; - i32 %352 = mul i32 %351, i32 2; - i32 %353 = add i32 %318, i32 %352; - i32 %354 = srem i32 %351, i32 2; - i32 %355 = add i32 %353, i32 %354; - i32 %356 = add i32 %352, i32 %354; - i32* %357 = getelementptr &(i32* %302)[i64 3]; - store i32* %357 with i32 %356; - i32* %358 = getelementptr &(i32* %302)[i64 -4]; - i32 %359 = load i32* %358; - i32* %360 = getelementptr &(i32* %302)[i64 -10]; - i32 %361 = load i32* %360; - i32 %362 = add i32 %355, i32 %361; - i32 %363 = sub i32 %306, i32 %362; - i32 %364 = mul i32 %363, i32 2; - i32 %365 = srem i32 %363, i32 2; - i32 %366 = add i32 %364, i32 %365; - i32 %367 = add i32 %331, i32 %340; - i32 %368 = add i32 %359, i32 %367; - i32 %369 = sub i32 %361, i32 %368; - i32 %370 = mul i32 %369, i32 2; - i32 %371 = srem i32 %369, i32 2; - i32 %372 = add i32 %370, i32 %371; - i32* %373 = getelementptr &(i32* %302)[i64 4]; - store i32* %373 with i32 %372; - i32 %374 = add i32 %304, i32 %342; - i32 %375 = add i32 %343, i32 %374; - i32 %376 = add i32 %350, i32 %375; - i32* %377 = getelementptr &(i32* %302)[i64 -9]; - i32 %378 = load i32* %377; - i32 %379 = sub i32 %378, i32 %376; - i32 %380 = mul i32 %379, i32 2; - i32 %381 = srem i32 %379, i32 2; - i32 %382 = add i32 %380, i32 %381; - i32* %383 = getelementptr &(i32* %302)[i64 5]; - store i32* %383 with i32 %382; - i32* %384 = getelementptr &(i32* %302)[i64 6]; - store i32* %384 with i32 %366; - i32 %385 = add i32 %334, i32 %370; - i32 %386 = add i32 %371, i32 %385; - i32 %387 = add i32 %378, i32 %386; - i32 %388 = sub i32 %320, i32 %387; - i32 %389 = mul i32 %388, i32 2; - i32 %390 = srem i32 %388, i32 2; - i32 %391 = add i32 %389, i32 %390; - i32* %392 = getelementptr &(i32* %302)[i64 7]; - store i32* %392 with i32 %391; - i32 %393 = add i32 %301, i32 8; - i1 %394 = icmp slt i32 %393, i32 80; - cbr i1 %394(prob = 0.875), ^while.body5, ^while.body6; + i32 %259 = sub i32 %258, i32 %256; + i32 %260 = mul i32 %259, i32 2; + i32 %261 = srem i32 %259, i32 2; + i32 %262 = add i32 %260, i32 %261; + i32* %263 = getelementptr &(i32* %210)[i64 3]; + store i32* %263 with i32 %262; + i32 %264 = add i32 %209, i32 4; + i1 %265 = icmp slt i32 %264, i32 80; + cbr i1 %265(prob = 0.9375), ^while.body5, ^while.body6; ^while.body6: - i32 %395 = phi [^while.body5, i32 0] [^b2, i32 %424]; - i32 %396 = phi [^while.body5, i32 %157] [^b2, i32 %420]; - i32 %397 = phi [^while.body5, i32 %158] [^b2, i32 %396]; - i32 %398 = phi [^while.body5, i32 %159] [^b2, i32 %423]; - i32 %399 
= phi [^while.body5, i32 %160] [^b2, i32 %398]; - i32 %400 = phi [^while.body5, i32 %161] [^b2, i32 %399]; - i1 %401 = icmp slt i32 %395, i32 20; - cbr i1 %401(prob = 0.5), ^b2, ^if.else; + i32 %266 = phi [^while.body5, i32 0] [^b2, i32 %295]; + i32 %267 = phi [^while.body5, i32 %133] [^b2, i32 %291]; + i32 %268 = phi [^while.body5, i32 %134] [^b2, i32 %267]; + i32 %269 = phi [^while.body5, i32 %135] [^b2, i32 %294]; + i32 %270 = phi [^while.body5, i32 %136] [^b2, i32 %269]; + i32 %271 = phi [^while.body5, i32 %137] [^b2, i32 %270]; + i1 %272 = icmp slt i32 %266, i32 20; + cbr i1 %272(prob = 0.5), ^b2, ^if.else; ^if.else: - i32 %402 = add i32 %397, i32 %398; - i32 %403 = sub i32 %402, i32 %399; - i1 %404 = icmp slt i32 %395, i32 60; - i32 %405 = select i1 %404 ? i32 0 : i32 %403; - i1 %406 = icmp slt i32 %395, i32 40; - i32 %407 = select i1 %406 ? i32 %403 : i32 %405; - i32 %408 = select i1 %404 ? i32 -1894007588 : i32 -899497722; - i32 %409 = select i1 %406 ? i32 1859775361 : i32 %408; + i32 %273 = add i32 %268, i32 %269; + i32 %274 = sub i32 %273, i32 %270; + i1 %275 = icmp slt i32 %266, i32 60; + i32 %276 = select i1 %275 ? i32 0 : i32 %274; + i1 %277 = icmp slt i32 %266, i32 40; + i32 %278 = select i1 %277 ? i32 %274 : i32 %276; + i32 %279 = select i1 %275 ? i32 -1894007588 : i32 -899497722; + i32 %280 = select i1 %277 ? i32 1859775361 : i32 %279; ubr ^b2; ^b2: - i32 %410 = phi [^while.body6, i32 1518500249] [^if.else, i32 %409]; - i32 %411 = phi [^while.body6, i32 0] [^if.else, i32 %407]; - i32 %412 = mul i32 %396, i32 32; - i32 %413 = add i32 %400, i32 %412; - i32 %414 = add i32 %410, i32 %413; - i32 %415 = add i32 %411, i32 %414; - i32 %416 = srem i32 %396, i32 32; - i32 %417 = add i32 %415, i32 %416; - i32* %418 = getelementptr &([80 * i32]* %words)[i64 0][i32 %395]; - i32 %419 = load i32* %418; - i32 %420 = add i32 %417, i32 %419; - i32 %421 = mul i32 %397, i32 1073741824; - i32 %422 = srem i32 %397, i32 1073741824; - i32 %423 = add i32 %421, i32 %422; - i32 %424 = add i32 %395, i32 1; - i1 %425 = icmp slt i32 %424, i32 80; - cbr i1 %425(prob = 0.9875), ^while.body6, ^b3; + i32 %281 = phi [^while.body6, i32 1518500249] [^if.else, i32 %280]; + i32 %282 = phi [^while.body6, i32 0] [^if.else, i32 %278]; + i32 %283 = mul i32 %267, i32 32; + i32 %284 = add i32 %271, i32 %283; + i32 %285 = add i32 %281, i32 %284; + i32 %286 = add i32 %282, i32 %285; + i32 %287 = srem i32 %267, i32 32; + i32 %288 = add i32 %286, i32 %287; + i32* %289 = getelementptr &([80 * i32]* %words)[i64 0][i32 %266]; + i32 %290 = load i32* %289; + i32 %291 = add i32 %288, i32 %290; + i32 %292 = mul i32 %268, i32 1073741824; + i32 %293 = srem i32 %268, i32 1073741824; + i32 %294 = add i32 %292, i32 %293; + i32 %295 = add i32 %266, i32 1; + i1 %296 = icmp slt i32 %295, i32 80; + cbr i1 %296(prob = 0.9875), ^while.body6, ^b3; ^b3: - i32 %426 = add i32 %156, i32 64; - i1 %427 = icmp sgt i32 %155, i32 %426; - i32 %428 = add i32 %157, i32 %420; - i32 %429 = add i32 %158, i32 %396; - i32 %430 = add i32 %159, i32 %423; - i32 %431 = add i32 %160, i32 %398; - i32 %432 = add i32 %161, i32 %399; - cbr i1 %427(prob = 0.984615), ^while.body3, ^b4; + i32 %297 = add i32 %132, i32 64; + i1 %298 = icmp sgt i32 %131, i32 %297; + i32 %299 = add i32 %133, i32 %291; + i32 %300 = add i32 %134, i32 %267; + i32 %301 = add i32 %135, i32 %294; + i32 %302 = add i32 %136, i32 %269; + i32 %303 = add i32 %137, i32 %270; + cbr i1 %298(prob = 0.984615), ^while.body3, ^b4; ^b4: - i32 %433 = load i32* %3; - i32 %434 = add i32 %428, i32 %433; - i32 
%435 = neg i32 %434; - store i32* %3 with i32 %435; - i32 %436 = load i32* %4; - i32 %437 = add i32 %429, i32 %436; - i32 %438 = neg i32 %437; - store i32* %4 with i32 %438; - i32 %439 = load i32* %5; - i32 %440 = add i32 %430, i32 %439; - i32 %441 = neg i32 %440; - store i32* %5 with i32 %441; - i32 %442 = load i32* %6; - i32 %443 = add i32 %431, i32 %442; - i32 %444 = neg i32 %443; - store i32* %6 with i32 %444; - i32 %445 = load i32* %7; - i32 %446 = add i32 %432, i32 %445; - i32 %447 = neg i32 %446; - store i32* %7 with i32 %447; - i32 %448 = add i32 %93, i32 -1; - i1 %449 = icmp sgt i32 %448, i32 0; - cbr i1 %449(prob = 0.984615), ^while.body, ^b; + i32 %304 = load i32* %3; + i32 %305 = add i32 %299, i32 %304; + i32 %306 = neg i32 %305; + store i32* %3 with i32 %306; + i32 %307 = load i32* %4; + i32 %308 = add i32 %300, i32 %307; + i32 %309 = neg i32 %308; + store i32* %4 with i32 %309; + i32 %310 = load i32* %5; + i32 %311 = add i32 %301, i32 %310; + i32 %312 = neg i32 %311; + store i32* %5 with i32 %312; + i32 %313 = load i32* %6; + i32 %314 = add i32 %302, i32 %313; + i32 %315 = neg i32 %314; + store i32* %6 with i32 %315; + i32 %316 = load i32* %7; + i32 %317 = add i32 %303, i32 %316; + i32 %318 = neg i32 %317; + store i32* %7 with i32 %318; + i32 %319 = add i32 %93, i32 -1; + i1 %320 = icmp sgt i32 %319, i32 0; + cbr i1 %320(prob = 0.984615), ^while.body, ^b; } diff --git a/tests/SysY2022/performance/crypto-3.arm.s b/tests/SysY2022/performance/crypto-3.arm.s index 32df7a74c..7a06355e8 100644 --- a/tests/SysY2022/performance/crypto-3.arm.s +++ b/tests/SysY2022/performance/crypto-3.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 buffer: .zero 131072 .text diff --git a/tests/SysY2022/performance/crypto-3.riscv.s b/tests/SysY2022/performance/crypto-3.riscv.s index 9c79548b6..fd1263e83 100644 --- a/tests/SysY2022/performance/crypto-3.riscv.s +++ b/tests/SysY2022/performance/crypto-3.riscv.s @@ -1,247 +1,193 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 buffer: .zero 131072 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[340] RegSpill[8] CalleeSaved[104] - addi sp, sp, -456 + # stack usage: CalleeArg[0] Local[340] RegSpill[0] CalleeSaved[104] + addi sp, sp, -448 sd ra, 0(sp) sd s0, 8(sp) sd s5, 16(sp) sd s1, 24(sp) sd s6, 32(sp) sd s2, 40(sp) - sd s3, 48(sp) - sd s4, 56(sp) + sd s4, 48(sp) + sd s3, 56(sp) sd s7, 64(sp) - sd s10, 72(sp) - sd s8, 80(sp) - sd s11, 88(sp) - sd s9, 96(sp) + sd s8, 72(sp) + sd s9, 80(sp) + sd s10, 88(sp) + sd s11, 96(sp) jal getint mv s0, a0 jal getint mv s1, a0 li a0, 161 jal _sysy_starttime -pcrel889: - auipc a1, %pcrel_hi(buffer) - lui a0, 31 + sd zero, 424(sp) + li t0, 125 +pcrel667: + auipc a0, %pcrel_hi(buffer) + lui a3, 31 + lui a2, 66 + addi a4, a0, %pcrel_lo(pcrel667) + addiw a1, a3, 1028 + addiw a0, a2, 33 sd zero, 432(sp) - addi a5, a1, %pcrel_lo(pcrel889) - addiw a3, a0, 1028 - li a1, 20 - sd zero, 440(sp) - li a0, 80 - add a2, a5, a3 - sw zero, 448(sp) - sd a5, 104(sp) - ble s1, zero, label51 - mv a4, s0 - mv a3, s1 - j label2 -.p2align 2 -label47: - lw t0, 432(sp) - addiw a3, a3, -1 - addw t6, t1, t0 - subw a5, zero, t6 - sw a5, 432(sp) - lw t0, 436(sp) - addw t6, t2, t0 - subw t1, zero, t6 - sw t1, 436(sp) - lw t0, 440(sp) - addw a5, t3, t0 - subw t1, zero, a5 - sw t1, 440(sp) - lw t0, 444(sp) - addw a5, t4, t0 - subw t1, zero, a5 - sw t1, 444(sp) - lw t2, 448(sp) - addw t0, t5, t2 - subw a5, zero, t0 - sw a5, 448(sp) - ble 
a3, zero, label51 -.p2align 2 -label2: - ld a5, 104(sp) - mv t0, zero - j label5 + li a3, 20 + add a5, a4, a1 + li a2, 80 + sw zero, 440(sp) + slli a1, t0, 8 + ble s1, zero, label2 + mv t1, s0 + mv t0, s1 + mv t2, a4 + mv t3, zero + j label6 .p2align 2 -label9: - addi a5, a5, 32 +label49: + lw s0, 424(sp) + addiw t0, t0, -1 + addw t2, t4, s0 + subw t3, zero, t2 + sw t3, 424(sp) + lw s0, 428(sp) + addw t2, t5, s0 + subw t4, zero, t2 + sw t4, 428(sp) + lw t3, 432(sp) + addw t2, a6, t3 + subw t4, zero, t2 + sw t4, 432(sp) + lw t3, 436(sp) + addw t2, t6, t3 + subw t4, zero, t2 + sw t4, 436(sp) + lw t5, 440(sp) + addw t2, a7, t5 + subw t3, zero, t2 + sw t3, 440(sp) + ble t0, zero, label2 + mv t2, a4 + mv t3, zero .p2align 2 -label5: - slliw t3, a4, 13 - addiw t0, t0, 8 - addw t2, t3, a4 - slli t1, t2, 1 - srli t4, t1, 47 - add a4, t2, t4 - lui t4, 66 - sraiw t3, a4, 17 - addiw a4, t4, 33 - addw t1, t2, t3 - mulw t3, t1, a4 - slli t2, t3, 1 - srli t4, t2, 47 - add a6, t3, t4 - sraiw t6, a6, 17 - addw t2, t3, t6 - mulw t5, t2, a4 - slli t4, t5, 1 - srli a6, t4, 47 - add a7, t5, a6 - sraiw t6, a7, 17 - addw t3, t5, t6 - mulw t4, t3, a4 - slli a6, t4, 1 - srli t6, a6, 47 - add a7, t4, t6 - sraiw a6, a7, 17 - addw t5, t4, a6 - mulw t6, t5, a4 - slli a7, t6, 1 - srli a6, a7, 47 - add t4, t6, a6 - sraiw a7, t4, 17 - addw a6, t6, a7 - mulw t4, a6, a4 - slli a7, t4, 1 - srli t6, a7, 47 - add s1, t4, t6 - sraiw s2, s1, 17 - addw a7, t4, s2 - slli s4, a7, 5 - mulw s0, a7, a4 - slli t6, s0, 1 - srli s2, t6, 47 - add t4, s0, s2 - sraiw s1, t4, 17 - addw t6, s0, s1 - mulw t4, t6, a4 - slli s2, t4, 1 - srli s0, s2, 47 - add s3, t4, s0 - sraiw a4, s3, 17 - addw s1, t4, a4 - slli s2, s1, 5 - addw s0, s2, s1 - slli s3, s0, 1 - mv a4, s0 - srli t4, s3, 56 - slli s3, t6, 5 - add s1, s0, t4 - andi s2, s1, -256 - addw s1, s3, t6 - subw t4, s0, s2 - slli s0, s1, 1 - srli t6, s0, 56 - addw s0, s4, a7 - add s2, s1, t6 - andi s3, s2, -256 - slli s2, s0, 1 - subw t6, s1, s3 - srli a7, s2, 56 - slli s2, a6, 5 - add s1, s0, a7 - andi s3, s1, -256 - addw s1, s2, a6 - subw a7, s0, s3 - slli s3, s1, 1 - srli a6, s3, 56 - slli s3, t5, 5 - add s0, s1, a6 +label6: + slliw t4, t1, 13 + addiw t3, t3, 4 + addw t5, t4, t1 + slli a6, t5, 1 + srli a7, a6, 47 + add t6, t5, a7 + sraiw a6, t6, 17 + addw t4, t5, a6 + mulw t1, t4, a0 + slli t6, t1, 1 + srli a7, t6, 47 + add s0, t1, a7 + sraiw a6, s0, 17 + addw t5, t1, a6 + mulw t6, t5, a0 + slli a6, t6, 1 + srli t1, a6, 47 + add a7, t6, t1 + sraiw s0, a7, 17 + addw a6, t6, s0 + mulw t1, a6, a0 + slli s1, t1, 1 + srli t6, s1, 47 + add a7, t1, t6 + sraiw s0, a7, 17 + addw t6, t1, s0 + slli s2, t6, 5 + addw a7, s2, t6 + slli s1, a7, 1 + mv t1, a7 + srli t6, s1, 56 + slli s1, a6, 5 + add s0, a7, t6 andi s2, s0, -256 - addw s0, s3, t5 - subw a6, s1, s2 - slli s1, s0, 1 + subw t6, a7, s2 + addw a7, s1, a6 + slli s0, a7, 1 + srli a6, s0, 56 + slli s0, t5, 5 + add s2, a7, a6 + andi s1, s2, -256 + subw a6, a7, s1 + addw a7, s0, t5 + slli s1, a7, 1 srli t5, s1, 56 - slli s1, t3, 5 - add s2, s0, t5 - andi s3, s2, -256 - addw s2, s1, t3 - subw t5, s0, s3 - slli s0, s2, 1 - srli t3, s0, 56 - slli s0, t2, 5 - add s1, s2, t3 - andi s3, s1, -256 - addw s1, s0, t2 - subw t3, s2, s3 - slli s2, s1, 1 - srli t2, s2, 56 - slli s2, t1, 5 - add s0, s1, t2 - andi s3, s0, -256 - addw s0, s2, t1 - subw t2, s1, s3 - slli s2, t2, 32 - slli s3, s0, 1 - slli t2, t5, 32 - srli t1, s3, 56 - add s4, s0, t1 - andi s1, s4, -256 - subw t1, s0, s1 - add.uw s0, t3, t2 - add.uw s3, t1, s2 - slli t2, t4, 32 - slli t1, a7, 32 - 
sd s3, 0(a5) - add.uw t3, a6, t1 - sd s0, 8(a5) - add.uw t1, t6, t2 - sd t3, 16(a5) - li t2, 125 - sd t1, 24(a5) - slli t1, t2, 8 - blt t0, t1, label9 - li t1, 125 - li t3, 128 - ld a5, 104(sp) - slli t0, t1, 10 - addi t1, t0, 8 - add t2, a5, t0 - srli t0, t1, 2 - add a5, a5, t1 - sw t3, 0(t2) - mv t3, a2 - j label11 -.p2align 2 -label50: - addi t1, a5, 4 - mv t0, t2 - mv t3, a5 - mv a5, t1 + slli s1, t4, 5 + add s2, a7, t5 + andi s0, s2, -256 + subw t5, a7, s0 + addw a7, s1, t4 + slli s0, a7, 1 + srli t4, s0, 56 + slli s0, t5, 32 + add s2, a7, t4 + slli t5, t6, 32 + andi s1, s2, -256 + subw t4, a7, s1 + add.uw a7, t4, s0 + add.uw t4, a6, t5 + sd a7, 0(t2) + sd t4, 8(t2) + bge t3, a1, label95 + addi t2, t2, 16 + j label6 .p2align 2 -label11: - andi t1, t0, 63 - addiw t2, t0, 1 - li t4, 60 - sw zero, 0(t3) - bne t1, t4, label50 +label95: + li t4, 125 + li t5, 128 + mv t6, a5 + li a6, 60 + slli t3, t4, 10 + addi t4, t3, 8 + add t2, a4, t3 + srli t3, t4, 2 + sw t5, 0(t2) + addiw t5, t3, 1 + add t2, a4, t4 sw zero, 0(a5) - li t1, 125 - addiw t0, t0, 4 - mv t6, zero - lui t3, 422994 - lui t5, 982235 - lui a7, 802094 - addiw t2, t5, -1143 - sw zero, 4(a5) - addiw t5, a7, 496 - xori t4, t2, -1 - sw t1, 8(a5) - addiw t1, t3, 769 - sw zero, 12(a5) - xori t3, t1, -1 + andi t4, t3, 63 + beq t4, a6, label16 +.p2align 2 +label51: + addi t4, t2, 4 + mv t3, t5 + mv t6, t2 + mv t2, t4 + andi t4, t5, 63 + addiw t5, t5, 1 + li a6, 60 + sw zero, 0(t6) + bne t4, a6, label51 +.p2align 2 +label16: + sw zero, 0(t2) + li t4, 125 + addiw t3, t3, 4 + mv s0, zero + lui t6, 422994 + lui a7, 982235 + lui s2, 802094 + addiw t5, a7, -1143 + sw zero, 4(t2) + addiw a7, s2, 496 + sw t4, 8(t2) + addiw t4, t6, 769 + sw zero, 12(t2) + xori t6, t5, -1 + xori a6, t4, -1 + mv t2, a4 + sd zero, 104(sp) sd zero, 112(sp) sd zero, 120(sp) sd zero, 128(sp) @@ -281,364 +227,258 @@ label11: sd zero, 400(sp) sd zero, 408(sp) sd zero, 416(sp) - sd zero, 424(sp) - ld a5, 104(sp) - addi a6, sp, 112 - mv a7, zero - j label24 + addi s1, sp, 104 + mv s2, zero + j label25 .p2align 2 -label27: - addi a6, a6, 32 +label352: + addiw s0, s0, 64 + addw t4, t4, s8 + addw t5, t5, s3 + addw a6, a6, s9 + addw t6, t6, s2 + addw a7, a7, s4 + ble t3, s0, label49 + addi t2, t2, 256 + addi s1, sp, 104 + mv s2, zero .p2align 2 -label24: - slliw s2, a7, 2 - sh2add s0, s2, a5 - lw s1, 0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s5 - slli s6, s4, 8 - lw s3, 12(s0) - addiw s4, a7, 1 - addw s1, s2, s6 - slliw s2, s4, 2 - addw s5, s1, s3 - sh2add s0, s2, a5 - sw s5, 0(a6) - lw s1, 0(s0) - slli s6, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s6 - slli s5, s4, 8 - lw s3, 12(s0) - addw s1, s2, s5 - addiw s2, a7, 2 - addw s4, s1, s3 - slliw s3, s2, 2 - sh2add s0, s3, a5 - sw s4, 4(a6) - lw s1, 0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s5 - slli s6, s4, 8 - lw s3, 12(s0) - addw s1, s2, s6 - addiw s2, a7, 3 - addw s4, s1, s3 - slliw s3, s2, 2 - sh2add s0, s3, a5 - sw s4, 8(a6) - lw s1, 0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - lw s6, 8(s0) - slli s3, s4, 16 - slli s4, s6, 8 - addw s2, s3, s5 - lw s3, 12(s0) - addw s1, s2, s4 - addiw s4, a7, 4 - addw s5, s1, s3 - slliw s2, s4, 2 - sh2add s0, s2, a5 - sw s5, 12(a6) - lw s1, 0(s0) - slli s6, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s6 - slli s5, s4, 8 - lw s3, 12(s0) - addw s1, s2, s5 - addiw s2, a7, 5 - addw s4, s1, s3 - slliw s3, s2, 2 - sh2add s0, s3, a5 - sw s4, 16(a6) - lw s1, 
0(s0) - slli s5, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s5 - slli s6, s4, 8 - addiw s5, a7, 6 - lw s3, 12(s0) - addw s1, s2, s6 - addw s4, s1, s3 - slliw s1, s5, 2 - sh2add s0, s1, a5 - sw s4, 20(a6) - lw s2, 0(s0) - slli s5, s2, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s1, s3, s5 - slli s6, s4, 8 - lw s3, 12(s0) - addiw s4, a7, 7 - addw s2, s1, s6 - addiw a7, a7, 8 - addw s5, s2, s3 - slliw s2, s4, 2 - sh2add s0, s2, a5 - sw s5, 24(a6) - lw s1, 0(s0) - slli s6, s1, 24 - lw s4, 4(s0) - slli s3, s4, 16 - lw s4, 8(s0) - addw s2, s3, s6 - slli s5, s4, 8 - lw s3, 12(s0) - addw s1, s2, s5 - li s0, 16 - addw s4, s1, s3 - sw s4, 28(a6) - blt a7, s0, label27 - addi a6, sp, 112 - li a7, 16 - addi a6, a6, 64 +label25: + slliw s4, s2, 2 + sh2add s3, s4, t2 + lw s5, 0(s3) + slli s8, s5, 24 + lw s7, 4(s3) + lw s9, 8(s3) + slli s6, s7, 16 + slli s7, s9, 8 + addw s4, s6, s8 + lw s8, 12(s3) + addw s5, s4, s7 + addiw s7, s2, 1 + addw s6, s5, s8 + slliw s5, s7, 2 + sh2add s3, s5, t2 + sw s6, 0(s1) + lw s4, 0(s3) + slli s9, s4, 24 + lw s7, 4(s3) + lw s8, 8(s3) + slli s6, s7, 16 + slli s7, s8, 8 + addw s5, s6, s9 + lw s6, 12(s3) + addw s4, s5, s7 + addiw s7, s2, 2 + addw s8, s4, s6 + slliw s5, s7, 2 + sh2add s3, s5, t2 + sw s8, 4(s1) + lw s4, 0(s3) + slli s8, s4, 24 + lw s7, 4(s3) + lw s9, 8(s3) + slli s6, s7, 16 + slli s7, s9, 8 + addw s5, s6, s8 + lw s6, 12(s3) + addw s4, s5, s7 + addiw s5, s2, 3 + addw s8, s4, s6 + addiw s2, s2, 4 + slliw s6, s5, 2 + sh2add s3, s6, t2 + sw s8, 8(s1) + lw s4, 0(s3) + slli s8, s4, 24 + lw s7, 4(s3) + slli s6, s7, 16 + lw s7, 8(s3) + addw s5, s6, s8 + slli s9, s7, 8 + lw s8, 12(s3) + addw s4, s5, s9 + li s3, 16 + addw s6, s4, s8 + sw s6, 12(s1) + bge s2, s3, label263 + addi s1, s1, 16 + j label25 .p2align 2 -label29: - lw s0, -12(a6) - addiw a7, a7, 8 - lw s2, -32(a6) - lw s4, -56(a6) - addw s3, s0, s2 - lw s7, -64(a6) - addw s6, s3, s7 - subw s1, s4, s6 - srliw s5, s1, 31 - add s3, s1, s5 - andi s7, s3, -2 - subw s6, s1, s7 - sh1add s5, s1, s6 - sw s5, 0(a6) - lw s7, -8(a6) - lw s1, -28(a6) - addw s8, s7, s1 - lw s3, -52(a6) - lw s11, -60(a6) - addw s10, s5, s3 - addw s9, s8, s11 - subw s6, s3, s9 - srliw s5, s6, 31 - add s11, s6, s5 - andi s3, s11, -2 - subw s8, s6, s3 - sh1add s5, s6, s8 - sw s5, 4(a6) - lw s3, -4(a6) - lw s11, -24(a6) - addw s6, s4, s3 - lw s9, -48(a6) - addw s8, s6, s11 - subw s4, s9, s8 - srliw s6, s4, 31 - add s11, s4, s6 - andi s8, s11, -2 - subw s6, s4, s8 - sh1add s11, s4, s6 - sw s11, 8(a6) - lw s8, -20(a6) - addw s11, s10, s8 - lw s8, -44(a6) - subw s10, s8, s11 - srliw s11, s10, 31 - sh1add s7, s10, s7 - add s11, s10, s11 - andi s11, s11, -2 - subw s11, s10, s11 - sh1add s10, s10, s11 - addw s7, s7, s11 - sw s10, 12(a6) - lw s11, -16(a6) - lw s10, -40(a6) - addw s7, s7, s10 - subw s7, s2, s7 - srliw s2, s7, 31 - add s2, s7, s2 - andi s2, s2, -2 - subw s2, s7, s2 - sh1add s2, s7, s2 - addw s7, s5, s9 - addw s5, s11, s7 - subw s7, s10, s5 - srliw s9, s7, 31 - add s11, s7, s9 - andi s10, s11, -2 - sh1add s11, s4, s0 - subw s5, s7, s10 - addw s10, s6, s11 - sh1add s9, s7, s5 - sw s9, 16(a6) - lw s0, -36(a6) - addw s9, s8, s10 - subw s4, s0, s9 - srliw s6, s4, 31 - add s8, s4, s6 - andi s10, s8, -2 - sh1add s8, s7, s3 - subw s9, s4, s10 - sh1add s6, s4, s9 - addw s4, s5, s8 - sw s6, 20(a6) - addw s6, s0, s4 - subw s3, s1, s6 - srliw s5, s3, 31 - add s4, s3, s5 - andi s0, s4, -2 - subw s6, s3, s0 - sh1add s5, s3, s6 - slli s1, s5, 32 - add.uw s0, s2, s1 - sd s0, 24(a6) - bge a7, a0, label32 - addi a6, a6, 
32 - j label29 +label263: + addi s1, sp, 104 + li s2, 16 + addi s1, s1, 64 .p2align 2 -label32: - addi a6, sp, 112 - mv s2, zero - mv s0, t1 - mv s3, t2 - mv a7, t3 - mv s1, t4 - mv s4, t5 - blt zero, a1, label461 - addw s6, t2, t3 - li s7, 1 - mv s9, zero - subw s8, s6, t4 - bne s7, zero, label848 - mv s9, s8 - li s6, 1 - mv s5, s8 - bne s6, zero, label850 - lui s10, 586172 - lui s11, 828972 - lui s9, 454047 - addiw s8, s11, 262 - addiw s7, s9, -1151 - j label880 +label30: + lw s4, -12(s1) + addiw s2, s2, 4 + lw s7, -32(s1) + lw s3, -56(s1) + addw s5, s4, s7 + lw s6, -64(s1) + addw s7, s5, s6 + subw s4, s3, s7 + srliw s8, s4, 31 + add s6, s4, s8 + andi s7, s6, -2 + subw s9, s4, s7 + sh1add s5, s4, s9 + sw s5, 0(s1) + lw s7, -8(s1) + lw s8, -28(s1) + addw s9, s7, s8 + lw s6, -52(s1) + lw s10, -60(s1) + addw s4, s5, s6 + addw s11, s9, s10 + subw s5, s6, s11 + srliw s7, s5, 31 + add s8, s5, s7 + andi s6, s8, -2 + subw s9, s5, s6 + sh1add s10, s5, s9 + sw s10, 4(s1) + lw s7, -4(s1) + lw s9, -24(s1) + addw s5, s3, s7 + lw s8, -48(s1) + addw s6, s5, s9 + subw s3, s8, s6 + srliw s7, s3, 31 + add s9, s3, s7 + andi s6, s9, -2 + subw s5, s3, s6 + sh1add s7, s3, s5 + sw s7, 8(s1) + lw s6, -20(s1) + lw s7, -44(s1) + addw s5, s4, s6 + subw s3, s7, s5 + srliw s4, s3, 31 + add s6, s3, s4 + andi s5, s6, -2 + subw s7, s3, s5 + sh1add s4, s3, s7 + sw s4, 12(s1) + bge s2, a2, label323 + addi s1, s1, 16 + j label30 .p2align 2 -label461: - lui s6, 370728 +label323: + addi s1, sp, 104 mv s5, zero - addiw s7, s6, -1639 + mv s3, t4 + mv s6, t5 + mv s2, a6 + mv s4, t6 + mv s7, a7 + blt zero, a3, label329 + addw s9, t5, a6 + li s10, 1 + mv s11, zero + subw s8, s9, t6 + bne s10, zero, label619 + mv s11, s8 + li s9, 1 + bne s9, zero, label621 + lui s10, 828972 + addiw s11, s10, 262 + lui s10, 454047 + addiw s10, s10, -1151 + j label656 .p2align 2 -label42: - slliw s8, s0, 5 - addiw s2, s2, 1 - addw s10, s4, s8 - slli s8, s0, 1 - addw s9, s7, s10 - srli s4, s8, 59 - addw s6, s5, s9 - add s9, s0, s4 - slliw s4, s3, 30 - andi s5, s9, -32 - lw s9, 0(a6) - subw s8, s0, s5 - addw s7, s6, s8 - slli s8, s3, 1 - addw s5, s7, s9 - srli s10, s8, 34 - add s6, s3, s10 - sraiw s9, s6, 30 - slli s7, s9, 30 - subw s8, s3, s7 - addw s6, s4, s8 - bge s2, a0, label484 - addi a6, a6, 4 - mv s3, s0 - mv s4, s1 - mv s0, s5 - mv s1, a7 - mv a7, s6 - blt s2, a1, label461 - addw s6, s3, s6 - slti s7, s2, 60 - mv s9, zero - subw s8, s6, s1 - bne s7, zero, label848 - mv s9, s8 - slti s6, s2, 40 - mv s5, s8 - bne s6, zero, label850 - lui s10, 586172 - lui s11, 828972 - lui s9, 454047 - addiw s8, s11, 262 - addiw s7, s9, -1151 - j label880 +label329: + lui s9, 370728 + mv s8, zero + addiw s10, s9, -1639 .p2align 2 -label848: - slti s6, s2, 40 - mv s5, s8 - bne s6, zero, label850 - mv s5, s9 +label44: + slliw s9, s3, 5 + addiw s5, s5, 1 + addw s11, s7, s9 + addw s7, s10, s11 + slli s11, s3, 1 + addw s9, s8, s7 + srli s10, s11, 59 + lw s11, 0(s1) + add s8, s3, s10 + andi s7, s8, -32 + subw s8, s3, s7 + slliw s7, s6, 30 + addw s10, s9, s8 + addw s8, s10, s11 + slli s11, s6, 1 + srli s10, s11, 34 + add s9, s6, s10 + sraiw s11, s9, 30 + slli s9, s11, 30 + subw s10, s6, s9 + addw s9, s7, s10 + bge s5, a2, label352 + addi s1, s1, 4 + mv s6, s3 + mv s7, s4 + mv s3, s8 + mv s4, s2 + mv s2, s9 + blt s5, a3, label329 + addw s9, s6, s9 + slti s10, s5, 60 + mv s11, zero + subw s8, s9, s4 + bne s10, zero, label619 + mv s11, s8 + slti s9, s5, 40 + bne s9, zero, label621 + lui s10, 828972 + addiw s11, s10, 262 + lui s10, 454047 + addiw s10, s10, -1151 + j 
label656 .p2align 2 -label850: - lui s10, 586172 - addiw s8, s10, -804 - beq s7, zero, label851 +label619: + slti s9, s5, 40 + bne s9, zero, label621 + mv s8, s11 .p2align 2 -label852: - lui s9, 454047 - addiw s7, s9, -1151 - bne s6, zero, label42 +label621: + lui s11, 586172 + addiw s11, s11, -804 + beq s10, zero, label622 .p2align 2 -label880: - mv s7, s8 - j label42 +label623: + lui s10, 454047 + addiw s10, s10, -1151 + bne s9, zero, label44 .p2align 2 -label484: - addiw t6, t6, 64 - addw t1, t1, s5 - addw t2, t2, s0 - addw t3, t3, s6 - addw t4, t4, a7 - addw t5, t5, s1 - ble t0, t6, label47 - addi a5, a5, 256 - addi a6, sp, 112 - mv a7, zero - j label24 -label51: +label656: + mv s10, s11 + j label44 +label2: li a0, 184 jal _sysy_stoptime li a0, 5 - addi a1, sp, 432 + addi a1, sp, 424 jal putarray - ld ra, 0(sp) mv a0, zero + ld ra, 0(sp) ld s0, 8(sp) ld s5, 16(sp) ld s1, 24(sp) ld s6, 32(sp) ld s2, 40(sp) - ld s3, 48(sp) - ld s4, 56(sp) + ld s4, 48(sp) + ld s3, 56(sp) ld s7, 64(sp) - ld s10, 72(sp) - ld s8, 80(sp) - ld s11, 88(sp) - ld s9, 96(sp) - addi sp, sp, 456 + ld s8, 72(sp) + ld s9, 80(sp) + ld s10, 88(sp) + ld s11, 96(sp) + addi sp, sp, 448 ret .p2align 2 -label851: - lui s11, 828972 - addiw s8, s11, 262 - j label852 +label622: + lui s10, 828972 + addiw s11, s10, 262 + j label623 diff --git a/tests/SysY2022/performance/crypto-3.sy.ir b/tests/SysY2022/performance/crypto-3.sy.ir index 4840c8cb7..adad5665d 100644 --- a/tests/SysY2022/performance/crypto-3.sy.ir +++ b/tests/SysY2022/performance/crypto-3.sy.ir @@ -107,16 +107,16 @@ func @main() -> i32 { NoRecurse Entry } { i32* %91 = getelementptr &([32768 * i32]* %8)[i64 0][i64 32001]; cbr i1 %2(prob = 0.984615), ^while.body, ^b; ^while.body: - i32 %92 = phi [^entry, i32 %0] [^b4, i32 %120]; - i32 %93 = phi [^entry, i32 %1] [^b4, i32 %448]; + i32 %92 = phi [^entry, i32 %0] [^b4, i32 %108]; + i32 %93 = phi [^entry, i32 %1] [^b4, i32 %319]; ubr ^while.body1; ^b: call (i32) -> void @stoptime(i32 184); call (i32, i32*) -> void @putarray(i32 5, i32* %3); ret i32 0; ^while.body1: - i32 %94 = phi [^while.body, i32 %92] [^while.body1, i32 %120]; - i32 %95 = phi [^while.body, i32 0] [^while.body1, i32 %144]; + i32 %94 = phi [^while.body, i32 %92] [^while.body1, i32 %108]; + i32 %95 = phi [^while.body, i32 0] [^while.body1, i32 %120]; i32 %96 = mul i32 %94, i32 8193; i32 %97 = sdiv i32 %96, i32 131072; i32 %98 = add i32 %96, i32 %97; @@ -129,74 +129,46 @@ func @main() -> i32 { NoRecurse Entry } { i32 %105 = mul i32 %104, i32 270369; i32 %106 = sdiv i32 %105, i32 131072; i32 %107 = add i32 %105, i32 %106; - i32 %108 = mul i32 %107, i32 270369; - i32 %109 = sdiv i32 %108, i32 131072; - i32 %110 = add i32 %108, i32 %109; - i32 %111 = mul i32 %110, i32 270369; - i32 %112 = sdiv i32 %111, i32 131072; - i32 %113 = add i32 %111, i32 %112; - i32 %114 = mul i32 %113, i32 270369; - i32 %115 = sdiv i32 %114, i32 131072; - i32 %116 = add i32 %114, i32 %115; - i32 %117 = mul i32 %116, i32 270369; - i32 %118 = sdiv i32 %117, i32 131072; - i32 %119 = add i32 %117, i32 %118; - i32 %120 = mul i32 %119, i32 33; - i32 %121 = srem i32 %120, i32 256; - i32 %122 = mul i32 %116, i32 33; - i32 %123 = srem i32 %122, i32 256; - i32 %124 = mul i32 %113, i32 33; - i32 %125 = srem i32 %124, i32 256; - i32 %126 = mul i32 %110, i32 33; - i32 %127 = srem i32 %126, i32 256; - i32 %128 = mul i32 %107, i32 33; - i32 %129 = srem i32 %128, i32 256; - i32 %130 = mul i32 %104, i32 33; - i32 %131 = srem i32 %130, i32 256; - i32 %132 = mul i32 %101, i32 33; - i32 %133 = srem i32 
%132, i32 256; - i32 %134 = mul i32 %98, i32 33; - i32 %135 = srem i32 %134, i32 256; - i32* %136 = getelementptr &([32768 * i32]* %8)[i64 0][i32 %95]; - store i32* %136 with i32 %135; - i32* %137 = getelementptr &(i32* %136)[i64 1]; - store i32* %137 with i32 %133; - i32* %138 = getelementptr &(i32* %136)[i64 2]; - store i32* %138 with i32 %131; - i32* %139 = getelementptr &(i32* %136)[i64 3]; - store i32* %139 with i32 %129; - i32* %140 = getelementptr &(i32* %136)[i64 4]; - store i32* %140 with i32 %127; - i32* %141 = getelementptr &(i32* %136)[i64 5]; - store i32* %141 with i32 %125; - i32* %142 = getelementptr &(i32* %136)[i64 6]; - store i32* %142 with i32 %123; - i32* %143 = getelementptr &(i32* %136)[i64 7]; - store i32* %143 with i32 %121; - i32 %144 = add i32 %95, i32 8; - i1 %145 = icmp slt i32 %144, i32 32000; - cbr i1 %145(prob = 0.99975), ^while.body1, ^postbody; + i32 %108 = mul i32 %107, i32 33; + i32 %109 = srem i32 %108, i32 256; + i32 %110 = mul i32 %104, i32 33; + i32 %111 = srem i32 %110, i32 256; + i32 %112 = mul i32 %101, i32 33; + i32 %113 = srem i32 %112, i32 256; + i32 %114 = mul i32 %98, i32 33; + i32 %115 = srem i32 %114, i32 256; + i32* %116 = getelementptr &([32768 * i32]* %8)[i64 0][i32 %95]; + store i32* %116 with i32 %115; + i32* %117 = getelementptr &(i32* %116)[i64 1]; + store i32* %117 with i32 %113; + i32* %118 = getelementptr &(i32* %116)[i64 2]; + store i32* %118 with i32 %111; + i32* %119 = getelementptr &(i32* %116)[i64 3]; + store i32* %119 with i32 %109; + i32 %120 = add i32 %95, i32 4; + i1 %121 = icmp slt i32 %120, i32 32000; + cbr i1 %121(prob = 0.999875), ^while.body1, ^postbody; ^postbody: store i32* %90 with i32 128; ubr ^while.body2; ^while.body2: - i32 %146 = phi [^postbody, i32 32002] [^while.body2, i32 %151]; - i32* %147 = phi [^postbody, i32* %91] [^while.body2, i32* %150]; - i32 %148 = and i32 %146, i32 63; - i1 %149 = icmp neq i32 %148, i32 60; - i32* %150 = getelementptr &(i32* %9)[i32 %146]; - store i32* %147 with i32 0; - i32 %151 = add i32 %146, i32 1; - cbr i1 %149(prob = 0.984615), ^while.body2, ^b1; + i32 %122 = phi [^postbody, i32 32002] [^while.body2, i32 %127]; + i32* %123 = phi [^postbody, i32* %91] [^while.body2, i32* %126]; + i32 %124 = and i32 %122, i32 63; + i1 %125 = icmp neq i32 %124, i32 60; + i32* %126 = getelementptr &(i32* %9)[i32 %122]; + store i32* %123 with i32 0; + i32 %127 = add i32 %122, i32 1; + cbr i1 %125(prob = 0.984615), ^while.body2, ^b1; ^b1: - store i32* %150 with i32 0; - i32* %152 = getelementptr &(i32* %150)[i64 1]; - store i32* %152 with i32 0; - i32* %153 = getelementptr &(i32* %150)[i64 2]; - store i32* %153 with i32 125; - i32* %154 = getelementptr &(i32* %150)[i64 3]; - store i32* %154 with i32 0; - i32 %155 = add i32 %146, i32 4; + store i32* %126 with i32 0; + i32* %128 = getelementptr &(i32* %126)[i64 1]; + store i32* %128 with i32 0; + i32* %129 = getelementptr &(i32* %126)[i64 2]; + store i32* %129 with i32 125; + i32* %130 = getelementptr &(i32* %126)[i64 3]; + store i32* %130 with i32 0; + i32 %131 = add i32 %122, i32 4; store i32* %10 with i32 0; store i32* %11 with i32 0; store i32* %12 with i32 0; @@ -279,334 +251,221 @@ func @main() -> i32 { NoRecurse Entry } { store i32* %89 with i32 0; ubr ^while.body3; ^while.body3: - i32 %156 = phi [^b1, i32 0] [^b3, i32 %426]; - i32 %157 = phi [^b1, i32 1732584193] [^b3, i32 %428]; - i32 %158 = phi [^b1, i32 -271733879] [^b3, i32 %429]; - i32 %159 = phi [^b1, i32 -1732584194] [^b3, i32 %430]; - i32 %160 = phi [^b1, i32 271733878] [^b3, i32 
%431]; - i32 %161 = phi [^b1, i32 -1009589776] [^b3, i32 %432]; - i32* %162 = getelementptr &(i32* %9)[i32 %156]; + i32 %132 = phi [^b1, i32 0] [^b3, i32 %297]; + i32 %133 = phi [^b1, i32 1732584193] [^b3, i32 %299]; + i32 %134 = phi [^b1, i32 -271733879] [^b3, i32 %300]; + i32 %135 = phi [^b1, i32 -1732584194] [^b3, i32 %301]; + i32 %136 = phi [^b1, i32 271733878] [^b3, i32 %302]; + i32 %137 = phi [^b1, i32 -1009589776] [^b3, i32 %303]; + i32* %138 = getelementptr &(i32* %9)[i32 %132]; ubr ^while.body4; ^while.body4: - i32 %163 = phi [^while.body3, i32 0] [^while.body4, i32 %299]; - i32 %164 = mul i32 %163, i32 4; - i32* %165 = getelementptr &(i32* %162)[i32 %164]; + i32 %139 = phi [^while.body3, i32 0] [^while.body4, i32 %207]; + i32 %140 = mul i32 %139, i32 4; + i32* %141 = getelementptr &(i32* %138)[i32 %140]; + i32 %142 = load i32* %141; + i32* %143 = getelementptr &(i32* %141)[i64 1]; + i32 %144 = load i32* %143; + i32 %145 = mul i32 %144, i32 65536; + i32 %146 = mul i32 %142, i32 16777216; + i32 %147 = add i32 %145, i32 %146; + i32* %148 = getelementptr &(i32* %141)[i64 2]; + i32 %149 = load i32* %148; + i32 %150 = mul i32 %149, i32 256; + i32 %151 = add i32 %147, i32 %150; + i32* %152 = getelementptr &(i32* %141)[i64 3]; + i32 %153 = load i32* %152; + i32 %154 = add i32 %151, i32 %153; + i32* %155 = getelementptr &([80 * i32]* %words)[i64 0][i32 %139]; + store i32* %155 with i32 %154; + i32 %156 = add i32 %139, i32 1; + i32 %157 = mul i32 %156, i32 4; + i32* %158 = getelementptr &(i32* %138)[i32 %157]; + i32 %159 = load i32* %158; + i32* %160 = getelementptr &(i32* %158)[i64 1]; + i32 %161 = load i32* %160; + i32 %162 = mul i32 %161, i32 65536; + i32 %163 = mul i32 %159, i32 16777216; + i32 %164 = add i32 %162, i32 %163; + i32* %165 = getelementptr &(i32* %158)[i64 2]; i32 %166 = load i32* %165; - i32* %167 = getelementptr &(i32* %165)[i64 1]; - i32 %168 = load i32* %167; - i32 %169 = mul i32 %168, i32 65536; - i32 %170 = mul i32 %166, i32 16777216; - i32 %171 = add i32 %169, i32 %170; - i32* %172 = getelementptr &(i32* %165)[i64 2]; - i32 %173 = load i32* %172; - i32 %174 = mul i32 %173, i32 256; - i32 %175 = add i32 %171, i32 %174; - i32* %176 = getelementptr &(i32* %165)[i64 3]; - i32 %177 = load i32* %176; - i32 %178 = add i32 %175, i32 %177; - i32* %179 = getelementptr &([80 * i32]* %words)[i64 0][i32 %163]; - store i32* %179 with i32 %178; - i32 %180 = add i32 %163, i32 1; - i32 %181 = mul i32 %180, i32 4; - i32* %182 = getelementptr &(i32* %162)[i32 %181]; + i32 %167 = mul i32 %166, i32 256; + i32 %168 = add i32 %164, i32 %167; + i32* %169 = getelementptr &(i32* %158)[i64 3]; + i32 %170 = load i32* %169; + i32 %171 = add i32 %168, i32 %170; + i32* %172 = getelementptr &(i32* %155)[i64 1]; + store i32* %172 with i32 %171; + i32 %173 = add i32 %139, i32 2; + i32 %174 = mul i32 %173, i32 4; + i32* %175 = getelementptr &(i32* %138)[i32 %174]; + i32 %176 = load i32* %175; + i32* %177 = getelementptr &(i32* %175)[i64 1]; + i32 %178 = load i32* %177; + i32 %179 = mul i32 %178, i32 65536; + i32 %180 = mul i32 %176, i32 16777216; + i32 %181 = add i32 %179, i32 %180; + i32* %182 = getelementptr &(i32* %175)[i64 2]; i32 %183 = load i32* %182; - i32* %184 = getelementptr &(i32* %182)[i64 1]; - i32 %185 = load i32* %184; - i32 %186 = mul i32 %185, i32 65536; - i32 %187 = mul i32 %183, i32 16777216; - i32 %188 = add i32 %186, i32 %187; - i32* %189 = getelementptr &(i32* %182)[i64 2]; - i32 %190 = load i32* %189; - i32 %191 = mul i32 %190, i32 256; - i32 %192 = add i32 %188, i32 %191; - 
i32* %193 = getelementptr &(i32* %182)[i64 3]; - i32 %194 = load i32* %193; - i32 %195 = add i32 %192, i32 %194; - i32* %196 = getelementptr &(i32* %179)[i64 1]; - store i32* %196 with i32 %195; - i32 %197 = add i32 %163, i32 2; - i32 %198 = mul i32 %197, i32 4; - i32* %199 = getelementptr &(i32* %162)[i32 %198]; + i32 %184 = mul i32 %183, i32 256; + i32 %185 = add i32 %181, i32 %184; + i32* %186 = getelementptr &(i32* %175)[i64 3]; + i32 %187 = load i32* %186; + i32 %188 = add i32 %185, i32 %187; + i32* %189 = getelementptr &(i32* %155)[i64 2]; + store i32* %189 with i32 %188; + i32 %190 = add i32 %139, i32 3; + i32 %191 = mul i32 %190, i32 4; + i32* %192 = getelementptr &(i32* %138)[i32 %191]; + i32 %193 = load i32* %192; + i32* %194 = getelementptr &(i32* %192)[i64 1]; + i32 %195 = load i32* %194; + i32 %196 = mul i32 %195, i32 65536; + i32 %197 = mul i32 %193, i32 16777216; + i32 %198 = add i32 %196, i32 %197; + i32* %199 = getelementptr &(i32* %192)[i64 2]; i32 %200 = load i32* %199; - i32* %201 = getelementptr &(i32* %199)[i64 1]; - i32 %202 = load i32* %201; - i32 %203 = mul i32 %202, i32 65536; - i32 %204 = mul i32 %200, i32 16777216; - i32 %205 = add i32 %203, i32 %204; - i32* %206 = getelementptr &(i32* %199)[i64 2]; - i32 %207 = load i32* %206; - i32 %208 = mul i32 %207, i32 256; - i32 %209 = add i32 %205, i32 %208; - i32* %210 = getelementptr &(i32* %199)[i64 3]; - i32 %211 = load i32* %210; - i32 %212 = add i32 %209, i32 %211; - i32* %213 = getelementptr &(i32* %179)[i64 2]; - store i32* %213 with i32 %212; - i32 %214 = add i32 %163, i32 3; - i32 %215 = mul i32 %214, i32 4; - i32* %216 = getelementptr &(i32* %162)[i32 %215]; + i32 %201 = mul i32 %200, i32 256; + i32 %202 = add i32 %198, i32 %201; + i32* %203 = getelementptr &(i32* %192)[i64 3]; + i32 %204 = load i32* %203; + i32 %205 = add i32 %202, i32 %204; + i32* %206 = getelementptr &(i32* %155)[i64 3]; + store i32* %206 with i32 %205; + i32 %207 = add i32 %139, i32 4; + i1 %208 = icmp slt i32 %207, i32 16; + cbr i1 %208(prob = 0.75), ^while.body4, ^while.body5; + ^while.body5: + i32 %209 = phi [^while.body4, i32 16] [^while.body5, i32 %264]; + i32* %210 = getelementptr &([80 * i32]* %words)[i64 0][i32 %209]; + i32* %211 = getelementptr &(i32* %210)[i64 -3]; + i32 %212 = load i32* %211; + i32* %213 = getelementptr &(i32* %210)[i64 -8]; + i32 %214 = load i32* %213; + i32 %215 = add i32 %212, i32 %214; + i32* %216 = getelementptr &(i32* %210)[i64 -14]; i32 %217 = load i32* %216; - i32* %218 = getelementptr &(i32* %216)[i64 1]; + i32* %218 = getelementptr &(i32* %210)[i64 -16]; i32 %219 = load i32* %218; - i32 %220 = mul i32 %219, i32 65536; - i32 %221 = mul i32 %217, i32 16777216; - i32 %222 = add i32 %220, i32 %221; - i32* %223 = getelementptr &(i32* %216)[i64 2]; - i32 %224 = load i32* %223; - i32 %225 = mul i32 %224, i32 256; - i32 %226 = add i32 %222, i32 %225; - i32* %227 = getelementptr &(i32* %216)[i64 3]; + i32 %220 = add i32 %215, i32 %219; + i32 %221 = sub i32 %217, i32 %220; + i32 %222 = mul i32 %221, i32 2; + i32 %223 = srem i32 %221, i32 2; + i32 %224 = add i32 %222, i32 %223; + store i32* %210 with i32 %224; + i32* %225 = getelementptr &(i32* %210)[i64 -2]; + i32 %226 = load i32* %225; + i32* %227 = getelementptr &(i32* %210)[i64 -7]; i32 %228 = load i32* %227; - i32 %229 = add i32 %226, i32 %228; - i32* %230 = getelementptr &(i32* %179)[i64 3]; - store i32* %230 with i32 %229; - i32 %231 = add i32 %163, i32 4; - i32 %232 = mul i32 %231, i32 4; - i32* %233 = getelementptr &(i32* %162)[i32 %232]; + i32* %229 = 
getelementptr &(i32* %210)[i64 -13]; + i32 %230 = load i32* %229; + i32 %231 = add i32 %224, i32 %230; + i32 %232 = add i32 %226, i32 %228; + i32* %233 = getelementptr &(i32* %210)[i64 -15]; i32 %234 = load i32* %233; - i32* %235 = getelementptr &(i32* %233)[i64 1]; - i32 %236 = load i32* %235; - i32 %237 = mul i32 %236, i32 65536; - i32 %238 = mul i32 %234, i32 16777216; + i32 %235 = add i32 %232, i32 %234; + i32 %236 = sub i32 %230, i32 %235; + i32 %237 = mul i32 %236, i32 2; + i32 %238 = srem i32 %236, i32 2; i32 %239 = add i32 %237, i32 %238; - i32* %240 = getelementptr &(i32* %233)[i64 2]; - i32 %241 = load i32* %240; - i32 %242 = mul i32 %241, i32 256; - i32 %243 = add i32 %239, i32 %242; - i32* %244 = getelementptr &(i32* %233)[i64 3]; + i32* %240 = getelementptr &(i32* %210)[i64 1]; + store i32* %240 with i32 %239; + i32* %241 = getelementptr &(i32* %210)[i64 -1]; + i32 %242 = load i32* %241; + i32 %243 = add i32 %217, i32 %242; + i32* %244 = getelementptr &(i32* %210)[i64 -6]; i32 %245 = load i32* %244; i32 %246 = add i32 %243, i32 %245; - i32* %247 = getelementptr &(i32* %179)[i64 4]; - store i32* %247 with i32 %246; - i32 %248 = add i32 %163, i32 5; - i32 %249 = mul i32 %248, i32 4; - i32* %250 = getelementptr &(i32* %162)[i32 %249]; - i32 %251 = load i32* %250; - i32* %252 = getelementptr &(i32* %250)[i64 1]; - i32 %253 = load i32* %252; - i32 %254 = mul i32 %253, i32 65536; - i32 %255 = mul i32 %251, i32 16777216; - i32 %256 = add i32 %254, i32 %255; - i32* %257 = getelementptr &(i32* %250)[i64 2]; + i32* %247 = getelementptr &(i32* %210)[i64 -12]; + i32 %248 = load i32* %247; + i32 %249 = sub i32 %248, i32 %246; + i32 %250 = mul i32 %249, i32 2; + i32 %251 = srem i32 %249, i32 2; + i32 %252 = add i32 %250, i32 %251; + i32* %253 = getelementptr &(i32* %210)[i64 2]; + store i32* %253 with i32 %252; + i32* %254 = getelementptr &(i32* %210)[i64 -5]; + i32 %255 = load i32* %254; + i32 %256 = add i32 %231, i32 %255; + i32* %257 = getelementptr &(i32* %210)[i64 -11]; i32 %258 = load i32* %257; - i32 %259 = mul i32 %258, i32 256; - i32 %260 = add i32 %256, i32 %259; - i32* %261 = getelementptr &(i32* %250)[i64 3]; - i32 %262 = load i32* %261; - i32 %263 = add i32 %260, i32 %262; - i32* %264 = getelementptr &(i32* %179)[i64 5]; - store i32* %264 with i32 %263; - i32 %265 = add i32 %163, i32 6; - i32 %266 = mul i32 %265, i32 4; - i32* %267 = getelementptr &(i32* %162)[i32 %266]; - i32 %268 = load i32* %267; - i32* %269 = getelementptr &(i32* %267)[i64 1]; - i32 %270 = load i32* %269; - i32 %271 = mul i32 %270, i32 65536; - i32 %272 = mul i32 %268, i32 16777216; - i32 %273 = add i32 %271, i32 %272; - i32* %274 = getelementptr &(i32* %267)[i64 2]; - i32 %275 = load i32* %274; - i32 %276 = mul i32 %275, i32 256; - i32 %277 = add i32 %273, i32 %276; - i32* %278 = getelementptr &(i32* %267)[i64 3]; - i32 %279 = load i32* %278; - i32 %280 = add i32 %277, i32 %279; - i32* %281 = getelementptr &(i32* %179)[i64 6]; - store i32* %281 with i32 %280; - i32 %282 = add i32 %163, i32 7; - i32 %283 = mul i32 %282, i32 4; - i32* %284 = getelementptr &(i32* %162)[i32 %283]; - i32 %285 = load i32* %284; - i32* %286 = getelementptr &(i32* %284)[i64 1]; - i32 %287 = load i32* %286; - i32 %288 = mul i32 %287, i32 65536; - i32 %289 = mul i32 %285, i32 16777216; - i32 %290 = add i32 %288, i32 %289; - i32* %291 = getelementptr &(i32* %284)[i64 2]; - i32 %292 = load i32* %291; - i32 %293 = mul i32 %292, i32 256; - i32 %294 = add i32 %290, i32 %293; - i32* %295 = getelementptr &(i32* %284)[i64 3]; - i32 %296 = 
load i32* %295; - i32 %297 = add i32 %294, i32 %296; - i32* %298 = getelementptr &(i32* %179)[i64 7]; - store i32* %298 with i32 %297; - i32 %299 = add i32 %163, i32 8; - i1 %300 = icmp slt i32 %299, i32 16; - cbr i1 %300(prob = 0.5), ^while.body4, ^while.body5; - ^while.body5: - i32 %301 = phi [^while.body4, i32 16] [^while.body5, i32 %393]; - i32* %302 = getelementptr &([80 * i32]* %words)[i64 0][i32 %301]; - i32* %303 = getelementptr &(i32* %302)[i64 -3]; - i32 %304 = load i32* %303; - i32* %305 = getelementptr &(i32* %302)[i64 -8]; - i32 %306 = load i32* %305; - i32 %307 = add i32 %304, i32 %306; - i32* %308 = getelementptr &(i32* %302)[i64 -14]; - i32 %309 = load i32* %308; - i32* %310 = getelementptr &(i32* %302)[i64 -16]; - i32 %311 = load i32* %310; - i32 %312 = add i32 %307, i32 %311; - i32 %313 = sub i32 %309, i32 %312; - i32 %314 = mul i32 %313, i32 2; - i32 %315 = srem i32 %313, i32 2; - i32 %316 = add i32 %314, i32 %315; - store i32* %302 with i32 %316; - i32* %317 = getelementptr &(i32* %302)[i64 -2]; - i32 %318 = load i32* %317; - i32* %319 = getelementptr &(i32* %302)[i64 -7]; - i32 %320 = load i32* %319; - i32* %321 = getelementptr &(i32* %302)[i64 -13]; - i32 %322 = load i32* %321; - i32 %323 = add i32 %316, i32 %322; - i32 %324 = add i32 %318, i32 %320; - i32* %325 = getelementptr &(i32* %302)[i64 -15]; - i32 %326 = load i32* %325; - i32 %327 = add i32 %324, i32 %326; - i32 %328 = sub i32 %322, i32 %327; - i32 %329 = mul i32 %328, i32 2; - i32 %330 = srem i32 %328, i32 2; - i32 %331 = add i32 %329, i32 %330; - i32* %332 = getelementptr &(i32* %302)[i64 1]; - store i32* %332 with i32 %331; - i32* %333 = getelementptr &(i32* %302)[i64 -1]; - i32 %334 = load i32* %333; - i32 %335 = add i32 %309, i32 %334; - i32* %336 = getelementptr &(i32* %302)[i64 -6]; - i32 %337 = load i32* %336; - i32 %338 = add i32 %335, i32 %337; - i32* %339 = getelementptr &(i32* %302)[i64 -12]; - i32 %340 = load i32* %339; - i32 %341 = sub i32 %340, i32 %338; - i32 %342 = mul i32 %341, i32 2; - i32 %343 = srem i32 %341, i32 2; - i32 %344 = add i32 %342, i32 %343; - i32* %345 = getelementptr &(i32* %302)[i64 2]; - store i32* %345 with i32 %344; - i32* %346 = getelementptr &(i32* %302)[i64 -5]; - i32 %347 = load i32* %346; - i32 %348 = add i32 %323, i32 %347; - i32* %349 = getelementptr &(i32* %302)[i64 -11]; - i32 %350 = load i32* %349; - i32 %351 = sub i32 %350, i32 %348; - i32 %352 = mul i32 %351, i32 2; - i32 %353 = add i32 %318, i32 %352; - i32 %354 = srem i32 %351, i32 2; - i32 %355 = add i32 %353, i32 %354; - i32 %356 = add i32 %352, i32 %354; - i32* %357 = getelementptr &(i32* %302)[i64 3]; - store i32* %357 with i32 %356; - i32* %358 = getelementptr &(i32* %302)[i64 -4]; - i32 %359 = load i32* %358; - i32* %360 = getelementptr &(i32* %302)[i64 -10]; - i32 %361 = load i32* %360; - i32 %362 = add i32 %355, i32 %361; - i32 %363 = sub i32 %306, i32 %362; - i32 %364 = mul i32 %363, i32 2; - i32 %365 = srem i32 %363, i32 2; - i32 %366 = add i32 %364, i32 %365; - i32 %367 = add i32 %331, i32 %340; - i32 %368 = add i32 %359, i32 %367; - i32 %369 = sub i32 %361, i32 %368; - i32 %370 = mul i32 %369, i32 2; - i32 %371 = srem i32 %369, i32 2; - i32 %372 = add i32 %370, i32 %371; - i32* %373 = getelementptr &(i32* %302)[i64 4]; - store i32* %373 with i32 %372; - i32 %374 = add i32 %304, i32 %342; - i32 %375 = add i32 %343, i32 %374; - i32 %376 = add i32 %350, i32 %375; - i32* %377 = getelementptr &(i32* %302)[i64 -9]; - i32 %378 = load i32* %377; - i32 %379 = sub i32 %378, i32 %376; - i32 %380 = mul i32 
%379, i32 2; - i32 %381 = srem i32 %379, i32 2; - i32 %382 = add i32 %380, i32 %381; - i32* %383 = getelementptr &(i32* %302)[i64 5]; - store i32* %383 with i32 %382; - i32* %384 = getelementptr &(i32* %302)[i64 6]; - store i32* %384 with i32 %366; - i32 %385 = add i32 %334, i32 %370; - i32 %386 = add i32 %371, i32 %385; - i32 %387 = add i32 %378, i32 %386; - i32 %388 = sub i32 %320, i32 %387; - i32 %389 = mul i32 %388, i32 2; - i32 %390 = srem i32 %388, i32 2; - i32 %391 = add i32 %389, i32 %390; - i32* %392 = getelementptr &(i32* %302)[i64 7]; - store i32* %392 with i32 %391; - i32 %393 = add i32 %301, i32 8; - i1 %394 = icmp slt i32 %393, i32 80; - cbr i1 %394(prob = 0.875), ^while.body5, ^while.body6; + i32 %259 = sub i32 %258, i32 %256; + i32 %260 = mul i32 %259, i32 2; + i32 %261 = srem i32 %259, i32 2; + i32 %262 = add i32 %260, i32 %261; + i32* %263 = getelementptr &(i32* %210)[i64 3]; + store i32* %263 with i32 %262; + i32 %264 = add i32 %209, i32 4; + i1 %265 = icmp slt i32 %264, i32 80; + cbr i1 %265(prob = 0.9375), ^while.body5, ^while.body6; ^while.body6: - i32 %395 = phi [^while.body5, i32 0] [^b2, i32 %424]; - i32 %396 = phi [^while.body5, i32 %157] [^b2, i32 %420]; - i32 %397 = phi [^while.body5, i32 %158] [^b2, i32 %396]; - i32 %398 = phi [^while.body5, i32 %159] [^b2, i32 %423]; - i32 %399 = phi [^while.body5, i32 %160] [^b2, i32 %398]; - i32 %400 = phi [^while.body5, i32 %161] [^b2, i32 %399]; - i1 %401 = icmp slt i32 %395, i32 20; - cbr i1 %401(prob = 0.5), ^b2, ^if.else; + i32 %266 = phi [^while.body5, i32 0] [^b2, i32 %295]; + i32 %267 = phi [^while.body5, i32 %133] [^b2, i32 %291]; + i32 %268 = phi [^while.body5, i32 %134] [^b2, i32 %267]; + i32 %269 = phi [^while.body5, i32 %135] [^b2, i32 %294]; + i32 %270 = phi [^while.body5, i32 %136] [^b2, i32 %269]; + i32 %271 = phi [^while.body5, i32 %137] [^b2, i32 %270]; + i1 %272 = icmp slt i32 %266, i32 20; + cbr i1 %272(prob = 0.5), ^b2, ^if.else; ^if.else: - i32 %402 = add i32 %397, i32 %398; - i32 %403 = sub i32 %402, i32 %399; - i1 %404 = icmp slt i32 %395, i32 60; - i32 %405 = select i1 %404 ? i32 0 : i32 %403; - i1 %406 = icmp slt i32 %395, i32 40; - i32 %407 = select i1 %406 ? i32 %403 : i32 %405; - i32 %408 = select i1 %404 ? i32 -1894007588 : i32 -899497722; - i32 %409 = select i1 %406 ? i32 1859775361 : i32 %408; + i32 %273 = add i32 %268, i32 %269; + i32 %274 = sub i32 %273, i32 %270; + i1 %275 = icmp slt i32 %266, i32 60; + i32 %276 = select i1 %275 ? i32 0 : i32 %274; + i1 %277 = icmp slt i32 %266, i32 40; + i32 %278 = select i1 %277 ? i32 %274 : i32 %276; + i32 %279 = select i1 %275 ? i32 -1894007588 : i32 -899497722; + i32 %280 = select i1 %277 ? 
i32 1859775361 : i32 %279; ubr ^b2; ^b2: - i32 %410 = phi [^while.body6, i32 1518500249] [^if.else, i32 %409]; - i32 %411 = phi [^while.body6, i32 0] [^if.else, i32 %407]; - i32 %412 = mul i32 %396, i32 32; - i32 %413 = add i32 %400, i32 %412; - i32 %414 = add i32 %410, i32 %413; - i32 %415 = add i32 %411, i32 %414; - i32 %416 = srem i32 %396, i32 32; - i32 %417 = add i32 %415, i32 %416; - i32* %418 = getelementptr &([80 * i32]* %words)[i64 0][i32 %395]; - i32 %419 = load i32* %418; - i32 %420 = add i32 %417, i32 %419; - i32 %421 = mul i32 %397, i32 1073741824; - i32 %422 = srem i32 %397, i32 1073741824; - i32 %423 = add i32 %421, i32 %422; - i32 %424 = add i32 %395, i32 1; - i1 %425 = icmp slt i32 %424, i32 80; - cbr i1 %425(prob = 0.9875), ^while.body6, ^b3; + i32 %281 = phi [^while.body6, i32 1518500249] [^if.else, i32 %280]; + i32 %282 = phi [^while.body6, i32 0] [^if.else, i32 %278]; + i32 %283 = mul i32 %267, i32 32; + i32 %284 = add i32 %271, i32 %283; + i32 %285 = add i32 %281, i32 %284; + i32 %286 = add i32 %282, i32 %285; + i32 %287 = srem i32 %267, i32 32; + i32 %288 = add i32 %286, i32 %287; + i32* %289 = getelementptr &([80 * i32]* %words)[i64 0][i32 %266]; + i32 %290 = load i32* %289; + i32 %291 = add i32 %288, i32 %290; + i32 %292 = mul i32 %268, i32 1073741824; + i32 %293 = srem i32 %268, i32 1073741824; + i32 %294 = add i32 %292, i32 %293; + i32 %295 = add i32 %266, i32 1; + i1 %296 = icmp slt i32 %295, i32 80; + cbr i1 %296(prob = 0.9875), ^while.body6, ^b3; ^b3: - i32 %426 = add i32 %156, i32 64; - i1 %427 = icmp sgt i32 %155, i32 %426; - i32 %428 = add i32 %157, i32 %420; - i32 %429 = add i32 %158, i32 %396; - i32 %430 = add i32 %159, i32 %423; - i32 %431 = add i32 %160, i32 %398; - i32 %432 = add i32 %161, i32 %399; - cbr i1 %427(prob = 0.984615), ^while.body3, ^b4; + i32 %297 = add i32 %132, i32 64; + i1 %298 = icmp sgt i32 %131, i32 %297; + i32 %299 = add i32 %133, i32 %291; + i32 %300 = add i32 %134, i32 %267; + i32 %301 = add i32 %135, i32 %294; + i32 %302 = add i32 %136, i32 %269; + i32 %303 = add i32 %137, i32 %270; + cbr i1 %298(prob = 0.984615), ^while.body3, ^b4; ^b4: - i32 %433 = load i32* %3; - i32 %434 = add i32 %428, i32 %433; - i32 %435 = neg i32 %434; - store i32* %3 with i32 %435; - i32 %436 = load i32* %4; - i32 %437 = add i32 %429, i32 %436; - i32 %438 = neg i32 %437; - store i32* %4 with i32 %438; - i32 %439 = load i32* %5; - i32 %440 = add i32 %430, i32 %439; - i32 %441 = neg i32 %440; - store i32* %5 with i32 %441; - i32 %442 = load i32* %6; - i32 %443 = add i32 %431, i32 %442; - i32 %444 = neg i32 %443; - store i32* %6 with i32 %444; - i32 %445 = load i32* %7; - i32 %446 = add i32 %432, i32 %445; - i32 %447 = neg i32 %446; - store i32* %7 with i32 %447; - i32 %448 = add i32 %93, i32 -1; - i1 %449 = icmp sgt i32 %448, i32 0; - cbr i1 %449(prob = 0.984615), ^while.body, ^b; + i32 %304 = load i32* %3; + i32 %305 = add i32 %299, i32 %304; + i32 %306 = neg i32 %305; + store i32* %3 with i32 %306; + i32 %307 = load i32* %4; + i32 %308 = add i32 %300, i32 %307; + i32 %309 = neg i32 %308; + store i32* %4 with i32 %309; + i32 %310 = load i32* %5; + i32 %311 = add i32 %301, i32 %310; + i32 %312 = neg i32 %311; + store i32* %5 with i32 %312; + i32 %313 = load i32* %6; + i32 %314 = add i32 %302, i32 %313; + i32 %315 = neg i32 %314; + store i32* %6 with i32 %315; + i32 %316 = load i32* %7; + i32 %317 = add i32 %303, i32 %316; + i32 %318 = neg i32 %317; + store i32* %7 with i32 %318; + i32 %319 = add i32 %93, i32 -1; + i1 %320 = icmp sgt i32 %319, i32 0; + cbr 
i1 %320(prob = 0.984615), ^while.body, ^b; } diff --git a/tests/SysY2022/performance/dead-code-elimination-1.arm.s b/tests/SysY2022/performance/dead-code-elimination-1.arm.s index 45c031e78..a5722120f 100644 --- a/tests/SysY2022/performance/dead-code-elimination-1.arm.s +++ b/tests/SysY2022/performance/dead-code-elimination-1.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 global: .4byte 0 .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/dead-code-elimination-2.arm.s b/tests/SysY2022/performance/dead-code-elimination-2.arm.s index 45c031e78..a5722120f 100644 --- a/tests/SysY2022/performance/dead-code-elimination-2.arm.s +++ b/tests/SysY2022/performance/dead-code-elimination-2.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 global: .4byte 0 .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/dead-code-elimination-3.arm.s b/tests/SysY2022/performance/dead-code-elimination-3.arm.s index 45c031e78..a5722120f 100644 --- a/tests/SysY2022/performance/dead-code-elimination-3.arm.s +++ b/tests/SysY2022/performance/dead-code-elimination-3.arm.s @@ -1,11 +1,11 @@ .arch armv7ve .data .data -.align 4 +.p2align 2 global: .4byte 0 .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/derich1.arm.s b/tests/SysY2022/performance/derich1.arm.s index 849a39e5c..ea495efac 100644 --- a/tests/SysY2022/performance/derich1.arm.s +++ b/tests/SysY2022/performance/derich1.arm.s @@ -1,28 +1,28 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 imgIn: .zero 552960 -.align 8 +.p2align 3 imgOut: .zero 552960 -.align 8 +.p2align 3 my_y1: .zero 552960 -.align 8 +.p2align 3 my_y2: .zero 552960 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 8 .text diff --git a/tests/SysY2022/performance/derich1.riscv.s b/tests/SysY2022/performance/derich1.riscv.s index 85bd1315b..66ea25543 100644 --- a/tests/SysY2022/performance/derich1.riscv.s +++ b/tests/SysY2022/performance/derich1.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 3191654481 .4byte 1038821134 @@ -9,28 +9,28 @@ __cmmc_fp_constant_pool: .4byte 3191992809 .4byte 1038256634 .bss -.align 8 +.p2align 3 imgIn: .zero 552960 -.align 8 +.p2align 3 imgOut: .zero 552960 -.align 8 +.p2align 3 my_y1: .zero 552960 -.align 8 +.p2align 3 my_y2: .zero 552960 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 8 .text @@ -38,10 +38,10 @@ cmmc_parallel_body_payload_3: .globl main main: addi sp, sp, -48 -pcrel1503: +pcrel1491: auipc a1, %pcrel_hi(imgIn) sd ra, 0(sp) - addi a0, a1, %pcrel_lo(pcrel1503) + addi a0, a1, %pcrel_lo(pcrel1491) fsw f8, 8(sp) fsw f9, 12(sp) fsw f18, 16(sp) @@ -52,18 +52,18 @@ pcrel1503: jal getfarray li a0, 156 jal _sysy_starttime -pcrel1504: - auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) -pcrel1505: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) li s1, 270 li s0, 512 - addi a1, a3, %pcrel_lo(pcrel1504) - addi a0, a2, 
%pcrel_lo(pcrel1505) - flw f8, 12(a1) -pcrel1506: +pcrel1492: + auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) +pcrel1493: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) + addi a1, a3, %pcrel_lo(pcrel1492) + addi a0, a2, %pcrel_lo(pcrel1493) +pcrel1494: auipc a3, %pcrel_hi(cmmc_parallel_body_1) - fsw f8, %pcrel_lo(pcrel1505)(a2) + flw f8, 12(a1) + fsw f8, %pcrel_lo(pcrel1493)(a2) lui a2, 260096 flw f9, 16(a1) fmv.w.x f18, a2 @@ -71,7 +71,7 @@ pcrel1506: fsw f9, 4(a0) slli s2, a2, 41 fsw f18, 8(a0) - addi a2, a3, %pcrel_lo(pcrel1506) + addi a2, a3, %pcrel_lo(pcrel1494) flw f19, 8(a1) ori a1, s2, 270 fsw f19, 12(a0) @@ -81,30 +81,30 @@ pcrel1506: jal cmmcParallelFor mv a1, s0 mv a0, zero -pcrel1507: +pcrel1495: auipc a3, %pcrel_hi(cmmc_parallel_body_4) - addi a2, a3, %pcrel_lo(pcrel1507) + addi a2, a3, %pcrel_lo(pcrel1495) jal cmmcParallelFor -pcrel1508: +pcrel1496: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel1509: +pcrel1497: auipc a3, %pcrel_hi(cmmc_parallel_body_2) ori a1, s2, 270 - addi a2, a3, %pcrel_lo(pcrel1509) - sd a1, %pcrel_lo(pcrel1508)(a0) + addi a2, a3, %pcrel_lo(pcrel1497) + sd a1, %pcrel_lo(pcrel1496)(a0) mv a1, s0 mv a0, zero jal cmmcParallelFor -pcrel1510: +pcrel1498: auipc a1, %pcrel_hi(cmmc_parallel_body_payload_0) li a2, 270 -pcrel1511: +pcrel1499: auipc a4, %pcrel_hi(cmmc_parallel_body_0) - addi a0, a1, %pcrel_lo(pcrel1510) - fsw f8, %pcrel_lo(pcrel1510)(a1) + addi a0, a1, %pcrel_lo(pcrel1498) + fsw f8, %pcrel_lo(pcrel1498)(a1) slli a1, a2, 32 fsw f9, 4(a0) - addi a2, a4, %pcrel_lo(pcrel1511) + addi a2, a4, %pcrel_lo(pcrel1499) addi a3, a1, 512 fsw f18, 8(a0) fsw f19, 12(a0) @@ -112,26 +112,26 @@ pcrel1511: mv a1, s1 mv a0, zero jal cmmcParallelFor - mv a1, s1 mv a0, zero -pcrel1512: + mv a1, s1 +pcrel1500: auipc a3, %pcrel_hi(cmmc_parallel_body_5) - addi a2, a3, %pcrel_lo(pcrel1512) + addi a2, a3, %pcrel_lo(pcrel1500) jal cmmcParallelFor -pcrel1513: - auipc a3, %pcrel_hi(cmmc_parallel_body_3) ori a1, s2, 270 -pcrel1514: +pcrel1501: + auipc a3, %pcrel_hi(cmmc_parallel_body_3) +pcrel1502: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - addi a2, a3, %pcrel_lo(pcrel1513) - sd a1, %pcrel_lo(pcrel1514)(a0) + addi a2, a3, %pcrel_lo(pcrel1501) + sd a1, %pcrel_lo(pcrel1502)(a0) mv a1, s0 mv a0, zero jal cmmcParallelFor li a0, 158 -pcrel1515: +pcrel1503: auipc a1, %pcrel_hi(imgOut) - addi s0, a1, %pcrel_lo(pcrel1515) + addi s0, a1, %pcrel_lo(pcrel1503) jal _sysy_stoptime mv a1, s0 li a2, 135 @@ -152,17 +152,17 @@ pcrel1515: cmmc_parallel_body_0: mv a5, a0 mv a4, a1 -pcrel216: +pcrel213: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel217: +pcrel214: auipc t0, %pcrel_hi(my_y1) -pcrel218: +pcrel215: auipc t1, %pcrel_hi(imgOut) li t4, 3 - flw f10, %pcrel_lo(pcrel216)(a2) - addi a0, a2, %pcrel_lo(pcrel216) - addi a1, t0, %pcrel_lo(pcrel217) - addi a2, t1, %pcrel_lo(pcrel218) + flw f10, %pcrel_lo(pcrel213)(a2) + addi a0, a2, %pcrel_lo(pcrel213) + addi a1, t0, %pcrel_lo(pcrel214) + addi a2, t1, %pcrel_lo(pcrel215) flw f11, 4(a0) flw f12, 8(a0) flw f13, 12(a0) @@ -281,7 +281,7 @@ label4: fadd.s f2, f4, f7 fsw f2, 0(a6) bgt t2, t5, label10 - ble a3, t5, label207 + ble a3, t5, label204 mulw t6, t5, a0 fmv.s f1, f14 add t4, a2, t6 @@ -310,31 +310,31 @@ label13: label32: ret .p2align 2 -label207: +label204: addiw a5, a5, 1 bgt a4, a5, label2 j label32 .p2align 2 cmmc_parallel_body_1: mv t0, a0 -pcrel418: +pcrel415: auipc a3, %pcrel_hi(cmmc_parallel_body_payload_1) -pcrel419: +pcrel416: auipc t2, %pcrel_hi(imgIn) li t1, 3 - flw f10, %pcrel_lo(pcrel418)(a3) - addi 
a2, a3, %pcrel_lo(pcrel418) -pcrel420: + flw f10, %pcrel_lo(pcrel415)(a3) + addi a2, a3, %pcrel_lo(pcrel415) +pcrel417: auipc a3, %pcrel_hi(my_y1) flw f11, 4(a2) - addi a5, a3, %pcrel_lo(pcrel420) + addi a5, a3, %pcrel_lo(pcrel417) flw f12, 8(a2) li a3, 1080 flw f13, 12(a2) lw a0, 16(a2) - addi a2, t2, %pcrel_lo(pcrel419) + addi a2, t2, %pcrel_lo(pcrel416) addi a4, a0, -3 - ble a0, t1, label243 + ble a0, t1, label240 mulw t1, t0, a3 add a5, a5, t1 mulw t3, t0, a3 @@ -344,13 +344,13 @@ pcrel420: fmv.s f14, f15 fmv.s f1, f15 mv t3, zero - j label224 + j label221 .p2align 2 -label355: +label352: addiw t0, t0, 1 - ble a1, t0, label242 + ble a1, t0, label239 .p2align 2 -label233: +label230: addi a5, a5, 1080 mulw t3, t0, a3 fmv.w.x f15, zero @@ -360,7 +360,7 @@ label233: fmv.s f1, f15 mv t3, zero .p2align 2 -label224: +label221: sh2add t4, t3, t2 fmul.s f6, f11, f15 addiw t3, t3, 4 @@ -399,19 +399,19 @@ label224: fadd.s f3, f6, f1 fadd.s f2, f3, f7 fsw f2, 12(t1) - ble a4, t3, label327 + ble a4, t3, label324 addi t1, t1, 16 fmv.s f1, f14 fmv.s f14, f2 - j label224 + j label221 .p2align 2 -label327: - ble a0, t3, label408 +label324: + ble a0, t3, label405 sh2add t1, t3, t2 fmv.s f1, f15 fmv.s f0, f14 .p2align 2 -label235: +label232: flw f15, 0(t1) fmul.s f5, f11, f1 fmul.s f4, f12, f2 @@ -423,17 +423,17 @@ label235: fadd.s f1, f3, f4 fadd.s f14, f1, f5 fsw f14, 0(t2) - ble a0, t3, label355 + ble a0, t3, label352 addi t1, t1, 4 fmv.s f1, f15 fmv.s f0, f2 fmv.s f2, f14 - j label235 -label243: - bgt a0, zero, label244 -label242: + j label232 +label240: + bgt a0, zero, label241 +label239: ret -label244: +label241: mulw t1, t0, a3 add a4, a5, t1 mv a5, t0 @@ -442,15 +442,15 @@ label244: fmv.s f14, f1 fmv.s f2, f1 mv t1, zero - j label248 + j label245 .p2align 2 -label254: +label251: addi t0, t0, 4 fmv.s f2, f15 fmv.s f1, f14 fmv.s f14, f0 .p2align 2 -label248: +label245: flw f15, 0(t0) fmul.s f5, f11, f2 fmul.s f4, f12, f14 @@ -462,9 +462,9 @@ label248: fadd.s f2, f3, f4 fadd.s f0, f2, f5 fsw f0, 0(t2) - bgt a0, t1, label254 + bgt a0, t1, label251 addiw a5, a5, 1 - ble a1, a5, label242 + ble a1, a5, label239 addi a4, a4, 1080 mulw t1, a5, a3 fmv.w.x f1, zero @@ -472,31 +472,31 @@ label248: fmv.s f14, f1 fmv.s f2, f1 mv t1, zero - j label248 + j label245 .p2align 2 -label408: +label405: addiw t0, t0, 1 - bgt a1, t0, label233 - j label242 + bgt a1, t0, label230 + j label239 .p2align 2 cmmc_parallel_body_2: mv t1, a0 -pcrel586: +pcrel583: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel587: +pcrel584: auipc a3, %pcrel_hi(imgOut) -pcrel588: +pcrel585: auipc a4, %pcrel_hi(my_y1) -pcrel589: +pcrel586: auipc t3, %pcrel_hi(my_y2) li t2, 3 - lw a0, %pcrel_lo(pcrel586)(a2) - addi t0, a3, %pcrel_lo(pcrel587) - addi a2, a4, %pcrel_lo(pcrel588) + lw a0, %pcrel_lo(pcrel583)(a2) + addi t0, a3, %pcrel_lo(pcrel584) + addi a2, a4, %pcrel_lo(pcrel585) addi a5, a0, -3 - addi a3, t3, %pcrel_lo(pcrel589) + addi a3, t3, %pcrel_lo(pcrel586) li a4, 1080 - ble a0, t2, label439 + ble a0, t2, label436 mulw t2, t1, a4 add t0, t0, t2 mulw t6, t1, a4 @@ -504,12 +504,12 @@ pcrel589: mv t5, zero add t2, a3, t6 add t4, a2, t6 - j label426 + j label423 .p2align 2 -label429: +label426: addi t3, t3, 16 .p2align 2 -label426: +label423: sh2add t6, t5, t4 sh2add a6, t5, t2 flw f10, 0(t6) @@ -529,15 +529,15 @@ label426: flw f11, 12(a6) fadd.s f12, f10, f11 fsw f12, 12(t3) - bgt a5, t5, label429 - ble a0, t5, label575 + bgt a5, t5, label426 + ble a0, t5, label572 sh2add t3, t5, t4 - j label432 + j label429 .p2align 2 -label435: 
+label432: addi t3, t3, 4 .p2align 2 -label432: +label429: sh2add t4, t5, t2 flw f10, 0(t3) sh2add t6, t5, t0 @@ -545,23 +545,23 @@ label432: addiw t5, t5, 1 fadd.s f11, f10, f12 fsw f11, 0(t6) - bgt a0, t5, label435 + bgt a0, t5, label432 addiw t1, t1, 1 - ble a1, t1, label438 + ble a1, t1, label435 .p2align 2 -label437: +label434: addi t0, t0, 1080 mulw t6, t1, a4 mv t5, zero mv t3, t0 add t2, a3, t6 add t4, a2, t6 - j label426 -label439: - bgt a0, zero, label440 -label438: + j label423 +label436: + bgt a0, zero, label437 +label435: ret -label440: +label437: mulw t2, t1, a4 add a5, t0, t2 mv t0, t1 @@ -569,12 +569,12 @@ label440: mv t3, zero add t2, a3, t4 add t1, a2, t4 - j label444 + j label441 .p2align 2 -label447: +label444: addi t1, t1, 4 .p2align 2 -label444: +label441: sh2add t4, t3, t2 flw f10, 0(t1) sh2add t5, t3, a5 @@ -582,11 +582,11 @@ label444: addiw t3, t3, 1 fadd.s f11, f10, f12 fsw f11, 0(t5) - bgt a0, t3, label447 + bgt a0, t3, label444 addiw t0, t0, 1 - ble a1, t0, label438 + ble a1, t0, label435 .p2align 2 -label449: +label446: addi a5, a5, 1080 mulw t4, t0, a4 li t3, 1 @@ -598,37 +598,37 @@ label449: flw f12, 0(t2) fadd.s f11, f10, f12 fsw f11, 0(a5) - bgt a0, t3, label447 + bgt a0, t3, label444 addiw t0, t0, 1 - bgt a1, t0, label449 - j label438 + bgt a1, t0, label446 + j label435 .p2align 2 -label575: +label572: addiw t1, t1, 1 - bgt a1, t1, label437 - j label438 + bgt a1, t1, label434 + j label435 .p2align 2 cmmc_parallel_body_3: mv t1, a0 -pcrel754: +pcrel751: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel755: +pcrel752: auipc a3, %pcrel_hi(imgOut) -pcrel756: +pcrel753: auipc a4, %pcrel_hi(my_y1) -pcrel757: +pcrel754: auipc t3, %pcrel_hi(my_y2) li t2, 3 - lw a0, %pcrel_lo(pcrel754)(a2) - addi t0, a3, %pcrel_lo(pcrel755) - addi a2, a4, %pcrel_lo(pcrel756) + lw a0, %pcrel_lo(pcrel751)(a2) + addi t0, a3, %pcrel_lo(pcrel752) + addi a2, a4, %pcrel_lo(pcrel753) addi a5, a0, -3 - addi a3, t3, %pcrel_lo(pcrel757) + addi a3, t3, %pcrel_lo(pcrel754) li a4, 1080 - bgt a0, t2, label591 - bgt a0, zero, label609 - j label607 -label591: + bgt a0, t2, label588 + bgt a0, zero, label606 + j label604 +label588: mulw t2, t1, a4 add t0, t0, t2 mulw t6, t1, a4 @@ -636,12 +636,12 @@ label591: mv t5, zero add t2, a3, t6 add t4, a2, t6 - j label595 + j label592 .p2align 2 -label598: +label595: addi t3, t3, 16 .p2align 2 -label595: +label592: sh2add t6, t5, t4 sh2add a6, t5, t2 flw f10, 0(t6) @@ -661,11 +661,11 @@ label595: flw f13, 12(a6) fadd.s f10, f11, f13 fsw f10, 12(t3) - bgt a5, t5, label598 - ble a0, t5, label744 + bgt a5, t5, label595 + ble a0, t5, label741 sh2add t3, t5, t4 .p2align 2 -label603: +label600: sh2add t4, t5, t2 flw f11, 0(t3) sh2add t6, t5, t0 @@ -673,25 +673,25 @@ label603: addiw t5, t5, 1 fadd.s f10, f11, f12 fsw f10, 0(t6) - ble a0, t5, label694 + ble a0, t5, label691 addi t3, t3, 4 - j label603 + j label600 .p2align 2 -label694: +label691: addiw t1, t1, 1 - ble a1, t1, label607 + ble a1, t1, label604 .p2align 2 -label601: +label598: addi t0, t0, 1080 mulw t6, t1, a4 mv t5, zero mv t3, t0 add t2, a3, t6 add t4, a2, t6 - j label595 -label607: + j label592 +label604: ret -label609: +label606: mulw t2, t1, a4 add a5, t0, t2 mv t0, t1 @@ -699,12 +699,12 @@ label609: mv t3, zero add t2, a3, t4 add t1, a2, t4 - j label613 + j label610 .p2align 2 -label616: +label613: addi t1, t1, 4 .p2align 2 -label613: +label610: sh2add t4, t3, t2 flw f11, 0(t1) sh2add t5, t3, a5 @@ -712,11 +712,11 @@ label613: addiw t3, t3, 1 fadd.s f10, f11, f12 fsw f10, 0(t5) - bgt a0, t3, 
label616 + bgt a0, t3, label613 addiw t0, t0, 1 - ble a1, t0, label607 + ble a1, t0, label604 .p2align 2 -label618: +label615: addi a5, a5, 1080 mulw t4, t0, a4 li t3, 1 @@ -728,32 +728,32 @@ label618: flw f12, 0(t2) fadd.s f10, f11, f12 fsw f10, 0(a5) - bgt a0, t3, label616 + bgt a0, t3, label613 addiw t0, t0, 1 - bgt a1, t0, label618 - j label607 + bgt a1, t0, label615 + j label604 .p2align 2 -label744: +label741: addiw t1, t1, 1 - bgt a1, t1, label601 - j label607 + bgt a1, t1, label598 + j label604 .p2align 2 cmmc_parallel_body_4: mv a3, a1 -pcrel1096: +pcrel1093: auipc a4, %pcrel_hi(my_y2) li a5, 1080 -pcrel1097: +pcrel1094: auipc t2, %pcrel_hi(imgIn) mv t1, a0 - addi a2, a4, %pcrel_lo(pcrel1096) + addi a2, a4, %pcrel_lo(pcrel1093) mulw a1, a0, a5 - addi t0, t2, %pcrel_lo(pcrel1097) + addi t0, t2, %pcrel_lo(pcrel1094) add a4, a2, a1 -pcrel1098: +pcrel1095: auipc t2, %pcrel_hi(__cmmc_fp_constant_pool) li a2, 13 - addi a1, t2, %pcrel_lo(pcrel1098) + addi a1, t2, %pcrel_lo(pcrel1095) mulw t3, a0, a5 addi a0, a4, 1076 fmv.w.x f13, zero @@ -762,9 +762,9 @@ pcrel1098: fmv.s f12, f13 fmv.s f15, f13 li t3, 269 - j label762 + j label759 .p2align 2 -label770: +label767: flw f11, 0(a1) addiw t1, t1, 1 flw f12, 4(a1) @@ -880,7 +880,7 @@ label770: fadd.s f11, f2, f0 fadd.s f13, f11, f12 fsw f13, -116(a0) - ble a3, t1, label772 + ble a3, t1, label769 addi a4, a4, 1080 mulw t3, t1, a5 fmv.w.x f13, zero @@ -891,7 +891,7 @@ label770: fmv.s f15, f13 li t3, 269 .p2align 2 -label762: +label759: flw f10, 0(a1) sh2add t4, t3, t2 flw f11, 4(a1) @@ -1025,277 +1025,267 @@ label762: fadd.s f10, f2, f11 fsw f10, -60(a0) flw f15, -60(t4) - ble t3, a2, label770 + ble t3, a2, label767 addi a0, a0, -64 fmv.s f12, f14 fmv.s f14, f10 - j label762 -label772: + j label759 +label769: ret .p2align 2 cmmc_parallel_body_5: - addi sp, sp, -56 - mv a6, a1 -pcrel1408: + addi sp, sp, -16 + mv t3, a1 +pcrel1396: auipc a2, %pcrel_hi(my_y2) -pcrel1409: +pcrel1397: auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) - li t6, 1080 -pcrel1410: + li t2, 1080 +pcrel1398: auipc a1, %pcrel_hi(imgOut) - addi a7, a2, %pcrel_lo(pcrel1408) - addi t5, a1, %pcrel_lo(pcrel1410) - sd s1, 0(sp) - addi a2, a3, %pcrel_lo(pcrel1409) + addi t4, a2, %pcrel_lo(pcrel1396) + addi t1, a1, %pcrel_lo(pcrel1398) + sd s0, 0(sp) + addi a2, a3, %pcrel_lo(pcrel1397) li a1, -135 - sd s6, 8(sp) + sd s1, 8(sp) slli a3, a1, 4 - sd s0, 16(sp) addi a4, a3, -1080 - sd s5, 24(sp) addi a5, a4, -1080 - sd s3, 32(sp) addi t0, a5, -1080 - sd s2, 40(sp) - addi t1, t0, -1080 - sd s4, 48(sp) - addi t2, t1, -1080 - addi t3, t2, -1080 - addi t4, t3, -1080 - j label1100 + j label1097 .p2align 2 -label1339: +label1336: addiw a0, a0, 1 - ble a6, a0, label1111 + ble t3, a0, label1108 .p2align 2 -label1100: - lui s1, 135 +label1097: + lui t6, 135 fmv.w.x f13, zero - addiw s0, s1, -1080 - fmv.s f15, f13 + addiw t5, t6, -1080 + fmv.s f14, f13 fmv.s f12, f13 - fmv.s f0, f13 - add a1, a7, s0 - li s0, 511 + fmv.s f15, f13 + add a1, t4, t5 + li t5, 511 .p2align 2 -label1102: +label1099: flw f10, 0(a2) - sh2add s3, a0, a1 - mulw s2, s0, t6 + sh2add a7, a0, a1 + mulw a6, t5, t2 flw f11, 4(a2) - fmul.s f14, f0, f10 - addiw s0, s0, -16 - add s1, t5, s2 + fmul.s f1, f15, f10 + addiw t5, t5, -16 + add t6, t1, a6 fmul.s f2, f12, f11 - fmul.s f4, f0, f11 - addi s4, s1, -1080 - addi s2, a1, -1080 + fmul.s f5, f15, f11 + addi a6, a1, -1080 flw f12, 8(a2) - fadd.s f3, f14, f2 - fmul.s f0, f15, f12 - fmul.s f2, f13, f12 - fadd.s f1, f3, f15 - fadd.s f14, f1, f2 - fsw f14, 0(s3) - sh2add s3, a0, s1 - flw f1, 
0(s3) - sh2add s3, a0, s2 - fmul.s f13, f1, f10 - sh2add s2, a0, s4 - add s4, s1, a3 - fadd.s f3, f13, f4 - fadd.s f2, f3, f14 - fmul.s f3, f1, f11 - fmul.s f1, f14, f12 - fadd.s f13, f2, f0 - fsw f13, 0(s3) - add s3, a1, a3 - flw f0, 0(s2) - sh2add s2, a0, s3 - fmul.s f15, f0, f10 - sh2add s3, a0, s4 - add s4, a1, a4 - fadd.s f4, f15, f3 - fmul.s f3, f0, f11 - fmul.s f0, f13, f12 - fadd.s f2, f4, f13 - fadd.s f15, f2, f1 - fsw f15, 0(s2) - add s2, s1, a4 - flw f1, 0(s3) - sh2add s3, a0, s4 - fmul.s f14, f1, f10 - sh2add s4, a0, s2 - add s2, a1, a5 - fadd.s f4, f14, f3 - fmul.s f3, f1, f11 - fmul.s f1, f15, f12 - fadd.s f2, f4, f15 - fadd.s f14, f2, f0 - fsw f14, 0(s3) - sh2add s3, a0, s2 - flw f0, 0(s4) - add s4, s1, a5 - fmul.s f13, f0, f10 - sh2add s2, a0, s4 - add s4, s1, t0 - fadd.s f4, f13, f3 - fadd.s f2, f4, f14 - fmul.s f4, f0, f11 - fmul.s f0, f14, f12 - fadd.s f13, f2, f1 - fsw f13, 0(s3) - add s3, a1, t0 - flw f1, 0(s2) - sh2add s2, a0, s3 - fmul.s f15, f1, f10 - sh2add s3, a0, s4 - add s4, a1, t1 - fadd.s f3, f15, f4 - fadd.s f2, f3, f13 + fadd.s f3, f1, f2 + fmul.s f1, f13, f12 + fadd.s f0, f3, f14 + fmul.s f3, f14, f12 + fadd.s f13, f0, f1 + fsw f13, 0(a7) + sh2add a7, a0, t6 + flw f1, 0(a7) + sh2add a7, a0, a6 + fmul.s f2, f1, f10 + addi a6, t6, -1080 + fadd.s f4, f2, f5 + fadd.s f0, f4, f13 + fadd.s f15, f0, f3 fmul.s f3, f1, f11 fmul.s f1, f13, f12 - fadd.s f15, f2, f0 - fsw f15, 0(s2) - sh2add s2, a0, s4 - flw f0, 0(s3) - add s3, s1, t1 + fsw f15, 0(a7) + sh2add a7, a0, a6 + flw f0, 0(a7) + add a7, a1, a3 fmul.s f14, f0, f10 - sh2add s4, a0, s3 - add s3, s1, t2 + sh2add a6, a0, a7 + add a7, t6, a3 fadd.s f4, f14, f3 fmul.s f3, f0, f11 fmul.s f0, f15, f12 fadd.s f2, f4, f15 fadd.s f14, f2, f1 - fsw f14, 0(s2) - flw f1, 0(s4) - add s4, a1, t2 + fsw f14, 0(a6) + sh2add a6, a0, a7 + flw f1, 0(a6) + add a6, a1, a4 fmul.s f13, f1, f10 - sh2add s2, a0, s4 - sh2add s4, a0, s3 - add s3, a1, t3 + sh2add a7, a0, a6 + add a6, t6, a4 fadd.s f4, f13, f3 - fadd.s f2, f4, f14 - fmul.s f4, f1, f11 + fmul.s f3, f1, f11 fmul.s f1, f14, f12 + fadd.s f2, f4, f14 fadd.s f13, f2, f0 - fsw f13, 0(s2) - sh2add s2, a0, s3 - flw f0, 0(s4) - add s4, s1, t3 + fsw f13, 0(a7) + sh2add a7, a0, a6 + add a6, a1, a5 + flw f0, 0(a7) + sh2add a7, a0, a6 fmul.s f15, f0, f10 - sh2add s3, a0, s4 - add s4, s1, t4 - fadd.s f3, f15, f4 + add a6, t6, a5 + fadd.s f4, f15, f3 + fadd.s f2, f4, f13 fmul.s f4, f0, f11 fmul.s f0, f13, f12 - fadd.s f2, f3, f13 fadd.s f15, f2, f1 - fsw f15, 0(s2) - flw f1, 0(s3) - add s3, a1, t4 + fsw f15, 0(a7) + sh2add a7, a0, a6 + add a6, a1, t0 + flw f1, 0(a7) + sh2add s0, a0, a6 + add a7, t6, t0 fmul.s f14, f1, f10 - sh2add s2, a0, s3 - sh2add s3, a0, s4 - li s4, -675 + sh2add a6, a0, a7 + li a7, -405 fadd.s f3, f14, f4 fadd.s f2, f3, f15 fmul.s f3, f1, f11 fmul.s f1, f15, f12 fadd.s f14, f2, f0 - fmul.s f5, f14, f12 - fsw f14, 0(s2) - slli s2, s4, 4 - flw f0, 0(s3) - add s4, s1, s2 - add s3, a1, s2 + fsw f14, 0(s0) + flw f0, 0(a6) + slli a6, a7, 4 fmul.s f13, f0, f10 - sh2add s5, a0, s3 - sh2add s3, a0, s4 + add s0, t6, a6 + add s1, a1, a6 + sh2add a7, a0, s1 fadd.s f4, f13, f3 + fmul.s f3, f0, f11 + fmul.s f0, f14, f12 fadd.s f2, f4, f14 - fmul.s f4, f0, f11 fadd.s f13, f2, f1 - fsw f13, 0(s5) - flw f15, 0(s3) - addi s3, s2, -1080 - fmul.s f1, f15, f10 - add s4, s1, s3 - add s5, a1, s3 - sh2add s2, a0, s4 - sh2add s6, a0, s5 - fadd.s f3, f1, f4 - fmul.s f4, f15, f11 - fadd.s f2, f3, f13 - fadd.s f0, f2, f5 - fmul.s f5, f13, f12 - fsw f0, 0(s6) - flw f14, 0(s2) - addi s2, s3, -1080 
+ fsw f13, 0(a7) + sh2add a7, a0, s0 + flw f1, 0(a7) + addi a7, a6, -1080 + fmul.s f15, f1, f10 + add s0, t6, a7 + add s1, a1, a7 + sh2add a6, a0, s1 + fadd.s f4, f15, f3 + fadd.s f2, f4, f13 + fmul.s f4, f1, f11 + fmul.s f1, f13, f12 + fadd.s f15, f2, f0 + fsw f15, 0(a6) + sh2add a6, a0, s0 + flw f0, 0(a6) + addi a6, a7, -1080 + fmul.s f14, f0, f10 + add s0, t6, a6 + add s1, a1, a6 + fmul.s f5, f0, f11 + sh2add a7, a0, s1 + fadd.s f3, f14, f4 + fadd.s f2, f3, f15 + fmul.s f3, f15, f12 + fadd.s f14, f2, f1 + fsw f14, 0(a7) + sh2add a7, a0, s0 + flw f1, 0(a7) + addi a7, a6, -1080 + fmul.s f2, f1, f10 + add s1, t6, a7 + add s0, a1, a7 + sh2add a6, a0, s0 + fadd.s f4, f2, f5 + fadd.s f13, f4, f14 + fadd.s f0, f13, f3 + fmul.s f3, f1, f11 + fmul.s f1, f14, f12 + fsw f0, 0(a6) + fmul.s f5, f0, f12 + sh2add a6, a0, s1 + flw f13, 0(a6) + addi a6, a7, -1080 + fmul.s f15, f13, f10 + add a7, t6, a6 + add s0, a1, a6 + sh2add s1, a0, s0 + sh2add s0, a0, a7 + addi a7, a6, -1080 + fadd.s f4, f15, f3 + fadd.s f2, f4, f0 + fmul.s f4, f13, f11 + fadd.s f15, f2, f1 + fsw f15, 0(s1) + add s1, t6, a7 + flw f14, 0(s0) + add s0, a1, a7 fmul.s f1, f14, f10 - add s4, s1, s2 - add s5, a1, s2 - sh2add s3, a0, s4 - sh2add s6, a0, s5 + sh2add a6, a0, s0 fadd.s f3, f1, f4 - fadd.s f2, f3, f0 - fmul.s f3, f0, f12 - fadd.s f15, f2, f5 - fmul.s f5, f14, f11 - fsw f15, 0(s6) - flw f13, 0(s3) - addi s3, s2, -1080 - fmul.s f2, f13, f10 - add s6, s1, s3 - add s4, a1, s3 - sh2add s2, a0, s6 - sh2add s5, a0, s4 - fadd.s f4, f2, f5 - fmul.s f5, f13, f11 - fadd.s f1, f4, f15 - fmul.s f4, f15, f12 - fadd.s f14, f1, f3 - fsw f14, 0(s5) - flw f0, 0(s2) - addi s2, s3, -1080 + fmul.s f4, f14, f11 + fadd.s f2, f3, f15 + fadd.s f13, f2, f5 + fmul.s f5, f15, f12 + fsw f13, 0(a6) + sh2add a6, a0, s1 + flw f0, 0(a6) + addi a6, a7, -1080 fmul.s f1, f0, f10 - add s3, s1, s2 - add s4, a1, s2 - sh2add s5, a0, s4 - sh2add s4, a0, s3 - addi s3, s2, -1080 - fadd.s f3, f1, f5 - add s6, s1, s3 - sh2add s2, a0, s6 + add s0, t6, a6 + add s1, a1, a6 + sh2add a7, a0, s1 + fadd.s f3, f1, f4 + fmul.s f4, f0, f11 + fmul.s f0, f13, f12 + fadd.s f2, f3, f13 + fadd.s f14, f2, f5 + fsw f14, 0(a7) + sh2add a7, a0, s0 + flw f1, 0(a7) + addi a7, a6, -1080 + fmul.s f15, f1, f10 + add a6, t6, a7 + add s0, a1, a7 + sh2add s1, a0, s0 + sh2add s0, a0, a6 + addi a6, a7, -1080 + fadd.s f3, f15, f4 + fmul.s f4, f1, f11 + fmul.s f1, f14, f12 fadd.s f2, f3, f14 - fadd.s f13, f2, f4 + fadd.s f15, f2, f0 + fsw f15, 0(s1) + add s1, a1, a6 + flw f0, 0(s0) + sh2add a7, a0, s1 + add s0, t6, a6 + fmul.s f13, f0, f10 + fadd.s f3, f13, f4 fmul.s f4, f0, f11 - fmul.s f11, f14, f12 - fsw f13, 0(s5) - add s5, a1, s3 - flw f15, 0(s4) - sh2add s4, a0, s5 - fmul.s f1, f15, f10 + fmul.s f11, f15, f12 + fadd.s f2, f3, f15 + fadd.s f13, f2, f1 + fsw f13, 0(a7) + sh2add a7, a0, s0 + flw f14, 0(a7) + addi a7, a6, -1080 + fmul.s f1, f14, f10 + add a6, t6, a7 + add s1, a1, a7 + sh2add s0, a0, s1 + sh2add s1, a0, a6 fadd.s f3, f1, f4 fadd.s f2, f3, f13 fadd.s f10, f2, f11 - fsw f10, 0(s4) - flw f0, 0(s2) - blt s0, zero, label1339 - li s2, -135 - fmv.s f12, f15 - slli s1, s2, 7 - fmv.s f15, f10 - add a1, a1, s1 - j label1102 -label1111: - ld s1, 0(sp) - ld s6, 8(sp) - ld s0, 16(sp) - ld s5, 24(sp) - ld s3, 32(sp) - ld s2, 40(sp) - ld s4, 48(sp) - addi sp, sp, 56 + fsw f10, 0(s0) + flw f15, 0(s1) + blt t5, zero, label1336 + li a6, -135 + fmv.s f12, f14 + slli t6, a6, 7 + fmv.s f14, f10 + add a1, a1, t6 + j label1099 +label1108: + ld s0, 0(sp) + ld s1, 8(sp) + addi sp, sp, 16 ret diff --git 
a/tests/SysY2022/performance/derich2.arm.s b/tests/SysY2022/performance/derich2.arm.s index 849a39e5c..ea495efac 100644 --- a/tests/SysY2022/performance/derich2.arm.s +++ b/tests/SysY2022/performance/derich2.arm.s @@ -1,28 +1,28 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 imgIn: .zero 552960 -.align 8 +.p2align 3 imgOut: .zero 552960 -.align 8 +.p2align 3 my_y1: .zero 552960 -.align 8 +.p2align 3 my_y2: .zero 552960 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 8 .text diff --git a/tests/SysY2022/performance/derich2.riscv.s b/tests/SysY2022/performance/derich2.riscv.s index 85bd1315b..66ea25543 100644 --- a/tests/SysY2022/performance/derich2.riscv.s +++ b/tests/SysY2022/performance/derich2.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 3191654481 .4byte 1038821134 @@ -9,28 +9,28 @@ __cmmc_fp_constant_pool: .4byte 3191992809 .4byte 1038256634 .bss -.align 8 +.p2align 3 imgIn: .zero 552960 -.align 8 +.p2align 3 imgOut: .zero 552960 -.align 8 +.p2align 3 my_y1: .zero 552960 -.align 8 +.p2align 3 my_y2: .zero 552960 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 8 .text @@ -38,10 +38,10 @@ cmmc_parallel_body_payload_3: .globl main main: addi sp, sp, -48 -pcrel1503: +pcrel1491: auipc a1, %pcrel_hi(imgIn) sd ra, 0(sp) - addi a0, a1, %pcrel_lo(pcrel1503) + addi a0, a1, %pcrel_lo(pcrel1491) fsw f8, 8(sp) fsw f9, 12(sp) fsw f18, 16(sp) @@ -52,18 +52,18 @@ pcrel1503: jal getfarray li a0, 156 jal _sysy_starttime -pcrel1504: - auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) -pcrel1505: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) li s1, 270 li s0, 512 - addi a1, a3, %pcrel_lo(pcrel1504) - addi a0, a2, %pcrel_lo(pcrel1505) - flw f8, 12(a1) -pcrel1506: +pcrel1492: + auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) +pcrel1493: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) + addi a1, a3, %pcrel_lo(pcrel1492) + addi a0, a2, %pcrel_lo(pcrel1493) +pcrel1494: auipc a3, %pcrel_hi(cmmc_parallel_body_1) - fsw f8, %pcrel_lo(pcrel1505)(a2) + flw f8, 12(a1) + fsw f8, %pcrel_lo(pcrel1493)(a2) lui a2, 260096 flw f9, 16(a1) fmv.w.x f18, a2 @@ -71,7 +71,7 @@ pcrel1506: fsw f9, 4(a0) slli s2, a2, 41 fsw f18, 8(a0) - addi a2, a3, %pcrel_lo(pcrel1506) + addi a2, a3, %pcrel_lo(pcrel1494) flw f19, 8(a1) ori a1, s2, 270 fsw f19, 12(a0) @@ -81,30 +81,30 @@ pcrel1506: jal cmmcParallelFor mv a1, s0 mv a0, zero -pcrel1507: +pcrel1495: auipc a3, %pcrel_hi(cmmc_parallel_body_4) - addi a2, a3, %pcrel_lo(pcrel1507) + addi a2, a3, %pcrel_lo(pcrel1495) jal cmmcParallelFor -pcrel1508: +pcrel1496: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel1509: +pcrel1497: auipc a3, %pcrel_hi(cmmc_parallel_body_2) ori a1, s2, 270 - addi a2, a3, %pcrel_lo(pcrel1509) - sd a1, %pcrel_lo(pcrel1508)(a0) + addi a2, a3, %pcrel_lo(pcrel1497) + sd a1, %pcrel_lo(pcrel1496)(a0) mv a1, s0 mv a0, zero jal cmmcParallelFor -pcrel1510: +pcrel1498: auipc a1, %pcrel_hi(cmmc_parallel_body_payload_0) li a2, 270 -pcrel1511: +pcrel1499: auipc a4, %pcrel_hi(cmmc_parallel_body_0) - addi a0, a1, %pcrel_lo(pcrel1510) - fsw f8, 
%pcrel_lo(pcrel1510)(a1) + addi a0, a1, %pcrel_lo(pcrel1498) + fsw f8, %pcrel_lo(pcrel1498)(a1) slli a1, a2, 32 fsw f9, 4(a0) - addi a2, a4, %pcrel_lo(pcrel1511) + addi a2, a4, %pcrel_lo(pcrel1499) addi a3, a1, 512 fsw f18, 8(a0) fsw f19, 12(a0) @@ -112,26 +112,26 @@ pcrel1511: mv a1, s1 mv a0, zero jal cmmcParallelFor - mv a1, s1 mv a0, zero -pcrel1512: + mv a1, s1 +pcrel1500: auipc a3, %pcrel_hi(cmmc_parallel_body_5) - addi a2, a3, %pcrel_lo(pcrel1512) + addi a2, a3, %pcrel_lo(pcrel1500) jal cmmcParallelFor -pcrel1513: - auipc a3, %pcrel_hi(cmmc_parallel_body_3) ori a1, s2, 270 -pcrel1514: +pcrel1501: + auipc a3, %pcrel_hi(cmmc_parallel_body_3) +pcrel1502: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - addi a2, a3, %pcrel_lo(pcrel1513) - sd a1, %pcrel_lo(pcrel1514)(a0) + addi a2, a3, %pcrel_lo(pcrel1501) + sd a1, %pcrel_lo(pcrel1502)(a0) mv a1, s0 mv a0, zero jal cmmcParallelFor li a0, 158 -pcrel1515: +pcrel1503: auipc a1, %pcrel_hi(imgOut) - addi s0, a1, %pcrel_lo(pcrel1515) + addi s0, a1, %pcrel_lo(pcrel1503) jal _sysy_stoptime mv a1, s0 li a2, 135 @@ -152,17 +152,17 @@ pcrel1515: cmmc_parallel_body_0: mv a5, a0 mv a4, a1 -pcrel216: +pcrel213: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel217: +pcrel214: auipc t0, %pcrel_hi(my_y1) -pcrel218: +pcrel215: auipc t1, %pcrel_hi(imgOut) li t4, 3 - flw f10, %pcrel_lo(pcrel216)(a2) - addi a0, a2, %pcrel_lo(pcrel216) - addi a1, t0, %pcrel_lo(pcrel217) - addi a2, t1, %pcrel_lo(pcrel218) + flw f10, %pcrel_lo(pcrel213)(a2) + addi a0, a2, %pcrel_lo(pcrel213) + addi a1, t0, %pcrel_lo(pcrel214) + addi a2, t1, %pcrel_lo(pcrel215) flw f11, 4(a0) flw f12, 8(a0) flw f13, 12(a0) @@ -281,7 +281,7 @@ label4: fadd.s f2, f4, f7 fsw f2, 0(a6) bgt t2, t5, label10 - ble a3, t5, label207 + ble a3, t5, label204 mulw t6, t5, a0 fmv.s f1, f14 add t4, a2, t6 @@ -310,31 +310,31 @@ label13: label32: ret .p2align 2 -label207: +label204: addiw a5, a5, 1 bgt a4, a5, label2 j label32 .p2align 2 cmmc_parallel_body_1: mv t0, a0 -pcrel418: +pcrel415: auipc a3, %pcrel_hi(cmmc_parallel_body_payload_1) -pcrel419: +pcrel416: auipc t2, %pcrel_hi(imgIn) li t1, 3 - flw f10, %pcrel_lo(pcrel418)(a3) - addi a2, a3, %pcrel_lo(pcrel418) -pcrel420: + flw f10, %pcrel_lo(pcrel415)(a3) + addi a2, a3, %pcrel_lo(pcrel415) +pcrel417: auipc a3, %pcrel_hi(my_y1) flw f11, 4(a2) - addi a5, a3, %pcrel_lo(pcrel420) + addi a5, a3, %pcrel_lo(pcrel417) flw f12, 8(a2) li a3, 1080 flw f13, 12(a2) lw a0, 16(a2) - addi a2, t2, %pcrel_lo(pcrel419) + addi a2, t2, %pcrel_lo(pcrel416) addi a4, a0, -3 - ble a0, t1, label243 + ble a0, t1, label240 mulw t1, t0, a3 add a5, a5, t1 mulw t3, t0, a3 @@ -344,13 +344,13 @@ pcrel420: fmv.s f14, f15 fmv.s f1, f15 mv t3, zero - j label224 + j label221 .p2align 2 -label355: +label352: addiw t0, t0, 1 - ble a1, t0, label242 + ble a1, t0, label239 .p2align 2 -label233: +label230: addi a5, a5, 1080 mulw t3, t0, a3 fmv.w.x f15, zero @@ -360,7 +360,7 @@ label233: fmv.s f1, f15 mv t3, zero .p2align 2 -label224: +label221: sh2add t4, t3, t2 fmul.s f6, f11, f15 addiw t3, t3, 4 @@ -399,19 +399,19 @@ label224: fadd.s f3, f6, f1 fadd.s f2, f3, f7 fsw f2, 12(t1) - ble a4, t3, label327 + ble a4, t3, label324 addi t1, t1, 16 fmv.s f1, f14 fmv.s f14, f2 - j label224 + j label221 .p2align 2 -label327: - ble a0, t3, label408 +label324: + ble a0, t3, label405 sh2add t1, t3, t2 fmv.s f1, f15 fmv.s f0, f14 .p2align 2 -label235: +label232: flw f15, 0(t1) fmul.s f5, f11, f1 fmul.s f4, f12, f2 @@ -423,17 +423,17 @@ label235: fadd.s f1, f3, f4 fadd.s f14, f1, f5 fsw f14, 0(t2) - ble 
a0, t3, label355 + ble a0, t3, label352 addi t1, t1, 4 fmv.s f1, f15 fmv.s f0, f2 fmv.s f2, f14 - j label235 -label243: - bgt a0, zero, label244 -label242: + j label232 +label240: + bgt a0, zero, label241 +label239: ret -label244: +label241: mulw t1, t0, a3 add a4, a5, t1 mv a5, t0 @@ -442,15 +442,15 @@ label244: fmv.s f14, f1 fmv.s f2, f1 mv t1, zero - j label248 + j label245 .p2align 2 -label254: +label251: addi t0, t0, 4 fmv.s f2, f15 fmv.s f1, f14 fmv.s f14, f0 .p2align 2 -label248: +label245: flw f15, 0(t0) fmul.s f5, f11, f2 fmul.s f4, f12, f14 @@ -462,9 +462,9 @@ label248: fadd.s f2, f3, f4 fadd.s f0, f2, f5 fsw f0, 0(t2) - bgt a0, t1, label254 + bgt a0, t1, label251 addiw a5, a5, 1 - ble a1, a5, label242 + ble a1, a5, label239 addi a4, a4, 1080 mulw t1, a5, a3 fmv.w.x f1, zero @@ -472,31 +472,31 @@ label248: fmv.s f14, f1 fmv.s f2, f1 mv t1, zero - j label248 + j label245 .p2align 2 -label408: +label405: addiw t0, t0, 1 - bgt a1, t0, label233 - j label242 + bgt a1, t0, label230 + j label239 .p2align 2 cmmc_parallel_body_2: mv t1, a0 -pcrel586: +pcrel583: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel587: +pcrel584: auipc a3, %pcrel_hi(imgOut) -pcrel588: +pcrel585: auipc a4, %pcrel_hi(my_y1) -pcrel589: +pcrel586: auipc t3, %pcrel_hi(my_y2) li t2, 3 - lw a0, %pcrel_lo(pcrel586)(a2) - addi t0, a3, %pcrel_lo(pcrel587) - addi a2, a4, %pcrel_lo(pcrel588) + lw a0, %pcrel_lo(pcrel583)(a2) + addi t0, a3, %pcrel_lo(pcrel584) + addi a2, a4, %pcrel_lo(pcrel585) addi a5, a0, -3 - addi a3, t3, %pcrel_lo(pcrel589) + addi a3, t3, %pcrel_lo(pcrel586) li a4, 1080 - ble a0, t2, label439 + ble a0, t2, label436 mulw t2, t1, a4 add t0, t0, t2 mulw t6, t1, a4 @@ -504,12 +504,12 @@ pcrel589: mv t5, zero add t2, a3, t6 add t4, a2, t6 - j label426 + j label423 .p2align 2 -label429: +label426: addi t3, t3, 16 .p2align 2 -label426: +label423: sh2add t6, t5, t4 sh2add a6, t5, t2 flw f10, 0(t6) @@ -529,15 +529,15 @@ label426: flw f11, 12(a6) fadd.s f12, f10, f11 fsw f12, 12(t3) - bgt a5, t5, label429 - ble a0, t5, label575 + bgt a5, t5, label426 + ble a0, t5, label572 sh2add t3, t5, t4 - j label432 + j label429 .p2align 2 -label435: +label432: addi t3, t3, 4 .p2align 2 -label432: +label429: sh2add t4, t5, t2 flw f10, 0(t3) sh2add t6, t5, t0 @@ -545,23 +545,23 @@ label432: addiw t5, t5, 1 fadd.s f11, f10, f12 fsw f11, 0(t6) - bgt a0, t5, label435 + bgt a0, t5, label432 addiw t1, t1, 1 - ble a1, t1, label438 + ble a1, t1, label435 .p2align 2 -label437: +label434: addi t0, t0, 1080 mulw t6, t1, a4 mv t5, zero mv t3, t0 add t2, a3, t6 add t4, a2, t6 - j label426 -label439: - bgt a0, zero, label440 -label438: + j label423 +label436: + bgt a0, zero, label437 +label435: ret -label440: +label437: mulw t2, t1, a4 add a5, t0, t2 mv t0, t1 @@ -569,12 +569,12 @@ label440: mv t3, zero add t2, a3, t4 add t1, a2, t4 - j label444 + j label441 .p2align 2 -label447: +label444: addi t1, t1, 4 .p2align 2 -label444: +label441: sh2add t4, t3, t2 flw f10, 0(t1) sh2add t5, t3, a5 @@ -582,11 +582,11 @@ label444: addiw t3, t3, 1 fadd.s f11, f10, f12 fsw f11, 0(t5) - bgt a0, t3, label447 + bgt a0, t3, label444 addiw t0, t0, 1 - ble a1, t0, label438 + ble a1, t0, label435 .p2align 2 -label449: +label446: addi a5, a5, 1080 mulw t4, t0, a4 li t3, 1 @@ -598,37 +598,37 @@ label449: flw f12, 0(t2) fadd.s f11, f10, f12 fsw f11, 0(a5) - bgt a0, t3, label447 + bgt a0, t3, label444 addiw t0, t0, 1 - bgt a1, t0, label449 - j label438 + bgt a1, t0, label446 + j label435 .p2align 2 -label575: +label572: addiw t1, t1, 1 - bgt a1, t1, 
label437 - j label438 + bgt a1, t1, label434 + j label435 .p2align 2 cmmc_parallel_body_3: mv t1, a0 -pcrel754: +pcrel751: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel755: +pcrel752: auipc a3, %pcrel_hi(imgOut) -pcrel756: +pcrel753: auipc a4, %pcrel_hi(my_y1) -pcrel757: +pcrel754: auipc t3, %pcrel_hi(my_y2) li t2, 3 - lw a0, %pcrel_lo(pcrel754)(a2) - addi t0, a3, %pcrel_lo(pcrel755) - addi a2, a4, %pcrel_lo(pcrel756) + lw a0, %pcrel_lo(pcrel751)(a2) + addi t0, a3, %pcrel_lo(pcrel752) + addi a2, a4, %pcrel_lo(pcrel753) addi a5, a0, -3 - addi a3, t3, %pcrel_lo(pcrel757) + addi a3, t3, %pcrel_lo(pcrel754) li a4, 1080 - bgt a0, t2, label591 - bgt a0, zero, label609 - j label607 -label591: + bgt a0, t2, label588 + bgt a0, zero, label606 + j label604 +label588: mulw t2, t1, a4 add t0, t0, t2 mulw t6, t1, a4 @@ -636,12 +636,12 @@ label591: mv t5, zero add t2, a3, t6 add t4, a2, t6 - j label595 + j label592 .p2align 2 -label598: +label595: addi t3, t3, 16 .p2align 2 -label595: +label592: sh2add t6, t5, t4 sh2add a6, t5, t2 flw f10, 0(t6) @@ -661,11 +661,11 @@ label595: flw f13, 12(a6) fadd.s f10, f11, f13 fsw f10, 12(t3) - bgt a5, t5, label598 - ble a0, t5, label744 + bgt a5, t5, label595 + ble a0, t5, label741 sh2add t3, t5, t4 .p2align 2 -label603: +label600: sh2add t4, t5, t2 flw f11, 0(t3) sh2add t6, t5, t0 @@ -673,25 +673,25 @@ label603: addiw t5, t5, 1 fadd.s f10, f11, f12 fsw f10, 0(t6) - ble a0, t5, label694 + ble a0, t5, label691 addi t3, t3, 4 - j label603 + j label600 .p2align 2 -label694: +label691: addiw t1, t1, 1 - ble a1, t1, label607 + ble a1, t1, label604 .p2align 2 -label601: +label598: addi t0, t0, 1080 mulw t6, t1, a4 mv t5, zero mv t3, t0 add t2, a3, t6 add t4, a2, t6 - j label595 -label607: + j label592 +label604: ret -label609: +label606: mulw t2, t1, a4 add a5, t0, t2 mv t0, t1 @@ -699,12 +699,12 @@ label609: mv t3, zero add t2, a3, t4 add t1, a2, t4 - j label613 + j label610 .p2align 2 -label616: +label613: addi t1, t1, 4 .p2align 2 -label613: +label610: sh2add t4, t3, t2 flw f11, 0(t1) sh2add t5, t3, a5 @@ -712,11 +712,11 @@ label613: addiw t3, t3, 1 fadd.s f10, f11, f12 fsw f10, 0(t5) - bgt a0, t3, label616 + bgt a0, t3, label613 addiw t0, t0, 1 - ble a1, t0, label607 + ble a1, t0, label604 .p2align 2 -label618: +label615: addi a5, a5, 1080 mulw t4, t0, a4 li t3, 1 @@ -728,32 +728,32 @@ label618: flw f12, 0(t2) fadd.s f10, f11, f12 fsw f10, 0(a5) - bgt a0, t3, label616 + bgt a0, t3, label613 addiw t0, t0, 1 - bgt a1, t0, label618 - j label607 + bgt a1, t0, label615 + j label604 .p2align 2 -label744: +label741: addiw t1, t1, 1 - bgt a1, t1, label601 - j label607 + bgt a1, t1, label598 + j label604 .p2align 2 cmmc_parallel_body_4: mv a3, a1 -pcrel1096: +pcrel1093: auipc a4, %pcrel_hi(my_y2) li a5, 1080 -pcrel1097: +pcrel1094: auipc t2, %pcrel_hi(imgIn) mv t1, a0 - addi a2, a4, %pcrel_lo(pcrel1096) + addi a2, a4, %pcrel_lo(pcrel1093) mulw a1, a0, a5 - addi t0, t2, %pcrel_lo(pcrel1097) + addi t0, t2, %pcrel_lo(pcrel1094) add a4, a2, a1 -pcrel1098: +pcrel1095: auipc t2, %pcrel_hi(__cmmc_fp_constant_pool) li a2, 13 - addi a1, t2, %pcrel_lo(pcrel1098) + addi a1, t2, %pcrel_lo(pcrel1095) mulw t3, a0, a5 addi a0, a4, 1076 fmv.w.x f13, zero @@ -762,9 +762,9 @@ pcrel1098: fmv.s f12, f13 fmv.s f15, f13 li t3, 269 - j label762 + j label759 .p2align 2 -label770: +label767: flw f11, 0(a1) addiw t1, t1, 1 flw f12, 4(a1) @@ -880,7 +880,7 @@ label770: fadd.s f11, f2, f0 fadd.s f13, f11, f12 fsw f13, -116(a0) - ble a3, t1, label772 + ble a3, t1, label769 addi a4, a4, 1080 mulw 
t3, t1, a5 fmv.w.x f13, zero @@ -891,7 +891,7 @@ label770: fmv.s f15, f13 li t3, 269 .p2align 2 -label762: +label759: flw f10, 0(a1) sh2add t4, t3, t2 flw f11, 4(a1) @@ -1025,277 +1025,267 @@ label762: fadd.s f10, f2, f11 fsw f10, -60(a0) flw f15, -60(t4) - ble t3, a2, label770 + ble t3, a2, label767 addi a0, a0, -64 fmv.s f12, f14 fmv.s f14, f10 - j label762 -label772: + j label759 +label769: ret .p2align 2 cmmc_parallel_body_5: - addi sp, sp, -56 - mv a6, a1 -pcrel1408: + addi sp, sp, -16 + mv t3, a1 +pcrel1396: auipc a2, %pcrel_hi(my_y2) -pcrel1409: +pcrel1397: auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) - li t6, 1080 -pcrel1410: + li t2, 1080 +pcrel1398: auipc a1, %pcrel_hi(imgOut) - addi a7, a2, %pcrel_lo(pcrel1408) - addi t5, a1, %pcrel_lo(pcrel1410) - sd s1, 0(sp) - addi a2, a3, %pcrel_lo(pcrel1409) + addi t4, a2, %pcrel_lo(pcrel1396) + addi t1, a1, %pcrel_lo(pcrel1398) + sd s0, 0(sp) + addi a2, a3, %pcrel_lo(pcrel1397) li a1, -135 - sd s6, 8(sp) + sd s1, 8(sp) slli a3, a1, 4 - sd s0, 16(sp) addi a4, a3, -1080 - sd s5, 24(sp) addi a5, a4, -1080 - sd s3, 32(sp) addi t0, a5, -1080 - sd s2, 40(sp) - addi t1, t0, -1080 - sd s4, 48(sp) - addi t2, t1, -1080 - addi t3, t2, -1080 - addi t4, t3, -1080 - j label1100 + j label1097 .p2align 2 -label1339: +label1336: addiw a0, a0, 1 - ble a6, a0, label1111 + ble t3, a0, label1108 .p2align 2 -label1100: - lui s1, 135 +label1097: + lui t6, 135 fmv.w.x f13, zero - addiw s0, s1, -1080 - fmv.s f15, f13 + addiw t5, t6, -1080 + fmv.s f14, f13 fmv.s f12, f13 - fmv.s f0, f13 - add a1, a7, s0 - li s0, 511 + fmv.s f15, f13 + add a1, t4, t5 + li t5, 511 .p2align 2 -label1102: +label1099: flw f10, 0(a2) - sh2add s3, a0, a1 - mulw s2, s0, t6 + sh2add a7, a0, a1 + mulw a6, t5, t2 flw f11, 4(a2) - fmul.s f14, f0, f10 - addiw s0, s0, -16 - add s1, t5, s2 + fmul.s f1, f15, f10 + addiw t5, t5, -16 + add t6, t1, a6 fmul.s f2, f12, f11 - fmul.s f4, f0, f11 - addi s4, s1, -1080 - addi s2, a1, -1080 + fmul.s f5, f15, f11 + addi a6, a1, -1080 flw f12, 8(a2) - fadd.s f3, f14, f2 - fmul.s f0, f15, f12 - fmul.s f2, f13, f12 - fadd.s f1, f3, f15 - fadd.s f14, f1, f2 - fsw f14, 0(s3) - sh2add s3, a0, s1 - flw f1, 0(s3) - sh2add s3, a0, s2 - fmul.s f13, f1, f10 - sh2add s2, a0, s4 - add s4, s1, a3 - fadd.s f3, f13, f4 - fadd.s f2, f3, f14 - fmul.s f3, f1, f11 - fmul.s f1, f14, f12 - fadd.s f13, f2, f0 - fsw f13, 0(s3) - add s3, a1, a3 - flw f0, 0(s2) - sh2add s2, a0, s3 - fmul.s f15, f0, f10 - sh2add s3, a0, s4 - add s4, a1, a4 - fadd.s f4, f15, f3 - fmul.s f3, f0, f11 - fmul.s f0, f13, f12 - fadd.s f2, f4, f13 - fadd.s f15, f2, f1 - fsw f15, 0(s2) - add s2, s1, a4 - flw f1, 0(s3) - sh2add s3, a0, s4 - fmul.s f14, f1, f10 - sh2add s4, a0, s2 - add s2, a1, a5 - fadd.s f4, f14, f3 - fmul.s f3, f1, f11 - fmul.s f1, f15, f12 - fadd.s f2, f4, f15 - fadd.s f14, f2, f0 - fsw f14, 0(s3) - sh2add s3, a0, s2 - flw f0, 0(s4) - add s4, s1, a5 - fmul.s f13, f0, f10 - sh2add s2, a0, s4 - add s4, s1, t0 - fadd.s f4, f13, f3 - fadd.s f2, f4, f14 - fmul.s f4, f0, f11 - fmul.s f0, f14, f12 - fadd.s f13, f2, f1 - fsw f13, 0(s3) - add s3, a1, t0 - flw f1, 0(s2) - sh2add s2, a0, s3 - fmul.s f15, f1, f10 - sh2add s3, a0, s4 - add s4, a1, t1 - fadd.s f3, f15, f4 - fadd.s f2, f3, f13 + fadd.s f3, f1, f2 + fmul.s f1, f13, f12 + fadd.s f0, f3, f14 + fmul.s f3, f14, f12 + fadd.s f13, f0, f1 + fsw f13, 0(a7) + sh2add a7, a0, t6 + flw f1, 0(a7) + sh2add a7, a0, a6 + fmul.s f2, f1, f10 + addi a6, t6, -1080 + fadd.s f4, f2, f5 + fadd.s f0, f4, f13 + fadd.s f15, f0, f3 fmul.s f3, f1, f11 fmul.s f1, f13, 
f12 - fadd.s f15, f2, f0 - fsw f15, 0(s2) - sh2add s2, a0, s4 - flw f0, 0(s3) - add s3, s1, t1 + fsw f15, 0(a7) + sh2add a7, a0, a6 + flw f0, 0(a7) + add a7, a1, a3 fmul.s f14, f0, f10 - sh2add s4, a0, s3 - add s3, s1, t2 + sh2add a6, a0, a7 + add a7, t6, a3 fadd.s f4, f14, f3 fmul.s f3, f0, f11 fmul.s f0, f15, f12 fadd.s f2, f4, f15 fadd.s f14, f2, f1 - fsw f14, 0(s2) - flw f1, 0(s4) - add s4, a1, t2 + fsw f14, 0(a6) + sh2add a6, a0, a7 + flw f1, 0(a6) + add a6, a1, a4 fmul.s f13, f1, f10 - sh2add s2, a0, s4 - sh2add s4, a0, s3 - add s3, a1, t3 + sh2add a7, a0, a6 + add a6, t6, a4 fadd.s f4, f13, f3 - fadd.s f2, f4, f14 - fmul.s f4, f1, f11 + fmul.s f3, f1, f11 fmul.s f1, f14, f12 + fadd.s f2, f4, f14 fadd.s f13, f2, f0 - fsw f13, 0(s2) - sh2add s2, a0, s3 - flw f0, 0(s4) - add s4, s1, t3 + fsw f13, 0(a7) + sh2add a7, a0, a6 + add a6, a1, a5 + flw f0, 0(a7) + sh2add a7, a0, a6 fmul.s f15, f0, f10 - sh2add s3, a0, s4 - add s4, s1, t4 - fadd.s f3, f15, f4 + add a6, t6, a5 + fadd.s f4, f15, f3 + fadd.s f2, f4, f13 fmul.s f4, f0, f11 fmul.s f0, f13, f12 - fadd.s f2, f3, f13 fadd.s f15, f2, f1 - fsw f15, 0(s2) - flw f1, 0(s3) - add s3, a1, t4 + fsw f15, 0(a7) + sh2add a7, a0, a6 + add a6, a1, t0 + flw f1, 0(a7) + sh2add s0, a0, a6 + add a7, t6, t0 fmul.s f14, f1, f10 - sh2add s2, a0, s3 - sh2add s3, a0, s4 - li s4, -675 + sh2add a6, a0, a7 + li a7, -405 fadd.s f3, f14, f4 fadd.s f2, f3, f15 fmul.s f3, f1, f11 fmul.s f1, f15, f12 fadd.s f14, f2, f0 - fmul.s f5, f14, f12 - fsw f14, 0(s2) - slli s2, s4, 4 - flw f0, 0(s3) - add s4, s1, s2 - add s3, a1, s2 + fsw f14, 0(s0) + flw f0, 0(a6) + slli a6, a7, 4 fmul.s f13, f0, f10 - sh2add s5, a0, s3 - sh2add s3, a0, s4 + add s0, t6, a6 + add s1, a1, a6 + sh2add a7, a0, s1 fadd.s f4, f13, f3 + fmul.s f3, f0, f11 + fmul.s f0, f14, f12 fadd.s f2, f4, f14 - fmul.s f4, f0, f11 fadd.s f13, f2, f1 - fsw f13, 0(s5) - flw f15, 0(s3) - addi s3, s2, -1080 - fmul.s f1, f15, f10 - add s4, s1, s3 - add s5, a1, s3 - sh2add s2, a0, s4 - sh2add s6, a0, s5 - fadd.s f3, f1, f4 - fmul.s f4, f15, f11 - fadd.s f2, f3, f13 - fadd.s f0, f2, f5 - fmul.s f5, f13, f12 - fsw f0, 0(s6) - flw f14, 0(s2) - addi s2, s3, -1080 + fsw f13, 0(a7) + sh2add a7, a0, s0 + flw f1, 0(a7) + addi a7, a6, -1080 + fmul.s f15, f1, f10 + add s0, t6, a7 + add s1, a1, a7 + sh2add a6, a0, s1 + fadd.s f4, f15, f3 + fadd.s f2, f4, f13 + fmul.s f4, f1, f11 + fmul.s f1, f13, f12 + fadd.s f15, f2, f0 + fsw f15, 0(a6) + sh2add a6, a0, s0 + flw f0, 0(a6) + addi a6, a7, -1080 + fmul.s f14, f0, f10 + add s0, t6, a6 + add s1, a1, a6 + fmul.s f5, f0, f11 + sh2add a7, a0, s1 + fadd.s f3, f14, f4 + fadd.s f2, f3, f15 + fmul.s f3, f15, f12 + fadd.s f14, f2, f1 + fsw f14, 0(a7) + sh2add a7, a0, s0 + flw f1, 0(a7) + addi a7, a6, -1080 + fmul.s f2, f1, f10 + add s1, t6, a7 + add s0, a1, a7 + sh2add a6, a0, s0 + fadd.s f4, f2, f5 + fadd.s f13, f4, f14 + fadd.s f0, f13, f3 + fmul.s f3, f1, f11 + fmul.s f1, f14, f12 + fsw f0, 0(a6) + fmul.s f5, f0, f12 + sh2add a6, a0, s1 + flw f13, 0(a6) + addi a6, a7, -1080 + fmul.s f15, f13, f10 + add a7, t6, a6 + add s0, a1, a6 + sh2add s1, a0, s0 + sh2add s0, a0, a7 + addi a7, a6, -1080 + fadd.s f4, f15, f3 + fadd.s f2, f4, f0 + fmul.s f4, f13, f11 + fadd.s f15, f2, f1 + fsw f15, 0(s1) + add s1, t6, a7 + flw f14, 0(s0) + add s0, a1, a7 fmul.s f1, f14, f10 - add s4, s1, s2 - add s5, a1, s2 - sh2add s3, a0, s4 - sh2add s6, a0, s5 + sh2add a6, a0, s0 fadd.s f3, f1, f4 - fadd.s f2, f3, f0 - fmul.s f3, f0, f12 - fadd.s f15, f2, f5 - fmul.s f5, f14, f11 - fsw f15, 0(s6) - flw f13, 0(s3) - 
addi s3, s2, -1080 - fmul.s f2, f13, f10 - add s6, s1, s3 - add s4, a1, s3 - sh2add s2, a0, s6 - sh2add s5, a0, s4 - fadd.s f4, f2, f5 - fmul.s f5, f13, f11 - fadd.s f1, f4, f15 - fmul.s f4, f15, f12 - fadd.s f14, f1, f3 - fsw f14, 0(s5) - flw f0, 0(s2) - addi s2, s3, -1080 + fmul.s f4, f14, f11 + fadd.s f2, f3, f15 + fadd.s f13, f2, f5 + fmul.s f5, f15, f12 + fsw f13, 0(a6) + sh2add a6, a0, s1 + flw f0, 0(a6) + addi a6, a7, -1080 fmul.s f1, f0, f10 - add s3, s1, s2 - add s4, a1, s2 - sh2add s5, a0, s4 - sh2add s4, a0, s3 - addi s3, s2, -1080 - fadd.s f3, f1, f5 - add s6, s1, s3 - sh2add s2, a0, s6 + add s0, t6, a6 + add s1, a1, a6 + sh2add a7, a0, s1 + fadd.s f3, f1, f4 + fmul.s f4, f0, f11 + fmul.s f0, f13, f12 + fadd.s f2, f3, f13 + fadd.s f14, f2, f5 + fsw f14, 0(a7) + sh2add a7, a0, s0 + flw f1, 0(a7) + addi a7, a6, -1080 + fmul.s f15, f1, f10 + add a6, t6, a7 + add s0, a1, a7 + sh2add s1, a0, s0 + sh2add s0, a0, a6 + addi a6, a7, -1080 + fadd.s f3, f15, f4 + fmul.s f4, f1, f11 + fmul.s f1, f14, f12 fadd.s f2, f3, f14 - fadd.s f13, f2, f4 + fadd.s f15, f2, f0 + fsw f15, 0(s1) + add s1, a1, a6 + flw f0, 0(s0) + sh2add a7, a0, s1 + add s0, t6, a6 + fmul.s f13, f0, f10 + fadd.s f3, f13, f4 fmul.s f4, f0, f11 - fmul.s f11, f14, f12 - fsw f13, 0(s5) - add s5, a1, s3 - flw f15, 0(s4) - sh2add s4, a0, s5 - fmul.s f1, f15, f10 + fmul.s f11, f15, f12 + fadd.s f2, f3, f15 + fadd.s f13, f2, f1 + fsw f13, 0(a7) + sh2add a7, a0, s0 + flw f14, 0(a7) + addi a7, a6, -1080 + fmul.s f1, f14, f10 + add a6, t6, a7 + add s1, a1, a7 + sh2add s0, a0, s1 + sh2add s1, a0, a6 fadd.s f3, f1, f4 fadd.s f2, f3, f13 fadd.s f10, f2, f11 - fsw f10, 0(s4) - flw f0, 0(s2) - blt s0, zero, label1339 - li s2, -135 - fmv.s f12, f15 - slli s1, s2, 7 - fmv.s f15, f10 - add a1, a1, s1 - j label1102 -label1111: - ld s1, 0(sp) - ld s6, 8(sp) - ld s0, 16(sp) - ld s5, 24(sp) - ld s3, 32(sp) - ld s2, 40(sp) - ld s4, 48(sp) - addi sp, sp, 56 + fsw f10, 0(s0) + flw f15, 0(s1) + blt t5, zero, label1336 + li a6, -135 + fmv.s f12, f14 + slli t6, a6, 7 + fmv.s f14, f10 + add a1, a1, t6 + j label1099 +label1108: + ld s0, 0(sp) + ld s1, 8(sp) + addi sp, sp, 16 ret diff --git a/tests/SysY2022/performance/derich3.arm.s b/tests/SysY2022/performance/derich3.arm.s index 849a39e5c..ea495efac 100644 --- a/tests/SysY2022/performance/derich3.arm.s +++ b/tests/SysY2022/performance/derich3.arm.s @@ -1,28 +1,28 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 imgIn: .zero 552960 -.align 8 +.p2align 3 imgOut: .zero 552960 -.align 8 +.p2align 3 my_y1: .zero 552960 -.align 8 +.p2align 3 my_y2: .zero 552960 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 8 .text diff --git a/tests/SysY2022/performance/derich3.riscv.s b/tests/SysY2022/performance/derich3.riscv.s index 85bd1315b..66ea25543 100644 --- a/tests/SysY2022/performance/derich3.riscv.s +++ b/tests/SysY2022/performance/derich3.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 3191654481 .4byte 1038821134 @@ -9,28 +9,28 @@ __cmmc_fp_constant_pool: .4byte 3191992809 .4byte 1038256634 .bss -.align 8 +.p2align 3 imgIn: .zero 552960 -.align 8 +.p2align 3 imgOut: .zero 552960 -.align 8 +.p2align 3 my_y1: .zero 552960 -.align 8 +.p2align 3 my_y2: .zero 552960 -.align 8 +.p2align 
3 cmmc_parallel_body_payload_0: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 24 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 8 .text @@ -38,10 +38,10 @@ cmmc_parallel_body_payload_3: .globl main main: addi sp, sp, -48 -pcrel1503: +pcrel1491: auipc a1, %pcrel_hi(imgIn) sd ra, 0(sp) - addi a0, a1, %pcrel_lo(pcrel1503) + addi a0, a1, %pcrel_lo(pcrel1491) fsw f8, 8(sp) fsw f9, 12(sp) fsw f18, 16(sp) @@ -52,18 +52,18 @@ pcrel1503: jal getfarray li a0, 156 jal _sysy_starttime -pcrel1504: - auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) -pcrel1505: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) li s1, 270 li s0, 512 - addi a1, a3, %pcrel_lo(pcrel1504) - addi a0, a2, %pcrel_lo(pcrel1505) - flw f8, 12(a1) -pcrel1506: +pcrel1492: + auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) +pcrel1493: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) + addi a1, a3, %pcrel_lo(pcrel1492) + addi a0, a2, %pcrel_lo(pcrel1493) +pcrel1494: auipc a3, %pcrel_hi(cmmc_parallel_body_1) - fsw f8, %pcrel_lo(pcrel1505)(a2) + flw f8, 12(a1) + fsw f8, %pcrel_lo(pcrel1493)(a2) lui a2, 260096 flw f9, 16(a1) fmv.w.x f18, a2 @@ -71,7 +71,7 @@ pcrel1506: fsw f9, 4(a0) slli s2, a2, 41 fsw f18, 8(a0) - addi a2, a3, %pcrel_lo(pcrel1506) + addi a2, a3, %pcrel_lo(pcrel1494) flw f19, 8(a1) ori a1, s2, 270 fsw f19, 12(a0) @@ -81,30 +81,30 @@ pcrel1506: jal cmmcParallelFor mv a1, s0 mv a0, zero -pcrel1507: +pcrel1495: auipc a3, %pcrel_hi(cmmc_parallel_body_4) - addi a2, a3, %pcrel_lo(pcrel1507) + addi a2, a3, %pcrel_lo(pcrel1495) jal cmmcParallelFor -pcrel1508: +pcrel1496: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel1509: +pcrel1497: auipc a3, %pcrel_hi(cmmc_parallel_body_2) ori a1, s2, 270 - addi a2, a3, %pcrel_lo(pcrel1509) - sd a1, %pcrel_lo(pcrel1508)(a0) + addi a2, a3, %pcrel_lo(pcrel1497) + sd a1, %pcrel_lo(pcrel1496)(a0) mv a1, s0 mv a0, zero jal cmmcParallelFor -pcrel1510: +pcrel1498: auipc a1, %pcrel_hi(cmmc_parallel_body_payload_0) li a2, 270 -pcrel1511: +pcrel1499: auipc a4, %pcrel_hi(cmmc_parallel_body_0) - addi a0, a1, %pcrel_lo(pcrel1510) - fsw f8, %pcrel_lo(pcrel1510)(a1) + addi a0, a1, %pcrel_lo(pcrel1498) + fsw f8, %pcrel_lo(pcrel1498)(a1) slli a1, a2, 32 fsw f9, 4(a0) - addi a2, a4, %pcrel_lo(pcrel1511) + addi a2, a4, %pcrel_lo(pcrel1499) addi a3, a1, 512 fsw f18, 8(a0) fsw f19, 12(a0) @@ -112,26 +112,26 @@ pcrel1511: mv a1, s1 mv a0, zero jal cmmcParallelFor - mv a1, s1 mv a0, zero -pcrel1512: + mv a1, s1 +pcrel1500: auipc a3, %pcrel_hi(cmmc_parallel_body_5) - addi a2, a3, %pcrel_lo(pcrel1512) + addi a2, a3, %pcrel_lo(pcrel1500) jal cmmcParallelFor -pcrel1513: - auipc a3, %pcrel_hi(cmmc_parallel_body_3) ori a1, s2, 270 -pcrel1514: +pcrel1501: + auipc a3, %pcrel_hi(cmmc_parallel_body_3) +pcrel1502: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - addi a2, a3, %pcrel_lo(pcrel1513) - sd a1, %pcrel_lo(pcrel1514)(a0) + addi a2, a3, %pcrel_lo(pcrel1501) + sd a1, %pcrel_lo(pcrel1502)(a0) mv a1, s0 mv a0, zero jal cmmcParallelFor li a0, 158 -pcrel1515: +pcrel1503: auipc a1, %pcrel_hi(imgOut) - addi s0, a1, %pcrel_lo(pcrel1515) + addi s0, a1, %pcrel_lo(pcrel1503) jal _sysy_stoptime mv a1, s0 li a2, 135 @@ -152,17 +152,17 @@ pcrel1515: cmmc_parallel_body_0: mv a5, a0 mv a4, a1 -pcrel216: +pcrel213: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel217: +pcrel214: auipc t0, %pcrel_hi(my_y1) -pcrel218: +pcrel215: auipc t1, %pcrel_hi(imgOut) li t4, 3 - flw f10, %pcrel_lo(pcrel216)(a2) - addi a0, a2, 
%pcrel_lo(pcrel216) - addi a1, t0, %pcrel_lo(pcrel217) - addi a2, t1, %pcrel_lo(pcrel218) + flw f10, %pcrel_lo(pcrel213)(a2) + addi a0, a2, %pcrel_lo(pcrel213) + addi a1, t0, %pcrel_lo(pcrel214) + addi a2, t1, %pcrel_lo(pcrel215) flw f11, 4(a0) flw f12, 8(a0) flw f13, 12(a0) @@ -281,7 +281,7 @@ label4: fadd.s f2, f4, f7 fsw f2, 0(a6) bgt t2, t5, label10 - ble a3, t5, label207 + ble a3, t5, label204 mulw t6, t5, a0 fmv.s f1, f14 add t4, a2, t6 @@ -310,31 +310,31 @@ label13: label32: ret .p2align 2 -label207: +label204: addiw a5, a5, 1 bgt a4, a5, label2 j label32 .p2align 2 cmmc_parallel_body_1: mv t0, a0 -pcrel418: +pcrel415: auipc a3, %pcrel_hi(cmmc_parallel_body_payload_1) -pcrel419: +pcrel416: auipc t2, %pcrel_hi(imgIn) li t1, 3 - flw f10, %pcrel_lo(pcrel418)(a3) - addi a2, a3, %pcrel_lo(pcrel418) -pcrel420: + flw f10, %pcrel_lo(pcrel415)(a3) + addi a2, a3, %pcrel_lo(pcrel415) +pcrel417: auipc a3, %pcrel_hi(my_y1) flw f11, 4(a2) - addi a5, a3, %pcrel_lo(pcrel420) + addi a5, a3, %pcrel_lo(pcrel417) flw f12, 8(a2) li a3, 1080 flw f13, 12(a2) lw a0, 16(a2) - addi a2, t2, %pcrel_lo(pcrel419) + addi a2, t2, %pcrel_lo(pcrel416) addi a4, a0, -3 - ble a0, t1, label243 + ble a0, t1, label240 mulw t1, t0, a3 add a5, a5, t1 mulw t3, t0, a3 @@ -344,13 +344,13 @@ pcrel420: fmv.s f14, f15 fmv.s f1, f15 mv t3, zero - j label224 + j label221 .p2align 2 -label355: +label352: addiw t0, t0, 1 - ble a1, t0, label242 + ble a1, t0, label239 .p2align 2 -label233: +label230: addi a5, a5, 1080 mulw t3, t0, a3 fmv.w.x f15, zero @@ -360,7 +360,7 @@ label233: fmv.s f1, f15 mv t3, zero .p2align 2 -label224: +label221: sh2add t4, t3, t2 fmul.s f6, f11, f15 addiw t3, t3, 4 @@ -399,19 +399,19 @@ label224: fadd.s f3, f6, f1 fadd.s f2, f3, f7 fsw f2, 12(t1) - ble a4, t3, label327 + ble a4, t3, label324 addi t1, t1, 16 fmv.s f1, f14 fmv.s f14, f2 - j label224 + j label221 .p2align 2 -label327: - ble a0, t3, label408 +label324: + ble a0, t3, label405 sh2add t1, t3, t2 fmv.s f1, f15 fmv.s f0, f14 .p2align 2 -label235: +label232: flw f15, 0(t1) fmul.s f5, f11, f1 fmul.s f4, f12, f2 @@ -423,17 +423,17 @@ label235: fadd.s f1, f3, f4 fadd.s f14, f1, f5 fsw f14, 0(t2) - ble a0, t3, label355 + ble a0, t3, label352 addi t1, t1, 4 fmv.s f1, f15 fmv.s f0, f2 fmv.s f2, f14 - j label235 -label243: - bgt a0, zero, label244 -label242: + j label232 +label240: + bgt a0, zero, label241 +label239: ret -label244: +label241: mulw t1, t0, a3 add a4, a5, t1 mv a5, t0 @@ -442,15 +442,15 @@ label244: fmv.s f14, f1 fmv.s f2, f1 mv t1, zero - j label248 + j label245 .p2align 2 -label254: +label251: addi t0, t0, 4 fmv.s f2, f15 fmv.s f1, f14 fmv.s f14, f0 .p2align 2 -label248: +label245: flw f15, 0(t0) fmul.s f5, f11, f2 fmul.s f4, f12, f14 @@ -462,9 +462,9 @@ label248: fadd.s f2, f3, f4 fadd.s f0, f2, f5 fsw f0, 0(t2) - bgt a0, t1, label254 + bgt a0, t1, label251 addiw a5, a5, 1 - ble a1, a5, label242 + ble a1, a5, label239 addi a4, a4, 1080 mulw t1, a5, a3 fmv.w.x f1, zero @@ -472,31 +472,31 @@ label248: fmv.s f14, f1 fmv.s f2, f1 mv t1, zero - j label248 + j label245 .p2align 2 -label408: +label405: addiw t0, t0, 1 - bgt a1, t0, label233 - j label242 + bgt a1, t0, label230 + j label239 .p2align 2 cmmc_parallel_body_2: mv t1, a0 -pcrel586: +pcrel583: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel587: +pcrel584: auipc a3, %pcrel_hi(imgOut) -pcrel588: +pcrel585: auipc a4, %pcrel_hi(my_y1) -pcrel589: +pcrel586: auipc t3, %pcrel_hi(my_y2) li t2, 3 - lw a0, %pcrel_lo(pcrel586)(a2) - addi t0, a3, %pcrel_lo(pcrel587) - addi a2, a4, 
%pcrel_lo(pcrel588) + lw a0, %pcrel_lo(pcrel583)(a2) + addi t0, a3, %pcrel_lo(pcrel584) + addi a2, a4, %pcrel_lo(pcrel585) addi a5, a0, -3 - addi a3, t3, %pcrel_lo(pcrel589) + addi a3, t3, %pcrel_lo(pcrel586) li a4, 1080 - ble a0, t2, label439 + ble a0, t2, label436 mulw t2, t1, a4 add t0, t0, t2 mulw t6, t1, a4 @@ -504,12 +504,12 @@ pcrel589: mv t5, zero add t2, a3, t6 add t4, a2, t6 - j label426 + j label423 .p2align 2 -label429: +label426: addi t3, t3, 16 .p2align 2 -label426: +label423: sh2add t6, t5, t4 sh2add a6, t5, t2 flw f10, 0(t6) @@ -529,15 +529,15 @@ label426: flw f11, 12(a6) fadd.s f12, f10, f11 fsw f12, 12(t3) - bgt a5, t5, label429 - ble a0, t5, label575 + bgt a5, t5, label426 + ble a0, t5, label572 sh2add t3, t5, t4 - j label432 + j label429 .p2align 2 -label435: +label432: addi t3, t3, 4 .p2align 2 -label432: +label429: sh2add t4, t5, t2 flw f10, 0(t3) sh2add t6, t5, t0 @@ -545,23 +545,23 @@ label432: addiw t5, t5, 1 fadd.s f11, f10, f12 fsw f11, 0(t6) - bgt a0, t5, label435 + bgt a0, t5, label432 addiw t1, t1, 1 - ble a1, t1, label438 + ble a1, t1, label435 .p2align 2 -label437: +label434: addi t0, t0, 1080 mulw t6, t1, a4 mv t5, zero mv t3, t0 add t2, a3, t6 add t4, a2, t6 - j label426 -label439: - bgt a0, zero, label440 -label438: + j label423 +label436: + bgt a0, zero, label437 +label435: ret -label440: +label437: mulw t2, t1, a4 add a5, t0, t2 mv t0, t1 @@ -569,12 +569,12 @@ label440: mv t3, zero add t2, a3, t4 add t1, a2, t4 - j label444 + j label441 .p2align 2 -label447: +label444: addi t1, t1, 4 .p2align 2 -label444: +label441: sh2add t4, t3, t2 flw f10, 0(t1) sh2add t5, t3, a5 @@ -582,11 +582,11 @@ label444: addiw t3, t3, 1 fadd.s f11, f10, f12 fsw f11, 0(t5) - bgt a0, t3, label447 + bgt a0, t3, label444 addiw t0, t0, 1 - ble a1, t0, label438 + ble a1, t0, label435 .p2align 2 -label449: +label446: addi a5, a5, 1080 mulw t4, t0, a4 li t3, 1 @@ -598,37 +598,37 @@ label449: flw f12, 0(t2) fadd.s f11, f10, f12 fsw f11, 0(a5) - bgt a0, t3, label447 + bgt a0, t3, label444 addiw t0, t0, 1 - bgt a1, t0, label449 - j label438 + bgt a1, t0, label446 + j label435 .p2align 2 -label575: +label572: addiw t1, t1, 1 - bgt a1, t1, label437 - j label438 + bgt a1, t1, label434 + j label435 .p2align 2 cmmc_parallel_body_3: mv t1, a0 -pcrel754: +pcrel751: auipc a2, %pcrel_hi(cmmc_parallel_body_payload_3) -pcrel755: +pcrel752: auipc a3, %pcrel_hi(imgOut) -pcrel756: +pcrel753: auipc a4, %pcrel_hi(my_y1) -pcrel757: +pcrel754: auipc t3, %pcrel_hi(my_y2) li t2, 3 - lw a0, %pcrel_lo(pcrel754)(a2) - addi t0, a3, %pcrel_lo(pcrel755) - addi a2, a4, %pcrel_lo(pcrel756) + lw a0, %pcrel_lo(pcrel751)(a2) + addi t0, a3, %pcrel_lo(pcrel752) + addi a2, a4, %pcrel_lo(pcrel753) addi a5, a0, -3 - addi a3, t3, %pcrel_lo(pcrel757) + addi a3, t3, %pcrel_lo(pcrel754) li a4, 1080 - bgt a0, t2, label591 - bgt a0, zero, label609 - j label607 -label591: + bgt a0, t2, label588 + bgt a0, zero, label606 + j label604 +label588: mulw t2, t1, a4 add t0, t0, t2 mulw t6, t1, a4 @@ -636,12 +636,12 @@ label591: mv t5, zero add t2, a3, t6 add t4, a2, t6 - j label595 + j label592 .p2align 2 -label598: +label595: addi t3, t3, 16 .p2align 2 -label595: +label592: sh2add t6, t5, t4 sh2add a6, t5, t2 flw f10, 0(t6) @@ -661,11 +661,11 @@ label595: flw f13, 12(a6) fadd.s f10, f11, f13 fsw f10, 12(t3) - bgt a5, t5, label598 - ble a0, t5, label744 + bgt a5, t5, label595 + ble a0, t5, label741 sh2add t3, t5, t4 .p2align 2 -label603: +label600: sh2add t4, t5, t2 flw f11, 0(t3) sh2add t6, t5, t0 @@ -673,25 +673,25 @@ label603: addiw 
t5, t5, 1 fadd.s f10, f11, f12 fsw f10, 0(t6) - ble a0, t5, label694 + ble a0, t5, label691 addi t3, t3, 4 - j label603 + j label600 .p2align 2 -label694: +label691: addiw t1, t1, 1 - ble a1, t1, label607 + ble a1, t1, label604 .p2align 2 -label601: +label598: addi t0, t0, 1080 mulw t6, t1, a4 mv t5, zero mv t3, t0 add t2, a3, t6 add t4, a2, t6 - j label595 -label607: + j label592 +label604: ret -label609: +label606: mulw t2, t1, a4 add a5, t0, t2 mv t0, t1 @@ -699,12 +699,12 @@ label609: mv t3, zero add t2, a3, t4 add t1, a2, t4 - j label613 + j label610 .p2align 2 -label616: +label613: addi t1, t1, 4 .p2align 2 -label613: +label610: sh2add t4, t3, t2 flw f11, 0(t1) sh2add t5, t3, a5 @@ -712,11 +712,11 @@ label613: addiw t3, t3, 1 fadd.s f10, f11, f12 fsw f10, 0(t5) - bgt a0, t3, label616 + bgt a0, t3, label613 addiw t0, t0, 1 - ble a1, t0, label607 + ble a1, t0, label604 .p2align 2 -label618: +label615: addi a5, a5, 1080 mulw t4, t0, a4 li t3, 1 @@ -728,32 +728,32 @@ label618: flw f12, 0(t2) fadd.s f10, f11, f12 fsw f10, 0(a5) - bgt a0, t3, label616 + bgt a0, t3, label613 addiw t0, t0, 1 - bgt a1, t0, label618 - j label607 + bgt a1, t0, label615 + j label604 .p2align 2 -label744: +label741: addiw t1, t1, 1 - bgt a1, t1, label601 - j label607 + bgt a1, t1, label598 + j label604 .p2align 2 cmmc_parallel_body_4: mv a3, a1 -pcrel1096: +pcrel1093: auipc a4, %pcrel_hi(my_y2) li a5, 1080 -pcrel1097: +pcrel1094: auipc t2, %pcrel_hi(imgIn) mv t1, a0 - addi a2, a4, %pcrel_lo(pcrel1096) + addi a2, a4, %pcrel_lo(pcrel1093) mulw a1, a0, a5 - addi t0, t2, %pcrel_lo(pcrel1097) + addi t0, t2, %pcrel_lo(pcrel1094) add a4, a2, a1 -pcrel1098: +pcrel1095: auipc t2, %pcrel_hi(__cmmc_fp_constant_pool) li a2, 13 - addi a1, t2, %pcrel_lo(pcrel1098) + addi a1, t2, %pcrel_lo(pcrel1095) mulw t3, a0, a5 addi a0, a4, 1076 fmv.w.x f13, zero @@ -762,9 +762,9 @@ pcrel1098: fmv.s f12, f13 fmv.s f15, f13 li t3, 269 - j label762 + j label759 .p2align 2 -label770: +label767: flw f11, 0(a1) addiw t1, t1, 1 flw f12, 4(a1) @@ -880,7 +880,7 @@ label770: fadd.s f11, f2, f0 fadd.s f13, f11, f12 fsw f13, -116(a0) - ble a3, t1, label772 + ble a3, t1, label769 addi a4, a4, 1080 mulw t3, t1, a5 fmv.w.x f13, zero @@ -891,7 +891,7 @@ label770: fmv.s f15, f13 li t3, 269 .p2align 2 -label762: +label759: flw f10, 0(a1) sh2add t4, t3, t2 flw f11, 4(a1) @@ -1025,277 +1025,267 @@ label762: fadd.s f10, f2, f11 fsw f10, -60(a0) flw f15, -60(t4) - ble t3, a2, label770 + ble t3, a2, label767 addi a0, a0, -64 fmv.s f12, f14 fmv.s f14, f10 - j label762 -label772: + j label759 +label769: ret .p2align 2 cmmc_parallel_body_5: - addi sp, sp, -56 - mv a6, a1 -pcrel1408: + addi sp, sp, -16 + mv t3, a1 +pcrel1396: auipc a2, %pcrel_hi(my_y2) -pcrel1409: +pcrel1397: auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) - li t6, 1080 -pcrel1410: + li t2, 1080 +pcrel1398: auipc a1, %pcrel_hi(imgOut) - addi a7, a2, %pcrel_lo(pcrel1408) - addi t5, a1, %pcrel_lo(pcrel1410) - sd s1, 0(sp) - addi a2, a3, %pcrel_lo(pcrel1409) + addi t4, a2, %pcrel_lo(pcrel1396) + addi t1, a1, %pcrel_lo(pcrel1398) + sd s0, 0(sp) + addi a2, a3, %pcrel_lo(pcrel1397) li a1, -135 - sd s6, 8(sp) + sd s1, 8(sp) slli a3, a1, 4 - sd s0, 16(sp) addi a4, a3, -1080 - sd s5, 24(sp) addi a5, a4, -1080 - sd s3, 32(sp) addi t0, a5, -1080 - sd s2, 40(sp) - addi t1, t0, -1080 - sd s4, 48(sp) - addi t2, t1, -1080 - addi t3, t2, -1080 - addi t4, t3, -1080 - j label1100 + j label1097 .p2align 2 -label1339: +label1336: addiw a0, a0, 1 - ble a6, a0, label1111 + ble t3, a0, label1108 .p2align 2 -label1100: - lui 
s1, 135 +label1097: + lui t6, 135 fmv.w.x f13, zero - addiw s0, s1, -1080 - fmv.s f15, f13 + addiw t5, t6, -1080 + fmv.s f14, f13 fmv.s f12, f13 - fmv.s f0, f13 - add a1, a7, s0 - li s0, 511 + fmv.s f15, f13 + add a1, t4, t5 + li t5, 511 .p2align 2 -label1102: +label1099: flw f10, 0(a2) - sh2add s3, a0, a1 - mulw s2, s0, t6 + sh2add a7, a0, a1 + mulw a6, t5, t2 flw f11, 4(a2) - fmul.s f14, f0, f10 - addiw s0, s0, -16 - add s1, t5, s2 + fmul.s f1, f15, f10 + addiw t5, t5, -16 + add t6, t1, a6 fmul.s f2, f12, f11 - fmul.s f4, f0, f11 - addi s4, s1, -1080 - addi s2, a1, -1080 + fmul.s f5, f15, f11 + addi a6, a1, -1080 flw f12, 8(a2) - fadd.s f3, f14, f2 - fmul.s f0, f15, f12 - fmul.s f2, f13, f12 - fadd.s f1, f3, f15 - fadd.s f14, f1, f2 - fsw f14, 0(s3) - sh2add s3, a0, s1 - flw f1, 0(s3) - sh2add s3, a0, s2 - fmul.s f13, f1, f10 - sh2add s2, a0, s4 - add s4, s1, a3 - fadd.s f3, f13, f4 - fadd.s f2, f3, f14 - fmul.s f3, f1, f11 - fmul.s f1, f14, f12 - fadd.s f13, f2, f0 - fsw f13, 0(s3) - add s3, a1, a3 - flw f0, 0(s2) - sh2add s2, a0, s3 - fmul.s f15, f0, f10 - sh2add s3, a0, s4 - add s4, a1, a4 - fadd.s f4, f15, f3 - fmul.s f3, f0, f11 - fmul.s f0, f13, f12 - fadd.s f2, f4, f13 - fadd.s f15, f2, f1 - fsw f15, 0(s2) - add s2, s1, a4 - flw f1, 0(s3) - sh2add s3, a0, s4 - fmul.s f14, f1, f10 - sh2add s4, a0, s2 - add s2, a1, a5 - fadd.s f4, f14, f3 - fmul.s f3, f1, f11 - fmul.s f1, f15, f12 - fadd.s f2, f4, f15 - fadd.s f14, f2, f0 - fsw f14, 0(s3) - sh2add s3, a0, s2 - flw f0, 0(s4) - add s4, s1, a5 - fmul.s f13, f0, f10 - sh2add s2, a0, s4 - add s4, s1, t0 - fadd.s f4, f13, f3 - fadd.s f2, f4, f14 - fmul.s f4, f0, f11 - fmul.s f0, f14, f12 - fadd.s f13, f2, f1 - fsw f13, 0(s3) - add s3, a1, t0 - flw f1, 0(s2) - sh2add s2, a0, s3 - fmul.s f15, f1, f10 - sh2add s3, a0, s4 - add s4, a1, t1 - fadd.s f3, f15, f4 - fadd.s f2, f3, f13 + fadd.s f3, f1, f2 + fmul.s f1, f13, f12 + fadd.s f0, f3, f14 + fmul.s f3, f14, f12 + fadd.s f13, f0, f1 + fsw f13, 0(a7) + sh2add a7, a0, t6 + flw f1, 0(a7) + sh2add a7, a0, a6 + fmul.s f2, f1, f10 + addi a6, t6, -1080 + fadd.s f4, f2, f5 + fadd.s f0, f4, f13 + fadd.s f15, f0, f3 fmul.s f3, f1, f11 fmul.s f1, f13, f12 - fadd.s f15, f2, f0 - fsw f15, 0(s2) - sh2add s2, a0, s4 - flw f0, 0(s3) - add s3, s1, t1 + fsw f15, 0(a7) + sh2add a7, a0, a6 + flw f0, 0(a7) + add a7, a1, a3 fmul.s f14, f0, f10 - sh2add s4, a0, s3 - add s3, s1, t2 + sh2add a6, a0, a7 + add a7, t6, a3 fadd.s f4, f14, f3 fmul.s f3, f0, f11 fmul.s f0, f15, f12 fadd.s f2, f4, f15 fadd.s f14, f2, f1 - fsw f14, 0(s2) - flw f1, 0(s4) - add s4, a1, t2 + fsw f14, 0(a6) + sh2add a6, a0, a7 + flw f1, 0(a6) + add a6, a1, a4 fmul.s f13, f1, f10 - sh2add s2, a0, s4 - sh2add s4, a0, s3 - add s3, a1, t3 + sh2add a7, a0, a6 + add a6, t6, a4 fadd.s f4, f13, f3 - fadd.s f2, f4, f14 - fmul.s f4, f1, f11 + fmul.s f3, f1, f11 fmul.s f1, f14, f12 + fadd.s f2, f4, f14 fadd.s f13, f2, f0 - fsw f13, 0(s2) - sh2add s2, a0, s3 - flw f0, 0(s4) - add s4, s1, t3 + fsw f13, 0(a7) + sh2add a7, a0, a6 + add a6, a1, a5 + flw f0, 0(a7) + sh2add a7, a0, a6 fmul.s f15, f0, f10 - sh2add s3, a0, s4 - add s4, s1, t4 - fadd.s f3, f15, f4 + add a6, t6, a5 + fadd.s f4, f15, f3 + fadd.s f2, f4, f13 fmul.s f4, f0, f11 fmul.s f0, f13, f12 - fadd.s f2, f3, f13 fadd.s f15, f2, f1 - fsw f15, 0(s2) - flw f1, 0(s3) - add s3, a1, t4 + fsw f15, 0(a7) + sh2add a7, a0, a6 + add a6, a1, t0 + flw f1, 0(a7) + sh2add s0, a0, a6 + add a7, t6, t0 fmul.s f14, f1, f10 - sh2add s2, a0, s3 - sh2add s3, a0, s4 - li s4, -675 + sh2add a6, a0, a7 + li a7, -405 
fadd.s f3, f14, f4 fadd.s f2, f3, f15 fmul.s f3, f1, f11 fmul.s f1, f15, f12 fadd.s f14, f2, f0 - fmul.s f5, f14, f12 - fsw f14, 0(s2) - slli s2, s4, 4 - flw f0, 0(s3) - add s4, s1, s2 - add s3, a1, s2 + fsw f14, 0(s0) + flw f0, 0(a6) + slli a6, a7, 4 fmul.s f13, f0, f10 - sh2add s5, a0, s3 - sh2add s3, a0, s4 + add s0, t6, a6 + add s1, a1, a6 + sh2add a7, a0, s1 fadd.s f4, f13, f3 + fmul.s f3, f0, f11 + fmul.s f0, f14, f12 fadd.s f2, f4, f14 - fmul.s f4, f0, f11 fadd.s f13, f2, f1 - fsw f13, 0(s5) - flw f15, 0(s3) - addi s3, s2, -1080 - fmul.s f1, f15, f10 - add s4, s1, s3 - add s5, a1, s3 - sh2add s2, a0, s4 - sh2add s6, a0, s5 - fadd.s f3, f1, f4 - fmul.s f4, f15, f11 - fadd.s f2, f3, f13 - fadd.s f0, f2, f5 - fmul.s f5, f13, f12 - fsw f0, 0(s6) - flw f14, 0(s2) - addi s2, s3, -1080 + fsw f13, 0(a7) + sh2add a7, a0, s0 + flw f1, 0(a7) + addi a7, a6, -1080 + fmul.s f15, f1, f10 + add s0, t6, a7 + add s1, a1, a7 + sh2add a6, a0, s1 + fadd.s f4, f15, f3 + fadd.s f2, f4, f13 + fmul.s f4, f1, f11 + fmul.s f1, f13, f12 + fadd.s f15, f2, f0 + fsw f15, 0(a6) + sh2add a6, a0, s0 + flw f0, 0(a6) + addi a6, a7, -1080 + fmul.s f14, f0, f10 + add s0, t6, a6 + add s1, a1, a6 + fmul.s f5, f0, f11 + sh2add a7, a0, s1 + fadd.s f3, f14, f4 + fadd.s f2, f3, f15 + fmul.s f3, f15, f12 + fadd.s f14, f2, f1 + fsw f14, 0(a7) + sh2add a7, a0, s0 + flw f1, 0(a7) + addi a7, a6, -1080 + fmul.s f2, f1, f10 + add s1, t6, a7 + add s0, a1, a7 + sh2add a6, a0, s0 + fadd.s f4, f2, f5 + fadd.s f13, f4, f14 + fadd.s f0, f13, f3 + fmul.s f3, f1, f11 + fmul.s f1, f14, f12 + fsw f0, 0(a6) + fmul.s f5, f0, f12 + sh2add a6, a0, s1 + flw f13, 0(a6) + addi a6, a7, -1080 + fmul.s f15, f13, f10 + add a7, t6, a6 + add s0, a1, a6 + sh2add s1, a0, s0 + sh2add s0, a0, a7 + addi a7, a6, -1080 + fadd.s f4, f15, f3 + fadd.s f2, f4, f0 + fmul.s f4, f13, f11 + fadd.s f15, f2, f1 + fsw f15, 0(s1) + add s1, t6, a7 + flw f14, 0(s0) + add s0, a1, a7 fmul.s f1, f14, f10 - add s4, s1, s2 - add s5, a1, s2 - sh2add s3, a0, s4 - sh2add s6, a0, s5 + sh2add a6, a0, s0 fadd.s f3, f1, f4 - fadd.s f2, f3, f0 - fmul.s f3, f0, f12 - fadd.s f15, f2, f5 - fmul.s f5, f14, f11 - fsw f15, 0(s6) - flw f13, 0(s3) - addi s3, s2, -1080 - fmul.s f2, f13, f10 - add s6, s1, s3 - add s4, a1, s3 - sh2add s2, a0, s6 - sh2add s5, a0, s4 - fadd.s f4, f2, f5 - fmul.s f5, f13, f11 - fadd.s f1, f4, f15 - fmul.s f4, f15, f12 - fadd.s f14, f1, f3 - fsw f14, 0(s5) - flw f0, 0(s2) - addi s2, s3, -1080 + fmul.s f4, f14, f11 + fadd.s f2, f3, f15 + fadd.s f13, f2, f5 + fmul.s f5, f15, f12 + fsw f13, 0(a6) + sh2add a6, a0, s1 + flw f0, 0(a6) + addi a6, a7, -1080 fmul.s f1, f0, f10 - add s3, s1, s2 - add s4, a1, s2 - sh2add s5, a0, s4 - sh2add s4, a0, s3 - addi s3, s2, -1080 - fadd.s f3, f1, f5 - add s6, s1, s3 - sh2add s2, a0, s6 + add s0, t6, a6 + add s1, a1, a6 + sh2add a7, a0, s1 + fadd.s f3, f1, f4 + fmul.s f4, f0, f11 + fmul.s f0, f13, f12 + fadd.s f2, f3, f13 + fadd.s f14, f2, f5 + fsw f14, 0(a7) + sh2add a7, a0, s0 + flw f1, 0(a7) + addi a7, a6, -1080 + fmul.s f15, f1, f10 + add a6, t6, a7 + add s0, a1, a7 + sh2add s1, a0, s0 + sh2add s0, a0, a6 + addi a6, a7, -1080 + fadd.s f3, f15, f4 + fmul.s f4, f1, f11 + fmul.s f1, f14, f12 fadd.s f2, f3, f14 - fadd.s f13, f2, f4 + fadd.s f15, f2, f0 + fsw f15, 0(s1) + add s1, a1, a6 + flw f0, 0(s0) + sh2add a7, a0, s1 + add s0, t6, a6 + fmul.s f13, f0, f10 + fadd.s f3, f13, f4 fmul.s f4, f0, f11 - fmul.s f11, f14, f12 - fsw f13, 0(s5) - add s5, a1, s3 - flw f15, 0(s4) - sh2add s4, a0, s5 - fmul.s f1, f15, f10 + fmul.s f11, f15, f12 + 
fadd.s f2, f3, f15 + fadd.s f13, f2, f1 + fsw f13, 0(a7) + sh2add a7, a0, s0 + flw f14, 0(a7) + addi a7, a6, -1080 + fmul.s f1, f14, f10 + add a6, t6, a7 + add s1, a1, a7 + sh2add s0, a0, s1 + sh2add s1, a0, a6 fadd.s f3, f1, f4 fadd.s f2, f3, f13 fadd.s f10, f2, f11 - fsw f10, 0(s4) - flw f0, 0(s2) - blt s0, zero, label1339 - li s2, -135 - fmv.s f12, f15 - slli s1, s2, 7 - fmv.s f15, f10 - add a1, a1, s1 - j label1102 -label1111: - ld s1, 0(sp) - ld s6, 8(sp) - ld s0, 16(sp) - ld s5, 24(sp) - ld s3, 32(sp) - ld s2, 40(sp) - ld s4, 48(sp) - addi sp, sp, 56 + fsw f10, 0(s0) + flw f15, 0(s1) + blt t5, zero, label1336 + li a6, -135 + fmv.s f12, f14 + slli t6, a6, 7 + fmv.s f14, f10 + add a1, a1, t6 + j label1099 +label1108: + ld s0, 0(sp) + ld s1, 8(sp) + addi sp, sp, 16 ret diff --git a/tests/SysY2022/performance/fft0.arm.s b/tests/SysY2022/performance/fft0.arm.s index 7904836d7..246e82fe7 100644 --- a/tests/SysY2022/performance/fft0.arm.s +++ b/tests/SysY2022/performance/fft0.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 a: .zero 8388608 -.align 8 +.p2align 3 b: .zero 8388608 .text diff --git a/tests/SysY2022/performance/fft0.riscv.s b/tests/SysY2022/performance/fft0.riscv.s index 6b73bb74f..823f18e4e 100644 --- a/tests/SysY2022/performance/fft0.riscv.s +++ b/tests/SysY2022/performance/fft0.riscv.s @@ -1,13 +1,13 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 a: .zero 8388608 -.align 8 +.p2align 3 b: .zero 8388608 .text diff --git a/tests/SysY2022/performance/fft1.arm.s b/tests/SysY2022/performance/fft1.arm.s index 7904836d7..246e82fe7 100644 --- a/tests/SysY2022/performance/fft1.arm.s +++ b/tests/SysY2022/performance/fft1.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 a: .zero 8388608 -.align 8 +.p2align 3 b: .zero 8388608 .text diff --git a/tests/SysY2022/performance/fft1.riscv.s b/tests/SysY2022/performance/fft1.riscv.s index 6b73bb74f..823f18e4e 100644 --- a/tests/SysY2022/performance/fft1.riscv.s +++ b/tests/SysY2022/performance/fft1.riscv.s @@ -1,13 +1,13 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 a: .zero 8388608 -.align 8 +.p2align 3 b: .zero 8388608 .text diff --git a/tests/SysY2022/performance/fft2.arm.s b/tests/SysY2022/performance/fft2.arm.s index 7904836d7..246e82fe7 100644 --- a/tests/SysY2022/performance/fft2.arm.s +++ b/tests/SysY2022/performance/fft2.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 a: .zero 8388608 -.align 8 +.p2align 3 b: .zero 8388608 .text diff --git a/tests/SysY2022/performance/fft2.riscv.s b/tests/SysY2022/performance/fft2.riscv.s index 6b73bb74f..823f18e4e 100644 --- a/tests/SysY2022/performance/fft2.riscv.s +++ b/tests/SysY2022/performance/fft2.riscv.s @@ -1,13 +1,13 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 a: .zero 8388608 -.align 8 +.p2align 3 b: .zero 8388608 .text diff --git a/tests/SysY2022/performance/floyd-0.arm.s b/tests/SysY2022/performance/floyd-0.arm.s index 2939bf4f1..c06dd3346 100644 --- a/tests/SysY2022/performance/floyd-0.arm.s +++ b/tests/SysY2022/performance/floyd-0.arm.s @@ -1,19 
+1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 w: .zero 8388608 -.align 8 +.p2align 3 dst: .zero 8388608 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/floyd-0.riscv.s b/tests/SysY2022/performance/floyd-0.riscv.s index 38106238b..4b93d197c 100644 --- a/tests/SysY2022/performance/floyd-0.riscv.s +++ b/tests/SysY2022/performance/floyd-0.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 w: .zero 8388608 -.align 8 +.p2align 3 dst: .zero 8388608 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/floyd-1.arm.s b/tests/SysY2022/performance/floyd-1.arm.s index 2939bf4f1..c06dd3346 100644 --- a/tests/SysY2022/performance/floyd-1.arm.s +++ b/tests/SysY2022/performance/floyd-1.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 w: .zero 8388608 -.align 8 +.p2align 3 dst: .zero 8388608 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/floyd-1.riscv.s b/tests/SysY2022/performance/floyd-1.riscv.s index 38106238b..4b93d197c 100644 --- a/tests/SysY2022/performance/floyd-1.riscv.s +++ b/tests/SysY2022/performance/floyd-1.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 w: .zero 8388608 -.align 8 +.p2align 3 dst: .zero 8388608 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/floyd-2.arm.s b/tests/SysY2022/performance/floyd-2.arm.s index 2939bf4f1..c06dd3346 100644 --- a/tests/SysY2022/performance/floyd-2.arm.s +++ b/tests/SysY2022/performance/floyd-2.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 w: .zero 8388608 -.align 8 +.p2align 3 dst: .zero 8388608 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/floyd-2.riscv.s b/tests/SysY2022/performance/floyd-2.riscv.s index 38106238b..4b93d197c 100644 --- a/tests/SysY2022/performance/floyd-2.riscv.s +++ b/tests/SysY2022/performance/floyd-2.riscv.s @@ -1,19 +1,19 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 temp: .zero 8388608 -.align 8 +.p2align 3 w: .zero 8388608 -.align 8 +.p2align 3 dst: .zero 8388608 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 .text diff --git a/tests/SysY2022/performance/gameoflife-gosper.arm.s b/tests/SysY2022/performance/gameoflife-gosper.arm.s index b4a0a88fe..d04d08c96 100644 --- a/tests/SysY2022/performance/gameoflife-gosper.arm.s +++ b/tests/SysY2022/performance/gameoflife-gosper.arm.s @@ -1,22 +1,22 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 sheet1: .zero 1000000 -.align 8 +.p2align 3 sheet2: .zero 1000000 -.align 4 +.p2align 2 width: .zero 4 -.align 8 
+.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 .text diff --git a/tests/SysY2022/performance/gameoflife-gosper.riscv.s b/tests/SysY2022/performance/gameoflife-gosper.riscv.s index 81df0a843..86e23c7b2 100644 --- a/tests/SysY2022/performance/gameoflife-gosper.riscv.s +++ b/tests/SysY2022/performance/gameoflife-gosper.riscv.s @@ -1,402 +1,458 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 sheet1: .zero 1000000 -.align 8 +.p2align 3 sheet2: .zero 1000000 -.align 4 +.p2align 2 width: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[8] CalleeSaved[104] - addi sp, sp, -112 + # stack usage: CalleeArg[0] Local[0] RegSpill[16] CalleeSaved[104] + addi sp, sp, -120 sd ra, 0(sp) - sd s6, 8(sp) - sd s1, 16(sp) - sd s0, 24(sp) - sd s5, 32(sp) - sd s4, 40(sp) - sd s8, 48(sp) - sd s3, 56(sp) - sd s2, 64(sp) - sd s7, 72(sp) + sd s7, 8(sp) + sd s6, 16(sp) + sd s1, 24(sp) + sd s3, 32(sp) + sd s8, 40(sp) + sd s2, 48(sp) + sd s0, 56(sp) + sd s5, 64(sp) + sd s4, 72(sp) sd s9, 80(sp) sd s10, 88(sp) sd s11, 96(sp) jal getint - mv s6, a0 -pcrel572: + mv s7, a0 +pcrel630: auipc a0, %pcrel_hi(width) - sw s6, %pcrel_lo(pcrel572)(a0) + sw s7, %pcrel_lo(pcrel630)(a0) jal getint - slt s4, zero, a0 + slt s3, zero, a0 addiw a1, a0, 1 - mv s0, a0 + sd a0, 112(sp) + mv s6, a0 sd a1, 104(sp) jal getint mv s8, a0 jal getch -pcrel573: +pcrel631: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) +pcrel632: + auipc a0, %pcrel_hi(cmmc_parallel_body_1) +pcrel633: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel574: - auipc a0, %pcrel_hi(cmmc_parallel_body_2) -pcrel575: +pcrel634: auipc a1, %pcrel_hi(sheet1) - addi s1, a0, %pcrel_lo(pcrel574) - addi s3, a1, %pcrel_lo(pcrel575) -pcrel576: - auipc a1, %pcrel_hi(cmmc_parallel_body_1) - addi s2, a1, %pcrel_lo(pcrel576) - ble s0, zero, label369 - addi s9, s3, 2000 - mv a1, s6 + addi s0, a0, %pcrel_lo(pcrel632) + addi s2, a1, %pcrel_lo(pcrel634) +pcrel635: + auipc a1, %pcrel_hi(cmmc_parallel_body_2) + addi s1, a1, %pcrel_lo(pcrel635) + ble s6, zero, label436 + addi s9, s2, 2000 + mv a1, s7 li s7, 1 mv s10, s7 - bgt s6, zero, label402 - j label400 + bgt a1, zero, label431 + j label429 .p2align 2 -label406: +label435: addi s11, s11, 4 .p2align 2 -label403: +label432: jal getch addiw s7, s7, 1 xori a2, a0, 35 sltiu a1, a2, 1 sw a1, 0(s11) - bgt s6, s7, label406 + bgt s6, s7, label435 .p2align 2 -label400: +label429: jal getch + ld s6, 112(sp) addiw s10, s10, 1 - blt s0, s10, label369 -pcrel577: + blt s6, s10, label436 +pcrel636: auipc a0, %pcrel_hi(width) addi s9, s9, 2000 - lw a1, %pcrel_lo(pcrel577)(a0) - ble a1, zero, label400 + lw a1, %pcrel_lo(pcrel636)(a0) + ble a1, zero, label429 .p2align 2 -label402: +label431: auipc a0, %pcrel_hi(width) addi s11, s9, 4 li s7, 1 - lw a1, %pcrel_lo(label402)(a0) + lw a1, %pcrel_lo(label431)(a0) addi s6, a1, 1 - j label403 -label369: + j label432 +label436: li a0, 95 jal _sysy_starttime - ble s8, zero, label421 + ble s8, zero, label501 li s7, 1 mv a0, s7 - bne s7, s7, label425 + beq s7, s7, label442 + ld s6, 112(sp) + bgt s6, zero, label441 + j label508 .p2align 2 -label373: - ble s0, zero, label428 +label443: ld a1, 104(sp) li 
s7, 1 -pcrel578: +pcrel637: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) - sw a1, %pcrel_lo(pcrel578)(s5) + sw a1, %pcrel_lo(pcrel637)(s5) mv a0, s7 mv a2, s1 jal cmmcParallelFor - addiw s8, s8, -1 li s7, 2 - bgt s8, zero, label445 - j label379 -label425: - ble s0, zero, label552 -.p2align 2 -label376: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) - li s7, 1 - ld a1, 104(sp) - sw a1, %pcrel_lo(label376)(a2) - mv a0, s7 - mv a2, s2 - jal cmmcParallelFor .p2align 2 -label377: +label444: addiw s8, s8, -1 - ble s8, zero, label379 + ble s8, zero, label446 .p2align 2 -label445: +label525: mv a0, s7 li s7, 1 - beq a0, s7, label373 - bgt s0, zero, label376 -label552: + bne a0, s7, label611 +.p2align 2 +label442: + ld s6, 112(sp) + bgt s6, zero, label443 + li s7, 2 + j label444 +.p2align 2 +label611: + ld s6, 112(sp) + ble s6, zero, label508 +.p2align 2 +label441: + ld a1, 104(sp) li s7, 1 - j label377 -label379: +pcrel638: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) + sw a1, %pcrel_lo(pcrel638)(s4) + mv a0, s7 + mv a2, s0 + jal cmmcParallelFor + addiw s8, s8, -1 + bgt s8, zero, label525 +label446: xori a2, s7, 2 li a0, 106 sltiu a1, a2, 1 - and s1, s4, a1 + and s0, s3, a1 jal _sysy_stoptime - bne s1, zero, label394 -label381: - ble s0, zero, label382 - addi s1, s3, 2000 + bne s0, zero, label461 +label448: + ld s6, 112(sp) + ble s6, zero, label449 + addi s1, s2, 2000 li s7, 1 mv s2, s7 -pcrel579: +pcrel639: auipc a0, %pcrel_hi(width) - lw a1, %pcrel_lo(pcrel579)(a0) + lw a1, %pcrel_lo(pcrel639)(a0) addi s0, a1, 1 - bgt a1, zero, label389 + bgt a1, zero, label456 .p2align 2 -label387: +label454: li a0, 10 jal putch ld a1, 104(sp) addiw s2, s2, 1 - ble a1, s2, label382 + ble a1, s2, label449 addi s1, s1, 2000 -pcrel580: +pcrel640: auipc a0, %pcrel_hi(width) - lw a1, %pcrel_lo(pcrel580)(a0) + lw a1, %pcrel_lo(pcrel640)(a0) addi s0, a1, 1 - ble a1, zero, label387 + ble a1, zero, label454 .p2align 2 -label389: +label456: addi s3, s1, 4 li s7, 1 mv s4, s7 lw a1, 0(s3) li a0, 35 - beq a1, s7, label537 + beq a1, s7, label593 .p2align 2 -label536: +label592: li a0, 46 .p2align 2 -label537: +label593: jal putch addiw s4, s4, 1 - ble s0, s4, label387 + ble s0, s4, label454 addi s3, s3, 4 li a0, 35 li s7, 1 lw a1, 0(s3) - beq a1, s7, label537 - j label536 -label382: + beq a1, s7, label593 + j label592 +label449: mv a0, zero ld ra, 0(sp) - ld s6, 8(sp) - ld s1, 16(sp) - ld s0, 24(sp) - ld s5, 32(sp) - ld s4, 40(sp) - ld s8, 48(sp) - ld s3, 56(sp) - ld s2, 64(sp) - ld s7, 72(sp) + ld s7, 8(sp) + ld s6, 16(sp) + ld s1, 24(sp) + ld s3, 32(sp) + ld s8, 40(sp) + ld s2, 48(sp) + ld s0, 56(sp) + ld s5, 64(sp) + ld s4, 72(sp) ld s9, 80(sp) ld s10, 88(sp) ld s11, 96(sp) - addi sp, sp, 112 + addi sp, sp, 120 ret -label394: +label461: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel581: +pcrel641: auipc a3, %pcrel_hi(cmmc_parallel_body_0) li s7, 1 - addi a2, a3, %pcrel_lo(pcrel581) + addi a2, a3, %pcrel_lo(pcrel641) ld a1, 104(sp) - sw a1, %pcrel_lo(label394)(a0) + sw a1, %pcrel_lo(label461)(a0) mv a0, s7 jal cmmcParallelFor - j label381 -label428: - li s7, 2 - j label377 -label421: + j label448 +label508: + li s7, 1 + j label444 +label501: li s7, 1 - j label379 + j label446 .p2align 2 cmmc_parallel_body_0: - addi sp, sp, -8 - mv t0, a0 -pcrel146: + addi sp, sp, -24 + mv t1, a0 +pcrel199: auipc a5, %pcrel_hi(sheet2) li a2, 2000 -pcrel147: - auipc t1, %pcrel_hi(sheet1) - addi a3, a5, %pcrel_lo(pcrel146) +pcrel200: + auipc t0, %pcrel_hi(sheet1) + addi a3, a5, %pcrel_lo(pcrel199) mul a4, a0, a2 sd 
s0, 0(sp) - li a5, 16 + li a5, 4 add a0, a3, a4 -pcrel148: + sd s1, 8(sp) +pcrel201: auipc a4, %pcrel_hi(width) - addi a3, t1, %pcrel_lo(pcrel147) - lw t3, %pcrel_lo(pcrel148)(a4) - bgt t3, zero, label8 - j label5 -.p2align 2 -label20: - addi t3, t3, 4 -.p2align 2 -label17: - sh2add t5, t4, t1 - lw t6, 0(t3) - addiw t4, t4, 1 - sw t6, 0(t5) - bgt t2, t4, label20 - addiw t0, t0, 1 - ble a1, t0, label7 -.p2align 2 -label6: + addi a3, t0, %pcrel_lo(pcrel200) + sd s2, 16(sp) + li t0, 16 + lw t4, %pcrel_lo(pcrel201)(a4) + bgt t4, zero, label5 + j label27 +.p2align 2 +label26: + addi t4, t4, 4 +.p2align 2 +label23: + sh2add t6, t5, t2 + lw a6, 0(t4) + addiw t5, t5, 1 + sw a6, 0(t6) + bgt t3, t5, label26 + addiw t1, t1, 1 + ble a1, t1, label29 +.p2align 2 +label28: addi a0, a0, 2000 -pcrel149: +pcrel202: auipc a4, %pcrel_hi(width) - lw t3, %pcrel_lo(pcrel149)(a4) - ble t3, zero, label5 -.p2align 2 -label8: - mul t5, t0, a2 - addiw t2, t3, 1 - addiw t4, t3, -14 - add t1, a3, t5 - ble t2, a5, label47 - addi t3, a0, 4 - li t6, 1 - j label10 -.p2align 2 -label13: - addi t3, t3, 64 + lw t4, %pcrel_lo(pcrel202)(a4) + ble t4, zero, label27 .p2align 2 -label10: - sh2add t5, t6, t1 - lw a7, 0(t3) - addiw t6, t6, 16 - sw a7, 0(t5) - lw a6, 4(t3) - sw a6, 4(t5) - lw a7, 8(t3) - sw a7, 8(t5) - lw a6, 12(t3) - sw a6, 12(t5) - lw a7, 16(t3) - sw a7, 16(t5) - lw a6, 20(t3) - sw a6, 20(t5) - lw a7, 24(t3) - sw a7, 24(t5) - lw a6, 28(t3) - sw a6, 28(t5) - lw s0, 32(t3) - sw s0, 32(t5) - lw a7, 36(t3) - sw a7, 36(t5) - lw a6, 40(t3) - sw a6, 40(t5) - lw s0, 44(t3) - sw s0, 44(t5) - lw a7, 48(t3) - sw a7, 48(t5) - lw a6, 52(t3) - sw a6, 52(t5) - lw a7, 56(t3) - sw a7, 56(t5) - lw a6, 60(t3) - sw a6, 60(t5) - bgt t4, t6, label13 - mv t4, t6 - ble t2, t6, label138 +label5: + mul t5, t1, a2 + addiw t3, t4, 1 + add t2, a3, t5 + ble t3, a5, label49 + addiw t5, t4, -2 + addiw t6, t4, -17 + ble t5, t0, label55 + addi t4, t2, 4 + li a6, 1 .p2align 2 label16: - sh2add t3, t4, a0 - j label17 -label5: - addiw t0, t0, 1 - bgt a1, t0, label6 -label7: + sh2add a7, a6, a0 + addiw a6, a6, 16 + lw s1, 0(a7) + sw s1, 0(t4) + lw s0, 4(a7) + sw s0, 4(t4) + lw s1, 8(a7) + sw s1, 8(t4) + lw s0, 12(a7) + sw s0, 12(t4) + lw s1, 16(a7) + sw s1, 16(t4) + lw s0, 20(a7) + sw s0, 20(t4) + lw s1, 24(a7) + sw s1, 24(t4) + lw s2, 28(a7) + sw s2, 28(t4) + lw s0, 32(a7) + sw s0, 32(t4) + lw s1, 36(a7) + sw s1, 36(t4) + lw s0, 40(a7) + sw s0, 40(t4) + lw s1, 44(a7) + sw s1, 44(t4) + lw s0, 48(a7) + sw s0, 48(t4) + lw s1, 52(a7) + sw s1, 52(t4) + lw s0, 56(a7) + sw s0, 56(t4) + lw s1, 60(a7) + sw s1, 60(t4) + ble t6, a6, label133 + addi t4, t4, 64 + j label16 +.p2align 2 +label133: + mv t6, a6 + ble t5, a6, label184 +.p2align 2 +label10: + sh2add t4, t6, t2 + j label11 +.p2align 2 +label14: + addi t4, t4, 16 +.p2align 2 +label11: + sh2add a6, t6, a0 + addiw t6, t6, 4 + lw s0, 0(a6) + sw s0, 0(t4) + lw a7, 4(a6) + sw a7, 4(t4) + lw s0, 8(a6) + sw s0, 8(t4) + lw a7, 12(a6) + sw a7, 12(t4) + bgt t5, t6, label14 + mv t5, t6 + ble t3, t6, label183 +.p2align 2 +label22: + sh2add t4, t5, a0 + j label23 +label55: + li t6, 1 + mv a6, zero + bgt t5, t6, label10 + mv t5, zero + bgt t3, zero, label22 + addiw t1, t1, 1 + bgt a1, t1, label28 + j label29 +.p2align 2 +label184: + mv t5, a6 + bgt t3, a6, label22 + addiw t1, t1, 1 + bgt a1, t1, label28 + j label29 +label27: + addiw t1, t1, 1 + bgt a1, t1, label28 + j label29 +label49: + li t5, 1 + bgt t3, t5, label22 + addiw t1, t1, 1 + bgt a1, t1, label28 +label29: ld s0, 0(sp) - addi sp, sp, 8 + ld s1, 8(sp) + ld 
s2, 16(sp) + addi sp, sp, 24 ret .p2align 2 -label47: - li t4, 1 - bgt t2, t4, label16 - addiw t0, t0, 1 - bgt a1, t0, label6 - j label7 -label138: - addiw t0, t0, 1 - bgt a1, t0, label6 - j label7 +label183: + addiw t1, t1, 1 + bgt a1, t1, label28 + j label29 .p2align 2 cmmc_parallel_body_1: addi sp, sp, -8 mv a4, a1 -pcrel255: +pcrel308: auipc a3, %pcrel_hi(sheet2) li t0, 2000 mv t3, a0 -pcrel256: +pcrel309: auipc t2, %pcrel_hi(width) sd s0, 0(sp) - addi a2, a3, %pcrel_lo(pcrel255) + addi a2, a3, %pcrel_lo(pcrel308) mulw a1, a0, t0 -pcrel257: +pcrel310: auipc a3, %pcrel_hi(sheet1) add a5, a2, a1 - addi t1, a3, %pcrel_lo(pcrel257) + addi t1, a3, %pcrel_lo(pcrel310) li a2, 1 - lw a0, %pcrel_lo(pcrel256)(t2) + lw a0, %pcrel_lo(pcrel309)(t2) mulw t4, t3, t0 addi a1, a0, 1 add a3, t1, t4 - bgt a0, zero, label157 - j label154 + bgt a0, zero, label210 + j label207 .p2align 2 -label225: +label278: addiw t3, t3, 1 - ble a4, t3, label156 + ble a4, t3, label209 .p2align 2 -label155: +label208: addi a5, a5, 2000 mulw t4, t3, t0 -pcrel258: +pcrel311: auipc t2, %pcrel_hi(width) - lw a0, %pcrel_lo(pcrel258)(t2) + lw a0, %pcrel_lo(pcrel311)(t2) add a3, t1, t4 addi a1, a0, 1 - ble a0, zero, label154 + ble a0, zero, label207 .p2align 2 -label157: +label210: addi a0, a5, 4 mv t4, a2 - j label158 + j label211 .p2align 2 -label161: +label214: sh2add t6, t4, a3 sw a2, 0(t6) - ble a1, t5, label225 + ble a1, t5, label278 .p2align 2 -label163: +label216: addi a0, a0, 4 mv t4, t5 .p2align 2 -label158: +label211: lw t5, -2004(a0) lw t6, -2000(a0) lw a7, -1996(a0) @@ -417,109 +473,111 @@ label158: xori s0, t5, 1 addiw t5, t4, 1 or a7, a6, s0 - beq a7, zero, label161 + beq a7, zero, label214 xori a7, t6, 3 sh2add t6, t4, a3 sltiu a6, a7, 1 sw a6, 0(t6) - bgt a1, t5, label163 + bgt a1, t5, label216 addiw t3, t3, 1 - bgt a4, t3, label155 -label156: + bgt a4, t3, label208 +label209: ld s0, 0(sp) addi sp, sp, 8 ret -label154: +label207: addiw t3, t3, 1 - bgt a4, t3, label155 - j label156 + bgt a4, t3, label208 + j label209 .p2align 2 cmmc_parallel_body_2: - addi sp, sp, -8 + addi sp, sp, -16 mv a4, a1 -pcrel364: +pcrel419: auipc a3, %pcrel_hi(sheet1) li t0, 2000 mv t3, a0 -pcrel365: +pcrel420: auipc t2, %pcrel_hi(width) - sd s0, 0(sp) - addi a2, a3, %pcrel_lo(pcrel364) + sd s1, 0(sp) + addi a2, a3, %pcrel_lo(pcrel419) mulw a1, a0, t0 -pcrel366: + sd s0, 8(sp) +pcrel421: auipc a3, %pcrel_hi(sheet2) add a5, a2, a1 - addi t1, a3, %pcrel_lo(pcrel366) + addi t1, a3, %pcrel_lo(pcrel421) li a2, 1 - lw a0, %pcrel_lo(pcrel365)(t2) + lw a0, %pcrel_lo(pcrel420)(t2) mulw t4, t3, t0 addi a1, a0, 1 add a3, t1, t4 - bgt a0, zero, label266 - j label263 -.p2align 2 -label334: - addiw t3, t3, 1 - ble a4, t3, label265 -.p2align 2 -label264: - addi a5, a5, 2000 - mulw t4, t3, t0 -pcrel367: - auipc t2, %pcrel_hi(width) - lw a0, %pcrel_lo(pcrel367)(t2) - add a3, t1, t4 - addi a1, a0, 1 - ble a0, zero, label263 -.p2align 2 -label266: - addi a0, a5, 4 - mv t4, a2 - j label267 + bgt a0, zero, label316 + j label324 .p2align 2 -label270: +label375: + xori a7, t6, 3 sh2add t6, t4, a3 - sw a2, 0(t6) - ble a1, t5, label334 + sltiu a6, a7, 1 + sw a6, 0(t6) + ble a1, t5, label379 .p2align 2 -label272: +label321: addi a0, a0, 4 mv t4, t5 .p2align 2 -label267: - lw t5, -2004(a0) +label317: + lw a6, -2004(a0) lw t6, -2000(a0) lw a7, -1996(a0) - addw a6, t5, t6 - lw t5, -4(a0) - addw t6, a6, a7 - lw a6, 4(a0) - addw a7, t6, t5 - lw s0, 1996(a0) - addw t5, a7, a6 - lw t6, 2000(a0) - addw a6, t5, s0 + addw t5, a6, t6 + lw s1, -4(a0) + addw t6, t5, a7 + lw 
s0, 4(a0) + addw a6, t6, s1 + lw a7, 1996(a0) + addw t5, a6, s0 + lw a6, 2000(a0) + addw t6, t5, a7 lw t5, 2004(a0) - addw a7, a6, t6 + addw a7, t6, a6 addw t6, a7, t5 lw t5, 0(a0) xori a6, t6, 2 xori s0, t5, 1 addiw t5, t4, 1 or a7, a6, s0 - beq a7, zero, label270 - xori a7, t6, 3 - sh2add t6, t4, a3 - sltiu a6, a7, 1 - sw a6, 0(t6) - bgt a1, t5, label272 + bne a7, zero, label375 + sh2add a6, t4, a3 + sw a2, 0(a6) + bgt a1, t5, label321 addiw t3, t3, 1 - bgt a4, t3, label264 -label265: - ld s0, 0(sp) - addi sp, sp, 8 + ble a4, t3, label326 +.p2align 2 +label325: + addi a5, a5, 2000 + mulw t4, t3, t0 +pcrel422: + auipc t2, %pcrel_hi(width) + lw a0, %pcrel_lo(pcrel422)(t2) + add a3, t1, t4 + addi a1, a0, 1 + ble a0, zero, label324 +.p2align 2 +label316: + addi a0, a5, 4 + mv t4, a2 + j label317 +label324: + addiw t3, t3, 1 + bgt a4, t3, label325 +label326: + ld s1, 0(sp) + ld s0, 8(sp) + addi sp, sp, 16 ret -label263: +.p2align 2 +label379: addiw t3, t3, 1 - bgt a4, t3, label264 - j label265 + bgt a4, t3, label325 + j label326 diff --git a/tests/SysY2022/performance/gameoflife-gosper.sy.ir b/tests/SysY2022/performance/gameoflife-gosper.sy.ir index 8be93b9f0..c453f20de 100644 --- a/tests/SysY2022/performance/gameoflife-gosper.sy.ir +++ b/tests/SysY2022/performance/gameoflife-gosper.sy.ir @@ -126,7 +126,7 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel [500 * [500 * i32]]* %4 = ptrcast [500 * [500 * i32]]* @sheet2 to [500 * [500 * i32]]*; ubr ^b1; ^b1: - i32 %5 = phi [^b, i32 %0] [^b2, i32 %13]; + i32 %5 = phi [^b, i32 %0] [^b2, i32 %12]; i32 %6 = load i32* %2; i1 %7 = icmp sgt i32 %6, i32 0; cbr i1 %7(prob = 0.984615), ^super.header, ^b2; @@ -134,97 +134,127 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel [500 * i32]* %8 = getelementptr &([500 * [500 * i32]]* %4)[i64 0][i32 %5]; [500 * i32]* %9 = getelementptr &([500 * [500 * i32]]* %3)[i64 0][i32 %5]; i32 %10 = add i32 %6, i32 1; - i1 %11 = icmp sgt i32 %10, i32 16; - i32 %12 = add i32 %6, i32 -14; - cbr i1 %11(prob = 0.8), ^while.body, ^scalar.header; + i1 %11 = icmp sgt i32 %10, i32 4; + cbr i1 %11(prob = 0.941176), ^super.header1, ^scalar.header; ^b2: - i32 %13 = add i32 %5, i32 1; - i1 %14 = icmp sgt i32 %1, i32 %13; - cbr i1 %14(prob = 0.984615), ^b1, ^b3; - ^while.body: - i32 %15 = phi [^super.header, i32 1] [^while.body, i32 %64]; - i32* %16 = getelementptr &([500 * i32]* %8)[i64 0][i32 %15]; - i32 %17 = load i32* %16; - i32* %18 = getelementptr &([500 * i32]* %9)[i64 0][i32 %15]; - store i32* %18 with i32 %17; - i32* %19 = getelementptr &(i32* %16)[i64 1]; - i32 %20 = load i32* %19; - i32* %21 = getelementptr &(i32* %18)[i64 1]; - store i32* %21 with i32 %20; - i32* %22 = getelementptr &(i32* %16)[i64 2]; - i32 %23 = load i32* %22; - i32* %24 = getelementptr &(i32* %18)[i64 2]; - store i32* %24 with i32 %23; - i32* %25 = getelementptr &(i32* %16)[i64 3]; - i32 %26 = load i32* %25; - i32* %27 = getelementptr &(i32* %18)[i64 3]; - store i32* %27 with i32 %26; - i32* %28 = getelementptr &(i32* %16)[i64 4]; - i32 %29 = load i32* %28; - i32* %30 = getelementptr &(i32* %18)[i64 4]; - store i32* %30 with i32 %29; - i32* %31 = getelementptr &(i32* %16)[i64 5]; - i32 %32 = load i32* %31; - i32* %33 = getelementptr &(i32* %18)[i64 5]; - store i32* %33 with i32 %32; - i32* %34 = getelementptr &(i32* %16)[i64 6]; - i32 %35 = load i32* %34; - i32* %36 = getelementptr &(i32* %18)[i64 6]; - store i32* %36 with i32 %35; - i32* %37 = getelementptr &(i32* %16)[i64 7]; - i32 %38 = 
load i32* %37; - i32* %39 = getelementptr &(i32* %18)[i64 7]; - store i32* %39 with i32 %38; - i32* %40 = getelementptr &(i32* %16)[i64 8]; - i32 %41 = load i32* %40; - i32* %42 = getelementptr &(i32* %18)[i64 8]; - store i32* %42 with i32 %41; - i32* %43 = getelementptr &(i32* %16)[i64 9]; - i32 %44 = load i32* %43; - i32* %45 = getelementptr &(i32* %18)[i64 9]; - store i32* %45 with i32 %44; - i32* %46 = getelementptr &(i32* %16)[i64 10]; - i32 %47 = load i32* %46; - i32* %48 = getelementptr &(i32* %18)[i64 10]; - store i32* %48 with i32 %47; - i32* %49 = getelementptr &(i32* %16)[i64 11]; - i32 %50 = load i32* %49; - i32* %51 = getelementptr &(i32* %18)[i64 11]; - store i32* %51 with i32 %50; - i32* %52 = getelementptr &(i32* %16)[i64 12]; - i32 %53 = load i32* %52; - i32* %54 = getelementptr &(i32* %18)[i64 12]; - store i32* %54 with i32 %53; - i32* %55 = getelementptr &(i32* %16)[i64 13]; - i32 %56 = load i32* %55; - i32* %57 = getelementptr &(i32* %18)[i64 13]; - store i32* %57 with i32 %56; - i32* %58 = getelementptr &(i32* %16)[i64 14]; - i32 %59 = load i32* %58; - i32* %60 = getelementptr &(i32* %18)[i64 14]; - store i32* %60 with i32 %59; - i32* %61 = getelementptr &(i32* %16)[i64 15]; - i32 %62 = load i32* %61; - i32* %63 = getelementptr &(i32* %18)[i64 15]; - store i32* %63 with i32 %62; - i32 %64 = add i32 %15, i32 16; - i1 %65 = icmp sgt i32 %12, i32 %64; - cbr i1 %65(prob = 0.8), ^while.body, ^scalar.header; + i32 %12 = add i32 %5, i32 1; + i1 %13 = icmp sgt i32 %1, i32 %12; + cbr i1 %13(prob = 0.984615), ^b1, ^b3; + ^super.header1: + i32 %14 = add i32 %6, i32 -2; + i1 %15 = icmp sgt i32 %14, i32 16; + i32 %16 = add i32 %6, i32 -17; + cbr i1 %15(prob = 0.941176), ^while.body, ^scalar.header1; ^scalar.header: - i32 %66 = phi [^super.header, i32 1] [^while.body, i32 %64]; - i1 %67 = icmp sgt i32 %10, i32 %66; - cbr i1 %67(prob = 0.9375), ^while.body1, ^b2; + i32 %17 = phi [^super.header, i32 1] [^scalar.header1, i32 %71] [^while.body2, i32 %92]; + i1 %18 = icmp sgt i32 %10, i32 %17; + cbr i1 %18(prob = 0.75), ^while.body1, ^b2; ^b3: ret; + ^while.body: + i32 %19 = phi [^super.header1, i32 1] [^while.body, i32 %68]; + i32* %20 = getelementptr &([500 * i32]* %8)[i64 0][i32 %19]; + i32 %21 = load i32* %20; + i32* %22 = getelementptr &([500 * i32]* %9)[i64 0][i32 %19]; + store i32* %22 with i32 %21; + i32* %23 = getelementptr &(i32* %20)[i64 1]; + i32 %24 = load i32* %23; + i32* %25 = getelementptr &(i32* %22)[i64 1]; + store i32* %25 with i32 %24; + i32* %26 = getelementptr &(i32* %20)[i64 2]; + i32 %27 = load i32* %26; + i32* %28 = getelementptr &(i32* %22)[i64 2]; + store i32* %28 with i32 %27; + i32* %29 = getelementptr &(i32* %20)[i64 3]; + i32 %30 = load i32* %29; + i32* %31 = getelementptr &(i32* %22)[i64 3]; + store i32* %31 with i32 %30; + i32* %32 = getelementptr &(i32* %20)[i64 4]; + i32 %33 = load i32* %32; + i32* %34 = getelementptr &(i32* %22)[i64 4]; + store i32* %34 with i32 %33; + i32* %35 = getelementptr &(i32* %20)[i64 5]; + i32 %36 = load i32* %35; + i32* %37 = getelementptr &(i32* %22)[i64 5]; + store i32* %37 with i32 %36; + i32* %38 = getelementptr &(i32* %20)[i64 6]; + i32 %39 = load i32* %38; + i32* %40 = getelementptr &(i32* %22)[i64 6]; + store i32* %40 with i32 %39; + i32* %41 = getelementptr &(i32* %20)[i64 7]; + i32 %42 = load i32* %41; + i32* %43 = getelementptr &(i32* %22)[i64 7]; + store i32* %43 with i32 %42; + i32* %44 = getelementptr &(i32* %20)[i64 8]; + i32 %45 = load i32* %44; + i32* %46 = getelementptr &(i32* %22)[i64 8]; + store i32* %46 
with i32 %45; + i32* %47 = getelementptr &(i32* %20)[i64 9]; + i32 %48 = load i32* %47; + i32* %49 = getelementptr &(i32* %22)[i64 9]; + store i32* %49 with i32 %48; + i32* %50 = getelementptr &(i32* %20)[i64 10]; + i32 %51 = load i32* %50; + i32* %52 = getelementptr &(i32* %22)[i64 10]; + store i32* %52 with i32 %51; + i32* %53 = getelementptr &(i32* %20)[i64 11]; + i32 %54 = load i32* %53; + i32* %55 = getelementptr &(i32* %22)[i64 11]; + store i32* %55 with i32 %54; + i32* %56 = getelementptr &(i32* %20)[i64 12]; + i32 %57 = load i32* %56; + i32* %58 = getelementptr &(i32* %22)[i64 12]; + store i32* %58 with i32 %57; + i32* %59 = getelementptr &(i32* %20)[i64 13]; + i32 %60 = load i32* %59; + i32* %61 = getelementptr &(i32* %22)[i64 13]; + store i32* %61 with i32 %60; + i32* %62 = getelementptr &(i32* %20)[i64 14]; + i32 %63 = load i32* %62; + i32* %64 = getelementptr &(i32* %22)[i64 14]; + store i32* %64 with i32 %63; + i32* %65 = getelementptr &(i32* %20)[i64 15]; + i32 %66 = load i32* %65; + i32* %67 = getelementptr &(i32* %22)[i64 15]; + store i32* %67 with i32 %66; + i32 %68 = add i32 %19, i32 16; + i1 %69 = icmp sgt i32 %16, i32 %68; + cbr i1 %69(prob = 0.941176), ^while.body, ^scalar.header1; + ^scalar.header1: + i32 %70 = phi [^super.header1, i32 1] [^while.body, i32 %68]; + i32 %71 = phi [^super.header1, i32 undef] [^while.body, i32 %68]; + i1 %72 = icmp sgt i32 %14, i32 %70; + cbr i1 %72(prob = 0.75), ^while.body2, ^scalar.header; ^while.body1 {scalar}: - i32 %68 = phi [^scalar.header, i32 %66] [^while.body1, i32 %72]; - i32* %69 = getelementptr &([500 * i32]* %8)[i64 0][i32 %68]; - i32 %70 = load i32* %69; - i32* %71 = getelementptr &([500 * i32]* %9)[i64 0][i32 %68]; - store i32* %71 with i32 %70; - i32 %72 = add i32 %68, i32 1; - i1 %73 = icmp sgt i32 %10, i32 %72; - cbr i1 %73(prob = 0.9375), ^while.body1, ^b2; + i32 %73 = phi [^scalar.header, i32 %17] [^while.body1, i32 %77]; + i32* %74 = getelementptr &([500 * i32]* %8)[i64 0][i32 %73]; + i32 %75 = load i32* %74; + i32* %76 = getelementptr &([500 * i32]* %9)[i64 0][i32 %73]; + store i32* %76 with i32 %75; + i32 %77 = add i32 %73, i32 1; + i1 %78 = icmp sgt i32 %10, i32 %77; + cbr i1 %78(prob = 0.75), ^while.body1, ^b2; + ^while.body2 {scalar}: + i32 %79 = phi [^scalar.header1, i32 %70] [^while.body2, i32 %92]; + i32* %80 = getelementptr &([500 * i32]* %8)[i64 0][i32 %79]; + i32 %81 = load i32* %80; + i32* %82 = getelementptr &([500 * i32]* %9)[i64 0][i32 %79]; + store i32* %82 with i32 %81; + i32* %83 = getelementptr &(i32* %80)[i64 1]; + i32 %84 = load i32* %83; + i32* %85 = getelementptr &(i32* %82)[i64 1]; + store i32* %85 with i32 %84; + i32* %86 = getelementptr &(i32* %80)[i64 2]; + i32 %87 = load i32* %86; + i32* %88 = getelementptr &(i32* %82)[i64 2]; + store i32* %88 with i32 %87; + i32* %89 = getelementptr &(i32* %80)[i64 3]; + i32 %90 = load i32* %89; + i32* %91 = getelementptr &(i32* %82)[i64 3]; + store i32* %91 with i32 %90; + i32 %92 = add i32 %79, i32 4; + i1 %93 = icmp sgt i32 %14, i32 %92; + cbr i1 %93(prob = 0.75), ^while.body2, ^scalar.header; } internal [4 * i8]* @cmmc_parallel_body_payload_0, align 8; internal func @cmmc_parallel_body_1(i32 %0, i32 %1) -> void { NoRecurse ParallelBody } { diff --git a/tests/SysY2022/performance/gameoflife-oscillator.arm.s b/tests/SysY2022/performance/gameoflife-oscillator.arm.s index b4a0a88fe..d04d08c96 100644 --- a/tests/SysY2022/performance/gameoflife-oscillator.arm.s +++ b/tests/SysY2022/performance/gameoflife-oscillator.arm.s @@ -1,22 +1,22 @@ .arch armv7ve 
.data .bss -.align 8 +.p2align 3 sheet1: .zero 1000000 -.align 8 +.p2align 3 sheet2: .zero 1000000 -.align 4 +.p2align 2 width: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 .text diff --git a/tests/SysY2022/performance/gameoflife-oscillator.riscv.s b/tests/SysY2022/performance/gameoflife-oscillator.riscv.s index 81df0a843..86e23c7b2 100644 --- a/tests/SysY2022/performance/gameoflife-oscillator.riscv.s +++ b/tests/SysY2022/performance/gameoflife-oscillator.riscv.s @@ -1,402 +1,458 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 sheet1: .zero 1000000 -.align 8 +.p2align 3 sheet2: .zero 1000000 -.align 4 +.p2align 2 width: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[8] CalleeSaved[104] - addi sp, sp, -112 + # stack usage: CalleeArg[0] Local[0] RegSpill[16] CalleeSaved[104] + addi sp, sp, -120 sd ra, 0(sp) - sd s6, 8(sp) - sd s1, 16(sp) - sd s0, 24(sp) - sd s5, 32(sp) - sd s4, 40(sp) - sd s8, 48(sp) - sd s3, 56(sp) - sd s2, 64(sp) - sd s7, 72(sp) + sd s7, 8(sp) + sd s6, 16(sp) + sd s1, 24(sp) + sd s3, 32(sp) + sd s8, 40(sp) + sd s2, 48(sp) + sd s0, 56(sp) + sd s5, 64(sp) + sd s4, 72(sp) sd s9, 80(sp) sd s10, 88(sp) sd s11, 96(sp) jal getint - mv s6, a0 -pcrel572: + mv s7, a0 +pcrel630: auipc a0, %pcrel_hi(width) - sw s6, %pcrel_lo(pcrel572)(a0) + sw s7, %pcrel_lo(pcrel630)(a0) jal getint - slt s4, zero, a0 + slt s3, zero, a0 addiw a1, a0, 1 - mv s0, a0 + sd a0, 112(sp) + mv s6, a0 sd a1, 104(sp) jal getint mv s8, a0 jal getch -pcrel573: +pcrel631: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) +pcrel632: + auipc a0, %pcrel_hi(cmmc_parallel_body_1) +pcrel633: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel574: - auipc a0, %pcrel_hi(cmmc_parallel_body_2) -pcrel575: +pcrel634: auipc a1, %pcrel_hi(sheet1) - addi s1, a0, %pcrel_lo(pcrel574) - addi s3, a1, %pcrel_lo(pcrel575) -pcrel576: - auipc a1, %pcrel_hi(cmmc_parallel_body_1) - addi s2, a1, %pcrel_lo(pcrel576) - ble s0, zero, label369 - addi s9, s3, 2000 - mv a1, s6 + addi s0, a0, %pcrel_lo(pcrel632) + addi s2, a1, %pcrel_lo(pcrel634) +pcrel635: + auipc a1, %pcrel_hi(cmmc_parallel_body_2) + addi s1, a1, %pcrel_lo(pcrel635) + ble s6, zero, label436 + addi s9, s2, 2000 + mv a1, s7 li s7, 1 mv s10, s7 - bgt s6, zero, label402 - j label400 + bgt a1, zero, label431 + j label429 .p2align 2 -label406: +label435: addi s11, s11, 4 .p2align 2 -label403: +label432: jal getch addiw s7, s7, 1 xori a2, a0, 35 sltiu a1, a2, 1 sw a1, 0(s11) - bgt s6, s7, label406 + bgt s6, s7, label435 .p2align 2 -label400: +label429: jal getch + ld s6, 112(sp) addiw s10, s10, 1 - blt s0, s10, label369 -pcrel577: + blt s6, s10, label436 +pcrel636: auipc a0, %pcrel_hi(width) addi s9, s9, 2000 - lw a1, %pcrel_lo(pcrel577)(a0) - ble a1, zero, label400 + lw a1, %pcrel_lo(pcrel636)(a0) + ble a1, zero, label429 .p2align 2 -label402: +label431: auipc a0, %pcrel_hi(width) addi s11, s9, 4 li s7, 1 - lw a1, %pcrel_lo(label402)(a0) + lw a1, %pcrel_lo(label431)(a0) addi s6, a1, 1 - j label403 -label369: + j label432 +label436: li a0, 95 jal _sysy_starttime - ble s8, zero, label421 + ble s8, zero, label501 li s7, 1 mv a0, s7 - bne s7, s7, 
label425 + beq s7, s7, label442 + ld s6, 112(sp) + bgt s6, zero, label441 + j label508 .p2align 2 -label373: - ble s0, zero, label428 +label443: ld a1, 104(sp) li s7, 1 -pcrel578: +pcrel637: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) - sw a1, %pcrel_lo(pcrel578)(s5) + sw a1, %pcrel_lo(pcrel637)(s5) mv a0, s7 mv a2, s1 jal cmmcParallelFor - addiw s8, s8, -1 li s7, 2 - bgt s8, zero, label445 - j label379 -label425: - ble s0, zero, label552 -.p2align 2 -label376: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) - li s7, 1 - ld a1, 104(sp) - sw a1, %pcrel_lo(label376)(a2) - mv a0, s7 - mv a2, s2 - jal cmmcParallelFor .p2align 2 -label377: +label444: addiw s8, s8, -1 - ble s8, zero, label379 + ble s8, zero, label446 .p2align 2 -label445: +label525: mv a0, s7 li s7, 1 - beq a0, s7, label373 - bgt s0, zero, label376 -label552: + bne a0, s7, label611 +.p2align 2 +label442: + ld s6, 112(sp) + bgt s6, zero, label443 + li s7, 2 + j label444 +.p2align 2 +label611: + ld s6, 112(sp) + ble s6, zero, label508 +.p2align 2 +label441: + ld a1, 104(sp) li s7, 1 - j label377 -label379: +pcrel638: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) + sw a1, %pcrel_lo(pcrel638)(s4) + mv a0, s7 + mv a2, s0 + jal cmmcParallelFor + addiw s8, s8, -1 + bgt s8, zero, label525 +label446: xori a2, s7, 2 li a0, 106 sltiu a1, a2, 1 - and s1, s4, a1 + and s0, s3, a1 jal _sysy_stoptime - bne s1, zero, label394 -label381: - ble s0, zero, label382 - addi s1, s3, 2000 + bne s0, zero, label461 +label448: + ld s6, 112(sp) + ble s6, zero, label449 + addi s1, s2, 2000 li s7, 1 mv s2, s7 -pcrel579: +pcrel639: auipc a0, %pcrel_hi(width) - lw a1, %pcrel_lo(pcrel579)(a0) + lw a1, %pcrel_lo(pcrel639)(a0) addi s0, a1, 1 - bgt a1, zero, label389 + bgt a1, zero, label456 .p2align 2 -label387: +label454: li a0, 10 jal putch ld a1, 104(sp) addiw s2, s2, 1 - ble a1, s2, label382 + ble a1, s2, label449 addi s1, s1, 2000 -pcrel580: +pcrel640: auipc a0, %pcrel_hi(width) - lw a1, %pcrel_lo(pcrel580)(a0) + lw a1, %pcrel_lo(pcrel640)(a0) addi s0, a1, 1 - ble a1, zero, label387 + ble a1, zero, label454 .p2align 2 -label389: +label456: addi s3, s1, 4 li s7, 1 mv s4, s7 lw a1, 0(s3) li a0, 35 - beq a1, s7, label537 + beq a1, s7, label593 .p2align 2 -label536: +label592: li a0, 46 .p2align 2 -label537: +label593: jal putch addiw s4, s4, 1 - ble s0, s4, label387 + ble s0, s4, label454 addi s3, s3, 4 li a0, 35 li s7, 1 lw a1, 0(s3) - beq a1, s7, label537 - j label536 -label382: + beq a1, s7, label593 + j label592 +label449: mv a0, zero ld ra, 0(sp) - ld s6, 8(sp) - ld s1, 16(sp) - ld s0, 24(sp) - ld s5, 32(sp) - ld s4, 40(sp) - ld s8, 48(sp) - ld s3, 56(sp) - ld s2, 64(sp) - ld s7, 72(sp) + ld s7, 8(sp) + ld s6, 16(sp) + ld s1, 24(sp) + ld s3, 32(sp) + ld s8, 40(sp) + ld s2, 48(sp) + ld s0, 56(sp) + ld s5, 64(sp) + ld s4, 72(sp) ld s9, 80(sp) ld s10, 88(sp) ld s11, 96(sp) - addi sp, sp, 112 + addi sp, sp, 120 ret -label394: +label461: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel581: +pcrel641: auipc a3, %pcrel_hi(cmmc_parallel_body_0) li s7, 1 - addi a2, a3, %pcrel_lo(pcrel581) + addi a2, a3, %pcrel_lo(pcrel641) ld a1, 104(sp) - sw a1, %pcrel_lo(label394)(a0) + sw a1, %pcrel_lo(label461)(a0) mv a0, s7 jal cmmcParallelFor - j label381 -label428: - li s7, 2 - j label377 -label421: + j label448 +label508: + li s7, 1 + j label444 +label501: li s7, 1 - j label379 + j label446 .p2align 2 cmmc_parallel_body_0: - addi sp, sp, -8 - mv t0, a0 -pcrel146: + addi sp, sp, -24 + mv t1, a0 +pcrel199: auipc a5, %pcrel_hi(sheet2) li a2, 2000 
-pcrel147: - auipc t1, %pcrel_hi(sheet1) - addi a3, a5, %pcrel_lo(pcrel146) +pcrel200: + auipc t0, %pcrel_hi(sheet1) + addi a3, a5, %pcrel_lo(pcrel199) mul a4, a0, a2 sd s0, 0(sp) - li a5, 16 + li a5, 4 add a0, a3, a4 -pcrel148: + sd s1, 8(sp) +pcrel201: auipc a4, %pcrel_hi(width) - addi a3, t1, %pcrel_lo(pcrel147) - lw t3, %pcrel_lo(pcrel148)(a4) - bgt t3, zero, label8 - j label5 -.p2align 2 -label20: - addi t3, t3, 4 -.p2align 2 -label17: - sh2add t5, t4, t1 - lw t6, 0(t3) - addiw t4, t4, 1 - sw t6, 0(t5) - bgt t2, t4, label20 - addiw t0, t0, 1 - ble a1, t0, label7 -.p2align 2 -label6: + addi a3, t0, %pcrel_lo(pcrel200) + sd s2, 16(sp) + li t0, 16 + lw t4, %pcrel_lo(pcrel201)(a4) + bgt t4, zero, label5 + j label27 +.p2align 2 +label26: + addi t4, t4, 4 +.p2align 2 +label23: + sh2add t6, t5, t2 + lw a6, 0(t4) + addiw t5, t5, 1 + sw a6, 0(t6) + bgt t3, t5, label26 + addiw t1, t1, 1 + ble a1, t1, label29 +.p2align 2 +label28: addi a0, a0, 2000 -pcrel149: +pcrel202: auipc a4, %pcrel_hi(width) - lw t3, %pcrel_lo(pcrel149)(a4) - ble t3, zero, label5 -.p2align 2 -label8: - mul t5, t0, a2 - addiw t2, t3, 1 - addiw t4, t3, -14 - add t1, a3, t5 - ble t2, a5, label47 - addi t3, a0, 4 - li t6, 1 - j label10 -.p2align 2 -label13: - addi t3, t3, 64 + lw t4, %pcrel_lo(pcrel202)(a4) + ble t4, zero, label27 .p2align 2 -label10: - sh2add t5, t6, t1 - lw a7, 0(t3) - addiw t6, t6, 16 - sw a7, 0(t5) - lw a6, 4(t3) - sw a6, 4(t5) - lw a7, 8(t3) - sw a7, 8(t5) - lw a6, 12(t3) - sw a6, 12(t5) - lw a7, 16(t3) - sw a7, 16(t5) - lw a6, 20(t3) - sw a6, 20(t5) - lw a7, 24(t3) - sw a7, 24(t5) - lw a6, 28(t3) - sw a6, 28(t5) - lw s0, 32(t3) - sw s0, 32(t5) - lw a7, 36(t3) - sw a7, 36(t5) - lw a6, 40(t3) - sw a6, 40(t5) - lw s0, 44(t3) - sw s0, 44(t5) - lw a7, 48(t3) - sw a7, 48(t5) - lw a6, 52(t3) - sw a6, 52(t5) - lw a7, 56(t3) - sw a7, 56(t5) - lw a6, 60(t3) - sw a6, 60(t5) - bgt t4, t6, label13 - mv t4, t6 - ble t2, t6, label138 +label5: + mul t5, t1, a2 + addiw t3, t4, 1 + add t2, a3, t5 + ble t3, a5, label49 + addiw t5, t4, -2 + addiw t6, t4, -17 + ble t5, t0, label55 + addi t4, t2, 4 + li a6, 1 .p2align 2 label16: - sh2add t3, t4, a0 - j label17 -label5: - addiw t0, t0, 1 - bgt a1, t0, label6 -label7: + sh2add a7, a6, a0 + addiw a6, a6, 16 + lw s1, 0(a7) + sw s1, 0(t4) + lw s0, 4(a7) + sw s0, 4(t4) + lw s1, 8(a7) + sw s1, 8(t4) + lw s0, 12(a7) + sw s0, 12(t4) + lw s1, 16(a7) + sw s1, 16(t4) + lw s0, 20(a7) + sw s0, 20(t4) + lw s1, 24(a7) + sw s1, 24(t4) + lw s2, 28(a7) + sw s2, 28(t4) + lw s0, 32(a7) + sw s0, 32(t4) + lw s1, 36(a7) + sw s1, 36(t4) + lw s0, 40(a7) + sw s0, 40(t4) + lw s1, 44(a7) + sw s1, 44(t4) + lw s0, 48(a7) + sw s0, 48(t4) + lw s1, 52(a7) + sw s1, 52(t4) + lw s0, 56(a7) + sw s0, 56(t4) + lw s1, 60(a7) + sw s1, 60(t4) + ble t6, a6, label133 + addi t4, t4, 64 + j label16 +.p2align 2 +label133: + mv t6, a6 + ble t5, a6, label184 +.p2align 2 +label10: + sh2add t4, t6, t2 + j label11 +.p2align 2 +label14: + addi t4, t4, 16 +.p2align 2 +label11: + sh2add a6, t6, a0 + addiw t6, t6, 4 + lw s0, 0(a6) + sw s0, 0(t4) + lw a7, 4(a6) + sw a7, 4(t4) + lw s0, 8(a6) + sw s0, 8(t4) + lw a7, 12(a6) + sw a7, 12(t4) + bgt t5, t6, label14 + mv t5, t6 + ble t3, t6, label183 +.p2align 2 +label22: + sh2add t4, t5, a0 + j label23 +label55: + li t6, 1 + mv a6, zero + bgt t5, t6, label10 + mv t5, zero + bgt t3, zero, label22 + addiw t1, t1, 1 + bgt a1, t1, label28 + j label29 +.p2align 2 +label184: + mv t5, a6 + bgt t3, a6, label22 + addiw t1, t1, 1 + bgt a1, t1, label28 + j label29 +label27: + addiw t1, t1, 1 + bgt a1, 
t1, label28 + j label29 +label49: + li t5, 1 + bgt t3, t5, label22 + addiw t1, t1, 1 + bgt a1, t1, label28 +label29: ld s0, 0(sp) - addi sp, sp, 8 + ld s1, 8(sp) + ld s2, 16(sp) + addi sp, sp, 24 ret .p2align 2 -label47: - li t4, 1 - bgt t2, t4, label16 - addiw t0, t0, 1 - bgt a1, t0, label6 - j label7 -label138: - addiw t0, t0, 1 - bgt a1, t0, label6 - j label7 +label183: + addiw t1, t1, 1 + bgt a1, t1, label28 + j label29 .p2align 2 cmmc_parallel_body_1: addi sp, sp, -8 mv a4, a1 -pcrel255: +pcrel308: auipc a3, %pcrel_hi(sheet2) li t0, 2000 mv t3, a0 -pcrel256: +pcrel309: auipc t2, %pcrel_hi(width) sd s0, 0(sp) - addi a2, a3, %pcrel_lo(pcrel255) + addi a2, a3, %pcrel_lo(pcrel308) mulw a1, a0, t0 -pcrel257: +pcrel310: auipc a3, %pcrel_hi(sheet1) add a5, a2, a1 - addi t1, a3, %pcrel_lo(pcrel257) + addi t1, a3, %pcrel_lo(pcrel310) li a2, 1 - lw a0, %pcrel_lo(pcrel256)(t2) + lw a0, %pcrel_lo(pcrel309)(t2) mulw t4, t3, t0 addi a1, a0, 1 add a3, t1, t4 - bgt a0, zero, label157 - j label154 + bgt a0, zero, label210 + j label207 .p2align 2 -label225: +label278: addiw t3, t3, 1 - ble a4, t3, label156 + ble a4, t3, label209 .p2align 2 -label155: +label208: addi a5, a5, 2000 mulw t4, t3, t0 -pcrel258: +pcrel311: auipc t2, %pcrel_hi(width) - lw a0, %pcrel_lo(pcrel258)(t2) + lw a0, %pcrel_lo(pcrel311)(t2) add a3, t1, t4 addi a1, a0, 1 - ble a0, zero, label154 + ble a0, zero, label207 .p2align 2 -label157: +label210: addi a0, a5, 4 mv t4, a2 - j label158 + j label211 .p2align 2 -label161: +label214: sh2add t6, t4, a3 sw a2, 0(t6) - ble a1, t5, label225 + ble a1, t5, label278 .p2align 2 -label163: +label216: addi a0, a0, 4 mv t4, t5 .p2align 2 -label158: +label211: lw t5, -2004(a0) lw t6, -2000(a0) lw a7, -1996(a0) @@ -417,109 +473,111 @@ label158: xori s0, t5, 1 addiw t5, t4, 1 or a7, a6, s0 - beq a7, zero, label161 + beq a7, zero, label214 xori a7, t6, 3 sh2add t6, t4, a3 sltiu a6, a7, 1 sw a6, 0(t6) - bgt a1, t5, label163 + bgt a1, t5, label216 addiw t3, t3, 1 - bgt a4, t3, label155 -label156: + bgt a4, t3, label208 +label209: ld s0, 0(sp) addi sp, sp, 8 ret -label154: +label207: addiw t3, t3, 1 - bgt a4, t3, label155 - j label156 + bgt a4, t3, label208 + j label209 .p2align 2 cmmc_parallel_body_2: - addi sp, sp, -8 + addi sp, sp, -16 mv a4, a1 -pcrel364: +pcrel419: auipc a3, %pcrel_hi(sheet1) li t0, 2000 mv t3, a0 -pcrel365: +pcrel420: auipc t2, %pcrel_hi(width) - sd s0, 0(sp) - addi a2, a3, %pcrel_lo(pcrel364) + sd s1, 0(sp) + addi a2, a3, %pcrel_lo(pcrel419) mulw a1, a0, t0 -pcrel366: + sd s0, 8(sp) +pcrel421: auipc a3, %pcrel_hi(sheet2) add a5, a2, a1 - addi t1, a3, %pcrel_lo(pcrel366) + addi t1, a3, %pcrel_lo(pcrel421) li a2, 1 - lw a0, %pcrel_lo(pcrel365)(t2) + lw a0, %pcrel_lo(pcrel420)(t2) mulw t4, t3, t0 addi a1, a0, 1 add a3, t1, t4 - bgt a0, zero, label266 - j label263 -.p2align 2 -label334: - addiw t3, t3, 1 - ble a4, t3, label265 -.p2align 2 -label264: - addi a5, a5, 2000 - mulw t4, t3, t0 -pcrel367: - auipc t2, %pcrel_hi(width) - lw a0, %pcrel_lo(pcrel367)(t2) - add a3, t1, t4 - addi a1, a0, 1 - ble a0, zero, label263 -.p2align 2 -label266: - addi a0, a5, 4 - mv t4, a2 - j label267 + bgt a0, zero, label316 + j label324 .p2align 2 -label270: +label375: + xori a7, t6, 3 sh2add t6, t4, a3 - sw a2, 0(t6) - ble a1, t5, label334 + sltiu a6, a7, 1 + sw a6, 0(t6) + ble a1, t5, label379 .p2align 2 -label272: +label321: addi a0, a0, 4 mv t4, t5 .p2align 2 -label267: - lw t5, -2004(a0) +label317: + lw a6, -2004(a0) lw t6, -2000(a0) lw a7, -1996(a0) - addw a6, t5, t6 - lw t5, -4(a0) - addw t6, a6, 
a7 - lw a6, 4(a0) - addw a7, t6, t5 - lw s0, 1996(a0) - addw t5, a7, a6 - lw t6, 2000(a0) - addw a6, t5, s0 + addw t5, a6, t6 + lw s1, -4(a0) + addw t6, t5, a7 + lw s0, 4(a0) + addw a6, t6, s1 + lw a7, 1996(a0) + addw t5, a6, s0 + lw a6, 2000(a0) + addw t6, t5, a7 lw t5, 2004(a0) - addw a7, a6, t6 + addw a7, t6, a6 addw t6, a7, t5 lw t5, 0(a0) xori a6, t6, 2 xori s0, t5, 1 addiw t5, t4, 1 or a7, a6, s0 - beq a7, zero, label270 - xori a7, t6, 3 - sh2add t6, t4, a3 - sltiu a6, a7, 1 - sw a6, 0(t6) - bgt a1, t5, label272 + bne a7, zero, label375 + sh2add a6, t4, a3 + sw a2, 0(a6) + bgt a1, t5, label321 addiw t3, t3, 1 - bgt a4, t3, label264 -label265: - ld s0, 0(sp) - addi sp, sp, 8 + ble a4, t3, label326 +.p2align 2 +label325: + addi a5, a5, 2000 + mulw t4, t3, t0 +pcrel422: + auipc t2, %pcrel_hi(width) + lw a0, %pcrel_lo(pcrel422)(t2) + add a3, t1, t4 + addi a1, a0, 1 + ble a0, zero, label324 +.p2align 2 +label316: + addi a0, a5, 4 + mv t4, a2 + j label317 +label324: + addiw t3, t3, 1 + bgt a4, t3, label325 +label326: + ld s1, 0(sp) + ld s0, 8(sp) + addi sp, sp, 16 ret -label263: +.p2align 2 +label379: addiw t3, t3, 1 - bgt a4, t3, label264 - j label265 + bgt a4, t3, label325 + j label326 diff --git a/tests/SysY2022/performance/gameoflife-oscillator.sy.ir b/tests/SysY2022/performance/gameoflife-oscillator.sy.ir index 8be93b9f0..c453f20de 100644 --- a/tests/SysY2022/performance/gameoflife-oscillator.sy.ir +++ b/tests/SysY2022/performance/gameoflife-oscillator.sy.ir @@ -126,7 +126,7 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel [500 * [500 * i32]]* %4 = ptrcast [500 * [500 * i32]]* @sheet2 to [500 * [500 * i32]]*; ubr ^b1; ^b1: - i32 %5 = phi [^b, i32 %0] [^b2, i32 %13]; + i32 %5 = phi [^b, i32 %0] [^b2, i32 %12]; i32 %6 = load i32* %2; i1 %7 = icmp sgt i32 %6, i32 0; cbr i1 %7(prob = 0.984615), ^super.header, ^b2; @@ -134,97 +134,127 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel [500 * i32]* %8 = getelementptr &([500 * [500 * i32]]* %4)[i64 0][i32 %5]; [500 * i32]* %9 = getelementptr &([500 * [500 * i32]]* %3)[i64 0][i32 %5]; i32 %10 = add i32 %6, i32 1; - i1 %11 = icmp sgt i32 %10, i32 16; - i32 %12 = add i32 %6, i32 -14; - cbr i1 %11(prob = 0.8), ^while.body, ^scalar.header; + i1 %11 = icmp sgt i32 %10, i32 4; + cbr i1 %11(prob = 0.941176), ^super.header1, ^scalar.header; ^b2: - i32 %13 = add i32 %5, i32 1; - i1 %14 = icmp sgt i32 %1, i32 %13; - cbr i1 %14(prob = 0.984615), ^b1, ^b3; - ^while.body: - i32 %15 = phi [^super.header, i32 1] [^while.body, i32 %64]; - i32* %16 = getelementptr &([500 * i32]* %8)[i64 0][i32 %15]; - i32 %17 = load i32* %16; - i32* %18 = getelementptr &([500 * i32]* %9)[i64 0][i32 %15]; - store i32* %18 with i32 %17; - i32* %19 = getelementptr &(i32* %16)[i64 1]; - i32 %20 = load i32* %19; - i32* %21 = getelementptr &(i32* %18)[i64 1]; - store i32* %21 with i32 %20; - i32* %22 = getelementptr &(i32* %16)[i64 2]; - i32 %23 = load i32* %22; - i32* %24 = getelementptr &(i32* %18)[i64 2]; - store i32* %24 with i32 %23; - i32* %25 = getelementptr &(i32* %16)[i64 3]; - i32 %26 = load i32* %25; - i32* %27 = getelementptr &(i32* %18)[i64 3]; - store i32* %27 with i32 %26; - i32* %28 = getelementptr &(i32* %16)[i64 4]; - i32 %29 = load i32* %28; - i32* %30 = getelementptr &(i32* %18)[i64 4]; - store i32* %30 with i32 %29; - i32* %31 = getelementptr &(i32* %16)[i64 5]; - i32 %32 = load i32* %31; - i32* %33 = getelementptr &(i32* %18)[i64 5]; - store i32* %33 with i32 %32; - i32* %34 = getelementptr 
&(i32* %16)[i64 6]; - i32 %35 = load i32* %34; - i32* %36 = getelementptr &(i32* %18)[i64 6]; - store i32* %36 with i32 %35; - i32* %37 = getelementptr &(i32* %16)[i64 7]; - i32 %38 = load i32* %37; - i32* %39 = getelementptr &(i32* %18)[i64 7]; - store i32* %39 with i32 %38; - i32* %40 = getelementptr &(i32* %16)[i64 8]; - i32 %41 = load i32* %40; - i32* %42 = getelementptr &(i32* %18)[i64 8]; - store i32* %42 with i32 %41; - i32* %43 = getelementptr &(i32* %16)[i64 9]; - i32 %44 = load i32* %43; - i32* %45 = getelementptr &(i32* %18)[i64 9]; - store i32* %45 with i32 %44; - i32* %46 = getelementptr &(i32* %16)[i64 10]; - i32 %47 = load i32* %46; - i32* %48 = getelementptr &(i32* %18)[i64 10]; - store i32* %48 with i32 %47; - i32* %49 = getelementptr &(i32* %16)[i64 11]; - i32 %50 = load i32* %49; - i32* %51 = getelementptr &(i32* %18)[i64 11]; - store i32* %51 with i32 %50; - i32* %52 = getelementptr &(i32* %16)[i64 12]; - i32 %53 = load i32* %52; - i32* %54 = getelementptr &(i32* %18)[i64 12]; - store i32* %54 with i32 %53; - i32* %55 = getelementptr &(i32* %16)[i64 13]; - i32 %56 = load i32* %55; - i32* %57 = getelementptr &(i32* %18)[i64 13]; - store i32* %57 with i32 %56; - i32* %58 = getelementptr &(i32* %16)[i64 14]; - i32 %59 = load i32* %58; - i32* %60 = getelementptr &(i32* %18)[i64 14]; - store i32* %60 with i32 %59; - i32* %61 = getelementptr &(i32* %16)[i64 15]; - i32 %62 = load i32* %61; - i32* %63 = getelementptr &(i32* %18)[i64 15]; - store i32* %63 with i32 %62; - i32 %64 = add i32 %15, i32 16; - i1 %65 = icmp sgt i32 %12, i32 %64; - cbr i1 %65(prob = 0.8), ^while.body, ^scalar.header; + i32 %12 = add i32 %5, i32 1; + i1 %13 = icmp sgt i32 %1, i32 %12; + cbr i1 %13(prob = 0.984615), ^b1, ^b3; + ^super.header1: + i32 %14 = add i32 %6, i32 -2; + i1 %15 = icmp sgt i32 %14, i32 16; + i32 %16 = add i32 %6, i32 -17; + cbr i1 %15(prob = 0.941176), ^while.body, ^scalar.header1; ^scalar.header: - i32 %66 = phi [^super.header, i32 1] [^while.body, i32 %64]; - i1 %67 = icmp sgt i32 %10, i32 %66; - cbr i1 %67(prob = 0.9375), ^while.body1, ^b2; + i32 %17 = phi [^super.header, i32 1] [^scalar.header1, i32 %71] [^while.body2, i32 %92]; + i1 %18 = icmp sgt i32 %10, i32 %17; + cbr i1 %18(prob = 0.75), ^while.body1, ^b2; ^b3: ret; + ^while.body: + i32 %19 = phi [^super.header1, i32 1] [^while.body, i32 %68]; + i32* %20 = getelementptr &([500 * i32]* %8)[i64 0][i32 %19]; + i32 %21 = load i32* %20; + i32* %22 = getelementptr &([500 * i32]* %9)[i64 0][i32 %19]; + store i32* %22 with i32 %21; + i32* %23 = getelementptr &(i32* %20)[i64 1]; + i32 %24 = load i32* %23; + i32* %25 = getelementptr &(i32* %22)[i64 1]; + store i32* %25 with i32 %24; + i32* %26 = getelementptr &(i32* %20)[i64 2]; + i32 %27 = load i32* %26; + i32* %28 = getelementptr &(i32* %22)[i64 2]; + store i32* %28 with i32 %27; + i32* %29 = getelementptr &(i32* %20)[i64 3]; + i32 %30 = load i32* %29; + i32* %31 = getelementptr &(i32* %22)[i64 3]; + store i32* %31 with i32 %30; + i32* %32 = getelementptr &(i32* %20)[i64 4]; + i32 %33 = load i32* %32; + i32* %34 = getelementptr &(i32* %22)[i64 4]; + store i32* %34 with i32 %33; + i32* %35 = getelementptr &(i32* %20)[i64 5]; + i32 %36 = load i32* %35; + i32* %37 = getelementptr &(i32* %22)[i64 5]; + store i32* %37 with i32 %36; + i32* %38 = getelementptr &(i32* %20)[i64 6]; + i32 %39 = load i32* %38; + i32* %40 = getelementptr &(i32* %22)[i64 6]; + store i32* %40 with i32 %39; + i32* %41 = getelementptr &(i32* %20)[i64 7]; + i32 %42 = load i32* %41; + i32* %43 = getelementptr &(i32* 
%22)[i64 7]; + store i32* %43 with i32 %42; + i32* %44 = getelementptr &(i32* %20)[i64 8]; + i32 %45 = load i32* %44; + i32* %46 = getelementptr &(i32* %22)[i64 8]; + store i32* %46 with i32 %45; + i32* %47 = getelementptr &(i32* %20)[i64 9]; + i32 %48 = load i32* %47; + i32* %49 = getelementptr &(i32* %22)[i64 9]; + store i32* %49 with i32 %48; + i32* %50 = getelementptr &(i32* %20)[i64 10]; + i32 %51 = load i32* %50; + i32* %52 = getelementptr &(i32* %22)[i64 10]; + store i32* %52 with i32 %51; + i32* %53 = getelementptr &(i32* %20)[i64 11]; + i32 %54 = load i32* %53; + i32* %55 = getelementptr &(i32* %22)[i64 11]; + store i32* %55 with i32 %54; + i32* %56 = getelementptr &(i32* %20)[i64 12]; + i32 %57 = load i32* %56; + i32* %58 = getelementptr &(i32* %22)[i64 12]; + store i32* %58 with i32 %57; + i32* %59 = getelementptr &(i32* %20)[i64 13]; + i32 %60 = load i32* %59; + i32* %61 = getelementptr &(i32* %22)[i64 13]; + store i32* %61 with i32 %60; + i32* %62 = getelementptr &(i32* %20)[i64 14]; + i32 %63 = load i32* %62; + i32* %64 = getelementptr &(i32* %22)[i64 14]; + store i32* %64 with i32 %63; + i32* %65 = getelementptr &(i32* %20)[i64 15]; + i32 %66 = load i32* %65; + i32* %67 = getelementptr &(i32* %22)[i64 15]; + store i32* %67 with i32 %66; + i32 %68 = add i32 %19, i32 16; + i1 %69 = icmp sgt i32 %16, i32 %68; + cbr i1 %69(prob = 0.941176), ^while.body, ^scalar.header1; + ^scalar.header1: + i32 %70 = phi [^super.header1, i32 1] [^while.body, i32 %68]; + i32 %71 = phi [^super.header1, i32 undef] [^while.body, i32 %68]; + i1 %72 = icmp sgt i32 %14, i32 %70; + cbr i1 %72(prob = 0.75), ^while.body2, ^scalar.header; ^while.body1 {scalar}: - i32 %68 = phi [^scalar.header, i32 %66] [^while.body1, i32 %72]; - i32* %69 = getelementptr &([500 * i32]* %8)[i64 0][i32 %68]; - i32 %70 = load i32* %69; - i32* %71 = getelementptr &([500 * i32]* %9)[i64 0][i32 %68]; - store i32* %71 with i32 %70; - i32 %72 = add i32 %68, i32 1; - i1 %73 = icmp sgt i32 %10, i32 %72; - cbr i1 %73(prob = 0.9375), ^while.body1, ^b2; + i32 %73 = phi [^scalar.header, i32 %17] [^while.body1, i32 %77]; + i32* %74 = getelementptr &([500 * i32]* %8)[i64 0][i32 %73]; + i32 %75 = load i32* %74; + i32* %76 = getelementptr &([500 * i32]* %9)[i64 0][i32 %73]; + store i32* %76 with i32 %75; + i32 %77 = add i32 %73, i32 1; + i1 %78 = icmp sgt i32 %10, i32 %77; + cbr i1 %78(prob = 0.75), ^while.body1, ^b2; + ^while.body2 {scalar}: + i32 %79 = phi [^scalar.header1, i32 %70] [^while.body2, i32 %92]; + i32* %80 = getelementptr &([500 * i32]* %8)[i64 0][i32 %79]; + i32 %81 = load i32* %80; + i32* %82 = getelementptr &([500 * i32]* %9)[i64 0][i32 %79]; + store i32* %82 with i32 %81; + i32* %83 = getelementptr &(i32* %80)[i64 1]; + i32 %84 = load i32* %83; + i32* %85 = getelementptr &(i32* %82)[i64 1]; + store i32* %85 with i32 %84; + i32* %86 = getelementptr &(i32* %80)[i64 2]; + i32 %87 = load i32* %86; + i32* %88 = getelementptr &(i32* %82)[i64 2]; + store i32* %88 with i32 %87; + i32* %89 = getelementptr &(i32* %80)[i64 3]; + i32 %90 = load i32* %89; + i32* %91 = getelementptr &(i32* %82)[i64 3]; + store i32* %91 with i32 %90; + i32 %92 = add i32 %79, i32 4; + i1 %93 = icmp sgt i32 %14, i32 %92; + cbr i1 %93(prob = 0.75), ^while.body2, ^scalar.header; } internal [4 * i8]* @cmmc_parallel_body_payload_0, align 8; internal func @cmmc_parallel_body_1(i32 %0, i32 %1) -> void { NoRecurse ParallelBody } { diff --git a/tests/SysY2022/performance/gameoflife-p61glidergun.arm.s b/tests/SysY2022/performance/gameoflife-p61glidergun.arm.s index 
b4a0a88fe..d04d08c96 100644 --- a/tests/SysY2022/performance/gameoflife-p61glidergun.arm.s +++ b/tests/SysY2022/performance/gameoflife-p61glidergun.arm.s @@ -1,22 +1,22 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 sheet1: .zero 1000000 -.align 8 +.p2align 3 sheet2: .zero 1000000 -.align 4 +.p2align 2 width: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 .text diff --git a/tests/SysY2022/performance/gameoflife-p61glidergun.riscv.s b/tests/SysY2022/performance/gameoflife-p61glidergun.riscv.s index 81df0a843..86e23c7b2 100644 --- a/tests/SysY2022/performance/gameoflife-p61glidergun.riscv.s +++ b/tests/SysY2022/performance/gameoflife-p61glidergun.riscv.s @@ -1,402 +1,458 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 sheet1: .zero 1000000 -.align 8 +.p2align 3 sheet2: .zero 1000000 -.align 4 +.p2align 2 width: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 4 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[8] CalleeSaved[104] - addi sp, sp, -112 + # stack usage: CalleeArg[0] Local[0] RegSpill[16] CalleeSaved[104] + addi sp, sp, -120 sd ra, 0(sp) - sd s6, 8(sp) - sd s1, 16(sp) - sd s0, 24(sp) - sd s5, 32(sp) - sd s4, 40(sp) - sd s8, 48(sp) - sd s3, 56(sp) - sd s2, 64(sp) - sd s7, 72(sp) + sd s7, 8(sp) + sd s6, 16(sp) + sd s1, 24(sp) + sd s3, 32(sp) + sd s8, 40(sp) + sd s2, 48(sp) + sd s0, 56(sp) + sd s5, 64(sp) + sd s4, 72(sp) sd s9, 80(sp) sd s10, 88(sp) sd s11, 96(sp) jal getint - mv s6, a0 -pcrel572: + mv s7, a0 +pcrel630: auipc a0, %pcrel_hi(width) - sw s6, %pcrel_lo(pcrel572)(a0) + sw s7, %pcrel_lo(pcrel630)(a0) jal getint - slt s4, zero, a0 + slt s3, zero, a0 addiw a1, a0, 1 - mv s0, a0 + sd a0, 112(sp) + mv s6, a0 sd a1, 104(sp) jal getint mv s8, a0 jal getch -pcrel573: +pcrel631: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) +pcrel632: + auipc a0, %pcrel_hi(cmmc_parallel_body_1) +pcrel633: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) -pcrel574: - auipc a0, %pcrel_hi(cmmc_parallel_body_2) -pcrel575: +pcrel634: auipc a1, %pcrel_hi(sheet1) - addi s1, a0, %pcrel_lo(pcrel574) - addi s3, a1, %pcrel_lo(pcrel575) -pcrel576: - auipc a1, %pcrel_hi(cmmc_parallel_body_1) - addi s2, a1, %pcrel_lo(pcrel576) - ble s0, zero, label369 - addi s9, s3, 2000 - mv a1, s6 + addi s0, a0, %pcrel_lo(pcrel632) + addi s2, a1, %pcrel_lo(pcrel634) +pcrel635: + auipc a1, %pcrel_hi(cmmc_parallel_body_2) + addi s1, a1, %pcrel_lo(pcrel635) + ble s6, zero, label436 + addi s9, s2, 2000 + mv a1, s7 li s7, 1 mv s10, s7 - bgt s6, zero, label402 - j label400 + bgt a1, zero, label431 + j label429 .p2align 2 -label406: +label435: addi s11, s11, 4 .p2align 2 -label403: +label432: jal getch addiw s7, s7, 1 xori a2, a0, 35 sltiu a1, a2, 1 sw a1, 0(s11) - bgt s6, s7, label406 + bgt s6, s7, label435 .p2align 2 -label400: +label429: jal getch + ld s6, 112(sp) addiw s10, s10, 1 - blt s0, s10, label369 -pcrel577: + blt s6, s10, label436 +pcrel636: auipc a0, %pcrel_hi(width) addi s9, s9, 2000 - lw a1, %pcrel_lo(pcrel577)(a0) - ble a1, zero, label400 + lw a1, %pcrel_lo(pcrel636)(a0) + ble a1, zero, label429 .p2align 2 -label402: +label431: auipc a0, %pcrel_hi(width) addi s11, s9, 4 li s7, 1 - lw a1, %pcrel_lo(label402)(a0) + lw a1, 
%pcrel_lo(label431)(a0) addi s6, a1, 1 - j label403 -label369: + j label432 +label436: li a0, 95 jal _sysy_starttime - ble s8, zero, label421 + ble s8, zero, label501 li s7, 1 mv a0, s7 - bne s7, s7, label425 + beq s7, s7, label442 + ld s6, 112(sp) + bgt s6, zero, label441 + j label508 .p2align 2 -label373: - ble s0, zero, label428 +label443: ld a1, 104(sp) li s7, 1 -pcrel578: +pcrel637: auipc s5, %pcrel_hi(cmmc_parallel_body_payload_2) - sw a1, %pcrel_lo(pcrel578)(s5) + sw a1, %pcrel_lo(pcrel637)(s5) mv a0, s7 mv a2, s1 jal cmmcParallelFor - addiw s8, s8, -1 li s7, 2 - bgt s8, zero, label445 - j label379 -label425: - ble s0, zero, label552 -.p2align 2 -label376: - auipc a2, %pcrel_hi(cmmc_parallel_body_payload_1) - li s7, 1 - ld a1, 104(sp) - sw a1, %pcrel_lo(label376)(a2) - mv a0, s7 - mv a2, s2 - jal cmmcParallelFor .p2align 2 -label377: +label444: addiw s8, s8, -1 - ble s8, zero, label379 + ble s8, zero, label446 .p2align 2 -label445: +label525: mv a0, s7 li s7, 1 - beq a0, s7, label373 - bgt s0, zero, label376 -label552: + bne a0, s7, label611 +.p2align 2 +label442: + ld s6, 112(sp) + bgt s6, zero, label443 + li s7, 2 + j label444 +.p2align 2 +label611: + ld s6, 112(sp) + ble s6, zero, label508 +.p2align 2 +label441: + ld a1, 104(sp) li s7, 1 - j label377 -label379: +pcrel638: + auipc s4, %pcrel_hi(cmmc_parallel_body_payload_1) + sw a1, %pcrel_lo(pcrel638)(s4) + mv a0, s7 + mv a2, s0 + jal cmmcParallelFor + addiw s8, s8, -1 + bgt s8, zero, label525 +label446: xori a2, s7, 2 li a0, 106 sltiu a1, a2, 1 - and s1, s4, a1 + and s0, s3, a1 jal _sysy_stoptime - bne s1, zero, label394 -label381: - ble s0, zero, label382 - addi s1, s3, 2000 + bne s0, zero, label461 +label448: + ld s6, 112(sp) + ble s6, zero, label449 + addi s1, s2, 2000 li s7, 1 mv s2, s7 -pcrel579: +pcrel639: auipc a0, %pcrel_hi(width) - lw a1, %pcrel_lo(pcrel579)(a0) + lw a1, %pcrel_lo(pcrel639)(a0) addi s0, a1, 1 - bgt a1, zero, label389 + bgt a1, zero, label456 .p2align 2 -label387: +label454: li a0, 10 jal putch ld a1, 104(sp) addiw s2, s2, 1 - ble a1, s2, label382 + ble a1, s2, label449 addi s1, s1, 2000 -pcrel580: +pcrel640: auipc a0, %pcrel_hi(width) - lw a1, %pcrel_lo(pcrel580)(a0) + lw a1, %pcrel_lo(pcrel640)(a0) addi s0, a1, 1 - ble a1, zero, label387 + ble a1, zero, label454 .p2align 2 -label389: +label456: addi s3, s1, 4 li s7, 1 mv s4, s7 lw a1, 0(s3) li a0, 35 - beq a1, s7, label537 + beq a1, s7, label593 .p2align 2 -label536: +label592: li a0, 46 .p2align 2 -label537: +label593: jal putch addiw s4, s4, 1 - ble s0, s4, label387 + ble s0, s4, label454 addi s3, s3, 4 li a0, 35 li s7, 1 lw a1, 0(s3) - beq a1, s7, label537 - j label536 -label382: + beq a1, s7, label593 + j label592 +label449: mv a0, zero ld ra, 0(sp) - ld s6, 8(sp) - ld s1, 16(sp) - ld s0, 24(sp) - ld s5, 32(sp) - ld s4, 40(sp) - ld s8, 48(sp) - ld s3, 56(sp) - ld s2, 64(sp) - ld s7, 72(sp) + ld s7, 8(sp) + ld s6, 16(sp) + ld s1, 24(sp) + ld s3, 32(sp) + ld s8, 40(sp) + ld s2, 48(sp) + ld s0, 56(sp) + ld s5, 64(sp) + ld s4, 72(sp) ld s9, 80(sp) ld s10, 88(sp) ld s11, 96(sp) - addi sp, sp, 112 + addi sp, sp, 120 ret -label394: +label461: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel581: +pcrel641: auipc a3, %pcrel_hi(cmmc_parallel_body_0) li s7, 1 - addi a2, a3, %pcrel_lo(pcrel581) + addi a2, a3, %pcrel_lo(pcrel641) ld a1, 104(sp) - sw a1, %pcrel_lo(label394)(a0) + sw a1, %pcrel_lo(label461)(a0) mv a0, s7 jal cmmcParallelFor - j label381 -label428: - li s7, 2 - j label377 -label421: + j label448 +label508: + li s7, 1 + j label444 
+label501: li s7, 1 - j label379 + j label446 .p2align 2 cmmc_parallel_body_0: - addi sp, sp, -8 - mv t0, a0 -pcrel146: + addi sp, sp, -24 + mv t1, a0 +pcrel199: auipc a5, %pcrel_hi(sheet2) li a2, 2000 -pcrel147: - auipc t1, %pcrel_hi(sheet1) - addi a3, a5, %pcrel_lo(pcrel146) +pcrel200: + auipc t0, %pcrel_hi(sheet1) + addi a3, a5, %pcrel_lo(pcrel199) mul a4, a0, a2 sd s0, 0(sp) - li a5, 16 + li a5, 4 add a0, a3, a4 -pcrel148: + sd s1, 8(sp) +pcrel201: auipc a4, %pcrel_hi(width) - addi a3, t1, %pcrel_lo(pcrel147) - lw t3, %pcrel_lo(pcrel148)(a4) - bgt t3, zero, label8 - j label5 -.p2align 2 -label20: - addi t3, t3, 4 -.p2align 2 -label17: - sh2add t5, t4, t1 - lw t6, 0(t3) - addiw t4, t4, 1 - sw t6, 0(t5) - bgt t2, t4, label20 - addiw t0, t0, 1 - ble a1, t0, label7 -.p2align 2 -label6: + addi a3, t0, %pcrel_lo(pcrel200) + sd s2, 16(sp) + li t0, 16 + lw t4, %pcrel_lo(pcrel201)(a4) + bgt t4, zero, label5 + j label27 +.p2align 2 +label26: + addi t4, t4, 4 +.p2align 2 +label23: + sh2add t6, t5, t2 + lw a6, 0(t4) + addiw t5, t5, 1 + sw a6, 0(t6) + bgt t3, t5, label26 + addiw t1, t1, 1 + ble a1, t1, label29 +.p2align 2 +label28: addi a0, a0, 2000 -pcrel149: +pcrel202: auipc a4, %pcrel_hi(width) - lw t3, %pcrel_lo(pcrel149)(a4) - ble t3, zero, label5 -.p2align 2 -label8: - mul t5, t0, a2 - addiw t2, t3, 1 - addiw t4, t3, -14 - add t1, a3, t5 - ble t2, a5, label47 - addi t3, a0, 4 - li t6, 1 - j label10 -.p2align 2 -label13: - addi t3, t3, 64 + lw t4, %pcrel_lo(pcrel202)(a4) + ble t4, zero, label27 .p2align 2 -label10: - sh2add t5, t6, t1 - lw a7, 0(t3) - addiw t6, t6, 16 - sw a7, 0(t5) - lw a6, 4(t3) - sw a6, 4(t5) - lw a7, 8(t3) - sw a7, 8(t5) - lw a6, 12(t3) - sw a6, 12(t5) - lw a7, 16(t3) - sw a7, 16(t5) - lw a6, 20(t3) - sw a6, 20(t5) - lw a7, 24(t3) - sw a7, 24(t5) - lw a6, 28(t3) - sw a6, 28(t5) - lw s0, 32(t3) - sw s0, 32(t5) - lw a7, 36(t3) - sw a7, 36(t5) - lw a6, 40(t3) - sw a6, 40(t5) - lw s0, 44(t3) - sw s0, 44(t5) - lw a7, 48(t3) - sw a7, 48(t5) - lw a6, 52(t3) - sw a6, 52(t5) - lw a7, 56(t3) - sw a7, 56(t5) - lw a6, 60(t3) - sw a6, 60(t5) - bgt t4, t6, label13 - mv t4, t6 - ble t2, t6, label138 +label5: + mul t5, t1, a2 + addiw t3, t4, 1 + add t2, a3, t5 + ble t3, a5, label49 + addiw t5, t4, -2 + addiw t6, t4, -17 + ble t5, t0, label55 + addi t4, t2, 4 + li a6, 1 .p2align 2 label16: - sh2add t3, t4, a0 - j label17 -label5: - addiw t0, t0, 1 - bgt a1, t0, label6 -label7: + sh2add a7, a6, a0 + addiw a6, a6, 16 + lw s1, 0(a7) + sw s1, 0(t4) + lw s0, 4(a7) + sw s0, 4(t4) + lw s1, 8(a7) + sw s1, 8(t4) + lw s0, 12(a7) + sw s0, 12(t4) + lw s1, 16(a7) + sw s1, 16(t4) + lw s0, 20(a7) + sw s0, 20(t4) + lw s1, 24(a7) + sw s1, 24(t4) + lw s2, 28(a7) + sw s2, 28(t4) + lw s0, 32(a7) + sw s0, 32(t4) + lw s1, 36(a7) + sw s1, 36(t4) + lw s0, 40(a7) + sw s0, 40(t4) + lw s1, 44(a7) + sw s1, 44(t4) + lw s0, 48(a7) + sw s0, 48(t4) + lw s1, 52(a7) + sw s1, 52(t4) + lw s0, 56(a7) + sw s0, 56(t4) + lw s1, 60(a7) + sw s1, 60(t4) + ble t6, a6, label133 + addi t4, t4, 64 + j label16 +.p2align 2 +label133: + mv t6, a6 + ble t5, a6, label184 +.p2align 2 +label10: + sh2add t4, t6, t2 + j label11 +.p2align 2 +label14: + addi t4, t4, 16 +.p2align 2 +label11: + sh2add a6, t6, a0 + addiw t6, t6, 4 + lw s0, 0(a6) + sw s0, 0(t4) + lw a7, 4(a6) + sw a7, 4(t4) + lw s0, 8(a6) + sw s0, 8(t4) + lw a7, 12(a6) + sw a7, 12(t4) + bgt t5, t6, label14 + mv t5, t6 + ble t3, t6, label183 +.p2align 2 +label22: + sh2add t4, t5, a0 + j label23 +label55: + li t6, 1 + mv a6, zero + bgt t5, t6, label10 + mv t5, zero + bgt t3, zero, 
label22 + addiw t1, t1, 1 + bgt a1, t1, label28 + j label29 +.p2align 2 +label184: + mv t5, a6 + bgt t3, a6, label22 + addiw t1, t1, 1 + bgt a1, t1, label28 + j label29 +label27: + addiw t1, t1, 1 + bgt a1, t1, label28 + j label29 +label49: + li t5, 1 + bgt t3, t5, label22 + addiw t1, t1, 1 + bgt a1, t1, label28 +label29: ld s0, 0(sp) - addi sp, sp, 8 + ld s1, 8(sp) + ld s2, 16(sp) + addi sp, sp, 24 ret .p2align 2 -label47: - li t4, 1 - bgt t2, t4, label16 - addiw t0, t0, 1 - bgt a1, t0, label6 - j label7 -label138: - addiw t0, t0, 1 - bgt a1, t0, label6 - j label7 +label183: + addiw t1, t1, 1 + bgt a1, t1, label28 + j label29 .p2align 2 cmmc_parallel_body_1: addi sp, sp, -8 mv a4, a1 -pcrel255: +pcrel308: auipc a3, %pcrel_hi(sheet2) li t0, 2000 mv t3, a0 -pcrel256: +pcrel309: auipc t2, %pcrel_hi(width) sd s0, 0(sp) - addi a2, a3, %pcrel_lo(pcrel255) + addi a2, a3, %pcrel_lo(pcrel308) mulw a1, a0, t0 -pcrel257: +pcrel310: auipc a3, %pcrel_hi(sheet1) add a5, a2, a1 - addi t1, a3, %pcrel_lo(pcrel257) + addi t1, a3, %pcrel_lo(pcrel310) li a2, 1 - lw a0, %pcrel_lo(pcrel256)(t2) + lw a0, %pcrel_lo(pcrel309)(t2) mulw t4, t3, t0 addi a1, a0, 1 add a3, t1, t4 - bgt a0, zero, label157 - j label154 + bgt a0, zero, label210 + j label207 .p2align 2 -label225: +label278: addiw t3, t3, 1 - ble a4, t3, label156 + ble a4, t3, label209 .p2align 2 -label155: +label208: addi a5, a5, 2000 mulw t4, t3, t0 -pcrel258: +pcrel311: auipc t2, %pcrel_hi(width) - lw a0, %pcrel_lo(pcrel258)(t2) + lw a0, %pcrel_lo(pcrel311)(t2) add a3, t1, t4 addi a1, a0, 1 - ble a0, zero, label154 + ble a0, zero, label207 .p2align 2 -label157: +label210: addi a0, a5, 4 mv t4, a2 - j label158 + j label211 .p2align 2 -label161: +label214: sh2add t6, t4, a3 sw a2, 0(t6) - ble a1, t5, label225 + ble a1, t5, label278 .p2align 2 -label163: +label216: addi a0, a0, 4 mv t4, t5 .p2align 2 -label158: +label211: lw t5, -2004(a0) lw t6, -2000(a0) lw a7, -1996(a0) @@ -417,109 +473,111 @@ label158: xori s0, t5, 1 addiw t5, t4, 1 or a7, a6, s0 - beq a7, zero, label161 + beq a7, zero, label214 xori a7, t6, 3 sh2add t6, t4, a3 sltiu a6, a7, 1 sw a6, 0(t6) - bgt a1, t5, label163 + bgt a1, t5, label216 addiw t3, t3, 1 - bgt a4, t3, label155 -label156: + bgt a4, t3, label208 +label209: ld s0, 0(sp) addi sp, sp, 8 ret -label154: +label207: addiw t3, t3, 1 - bgt a4, t3, label155 - j label156 + bgt a4, t3, label208 + j label209 .p2align 2 cmmc_parallel_body_2: - addi sp, sp, -8 + addi sp, sp, -16 mv a4, a1 -pcrel364: +pcrel419: auipc a3, %pcrel_hi(sheet1) li t0, 2000 mv t3, a0 -pcrel365: +pcrel420: auipc t2, %pcrel_hi(width) - sd s0, 0(sp) - addi a2, a3, %pcrel_lo(pcrel364) + sd s1, 0(sp) + addi a2, a3, %pcrel_lo(pcrel419) mulw a1, a0, t0 -pcrel366: + sd s0, 8(sp) +pcrel421: auipc a3, %pcrel_hi(sheet2) add a5, a2, a1 - addi t1, a3, %pcrel_lo(pcrel366) + addi t1, a3, %pcrel_lo(pcrel421) li a2, 1 - lw a0, %pcrel_lo(pcrel365)(t2) + lw a0, %pcrel_lo(pcrel420)(t2) mulw t4, t3, t0 addi a1, a0, 1 add a3, t1, t4 - bgt a0, zero, label266 - j label263 -.p2align 2 -label334: - addiw t3, t3, 1 - ble a4, t3, label265 -.p2align 2 -label264: - addi a5, a5, 2000 - mulw t4, t3, t0 -pcrel367: - auipc t2, %pcrel_hi(width) - lw a0, %pcrel_lo(pcrel367)(t2) - add a3, t1, t4 - addi a1, a0, 1 - ble a0, zero, label263 -.p2align 2 -label266: - addi a0, a5, 4 - mv t4, a2 - j label267 + bgt a0, zero, label316 + j label324 .p2align 2 -label270: +label375: + xori a7, t6, 3 sh2add t6, t4, a3 - sw a2, 0(t6) - ble a1, t5, label334 + sltiu a6, a7, 1 + sw a6, 0(t6) + ble a1, t5, label379 
.p2align 2 -label272: +label321: addi a0, a0, 4 mv t4, t5 .p2align 2 -label267: - lw t5, -2004(a0) +label317: + lw a6, -2004(a0) lw t6, -2000(a0) lw a7, -1996(a0) - addw a6, t5, t6 - lw t5, -4(a0) - addw t6, a6, a7 - lw a6, 4(a0) - addw a7, t6, t5 - lw s0, 1996(a0) - addw t5, a7, a6 - lw t6, 2000(a0) - addw a6, t5, s0 + addw t5, a6, t6 + lw s1, -4(a0) + addw t6, t5, a7 + lw s0, 4(a0) + addw a6, t6, s1 + lw a7, 1996(a0) + addw t5, a6, s0 + lw a6, 2000(a0) + addw t6, t5, a7 lw t5, 2004(a0) - addw a7, a6, t6 + addw a7, t6, a6 addw t6, a7, t5 lw t5, 0(a0) xori a6, t6, 2 xori s0, t5, 1 addiw t5, t4, 1 or a7, a6, s0 - beq a7, zero, label270 - xori a7, t6, 3 - sh2add t6, t4, a3 - sltiu a6, a7, 1 - sw a6, 0(t6) - bgt a1, t5, label272 + bne a7, zero, label375 + sh2add a6, t4, a3 + sw a2, 0(a6) + bgt a1, t5, label321 addiw t3, t3, 1 - bgt a4, t3, label264 -label265: - ld s0, 0(sp) - addi sp, sp, 8 + ble a4, t3, label326 +.p2align 2 +label325: + addi a5, a5, 2000 + mulw t4, t3, t0 +pcrel422: + auipc t2, %pcrel_hi(width) + lw a0, %pcrel_lo(pcrel422)(t2) + add a3, t1, t4 + addi a1, a0, 1 + ble a0, zero, label324 +.p2align 2 +label316: + addi a0, a5, 4 + mv t4, a2 + j label317 +label324: + addiw t3, t3, 1 + bgt a4, t3, label325 +label326: + ld s1, 0(sp) + ld s0, 8(sp) + addi sp, sp, 16 ret -label263: +.p2align 2 +label379: addiw t3, t3, 1 - bgt a4, t3, label264 - j label265 + bgt a4, t3, label325 + j label326 diff --git a/tests/SysY2022/performance/gameoflife-p61glidergun.sy.ir b/tests/SysY2022/performance/gameoflife-p61glidergun.sy.ir index 8be93b9f0..c453f20de 100644 --- a/tests/SysY2022/performance/gameoflife-p61glidergun.sy.ir +++ b/tests/SysY2022/performance/gameoflife-p61glidergun.sy.ir @@ -126,7 +126,7 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel [500 * [500 * i32]]* %4 = ptrcast [500 * [500 * i32]]* @sheet2 to [500 * [500 * i32]]*; ubr ^b1; ^b1: - i32 %5 = phi [^b, i32 %0] [^b2, i32 %13]; + i32 %5 = phi [^b, i32 %0] [^b2, i32 %12]; i32 %6 = load i32* %2; i1 %7 = icmp sgt i32 %6, i32 0; cbr i1 %7(prob = 0.984615), ^super.header, ^b2; @@ -134,97 +134,127 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel [500 * i32]* %8 = getelementptr &([500 * [500 * i32]]* %4)[i64 0][i32 %5]; [500 * i32]* %9 = getelementptr &([500 * [500 * i32]]* %3)[i64 0][i32 %5]; i32 %10 = add i32 %6, i32 1; - i1 %11 = icmp sgt i32 %10, i32 16; - i32 %12 = add i32 %6, i32 -14; - cbr i1 %11(prob = 0.8), ^while.body, ^scalar.header; + i1 %11 = icmp sgt i32 %10, i32 4; + cbr i1 %11(prob = 0.941176), ^super.header1, ^scalar.header; ^b2: - i32 %13 = add i32 %5, i32 1; - i1 %14 = icmp sgt i32 %1, i32 %13; - cbr i1 %14(prob = 0.984615), ^b1, ^b3; - ^while.body: - i32 %15 = phi [^super.header, i32 1] [^while.body, i32 %64]; - i32* %16 = getelementptr &([500 * i32]* %8)[i64 0][i32 %15]; - i32 %17 = load i32* %16; - i32* %18 = getelementptr &([500 * i32]* %9)[i64 0][i32 %15]; - store i32* %18 with i32 %17; - i32* %19 = getelementptr &(i32* %16)[i64 1]; - i32 %20 = load i32* %19; - i32* %21 = getelementptr &(i32* %18)[i64 1]; - store i32* %21 with i32 %20; - i32* %22 = getelementptr &(i32* %16)[i64 2]; - i32 %23 = load i32* %22; - i32* %24 = getelementptr &(i32* %18)[i64 2]; - store i32* %24 with i32 %23; - i32* %25 = getelementptr &(i32* %16)[i64 3]; - i32 %26 = load i32* %25; - i32* %27 = getelementptr &(i32* %18)[i64 3]; - store i32* %27 with i32 %26; - i32* %28 = getelementptr &(i32* %16)[i64 4]; - i32 %29 = load i32* %28; - i32* %30 = getelementptr &(i32* 
%18)[i64 4]; - store i32* %30 with i32 %29; - i32* %31 = getelementptr &(i32* %16)[i64 5]; - i32 %32 = load i32* %31; - i32* %33 = getelementptr &(i32* %18)[i64 5]; - store i32* %33 with i32 %32; - i32* %34 = getelementptr &(i32* %16)[i64 6]; - i32 %35 = load i32* %34; - i32* %36 = getelementptr &(i32* %18)[i64 6]; - store i32* %36 with i32 %35; - i32* %37 = getelementptr &(i32* %16)[i64 7]; - i32 %38 = load i32* %37; - i32* %39 = getelementptr &(i32* %18)[i64 7]; - store i32* %39 with i32 %38; - i32* %40 = getelementptr &(i32* %16)[i64 8]; - i32 %41 = load i32* %40; - i32* %42 = getelementptr &(i32* %18)[i64 8]; - store i32* %42 with i32 %41; - i32* %43 = getelementptr &(i32* %16)[i64 9]; - i32 %44 = load i32* %43; - i32* %45 = getelementptr &(i32* %18)[i64 9]; - store i32* %45 with i32 %44; - i32* %46 = getelementptr &(i32* %16)[i64 10]; - i32 %47 = load i32* %46; - i32* %48 = getelementptr &(i32* %18)[i64 10]; - store i32* %48 with i32 %47; - i32* %49 = getelementptr &(i32* %16)[i64 11]; - i32 %50 = load i32* %49; - i32* %51 = getelementptr &(i32* %18)[i64 11]; - store i32* %51 with i32 %50; - i32* %52 = getelementptr &(i32* %16)[i64 12]; - i32 %53 = load i32* %52; - i32* %54 = getelementptr &(i32* %18)[i64 12]; - store i32* %54 with i32 %53; - i32* %55 = getelementptr &(i32* %16)[i64 13]; - i32 %56 = load i32* %55; - i32* %57 = getelementptr &(i32* %18)[i64 13]; - store i32* %57 with i32 %56; - i32* %58 = getelementptr &(i32* %16)[i64 14]; - i32 %59 = load i32* %58; - i32* %60 = getelementptr &(i32* %18)[i64 14]; - store i32* %60 with i32 %59; - i32* %61 = getelementptr &(i32* %16)[i64 15]; - i32 %62 = load i32* %61; - i32* %63 = getelementptr &(i32* %18)[i64 15]; - store i32* %63 with i32 %62; - i32 %64 = add i32 %15, i32 16; - i1 %65 = icmp sgt i32 %12, i32 %64; - cbr i1 %65(prob = 0.8), ^while.body, ^scalar.header; + i32 %12 = add i32 %5, i32 1; + i1 %13 = icmp sgt i32 %1, i32 %12; + cbr i1 %13(prob = 0.984615), ^b1, ^b3; + ^super.header1: + i32 %14 = add i32 %6, i32 -2; + i1 %15 = icmp sgt i32 %14, i32 16; + i32 %16 = add i32 %6, i32 -17; + cbr i1 %15(prob = 0.941176), ^while.body, ^scalar.header1; ^scalar.header: - i32 %66 = phi [^super.header, i32 1] [^while.body, i32 %64]; - i1 %67 = icmp sgt i32 %10, i32 %66; - cbr i1 %67(prob = 0.9375), ^while.body1, ^b2; + i32 %17 = phi [^super.header, i32 1] [^scalar.header1, i32 %71] [^while.body2, i32 %92]; + i1 %18 = icmp sgt i32 %10, i32 %17; + cbr i1 %18(prob = 0.75), ^while.body1, ^b2; ^b3: ret; + ^while.body: + i32 %19 = phi [^super.header1, i32 1] [^while.body, i32 %68]; + i32* %20 = getelementptr &([500 * i32]* %8)[i64 0][i32 %19]; + i32 %21 = load i32* %20; + i32* %22 = getelementptr &([500 * i32]* %9)[i64 0][i32 %19]; + store i32* %22 with i32 %21; + i32* %23 = getelementptr &(i32* %20)[i64 1]; + i32 %24 = load i32* %23; + i32* %25 = getelementptr &(i32* %22)[i64 1]; + store i32* %25 with i32 %24; + i32* %26 = getelementptr &(i32* %20)[i64 2]; + i32 %27 = load i32* %26; + i32* %28 = getelementptr &(i32* %22)[i64 2]; + store i32* %28 with i32 %27; + i32* %29 = getelementptr &(i32* %20)[i64 3]; + i32 %30 = load i32* %29; + i32* %31 = getelementptr &(i32* %22)[i64 3]; + store i32* %31 with i32 %30; + i32* %32 = getelementptr &(i32* %20)[i64 4]; + i32 %33 = load i32* %32; + i32* %34 = getelementptr &(i32* %22)[i64 4]; + store i32* %34 with i32 %33; + i32* %35 = getelementptr &(i32* %20)[i64 5]; + i32 %36 = load i32* %35; + i32* %37 = getelementptr &(i32* %22)[i64 5]; + store i32* %37 with i32 %36; + i32* %38 = getelementptr &(i32* 
%20)[i64 6]; + i32 %39 = load i32* %38; + i32* %40 = getelementptr &(i32* %22)[i64 6]; + store i32* %40 with i32 %39; + i32* %41 = getelementptr &(i32* %20)[i64 7]; + i32 %42 = load i32* %41; + i32* %43 = getelementptr &(i32* %22)[i64 7]; + store i32* %43 with i32 %42; + i32* %44 = getelementptr &(i32* %20)[i64 8]; + i32 %45 = load i32* %44; + i32* %46 = getelementptr &(i32* %22)[i64 8]; + store i32* %46 with i32 %45; + i32* %47 = getelementptr &(i32* %20)[i64 9]; + i32 %48 = load i32* %47; + i32* %49 = getelementptr &(i32* %22)[i64 9]; + store i32* %49 with i32 %48; + i32* %50 = getelementptr &(i32* %20)[i64 10]; + i32 %51 = load i32* %50; + i32* %52 = getelementptr &(i32* %22)[i64 10]; + store i32* %52 with i32 %51; + i32* %53 = getelementptr &(i32* %20)[i64 11]; + i32 %54 = load i32* %53; + i32* %55 = getelementptr &(i32* %22)[i64 11]; + store i32* %55 with i32 %54; + i32* %56 = getelementptr &(i32* %20)[i64 12]; + i32 %57 = load i32* %56; + i32* %58 = getelementptr &(i32* %22)[i64 12]; + store i32* %58 with i32 %57; + i32* %59 = getelementptr &(i32* %20)[i64 13]; + i32 %60 = load i32* %59; + i32* %61 = getelementptr &(i32* %22)[i64 13]; + store i32* %61 with i32 %60; + i32* %62 = getelementptr &(i32* %20)[i64 14]; + i32 %63 = load i32* %62; + i32* %64 = getelementptr &(i32* %22)[i64 14]; + store i32* %64 with i32 %63; + i32* %65 = getelementptr &(i32* %20)[i64 15]; + i32 %66 = load i32* %65; + i32* %67 = getelementptr &(i32* %22)[i64 15]; + store i32* %67 with i32 %66; + i32 %68 = add i32 %19, i32 16; + i1 %69 = icmp sgt i32 %16, i32 %68; + cbr i1 %69(prob = 0.941176), ^while.body, ^scalar.header1; + ^scalar.header1: + i32 %70 = phi [^super.header1, i32 1] [^while.body, i32 %68]; + i32 %71 = phi [^super.header1, i32 undef] [^while.body, i32 %68]; + i1 %72 = icmp sgt i32 %14, i32 %70; + cbr i1 %72(prob = 0.75), ^while.body2, ^scalar.header; ^while.body1 {scalar}: - i32 %68 = phi [^scalar.header, i32 %66] [^while.body1, i32 %72]; - i32* %69 = getelementptr &([500 * i32]* %8)[i64 0][i32 %68]; - i32 %70 = load i32* %69; - i32* %71 = getelementptr &([500 * i32]* %9)[i64 0][i32 %68]; - store i32* %71 with i32 %70; - i32 %72 = add i32 %68, i32 1; - i1 %73 = icmp sgt i32 %10, i32 %72; - cbr i1 %73(prob = 0.9375), ^while.body1, ^b2; + i32 %73 = phi [^scalar.header, i32 %17] [^while.body1, i32 %77]; + i32* %74 = getelementptr &([500 * i32]* %8)[i64 0][i32 %73]; + i32 %75 = load i32* %74; + i32* %76 = getelementptr &([500 * i32]* %9)[i64 0][i32 %73]; + store i32* %76 with i32 %75; + i32 %77 = add i32 %73, i32 1; + i1 %78 = icmp sgt i32 %10, i32 %77; + cbr i1 %78(prob = 0.75), ^while.body1, ^b2; + ^while.body2 {scalar}: + i32 %79 = phi [^scalar.header1, i32 %70] [^while.body2, i32 %92]; + i32* %80 = getelementptr &([500 * i32]* %8)[i64 0][i32 %79]; + i32 %81 = load i32* %80; + i32* %82 = getelementptr &([500 * i32]* %9)[i64 0][i32 %79]; + store i32* %82 with i32 %81; + i32* %83 = getelementptr &(i32* %80)[i64 1]; + i32 %84 = load i32* %83; + i32* %85 = getelementptr &(i32* %82)[i64 1]; + store i32* %85 with i32 %84; + i32* %86 = getelementptr &(i32* %80)[i64 2]; + i32 %87 = load i32* %86; + i32* %88 = getelementptr &(i32* %82)[i64 2]; + store i32* %88 with i32 %87; + i32* %89 = getelementptr &(i32* %80)[i64 3]; + i32 %90 = load i32* %89; + i32* %91 = getelementptr &(i32* %82)[i64 3]; + store i32* %91 with i32 %90; + i32 %92 = add i32 %79, i32 4; + i1 %93 = icmp sgt i32 %14, i32 %92; + cbr i1 %93(prob = 0.75), ^while.body2, ^scalar.header; } internal [4 * i8]* @cmmc_parallel_body_payload_0, align 
8; internal func @cmmc_parallel_body_1(i32 %0, i32 %1) -> void { NoRecurse ParallelBody } { diff --git a/tests/SysY2022/performance/large_loop_array_1.arm.s b/tests/SysY2022/performance/large_loop_array_1.arm.s index 70dfde35e..fa4cfe4f6 100644 --- a/tests/SysY2022/performance/large_loop_array_1.arm.s +++ b/tests/SysY2022/performance/large_loop_array_1.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 24 -.align 8 +.p2align 3 y: .zero 8192 -.align 8 +.p2align 3 x: .zero 8192 .text diff --git a/tests/SysY2022/performance/large_loop_array_1.riscv.s b/tests/SysY2022/performance/large_loop_array_1.riscv.s index a70a6788f..d6c341061 100644 --- a/tests/SysY2022/performance/large_loop_array_1.riscv.s +++ b/tests/SysY2022/performance/large_loop_array_1.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1045220557 .4byte 1036831949 @@ -9,158 +9,149 @@ __cmmc_fp_constant_pool: .4byte 897988541 .4byte 3045472189 .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 40 -.align 8 +.p2align 3 y: .zero 8192 -.align 8 +.p2align 3 x: .zero 8192 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[16] CalleeSaved[116] - addi sp, sp, -144 + addi sp, sp, -128 sd ra, 0(sp) - sd s7, 8(sp) - sd s3, 16(sp) - fsw f8, 24(sp) - sd s10, 32(sp) - fsw f9, 40(sp) - sd s4, 48(sp) - fsw f18, 56(sp) - sd s6, 64(sp) - sd s1, 72(sp) - sd s8, 80(sp) - sd s5, 88(sp) - sd s0, 96(sp) + sd s8, 8(sp) + fsw f8, 16(sp) + sd s11, 24(sp) + fsw f9, 32(sp) + sd s3, 40(sp) + fsw f18, 48(sp) + sd s6, 56(sp) + sd s9, 64(sp) + sd s5, 72(sp) + sd s0, 80(sp) + sd s4, 88(sp) + sd s1, 96(sp) sd s2, 104(sp) - sd s11, 112(sp) - sd s9, 120(sp) + sd s7, 112(sp) + sd s10, 120(sp) jal getint - sd a0, 128(sp) - mv s11, a0 + mv s2, a0 li a0, 22 jal _sysy_starttime -pcrel312: +pcrel309: auipc a2, %pcrel_hi(cmmc_parallel_body_0) -pcrel313: + addiw s1, s2, -3 +pcrel310: auipc a1, %pcrel_hi(y) - lui s4, 260096 -pcrel314: + lui s6, 260096 +pcrel311: auipc a0, %pcrel_hi(x) -pcrel315: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_0) - mv s10, zero - li s9, 3 +pcrel312: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) + mv s11, zero + li s10, 3 fmv.w.x f18, zero - ld s11, 128(sp) - addi s7, a2, %pcrel_lo(pcrel312) - addi s0, a1, %pcrel_lo(pcrel313) - fmv.w.x f9, s4 - addi s2, a0, %pcrel_lo(pcrel314) - addi s1, s8, %pcrel_lo(pcrel315) + addi s8, a2, %pcrel_lo(pcrel309) + addi s0, a1, %pcrel_lo(pcrel310) + fmv.w.x f9, s6 + addi s4, a0, %pcrel_lo(pcrel311) + addi s3, s9, %pcrel_lo(pcrel312) fmv.s f8, f18 - addiw a3, s11, -3 - lui a2, 419430 -pcrel316: +pcrel313: auipc a1, %pcrel_hi(__cmmc_fp_constant_pool) mv a0, zero - addiw s6, a2, 1639 - addi s3, a1, %pcrel_lo(pcrel316) - sd a3, 136(sp) + addi s5, a1, %pcrel_lo(pcrel313) lui a1, 122 - addiw s5, a1, 288 + addiw s7, a1, 288 j label92 .p2align 2 label187: fadd.s f8, f8, f10 - addiw s10, s10, 1 - bge s10, s5, label120 + addiw s11, s11, 1 + bge s11, s7, label120 .p2align 2 label92: - mul a1, s10, s6 - flw f11, 0(s3) - fmv.w.x f12, s4 + lui a4, 419430 + fmv.w.x f12, s6 + flw f11, 0(s5) + addiw a3, a4, 1639 + mul a1, s11, a3 + fadd.s f10, f9, f11 srli t0, a1, 63 srai a2, a1, 34 - fadd.s f10, f9, f11 - add a3, t0, a2 fmv.s f9, f12 + add a3, t0, a2 sh2add a4, a3, a3 slliw a5, a4, 1 - subw a2, s10, a5 + subw a2, s11, a5 sltu a1, zero, a2 - bne a1, zero, label275 + bne a1, zero, 
label274 fmv.s f9, f10 .p2align 2 -label275: - flw f12, 4(s3) +label274: + flw f12, 4(s5) fmv.w.x f11, zero fadd.s f10, f18, f12 fmv.s f18, f11 - bne a1, zero, label277 + bne a1, zero, label276 fmv.s f18, f10 .p2align 2 -label277: - ld s11, 128(sp) - ble s11, a0, label152 -pcrel317: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_0) - sw a0, %pcrel_lo(pcrel317)(s8) - sd s2, 8(s1) - fsw f18, 16(s1) - sd s0, 24(s1) - fsw f9, 32(s1) - sw s11, 36(s1) - mv a1, s11 - mv a2, s7 +label276: + ble s2, a0, label152 +pcrel314: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) + sw a0, %pcrel_lo(pcrel314)(s9) + sd s4, 8(s3) + fsw f18, 16(s3) + sd s0, 24(s3) + fsw f9, 32(s3) + sw s2, 36(s3) + mv a1, s2 + mv a2, s8 jal cmmcParallelFor - mv a0, s11 - ld s11, 128(sp) - ble s11, zero, label163 + mv a0, s2 + ble s2, zero, label163 .p2align 2 label101: - ld s11, 128(sp) - ble s11, s9, label168 + ble s2, s10, label168 fmv.w.x f10, zero - mv a1, s2 + mv a1, s4 mv a2, zero .p2align 2 label113: sh2add a3, a2, s0 - flw f12, 0(a1) + flw f13, 0(a1) addiw a2, a2, 4 flw f14, 0(a3) - flw f13, 4(a1) - fmul.s f15, f12, f14 + flw f12, 4(a1) + fmul.s f15, f13, f14 flw f14, 4(a3) - flw f12, 8(a1) - fmul.s f0, f13, f14 + flw f13, 8(a1) + fmul.s f0, f12, f14 fadd.s f11, f10, f15 flw f15, 8(a3) - flw f13, 12(a1) - fmul.s f14, f12, f15 - flw f12, 12(a3) + flw f12, 12(a1) + fmul.s f14, f13, f15 + flw f13, 12(a3) fadd.s f10, f11, f0 - ld a3, 136(sp) fadd.s f11, f10, f14 - fmul.s f14, f13, f12 + fmul.s f14, f12, f13 fadd.s f10, f11, f14 - ble a3, a2, label218 + ble s1, a2, label218 addi a1, a1, 16 j label113 .p2align 2 label218: fmv.s f11, f10 - ld s11, 128(sp) - ble s11, a2, label300 + ble s2, a2, label297 .p2align 2 label106: - sh2add a1, a2, s2 + sh2add a1, a2, s4 fmv.s f10, f11 .p2align 2 label107: @@ -168,10 +159,9 @@ label107: flw f11, 0(a1) addiw a2, a2, 1 flw f12, 0(a3) - ld s11, 128(sp) fmul.s f13, f11, f12 fadd.s f10, f10, f13 - ble s11, a2, label187 + ble s2, a2, label187 addi a1, a1, 4 j label107 .p2align 2 @@ -179,26 +169,24 @@ label168: fmv.w.x f10, zero mv a2, zero fmv.s f11, f10 - ld s11, 128(sp) - bgt s11, zero, label106 + bgt s2, zero, label106 fadd.s f8, f8, f10 - addiw s10, s10, 1 - blt s10, s5, label92 + addiw s11, s11, 1 + blt s11, s7, label92 j label120 .p2align 2 label152: - ld s11, 128(sp) - bgt s11, zero, label101 + bgt s2, zero, label101 fmv.w.x f10, zero - addiw s10, s10, 1 + addiw s11, s11, 1 fadd.s f8, f8, f10 - blt s10, s5, label92 + blt s11, s7, label92 label120: - flw f12, 8(s3) + flw f12, 8(s5) li a0, 39 - flw f11, 12(s3) + flw f11, 12(s5) fsub.s f10, f8, f12 - flw f12, 16(s3) + flw f12, 16(s5) fle.s a1, f10, f11 fle.s a2, f12, f10 or s0, a1, a2 @@ -209,22 +197,22 @@ label120: mv a0, zero label121: ld ra, 0(sp) - ld s7, 8(sp) - ld s3, 16(sp) - flw f8, 24(sp) - ld s10, 32(sp) - flw f9, 40(sp) - ld s4, 48(sp) - flw f18, 56(sp) - ld s6, 64(sp) - ld s1, 72(sp) - ld s8, 80(sp) - ld s5, 88(sp) - ld s0, 96(sp) + ld s8, 8(sp) + flw f8, 16(sp) + ld s11, 24(sp) + flw f9, 32(sp) + ld s3, 40(sp) + flw f18, 48(sp) + ld s6, 56(sp) + ld s9, 64(sp) + ld s5, 72(sp) + ld s0, 80(sp) + ld s4, 88(sp) + ld s1, 96(sp) ld s2, 104(sp) - ld s11, 112(sp) - ld s9, 120(sp) - addi sp, sp, 144 + ld s7, 112(sp) + ld s10, 120(sp) + addi sp, sp, 128 ret label123: li a0, 1 @@ -232,17 +220,17 @@ label123: li a0, 1 j label121 .p2align 2 -label300: +label297: fadd.s f8, f8, f10 - addiw s10, s10, 1 - blt s10, s5, label92 + addiw s11, s11, 1 + blt s11, s7, label92 j label120 .p2align 2 label163: fmv.w.x f10, zero - addiw s10, s10, 1 
+ addiw s11, s11, 1 fadd.s f8, f8, f10 - blt s10, s5, label92 + blt s11, s7, label92 j label120 .p2align 2 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/large_loop_array_2.arm.s b/tests/SysY2022/performance/large_loop_array_2.arm.s index 08bc9ab41..99e570124 100644 --- a/tests/SysY2022/performance/large_loop_array_2.arm.s +++ b/tests/SysY2022/performance/large_loop_array_2.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 24 -.align 8 +.p2align 3 y: .zero 16384 -.align 8 +.p2align 3 x: .zero 16384 .text diff --git a/tests/SysY2022/performance/large_loop_array_2.riscv.s b/tests/SysY2022/performance/large_loop_array_2.riscv.s index 4e1d41440..c0dbba957 100644 --- a/tests/SysY2022/performance/large_loop_array_2.riscv.s +++ b/tests/SysY2022/performance/large_loop_array_2.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1045220557 .4byte 1036831949 @@ -9,158 +9,149 @@ __cmmc_fp_constant_pool: .4byte 897988541 .4byte 3045472189 .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 40 -.align 8 +.p2align 3 y: .zero 16384 -.align 8 +.p2align 3 x: .zero 16384 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[16] CalleeSaved[116] - addi sp, sp, -144 + addi sp, sp, -128 sd ra, 0(sp) - sd s7, 8(sp) - sd s3, 16(sp) - fsw f8, 24(sp) - sd s10, 32(sp) - fsw f9, 40(sp) - sd s4, 48(sp) - fsw f18, 56(sp) - sd s6, 64(sp) - sd s1, 72(sp) - sd s8, 80(sp) - sd s5, 88(sp) - sd s0, 96(sp) + sd s8, 8(sp) + fsw f8, 16(sp) + sd s11, 24(sp) + fsw f9, 32(sp) + sd s3, 40(sp) + fsw f18, 48(sp) + sd s6, 56(sp) + sd s9, 64(sp) + sd s5, 72(sp) + sd s0, 80(sp) + sd s4, 88(sp) + sd s1, 96(sp) sd s2, 104(sp) - sd s11, 112(sp) - sd s9, 120(sp) + sd s7, 112(sp) + sd s10, 120(sp) jal getint - sd a0, 128(sp) - mv s11, a0 + mv s2, a0 li a0, 22 jal _sysy_starttime -pcrel312: +pcrel309: auipc a2, %pcrel_hi(cmmc_parallel_body_0) -pcrel313: + addiw s1, s2, -3 +pcrel310: auipc a1, %pcrel_hi(y) - lui s4, 260096 -pcrel314: + lui s6, 260096 +pcrel311: auipc a0, %pcrel_hi(x) -pcrel315: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_0) - mv s10, zero - li s9, 3 +pcrel312: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) + mv s11, zero + li s10, 3 fmv.w.x f18, zero - ld s11, 128(sp) - addi s7, a2, %pcrel_lo(pcrel312) - addi s0, a1, %pcrel_lo(pcrel313) - fmv.w.x f9, s4 - addi s2, a0, %pcrel_lo(pcrel314) - addi s1, s8, %pcrel_lo(pcrel315) + addi s8, a2, %pcrel_lo(pcrel309) + addi s0, a1, %pcrel_lo(pcrel310) + fmv.w.x f9, s6 + addi s4, a0, %pcrel_lo(pcrel311) + addi s3, s9, %pcrel_lo(pcrel312) fmv.s f8, f18 - addiw a3, s11, -3 - lui a2, 419430 -pcrel316: +pcrel313: auipc a1, %pcrel_hi(__cmmc_fp_constant_pool) mv a0, zero - addiw s6, a2, 1639 - addi s3, a1, %pcrel_lo(pcrel316) - sd a3, 136(sp) + addi s5, a1, %pcrel_lo(pcrel313) lui a1, 122 - addiw s5, a1, 288 + addiw s7, a1, 288 j label92 .p2align 2 label187: fadd.s f8, f8, f10 - addiw s10, s10, 1 - bge s10, s5, label120 + addiw s11, s11, 1 + bge s11, s7, label120 .p2align 2 label92: - mul a1, s10, s6 - flw f11, 0(s3) - fmv.w.x f12, s4 + lui a4, 419430 + fmv.w.x f12, s6 + flw f11, 0(s5) + addiw a3, a4, 1639 + mul a1, s11, a3 + fadd.s f10, f9, f11 srli t0, a1, 63 srai a2, a1, 34 - fadd.s f10, f9, f11 - add a3, t0, a2 fmv.s f9, f12 + add a3, t0, a2 sh2add a4, a3, a3 slliw a5, a4, 1 - subw a2, s10, a5 + subw a2, s11, a5 sltu a1, zero, a2 - bne 
a1, zero, label275 + bne a1, zero, label274 fmv.s f9, f10 .p2align 2 -label275: - flw f12, 4(s3) +label274: + flw f12, 4(s5) fmv.w.x f11, zero fadd.s f10, f18, f12 fmv.s f18, f11 - bne a1, zero, label277 + bne a1, zero, label276 fmv.s f18, f10 .p2align 2 -label277: - ld s11, 128(sp) - ble s11, a0, label152 -pcrel317: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_0) - sw a0, %pcrel_lo(pcrel317)(s8) - sd s2, 8(s1) - fsw f18, 16(s1) - sd s0, 24(s1) - fsw f9, 32(s1) - sw s11, 36(s1) - mv a1, s11 - mv a2, s7 +label276: + ble s2, a0, label152 +pcrel314: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) + sw a0, %pcrel_lo(pcrel314)(s9) + sd s4, 8(s3) + fsw f18, 16(s3) + sd s0, 24(s3) + fsw f9, 32(s3) + sw s2, 36(s3) + mv a1, s2 + mv a2, s8 jal cmmcParallelFor - mv a0, s11 - ld s11, 128(sp) - ble s11, zero, label163 + mv a0, s2 + ble s2, zero, label163 .p2align 2 label101: - ld s11, 128(sp) - ble s11, s9, label168 + ble s2, s10, label168 fmv.w.x f10, zero - mv a1, s2 + mv a1, s4 mv a2, zero .p2align 2 label113: sh2add a3, a2, s0 - flw f12, 0(a1) + flw f13, 0(a1) addiw a2, a2, 4 flw f14, 0(a3) - flw f13, 4(a1) - fmul.s f15, f12, f14 + flw f12, 4(a1) + fmul.s f15, f13, f14 flw f14, 4(a3) - flw f12, 8(a1) - fmul.s f0, f13, f14 + flw f13, 8(a1) + fmul.s f0, f12, f14 fadd.s f11, f10, f15 flw f15, 8(a3) - flw f13, 12(a1) - fmul.s f14, f12, f15 - flw f12, 12(a3) + flw f12, 12(a1) + fmul.s f14, f13, f15 + flw f13, 12(a3) fadd.s f10, f11, f0 - ld a3, 136(sp) fadd.s f11, f10, f14 - fmul.s f14, f13, f12 + fmul.s f14, f12, f13 fadd.s f10, f11, f14 - ble a3, a2, label218 + ble s1, a2, label218 addi a1, a1, 16 j label113 .p2align 2 label218: fmv.s f11, f10 - ld s11, 128(sp) - ble s11, a2, label300 + ble s2, a2, label297 .p2align 2 label106: - sh2add a1, a2, s2 + sh2add a1, a2, s4 fmv.s f10, f11 .p2align 2 label107: @@ -168,10 +159,9 @@ label107: flw f11, 0(a1) addiw a2, a2, 1 flw f12, 0(a3) - ld s11, 128(sp) fmul.s f13, f11, f12 fadd.s f10, f10, f13 - ble s11, a2, label187 + ble s2, a2, label187 addi a1, a1, 4 j label107 .p2align 2 @@ -179,26 +169,24 @@ label168: fmv.w.x f10, zero mv a2, zero fmv.s f11, f10 - ld s11, 128(sp) - bgt s11, zero, label106 + bgt s2, zero, label106 fadd.s f8, f8, f10 - addiw s10, s10, 1 - blt s10, s5, label92 + addiw s11, s11, 1 + blt s11, s7, label92 j label120 .p2align 2 label152: - ld s11, 128(sp) - bgt s11, zero, label101 + bgt s2, zero, label101 fmv.w.x f10, zero - addiw s10, s10, 1 + addiw s11, s11, 1 fadd.s f8, f8, f10 - blt s10, s5, label92 + blt s11, s7, label92 label120: - flw f12, 8(s3) + flw f12, 8(s5) li a0, 39 - flw f11, 12(s3) + flw f11, 12(s5) fsub.s f10, f8, f12 - flw f12, 16(s3) + flw f12, 16(s5) fle.s a1, f10, f11 fle.s a2, f12, f10 or s0, a1, a2 @@ -209,22 +197,22 @@ label120: mv a0, zero label121: ld ra, 0(sp) - ld s7, 8(sp) - ld s3, 16(sp) - flw f8, 24(sp) - ld s10, 32(sp) - flw f9, 40(sp) - ld s4, 48(sp) - flw f18, 56(sp) - ld s6, 64(sp) - ld s1, 72(sp) - ld s8, 80(sp) - ld s5, 88(sp) - ld s0, 96(sp) + ld s8, 8(sp) + flw f8, 16(sp) + ld s11, 24(sp) + flw f9, 32(sp) + ld s3, 40(sp) + flw f18, 48(sp) + ld s6, 56(sp) + ld s9, 64(sp) + ld s5, 72(sp) + ld s0, 80(sp) + ld s4, 88(sp) + ld s1, 96(sp) ld s2, 104(sp) - ld s11, 112(sp) - ld s9, 120(sp) - addi sp, sp, 144 + ld s7, 112(sp) + ld s10, 120(sp) + addi sp, sp, 128 ret label123: li a0, 1 @@ -232,17 +220,17 @@ label123: li a0, 1 j label121 .p2align 2 -label300: +label297: fadd.s f8, f8, f10 - addiw s10, s10, 1 - blt s10, s5, label92 + addiw s11, s11, 1 + blt s11, s7, label92 j label120 .p2align 2 label163: 
fmv.w.x f10, zero - addiw s10, s10, 1 + addiw s11, s11, 1 fadd.s f8, f8, f10 - blt s10, s5, label92 + blt s11, s7, label92 j label120 .p2align 2 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/large_loop_array_3.arm.s b/tests/SysY2022/performance/large_loop_array_3.arm.s index b5b3f40ec..eb8aa29b8 100644 --- a/tests/SysY2022/performance/large_loop_array_3.arm.s +++ b/tests/SysY2022/performance/large_loop_array_3.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 24 -.align 8 +.p2align 3 y: .zero 32768 -.align 8 +.p2align 3 x: .zero 32768 .text diff --git a/tests/SysY2022/performance/large_loop_array_3.riscv.s b/tests/SysY2022/performance/large_loop_array_3.riscv.s index 009b284ed..ad3861315 100644 --- a/tests/SysY2022/performance/large_loop_array_3.riscv.s +++ b/tests/SysY2022/performance/large_loop_array_3.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1045220557 .4byte 1036831949 @@ -9,71 +9,65 @@ __cmmc_fp_constant_pool: .4byte 897988541 .4byte 3045472189 .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 40 -.align 8 +.p2align 3 y: .zero 32768 -.align 8 +.p2align 3 x: .zero 32768 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[16] CalleeSaved[116] - addi sp, sp, -144 + addi sp, sp, -128 sd ra, 0(sp) - sd s7, 8(sp) - sd s3, 16(sp) - fsw f8, 24(sp) - sd s10, 32(sp) - fsw f9, 40(sp) - sd s4, 48(sp) - fsw f18, 56(sp) - sd s6, 64(sp) - sd s1, 72(sp) - sd s8, 80(sp) - sd s5, 88(sp) - sd s0, 96(sp) + sd s8, 8(sp) + fsw f8, 16(sp) + sd s11, 24(sp) + fsw f9, 32(sp) + sd s3, 40(sp) + fsw f18, 48(sp) + sd s6, 56(sp) + sd s9, 64(sp) + sd s5, 72(sp) + sd s0, 80(sp) + sd s4, 88(sp) + sd s1, 96(sp) sd s2, 104(sp) - sd s11, 112(sp) - sd s9, 120(sp) + sd s7, 112(sp) + sd s10, 120(sp) jal getint - sd a0, 128(sp) - mv s11, a0 + mv s2, a0 li a0, 22 jal _sysy_starttime -pcrel311: +pcrel308: auipc a2, %pcrel_hi(cmmc_parallel_body_0) -pcrel312: + addiw s1, s2, -3 +pcrel309: auipc a1, %pcrel_hi(y) - lui s4, 260096 -pcrel313: + lui s6, 260096 +pcrel310: auipc a0, %pcrel_hi(x) -pcrel314: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_0) - mv s10, zero - li s9, 3 +pcrel311: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) + mv s11, zero + li s10, 3 fmv.w.x f18, zero - ld s11, 128(sp) - addi s7, a2, %pcrel_lo(pcrel311) - addi s0, a1, %pcrel_lo(pcrel312) - fmv.w.x f9, s4 - addi s2, a0, %pcrel_lo(pcrel313) - addi s1, s8, %pcrel_lo(pcrel314) + addi s8, a2, %pcrel_lo(pcrel308) + addi s0, a1, %pcrel_lo(pcrel309) + fmv.w.x f9, s6 + addi s4, a0, %pcrel_lo(pcrel310) + addi s3, s9, %pcrel_lo(pcrel311) fmv.s f8, f18 - addiw a3, s11, -3 - lui a2, 419430 -pcrel315: +pcrel312: auipc a1, %pcrel_hi(__cmmc_fp_constant_pool) mv a0, zero - addiw s6, a2, 1639 - addi s3, a1, %pcrel_lo(pcrel315) - sd a3, 136(sp) + addi s5, a1, %pcrel_lo(pcrel312) lui a1, 24 - addiw s5, a1, 1696 + addiw s7, a1, 1696 j label92 .p2align 2 label111: @@ -84,94 +78,90 @@ label107: flw f11, 0(a1) addiw a2, a2, 1 flw f12, 0(a3) - ld s11, 128(sp) fmul.s f13, f11, f12 fadd.s f10, f10, f13 - bgt s11, a2, label111 + bgt s2, a2, label111 fadd.s f8, f8, f10 - addiw s10, s10, 1 - bge s10, s5, label120 + addiw s11, s11, 1 + bge s11, s7, label120 .p2align 2 label92: - mul a1, s10, s6 - flw f11, 0(s3) - fmv.w.x f12, s4 + lui a4, 419430 + fmv.w.x f12, s6 + flw f11, 0(s5) + addiw a3, a4, 1639 + mul a1, s11, a3 
+ fadd.s f10, f9, f11 srli t0, a1, 63 srai a2, a1, 34 - fadd.s f10, f9, f11 - add a3, t0, a2 fmv.s f9, f12 + add a3, t0, a2 sh2add a4, a3, a3 slliw a5, a4, 1 - subw a2, s10, a5 + subw a2, s11, a5 sltu a1, zero, a2 - bne a1, zero, label275 + bne a1, zero, label274 fmv.s f9, f10 .p2align 2 -label275: - flw f12, 4(s3) +label274: + flw f12, 4(s5) fmv.w.x f11, zero fadd.s f10, f18, f12 fmv.s f18, f11 - bne a1, zero, label277 + bne a1, zero, label276 fmv.s f18, f10 .p2align 2 -label277: - ld s11, 128(sp) - ble s11, a0, label152 -pcrel316: - auipc s8, %pcrel_hi(cmmc_parallel_body_payload_0) - sw a0, %pcrel_lo(pcrel316)(s8) - sd s2, 8(s1) - fsw f18, 16(s1) - sd s0, 24(s1) - fsw f9, 32(s1) - sw s11, 36(s1) - mv a1, s11 - mv a2, s7 +label276: + ble s2, a0, label152 +pcrel313: + auipc s9, %pcrel_hi(cmmc_parallel_body_payload_0) + sw a0, %pcrel_lo(pcrel313)(s9) + sd s4, 8(s3) + fsw f18, 16(s3) + sd s0, 24(s3) + fsw f9, 32(s3) + sw s2, 36(s3) + mv a1, s2 + mv a2, s8 jal cmmcParallelFor - mv a0, s11 - ld s11, 128(sp) - ble s11, zero, label163 + mv a0, s2 + ble s2, zero, label163 .p2align 2 label101: - ld s11, 128(sp) - ble s11, s9, label168 + ble s2, s10, label168 fmv.w.x f10, zero - mv a1, s2 + mv a1, s4 mv a2, zero .p2align 2 label113: sh2add a3, a2, s0 - flw f12, 0(a1) + flw f13, 0(a1) addiw a2, a2, 4 flw f14, 0(a3) - flw f13, 4(a1) - fmul.s f15, f12, f14 + flw f12, 4(a1) + fmul.s f15, f13, f14 flw f14, 4(a3) - flw f12, 8(a1) - fmul.s f0, f13, f14 + flw f13, 8(a1) + fmul.s f0, f12, f14 fadd.s f11, f10, f15 flw f15, 8(a3) - flw f13, 12(a1) - fmul.s f14, f12, f15 - flw f12, 12(a3) + flw f12, 12(a1) + fmul.s f14, f13, f15 + flw f13, 12(a3) fadd.s f10, f11, f0 - ld a3, 136(sp) fadd.s f11, f10, f14 - fmul.s f14, f13, f12 + fmul.s f14, f12, f13 fadd.s f10, f11, f14 - ble a3, a2, label218 + ble s1, a2, label218 addi a1, a1, 16 j label113 .p2align 2 label218: fmv.s f11, f10 - ld s11, 128(sp) - ble s11, a2, label300 + ble s2, a2, label297 .p2align 2 label106: - sh2add a1, a2, s2 + sh2add a1, a2, s4 fmv.s f10, f11 j label107 .p2align 2 @@ -179,26 +169,24 @@ label168: fmv.w.x f10, zero mv a2, zero fmv.s f11, f10 - ld s11, 128(sp) - bgt s11, zero, label106 + bgt s2, zero, label106 fadd.s f8, f8, f10 - addiw s10, s10, 1 - blt s10, s5, label92 + addiw s11, s11, 1 + blt s11, s7, label92 j label120 .p2align 2 label152: - ld s11, 128(sp) - bgt s11, zero, label101 + bgt s2, zero, label101 fmv.w.x f10, zero - addiw s10, s10, 1 + addiw s11, s11, 1 fadd.s f8, f8, f10 - blt s10, s5, label92 + blt s11, s7, label92 label120: - flw f12, 8(s3) + flw f12, 8(s5) li a0, 39 - flw f11, 12(s3) + flw f11, 12(s5) fsub.s f10, f8, f12 - flw f12, 16(s3) + flw f12, 16(s5) fle.s a1, f10, f11 fle.s a2, f12, f10 or s0, a1, a2 @@ -209,22 +197,22 @@ label120: mv a0, zero label121: ld ra, 0(sp) - ld s7, 8(sp) - ld s3, 16(sp) - flw f8, 24(sp) - ld s10, 32(sp) - flw f9, 40(sp) - ld s4, 48(sp) - flw f18, 56(sp) - ld s6, 64(sp) - ld s1, 72(sp) - ld s8, 80(sp) - ld s5, 88(sp) - ld s0, 96(sp) + ld s8, 8(sp) + flw f8, 16(sp) + ld s11, 24(sp) + flw f9, 32(sp) + ld s3, 40(sp) + flw f18, 48(sp) + ld s6, 56(sp) + ld s9, 64(sp) + ld s5, 72(sp) + ld s0, 80(sp) + ld s4, 88(sp) + ld s1, 96(sp) ld s2, 104(sp) - ld s11, 112(sp) - ld s9, 120(sp) - addi sp, sp, 144 + ld s7, 112(sp) + ld s10, 120(sp) + addi sp, sp, 128 ret label123: li a0, 1 @@ -232,17 +220,17 @@ label123: li a0, 1 j label121 .p2align 2 -label300: +label297: fadd.s f8, f8, f10 - addiw s10, s10, 1 - blt s10, s5, label92 + addiw s11, s11, 1 + blt s11, s7, label92 j label120 .p2align 2 label163: 
fmv.w.x f10, zero - addiw s10, s10, 1 + addiw s11, s11, 1 fadd.s f8, f8, f10 - blt s10, s5, label92 + blt s11, s7, label92 j label120 .p2align 2 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/layernorm1.arm.s b/tests/SysY2022/performance/layernorm1.arm.s index a4799b737..a458d2dec 100644 --- a/tests/SysY2022/performance/layernorm1.arm.s +++ b/tests/SysY2022/performance/layernorm1.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 8 -.align 8 +.p2align 3 var: .zero 4000 -.align 8 +.p2align 3 mean: .zero 4000 .text diff --git a/tests/SysY2022/performance/layernorm1.riscv.s b/tests/SysY2022/performance/layernorm1.riscv.s index 953361289..d6997250d 100644 --- a/tests/SysY2022/performance/layernorm1.riscv.s +++ b/tests/SysY2022/performance/layernorm1.riscv.s @@ -1,24 +1,24 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1008981770 .4byte 981668463 .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 16 -.align 8 +.p2align 3 var: .zero 4000 -.align 8 +.p2align 3 mean: .zero 4000 .text @@ -26,27 +26,27 @@ mean: .globl main main: addi sp, sp, -104 -pcrel1175: +pcrel1173: auipc a0, %pcrel_hi(a) -pcrel1176: +pcrel1174: auipc a1, %pcrel_hi(var) sd ra, 0(sp) sd s4, 8(sp) - addi s4, a0, %pcrel_lo(pcrel1175) + addi s4, a0, %pcrel_lo(pcrel1173) sd s7, 16(sp) -pcrel1177: +pcrel1175: auipc a0, %pcrel_hi(mean) -pcrel1178: +pcrel1176: auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) sd s8, 24(sp) - addi s8, s7, %pcrel_lo(pcrel1178) + addi s8, s7, %pcrel_lo(pcrel1176) sd s6, 32(sp) - addi s6, a0, %pcrel_lo(pcrel1177) + addi s6, a0, %pcrel_lo(pcrel1175) sd s1, 40(sp) li a0, 125 sd s5, 48(sp) slli s1, a0, 5 - addi s5, a1, %pcrel_lo(pcrel1176) + addi s5, a1, %pcrel_lo(pcrel1174) sd s0, 56(sp) li s0, 1000 sd s9, 64(sp) @@ -75,24 +75,24 @@ label606: label796: addiw s9, s9, 1 li a0, 100 - bge s9, a0, label1170 + bge s9, a0, label1168 fmv.s f8, f12 label607: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel1179: +pcrel1177: auipc a3, %pcrel_hi(cmmc_parallel_body_0) sd s6, %pcrel_lo(label607)(a0) - addi a2, a3, %pcrel_lo(pcrel1179) + addi a2, a3, %pcrel_lo(pcrel1177) mv a1, s0 mv a0, zero jal cmmcParallelFor mv a0, zero -pcrel1180: +pcrel1178: auipc a3, %pcrel_hi(cmmc_parallel_body_1) -pcrel1181: +pcrel1179: auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) - sd s5, %pcrel_lo(pcrel1181)(s7) - addi a2, a3, %pcrel_lo(pcrel1180) + sd s5, %pcrel_lo(pcrel1179)(s7) + addi a2, a3, %pcrel_lo(pcrel1178) sd s6, 8(s8) mv a1, s0 jal cmmcParallelFor @@ -264,7 +264,7 @@ label635: flw f8, 96(sp) addi sp, sp, 104 ret -label1170: +label1168: mv a1, zero fmv.s f10, f12 mv a0, s4 diff --git a/tests/SysY2022/performance/layernorm2.arm.s b/tests/SysY2022/performance/layernorm2.arm.s index a4799b737..a458d2dec 100644 --- a/tests/SysY2022/performance/layernorm2.arm.s +++ b/tests/SysY2022/performance/layernorm2.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 8 -.align 8 +.p2align 3 var: .zero 4000 -.align 8 +.p2align 3 mean: .zero 4000 .text diff --git 
a/tests/SysY2022/performance/layernorm2.riscv.s b/tests/SysY2022/performance/layernorm2.riscv.s index 953361289..d6997250d 100644 --- a/tests/SysY2022/performance/layernorm2.riscv.s +++ b/tests/SysY2022/performance/layernorm2.riscv.s @@ -1,24 +1,24 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1008981770 .4byte 981668463 .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 16 -.align 8 +.p2align 3 var: .zero 4000 -.align 8 +.p2align 3 mean: .zero 4000 .text @@ -26,27 +26,27 @@ mean: .globl main main: addi sp, sp, -104 -pcrel1175: +pcrel1173: auipc a0, %pcrel_hi(a) -pcrel1176: +pcrel1174: auipc a1, %pcrel_hi(var) sd ra, 0(sp) sd s4, 8(sp) - addi s4, a0, %pcrel_lo(pcrel1175) + addi s4, a0, %pcrel_lo(pcrel1173) sd s7, 16(sp) -pcrel1177: +pcrel1175: auipc a0, %pcrel_hi(mean) -pcrel1178: +pcrel1176: auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) sd s8, 24(sp) - addi s8, s7, %pcrel_lo(pcrel1178) + addi s8, s7, %pcrel_lo(pcrel1176) sd s6, 32(sp) - addi s6, a0, %pcrel_lo(pcrel1177) + addi s6, a0, %pcrel_lo(pcrel1175) sd s1, 40(sp) li a0, 125 sd s5, 48(sp) slli s1, a0, 5 - addi s5, a1, %pcrel_lo(pcrel1176) + addi s5, a1, %pcrel_lo(pcrel1174) sd s0, 56(sp) li s0, 1000 sd s9, 64(sp) @@ -75,24 +75,24 @@ label606: label796: addiw s9, s9, 1 li a0, 100 - bge s9, a0, label1170 + bge s9, a0, label1168 fmv.s f8, f12 label607: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel1179: +pcrel1177: auipc a3, %pcrel_hi(cmmc_parallel_body_0) sd s6, %pcrel_lo(label607)(a0) - addi a2, a3, %pcrel_lo(pcrel1179) + addi a2, a3, %pcrel_lo(pcrel1177) mv a1, s0 mv a0, zero jal cmmcParallelFor mv a0, zero -pcrel1180: +pcrel1178: auipc a3, %pcrel_hi(cmmc_parallel_body_1) -pcrel1181: +pcrel1179: auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) - sd s5, %pcrel_lo(pcrel1181)(s7) - addi a2, a3, %pcrel_lo(pcrel1180) + sd s5, %pcrel_lo(pcrel1179)(s7) + addi a2, a3, %pcrel_lo(pcrel1178) sd s6, 8(s8) mv a1, s0 jal cmmcParallelFor @@ -264,7 +264,7 @@ label635: flw f8, 96(sp) addi sp, sp, 104 ret -label1170: +label1168: mv a1, zero fmv.s f10, f12 mv a0, s4 diff --git a/tests/SysY2022/performance/layernorm3.arm.s b/tests/SysY2022/performance/layernorm3.arm.s index a4799b737..a458d2dec 100644 --- a/tests/SysY2022/performance/layernorm3.arm.s +++ b/tests/SysY2022/performance/layernorm3.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 8 -.align 8 +.p2align 3 var: .zero 4000 -.align 8 +.p2align 3 mean: .zero 4000 .text diff --git a/tests/SysY2022/performance/layernorm3.riscv.s b/tests/SysY2022/performance/layernorm3.riscv.s index 953361289..d6997250d 100644 --- a/tests/SysY2022/performance/layernorm3.riscv.s +++ b/tests/SysY2022/performance/layernorm3.riscv.s @@ -1,24 +1,24 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1008981770 .4byte 981668463 .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_1: .zero 16 -.align 8 +.p2align 3 var: .zero 4000 -.align 8 +.p2align 3 mean: .zero 4000 .text @@ -26,27 +26,27 @@ mean: .globl main 
main: addi sp, sp, -104 -pcrel1175: +pcrel1173: auipc a0, %pcrel_hi(a) -pcrel1176: +pcrel1174: auipc a1, %pcrel_hi(var) sd ra, 0(sp) sd s4, 8(sp) - addi s4, a0, %pcrel_lo(pcrel1175) + addi s4, a0, %pcrel_lo(pcrel1173) sd s7, 16(sp) -pcrel1177: +pcrel1175: auipc a0, %pcrel_hi(mean) -pcrel1178: +pcrel1176: auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) sd s8, 24(sp) - addi s8, s7, %pcrel_lo(pcrel1178) + addi s8, s7, %pcrel_lo(pcrel1176) sd s6, 32(sp) - addi s6, a0, %pcrel_lo(pcrel1177) + addi s6, a0, %pcrel_lo(pcrel1175) sd s1, 40(sp) li a0, 125 sd s5, 48(sp) slli s1, a0, 5 - addi s5, a1, %pcrel_lo(pcrel1176) + addi s5, a1, %pcrel_lo(pcrel1174) sd s0, 56(sp) li s0, 1000 sd s9, 64(sp) @@ -75,24 +75,24 @@ label606: label796: addiw s9, s9, 1 li a0, 100 - bge s9, a0, label1170 + bge s9, a0, label1168 fmv.s f8, f12 label607: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel1179: +pcrel1177: auipc a3, %pcrel_hi(cmmc_parallel_body_0) sd s6, %pcrel_lo(label607)(a0) - addi a2, a3, %pcrel_lo(pcrel1179) + addi a2, a3, %pcrel_lo(pcrel1177) mv a1, s0 mv a0, zero jal cmmcParallelFor mv a0, zero -pcrel1180: +pcrel1178: auipc a3, %pcrel_hi(cmmc_parallel_body_1) -pcrel1181: +pcrel1179: auipc s7, %pcrel_hi(cmmc_parallel_body_payload_1) - sd s5, %pcrel_lo(pcrel1181)(s7) - addi a2, a3, %pcrel_lo(pcrel1180) + sd s5, %pcrel_lo(pcrel1179)(s7) + addi a2, a3, %pcrel_lo(pcrel1178) sd s6, 8(s8) mv a1, s0 jal cmmcParallelFor @@ -264,7 +264,7 @@ label635: flw f8, 96(sp) addi sp, sp, 104 ret -label1170: +label1168: mv a1, zero fmv.s f10, f12 mv a0, s4 diff --git a/tests/SysY2022/performance/matmul1.arm.s b/tests/SysY2022/performance/matmul1.arm.s index 721ecd82d..93911d3e2 100644 --- a/tests/SysY2022/performance/matmul1.arm.s +++ b/tests/SysY2022/performance/matmul1.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 b: .zero 4000000 -.align 8 +.p2align 3 c: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 12 .text diff --git a/tests/SysY2022/performance/matmul1.riscv.s b/tests/SysY2022/performance/matmul1.riscv.s index 5036d87ef..e17b7c81c 100644 --- a/tests/SysY2022/performance/matmul1.riscv.s +++ b/tests/SysY2022/performance/matmul1.riscv.s @@ -1,485 +1,473 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 b: .zero 4000000 -.align 8 +.p2align 3 c: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 12 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 16 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[16] CalleeSaved[104] - addi sp, sp, -120 -pcrel2002: + addi sp, sp, -88 +pcrel1982: auipc a0, %pcrel_hi(a) -pcrel2003: +pcrel1983: auipc a1, %pcrel_hi(cmmc_parallel_body_3) sd ra, 0(sp) - addi a2, a1, %pcrel_lo(pcrel2003) - sd s7, 8(sp) - li a1, 875 - addi s7, a0, %pcrel_lo(pcrel2002) - sd s9, 16(sp) -pcrel2004: + sd s5, 8(sp) + addi s5, a0, %pcrel_lo(pcrel1982) + sd s0, 16(sp) +pcrel1984: auipc a0, %pcrel_hi(c) -pcrel2005: - auipc s9, %pcrel_hi(cmmc_parallel_body_payload_2) - sd s10, 24(sp) - addi s10, s9, %pcrel_lo(pcrel2005) - sd s0, 32(sp) - addi s0, a0, %pcrel_lo(pcrel2004) - sd s5, 40(sp) -pcrel2006: + sd s7, 24(sp) +pcrel1985: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_2) + sd s8, 32(sp) + addi s8, s7, %pcrel_lo(pcrel1985) + sd s1, 40(sp) + addi s1, a0, %pcrel_lo(pcrel1984) + sd s6, 
48(sp) +pcrel1986: auipc a0, %pcrel_hi(cmmc_parallel_body_2) - sd s11, 48(sp) - mv s11, zero - sd s8, 56(sp) - addi s8, a0, %pcrel_lo(pcrel2006) - sd s1, 64(sp) + sd s9, 56(sp) + addi s6, a0, %pcrel_lo(pcrel1986) + mv s9, zero li a0, 125 - sd s6, 72(sp) - slli a5, a0, 5 - slli s6, a1, 5 - slli s1, a5, 1 - sd s2, 80(sp) - sh1add s2, a5, a5 - sd s3, 88(sp) - slli s5, s2, 1 - slli s3, s1, 1 - sd s4, 96(sp) - sh2add s4, a5, a5 - sd a2, 104(sp) - sd a5, 112(sp) -label1358: + sd s2, 64(sp) + slli s0, a0, 5 + addi s2, a1, %pcrel_lo(pcrel1983) + sd s3, 72(sp) + slli s3, s0, 1 + sd s4, 80(sp) + sh1add s4, s0, s0 +label1347: li a0, 1000 - bge s11, a0, label1365 - mv a0, s7 + bge s9, a0, label1354 + mv a0, s5 jal getarray li a1, 1000 - bne a0, a1, label1363 - addiw s11, s11, 1 - ld a5, 112(sp) - add s7, s7, a5 - j label1358 + bne a0, a1, label1352 + addiw s9, s9, 1 + add s5, s5, s0 + j label1347 .p2align 2 -label1389: - ld a5, 112(sp) - add s0, s0, a5 +label1378: + add s1, s1, s0 .p2align 2 -label1384: - auipc s3, %pcrel_hi(cmmc_parallel_body_payload_3) +label1373: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) mv a0, zero li a1, 1000 - sw s4, %pcrel_lo(label1384)(s3) - sw s4, 4(s1) - sd s0, 8(s1) - ld a2, 104(sp) + sw s6, %pcrel_lo(label1373)(s5) + sw s6, 4(s3) + sd s1, 8(s3) + mv a2, s2 jal cmmcParallelFor li a0, 1000 - addiw s2, s2, 1 - lw s4, %pcrel_lo(label1384)(s3) - blt s2, a0, label1389 + addiw s4, s4, 1 + lw s6, %pcrel_lo(label1373)(s5) + blt s4, a0, label1378 li a0, 92 jal _sysy_stoptime - mv a0, s4 + mv a0, s6 jal putint mv a0, zero -label1363: +label1352: ld ra, 0(sp) - ld s7, 8(sp) - ld s9, 16(sp) - ld s10, 24(sp) - ld s0, 32(sp) - ld s5, 40(sp) - ld s11, 48(sp) - ld s8, 56(sp) - ld s1, 64(sp) - ld s6, 72(sp) - ld s2, 80(sp) - ld s3, 88(sp) - ld s4, 96(sp) - addi sp, sp, 120 + ld s5, 8(sp) + ld s0, 16(sp) + ld s7, 24(sp) + ld s8, 32(sp) + ld s1, 40(sp) + ld s6, 48(sp) + ld s9, 56(sp) + ld s2, 64(sp) + ld s3, 72(sp) + ld s4, 80(sp) + addi sp, sp, 88 ret -label1365: +label1354: li a0, 23 jal _sysy_starttime li a1, 1000 mv a0, zero -pcrel2007: +pcrel1987: auipc a3, %pcrel_hi(cmmc_parallel_body_0) - addi a2, a3, %pcrel_lo(pcrel2007) + addi a2, a3, %pcrel_lo(pcrel1987) jal cmmcParallelFor li a1, 1000 mv a0, zero -pcrel2008: +pcrel1988: auipc a3, %pcrel_hi(cmmc_parallel_body_1) - addi a2, a3, %pcrel_lo(pcrel2008) + addi a2, a3, %pcrel_lo(pcrel1988) jal cmmcParallelFor - mv s11, zero - mv s7, s0 - mv a0, s0 + mv s9, zero + mv s5, s1 + mv a0, s1 mv a1, zero lui a3, 524288 addiw a2, a3, -1 - j label1369 + j label1358 .p2align 2 -label1392: +label1381: addi a0, a0, 256 .p2align 2 -label1369: - lw a5, 0(a0) +label1358: + lw t0, 0(a0) addiw a1, a1, 64 - lw t0, 4(a0) - min a3, a2, a5 - lw t1, 8(a0) - min a4, a3, t0 - lw t2, 12(a0) - min a5, a4, t1 + lw a5, 4(a0) + min a4, a2, t0 + lw t0, 8(a0) + min a3, a4, a5 + lw a5, 12(a0) + min a2, a3, t0 lw t0, 16(a0) - min a2, a5, t2 + min a4, a2, a5 lw a5, 20(a0) - min a3, a2, t0 + min a3, a4, t0 lw t0, 24(a0) - min a4, a3, a5 - lw t1, 28(a0) - min a2, a4, t0 - lw a4, 32(a0) + min a2, a3, a5 + lw a5, 28(a0) + min a4, a2, t0 + lw t0, 32(a0) + min a3, a4, a5 + lw t1, 36(a0) + min a2, a3, t0 + lw a3, 40(a0) min a5, a2, t1 - lw a2, 36(a0) - min a3, a5, a4 - lw a5, 40(a0) - min t0, a3, a2 - lw a3, 44(a0) - min a4, t0, a5 - lw t0, 48(a0) - min a2, a4, a3 + lw t0, 44(a0) + min a4, a5, a3 + lw a5, 48(a0) + min a2, a4, t0 lw t1, 52(a0) - min a5, a2, t0 - lw t0, 56(a0) - min a3, a5, t1 - lw t1, 60(a0) - min a4, a3, t0 - lw a5, 64(a0) - min a2, a4, t1 - lw t0, 
68(a0) min a3, a2, a5 + lw t0, 56(a0) + min a4, a3, t1 + lw a3, 60(a0) + min a5, a4, t0 + lw t2, 64(a0) + min a2, a5, a3 + lw t1, 68(a0) + min t0, a2, t2 lw a5, 72(a0) - min a4, a3, t0 + min a4, t0, t1 lw t1, 76(a0) - min a2, a4, a5 + min a3, a4, a5 lw t0, 80(a0) - min a3, a2, t1 - lw a2, 84(a0) - min a5, a3, t0 - lw t1, 88(a0) - min a4, a5, a2 - lw t0, 92(a0) - min a3, a4, t1 - lw a5, 96(a0) - min a2, a3, t0 - lw t0, 100(a0) - min a4, a2, a5 - lw t1, 104(a0) + min a2, a3, t1 + lw t1, 84(a0) + min a5, a2, t0 + lw t0, 88(a0) + min a4, a5, t1 + lw t1, 92(a0) min a3, a4, t0 - lw t0, 108(a0) + lw t0, 96(a0) + min a2, a3, t1 + lw t1, 100(a0) + min a5, a2, t0 + lw a2, 104(a0) + min a4, a5, t1 + lw t1, 108(a0) + min a3, a4, a2 + lw t0, 112(a0) min a5, a3, t1 - lw a3, 112(a0) + lw t1, 116(a0) min a2, a5, t0 - lw t0, 116(a0) - min a4, a2, a3 - lw t2, 120(a0) - min a5, a4, t0 - lw t1, 124(a0) - min a3, a5, t2 - lw t0, 128(a0) - min a2, a3, t1 - lw t2, 132(a0) - min a4, a2, t0 - lw t1, 136(a0) - min a5, a4, t2 - lw t0, 140(a0) - min a3, a5, t1 - lw a5, 144(a0) + lw a5, 120(a0) + min a4, a2, t1 + lw t0, 124(a0) + min a3, a4, a5 + lw a5, 128(a0) min a2, a3, t0 - lw t1, 148(a0) + lw t1, 132(a0) min a4, a2, a5 + lw t0, 136(a0) + min a3, a4, t1 + lw t1, 140(a0) + min a5, a3, t0 + lw t0, 144(a0) + min a2, a5, t1 + lw t1, 148(a0) + min a4, a2, t0 lw t0, 152(a0) min a3, a4, t1 lw a4, 156(a0) - min a2, a3, t0 - lw t1, 160(a0) - min a5, a2, a4 - lw t0, 164(a0) - min a3, a5, t1 - lw a5, 168(a0) - min a4, a3, t0 + min a5, a3, t0 + lw t0, 160(a0) + min a2, a5, a4 + lw a5, 164(a0) + min a3, a2, t0 + lw t0, 168(a0) + min a4, a3, a5 lw t1, 172(a0) - min a2, a4, a5 - lw t0, 176(a0) + min a2, a4, t0 + lw a5, 176(a0) min a3, a2, t1 - lw a2, 180(a0) - min a5, a3, t0 - lw t0, 184(a0) - min a4, a5, a2 - lw a5, 188(a0) - min a3, a4, t0 + lw t0, 180(a0) + min a4, a3, a5 + lw a3, 184(a0) + min a2, a4, t0 + lw t1, 188(a0) + min a5, a2, a3 lw t0, 192(a0) + min a4, a5, t1 + lw a5, 196(a0) + min a3, a4, t0 + lw t0, 200(a0) min a2, a3, a5 - lw t1, 196(a0) + lw t1, 204(a0) min a4, a2, t0 - lw a5, 200(a0) - min a3, a4, t1 - lw t0, 204(a0) - min a2, a3, a5 - lw a5, 208(a0) + lw t0, 208(a0) + min a5, a4, t1 + lw t1, 212(a0) + min a3, a5, t0 + lw t0, 216(a0) + min a2, a3, t1 + lw a5, 220(a0) min a4, a2, t0 - lw t0, 212(a0) + lw t0, 224(a0) min a3, a4, a5 - lw a5, 216(a0) + lw a5, 228(a0) min a2, a3, t0 - lw t0, 220(a0) - min a4, a2, a5 - lw t1, 224(a0) - min a3, a4, t0 - lw t0, 228(a0) - min a5, a3, t1 lw t1, 232(a0) - min a2, a5, t0 - lw a5, 236(a0) - min a4, a2, t1 - lw t0, 240(a0) - min a3, a4, a5 - lw a5, 244(a0) - min a2, a3, t0 - lw t0, 248(a0) min a4, a2, a5 + lw t0, 236(a0) + min a3, a4, t1 + lw a4, 240(a0) + min a2, a3, t0 + lw t0, 244(a0) + min a5, a2, a4 + lw t1, 248(a0) + min a3, a5, t0 lw a5, 252(a0) - min a3, a4, t0 - li a4, 960 - min a2, a3, a5 - blt a1, a4, label1392 - lw a4, 256(a0) - lw t0, 260(a0) - min a3, a2, a4 - lw a5, 264(a0) - min a1, a3, t0 - lw t0, 268(a0) + min a4, a3, t1 + li a3, 960 + min a2, a4, a5 + blt a1, a3, label1381 + lw a3, 256(a0) + lw a5, 260(a0) + min a1, a2, a3 + lw a2, 264(a0) min a4, a1, a5 - lw a5, 272(a0) - min a2, a4, t0 - lw a4, 276(a0) - min a3, a2, a5 - lw a5, 280(a0) - min a1, a3, a4 - lw a4, 284(a0) - min a2, a1, a5 - lw t0, 288(a0) - min a3, a2, a4 - lw a5, 292(a0) - min a1, a3, t0 - lw a4, 296(a0) - min a2, a1, a5 - lw a5, 300(a0) - min a3, a2, a4 - lw t0, 304(a0) + lw a5, 268(a0) + min a3, a4, a2 + lw t0, 272(a0) min a1, a3, a5 - lw a4, 308(a0) - min a2, a1, t0 - lw a5, 312(a0) - 
min a3, a2, a4 - lw t0, 316(a0) - min a1, a3, a5 - lw a4, 320(a0) + lw a3, 276(a0) min a2, a1, t0 + lw t0, 280(a0) + min a4, a2, a3 + lw a5, 284(a0) + min a1, a4, t0 + lw t0, 288(a0) + min a3, a1, a5 + lw a4, 292(a0) + min a2, a3, t0 + lw t0, 296(a0) + min a1, a2, a4 + lw a5, 300(a0) + min a3, a1, t0 + lw a1, 304(a0) + min a2, a3, a5 + lw a5, 308(a0) + min a4, a2, a1 + lw a2, 312(a0) + min a3, a4, a5 + lw a5, 316(a0) + min a1, a3, a2 + lw t0, 320(a0) + min a4, a1, a5 lw a5, 324(a0) - min a3, a2, a4 + min a3, a4, t0 lw a4, 328(a0) - min a1, a3, a5 + min a2, a3, a5 lw a5, 332(a0) - min a2, a1, a4 + min a1, a2, a4 lw a4, 336(a0) - min a3, a2, a5 + min a3, a1, a5 lw a5, 340(a0) - min a1, a3, a4 + min a2, a3, a4 lw t0, 344(a0) - min a2, a1, a5 - lw a5, 348(a0) - min a4, a2, t0 - lw a2, 352(a0) - min a3, a4, a5 + min a1, a2, a5 + lw a4, 348(a0) + min a3, a1, t0 + lw t0, 352(a0) + min a2, a3, a4 lw a5, 356(a0) - min a1, a3, a2 - lw t0, 360(a0) - min a4, a1, a5 - lw t1, 364(a0) - min a2, a4, t0 - lw a5, 368(a0) - min a3, a2, t1 - lw t0, 372(a0) - min a1, a3, a5 - lw a3, 376(a0) - min a4, a1, t0 - lw t0, 380(a0) - min a2, a4, a3 - lw a5, 384(a0) min a1, a2, t0 - lw a4, 388(a0) + lw a4, 360(a0) min a3, a1, a5 - lw a5, 392(a0) + lw a5, 364(a0) min a2, a3, a4 - lw a3, 396(a0) + lw t0, 368(a0) min a1, a2, a5 - lw a5, 400(a0) - min a4, a1, a3 - lw t0, 404(a0) + lw t1, 372(a0) + min a3, a1, t0 + lw a5, 376(a0) + min a4, a3, t1 + lw a3, 380(a0) min a2, a4, a5 + lw a5, 384(a0) + min a1, a2, a3 + lw t0, 388(a0) + min a4, a1, a5 + lw a5, 392(a0) + min a2, a4, t0 + lw a4, 396(a0) + min a3, a2, a5 + lw a5, 400(a0) + min a1, a3, a4 + lw a4, 404(a0) + min a2, a1, a5 lw a5, 408(a0) - min a3, a2, t0 + min a3, a2, a4 lw a4, 412(a0) min a1, a3, a5 mv a0, zero min a2, a1, a4 -pcrel2009: - auipc s9, %pcrel_hi(cmmc_parallel_body_payload_2) - sd s7, %pcrel_lo(pcrel2009)(s9) +pcrel1989: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_2) + sd s5, %pcrel_lo(pcrel1989)(s7) li a1, 1000 - sw a2, 8(s10) - mv a2, s8 + sw a2, 8(s8) + mv a2, s6 jal cmmcParallelFor li a0, 1000 - addiw s11, s11, 1 - bge s11, a0, label1733 - ld a5, 112(sp) + addiw s9, s9, 1 + bge s9, a0, label1722 + add s5, s5, s0 mv a1, zero lui a3, 524288 - add s7, s7, a5 + mv a0, s5 addiw a2, a3, -1 - mv a0, s7 - j label1369 -label1733: - mv a2, s0 + j label1358 +label1722: + mv a2, s1 mv a0, zero - mv a1, s0 + mv a1, s1 mv a4, zero - j label1379 + j label1368 .p2align 2 -label1391: +label1380: addi a1, a1, 64 .p2align 2 -label1379: - ld a5, 112(sp) - mul t1, a4, a5 +label1368: + mul t0, a4, s0 + li t4, 125 + li t5, 375 addiw a4, a4, 16 - add a3, s0, t1 - sh2add t0, a0, a3 - lw t2, 0(t0) - add t0, a3, a5 - subw t1, zero, t2 - sh2add t2, a0, t0 - add t0, a3, s1 - sw t1, 0(a1) - sh2add a5, a0, t0 - lw t1, 0(t2) - subw t3, zero, t1 - add t1, a3, s2 - sh2add t0, a0, t1 - sw t3, 4(a1) - add t1, a3, s3 - lw t3, 0(a5) - subw t2, zero, t3 - li t3, 125 - sw t2, 8(a1) - lw t2, 0(t0) - subw a5, zero, t2 - sh2add t2, a0, t1 - sw a5, 12(a1) + add a3, s1, t0 + sh2add t2, a0, a3 lw a5, 0(t2) - add t2, a3, s4 - subw t0, zero, a5 - sh2add a5, a0, t2 - add t2, a3, s5 - sw t0, 16(a1) - lw t0, 0(a5) - subw t1, zero, t0 + add t2, a3, s0 + subw t1, zero, a5 sh2add t0, a0, t2 - add t2, a3, s6 - sw t1, 20(a1) - lw a5, 0(t0) + add t2, a3, s3 + sw t1, 0(a1) + lw t1, 0(t0) sh2add t0, a0, t2 - subw t1, zero, a5 - sw t1, 24(a1) - lw a5, 0(t0) - slli t0, t3, 8 - subw t1, zero, a5 - sw t1, 28(a1) - add t1, a3, t0 - sh2add t2, a0, t1 - lw a5, 0(t2) - li t2, 1125 - subw t0, zero, a5 - slli t3, 
t2, 5 - add t1, a3, t3 - sw t0, 32(a1) + subw a5, zero, t1 + add t2, a3, s4 + sw a5, 4(a1) + lw t1, 0(t0) + subw a5, zero, t1 + sh2add t1, a0, t2 + slli t2, t4, 7 + sw a5, 8(a1) + lw t0, 0(t1) + add t1, a3, t2 + subw a5, zero, t0 + sh2add t3, a0, t1 + sw a5, 12(a1) + lw t0, 0(t3) li t3, 625 - sh2add a5, a0, t1 - slli t1, t3, 6 - lw t2, 0(a5) - li t3, 1375 + subw a5, zero, t0 + slli t1, t3, 5 + add t4, a3, t1 + sw a5, 16(a1) + sh2add t0, a0, t4 + lw t3, 0(t0) + slli t0, t5, 6 + subw a5, zero, t3 + add t4, a3, t0 + sw a5, 20(a1) + sh2add a5, a0, t4 + li t4, 875 + lw t5, 0(a5) + slli a5, t4, 5 + subw t3, zero, t5 + add t5, a3, a5 + sw t3, 24(a1) + sh2add t3, a0, t5 + slli t5, t2, 1 + lw t6, 0(t3) + add t3, a3, t5 + subw t4, zero, t6 + sw t4, 28(a1) + sh2add t4, a0, t3 + li t3, 1125 + lw t2, 0(t4) + slli t4, t3, 5 + subw t5, zero, t2 + sw t5, 32(a1) + add t5, a3, t4 + sh2add t2, a0, t5 + lw t3, 0(t2) + slli t2, t1, 1 + subw t6, zero, t3 + add t4, a3, t2 + sh2add t3, a0, t4 + sw t6, 36(a1) + li t4, 1375 + lw t5, 0(t3) + slli t2, t4, 5 + subw t1, zero, t5 + add t3, a3, t2 + sw t1, 40(a1) + sh2add t1, a0, t3 + lw t4, 0(t1) + slli t1, t0, 1 + subw t2, zero, t4 + add t3, a3, t1 + sh2add t4, a0, t3 + sw t2, 44(a1) + li t3, 1625 + lw t2, 0(t4) + slli t1, t3, 5 subw t0, zero, t2 - sw t0, 36(a1) + sw t0, 48(a1) add t0, a3, t1 - sh2add a5, a0, t0 - slli t0, t3, 5 - lw t2, 0(a5) - li t3, 375 + slli t1, a5, 1 + sh2add t4, a0, t0 + add t0, a3, t1 + lw t2, 0(t4) + li t1, 1875 + subw t3, zero, t2 + sw t3, 52(a1) + sh2add t3, a0, t0 + slli t0, t1, 5 + lw t2, 0(t3) + subw a5, zero, t2 + sw a5, 56(a1) add a5, a3, t0 - subw t1, zero, t2 sh2add t2, a0, a5 - slli a5, t3, 7 - sw t1, 40(a1) - li t3, 1625 + li a5, 992 lw t1, 0(t2) - add t2, a3, a5 subw t0, zero, t1 - sw t0, 44(a1) - sh2add t0, a0, t2 - lw t1, 0(t0) - subw a5, zero, t1 - slli t1, t3, 5 - li t3, 875 - add t0, a3, t1 - sw a5, 48(a1) - sh2add t2, a0, t0 - slli t0, t3, 6 - lw a5, 0(t2) - subw t1, zero, a5 - sw t1, 52(a1) - add t1, a3, t0 - sh2add t2, a0, t1 - lw a5, 0(t2) - li t2, 1875 - subw t0, zero, a5 - slli a5, t2, 5 - add t1, a3, a5 - sw t0, 56(a1) - sh2add t0, a0, t1 - lw t2, 0(t0) - li t0, 992 - subw a5, zero, t2 - sw a5, 60(a1) - blt a4, t0, label1391 + sw t0, 60(a1) + blt a4, a5, label1380 li a4, 125 lui t3, 17 slli a5, a4, 9 @@ -495,7 +483,6 @@ label1379: lw a4, 0(t0) subw a5, zero, a4 slli a4, t2, 6 - lui t2, 21 add t1, a3, a4 sw a5, 68(a1) sh2add t0, a0, t1 @@ -506,1001 +493,990 @@ label1379: add a5, a3, t0 sw a4, 72(a1) sh2add a4, a0, a5 - li a5, 625 lw t1, 0(a4) - slli a4, a5, 7 + li a4, 625 subw t0, zero, t1 - add t1, a3, a4 - sw t0, 76(a1) - sh2add t0, a0, t1 - lw a5, 0(t0) - subw a4, zero, a5 - addiw a5, t2, -2016 + slli a5, a4, 7 add t1, a3, a5 - sw a4, 80(a1) + sw t0, 76(a1) sh2add t0, a0, t1 - li t1, 1375 lw a4, 0(t0) - slli t0, t1, 6 + lui t0, 21 subw a5, zero, a4 + addiw a4, t0, -2016 + add t1, a3, a4 + sw a5, 80(a1) + sh2add a5, a0, t1 + li t1, 1375 + lw t0, 0(a5) + subw a4, zero, t0 + slli t0, t1, 6 + sw a4, 84(a1) add a4, a3, t0 - sw a5, 84(a1) sh2add a5, a0, a4 lui a4, 22 - lw t1, 0(a5) - subw t0, zero, t1 - addiw t1, a4, 1888 - add a5, a3, t1 - sw t0, 88(a1) - sh2add t0, a0, a5 + lw t2, 0(a5) + addiw t0, a4, 1888 + subw t1, zero, t2 + add a5, a3, t0 + sw t1, 88(a1) + sh2add t1, a0, a5 addiw a0, a0, 1 - lw a4, 0(t0) + lw a4, 0(t1) subw a3, zero, a4 li a4, 1000 sw a3, 92(a1) - bge a0, a4, label1383 - ld a5, 112(sp) + bge a0, a4, label1372 + add a2, a2, s0 mv a4, zero - add a2, a2, a5 mv a1, a2 - j label1379 -label1383: + j 
label1368 +label1372: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - mv s2, zero mv s4, zero - addi s1, a0, %pcrel_lo(label1383) - j label1384 + mv s6, zero + addi s3, a0, %pcrel_lo(label1372) + j label1373 .p2align 2 cmmc_parallel_body_0: - addi sp, sp, -64 -pcrel779: - auipc a5, %pcrel_hi(b) - li a4, 125 - addi a2, a5, %pcrel_lo(pcrel779) - sd s0, 0(sp) - slli a3, a4, 5 - mv s0, a1 - sh3add t6, a3, a3 + addi sp, sp, -24 + mv t5, a1 +pcrel772: + auipc a4, %pcrel_hi(b) + li a5, 125 + sd s1, 0(sp) + addi a2, a4, %pcrel_lo(pcrel772) + slli a3, a5, 5 + sd s0, 8(sp) sh2add t2, a3, a3 + sh1add t0, a3, a3 slli a5, a3, 1 - sd s5, 8(sp) mul a1, a0, a3 - sh1add t0, a3, a3 - slli t1, a5, 1 - sd s1, 16(sp) - add a7, a2, a1 + sd s2, 16(sp) slli t3, t0, 1 - slli t5, t1, 1 - sd s6, 24(sp) - li a2, 625 -pcrel780: + slli t1, a5, 1 + add t4, a2, a1 +pcrel773: auipc a1, %pcrel_hi(a) - slli a6, a2, 6 - sd s4, 32(sp) - addi a4, a1, %pcrel_lo(pcrel780) - sd s2, 40(sp) - li a1, 875 - sd s3, 48(sp) - slli t4, a1, 5 - sd s7, 56(sp) - mv a1, a7 - mv s1, zero + addi a4, a1, %pcrel_lo(pcrel773) + mv a1, t4 + mv t6, zero j label5 .p2align 2 label9: - li s5, 125 - slli s4, s5, 11 - lui s5, 63 - add s3, a2, s4 - addiw s4, s5, 1952 - sh2add s2, a0, s3 - lui s5, 64 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, 1856 - sh2add s2, a0, s3 - lui s5, 65 - add s3, a2, s4 - sw s1, 256(a1) - addiw s4, s5, 1760 - lw s1, 0(s2) - lui s5, 66 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 260(a1) - addiw s4, s5, 1664 - lw s1, 0(s2) - lui s5, 67 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 264(a1) - addiw s4, s5, 1568 - lw s1, 0(s2) - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 268(a1) - lui s4, 68 - lw s1, 0(s2) - sh2add s2, a0, s3 - addiw s3, s4, 1472 - sw s1, 272(a1) - lw s1, 0(s2) - sw s1, 276(a1) - add s1, a2, s3 - sh2add s2, a0, s1 - lui s1, 69 - lw s5, 0(s2) - addiw s4, s1, 1376 - add s3, a2, s4 - sh2add s2, a0, s3 - sw s5, 280(a1) - li s5, 1125 - lw s1, 0(s2) - slli s4, s5, 8 - lui s5, 71 - add s3, a2, s4 - sw s1, 284(a1) - sh2add s1, a0, s3 - lw s2, 0(s1) - addiw s1, s5, 1184 - lui s5, 72 - add s4, a2, s1 - sw s2, 288(a1) - sh2add s3, a0, s4 - addiw s4, s5, 1088 - lw s2, 0(s3) - lui s5, 73 - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 292(a1) - lw s2, 0(s1) - addiw s1, s5, 992 - lui s5, 74 - add s4, a2, s1 - sw s2, 296(a1) - sh2add s3, a0, s4 - addiw s4, s5, 896 - lw s2, 0(s3) - lui s5, 75 - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 300(a1) - lw s2, 0(s1) - addiw s1, s5, 800 - lui s5, 76 - add s4, a2, s1 - sw s2, 304(a1) - addiw s1, s5, 704 - sh2add s3, a0, s4 - lui s5, 77 - add s4, a2, s1 - lw s2, 0(s3) - sh2add s3, a0, s4 - addiw s4, s5, 608 - sw s2, 308(a1) - li s5, 625 - lw s2, 0(s3) - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 312(a1) - lw s2, 0(s1) - slli s1, s5, 9 - lui s5, 79 - add s4, a2, s1 - sw s2, 316(a1) - sh2add s3, a0, s4 - addiw s4, s5, 416 - lw s2, 0(s3) - lui s5, 80 - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 320(a1) - lw s2, 0(s1) - addiw s1, s5, 320 - lui s5, 82 - add s4, a2, s1 - sw s2, 324(a1) - lui s1, 81 - sh2add s3, a0, s4 - addiw s4, s1, 224 - lw s2, 0(s3) - add s3, a2, s4 - addiw s4, s5, 128 - sw s2, 328(a1) - lui s5, 83 - sh2add s2, a0, s3 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, 32 - sh2add s2, a0, s3 - lui s5, 84 - add s3, a2, s4 - sw s1, 332(a1) - addiw s4, s5, -64 - lw s1, 0(s2) - lui s5, 88 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 336(a1) - lw s1, 0(s2) - sh2add s2, a0, s3 - lui s3, 85 - sw s1, 340(a1) - lw s1, 0(s2) - addiw s2, s3, -160 - sw s1, 344(a1) - add s1, a2, s2 
- li s2, 1375 - sh2add s4, a0, s1 - slli s1, s2, 8 - lw s3, 0(s4) - add s4, a2, s1 - lui s1, 87 - sw s3, 348(a1) - sh2add s3, a0, s4 - addiw s4, s1, -352 - lw s2, 0(s3) - add s3, a2, s4 - addiw s4, s5, -448 - sw s2, 352(a1) - lui s5, 89 - sh2add s2, a0, s3 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, -544 - sh2add s2, a0, s3 - lui s5, 90 - sw s1, 356(a1) - lw s1, 0(s2) - add s2, a2, s4 - addiw s4, s5, -640 - sh2add s3, a0, s2 - sw s1, 360(a1) - lui s5, 91 - lw s1, 0(s3) - add s3, a2, s4 - addiw s4, s5, -736 - sh2add s2, a0, s3 - sw s1, 364(a1) - lui s5, 92 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, -832 - sh2add s2, a0, s3 - lui s5, 93 - add s3, a2, s4 - sw s1, 368(a1) - addiw s4, s5, -928 - lw s1, 0(s2) - li s5, 375 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 372(a1) - slli s4, s5, 10 - lw s1, 0(s2) - lui s5, 95 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 376(a1) - addiw s4, s5, -1120 - lw s1, 0(s2) - lui s5, 96 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 380(a1) - addiw s4, s5, -1216 - lw s1, 0(s2) - lui s5, 97 - sw s1, 384(a1) - sh2add s1, a0, s3 - add s3, a2, s4 - lw s2, 0(s1) - addiw s4, s5, -1312 - lui s5, 98 - sw s2, 388(a1) - sh2add s2, a0, s3 - add s3, a2, s4 - lw s1, 0(s2) - sh2add s2, a0, s3 - sw s1, 392(a1) - lw s1, 0(s2) - addiw s2, s5, -1408 - lui s5, 99 - add s4, a2, s2 - sw s1, 396(a1) - sh2add s3, a0, s4 - addiw s4, s5, -1504 - lw s1, 0(s3) - lui s5, 100 - add s3, a2, s4 - sh2add s2, a0, s3 - sw s1, 400(a1) - addiw s3, s5, -1600 - lw s1, 0(s2) - add s4, a2, s3 - sh2add s2, a0, s4 - lui s4, 101 - sw s1, 404(a1) - addiw s3, s4, -1696 - lw s1, 0(s2) - add s2, a2, s3 - sw s1, 408(a1) - sh2add s1, a0, s2 + li a6, 125 + lui s0, 63 + slli t6, a6, 11 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, 1952 + lui s0, 64 + add a7, a2, a6 + sw t6, 256(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, 1856 + lui s0, 66 + add a7, a2, t6 + sw a6, 260(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 65 + sw t6, 264(a1) + addiw t6, a6, 1760 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, 1664 + lui s0, 67 + add a7, a2, a6 + sw t6, 268(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, 1568 + add a7, a2, t6 + sw a6, 272(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 68 + addiw a7, a6, 1472 + sw t6, 276(a1) + add t6, a2, a7 + sh2add a6, a0, t6 + lui t6, 69 + lw a7, 0(a6) + addiw a6, t6, 1376 + sw a7, 280(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + li a7, 1125 + lw a6, 0(t6) + slli t6, a7, 8 + add s0, a2, t6 + sw a6, 284(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 71 + addiw t6, a7, 1184 + sw a6, 288(a1) + add s0, a2, t6 + lui t6, 72 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, t6, 1088 + sw a7, 292(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + lui a7, 73 + lw a6, 0(t6) + addiw t6, a7, 992 + add s0, a2, t6 + sw a6, 296(a1) + lui t6, 74 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, t6, 896 + sw a7, 300(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + lui a7, 75 + lw a6, 0(t6) + addiw t6, a7, 800 + add s0, a2, t6 + sw a6, 304(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 76 + addiw t6, a7, 704 + sw a6, 308(a1) + add s0, a2, t6 + lui t6, 77 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, t6, 608 + sw a7, 312(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + li a7, 625 + lw a6, 0(t6) + slli t6, a7, 9 + add s0, a2, t6 + sw a6, 316(a1) + lui t6, 79 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, t6, 416 + sw a7, 320(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + lui a7, 80 + lw a6, 0(t6) + addiw t6, a7, 320 + add s0, a2, t6 + sw a6, 324(a1) + sh2add 
a7, a0, s0 + lw a6, 0(a7) + lui a7, 81 + addiw t6, a7, 224 + sw a6, 328(a1) + add s0, a2, t6 + sh2add a7, a0, s0 + lui s0, 82 + lw a6, 0(a7) + addiw t6, s0, 128 + lui s0, 83 + add a7, a2, t6 + sw a6, 332(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, 32 + lui s0, 84 + add a7, a2, a6 + sw t6, 336(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -64 + add a7, a2, t6 + sw a6, 340(a1) + sh2add a6, a0, a7 + lui a7, 85 + lw t6, 0(a6) + addiw a6, a7, -160 + sw t6, 344(a1) + add t6, a2, a6 + li a6, 1375 + sh2add s0, a0, t6 + slli t6, a6, 8 + lw a7, 0(s0) + add s0, a2, t6 + sw a7, 348(a1) + sh2add a7, a0, s0 + lui s0, 87 + lw a6, 0(a7) + addiw t6, s0, -352 + lui s0, 88 + add a7, a2, t6 + sw a6, 352(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, -448 + lui s0, 89 + add a7, a2, a6 + sw t6, 356(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -544 + lui s0, 91 + add a7, a2, t6 + sw a6, 360(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 90 + sw t6, 364(a1) + addiw t6, a6, -640 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, -736 + lui s0, 92 + add a7, a2, a6 + sw t6, 368(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -832 + li s0, 375 + add a7, a2, t6 + sw a6, 372(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 93 + sw t6, 376(a1) + addiw t6, a6, -928 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + slli a6, s0, 10 + lui s0, 95 + add a7, a2, a6 + sw t6, 380(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -1120 + lui s0, 96 + add a7, a2, t6 + sw a6, 384(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, -1216 + lui s0, 97 + add a7, a2, a6 + sw t6, 388(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -1312 + add a7, a2, t6 + sw a6, 392(a1) + sh2add a6, a0, a7 + lui a7, 98 + lw t6, 0(a6) + addiw a6, a7, -1408 + add s0, a2, a6 + sw t6, 396(a1) + lui a6, 99 + sh2add t6, a0, s0 + lw a7, 0(t6) + addiw t6, a6, -1504 + sw a7, 400(a1) + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 100 + sw t6, 404(a1) + addiw t6, a6, -1600 + add a7, a2, t6 + sh2add a6, a0, a7 + lui a7, 101 + lw t6, 0(a6) + addiw a6, a7, -1696 + sw t6, 408(a1) + add t6, a2, a6 + sh2add a7, a0, t6 addiw a0, a0, 1 - lw a2, 0(s1) - sw a2, 412(a1) - ble s0, a0, label11 - add a7, a7, a3 - mv s1, zero - mv a1, a7 + lw a6, 0(a7) + sw a6, 412(a1) + ble t5, a0, label11 + add t4, t4, a3 + mv t6, zero + mv a1, t4 .p2align 2 label5: - mul s4, s1, a3 - li s7, 625 - addiw s1, s1, 64 - add a2, a4, s4 - sh2add s2, a0, a2 - add s4, a2, a3 - add s6, a2, t5 - sh2add s5, a0, s4 - lw s3, 0(s2) - add s2, a2, a5 - sw s3, 0(a1) - lw s3, 0(s5) - sh2add s5, a0, s2 - sw s3, 4(a1) - add s3, a2, t0 - lw s4, 0(s5) - sh2add s5, a0, s3 - add s3, a2, t1 - sw s4, 8(a1) - sh2add s4, a0, s3 - lw s2, 0(s5) - sw s2, 12(a1) - add s2, a2, t2 - lw s5, 0(s4) - sh2add s3, a0, s2 - sw s5, 16(a1) - add s5, a2, t3 - lw s4, 0(s3) - sh2add s3, a0, s5 - sw s4, 20(a1) - add s4, a2, t4 - lw s2, 0(s3) - sh2add s5, a0, s4 - sw s2, 24(a1) - sh2add s2, a0, s6 - lw s3, 0(s5) - li s6, 1375 - sw s3, 28(a1) - add s3, a2, t6 - lw s4, 0(s2) - sh2add s5, a0, s3 - add s2, a2, a6 - sh2add s3, a0, s2 - sw s4, 32(a1) - lw s4, 0(s5) - sw s4, 36(a1) - lw s4, 0(s3) - slli s3, s6, 5 - li s6, 375 - add s2, a2, s3 - sw s4, 40(a1) - sh2add s5, a0, s2 - slli s2, s6, 7 - lw s4, 0(s5) - add s5, a2, s2 - sh2add s3, a0, s5 - sw s4, 44(a1) - li s5, 1625 - lw s4, 0(s3) - slli s3, s5, 5 - sw s4, 48(a1) - add s4, a2, s3 - sh2add s2, a0, s4 - li s4, 875 - lw s5, 0(s2) - slli s6, s4, 6 - add s3, a2, s6 - lui s6, 17 - sh2add s2, a0, s3 
- sw s5, 52(a1) - li s3, 1875 - lw s4, 0(s2) - slli s2, s3, 5 - add s5, a2, s2 - sw s4, 56(a1) - sh2add s4, a0, s5 - li s5, 125 - lw s3, 0(s4) - slli s2, s5, 9 - add s4, a2, s2 - addiw s2, s6, -1632 - sw s3, 60(a1) - li s6, 1125 - sh2add s3, a0, s4 - lw s5, 0(s3) - sw s5, 64(a1) - add s5, a2, s2 - sh2add s4, a0, s5 - lw s3, 0(s4) - slli s4, s6, 6 - add s5, a2, s4 - sw s3, 68(a1) - sh2add s2, a0, s5 - lui s5, 19 - lw s3, 0(s2) - addiw s4, s5, -1824 - slli s5, s7, 7 - lui s7, 30 - sw s3, 72(a1) - add s3, a2, s4 - add s4, a2, s5 - sh2add s2, a0, s3 - sh2add s3, a0, s4 - lw s6, 0(s2) - sw s6, 76(a1) - lui s6, 21 - lw s2, 0(s3) - addiw s4, s6, -2016 - lui s6, 24 - add s3, a2, s4 - sh2add s5, a0, s3 - sw s2, 80(a1) - li s3, 1375 - lw s2, 0(s5) - slli s5, s3, 6 - sw s2, 84(a1) - add s2, a2, s5 - sh2add s4, a0, s2 - lui s2, 22 - lw s3, 0(s4) - addiw s4, s2, 1888 - add s5, a2, s4 - sw s3, 88(a1) - li s4, 375 - sh2add s3, a0, s5 - lw s2, 0(s3) - slli s3, s4, 8 - sw s2, 92(a1) - add s2, a2, s3 - addiw s3, s6, 1696 - sh2add s5, a0, s2 - li s6, 1625 - lw s4, 0(s5) - sw s4, 96(a1) - add s4, a2, s3 - slli s3, s6, 6 - sh2add s2, a0, s4 - li s6, 875 - add s4, a2, s3 - lw s5, 0(s2) - sw s5, 100(a1) - sh2add s5, a0, s4 - lui s4, 26 - lw s2, 0(s5) - addiw s5, s4, 1504 - sw s2, 104(a1) - add s2, a2, s5 - slli s5, s6, 7 - sh2add s3, a0, s2 - li s6, 1875 - lw s4, 0(s3) - sw s4, 108(a1) - add s4, a2, s5 - lui s5, 28 - sh2add s2, a0, s4 - lw s3, 0(s2) - addiw s2, s5, 1312 - sw s3, 112(a1) - add s3, a2, s2 - sh2add s4, a0, s3 - lw s5, 0(s4) - slli s4, s6, 6 - add s2, a2, s4 - sw s5, 116(a1) - sh2add s3, a0, s2 - addiw s5, s7, 1120 - lw s6, 0(s3) - lui s7, 33 - add s2, a2, s5 - li s5, 125 - sh2add s4, a0, s2 - sw s6, 120(a1) - slli s2, s5, 10 - lui s6, 32 - lw s3, 0(s4) - sw s3, 124(a1) - add s3, a2, s2 - sh2add s4, a0, s3 - addiw s3, s6, 928 - lw s5, 0(s4) - addiw s6, s7, 832 - add s4, a2, s3 - sh2add s2, a0, s4 - sw s5, 128(a1) - add s4, a2, s6 - lw s5, 0(s2) - lui s6, 38 - sh2add s3, a0, s4 - sw s5, 132(a1) - lui s5, 34 - lw s2, 0(s3) - addiw s4, s5, 736 - add s3, a2, s4 - sw s2, 136(a1) - sh2add s2, a0, s3 - li s3, 1125 - lw s5, 0(s2) - slli s2, s3, 7 - add s4, a2, s2 - sw s5, 140(a1) - sh2add s5, a0, s4 - lui s4, 36 - lw s3, 0(s5) - addiw s5, s4, 544 - sw s3, 144(a1) - add s3, a2, s5 - lui s5, 37 - sh2add s2, a0, s3 - lw s4, 0(s2) - addiw s2, s5, 448 - add s3, a2, s2 - sw s4, 148(a1) - sh2add s4, a0, s3 - addiw s3, s6, 352 - lw s5, 0(s4) - li s6, 625 - sw s5, 152(a1) - add s5, a2, s3 - sh2add s2, a0, s5 - slli s5, s6, 8 - lw s4, 0(s2) - lui s6, 40 - add s2, a2, s5 - sh2add s3, a0, s2 - sw s4, 156(a1) - lw s4, 0(s3) - addiw s3, s6, 160 - lui s6, 41 - add s2, a2, s3 - sw s4, 160(a1) - sh2add s5, a0, s2 - addiw s2, s6, 64 - lw s4, 0(s5) - lui s6, 42 - add s3, a2, s2 - addiw s2, s6, -32 - sh2add s5, a0, s3 - sw s4, 164(a1) - li s6, 1375 - add s3, a2, s2 - lw s4, 0(s5) - sh2add s5, a0, s3 - sw s4, 168(a1) - lw s4, 0(s5) - slli s5, s6, 7 - add s3, a2, s5 - sw s4, 172(a1) - sh2add s2, a0, s3 - lui s3, 44 - lw s6, 0(s2) - addiw s4, s3, -224 - add s2, a2, s4 - sh2add s5, a0, s2 - sw s6, 176(a1) - lui s2, 45 - li s6, 375 - addiw s4, s2, -320 - lw s3, 0(s5) - add s5, a2, s4 - sw s3, 180(a1) - sh2add s3, a0, s5 - lui s5, 46 - lw s2, 0(s3) - addiw s3, s5, -416 - sw s2, 184(a1) - add s2, a2, s3 - sh2add s4, a0, s2 - slli s2, s6, 9 - lw s5, 0(s4) - lui s6, 49 - sw s5, 188(a1) - add s5, a2, s2 - sh2add s3, a0, s5 - lui s5, 48 - lw s4, 0(s3) - addiw s2, s5, -608 - add s3, a2, s2 - sw s4, 192(a1) - sh2add s4, a0, s3 - addiw s3, 
s6, -704 - lw s5, 0(s4) - add s2, a2, s3 - lui s3, 50 - sh2add s4, a0, s2 - sw s5, 196(a1) - addiw s5, s3, -800 - lw s6, 0(s4) - add s4, a2, s5 - li s5, 1625 - sh2add s2, a0, s4 - sw s6, 200(a1) - lui s6, 52 - lw s3, 0(s2) - slli s2, s5, 7 - add s4, a2, s2 - sw s3, 204(a1) - sh2add s3, a0, s4 - addiw s4, s6, -992 - lw s5, 0(s3) - lui s6, 54 - add s2, a2, s4 - lui s4, 53 - sw s5, 208(a1) - sh2add s5, a0, s2 - lw s3, 0(s5) - addiw s5, s4, -1088 - add s2, a2, s5 - sw s3, 212(a1) - addiw s5, s6, -1184 - sh2add s3, a0, s2 - lui s6, 57 - add s2, a2, s5 - lw s4, 0(s3) - sw s4, 216(a1) - sh2add s4, a0, s2 + mul a6, t6, a3 li s2, 875 - lw s3, 0(s4) - slli s4, s2, 8 - sw s3, 220(a1) - add s3, a2, s4 - sh2add s5, a0, s3 - lui s3, 56 - lw s2, 0(s5) - addiw s5, s3, -1376 - add s4, a2, s5 - sw s2, 224(a1) - addiw s5, s6, -1472 - sh2add s2, a0, s4 - li s6, 1875 - lw s3, 0(s2) - sw s3, 228(a1) - add s3, a2, s5 - sh2add s4, a0, s3 - lui s3, 58 - lw s2, 0(s4) - addiw s5, s3, -1568 - sw s2, 232(a1) - add s2, a2, s5 - sh2add s4, a0, s2 - slli s2, s6, 7 - lw s3, 0(s4) - lui s6, 60 - add s4, a2, s2 - sh2add s5, a0, s4 - sw s3, 236(a1) - addiw s4, s6, -1760 - lw s3, 0(s5) - lui s6, 61 - add s5, a2, s4 - addiw s4, s6, -1856 - sh2add s2, a0, s5 - sw s3, 240(a1) - add s5, a2, s4 - lw s3, 0(s2) - lui s4, 62 - sh2add s2, a0, s5 - sw s3, 244(a1) - lw s3, 0(s2) - addiw s2, s4, -1952 - add s5, a2, s2 - sw s3, 248(a1) - li s2, 960 - sh2add s3, a0, s5 - lw s4, 0(s3) - sw s4, 252(a1) - bge s1, s2, label9 + addiw t6, t6, 64 + add a2, a4, a6 + sh2add a7, a0, a2 + add a6, a2, a3 + sh2add s0, a0, a6 + add a6, a2, a5 + lw s1, 0(a7) + sw s1, 0(a1) + lw a7, 0(s0) + sw a7, 4(a1) + sh2add a7, a0, a6 + add a6, a2, t0 + lw s0, 0(a7) + sh2add a7, a0, a6 + sw s0, 8(a1) + lw s0, 0(a7) + add a7, a2, t1 + sh2add a6, a0, a7 + sw s0, 12(a1) + add a7, a2, t2 + lw s0, 0(a6) + sh2add s1, a0, a7 + sw s0, 16(a1) + add s0, a2, t3 + lw a6, 0(s1) + sh2add a7, a0, s0 + li s1, 875 + sw a6, 20(a1) + lw a6, 0(a7) + slli a7, s1, 5 + li s1, 1125 + sw a6, 24(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + li a6, 125 + lw a7, 0(s0) + sw a7, 28(a1) + slli a7, a6, 8 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + sw a7, 32(a1) + slli a7, s1, 5 + add a6, a2, a7 + sh2add s0, a0, a6 + li a6, 625 + lw a7, 0(s0) + sw a7, 36(a1) + slli a7, a6, 6 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + li a6, 1375 + slli s1, a6, 5 + sw a7, 40(a1) + add a7, a2, s1 + li s1, 375 + sh2add a6, a0, a7 + lw s0, 0(a6) + slli a6, s1, 7 + li s1, 1625 + add a7, a2, a6 + sw s0, 44(a1) + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 48(a1) + slli a6, s1, 5 + add a7, a2, a6 + sh2add s0, a0, a7 + slli a7, s2, 6 + lw a6, 0(s0) + li s2, 125 + add s1, a2, a7 + sh2add s0, a0, s1 + sw a6, 52(a1) + li s1, 1875 + lw a6, 0(s0) + slli a7, s1, 5 + sw a6, 56(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + slli a6, s2, 9 + lw a7, 0(s0) + lui s2, 17 + add s1, a2, a6 + sh2add s0, a0, s1 + sw a7, 60(a1) + addiw s1, s2, -1632 + lw a7, 0(s0) + lui s2, 26 + add a6, a2, s1 + sw a7, 64(a1) + sh2add a7, a0, a6 + lw s0, 0(a7) + li a7, 1125 + slli a6, a7, 6 + sw s0, 68(a1) + add s0, a2, a6 + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 19 + sw a6, 72(a1) + addiw a6, a7, -1824 + add s0, a2, a6 + sh2add a7, a0, s0 + lw a6, 0(a7) + li a7, 625 + slli s1, a7, 7 + sw a6, 76(a1) + add a6, a2, s1 + sh2add a7, a0, a6 + lw s0, 0(a7) + lui a7, 21 + addiw a6, a7, -2016 + sw s0, 80(a1) + add s0, a2, a6 + li a6, 1375 + sh2add s1, a0, s0 + lw a7, 0(s1) + li s1, 375 + sw a7, 84(a1) + slli a7, a6, 6 + add s0, a2, a7 + sh2add a6, 
a0, s0 + lw a7, 0(a6) + lui a6, 22 + sw a7, 88(a1) + addiw a7, a6, 1888 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + slli a6, s1, 8 + lui s1, 24 + add s0, a2, a6 + sw a7, 92(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + sw a6, 96(a1) + addiw a6, s1, 1696 + add a7, a2, a6 + sh2add s0, a0, a7 + li a7, 1625 + lw a6, 0(s0) + sw a6, 100(a1) + slli a6, a7, 6 + add s0, a2, a6 + addiw a6, s2, 1504 + sh2add s1, a0, s0 + li s2, 125 + add s0, a2, a6 + lw a7, 0(s1) + sw a7, 104(a1) + sh2add a7, a0, s0 + li s0, 875 + lw a6, 0(a7) + slli s1, s0, 7 + add a7, a2, s1 + lui s1, 28 + sw a6, 108(a1) + sh2add a6, a0, a7 + lw s0, 0(a6) + addiw a6, s1, 1312 + li s1, 1875 + add a7, a2, a6 + sw s0, 112(a1) + sh2add s0, a0, a7 + slli a7, s1, 6 + lw a6, 0(s0) + lui s1, 30 + add s0, a2, a7 + sw a6, 116(a1) + sh2add a6, a0, s0 + addiw s0, s1, 1120 + lw a7, 0(a6) + add a6, a2, s0 + sw a7, 120(a1) + sh2add a7, a0, a6 + lw s1, 0(a7) + slli a7, s2, 10 + li s2, 1125 + add a6, a2, a7 + sw s1, 124(a1) + sh2add s0, a0, a6 + lui s1, 32 + lw a7, 0(s0) + addiw a6, s1, 928 + lui s1, 33 + sw a7, 128(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 132(a1) + addiw a6, s1, 832 + add a7, a2, a6 + sh2add s0, a0, a7 + lui a7, 34 + lw a6, 0(s0) + sw a6, 136(a1) + addiw a6, a7, 736 + add s1, a2, a6 + slli a6, s2, 7 + sh2add s0, a0, s1 + lui s2, 60 + add s1, a2, a6 + lw a7, 0(s0) + sh2add s0, a0, s1 + lui s1, 36 + sw a7, 140(a1) + addiw a6, s1, 544 + lw a7, 0(s0) + lui s1, 37 + sw a7, 144(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 148(a1) + addiw a6, s1, 448 + lui s1, 38 + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s1, 352 + lw a6, 0(s0) + li s1, 625 + sw a6, 152(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + lw a7, 0(s0) + sw a7, 156(a1) + slli a7, s1, 8 + lui s1, 40 + add a6, a2, a7 + sh2add s0, a0, a6 + lw a7, 0(s0) + sw a7, 160(a1) + addiw a7, s1, 160 + lui s1, 41 + add a6, a2, a7 + sh2add s0, a0, a6 + addiw a6, s1, 64 + lw a7, 0(s0) + lui s1, 42 + add s0, a2, a6 + sw a7, 164(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + sw a6, 168(a1) + addiw a6, s1, -32 + li s1, 1375 + add a7, a2, a6 + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 172(a1) + slli a6, s1, 7 + lui s1, 44 + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s1, -224 + lw a6, 0(s0) + lui s1, 46 + sw a6, 176(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + lui a6, 45 + lw a7, 0(s0) + sw a7, 180(a1) + addiw a7, a6, -320 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + sw a7, 184(a1) + addiw a7, s1, -416 + lui s1, 48 + add a6, a2, a7 + sh2add s0, a0, a6 + li a6, 375 + lw a7, 0(s0) + sw a7, 188(a1) + slli a7, a6, 9 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, s1, -608 + lui s1, 50 + add s0, a2, a6 + sw a7, 192(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 49 + sw a6, 196(a1) + addiw a6, a7, -704 + add s0, a2, a6 + sh2add a7, a0, s0 + lw a6, 0(a7) + addiw a7, s1, -800 + li s1, 1625 + sw a6, 200(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + slli a6, s1, 7 + lw a7, 0(s0) + lui s1, 52 + add s0, a2, a6 + sw a7, 204(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + sw a6, 208(a1) + addiw a6, s1, -992 + lui s1, 53 + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s1, -1088 + lw a6, 0(s0) + lui s1, 54 + sw a6, 212(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + addiw a6, s1, -1184 + lw a7, 0(s0) + li s1, 875 + sw a7, 216(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + slli a7, s1, 8 + lw a6, 0(s0) + lui s1, 57 + add s0, a2, a7 + sw a6, 220(a1) + sh2add a6, a0, s0 + lw a7, 0(a6) + lui a6, 56 + sw a7, 224(a1) + addiw a7, a6, -1376 + add s0, a2, a7 + sh2add 
a6, a0, s0 + lw a7, 0(a6) + sw a7, 228(a1) + addiw a7, s1, -1472 + li s1, 1875 + add a6, a2, a7 + sh2add s0, a0, a6 + lui a6, 58 + lw a7, 0(s0) + sw a7, 232(a1) + addiw a7, a6, -1568 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + slli a6, s1, 7 + sw a7, 236(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s2, -1760 + lw a6, 0(s0) + lui s2, 61 + add s0, a2, a7 + addiw a7, s2, -1856 + sh2add s1, a0, s0 + sw a6, 240(a1) + lw a6, 0(s1) + lui s1, 62 + sw a6, 244(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + addiw a6, s1, -1952 + lw a7, 0(s0) + add s0, a2, a6 + sw a7, 248(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + li a7, 960 + sw a6, 252(a1) + bge t6, a7, label9 addi a1, a1, 256 j label5 label11: - ld s0, 0(sp) - ld s5, 8(sp) - ld s1, 16(sp) - ld s6, 24(sp) - ld s4, 32(sp) - ld s2, 40(sp) - ld s3, 48(sp) - ld s7, 56(sp) - addi sp, sp, 64 + ld s1, 0(sp) + ld s0, 8(sp) + ld s2, 16(sp) + addi sp, sp, 24 ret .p2align 2 cmmc_parallel_body_1: - addi sp, sp, -96 - mv a7, a1 -pcrel1095: - auipc a5, %pcrel_hi(c) - li a4, 125 - sd s1, 0(sp) - addi a3, a5, %pcrel_lo(pcrel1095) - slli a2, a4, 5 - mv s1, a0 - sd s6, 8(sp) - sh3add t5, a2, a2 + addi sp, sp, -72 + mv t4, a1 +pcrel1084: + auipc a4, %pcrel_hi(c) + li a5, 125 + mv t6, a0 + addi a3, a4, %pcrel_lo(pcrel1084) + slli a2, a5, 5 + sd s0, 0(sp) sh2add t1, a2, a2 + sh1add a5, a2, a2 slli a4, a2, 1 mul a1, a0, a2 - sd s0, 16(sp) - sh1add a5, a2, a2 + sd s5, 8(sp) slli t0, a4, 1 -pcrel1096: +pcrel1085: auipc a0, %pcrel_hi(a) - add a6, a3, a1 - sd s5, 24(sp) - slli t2, a5, 1 - slli t4, t0, 1 - addi s0, a0, %pcrel_lo(pcrel1096) -pcrel1097: + add t3, a3, a1 + addi t5, a0, %pcrel_lo(pcrel1085) + sd s1, 16(sp) +pcrel1086: auipc a1, %pcrel_hi(b) - li a0, 875 - sd s2, 32(sp) - addi a3, a1, %pcrel_lo(pcrel1097) - slli t3, a0, 5 - sd s4, 40(sp) - sd s8, 48(sp) - sd s3, 56(sp) - sd s7, 64(sp) - sd s9, 72(sp) - sd s10, 80(sp) - sd s11, 88(sp) - mul a1, s1, a2 - mv s2, a6 + sd s6, 24(sp) + addi a3, a1, %pcrel_lo(pcrel1086) + sd s3, 32(sp) + sd s2, 40(sp) + sd s4, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + mul a1, t6, a2 + mv a6, t3 mv a0, zero - add t6, s0, a1 - mv a1, t6 - mv s4, zero - mv s5, zero - j label788 + add t2, t5, a1 + mv a1, t2 + mv s0, zero + mv s1, zero + j label781 .p2align 2 -label792: - li s4, 125 - lui s10, 17 - slli s8, s4, 9 - lw s4, 64(a1) - add s6, s3, s8 - sh2add s7, a0, s6 - lw s8, 0(s7) - mulw s9, s4, s8 - addiw s8, s10, -1632 - addw s6, s5, s9 - li s10, 1125 - add s4, s3, s8 - lw s5, 68(a1) - sh2add s7, a0, s4 - lw s8, 0(s7) - mulw s9, s5, s8 - slli s8, s10, 6 - addw s4, s6, s9 - lui s10, 21 - add s5, s3, s8 - lw s6, 72(a1) - sh2add s7, a0, s5 - lw s8, 0(s7) - lui s7, 19 - mulw s9, s6, s8 - addiw s8, s7, -1824 - addw s5, s4, s9 - lw s7, 76(a1) - add s4, s3, s8 - sh2add s6, a0, s4 - lw s8, 0(s6) +label785: + li s0, 125 + lui s6, 17 + slli s4, s0, 9 + lw s0, 64(a1) + add s2, a7, s4 + sh2add s3, a0, s2 + lw s4, 0(s3) + mulw s5, s0, s4 + addiw s4, s6, -1632 + addw s2, s1, s5 + li s6, 1125 + add s0, a7, s4 + lw s1, 68(a1) + sh2add s3, a0, s0 + lw s5, 0(s3) + mulw s4, s1, s5 + slli s1, s6, 6 + addw s0, s2, s4 + lui s6, 19 + add s5, a7, s1 + lw s2, 72(a1) + sh2add s3, a0, s5 + lw s4, 0(s3) + mulw s5, s2, s4 + lw s2, 76(a1) + addiw s4, s6, -1824 + addw s1, s0, s5 li s6, 625 - mulw s9, s7, s8 - slli s8, s6, 7 - addw s4, s5, s9 - lw s6, 80(a1) - add s5, s3, s8 - sh2add s7, a0, s5 - lw s8, 0(s7) - mulw s9, s6, s8 - lw s6, 84(a1) - addiw s8, s10, -2016 - addw s5, s4, s9 - lui s10, 22 - add s4, s3, s8 - sh2add s7, a0, s4 - lw s9, 0(s7) - li s7, 1375 
- mulw s8, s6, s9 - slli s9, s7, 6 - addw s4, s5, s8 - lw s7, 88(a1) - add s5, s3, s9 - sh2add s6, a0, s5 - lw s9, 0(s6) - mulw s8, s7, s9 - addiw s7, s10, 1888 - addw s5, s4, s8 - lw s4, 92(a1) - add s8, s3, s7 - sh2add s6, a0, s8 + add s0, a7, s4 + sh2add s3, a0, s0 + lw s4, 0(s3) + mulw s5, s2, s4 + slli s4, s6, 7 + addw s0, s1, s5 + lw s2, 80(a1) + lui s6, 21 + add s1, a7, s4 + sh2add s3, a0, s1 + lw s4, 0(s3) + mulw s5, s2, s4 + lw s2, 84(a1) + addiw s4, s6, -2016 + addw s1, s0, s5 + lui s6, 22 + add s0, a7, s4 + sh2add s3, a0, s0 + lw s4, 0(s3) + li s3, 1375 + mulw s5, s2, s4 + slli s4, s3, 6 + addw s0, s1, s5 + lw s3, 88(a1) + add s1, a7, s4 + sh2add s2, a0, s1 + lw s5, 0(s2) + addiw s2, s6, 1888 + mulw s4, s3, s5 + add s3, a7, s2 + addw s1, s0, s4 + lw a7, 92(a1) + sh2add s0, a0, s3 addiw a0, a0, 1 - lw s3, 0(s6) - mulw s7, s4, s3 - addw a1, s5, s7 - sw a1, 0(s2) - li a1, 1000 - bge a0, a1, label793 - addi s2, s2, 4 - mv a1, t6 - mv s4, zero - mv s5, zero + lw s2, 0(s0) + mulw s3, a7, s2 + li a7, 1000 + addw a1, s1, s3 + sw a1, 0(a6) + bge a0, a7, label786 + addi a6, a6, 4 + mv a1, t2 + mv s0, zero + mv s1, zero .p2align 2 -label788: - mul s8, s4, a2 - lw s6, 0(a1) - addiw s4, s4, 16 - add s3, a3, s8 - sh2add s7, a0, s3 - add s9, s3, a2 - sh2add s10, a0, s9 - lw s8, 0(s7) - lw s11, 4(a1) - lw s7, 0(s10) - mulw s10, s6, s8 - lw s8, 8(a1) - mulw s9, s11, s7 - add s11, s3, a4 - addw s7, s9, s10 - sh2add s10, a0, s11 - lw s9, 0(s10) - add s10, s3, a5 - mulw s11, s8, s9 - lw s9, 12(a1) - sh2add s8, a0, s10 - addw s6, s7, s11 - lw s11, 0(s8) - add s8, s3, t0 - mulw s10, s9, s11 - sh2add s9, a0, s8 - addw s7, s6, s10 - lw s6, 16(a1) - lw s10, 0(s9) - mulw s11, s6, s10 - add s10, s3, t1 - addw s8, s7, s11 - sh2add s9, a0, s10 - lw s7, 20(a1) - lw s11, 0(s9) - mulw s10, s7, s11 - add s11, s3, t3 - add s7, s3, t2 - addw s6, s8, s10 - sh2add s9, a0, s7 - lw s8, 24(a1) - lw s10, 0(s9) - mulw s9, s8, s10 - sh2add s8, a0, s11 - addw s7, s6, s9 - lw s9, 28(a1) - lw s10, 0(s8) - lw s8, 32(a1) - mulw s11, s9, s10 - add s10, s3, t4 - addw s6, s7, s11 - sh2add s9, a0, s10 - lw s10, 0(s9) - lw s9, 36(a1) - mulw s11, s8, s10 - add s10, s3, t5 - addw s7, s6, s11 - sh2add s8, a0, s10 - lw s11, 0(s8) +label781: + mul s3, s0, a2 + lw s2, 0(a1) + addiw s0, s0, 16 + add a7, a3, s3 + sh2add s5, a0, a7 + add s6, a7, a2 + sh2add s4, a0, s6 + lw s3, 0(s5) + lw s5, 4(a1) + lw s7, 0(s4) + mulw s8, s2, s3 + lw s2, 8(a1) + mulw s6, s5, s7 + add s7, a7, a4 + addw s4, s6, s8 + sh2add s5, a0, s7 + add s6, a7, a5 + lw s8, 0(s5) + sh2add s5, a0, s6 + add s6, a7, t0 + mulw s7, s2, s8 + addw s3, s4, s7 + lw s4, 12(a1) + lw s8, 0(s5) + lw s5, 16(a1) + mulw s7, s4, s8 + sh2add s4, a0, s6 + addw s2, s3, s7 + add s6, a7, t1 + lw s7, 0(s4) + lw s4, 20(a1) + mulw s8, s5, s7 + sh2add s5, a0, s6 + addw s3, s2, s8 + lw s7, 0(s5) + li s5, 375 + slli s6, s5, 6 + mulw s8, s4, s7 + lw s5, 24(a1) + addw s2, s3, s8 li s8, 625 - mulw s10, s9, s11 - lw s9, 40(a1) - addw s6, s7, s10 - slli s10, s8, 6 - add s7, s3, s10 - sh2add s8, a0, s7 - lw s10, 0(s8) + add s3, a7, s6 + sh2add s4, a0, s3 + lw s6, 0(s4) + li s4, 875 + mulw s7, s5, s6 + slli s6, s4, 5 + addw s3, s2, s7 + lw s4, 28(a1) + add s2, a7, s6 + sh2add s5, a0, s2 + lw s6, 0(s5) + li s5, 125 + mulw s7, s4, s6 + slli s6, s5, 8 + addw s2, s3, s7 + lw s5, 32(a1) + add s3, a7, s6 + sh2add s4, a0, s3 + lw s6, 0(s4) + li s4, 1125 + mulw s7, s5, s6 + slli s6, s4, 5 + addw s3, s2, s7 + lw s4, 36(a1) + add s2, a7, s6 + sh2add s5, a0, s2 + lw s7, 0(s5) + mulw s6, s4, s7 + lw s4, 40(a1) + slli 
s7, s8, 6 + addw s2, s3, s6 li s8, 1375 - mulw s11, s9, s10 - lw s9, 44(a1) - slli s10, s8, 5 - addw s7, s6, s11 - add s6, s3, s10 - sh2add s8, a0, s6 - lw s10, 0(s8) + add s3, a7, s7 + sh2add s5, a0, s3 + lw s6, 0(s5) + mulw s7, s4, s6 + slli s4, s8, 5 + addw s3, s2, s7 li s8, 375 - mulw s11, s9, s10 - lw s9, 48(a1) - slli s10, s8, 7 - addw s6, s7, s11 - add s7, s3, s10 - sh2add s8, a0, s7 - lw s10, 0(s8) - li s8, 1625 - mulw s11, s9, s10 - lw s9, 52(a1) - slli s10, s8, 5 - addw s7, s6, s11 - add s6, s3, s10 - sh2add s8, a0, s6 - lw s10, 0(s8) - li s8, 875 - mulw s11, s9, s10 - lw s9, 56(a1) - slli s10, s8, 6 - addw s6, s7, s11 - add s7, s3, s10 - sh2add s8, a0, s7 - lw s10, 0(s8) + add s6, a7, s4 + lw s2, 44(a1) + sh2add s5, a0, s6 + lw s7, 0(s5) + mulw s6, s2, s7 + slli s2, s8, 7 + addw s4, s3, s6 li s8, 1875 - mulw s11, s9, s10 - slli s10, s8, 5 - addw s7, s6, s11 - lw s8, 60(a1) - add s9, s3, s10 - sh2add s6, a0, s9 - lw s10, 0(s6) - li s6, 992 - mulw s9, s8, s10 - addw s11, s7, s9 - addw s5, s5, s11 - bge s4, s6, label792 + add s7, a7, s2 + lw s3, 48(a1) + sh2add s5, a0, s7 + lw s6, 0(s5) + li s5, 1625 + mulw s7, s3, s6 + slli s6, s5, 5 + addw s2, s4, s7 + lw s5, 52(a1) + add s3, a7, s6 + sh2add s4, a0, s3 + lw s7, 0(s4) + li s4, 875 + mulw s6, s5, s7 + slli s7, s4, 6 + addw s3, s2, s6 + lw s4, 56(a1) + add s2, a7, s7 + sh2add s5, a0, s2 + lw s7, 0(s5) + slli s5, s8, 5 + mulw s6, s4, s7 + lw s4, 60(a1) + addw s2, s3, s6 + add s6, a7, s5 + sh2add s3, a0, s6 + lw s7, 0(s3) + mulw s6, s4, s7 + addw s5, s2, s6 + li s2, 992 + addw s1, s1, s5 + bge s0, s2, label785 addi a1, a1, 64 - j label788 + j label781 .p2align 2 -label793: - addiw s1, s1, 1 - ble a7, s1, label795 - add a6, a6, a2 - mul a1, s1, a2 +label786: + addiw t6, t6, 1 + ble t4, t6, label788 + add t3, t3, a2 + mul a1, t6, a2 mv a0, zero - mv s4, zero - mv s5, zero - mv s2, a6 - add t6, s0, a1 - mv a1, t6 - j label788 -label795: - ld s1, 0(sp) - ld s6, 8(sp) - ld s0, 16(sp) - ld s5, 24(sp) - ld s2, 32(sp) - ld s4, 40(sp) - ld s8, 48(sp) - ld s3, 56(sp) - ld s7, 64(sp) - ld s9, 72(sp) - ld s10, 80(sp) - ld s11, 88(sp) - addi sp, sp, 96 + mv s0, zero + mv s1, zero + mv a6, t3 + add t2, t5, a1 + mv a1, t2 + j label781 +label788: + ld s0, 0(sp) + ld s5, 8(sp) + ld s1, 16(sp) + ld s6, 24(sp) + ld s3, 32(sp) + ld s2, 40(sp) + ld s4, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + addi sp, sp, 72 ret .p2align 2 cmmc_parallel_body_2: mv t0, a0 mv a2, a1 addiw a4, a0, 3 -pcrel1201: +pcrel1190: auipc a5, %pcrel_hi(cmmc_parallel_body_payload_2) - ld a3, %pcrel_lo(pcrel1201)(a5) - addi a1, a5, %pcrel_lo(pcrel1201) + ld a3, %pcrel_lo(pcrel1190)(a5) + addi a1, a5, %pcrel_lo(pcrel1190) lw a0, 8(a1) - ble a2, a4, label1099 + ble a2, a4, label1088 addiw t1, t0, 15 addiw a4, a2, -3 addiw a5, a2, -18 - bge t1, a4, label1150 + bge t1, a4, label1139 sh2add a1, t0, a3 - j label1109 + j label1098 .p2align 2 -label1112: +label1101: addi a1, a1, 64 .p2align 2 -label1109: +label1098: sw a0, 0(a1) addiw t0, t0, 16 sw a0, 4(a1) @@ -1518,59 +1494,59 @@ label1109: sw a0, 52(a1) sw a0, 56(a1) sw a0, 60(a1) - bgt a5, t0, label1112 + bgt a5, t0, label1101 mv a1, t0 -label1113: - ble a4, a1, label1099 +label1102: + ble a4, a1, label1088 sh2add a5, a1, a3 -label1117: +label1106: sw a0, 0(a5) addiw a1, a1, 4 sw a0, 4(a5) sw a0, 8(a5) sw a0, 12(a5) - ble a4, a1, label1187 + ble a4, a1, label1176 addi a5, a5, 16 - j label1117 -label1187: + j label1106 +label1176: mv t0, a1 -label1099: - ble a2, t0, label1106 +label1088: + ble a2, t0, label1095 sh2add a1, t0, a3 - j 
label1102 -label1105: + j label1091 +label1094: addi a1, a1, 4 -label1102: +label1091: addiw t0, t0, 1 sw a0, 0(a1) - bgt a2, t0, label1105 -label1106: + bgt a2, t0, label1094 +label1095: ret -label1150: +label1139: mv a1, t0 mv t0, zero - j label1113 + j label1102 .p2align 2 cmmc_parallel_body_3: mv t0, a0 addiw a5, a0, 3 -pcrel1356: +pcrel1345: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - addi a2, a0, %pcrel_lo(pcrel1356) + addi a2, a0, %pcrel_lo(pcrel1345) ld a3, 8(a2) - ble a1, a5, label1244 + ble a1, a5, label1233 addiw a0, t0, 15 addiw a4, a1, -3 addiw a5, a1, -18 - bge a0, a4, label1251 + bge a0, a4, label1240 sh2add a0, t0, a3 mv t1, zero - j label1219 + j label1208 .p2align 2 -label1223: +label1212: addi a0, a0, 64 .p2align 2 -label1219: +label1208: lw t4, 0(a0) addiw t0, t0, 16 lw t5, 4(a0) @@ -1604,17 +1580,17 @@ label1219: lw t4, 60(a0) addw t2, t3, t6 addw t1, t2, t4 - bgt a5, t0, label1223 + bgt a5, t0, label1212 mv a5, t0 mv t2, t1 -label1204: - ble a4, a5, label1255 +label1193: + ble a4, a5, label1244 sh2add a0, a5, a3 mv t0, t2 - j label1213 -label1217: + j label1202 +label1206: addi a0, a0, 16 -label1213: +label1202: lw t1, 0(a0) addiw a5, a5, 4 lw t4, 4(a0) @@ -1624,39 +1600,39 @@ label1213: lw t3, 12(a0) addw t1, t2, t5 addw t0, t1, t3 - bgt a4, a5, label1217 + bgt a4, a5, label1206 mv a0, t0 mv a4, t0 mv t0, a5 -label1224: - ble a1, t0, label1333 +label1213: + ble a1, t0, label1322 sh2add a0, t0, a3 mv a3, a4 - j label1231 -label1235: + j label1220 +label1224: addi a0, a0, 4 -label1231: +label1220: lw a5, 0(a0) addiw t0, t0, 1 addw a3, a3, a5 - bgt a1, t0, label1235 -label1228: + bgt a1, t0, label1224 +label1217: amoadd.w.aqrl a1, a3, (a2) ret -label1255: +label1244: mv a0, t1 mv a4, t1 - j label1224 -label1251: + j label1213 +label1240: mv a5, t0 mv t2, zero mv t1, zero mv t0, zero - j label1204 -label1333: + j label1193 +label1322: mv a3, a0 - j label1228 -label1244: + j label1217 +label1233: mv a4, zero mv a0, zero - j label1224 + j label1213 diff --git a/tests/SysY2022/performance/matmul2.arm.s b/tests/SysY2022/performance/matmul2.arm.s index 721ecd82d..93911d3e2 100644 --- a/tests/SysY2022/performance/matmul2.arm.s +++ b/tests/SysY2022/performance/matmul2.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 b: .zero 4000000 -.align 8 +.p2align 3 c: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 12 .text diff --git a/tests/SysY2022/performance/matmul2.riscv.s b/tests/SysY2022/performance/matmul2.riscv.s index 5036d87ef..e17b7c81c 100644 --- a/tests/SysY2022/performance/matmul2.riscv.s +++ b/tests/SysY2022/performance/matmul2.riscv.s @@ -1,485 +1,473 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 b: .zero 4000000 -.align 8 +.p2align 3 c: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 12 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 16 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[16] CalleeSaved[104] - addi sp, sp, -120 -pcrel2002: + addi sp, sp, -88 +pcrel1982: auipc a0, %pcrel_hi(a) -pcrel2003: +pcrel1983: auipc a1, %pcrel_hi(cmmc_parallel_body_3) sd ra, 0(sp) - addi a2, a1, %pcrel_lo(pcrel2003) - sd s7, 8(sp) - li a1, 875 - addi s7, a0, %pcrel_lo(pcrel2002) - sd s9, 16(sp) -pcrel2004: + sd s5, 8(sp) + addi s5, a0, 
%pcrel_lo(pcrel1982) + sd s0, 16(sp) +pcrel1984: auipc a0, %pcrel_hi(c) -pcrel2005: - auipc s9, %pcrel_hi(cmmc_parallel_body_payload_2) - sd s10, 24(sp) - addi s10, s9, %pcrel_lo(pcrel2005) - sd s0, 32(sp) - addi s0, a0, %pcrel_lo(pcrel2004) - sd s5, 40(sp) -pcrel2006: + sd s7, 24(sp) +pcrel1985: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_2) + sd s8, 32(sp) + addi s8, s7, %pcrel_lo(pcrel1985) + sd s1, 40(sp) + addi s1, a0, %pcrel_lo(pcrel1984) + sd s6, 48(sp) +pcrel1986: auipc a0, %pcrel_hi(cmmc_parallel_body_2) - sd s11, 48(sp) - mv s11, zero - sd s8, 56(sp) - addi s8, a0, %pcrel_lo(pcrel2006) - sd s1, 64(sp) + sd s9, 56(sp) + addi s6, a0, %pcrel_lo(pcrel1986) + mv s9, zero li a0, 125 - sd s6, 72(sp) - slli a5, a0, 5 - slli s6, a1, 5 - slli s1, a5, 1 - sd s2, 80(sp) - sh1add s2, a5, a5 - sd s3, 88(sp) - slli s5, s2, 1 - slli s3, s1, 1 - sd s4, 96(sp) - sh2add s4, a5, a5 - sd a2, 104(sp) - sd a5, 112(sp) -label1358: + sd s2, 64(sp) + slli s0, a0, 5 + addi s2, a1, %pcrel_lo(pcrel1983) + sd s3, 72(sp) + slli s3, s0, 1 + sd s4, 80(sp) + sh1add s4, s0, s0 +label1347: li a0, 1000 - bge s11, a0, label1365 - mv a0, s7 + bge s9, a0, label1354 + mv a0, s5 jal getarray li a1, 1000 - bne a0, a1, label1363 - addiw s11, s11, 1 - ld a5, 112(sp) - add s7, s7, a5 - j label1358 + bne a0, a1, label1352 + addiw s9, s9, 1 + add s5, s5, s0 + j label1347 .p2align 2 -label1389: - ld a5, 112(sp) - add s0, s0, a5 +label1378: + add s1, s1, s0 .p2align 2 -label1384: - auipc s3, %pcrel_hi(cmmc_parallel_body_payload_3) +label1373: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) mv a0, zero li a1, 1000 - sw s4, %pcrel_lo(label1384)(s3) - sw s4, 4(s1) - sd s0, 8(s1) - ld a2, 104(sp) + sw s6, %pcrel_lo(label1373)(s5) + sw s6, 4(s3) + sd s1, 8(s3) + mv a2, s2 jal cmmcParallelFor li a0, 1000 - addiw s2, s2, 1 - lw s4, %pcrel_lo(label1384)(s3) - blt s2, a0, label1389 + addiw s4, s4, 1 + lw s6, %pcrel_lo(label1373)(s5) + blt s4, a0, label1378 li a0, 92 jal _sysy_stoptime - mv a0, s4 + mv a0, s6 jal putint mv a0, zero -label1363: +label1352: ld ra, 0(sp) - ld s7, 8(sp) - ld s9, 16(sp) - ld s10, 24(sp) - ld s0, 32(sp) - ld s5, 40(sp) - ld s11, 48(sp) - ld s8, 56(sp) - ld s1, 64(sp) - ld s6, 72(sp) - ld s2, 80(sp) - ld s3, 88(sp) - ld s4, 96(sp) - addi sp, sp, 120 + ld s5, 8(sp) + ld s0, 16(sp) + ld s7, 24(sp) + ld s8, 32(sp) + ld s1, 40(sp) + ld s6, 48(sp) + ld s9, 56(sp) + ld s2, 64(sp) + ld s3, 72(sp) + ld s4, 80(sp) + addi sp, sp, 88 ret -label1365: +label1354: li a0, 23 jal _sysy_starttime li a1, 1000 mv a0, zero -pcrel2007: +pcrel1987: auipc a3, %pcrel_hi(cmmc_parallel_body_0) - addi a2, a3, %pcrel_lo(pcrel2007) + addi a2, a3, %pcrel_lo(pcrel1987) jal cmmcParallelFor li a1, 1000 mv a0, zero -pcrel2008: +pcrel1988: auipc a3, %pcrel_hi(cmmc_parallel_body_1) - addi a2, a3, %pcrel_lo(pcrel2008) + addi a2, a3, %pcrel_lo(pcrel1988) jal cmmcParallelFor - mv s11, zero - mv s7, s0 - mv a0, s0 + mv s9, zero + mv s5, s1 + mv a0, s1 mv a1, zero lui a3, 524288 addiw a2, a3, -1 - j label1369 + j label1358 .p2align 2 -label1392: +label1381: addi a0, a0, 256 .p2align 2 -label1369: - lw a5, 0(a0) +label1358: + lw t0, 0(a0) addiw a1, a1, 64 - lw t0, 4(a0) - min a3, a2, a5 - lw t1, 8(a0) - min a4, a3, t0 - lw t2, 12(a0) - min a5, a4, t1 + lw a5, 4(a0) + min a4, a2, t0 + lw t0, 8(a0) + min a3, a4, a5 + lw a5, 12(a0) + min a2, a3, t0 lw t0, 16(a0) - min a2, a5, t2 + min a4, a2, a5 lw a5, 20(a0) - min a3, a2, t0 + min a3, a4, t0 lw t0, 24(a0) - min a4, a3, a5 - lw t1, 28(a0) - min a2, a4, t0 - lw a4, 32(a0) + min a2, a3, a5 + lw a5, 
28(a0) + min a4, a2, t0 + lw t0, 32(a0) + min a3, a4, a5 + lw t1, 36(a0) + min a2, a3, t0 + lw a3, 40(a0) min a5, a2, t1 - lw a2, 36(a0) - min a3, a5, a4 - lw a5, 40(a0) - min t0, a3, a2 - lw a3, 44(a0) - min a4, t0, a5 - lw t0, 48(a0) - min a2, a4, a3 + lw t0, 44(a0) + min a4, a5, a3 + lw a5, 48(a0) + min a2, a4, t0 lw t1, 52(a0) - min a5, a2, t0 - lw t0, 56(a0) - min a3, a5, t1 - lw t1, 60(a0) - min a4, a3, t0 - lw a5, 64(a0) - min a2, a4, t1 - lw t0, 68(a0) min a3, a2, a5 + lw t0, 56(a0) + min a4, a3, t1 + lw a3, 60(a0) + min a5, a4, t0 + lw t2, 64(a0) + min a2, a5, a3 + lw t1, 68(a0) + min t0, a2, t2 lw a5, 72(a0) - min a4, a3, t0 + min a4, t0, t1 lw t1, 76(a0) - min a2, a4, a5 + min a3, a4, a5 lw t0, 80(a0) - min a3, a2, t1 - lw a2, 84(a0) - min a5, a3, t0 - lw t1, 88(a0) - min a4, a5, a2 - lw t0, 92(a0) - min a3, a4, t1 - lw a5, 96(a0) - min a2, a3, t0 - lw t0, 100(a0) - min a4, a2, a5 - lw t1, 104(a0) + min a2, a3, t1 + lw t1, 84(a0) + min a5, a2, t0 + lw t0, 88(a0) + min a4, a5, t1 + lw t1, 92(a0) min a3, a4, t0 - lw t0, 108(a0) + lw t0, 96(a0) + min a2, a3, t1 + lw t1, 100(a0) + min a5, a2, t0 + lw a2, 104(a0) + min a4, a5, t1 + lw t1, 108(a0) + min a3, a4, a2 + lw t0, 112(a0) min a5, a3, t1 - lw a3, 112(a0) + lw t1, 116(a0) min a2, a5, t0 - lw t0, 116(a0) - min a4, a2, a3 - lw t2, 120(a0) - min a5, a4, t0 - lw t1, 124(a0) - min a3, a5, t2 - lw t0, 128(a0) - min a2, a3, t1 - lw t2, 132(a0) - min a4, a2, t0 - lw t1, 136(a0) - min a5, a4, t2 - lw t0, 140(a0) - min a3, a5, t1 - lw a5, 144(a0) + lw a5, 120(a0) + min a4, a2, t1 + lw t0, 124(a0) + min a3, a4, a5 + lw a5, 128(a0) min a2, a3, t0 - lw t1, 148(a0) + lw t1, 132(a0) min a4, a2, a5 + lw t0, 136(a0) + min a3, a4, t1 + lw t1, 140(a0) + min a5, a3, t0 + lw t0, 144(a0) + min a2, a5, t1 + lw t1, 148(a0) + min a4, a2, t0 lw t0, 152(a0) min a3, a4, t1 lw a4, 156(a0) - min a2, a3, t0 - lw t1, 160(a0) - min a5, a2, a4 - lw t0, 164(a0) - min a3, a5, t1 - lw a5, 168(a0) - min a4, a3, t0 + min a5, a3, t0 + lw t0, 160(a0) + min a2, a5, a4 + lw a5, 164(a0) + min a3, a2, t0 + lw t0, 168(a0) + min a4, a3, a5 lw t1, 172(a0) - min a2, a4, a5 - lw t0, 176(a0) + min a2, a4, t0 + lw a5, 176(a0) min a3, a2, t1 - lw a2, 180(a0) - min a5, a3, t0 - lw t0, 184(a0) - min a4, a5, a2 - lw a5, 188(a0) - min a3, a4, t0 + lw t0, 180(a0) + min a4, a3, a5 + lw a3, 184(a0) + min a2, a4, t0 + lw t1, 188(a0) + min a5, a2, a3 lw t0, 192(a0) + min a4, a5, t1 + lw a5, 196(a0) + min a3, a4, t0 + lw t0, 200(a0) min a2, a3, a5 - lw t1, 196(a0) + lw t1, 204(a0) min a4, a2, t0 - lw a5, 200(a0) - min a3, a4, t1 - lw t0, 204(a0) - min a2, a3, a5 - lw a5, 208(a0) + lw t0, 208(a0) + min a5, a4, t1 + lw t1, 212(a0) + min a3, a5, t0 + lw t0, 216(a0) + min a2, a3, t1 + lw a5, 220(a0) min a4, a2, t0 - lw t0, 212(a0) + lw t0, 224(a0) min a3, a4, a5 - lw a5, 216(a0) + lw a5, 228(a0) min a2, a3, t0 - lw t0, 220(a0) - min a4, a2, a5 - lw t1, 224(a0) - min a3, a4, t0 - lw t0, 228(a0) - min a5, a3, t1 lw t1, 232(a0) - min a2, a5, t0 - lw a5, 236(a0) - min a4, a2, t1 - lw t0, 240(a0) - min a3, a4, a5 - lw a5, 244(a0) - min a2, a3, t0 - lw t0, 248(a0) min a4, a2, a5 + lw t0, 236(a0) + min a3, a4, t1 + lw a4, 240(a0) + min a2, a3, t0 + lw t0, 244(a0) + min a5, a2, a4 + lw t1, 248(a0) + min a3, a5, t0 lw a5, 252(a0) - min a3, a4, t0 - li a4, 960 - min a2, a3, a5 - blt a1, a4, label1392 - lw a4, 256(a0) - lw t0, 260(a0) - min a3, a2, a4 - lw a5, 264(a0) - min a1, a3, t0 - lw t0, 268(a0) + min a4, a3, t1 + li a3, 960 + min a2, a4, a5 + blt a1, a3, label1381 + lw a3, 256(a0) + lw a5, 260(a0) 
+ min a1, a2, a3 + lw a2, 264(a0) min a4, a1, a5 - lw a5, 272(a0) - min a2, a4, t0 - lw a4, 276(a0) - min a3, a2, a5 - lw a5, 280(a0) - min a1, a3, a4 - lw a4, 284(a0) - min a2, a1, a5 - lw t0, 288(a0) - min a3, a2, a4 - lw a5, 292(a0) - min a1, a3, t0 - lw a4, 296(a0) - min a2, a1, a5 - lw a5, 300(a0) - min a3, a2, a4 - lw t0, 304(a0) + lw a5, 268(a0) + min a3, a4, a2 + lw t0, 272(a0) min a1, a3, a5 - lw a4, 308(a0) - min a2, a1, t0 - lw a5, 312(a0) - min a3, a2, a4 - lw t0, 316(a0) - min a1, a3, a5 - lw a4, 320(a0) + lw a3, 276(a0) min a2, a1, t0 + lw t0, 280(a0) + min a4, a2, a3 + lw a5, 284(a0) + min a1, a4, t0 + lw t0, 288(a0) + min a3, a1, a5 + lw a4, 292(a0) + min a2, a3, t0 + lw t0, 296(a0) + min a1, a2, a4 + lw a5, 300(a0) + min a3, a1, t0 + lw a1, 304(a0) + min a2, a3, a5 + lw a5, 308(a0) + min a4, a2, a1 + lw a2, 312(a0) + min a3, a4, a5 + lw a5, 316(a0) + min a1, a3, a2 + lw t0, 320(a0) + min a4, a1, a5 lw a5, 324(a0) - min a3, a2, a4 + min a3, a4, t0 lw a4, 328(a0) - min a1, a3, a5 + min a2, a3, a5 lw a5, 332(a0) - min a2, a1, a4 + min a1, a2, a4 lw a4, 336(a0) - min a3, a2, a5 + min a3, a1, a5 lw a5, 340(a0) - min a1, a3, a4 + min a2, a3, a4 lw t0, 344(a0) - min a2, a1, a5 - lw a5, 348(a0) - min a4, a2, t0 - lw a2, 352(a0) - min a3, a4, a5 + min a1, a2, a5 + lw a4, 348(a0) + min a3, a1, t0 + lw t0, 352(a0) + min a2, a3, a4 lw a5, 356(a0) - min a1, a3, a2 - lw t0, 360(a0) - min a4, a1, a5 - lw t1, 364(a0) - min a2, a4, t0 - lw a5, 368(a0) - min a3, a2, t1 - lw t0, 372(a0) - min a1, a3, a5 - lw a3, 376(a0) - min a4, a1, t0 - lw t0, 380(a0) - min a2, a4, a3 - lw a5, 384(a0) min a1, a2, t0 - lw a4, 388(a0) + lw a4, 360(a0) min a3, a1, a5 - lw a5, 392(a0) + lw a5, 364(a0) min a2, a3, a4 - lw a3, 396(a0) + lw t0, 368(a0) min a1, a2, a5 - lw a5, 400(a0) - min a4, a1, a3 - lw t0, 404(a0) + lw t1, 372(a0) + min a3, a1, t0 + lw a5, 376(a0) + min a4, a3, t1 + lw a3, 380(a0) min a2, a4, a5 + lw a5, 384(a0) + min a1, a2, a3 + lw t0, 388(a0) + min a4, a1, a5 + lw a5, 392(a0) + min a2, a4, t0 + lw a4, 396(a0) + min a3, a2, a5 + lw a5, 400(a0) + min a1, a3, a4 + lw a4, 404(a0) + min a2, a1, a5 lw a5, 408(a0) - min a3, a2, t0 + min a3, a2, a4 lw a4, 412(a0) min a1, a3, a5 mv a0, zero min a2, a1, a4 -pcrel2009: - auipc s9, %pcrel_hi(cmmc_parallel_body_payload_2) - sd s7, %pcrel_lo(pcrel2009)(s9) +pcrel1989: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_2) + sd s5, %pcrel_lo(pcrel1989)(s7) li a1, 1000 - sw a2, 8(s10) - mv a2, s8 + sw a2, 8(s8) + mv a2, s6 jal cmmcParallelFor li a0, 1000 - addiw s11, s11, 1 - bge s11, a0, label1733 - ld a5, 112(sp) + addiw s9, s9, 1 + bge s9, a0, label1722 + add s5, s5, s0 mv a1, zero lui a3, 524288 - add s7, s7, a5 + mv a0, s5 addiw a2, a3, -1 - mv a0, s7 - j label1369 -label1733: - mv a2, s0 + j label1358 +label1722: + mv a2, s1 mv a0, zero - mv a1, s0 + mv a1, s1 mv a4, zero - j label1379 + j label1368 .p2align 2 -label1391: +label1380: addi a1, a1, 64 .p2align 2 -label1379: - ld a5, 112(sp) - mul t1, a4, a5 +label1368: + mul t0, a4, s0 + li t4, 125 + li t5, 375 addiw a4, a4, 16 - add a3, s0, t1 - sh2add t0, a0, a3 - lw t2, 0(t0) - add t0, a3, a5 - subw t1, zero, t2 - sh2add t2, a0, t0 - add t0, a3, s1 - sw t1, 0(a1) - sh2add a5, a0, t0 - lw t1, 0(t2) - subw t3, zero, t1 - add t1, a3, s2 - sh2add t0, a0, t1 - sw t3, 4(a1) - add t1, a3, s3 - lw t3, 0(a5) - subw t2, zero, t3 - li t3, 125 - sw t2, 8(a1) - lw t2, 0(t0) - subw a5, zero, t2 - sh2add t2, a0, t1 - sw a5, 12(a1) + add a3, s1, t0 + sh2add t2, a0, a3 lw a5, 0(t2) - add t2, a3, s4 - subw t0, zero, a5 
- sh2add a5, a0, t2 - add t2, a3, s5 - sw t0, 16(a1) - lw t0, 0(a5) - subw t1, zero, t0 + add t2, a3, s0 + subw t1, zero, a5 sh2add t0, a0, t2 - add t2, a3, s6 - sw t1, 20(a1) - lw a5, 0(t0) + add t2, a3, s3 + sw t1, 0(a1) + lw t1, 0(t0) sh2add t0, a0, t2 - subw t1, zero, a5 - sw t1, 24(a1) - lw a5, 0(t0) - slli t0, t3, 8 - subw t1, zero, a5 - sw t1, 28(a1) - add t1, a3, t0 - sh2add t2, a0, t1 - lw a5, 0(t2) - li t2, 1125 - subw t0, zero, a5 - slli t3, t2, 5 - add t1, a3, t3 - sw t0, 32(a1) + subw a5, zero, t1 + add t2, a3, s4 + sw a5, 4(a1) + lw t1, 0(t0) + subw a5, zero, t1 + sh2add t1, a0, t2 + slli t2, t4, 7 + sw a5, 8(a1) + lw t0, 0(t1) + add t1, a3, t2 + subw a5, zero, t0 + sh2add t3, a0, t1 + sw a5, 12(a1) + lw t0, 0(t3) li t3, 625 - sh2add a5, a0, t1 - slli t1, t3, 6 - lw t2, 0(a5) - li t3, 1375 + subw a5, zero, t0 + slli t1, t3, 5 + add t4, a3, t1 + sw a5, 16(a1) + sh2add t0, a0, t4 + lw t3, 0(t0) + slli t0, t5, 6 + subw a5, zero, t3 + add t4, a3, t0 + sw a5, 20(a1) + sh2add a5, a0, t4 + li t4, 875 + lw t5, 0(a5) + slli a5, t4, 5 + subw t3, zero, t5 + add t5, a3, a5 + sw t3, 24(a1) + sh2add t3, a0, t5 + slli t5, t2, 1 + lw t6, 0(t3) + add t3, a3, t5 + subw t4, zero, t6 + sw t4, 28(a1) + sh2add t4, a0, t3 + li t3, 1125 + lw t2, 0(t4) + slli t4, t3, 5 + subw t5, zero, t2 + sw t5, 32(a1) + add t5, a3, t4 + sh2add t2, a0, t5 + lw t3, 0(t2) + slli t2, t1, 1 + subw t6, zero, t3 + add t4, a3, t2 + sh2add t3, a0, t4 + sw t6, 36(a1) + li t4, 1375 + lw t5, 0(t3) + slli t2, t4, 5 + subw t1, zero, t5 + add t3, a3, t2 + sw t1, 40(a1) + sh2add t1, a0, t3 + lw t4, 0(t1) + slli t1, t0, 1 + subw t2, zero, t4 + add t3, a3, t1 + sh2add t4, a0, t3 + sw t2, 44(a1) + li t3, 1625 + lw t2, 0(t4) + slli t1, t3, 5 subw t0, zero, t2 - sw t0, 36(a1) + sw t0, 48(a1) add t0, a3, t1 - sh2add a5, a0, t0 - slli t0, t3, 5 - lw t2, 0(a5) - li t3, 375 + slli t1, a5, 1 + sh2add t4, a0, t0 + add t0, a3, t1 + lw t2, 0(t4) + li t1, 1875 + subw t3, zero, t2 + sw t3, 52(a1) + sh2add t3, a0, t0 + slli t0, t1, 5 + lw t2, 0(t3) + subw a5, zero, t2 + sw a5, 56(a1) add a5, a3, t0 - subw t1, zero, t2 sh2add t2, a0, a5 - slli a5, t3, 7 - sw t1, 40(a1) - li t3, 1625 + li a5, 992 lw t1, 0(t2) - add t2, a3, a5 subw t0, zero, t1 - sw t0, 44(a1) - sh2add t0, a0, t2 - lw t1, 0(t0) - subw a5, zero, t1 - slli t1, t3, 5 - li t3, 875 - add t0, a3, t1 - sw a5, 48(a1) - sh2add t2, a0, t0 - slli t0, t3, 6 - lw a5, 0(t2) - subw t1, zero, a5 - sw t1, 52(a1) - add t1, a3, t0 - sh2add t2, a0, t1 - lw a5, 0(t2) - li t2, 1875 - subw t0, zero, a5 - slli a5, t2, 5 - add t1, a3, a5 - sw t0, 56(a1) - sh2add t0, a0, t1 - lw t2, 0(t0) - li t0, 992 - subw a5, zero, t2 - sw a5, 60(a1) - blt a4, t0, label1391 + sw t0, 60(a1) + blt a4, a5, label1380 li a4, 125 lui t3, 17 slli a5, a4, 9 @@ -495,7 +483,6 @@ label1379: lw a4, 0(t0) subw a5, zero, a4 slli a4, t2, 6 - lui t2, 21 add t1, a3, a4 sw a5, 68(a1) sh2add t0, a0, t1 @@ -506,1001 +493,990 @@ label1379: add a5, a3, t0 sw a4, 72(a1) sh2add a4, a0, a5 - li a5, 625 lw t1, 0(a4) - slli a4, a5, 7 + li a4, 625 subw t0, zero, t1 - add t1, a3, a4 - sw t0, 76(a1) - sh2add t0, a0, t1 - lw a5, 0(t0) - subw a4, zero, a5 - addiw a5, t2, -2016 + slli a5, a4, 7 add t1, a3, a5 - sw a4, 80(a1) + sw t0, 76(a1) sh2add t0, a0, t1 - li t1, 1375 lw a4, 0(t0) - slli t0, t1, 6 + lui t0, 21 subw a5, zero, a4 + addiw a4, t0, -2016 + add t1, a3, a4 + sw a5, 80(a1) + sh2add a5, a0, t1 + li t1, 1375 + lw t0, 0(a5) + subw a4, zero, t0 + slli t0, t1, 6 + sw a4, 84(a1) add a4, a3, t0 - sw a5, 84(a1) sh2add a5, a0, a4 lui a4, 22 - lw t1, 
0(a5) - subw t0, zero, t1 - addiw t1, a4, 1888 - add a5, a3, t1 - sw t0, 88(a1) - sh2add t0, a0, a5 + lw t2, 0(a5) + addiw t0, a4, 1888 + subw t1, zero, t2 + add a5, a3, t0 + sw t1, 88(a1) + sh2add t1, a0, a5 addiw a0, a0, 1 - lw a4, 0(t0) + lw a4, 0(t1) subw a3, zero, a4 li a4, 1000 sw a3, 92(a1) - bge a0, a4, label1383 - ld a5, 112(sp) + bge a0, a4, label1372 + add a2, a2, s0 mv a4, zero - add a2, a2, a5 mv a1, a2 - j label1379 -label1383: + j label1368 +label1372: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - mv s2, zero mv s4, zero - addi s1, a0, %pcrel_lo(label1383) - j label1384 + mv s6, zero + addi s3, a0, %pcrel_lo(label1372) + j label1373 .p2align 2 cmmc_parallel_body_0: - addi sp, sp, -64 -pcrel779: - auipc a5, %pcrel_hi(b) - li a4, 125 - addi a2, a5, %pcrel_lo(pcrel779) - sd s0, 0(sp) - slli a3, a4, 5 - mv s0, a1 - sh3add t6, a3, a3 + addi sp, sp, -24 + mv t5, a1 +pcrel772: + auipc a4, %pcrel_hi(b) + li a5, 125 + sd s1, 0(sp) + addi a2, a4, %pcrel_lo(pcrel772) + slli a3, a5, 5 + sd s0, 8(sp) sh2add t2, a3, a3 + sh1add t0, a3, a3 slli a5, a3, 1 - sd s5, 8(sp) mul a1, a0, a3 - sh1add t0, a3, a3 - slli t1, a5, 1 - sd s1, 16(sp) - add a7, a2, a1 + sd s2, 16(sp) slli t3, t0, 1 - slli t5, t1, 1 - sd s6, 24(sp) - li a2, 625 -pcrel780: + slli t1, a5, 1 + add t4, a2, a1 +pcrel773: auipc a1, %pcrel_hi(a) - slli a6, a2, 6 - sd s4, 32(sp) - addi a4, a1, %pcrel_lo(pcrel780) - sd s2, 40(sp) - li a1, 875 - sd s3, 48(sp) - slli t4, a1, 5 - sd s7, 56(sp) - mv a1, a7 - mv s1, zero + addi a4, a1, %pcrel_lo(pcrel773) + mv a1, t4 + mv t6, zero j label5 .p2align 2 label9: - li s5, 125 - slli s4, s5, 11 - lui s5, 63 - add s3, a2, s4 - addiw s4, s5, 1952 - sh2add s2, a0, s3 - lui s5, 64 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, 1856 - sh2add s2, a0, s3 - lui s5, 65 - add s3, a2, s4 - sw s1, 256(a1) - addiw s4, s5, 1760 - lw s1, 0(s2) - lui s5, 66 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 260(a1) - addiw s4, s5, 1664 - lw s1, 0(s2) - lui s5, 67 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 264(a1) - addiw s4, s5, 1568 - lw s1, 0(s2) - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 268(a1) - lui s4, 68 - lw s1, 0(s2) - sh2add s2, a0, s3 - addiw s3, s4, 1472 - sw s1, 272(a1) - lw s1, 0(s2) - sw s1, 276(a1) - add s1, a2, s3 - sh2add s2, a0, s1 - lui s1, 69 - lw s5, 0(s2) - addiw s4, s1, 1376 - add s3, a2, s4 - sh2add s2, a0, s3 - sw s5, 280(a1) - li s5, 1125 - lw s1, 0(s2) - slli s4, s5, 8 - lui s5, 71 - add s3, a2, s4 - sw s1, 284(a1) - sh2add s1, a0, s3 - lw s2, 0(s1) - addiw s1, s5, 1184 - lui s5, 72 - add s4, a2, s1 - sw s2, 288(a1) - sh2add s3, a0, s4 - addiw s4, s5, 1088 - lw s2, 0(s3) - lui s5, 73 - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 292(a1) - lw s2, 0(s1) - addiw s1, s5, 992 - lui s5, 74 - add s4, a2, s1 - sw s2, 296(a1) - sh2add s3, a0, s4 - addiw s4, s5, 896 - lw s2, 0(s3) - lui s5, 75 - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 300(a1) - lw s2, 0(s1) - addiw s1, s5, 800 - lui s5, 76 - add s4, a2, s1 - sw s2, 304(a1) - addiw s1, s5, 704 - sh2add s3, a0, s4 - lui s5, 77 - add s4, a2, s1 - lw s2, 0(s3) - sh2add s3, a0, s4 - addiw s4, s5, 608 - sw s2, 308(a1) - li s5, 625 - lw s2, 0(s3) - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 312(a1) - lw s2, 0(s1) - slli s1, s5, 9 - lui s5, 79 - add s4, a2, s1 - sw s2, 316(a1) - sh2add s3, a0, s4 - addiw s4, s5, 416 - lw s2, 0(s3) - lui s5, 80 - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 320(a1) - lw s2, 0(s1) - addiw s1, s5, 320 - lui s5, 82 - add s4, a2, s1 - sw s2, 324(a1) - lui s1, 81 - sh2add s3, a0, s4 - addiw s4, s1, 224 - lw 
s2, 0(s3) - add s3, a2, s4 - addiw s4, s5, 128 - sw s2, 328(a1) - lui s5, 83 - sh2add s2, a0, s3 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, 32 - sh2add s2, a0, s3 - lui s5, 84 - add s3, a2, s4 - sw s1, 332(a1) - addiw s4, s5, -64 - lw s1, 0(s2) - lui s5, 88 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 336(a1) - lw s1, 0(s2) - sh2add s2, a0, s3 - lui s3, 85 - sw s1, 340(a1) - lw s1, 0(s2) - addiw s2, s3, -160 - sw s1, 344(a1) - add s1, a2, s2 - li s2, 1375 - sh2add s4, a0, s1 - slli s1, s2, 8 - lw s3, 0(s4) - add s4, a2, s1 - lui s1, 87 - sw s3, 348(a1) - sh2add s3, a0, s4 - addiw s4, s1, -352 - lw s2, 0(s3) - add s3, a2, s4 - addiw s4, s5, -448 - sw s2, 352(a1) - lui s5, 89 - sh2add s2, a0, s3 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, -544 - sh2add s2, a0, s3 - lui s5, 90 - sw s1, 356(a1) - lw s1, 0(s2) - add s2, a2, s4 - addiw s4, s5, -640 - sh2add s3, a0, s2 - sw s1, 360(a1) - lui s5, 91 - lw s1, 0(s3) - add s3, a2, s4 - addiw s4, s5, -736 - sh2add s2, a0, s3 - sw s1, 364(a1) - lui s5, 92 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, -832 - sh2add s2, a0, s3 - lui s5, 93 - add s3, a2, s4 - sw s1, 368(a1) - addiw s4, s5, -928 - lw s1, 0(s2) - li s5, 375 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 372(a1) - slli s4, s5, 10 - lw s1, 0(s2) - lui s5, 95 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 376(a1) - addiw s4, s5, -1120 - lw s1, 0(s2) - lui s5, 96 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 380(a1) - addiw s4, s5, -1216 - lw s1, 0(s2) - lui s5, 97 - sw s1, 384(a1) - sh2add s1, a0, s3 - add s3, a2, s4 - lw s2, 0(s1) - addiw s4, s5, -1312 - lui s5, 98 - sw s2, 388(a1) - sh2add s2, a0, s3 - add s3, a2, s4 - lw s1, 0(s2) - sh2add s2, a0, s3 - sw s1, 392(a1) - lw s1, 0(s2) - addiw s2, s5, -1408 - lui s5, 99 - add s4, a2, s2 - sw s1, 396(a1) - sh2add s3, a0, s4 - addiw s4, s5, -1504 - lw s1, 0(s3) - lui s5, 100 - add s3, a2, s4 - sh2add s2, a0, s3 - sw s1, 400(a1) - addiw s3, s5, -1600 - lw s1, 0(s2) - add s4, a2, s3 - sh2add s2, a0, s4 - lui s4, 101 - sw s1, 404(a1) - addiw s3, s4, -1696 - lw s1, 0(s2) - add s2, a2, s3 - sw s1, 408(a1) - sh2add s1, a0, s2 + li a6, 125 + lui s0, 63 + slli t6, a6, 11 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, 1952 + lui s0, 64 + add a7, a2, a6 + sw t6, 256(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, 1856 + lui s0, 66 + add a7, a2, t6 + sw a6, 260(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 65 + sw t6, 264(a1) + addiw t6, a6, 1760 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, 1664 + lui s0, 67 + add a7, a2, a6 + sw t6, 268(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, 1568 + add a7, a2, t6 + sw a6, 272(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 68 + addiw a7, a6, 1472 + sw t6, 276(a1) + add t6, a2, a7 + sh2add a6, a0, t6 + lui t6, 69 + lw a7, 0(a6) + addiw a6, t6, 1376 + sw a7, 280(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + li a7, 1125 + lw a6, 0(t6) + slli t6, a7, 8 + add s0, a2, t6 + sw a6, 284(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 71 + addiw t6, a7, 1184 + sw a6, 288(a1) + add s0, a2, t6 + lui t6, 72 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, t6, 1088 + sw a7, 292(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + lui a7, 73 + lw a6, 0(t6) + addiw t6, a7, 992 + add s0, a2, t6 + sw a6, 296(a1) + lui t6, 74 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, t6, 896 + sw a7, 300(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + lui a7, 75 + lw a6, 0(t6) + addiw t6, a7, 800 + add s0, a2, t6 + sw a6, 304(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 76 + addiw t6, a7, 
704 + sw a6, 308(a1) + add s0, a2, t6 + lui t6, 77 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, t6, 608 + sw a7, 312(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + li a7, 625 + lw a6, 0(t6) + slli t6, a7, 9 + add s0, a2, t6 + sw a6, 316(a1) + lui t6, 79 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, t6, 416 + sw a7, 320(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + lui a7, 80 + lw a6, 0(t6) + addiw t6, a7, 320 + add s0, a2, t6 + sw a6, 324(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 81 + addiw t6, a7, 224 + sw a6, 328(a1) + add s0, a2, t6 + sh2add a7, a0, s0 + lui s0, 82 + lw a6, 0(a7) + addiw t6, s0, 128 + lui s0, 83 + add a7, a2, t6 + sw a6, 332(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, 32 + lui s0, 84 + add a7, a2, a6 + sw t6, 336(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -64 + add a7, a2, t6 + sw a6, 340(a1) + sh2add a6, a0, a7 + lui a7, 85 + lw t6, 0(a6) + addiw a6, a7, -160 + sw t6, 344(a1) + add t6, a2, a6 + li a6, 1375 + sh2add s0, a0, t6 + slli t6, a6, 8 + lw a7, 0(s0) + add s0, a2, t6 + sw a7, 348(a1) + sh2add a7, a0, s0 + lui s0, 87 + lw a6, 0(a7) + addiw t6, s0, -352 + lui s0, 88 + add a7, a2, t6 + sw a6, 352(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, -448 + lui s0, 89 + add a7, a2, a6 + sw t6, 356(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -544 + lui s0, 91 + add a7, a2, t6 + sw a6, 360(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 90 + sw t6, 364(a1) + addiw t6, a6, -640 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, -736 + lui s0, 92 + add a7, a2, a6 + sw t6, 368(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -832 + li s0, 375 + add a7, a2, t6 + sw a6, 372(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 93 + sw t6, 376(a1) + addiw t6, a6, -928 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + slli a6, s0, 10 + lui s0, 95 + add a7, a2, a6 + sw t6, 380(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -1120 + lui s0, 96 + add a7, a2, t6 + sw a6, 384(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, -1216 + lui s0, 97 + add a7, a2, a6 + sw t6, 388(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -1312 + add a7, a2, t6 + sw a6, 392(a1) + sh2add a6, a0, a7 + lui a7, 98 + lw t6, 0(a6) + addiw a6, a7, -1408 + add s0, a2, a6 + sw t6, 396(a1) + lui a6, 99 + sh2add t6, a0, s0 + lw a7, 0(t6) + addiw t6, a6, -1504 + sw a7, 400(a1) + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 100 + sw t6, 404(a1) + addiw t6, a6, -1600 + add a7, a2, t6 + sh2add a6, a0, a7 + lui a7, 101 + lw t6, 0(a6) + addiw a6, a7, -1696 + sw t6, 408(a1) + add t6, a2, a6 + sh2add a7, a0, t6 addiw a0, a0, 1 - lw a2, 0(s1) - sw a2, 412(a1) - ble s0, a0, label11 - add a7, a7, a3 - mv s1, zero - mv a1, a7 + lw a6, 0(a7) + sw a6, 412(a1) + ble t5, a0, label11 + add t4, t4, a3 + mv t6, zero + mv a1, t4 .p2align 2 label5: - mul s4, s1, a3 - li s7, 625 - addiw s1, s1, 64 - add a2, a4, s4 - sh2add s2, a0, a2 - add s4, a2, a3 - add s6, a2, t5 - sh2add s5, a0, s4 - lw s3, 0(s2) - add s2, a2, a5 - sw s3, 0(a1) - lw s3, 0(s5) - sh2add s5, a0, s2 - sw s3, 4(a1) - add s3, a2, t0 - lw s4, 0(s5) - sh2add s5, a0, s3 - add s3, a2, t1 - sw s4, 8(a1) - sh2add s4, a0, s3 - lw s2, 0(s5) - sw s2, 12(a1) - add s2, a2, t2 - lw s5, 0(s4) - sh2add s3, a0, s2 - sw s5, 16(a1) - add s5, a2, t3 - lw s4, 0(s3) - sh2add s3, a0, s5 - sw s4, 20(a1) - add s4, a2, t4 - lw s2, 0(s3) - sh2add s5, a0, s4 - sw s2, 24(a1) - sh2add s2, a0, s6 - lw s3, 0(s5) - li s6, 1375 - sw s3, 28(a1) - add s3, a2, t6 - lw s4, 0(s2) - sh2add s5, a0, s3 - add 
s2, a2, a6 - sh2add s3, a0, s2 - sw s4, 32(a1) - lw s4, 0(s5) - sw s4, 36(a1) - lw s4, 0(s3) - slli s3, s6, 5 - li s6, 375 - add s2, a2, s3 - sw s4, 40(a1) - sh2add s5, a0, s2 - slli s2, s6, 7 - lw s4, 0(s5) - add s5, a2, s2 - sh2add s3, a0, s5 - sw s4, 44(a1) - li s5, 1625 - lw s4, 0(s3) - slli s3, s5, 5 - sw s4, 48(a1) - add s4, a2, s3 - sh2add s2, a0, s4 - li s4, 875 - lw s5, 0(s2) - slli s6, s4, 6 - add s3, a2, s6 - lui s6, 17 - sh2add s2, a0, s3 - sw s5, 52(a1) - li s3, 1875 - lw s4, 0(s2) - slli s2, s3, 5 - add s5, a2, s2 - sw s4, 56(a1) - sh2add s4, a0, s5 - li s5, 125 - lw s3, 0(s4) - slli s2, s5, 9 - add s4, a2, s2 - addiw s2, s6, -1632 - sw s3, 60(a1) - li s6, 1125 - sh2add s3, a0, s4 - lw s5, 0(s3) - sw s5, 64(a1) - add s5, a2, s2 - sh2add s4, a0, s5 - lw s3, 0(s4) - slli s4, s6, 6 - add s5, a2, s4 - sw s3, 68(a1) - sh2add s2, a0, s5 - lui s5, 19 - lw s3, 0(s2) - addiw s4, s5, -1824 - slli s5, s7, 7 - lui s7, 30 - sw s3, 72(a1) - add s3, a2, s4 - add s4, a2, s5 - sh2add s2, a0, s3 - sh2add s3, a0, s4 - lw s6, 0(s2) - sw s6, 76(a1) - lui s6, 21 - lw s2, 0(s3) - addiw s4, s6, -2016 - lui s6, 24 - add s3, a2, s4 - sh2add s5, a0, s3 - sw s2, 80(a1) - li s3, 1375 - lw s2, 0(s5) - slli s5, s3, 6 - sw s2, 84(a1) - add s2, a2, s5 - sh2add s4, a0, s2 - lui s2, 22 - lw s3, 0(s4) - addiw s4, s2, 1888 - add s5, a2, s4 - sw s3, 88(a1) - li s4, 375 - sh2add s3, a0, s5 - lw s2, 0(s3) - slli s3, s4, 8 - sw s2, 92(a1) - add s2, a2, s3 - addiw s3, s6, 1696 - sh2add s5, a0, s2 - li s6, 1625 - lw s4, 0(s5) - sw s4, 96(a1) - add s4, a2, s3 - slli s3, s6, 6 - sh2add s2, a0, s4 - li s6, 875 - add s4, a2, s3 - lw s5, 0(s2) - sw s5, 100(a1) - sh2add s5, a0, s4 - lui s4, 26 - lw s2, 0(s5) - addiw s5, s4, 1504 - sw s2, 104(a1) - add s2, a2, s5 - slli s5, s6, 7 - sh2add s3, a0, s2 - li s6, 1875 - lw s4, 0(s3) - sw s4, 108(a1) - add s4, a2, s5 - lui s5, 28 - sh2add s2, a0, s4 - lw s3, 0(s2) - addiw s2, s5, 1312 - sw s3, 112(a1) - add s3, a2, s2 - sh2add s4, a0, s3 - lw s5, 0(s4) - slli s4, s6, 6 - add s2, a2, s4 - sw s5, 116(a1) - sh2add s3, a0, s2 - addiw s5, s7, 1120 - lw s6, 0(s3) - lui s7, 33 - add s2, a2, s5 - li s5, 125 - sh2add s4, a0, s2 - sw s6, 120(a1) - slli s2, s5, 10 - lui s6, 32 - lw s3, 0(s4) - sw s3, 124(a1) - add s3, a2, s2 - sh2add s4, a0, s3 - addiw s3, s6, 928 - lw s5, 0(s4) - addiw s6, s7, 832 - add s4, a2, s3 - sh2add s2, a0, s4 - sw s5, 128(a1) - add s4, a2, s6 - lw s5, 0(s2) - lui s6, 38 - sh2add s3, a0, s4 - sw s5, 132(a1) - lui s5, 34 - lw s2, 0(s3) - addiw s4, s5, 736 - add s3, a2, s4 - sw s2, 136(a1) - sh2add s2, a0, s3 - li s3, 1125 - lw s5, 0(s2) - slli s2, s3, 7 - add s4, a2, s2 - sw s5, 140(a1) - sh2add s5, a0, s4 - lui s4, 36 - lw s3, 0(s5) - addiw s5, s4, 544 - sw s3, 144(a1) - add s3, a2, s5 - lui s5, 37 - sh2add s2, a0, s3 - lw s4, 0(s2) - addiw s2, s5, 448 - add s3, a2, s2 - sw s4, 148(a1) - sh2add s4, a0, s3 - addiw s3, s6, 352 - lw s5, 0(s4) - li s6, 625 - sw s5, 152(a1) - add s5, a2, s3 - sh2add s2, a0, s5 - slli s5, s6, 8 - lw s4, 0(s2) - lui s6, 40 - add s2, a2, s5 - sh2add s3, a0, s2 - sw s4, 156(a1) - lw s4, 0(s3) - addiw s3, s6, 160 - lui s6, 41 - add s2, a2, s3 - sw s4, 160(a1) - sh2add s5, a0, s2 - addiw s2, s6, 64 - lw s4, 0(s5) - lui s6, 42 - add s3, a2, s2 - addiw s2, s6, -32 - sh2add s5, a0, s3 - sw s4, 164(a1) - li s6, 1375 - add s3, a2, s2 - lw s4, 0(s5) - sh2add s5, a0, s3 - sw s4, 168(a1) - lw s4, 0(s5) - slli s5, s6, 7 - add s3, a2, s5 - sw s4, 172(a1) - sh2add s2, a0, s3 - lui s3, 44 - lw s6, 0(s2) - addiw s4, s3, -224 - add s2, a2, s4 - sh2add s5, 
a0, s2 - sw s6, 176(a1) - lui s2, 45 - li s6, 375 - addiw s4, s2, -320 - lw s3, 0(s5) - add s5, a2, s4 - sw s3, 180(a1) - sh2add s3, a0, s5 - lui s5, 46 - lw s2, 0(s3) - addiw s3, s5, -416 - sw s2, 184(a1) - add s2, a2, s3 - sh2add s4, a0, s2 - slli s2, s6, 9 - lw s5, 0(s4) - lui s6, 49 - sw s5, 188(a1) - add s5, a2, s2 - sh2add s3, a0, s5 - lui s5, 48 - lw s4, 0(s3) - addiw s2, s5, -608 - add s3, a2, s2 - sw s4, 192(a1) - sh2add s4, a0, s3 - addiw s3, s6, -704 - lw s5, 0(s4) - add s2, a2, s3 - lui s3, 50 - sh2add s4, a0, s2 - sw s5, 196(a1) - addiw s5, s3, -800 - lw s6, 0(s4) - add s4, a2, s5 - li s5, 1625 - sh2add s2, a0, s4 - sw s6, 200(a1) - lui s6, 52 - lw s3, 0(s2) - slli s2, s5, 7 - add s4, a2, s2 - sw s3, 204(a1) - sh2add s3, a0, s4 - addiw s4, s6, -992 - lw s5, 0(s3) - lui s6, 54 - add s2, a2, s4 - lui s4, 53 - sw s5, 208(a1) - sh2add s5, a0, s2 - lw s3, 0(s5) - addiw s5, s4, -1088 - add s2, a2, s5 - sw s3, 212(a1) - addiw s5, s6, -1184 - sh2add s3, a0, s2 - lui s6, 57 - add s2, a2, s5 - lw s4, 0(s3) - sw s4, 216(a1) - sh2add s4, a0, s2 + mul a6, t6, a3 li s2, 875 - lw s3, 0(s4) - slli s4, s2, 8 - sw s3, 220(a1) - add s3, a2, s4 - sh2add s5, a0, s3 - lui s3, 56 - lw s2, 0(s5) - addiw s5, s3, -1376 - add s4, a2, s5 - sw s2, 224(a1) - addiw s5, s6, -1472 - sh2add s2, a0, s4 - li s6, 1875 - lw s3, 0(s2) - sw s3, 228(a1) - add s3, a2, s5 - sh2add s4, a0, s3 - lui s3, 58 - lw s2, 0(s4) - addiw s5, s3, -1568 - sw s2, 232(a1) - add s2, a2, s5 - sh2add s4, a0, s2 - slli s2, s6, 7 - lw s3, 0(s4) - lui s6, 60 - add s4, a2, s2 - sh2add s5, a0, s4 - sw s3, 236(a1) - addiw s4, s6, -1760 - lw s3, 0(s5) - lui s6, 61 - add s5, a2, s4 - addiw s4, s6, -1856 - sh2add s2, a0, s5 - sw s3, 240(a1) - add s5, a2, s4 - lw s3, 0(s2) - lui s4, 62 - sh2add s2, a0, s5 - sw s3, 244(a1) - lw s3, 0(s2) - addiw s2, s4, -1952 - add s5, a2, s2 - sw s3, 248(a1) - li s2, 960 - sh2add s3, a0, s5 - lw s4, 0(s3) - sw s4, 252(a1) - bge s1, s2, label9 + addiw t6, t6, 64 + add a2, a4, a6 + sh2add a7, a0, a2 + add a6, a2, a3 + sh2add s0, a0, a6 + add a6, a2, a5 + lw s1, 0(a7) + sw s1, 0(a1) + lw a7, 0(s0) + sw a7, 4(a1) + sh2add a7, a0, a6 + add a6, a2, t0 + lw s0, 0(a7) + sh2add a7, a0, a6 + sw s0, 8(a1) + lw s0, 0(a7) + add a7, a2, t1 + sh2add a6, a0, a7 + sw s0, 12(a1) + add a7, a2, t2 + lw s0, 0(a6) + sh2add s1, a0, a7 + sw s0, 16(a1) + add s0, a2, t3 + lw a6, 0(s1) + sh2add a7, a0, s0 + li s1, 875 + sw a6, 20(a1) + lw a6, 0(a7) + slli a7, s1, 5 + li s1, 1125 + sw a6, 24(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + li a6, 125 + lw a7, 0(s0) + sw a7, 28(a1) + slli a7, a6, 8 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + sw a7, 32(a1) + slli a7, s1, 5 + add a6, a2, a7 + sh2add s0, a0, a6 + li a6, 625 + lw a7, 0(s0) + sw a7, 36(a1) + slli a7, a6, 6 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + li a6, 1375 + slli s1, a6, 5 + sw a7, 40(a1) + add a7, a2, s1 + li s1, 375 + sh2add a6, a0, a7 + lw s0, 0(a6) + slli a6, s1, 7 + li s1, 1625 + add a7, a2, a6 + sw s0, 44(a1) + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 48(a1) + slli a6, s1, 5 + add a7, a2, a6 + sh2add s0, a0, a7 + slli a7, s2, 6 + lw a6, 0(s0) + li s2, 125 + add s1, a2, a7 + sh2add s0, a0, s1 + sw a6, 52(a1) + li s1, 1875 + lw a6, 0(s0) + slli a7, s1, 5 + sw a6, 56(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + slli a6, s2, 9 + lw a7, 0(s0) + lui s2, 17 + add s1, a2, a6 + sh2add s0, a0, s1 + sw a7, 60(a1) + addiw s1, s2, -1632 + lw a7, 0(s0) + lui s2, 26 + add a6, a2, s1 + sw a7, 64(a1) + sh2add a7, a0, a6 + lw s0, 0(a7) + li a7, 1125 + slli a6, a7, 6 + sw 
s0, 68(a1) + add s0, a2, a6 + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 19 + sw a6, 72(a1) + addiw a6, a7, -1824 + add s0, a2, a6 + sh2add a7, a0, s0 + lw a6, 0(a7) + li a7, 625 + slli s1, a7, 7 + sw a6, 76(a1) + add a6, a2, s1 + sh2add a7, a0, a6 + lw s0, 0(a7) + lui a7, 21 + addiw a6, a7, -2016 + sw s0, 80(a1) + add s0, a2, a6 + li a6, 1375 + sh2add s1, a0, s0 + lw a7, 0(s1) + li s1, 375 + sw a7, 84(a1) + slli a7, a6, 6 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + lui a6, 22 + sw a7, 88(a1) + addiw a7, a6, 1888 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + slli a6, s1, 8 + lui s1, 24 + add s0, a2, a6 + sw a7, 92(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + sw a6, 96(a1) + addiw a6, s1, 1696 + add a7, a2, a6 + sh2add s0, a0, a7 + li a7, 1625 + lw a6, 0(s0) + sw a6, 100(a1) + slli a6, a7, 6 + add s0, a2, a6 + addiw a6, s2, 1504 + sh2add s1, a0, s0 + li s2, 125 + add s0, a2, a6 + lw a7, 0(s1) + sw a7, 104(a1) + sh2add a7, a0, s0 + li s0, 875 + lw a6, 0(a7) + slli s1, s0, 7 + add a7, a2, s1 + lui s1, 28 + sw a6, 108(a1) + sh2add a6, a0, a7 + lw s0, 0(a6) + addiw a6, s1, 1312 + li s1, 1875 + add a7, a2, a6 + sw s0, 112(a1) + sh2add s0, a0, a7 + slli a7, s1, 6 + lw a6, 0(s0) + lui s1, 30 + add s0, a2, a7 + sw a6, 116(a1) + sh2add a6, a0, s0 + addiw s0, s1, 1120 + lw a7, 0(a6) + add a6, a2, s0 + sw a7, 120(a1) + sh2add a7, a0, a6 + lw s1, 0(a7) + slli a7, s2, 10 + li s2, 1125 + add a6, a2, a7 + sw s1, 124(a1) + sh2add s0, a0, a6 + lui s1, 32 + lw a7, 0(s0) + addiw a6, s1, 928 + lui s1, 33 + sw a7, 128(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 132(a1) + addiw a6, s1, 832 + add a7, a2, a6 + sh2add s0, a0, a7 + lui a7, 34 + lw a6, 0(s0) + sw a6, 136(a1) + addiw a6, a7, 736 + add s1, a2, a6 + slli a6, s2, 7 + sh2add s0, a0, s1 + lui s2, 60 + add s1, a2, a6 + lw a7, 0(s0) + sh2add s0, a0, s1 + lui s1, 36 + sw a7, 140(a1) + addiw a6, s1, 544 + lw a7, 0(s0) + lui s1, 37 + sw a7, 144(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 148(a1) + addiw a6, s1, 448 + lui s1, 38 + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s1, 352 + lw a6, 0(s0) + li s1, 625 + sw a6, 152(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + lw a7, 0(s0) + sw a7, 156(a1) + slli a7, s1, 8 + lui s1, 40 + add a6, a2, a7 + sh2add s0, a0, a6 + lw a7, 0(s0) + sw a7, 160(a1) + addiw a7, s1, 160 + lui s1, 41 + add a6, a2, a7 + sh2add s0, a0, a6 + addiw a6, s1, 64 + lw a7, 0(s0) + lui s1, 42 + add s0, a2, a6 + sw a7, 164(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + sw a6, 168(a1) + addiw a6, s1, -32 + li s1, 1375 + add a7, a2, a6 + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 172(a1) + slli a6, s1, 7 + lui s1, 44 + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s1, -224 + lw a6, 0(s0) + lui s1, 46 + sw a6, 176(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + lui a6, 45 + lw a7, 0(s0) + sw a7, 180(a1) + addiw a7, a6, -320 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + sw a7, 184(a1) + addiw a7, s1, -416 + lui s1, 48 + add a6, a2, a7 + sh2add s0, a0, a6 + li a6, 375 + lw a7, 0(s0) + sw a7, 188(a1) + slli a7, a6, 9 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, s1, -608 + lui s1, 50 + add s0, a2, a6 + sw a7, 192(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 49 + sw a6, 196(a1) + addiw a6, a7, -704 + add s0, a2, a6 + sh2add a7, a0, s0 + lw a6, 0(a7) + addiw a7, s1, -800 + li s1, 1625 + sw a6, 200(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + slli a6, s1, 7 + lw a7, 0(s0) + lui s1, 52 + add s0, a2, a6 + sw a7, 204(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + sw a6, 208(a1) + addiw a6, s1, 
-992 + lui s1, 53 + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s1, -1088 + lw a6, 0(s0) + lui s1, 54 + sw a6, 212(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + addiw a6, s1, -1184 + lw a7, 0(s0) + li s1, 875 + sw a7, 216(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + slli a7, s1, 8 + lw a6, 0(s0) + lui s1, 57 + add s0, a2, a7 + sw a6, 220(a1) + sh2add a6, a0, s0 + lw a7, 0(a6) + lui a6, 56 + sw a7, 224(a1) + addiw a7, a6, -1376 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + sw a7, 228(a1) + addiw a7, s1, -1472 + li s1, 1875 + add a6, a2, a7 + sh2add s0, a0, a6 + lui a6, 58 + lw a7, 0(s0) + sw a7, 232(a1) + addiw a7, a6, -1568 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + slli a6, s1, 7 + sw a7, 236(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s2, -1760 + lw a6, 0(s0) + lui s2, 61 + add s0, a2, a7 + addiw a7, s2, -1856 + sh2add s1, a0, s0 + sw a6, 240(a1) + lw a6, 0(s1) + lui s1, 62 + sw a6, 244(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + addiw a6, s1, -1952 + lw a7, 0(s0) + add s0, a2, a6 + sw a7, 248(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + li a7, 960 + sw a6, 252(a1) + bge t6, a7, label9 addi a1, a1, 256 j label5 label11: - ld s0, 0(sp) - ld s5, 8(sp) - ld s1, 16(sp) - ld s6, 24(sp) - ld s4, 32(sp) - ld s2, 40(sp) - ld s3, 48(sp) - ld s7, 56(sp) - addi sp, sp, 64 + ld s1, 0(sp) + ld s0, 8(sp) + ld s2, 16(sp) + addi sp, sp, 24 ret .p2align 2 cmmc_parallel_body_1: - addi sp, sp, -96 - mv a7, a1 -pcrel1095: - auipc a5, %pcrel_hi(c) - li a4, 125 - sd s1, 0(sp) - addi a3, a5, %pcrel_lo(pcrel1095) - slli a2, a4, 5 - mv s1, a0 - sd s6, 8(sp) - sh3add t5, a2, a2 + addi sp, sp, -72 + mv t4, a1 +pcrel1084: + auipc a4, %pcrel_hi(c) + li a5, 125 + mv t6, a0 + addi a3, a4, %pcrel_lo(pcrel1084) + slli a2, a5, 5 + sd s0, 0(sp) sh2add t1, a2, a2 + sh1add a5, a2, a2 slli a4, a2, 1 mul a1, a0, a2 - sd s0, 16(sp) - sh1add a5, a2, a2 + sd s5, 8(sp) slli t0, a4, 1 -pcrel1096: +pcrel1085: auipc a0, %pcrel_hi(a) - add a6, a3, a1 - sd s5, 24(sp) - slli t2, a5, 1 - slli t4, t0, 1 - addi s0, a0, %pcrel_lo(pcrel1096) -pcrel1097: + add t3, a3, a1 + addi t5, a0, %pcrel_lo(pcrel1085) + sd s1, 16(sp) +pcrel1086: auipc a1, %pcrel_hi(b) - li a0, 875 - sd s2, 32(sp) - addi a3, a1, %pcrel_lo(pcrel1097) - slli t3, a0, 5 - sd s4, 40(sp) - sd s8, 48(sp) - sd s3, 56(sp) - sd s7, 64(sp) - sd s9, 72(sp) - sd s10, 80(sp) - sd s11, 88(sp) - mul a1, s1, a2 - mv s2, a6 + sd s6, 24(sp) + addi a3, a1, %pcrel_lo(pcrel1086) + sd s3, 32(sp) + sd s2, 40(sp) + sd s4, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + mul a1, t6, a2 + mv a6, t3 mv a0, zero - add t6, s0, a1 - mv a1, t6 - mv s4, zero - mv s5, zero - j label788 + add t2, t5, a1 + mv a1, t2 + mv s0, zero + mv s1, zero + j label781 .p2align 2 -label792: - li s4, 125 - lui s10, 17 - slli s8, s4, 9 - lw s4, 64(a1) - add s6, s3, s8 - sh2add s7, a0, s6 - lw s8, 0(s7) - mulw s9, s4, s8 - addiw s8, s10, -1632 - addw s6, s5, s9 - li s10, 1125 - add s4, s3, s8 - lw s5, 68(a1) - sh2add s7, a0, s4 - lw s8, 0(s7) - mulw s9, s5, s8 - slli s8, s10, 6 - addw s4, s6, s9 - lui s10, 21 - add s5, s3, s8 - lw s6, 72(a1) - sh2add s7, a0, s5 - lw s8, 0(s7) - lui s7, 19 - mulw s9, s6, s8 - addiw s8, s7, -1824 - addw s5, s4, s9 - lw s7, 76(a1) - add s4, s3, s8 - sh2add s6, a0, s4 - lw s8, 0(s6) +label785: + li s0, 125 + lui s6, 17 + slli s4, s0, 9 + lw s0, 64(a1) + add s2, a7, s4 + sh2add s3, a0, s2 + lw s4, 0(s3) + mulw s5, s0, s4 + addiw s4, s6, -1632 + addw s2, s1, s5 + li s6, 1125 + add s0, a7, s4 + lw s1, 68(a1) + sh2add s3, a0, s0 + lw s5, 0(s3) + mulw s4, s1, s5 + slli s1, s6, 6 + 
addw s0, s2, s4 + lui s6, 19 + add s5, a7, s1 + lw s2, 72(a1) + sh2add s3, a0, s5 + lw s4, 0(s3) + mulw s5, s2, s4 + lw s2, 76(a1) + addiw s4, s6, -1824 + addw s1, s0, s5 li s6, 625 - mulw s9, s7, s8 - slli s8, s6, 7 - addw s4, s5, s9 - lw s6, 80(a1) - add s5, s3, s8 - sh2add s7, a0, s5 - lw s8, 0(s7) - mulw s9, s6, s8 - lw s6, 84(a1) - addiw s8, s10, -2016 - addw s5, s4, s9 - lui s10, 22 - add s4, s3, s8 - sh2add s7, a0, s4 - lw s9, 0(s7) - li s7, 1375 - mulw s8, s6, s9 - slli s9, s7, 6 - addw s4, s5, s8 - lw s7, 88(a1) - add s5, s3, s9 - sh2add s6, a0, s5 - lw s9, 0(s6) - mulw s8, s7, s9 - addiw s7, s10, 1888 - addw s5, s4, s8 - lw s4, 92(a1) - add s8, s3, s7 - sh2add s6, a0, s8 + add s0, a7, s4 + sh2add s3, a0, s0 + lw s4, 0(s3) + mulw s5, s2, s4 + slli s4, s6, 7 + addw s0, s1, s5 + lw s2, 80(a1) + lui s6, 21 + add s1, a7, s4 + sh2add s3, a0, s1 + lw s4, 0(s3) + mulw s5, s2, s4 + lw s2, 84(a1) + addiw s4, s6, -2016 + addw s1, s0, s5 + lui s6, 22 + add s0, a7, s4 + sh2add s3, a0, s0 + lw s4, 0(s3) + li s3, 1375 + mulw s5, s2, s4 + slli s4, s3, 6 + addw s0, s1, s5 + lw s3, 88(a1) + add s1, a7, s4 + sh2add s2, a0, s1 + lw s5, 0(s2) + addiw s2, s6, 1888 + mulw s4, s3, s5 + add s3, a7, s2 + addw s1, s0, s4 + lw a7, 92(a1) + sh2add s0, a0, s3 addiw a0, a0, 1 - lw s3, 0(s6) - mulw s7, s4, s3 - addw a1, s5, s7 - sw a1, 0(s2) - li a1, 1000 - bge a0, a1, label793 - addi s2, s2, 4 - mv a1, t6 - mv s4, zero - mv s5, zero + lw s2, 0(s0) + mulw s3, a7, s2 + li a7, 1000 + addw a1, s1, s3 + sw a1, 0(a6) + bge a0, a7, label786 + addi a6, a6, 4 + mv a1, t2 + mv s0, zero + mv s1, zero .p2align 2 -label788: - mul s8, s4, a2 - lw s6, 0(a1) - addiw s4, s4, 16 - add s3, a3, s8 - sh2add s7, a0, s3 - add s9, s3, a2 - sh2add s10, a0, s9 - lw s8, 0(s7) - lw s11, 4(a1) - lw s7, 0(s10) - mulw s10, s6, s8 - lw s8, 8(a1) - mulw s9, s11, s7 - add s11, s3, a4 - addw s7, s9, s10 - sh2add s10, a0, s11 - lw s9, 0(s10) - add s10, s3, a5 - mulw s11, s8, s9 - lw s9, 12(a1) - sh2add s8, a0, s10 - addw s6, s7, s11 - lw s11, 0(s8) - add s8, s3, t0 - mulw s10, s9, s11 - sh2add s9, a0, s8 - addw s7, s6, s10 - lw s6, 16(a1) - lw s10, 0(s9) - mulw s11, s6, s10 - add s10, s3, t1 - addw s8, s7, s11 - sh2add s9, a0, s10 - lw s7, 20(a1) - lw s11, 0(s9) - mulw s10, s7, s11 - add s11, s3, t3 - add s7, s3, t2 - addw s6, s8, s10 - sh2add s9, a0, s7 - lw s8, 24(a1) - lw s10, 0(s9) - mulw s9, s8, s10 - sh2add s8, a0, s11 - addw s7, s6, s9 - lw s9, 28(a1) - lw s10, 0(s8) - lw s8, 32(a1) - mulw s11, s9, s10 - add s10, s3, t4 - addw s6, s7, s11 - sh2add s9, a0, s10 - lw s10, 0(s9) - lw s9, 36(a1) - mulw s11, s8, s10 - add s10, s3, t5 - addw s7, s6, s11 - sh2add s8, a0, s10 - lw s11, 0(s8) +label781: + mul s3, s0, a2 + lw s2, 0(a1) + addiw s0, s0, 16 + add a7, a3, s3 + sh2add s5, a0, a7 + add s6, a7, a2 + sh2add s4, a0, s6 + lw s3, 0(s5) + lw s5, 4(a1) + lw s7, 0(s4) + mulw s8, s2, s3 + lw s2, 8(a1) + mulw s6, s5, s7 + add s7, a7, a4 + addw s4, s6, s8 + sh2add s5, a0, s7 + add s6, a7, a5 + lw s8, 0(s5) + sh2add s5, a0, s6 + add s6, a7, t0 + mulw s7, s2, s8 + addw s3, s4, s7 + lw s4, 12(a1) + lw s8, 0(s5) + lw s5, 16(a1) + mulw s7, s4, s8 + sh2add s4, a0, s6 + addw s2, s3, s7 + add s6, a7, t1 + lw s7, 0(s4) + lw s4, 20(a1) + mulw s8, s5, s7 + sh2add s5, a0, s6 + addw s3, s2, s8 + lw s7, 0(s5) + li s5, 375 + slli s6, s5, 6 + mulw s8, s4, s7 + lw s5, 24(a1) + addw s2, s3, s8 li s8, 625 - mulw s10, s9, s11 - lw s9, 40(a1) - addw s6, s7, s10 - slli s10, s8, 6 - add s7, s3, s10 - sh2add s8, a0, s7 - lw s10, 0(s8) + add s3, a7, s6 + sh2add s4, a0, s3 + 
lw s6, 0(s4) + li s4, 875 + mulw s7, s5, s6 + slli s6, s4, 5 + addw s3, s2, s7 + lw s4, 28(a1) + add s2, a7, s6 + sh2add s5, a0, s2 + lw s6, 0(s5) + li s5, 125 + mulw s7, s4, s6 + slli s6, s5, 8 + addw s2, s3, s7 + lw s5, 32(a1) + add s3, a7, s6 + sh2add s4, a0, s3 + lw s6, 0(s4) + li s4, 1125 + mulw s7, s5, s6 + slli s6, s4, 5 + addw s3, s2, s7 + lw s4, 36(a1) + add s2, a7, s6 + sh2add s5, a0, s2 + lw s7, 0(s5) + mulw s6, s4, s7 + lw s4, 40(a1) + slli s7, s8, 6 + addw s2, s3, s6 li s8, 1375 - mulw s11, s9, s10 - lw s9, 44(a1) - slli s10, s8, 5 - addw s7, s6, s11 - add s6, s3, s10 - sh2add s8, a0, s6 - lw s10, 0(s8) + add s3, a7, s7 + sh2add s5, a0, s3 + lw s6, 0(s5) + mulw s7, s4, s6 + slli s4, s8, 5 + addw s3, s2, s7 li s8, 375 - mulw s11, s9, s10 - lw s9, 48(a1) - slli s10, s8, 7 - addw s6, s7, s11 - add s7, s3, s10 - sh2add s8, a0, s7 - lw s10, 0(s8) - li s8, 1625 - mulw s11, s9, s10 - lw s9, 52(a1) - slli s10, s8, 5 - addw s7, s6, s11 - add s6, s3, s10 - sh2add s8, a0, s6 - lw s10, 0(s8) - li s8, 875 - mulw s11, s9, s10 - lw s9, 56(a1) - slli s10, s8, 6 - addw s6, s7, s11 - add s7, s3, s10 - sh2add s8, a0, s7 - lw s10, 0(s8) + add s6, a7, s4 + lw s2, 44(a1) + sh2add s5, a0, s6 + lw s7, 0(s5) + mulw s6, s2, s7 + slli s2, s8, 7 + addw s4, s3, s6 li s8, 1875 - mulw s11, s9, s10 - slli s10, s8, 5 - addw s7, s6, s11 - lw s8, 60(a1) - add s9, s3, s10 - sh2add s6, a0, s9 - lw s10, 0(s6) - li s6, 992 - mulw s9, s8, s10 - addw s11, s7, s9 - addw s5, s5, s11 - bge s4, s6, label792 + add s7, a7, s2 + lw s3, 48(a1) + sh2add s5, a0, s7 + lw s6, 0(s5) + li s5, 1625 + mulw s7, s3, s6 + slli s6, s5, 5 + addw s2, s4, s7 + lw s5, 52(a1) + add s3, a7, s6 + sh2add s4, a0, s3 + lw s7, 0(s4) + li s4, 875 + mulw s6, s5, s7 + slli s7, s4, 6 + addw s3, s2, s6 + lw s4, 56(a1) + add s2, a7, s7 + sh2add s5, a0, s2 + lw s7, 0(s5) + slli s5, s8, 5 + mulw s6, s4, s7 + lw s4, 60(a1) + addw s2, s3, s6 + add s6, a7, s5 + sh2add s3, a0, s6 + lw s7, 0(s3) + mulw s6, s4, s7 + addw s5, s2, s6 + li s2, 992 + addw s1, s1, s5 + bge s0, s2, label785 addi a1, a1, 64 - j label788 + j label781 .p2align 2 -label793: - addiw s1, s1, 1 - ble a7, s1, label795 - add a6, a6, a2 - mul a1, s1, a2 +label786: + addiw t6, t6, 1 + ble t4, t6, label788 + add t3, t3, a2 + mul a1, t6, a2 mv a0, zero - mv s4, zero - mv s5, zero - mv s2, a6 - add t6, s0, a1 - mv a1, t6 - j label788 -label795: - ld s1, 0(sp) - ld s6, 8(sp) - ld s0, 16(sp) - ld s5, 24(sp) - ld s2, 32(sp) - ld s4, 40(sp) - ld s8, 48(sp) - ld s3, 56(sp) - ld s7, 64(sp) - ld s9, 72(sp) - ld s10, 80(sp) - ld s11, 88(sp) - addi sp, sp, 96 + mv s0, zero + mv s1, zero + mv a6, t3 + add t2, t5, a1 + mv a1, t2 + j label781 +label788: + ld s0, 0(sp) + ld s5, 8(sp) + ld s1, 16(sp) + ld s6, 24(sp) + ld s3, 32(sp) + ld s2, 40(sp) + ld s4, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + addi sp, sp, 72 ret .p2align 2 cmmc_parallel_body_2: mv t0, a0 mv a2, a1 addiw a4, a0, 3 -pcrel1201: +pcrel1190: auipc a5, %pcrel_hi(cmmc_parallel_body_payload_2) - ld a3, %pcrel_lo(pcrel1201)(a5) - addi a1, a5, %pcrel_lo(pcrel1201) + ld a3, %pcrel_lo(pcrel1190)(a5) + addi a1, a5, %pcrel_lo(pcrel1190) lw a0, 8(a1) - ble a2, a4, label1099 + ble a2, a4, label1088 addiw t1, t0, 15 addiw a4, a2, -3 addiw a5, a2, -18 - bge t1, a4, label1150 + bge t1, a4, label1139 sh2add a1, t0, a3 - j label1109 + j label1098 .p2align 2 -label1112: +label1101: addi a1, a1, 64 .p2align 2 -label1109: +label1098: sw a0, 0(a1) addiw t0, t0, 16 sw a0, 4(a1) @@ -1518,59 +1494,59 @@ label1109: sw a0, 52(a1) sw a0, 56(a1) sw a0, 60(a1) - bgt a5, 
t0, label1112 + bgt a5, t0, label1101 mv a1, t0 -label1113: - ble a4, a1, label1099 +label1102: + ble a4, a1, label1088 sh2add a5, a1, a3 -label1117: +label1106: sw a0, 0(a5) addiw a1, a1, 4 sw a0, 4(a5) sw a0, 8(a5) sw a0, 12(a5) - ble a4, a1, label1187 + ble a4, a1, label1176 addi a5, a5, 16 - j label1117 -label1187: + j label1106 +label1176: mv t0, a1 -label1099: - ble a2, t0, label1106 +label1088: + ble a2, t0, label1095 sh2add a1, t0, a3 - j label1102 -label1105: + j label1091 +label1094: addi a1, a1, 4 -label1102: +label1091: addiw t0, t0, 1 sw a0, 0(a1) - bgt a2, t0, label1105 -label1106: + bgt a2, t0, label1094 +label1095: ret -label1150: +label1139: mv a1, t0 mv t0, zero - j label1113 + j label1102 .p2align 2 cmmc_parallel_body_3: mv t0, a0 addiw a5, a0, 3 -pcrel1356: +pcrel1345: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - addi a2, a0, %pcrel_lo(pcrel1356) + addi a2, a0, %pcrel_lo(pcrel1345) ld a3, 8(a2) - ble a1, a5, label1244 + ble a1, a5, label1233 addiw a0, t0, 15 addiw a4, a1, -3 addiw a5, a1, -18 - bge a0, a4, label1251 + bge a0, a4, label1240 sh2add a0, t0, a3 mv t1, zero - j label1219 + j label1208 .p2align 2 -label1223: +label1212: addi a0, a0, 64 .p2align 2 -label1219: +label1208: lw t4, 0(a0) addiw t0, t0, 16 lw t5, 4(a0) @@ -1604,17 +1580,17 @@ label1219: lw t4, 60(a0) addw t2, t3, t6 addw t1, t2, t4 - bgt a5, t0, label1223 + bgt a5, t0, label1212 mv a5, t0 mv t2, t1 -label1204: - ble a4, a5, label1255 +label1193: + ble a4, a5, label1244 sh2add a0, a5, a3 mv t0, t2 - j label1213 -label1217: + j label1202 +label1206: addi a0, a0, 16 -label1213: +label1202: lw t1, 0(a0) addiw a5, a5, 4 lw t4, 4(a0) @@ -1624,39 +1600,39 @@ label1213: lw t3, 12(a0) addw t1, t2, t5 addw t0, t1, t3 - bgt a4, a5, label1217 + bgt a4, a5, label1206 mv a0, t0 mv a4, t0 mv t0, a5 -label1224: - ble a1, t0, label1333 +label1213: + ble a1, t0, label1322 sh2add a0, t0, a3 mv a3, a4 - j label1231 -label1235: + j label1220 +label1224: addi a0, a0, 4 -label1231: +label1220: lw a5, 0(a0) addiw t0, t0, 1 addw a3, a3, a5 - bgt a1, t0, label1235 -label1228: + bgt a1, t0, label1224 +label1217: amoadd.w.aqrl a1, a3, (a2) ret -label1255: +label1244: mv a0, t1 mv a4, t1 - j label1224 -label1251: + j label1213 +label1240: mv a5, t0 mv t2, zero mv t1, zero mv t0, zero - j label1204 -label1333: + j label1193 +label1322: mv a3, a0 - j label1228 -label1244: + j label1217 +label1233: mv a4, zero mv a0, zero - j label1224 + j label1213 diff --git a/tests/SysY2022/performance/matmul3.arm.s b/tests/SysY2022/performance/matmul3.arm.s index 721ecd82d..93911d3e2 100644 --- a/tests/SysY2022/performance/matmul3.arm.s +++ b/tests/SysY2022/performance/matmul3.arm.s @@ -1,19 +1,19 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 b: .zero 4000000 -.align 8 +.p2align 3 c: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 8 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: .zero 12 .text diff --git a/tests/SysY2022/performance/matmul3.riscv.s b/tests/SysY2022/performance/matmul3.riscv.s index 5036d87ef..e17b7c81c 100644 --- a/tests/SysY2022/performance/matmul3.riscv.s +++ b/tests/SysY2022/performance/matmul3.riscv.s @@ -1,485 +1,473 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 4000000 -.align 8 +.p2align 3 b: .zero 4000000 -.align 8 +.p2align 3 c: .zero 4000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_2: .zero 12 -.align 8 +.p2align 3 cmmc_parallel_body_payload_3: 
.zero 16 .text .p2align 2 .globl main main: - # stack usage: CalleeArg[0] Local[0] RegSpill[16] CalleeSaved[104] - addi sp, sp, -120 -pcrel2002: + addi sp, sp, -88 +pcrel1982: auipc a0, %pcrel_hi(a) -pcrel2003: +pcrel1983: auipc a1, %pcrel_hi(cmmc_parallel_body_3) sd ra, 0(sp) - addi a2, a1, %pcrel_lo(pcrel2003) - sd s7, 8(sp) - li a1, 875 - addi s7, a0, %pcrel_lo(pcrel2002) - sd s9, 16(sp) -pcrel2004: + sd s5, 8(sp) + addi s5, a0, %pcrel_lo(pcrel1982) + sd s0, 16(sp) +pcrel1984: auipc a0, %pcrel_hi(c) -pcrel2005: - auipc s9, %pcrel_hi(cmmc_parallel_body_payload_2) - sd s10, 24(sp) - addi s10, s9, %pcrel_lo(pcrel2005) - sd s0, 32(sp) - addi s0, a0, %pcrel_lo(pcrel2004) - sd s5, 40(sp) -pcrel2006: + sd s7, 24(sp) +pcrel1985: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_2) + sd s8, 32(sp) + addi s8, s7, %pcrel_lo(pcrel1985) + sd s1, 40(sp) + addi s1, a0, %pcrel_lo(pcrel1984) + sd s6, 48(sp) +pcrel1986: auipc a0, %pcrel_hi(cmmc_parallel_body_2) - sd s11, 48(sp) - mv s11, zero - sd s8, 56(sp) - addi s8, a0, %pcrel_lo(pcrel2006) - sd s1, 64(sp) + sd s9, 56(sp) + addi s6, a0, %pcrel_lo(pcrel1986) + mv s9, zero li a0, 125 - sd s6, 72(sp) - slli a5, a0, 5 - slli s6, a1, 5 - slli s1, a5, 1 - sd s2, 80(sp) - sh1add s2, a5, a5 - sd s3, 88(sp) - slli s5, s2, 1 - slli s3, s1, 1 - sd s4, 96(sp) - sh2add s4, a5, a5 - sd a2, 104(sp) - sd a5, 112(sp) -label1358: + sd s2, 64(sp) + slli s0, a0, 5 + addi s2, a1, %pcrel_lo(pcrel1983) + sd s3, 72(sp) + slli s3, s0, 1 + sd s4, 80(sp) + sh1add s4, s0, s0 +label1347: li a0, 1000 - bge s11, a0, label1365 - mv a0, s7 + bge s9, a0, label1354 + mv a0, s5 jal getarray li a1, 1000 - bne a0, a1, label1363 - addiw s11, s11, 1 - ld a5, 112(sp) - add s7, s7, a5 - j label1358 + bne a0, a1, label1352 + addiw s9, s9, 1 + add s5, s5, s0 + j label1347 .p2align 2 -label1389: - ld a5, 112(sp) - add s0, s0, a5 +label1378: + add s1, s1, s0 .p2align 2 -label1384: - auipc s3, %pcrel_hi(cmmc_parallel_body_payload_3) +label1373: + auipc s5, %pcrel_hi(cmmc_parallel_body_payload_3) mv a0, zero li a1, 1000 - sw s4, %pcrel_lo(label1384)(s3) - sw s4, 4(s1) - sd s0, 8(s1) - ld a2, 104(sp) + sw s6, %pcrel_lo(label1373)(s5) + sw s6, 4(s3) + sd s1, 8(s3) + mv a2, s2 jal cmmcParallelFor li a0, 1000 - addiw s2, s2, 1 - lw s4, %pcrel_lo(label1384)(s3) - blt s2, a0, label1389 + addiw s4, s4, 1 + lw s6, %pcrel_lo(label1373)(s5) + blt s4, a0, label1378 li a0, 92 jal _sysy_stoptime - mv a0, s4 + mv a0, s6 jal putint mv a0, zero -label1363: +label1352: ld ra, 0(sp) - ld s7, 8(sp) - ld s9, 16(sp) - ld s10, 24(sp) - ld s0, 32(sp) - ld s5, 40(sp) - ld s11, 48(sp) - ld s8, 56(sp) - ld s1, 64(sp) - ld s6, 72(sp) - ld s2, 80(sp) - ld s3, 88(sp) - ld s4, 96(sp) - addi sp, sp, 120 + ld s5, 8(sp) + ld s0, 16(sp) + ld s7, 24(sp) + ld s8, 32(sp) + ld s1, 40(sp) + ld s6, 48(sp) + ld s9, 56(sp) + ld s2, 64(sp) + ld s3, 72(sp) + ld s4, 80(sp) + addi sp, sp, 88 ret -label1365: +label1354: li a0, 23 jal _sysy_starttime li a1, 1000 mv a0, zero -pcrel2007: +pcrel1987: auipc a3, %pcrel_hi(cmmc_parallel_body_0) - addi a2, a3, %pcrel_lo(pcrel2007) + addi a2, a3, %pcrel_lo(pcrel1987) jal cmmcParallelFor li a1, 1000 mv a0, zero -pcrel2008: +pcrel1988: auipc a3, %pcrel_hi(cmmc_parallel_body_1) - addi a2, a3, %pcrel_lo(pcrel2008) + addi a2, a3, %pcrel_lo(pcrel1988) jal cmmcParallelFor - mv s11, zero - mv s7, s0 - mv a0, s0 + mv s9, zero + mv s5, s1 + mv a0, s1 mv a1, zero lui a3, 524288 addiw a2, a3, -1 - j label1369 + j label1358 .p2align 2 -label1392: +label1381: addi a0, a0, 256 .p2align 2 -label1369: - lw a5, 0(a0) 
+label1358: + lw t0, 0(a0) addiw a1, a1, 64 - lw t0, 4(a0) - min a3, a2, a5 - lw t1, 8(a0) - min a4, a3, t0 - lw t2, 12(a0) - min a5, a4, t1 + lw a5, 4(a0) + min a4, a2, t0 + lw t0, 8(a0) + min a3, a4, a5 + lw a5, 12(a0) + min a2, a3, t0 lw t0, 16(a0) - min a2, a5, t2 + min a4, a2, a5 lw a5, 20(a0) - min a3, a2, t0 + min a3, a4, t0 lw t0, 24(a0) - min a4, a3, a5 - lw t1, 28(a0) - min a2, a4, t0 - lw a4, 32(a0) + min a2, a3, a5 + lw a5, 28(a0) + min a4, a2, t0 + lw t0, 32(a0) + min a3, a4, a5 + lw t1, 36(a0) + min a2, a3, t0 + lw a3, 40(a0) min a5, a2, t1 - lw a2, 36(a0) - min a3, a5, a4 - lw a5, 40(a0) - min t0, a3, a2 - lw a3, 44(a0) - min a4, t0, a5 - lw t0, 48(a0) - min a2, a4, a3 + lw t0, 44(a0) + min a4, a5, a3 + lw a5, 48(a0) + min a2, a4, t0 lw t1, 52(a0) - min a5, a2, t0 - lw t0, 56(a0) - min a3, a5, t1 - lw t1, 60(a0) - min a4, a3, t0 - lw a5, 64(a0) - min a2, a4, t1 - lw t0, 68(a0) min a3, a2, a5 + lw t0, 56(a0) + min a4, a3, t1 + lw a3, 60(a0) + min a5, a4, t0 + lw t2, 64(a0) + min a2, a5, a3 + lw t1, 68(a0) + min t0, a2, t2 lw a5, 72(a0) - min a4, a3, t0 + min a4, t0, t1 lw t1, 76(a0) - min a2, a4, a5 + min a3, a4, a5 lw t0, 80(a0) - min a3, a2, t1 - lw a2, 84(a0) - min a5, a3, t0 - lw t1, 88(a0) - min a4, a5, a2 - lw t0, 92(a0) - min a3, a4, t1 - lw a5, 96(a0) - min a2, a3, t0 - lw t0, 100(a0) - min a4, a2, a5 - lw t1, 104(a0) + min a2, a3, t1 + lw t1, 84(a0) + min a5, a2, t0 + lw t0, 88(a0) + min a4, a5, t1 + lw t1, 92(a0) min a3, a4, t0 - lw t0, 108(a0) + lw t0, 96(a0) + min a2, a3, t1 + lw t1, 100(a0) + min a5, a2, t0 + lw a2, 104(a0) + min a4, a5, t1 + lw t1, 108(a0) + min a3, a4, a2 + lw t0, 112(a0) min a5, a3, t1 - lw a3, 112(a0) + lw t1, 116(a0) min a2, a5, t0 - lw t0, 116(a0) - min a4, a2, a3 - lw t2, 120(a0) - min a5, a4, t0 - lw t1, 124(a0) - min a3, a5, t2 - lw t0, 128(a0) - min a2, a3, t1 - lw t2, 132(a0) - min a4, a2, t0 - lw t1, 136(a0) - min a5, a4, t2 - lw t0, 140(a0) - min a3, a5, t1 - lw a5, 144(a0) + lw a5, 120(a0) + min a4, a2, t1 + lw t0, 124(a0) + min a3, a4, a5 + lw a5, 128(a0) min a2, a3, t0 - lw t1, 148(a0) + lw t1, 132(a0) min a4, a2, a5 + lw t0, 136(a0) + min a3, a4, t1 + lw t1, 140(a0) + min a5, a3, t0 + lw t0, 144(a0) + min a2, a5, t1 + lw t1, 148(a0) + min a4, a2, t0 lw t0, 152(a0) min a3, a4, t1 lw a4, 156(a0) - min a2, a3, t0 - lw t1, 160(a0) - min a5, a2, a4 - lw t0, 164(a0) - min a3, a5, t1 - lw a5, 168(a0) - min a4, a3, t0 + min a5, a3, t0 + lw t0, 160(a0) + min a2, a5, a4 + lw a5, 164(a0) + min a3, a2, t0 + lw t0, 168(a0) + min a4, a3, a5 lw t1, 172(a0) - min a2, a4, a5 - lw t0, 176(a0) + min a2, a4, t0 + lw a5, 176(a0) min a3, a2, t1 - lw a2, 180(a0) - min a5, a3, t0 - lw t0, 184(a0) - min a4, a5, a2 - lw a5, 188(a0) - min a3, a4, t0 + lw t0, 180(a0) + min a4, a3, a5 + lw a3, 184(a0) + min a2, a4, t0 + lw t1, 188(a0) + min a5, a2, a3 lw t0, 192(a0) + min a4, a5, t1 + lw a5, 196(a0) + min a3, a4, t0 + lw t0, 200(a0) min a2, a3, a5 - lw t1, 196(a0) + lw t1, 204(a0) min a4, a2, t0 - lw a5, 200(a0) - min a3, a4, t1 - lw t0, 204(a0) - min a2, a3, a5 - lw a5, 208(a0) + lw t0, 208(a0) + min a5, a4, t1 + lw t1, 212(a0) + min a3, a5, t0 + lw t0, 216(a0) + min a2, a3, t1 + lw a5, 220(a0) min a4, a2, t0 - lw t0, 212(a0) + lw t0, 224(a0) min a3, a4, a5 - lw a5, 216(a0) + lw a5, 228(a0) min a2, a3, t0 - lw t0, 220(a0) - min a4, a2, a5 - lw t1, 224(a0) - min a3, a4, t0 - lw t0, 228(a0) - min a5, a3, t1 lw t1, 232(a0) - min a2, a5, t0 - lw a5, 236(a0) - min a4, a2, t1 - lw t0, 240(a0) - min a3, a4, a5 - lw a5, 244(a0) - min a2, a3, t0 - lw t0, 248(a0) min 
a4, a2, a5 + lw t0, 236(a0) + min a3, a4, t1 + lw a4, 240(a0) + min a2, a3, t0 + lw t0, 244(a0) + min a5, a2, a4 + lw t1, 248(a0) + min a3, a5, t0 lw a5, 252(a0) - min a3, a4, t0 - li a4, 960 - min a2, a3, a5 - blt a1, a4, label1392 - lw a4, 256(a0) - lw t0, 260(a0) - min a3, a2, a4 - lw a5, 264(a0) - min a1, a3, t0 - lw t0, 268(a0) + min a4, a3, t1 + li a3, 960 + min a2, a4, a5 + blt a1, a3, label1381 + lw a3, 256(a0) + lw a5, 260(a0) + min a1, a2, a3 + lw a2, 264(a0) min a4, a1, a5 - lw a5, 272(a0) - min a2, a4, t0 - lw a4, 276(a0) - min a3, a2, a5 - lw a5, 280(a0) - min a1, a3, a4 - lw a4, 284(a0) - min a2, a1, a5 - lw t0, 288(a0) - min a3, a2, a4 - lw a5, 292(a0) - min a1, a3, t0 - lw a4, 296(a0) - min a2, a1, a5 - lw a5, 300(a0) - min a3, a2, a4 - lw t0, 304(a0) + lw a5, 268(a0) + min a3, a4, a2 + lw t0, 272(a0) min a1, a3, a5 - lw a4, 308(a0) - min a2, a1, t0 - lw a5, 312(a0) - min a3, a2, a4 - lw t0, 316(a0) - min a1, a3, a5 - lw a4, 320(a0) + lw a3, 276(a0) min a2, a1, t0 + lw t0, 280(a0) + min a4, a2, a3 + lw a5, 284(a0) + min a1, a4, t0 + lw t0, 288(a0) + min a3, a1, a5 + lw a4, 292(a0) + min a2, a3, t0 + lw t0, 296(a0) + min a1, a2, a4 + lw a5, 300(a0) + min a3, a1, t0 + lw a1, 304(a0) + min a2, a3, a5 + lw a5, 308(a0) + min a4, a2, a1 + lw a2, 312(a0) + min a3, a4, a5 + lw a5, 316(a0) + min a1, a3, a2 + lw t0, 320(a0) + min a4, a1, a5 lw a5, 324(a0) - min a3, a2, a4 + min a3, a4, t0 lw a4, 328(a0) - min a1, a3, a5 + min a2, a3, a5 lw a5, 332(a0) - min a2, a1, a4 + min a1, a2, a4 lw a4, 336(a0) - min a3, a2, a5 + min a3, a1, a5 lw a5, 340(a0) - min a1, a3, a4 + min a2, a3, a4 lw t0, 344(a0) - min a2, a1, a5 - lw a5, 348(a0) - min a4, a2, t0 - lw a2, 352(a0) - min a3, a4, a5 + min a1, a2, a5 + lw a4, 348(a0) + min a3, a1, t0 + lw t0, 352(a0) + min a2, a3, a4 lw a5, 356(a0) - min a1, a3, a2 - lw t0, 360(a0) - min a4, a1, a5 - lw t1, 364(a0) - min a2, a4, t0 - lw a5, 368(a0) - min a3, a2, t1 - lw t0, 372(a0) - min a1, a3, a5 - lw a3, 376(a0) - min a4, a1, t0 - lw t0, 380(a0) - min a2, a4, a3 - lw a5, 384(a0) min a1, a2, t0 - lw a4, 388(a0) + lw a4, 360(a0) min a3, a1, a5 - lw a5, 392(a0) + lw a5, 364(a0) min a2, a3, a4 - lw a3, 396(a0) + lw t0, 368(a0) min a1, a2, a5 - lw a5, 400(a0) - min a4, a1, a3 - lw t0, 404(a0) + lw t1, 372(a0) + min a3, a1, t0 + lw a5, 376(a0) + min a4, a3, t1 + lw a3, 380(a0) min a2, a4, a5 + lw a5, 384(a0) + min a1, a2, a3 + lw t0, 388(a0) + min a4, a1, a5 + lw a5, 392(a0) + min a2, a4, t0 + lw a4, 396(a0) + min a3, a2, a5 + lw a5, 400(a0) + min a1, a3, a4 + lw a4, 404(a0) + min a2, a1, a5 lw a5, 408(a0) - min a3, a2, t0 + min a3, a2, a4 lw a4, 412(a0) min a1, a3, a5 mv a0, zero min a2, a1, a4 -pcrel2009: - auipc s9, %pcrel_hi(cmmc_parallel_body_payload_2) - sd s7, %pcrel_lo(pcrel2009)(s9) +pcrel1989: + auipc s7, %pcrel_hi(cmmc_parallel_body_payload_2) + sd s5, %pcrel_lo(pcrel1989)(s7) li a1, 1000 - sw a2, 8(s10) - mv a2, s8 + sw a2, 8(s8) + mv a2, s6 jal cmmcParallelFor li a0, 1000 - addiw s11, s11, 1 - bge s11, a0, label1733 - ld a5, 112(sp) + addiw s9, s9, 1 + bge s9, a0, label1722 + add s5, s5, s0 mv a1, zero lui a3, 524288 - add s7, s7, a5 + mv a0, s5 addiw a2, a3, -1 - mv a0, s7 - j label1369 -label1733: - mv a2, s0 + j label1358 +label1722: + mv a2, s1 mv a0, zero - mv a1, s0 + mv a1, s1 mv a4, zero - j label1379 + j label1368 .p2align 2 -label1391: +label1380: addi a1, a1, 64 .p2align 2 -label1379: - ld a5, 112(sp) - mul t1, a4, a5 +label1368: + mul t0, a4, s0 + li t4, 125 + li t5, 375 addiw a4, a4, 16 - add a3, s0, t1 - sh2add t0, a0, a3 - lw t2, 
0(t0) - add t0, a3, a5 - subw t1, zero, t2 - sh2add t2, a0, t0 - add t0, a3, s1 - sw t1, 0(a1) - sh2add a5, a0, t0 - lw t1, 0(t2) - subw t3, zero, t1 - add t1, a3, s2 - sh2add t0, a0, t1 - sw t3, 4(a1) - add t1, a3, s3 - lw t3, 0(a5) - subw t2, zero, t3 - li t3, 125 - sw t2, 8(a1) - lw t2, 0(t0) - subw a5, zero, t2 - sh2add t2, a0, t1 - sw a5, 12(a1) + add a3, s1, t0 + sh2add t2, a0, a3 lw a5, 0(t2) - add t2, a3, s4 - subw t0, zero, a5 - sh2add a5, a0, t2 - add t2, a3, s5 - sw t0, 16(a1) - lw t0, 0(a5) - subw t1, zero, t0 + add t2, a3, s0 + subw t1, zero, a5 sh2add t0, a0, t2 - add t2, a3, s6 - sw t1, 20(a1) - lw a5, 0(t0) + add t2, a3, s3 + sw t1, 0(a1) + lw t1, 0(t0) sh2add t0, a0, t2 - subw t1, zero, a5 - sw t1, 24(a1) - lw a5, 0(t0) - slli t0, t3, 8 - subw t1, zero, a5 - sw t1, 28(a1) - add t1, a3, t0 - sh2add t2, a0, t1 - lw a5, 0(t2) - li t2, 1125 - subw t0, zero, a5 - slli t3, t2, 5 - add t1, a3, t3 - sw t0, 32(a1) + subw a5, zero, t1 + add t2, a3, s4 + sw a5, 4(a1) + lw t1, 0(t0) + subw a5, zero, t1 + sh2add t1, a0, t2 + slli t2, t4, 7 + sw a5, 8(a1) + lw t0, 0(t1) + add t1, a3, t2 + subw a5, zero, t0 + sh2add t3, a0, t1 + sw a5, 12(a1) + lw t0, 0(t3) li t3, 625 - sh2add a5, a0, t1 - slli t1, t3, 6 - lw t2, 0(a5) - li t3, 1375 + subw a5, zero, t0 + slli t1, t3, 5 + add t4, a3, t1 + sw a5, 16(a1) + sh2add t0, a0, t4 + lw t3, 0(t0) + slli t0, t5, 6 + subw a5, zero, t3 + add t4, a3, t0 + sw a5, 20(a1) + sh2add a5, a0, t4 + li t4, 875 + lw t5, 0(a5) + slli a5, t4, 5 + subw t3, zero, t5 + add t5, a3, a5 + sw t3, 24(a1) + sh2add t3, a0, t5 + slli t5, t2, 1 + lw t6, 0(t3) + add t3, a3, t5 + subw t4, zero, t6 + sw t4, 28(a1) + sh2add t4, a0, t3 + li t3, 1125 + lw t2, 0(t4) + slli t4, t3, 5 + subw t5, zero, t2 + sw t5, 32(a1) + add t5, a3, t4 + sh2add t2, a0, t5 + lw t3, 0(t2) + slli t2, t1, 1 + subw t6, zero, t3 + add t4, a3, t2 + sh2add t3, a0, t4 + sw t6, 36(a1) + li t4, 1375 + lw t5, 0(t3) + slli t2, t4, 5 + subw t1, zero, t5 + add t3, a3, t2 + sw t1, 40(a1) + sh2add t1, a0, t3 + lw t4, 0(t1) + slli t1, t0, 1 + subw t2, zero, t4 + add t3, a3, t1 + sh2add t4, a0, t3 + sw t2, 44(a1) + li t3, 1625 + lw t2, 0(t4) + slli t1, t3, 5 subw t0, zero, t2 - sw t0, 36(a1) + sw t0, 48(a1) add t0, a3, t1 - sh2add a5, a0, t0 - slli t0, t3, 5 - lw t2, 0(a5) - li t3, 375 + slli t1, a5, 1 + sh2add t4, a0, t0 + add t0, a3, t1 + lw t2, 0(t4) + li t1, 1875 + subw t3, zero, t2 + sw t3, 52(a1) + sh2add t3, a0, t0 + slli t0, t1, 5 + lw t2, 0(t3) + subw a5, zero, t2 + sw a5, 56(a1) add a5, a3, t0 - subw t1, zero, t2 sh2add t2, a0, a5 - slli a5, t3, 7 - sw t1, 40(a1) - li t3, 1625 + li a5, 992 lw t1, 0(t2) - add t2, a3, a5 subw t0, zero, t1 - sw t0, 44(a1) - sh2add t0, a0, t2 - lw t1, 0(t0) - subw a5, zero, t1 - slli t1, t3, 5 - li t3, 875 - add t0, a3, t1 - sw a5, 48(a1) - sh2add t2, a0, t0 - slli t0, t3, 6 - lw a5, 0(t2) - subw t1, zero, a5 - sw t1, 52(a1) - add t1, a3, t0 - sh2add t2, a0, t1 - lw a5, 0(t2) - li t2, 1875 - subw t0, zero, a5 - slli a5, t2, 5 - add t1, a3, a5 - sw t0, 56(a1) - sh2add t0, a0, t1 - lw t2, 0(t0) - li t0, 992 - subw a5, zero, t2 - sw a5, 60(a1) - blt a4, t0, label1391 + sw t0, 60(a1) + blt a4, a5, label1380 li a4, 125 lui t3, 17 slli a5, a4, 9 @@ -495,7 +483,6 @@ label1379: lw a4, 0(t0) subw a5, zero, a4 slli a4, t2, 6 - lui t2, 21 add t1, a3, a4 sw a5, 68(a1) sh2add t0, a0, t1 @@ -506,1001 +493,990 @@ label1379: add a5, a3, t0 sw a4, 72(a1) sh2add a4, a0, a5 - li a5, 625 lw t1, 0(a4) - slli a4, a5, 7 + li a4, 625 subw t0, zero, t1 - add t1, a3, a4 - sw t0, 76(a1) - sh2add t0, a0, t1 
- lw a5, 0(t0) - subw a4, zero, a5 - addiw a5, t2, -2016 + slli a5, a4, 7 add t1, a3, a5 - sw a4, 80(a1) + sw t0, 76(a1) sh2add t0, a0, t1 - li t1, 1375 lw a4, 0(t0) - slli t0, t1, 6 + lui t0, 21 subw a5, zero, a4 + addiw a4, t0, -2016 + add t1, a3, a4 + sw a5, 80(a1) + sh2add a5, a0, t1 + li t1, 1375 + lw t0, 0(a5) + subw a4, zero, t0 + slli t0, t1, 6 + sw a4, 84(a1) add a4, a3, t0 - sw a5, 84(a1) sh2add a5, a0, a4 lui a4, 22 - lw t1, 0(a5) - subw t0, zero, t1 - addiw t1, a4, 1888 - add a5, a3, t1 - sw t0, 88(a1) - sh2add t0, a0, a5 + lw t2, 0(a5) + addiw t0, a4, 1888 + subw t1, zero, t2 + add a5, a3, t0 + sw t1, 88(a1) + sh2add t1, a0, a5 addiw a0, a0, 1 - lw a4, 0(t0) + lw a4, 0(t1) subw a3, zero, a4 li a4, 1000 sw a3, 92(a1) - bge a0, a4, label1383 - ld a5, 112(sp) + bge a0, a4, label1372 + add a2, a2, s0 mv a4, zero - add a2, a2, a5 mv a1, a2 - j label1379 -label1383: + j label1368 +label1372: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - mv s2, zero mv s4, zero - addi s1, a0, %pcrel_lo(label1383) - j label1384 + mv s6, zero + addi s3, a0, %pcrel_lo(label1372) + j label1373 .p2align 2 cmmc_parallel_body_0: - addi sp, sp, -64 -pcrel779: - auipc a5, %pcrel_hi(b) - li a4, 125 - addi a2, a5, %pcrel_lo(pcrel779) - sd s0, 0(sp) - slli a3, a4, 5 - mv s0, a1 - sh3add t6, a3, a3 + addi sp, sp, -24 + mv t5, a1 +pcrel772: + auipc a4, %pcrel_hi(b) + li a5, 125 + sd s1, 0(sp) + addi a2, a4, %pcrel_lo(pcrel772) + slli a3, a5, 5 + sd s0, 8(sp) sh2add t2, a3, a3 + sh1add t0, a3, a3 slli a5, a3, 1 - sd s5, 8(sp) mul a1, a0, a3 - sh1add t0, a3, a3 - slli t1, a5, 1 - sd s1, 16(sp) - add a7, a2, a1 + sd s2, 16(sp) slli t3, t0, 1 - slli t5, t1, 1 - sd s6, 24(sp) - li a2, 625 -pcrel780: + slli t1, a5, 1 + add t4, a2, a1 +pcrel773: auipc a1, %pcrel_hi(a) - slli a6, a2, 6 - sd s4, 32(sp) - addi a4, a1, %pcrel_lo(pcrel780) - sd s2, 40(sp) - li a1, 875 - sd s3, 48(sp) - slli t4, a1, 5 - sd s7, 56(sp) - mv a1, a7 - mv s1, zero + addi a4, a1, %pcrel_lo(pcrel773) + mv a1, t4 + mv t6, zero j label5 .p2align 2 label9: - li s5, 125 - slli s4, s5, 11 - lui s5, 63 - add s3, a2, s4 - addiw s4, s5, 1952 - sh2add s2, a0, s3 - lui s5, 64 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, 1856 - sh2add s2, a0, s3 - lui s5, 65 - add s3, a2, s4 - sw s1, 256(a1) - addiw s4, s5, 1760 - lw s1, 0(s2) - lui s5, 66 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 260(a1) - addiw s4, s5, 1664 - lw s1, 0(s2) - lui s5, 67 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 264(a1) - addiw s4, s5, 1568 - lw s1, 0(s2) - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 268(a1) - lui s4, 68 - lw s1, 0(s2) - sh2add s2, a0, s3 - addiw s3, s4, 1472 - sw s1, 272(a1) - lw s1, 0(s2) - sw s1, 276(a1) - add s1, a2, s3 - sh2add s2, a0, s1 - lui s1, 69 - lw s5, 0(s2) - addiw s4, s1, 1376 - add s3, a2, s4 - sh2add s2, a0, s3 - sw s5, 280(a1) - li s5, 1125 - lw s1, 0(s2) - slli s4, s5, 8 - lui s5, 71 - add s3, a2, s4 - sw s1, 284(a1) - sh2add s1, a0, s3 - lw s2, 0(s1) - addiw s1, s5, 1184 - lui s5, 72 - add s4, a2, s1 - sw s2, 288(a1) - sh2add s3, a0, s4 - addiw s4, s5, 1088 - lw s2, 0(s3) - lui s5, 73 - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 292(a1) - lw s2, 0(s1) - addiw s1, s5, 992 - lui s5, 74 - add s4, a2, s1 - sw s2, 296(a1) - sh2add s3, a0, s4 - addiw s4, s5, 896 - lw s2, 0(s3) - lui s5, 75 - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 300(a1) - lw s2, 0(s1) - addiw s1, s5, 800 - lui s5, 76 - add s4, a2, s1 - sw s2, 304(a1) - addiw s1, s5, 704 - sh2add s3, a0, s4 - lui s5, 77 - add s4, a2, s1 - lw s2, 0(s3) - sh2add s3, a0, s4 - addiw s4, s5, 608 - 
sw s2, 308(a1) - li s5, 625 - lw s2, 0(s3) - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 312(a1) - lw s2, 0(s1) - slli s1, s5, 9 - lui s5, 79 - add s4, a2, s1 - sw s2, 316(a1) - sh2add s3, a0, s4 - addiw s4, s5, 416 - lw s2, 0(s3) - lui s5, 80 - add s3, a2, s4 - sh2add s1, a0, s3 - sw s2, 320(a1) - lw s2, 0(s1) - addiw s1, s5, 320 - lui s5, 82 - add s4, a2, s1 - sw s2, 324(a1) - lui s1, 81 - sh2add s3, a0, s4 - addiw s4, s1, 224 - lw s2, 0(s3) - add s3, a2, s4 - addiw s4, s5, 128 - sw s2, 328(a1) - lui s5, 83 - sh2add s2, a0, s3 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, 32 - sh2add s2, a0, s3 - lui s5, 84 - add s3, a2, s4 - sw s1, 332(a1) - addiw s4, s5, -64 - lw s1, 0(s2) - lui s5, 88 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 336(a1) - lw s1, 0(s2) - sh2add s2, a0, s3 - lui s3, 85 - sw s1, 340(a1) - lw s1, 0(s2) - addiw s2, s3, -160 - sw s1, 344(a1) - add s1, a2, s2 - li s2, 1375 - sh2add s4, a0, s1 - slli s1, s2, 8 - lw s3, 0(s4) - add s4, a2, s1 - lui s1, 87 - sw s3, 348(a1) - sh2add s3, a0, s4 - addiw s4, s1, -352 - lw s2, 0(s3) - add s3, a2, s4 - addiw s4, s5, -448 - sw s2, 352(a1) - lui s5, 89 - sh2add s2, a0, s3 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, -544 - sh2add s2, a0, s3 - lui s5, 90 - sw s1, 356(a1) - lw s1, 0(s2) - add s2, a2, s4 - addiw s4, s5, -640 - sh2add s3, a0, s2 - sw s1, 360(a1) - lui s5, 91 - lw s1, 0(s3) - add s3, a2, s4 - addiw s4, s5, -736 - sh2add s2, a0, s3 - sw s1, 364(a1) - lui s5, 92 - add s3, a2, s4 - lw s1, 0(s2) - addiw s4, s5, -832 - sh2add s2, a0, s3 - lui s5, 93 - add s3, a2, s4 - sw s1, 368(a1) - addiw s4, s5, -928 - lw s1, 0(s2) - li s5, 375 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 372(a1) - slli s4, s5, 10 - lw s1, 0(s2) - lui s5, 95 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 376(a1) - addiw s4, s5, -1120 - lw s1, 0(s2) - lui s5, 96 - sh2add s2, a0, s3 - add s3, a2, s4 - sw s1, 380(a1) - addiw s4, s5, -1216 - lw s1, 0(s2) - lui s5, 97 - sw s1, 384(a1) - sh2add s1, a0, s3 - add s3, a2, s4 - lw s2, 0(s1) - addiw s4, s5, -1312 - lui s5, 98 - sw s2, 388(a1) - sh2add s2, a0, s3 - add s3, a2, s4 - lw s1, 0(s2) - sh2add s2, a0, s3 - sw s1, 392(a1) - lw s1, 0(s2) - addiw s2, s5, -1408 - lui s5, 99 - add s4, a2, s2 - sw s1, 396(a1) - sh2add s3, a0, s4 - addiw s4, s5, -1504 - lw s1, 0(s3) - lui s5, 100 - add s3, a2, s4 - sh2add s2, a0, s3 - sw s1, 400(a1) - addiw s3, s5, -1600 - lw s1, 0(s2) - add s4, a2, s3 - sh2add s2, a0, s4 - lui s4, 101 - sw s1, 404(a1) - addiw s3, s4, -1696 - lw s1, 0(s2) - add s2, a2, s3 - sw s1, 408(a1) - sh2add s1, a0, s2 + li a6, 125 + lui s0, 63 + slli t6, a6, 11 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, 1952 + lui s0, 64 + add a7, a2, a6 + sw t6, 256(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, 1856 + lui s0, 66 + add a7, a2, t6 + sw a6, 260(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 65 + sw t6, 264(a1) + addiw t6, a6, 1760 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, 1664 + lui s0, 67 + add a7, a2, a6 + sw t6, 268(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, 1568 + add a7, a2, t6 + sw a6, 272(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 68 + addiw a7, a6, 1472 + sw t6, 276(a1) + add t6, a2, a7 + sh2add a6, a0, t6 + lui t6, 69 + lw a7, 0(a6) + addiw a6, t6, 1376 + sw a7, 280(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + li a7, 1125 + lw a6, 0(t6) + slli t6, a7, 8 + add s0, a2, t6 + sw a6, 284(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 71 + addiw t6, a7, 1184 + sw a6, 288(a1) + add s0, a2, t6 + lui t6, 72 + sh2add a6, a0, s0 + 
lw a7, 0(a6) + addiw a6, t6, 1088 + sw a7, 292(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + lui a7, 73 + lw a6, 0(t6) + addiw t6, a7, 992 + add s0, a2, t6 + sw a6, 296(a1) + lui t6, 74 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, t6, 896 + sw a7, 300(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + lui a7, 75 + lw a6, 0(t6) + addiw t6, a7, 800 + add s0, a2, t6 + sw a6, 304(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 76 + addiw t6, a7, 704 + sw a6, 308(a1) + add s0, a2, t6 + lui t6, 77 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, t6, 608 + sw a7, 312(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + li a7, 625 + lw a6, 0(t6) + slli t6, a7, 9 + add s0, a2, t6 + sw a6, 316(a1) + lui t6, 79 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw a6, t6, 416 + sw a7, 320(a1) + add a7, a2, a6 + sh2add t6, a0, a7 + lui a7, 80 + lw a6, 0(t6) + addiw t6, a7, 320 + add s0, a2, t6 + sw a6, 324(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 81 + addiw t6, a7, 224 + sw a6, 328(a1) + add s0, a2, t6 + sh2add a7, a0, s0 + lui s0, 82 + lw a6, 0(a7) + addiw t6, s0, 128 + lui s0, 83 + add a7, a2, t6 + sw a6, 332(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, 32 + lui s0, 84 + add a7, a2, a6 + sw t6, 336(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -64 + add a7, a2, t6 + sw a6, 340(a1) + sh2add a6, a0, a7 + lui a7, 85 + lw t6, 0(a6) + addiw a6, a7, -160 + sw t6, 344(a1) + add t6, a2, a6 + li a6, 1375 + sh2add s0, a0, t6 + slli t6, a6, 8 + lw a7, 0(s0) + add s0, a2, t6 + sw a7, 348(a1) + sh2add a7, a0, s0 + lui s0, 87 + lw a6, 0(a7) + addiw t6, s0, -352 + lui s0, 88 + add a7, a2, t6 + sw a6, 352(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, -448 + lui s0, 89 + add a7, a2, a6 + sw t6, 356(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -544 + lui s0, 91 + add a7, a2, t6 + sw a6, 360(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 90 + sw t6, 364(a1) + addiw t6, a6, -640 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, -736 + lui s0, 92 + add a7, a2, a6 + sw t6, 368(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -832 + li s0, 375 + add a7, a2, t6 + sw a6, 372(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 93 + sw t6, 376(a1) + addiw t6, a6, -928 + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + slli a6, s0, 10 + lui s0, 95 + add a7, a2, a6 + sw t6, 380(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -1120 + lui s0, 96 + add a7, a2, t6 + sw a6, 384(a1) + sh2add a6, a0, a7 + lw t6, 0(a6) + addiw a6, s0, -1216 + lui s0, 97 + add a7, a2, a6 + sw t6, 388(a1) + sh2add t6, a0, a7 + lw a6, 0(t6) + addiw t6, s0, -1312 + add a7, a2, t6 + sw a6, 392(a1) + sh2add a6, a0, a7 + lui a7, 98 + lw t6, 0(a6) + addiw a6, a7, -1408 + add s0, a2, a6 + sw t6, 396(a1) + lui a6, 99 + sh2add t6, a0, s0 + lw a7, 0(t6) + addiw t6, a6, -1504 + sw a7, 400(a1) + add a7, a2, t6 + sh2add a6, a0, a7 + lw t6, 0(a6) + lui a6, 100 + sw t6, 404(a1) + addiw t6, a6, -1600 + add a7, a2, t6 + sh2add a6, a0, a7 + lui a7, 101 + lw t6, 0(a6) + addiw a6, a7, -1696 + sw t6, 408(a1) + add t6, a2, a6 + sh2add a7, a0, t6 addiw a0, a0, 1 - lw a2, 0(s1) - sw a2, 412(a1) - ble s0, a0, label11 - add a7, a7, a3 - mv s1, zero - mv a1, a7 + lw a6, 0(a7) + sw a6, 412(a1) + ble t5, a0, label11 + add t4, t4, a3 + mv t6, zero + mv a1, t4 .p2align 2 label5: - mul s4, s1, a3 - li s7, 625 - addiw s1, s1, 64 - add a2, a4, s4 - sh2add s2, a0, a2 - add s4, a2, a3 - add s6, a2, t5 - sh2add s5, a0, s4 - lw s3, 0(s2) - add s2, a2, a5 - sw s3, 0(a1) - lw s3, 0(s5) - sh2add s5, a0, s2 - sw s3, 4(a1) - add s3, a2, t0 - lw s4, 
0(s5) - sh2add s5, a0, s3 - add s3, a2, t1 - sw s4, 8(a1) - sh2add s4, a0, s3 - lw s2, 0(s5) - sw s2, 12(a1) - add s2, a2, t2 - lw s5, 0(s4) - sh2add s3, a0, s2 - sw s5, 16(a1) - add s5, a2, t3 - lw s4, 0(s3) - sh2add s3, a0, s5 - sw s4, 20(a1) - add s4, a2, t4 - lw s2, 0(s3) - sh2add s5, a0, s4 - sw s2, 24(a1) - sh2add s2, a0, s6 - lw s3, 0(s5) - li s6, 1375 - sw s3, 28(a1) - add s3, a2, t6 - lw s4, 0(s2) - sh2add s5, a0, s3 - add s2, a2, a6 - sh2add s3, a0, s2 - sw s4, 32(a1) - lw s4, 0(s5) - sw s4, 36(a1) - lw s4, 0(s3) - slli s3, s6, 5 - li s6, 375 - add s2, a2, s3 - sw s4, 40(a1) - sh2add s5, a0, s2 - slli s2, s6, 7 - lw s4, 0(s5) - add s5, a2, s2 - sh2add s3, a0, s5 - sw s4, 44(a1) - li s5, 1625 - lw s4, 0(s3) - slli s3, s5, 5 - sw s4, 48(a1) - add s4, a2, s3 - sh2add s2, a0, s4 - li s4, 875 - lw s5, 0(s2) - slli s6, s4, 6 - add s3, a2, s6 - lui s6, 17 - sh2add s2, a0, s3 - sw s5, 52(a1) - li s3, 1875 - lw s4, 0(s2) - slli s2, s3, 5 - add s5, a2, s2 - sw s4, 56(a1) - sh2add s4, a0, s5 - li s5, 125 - lw s3, 0(s4) - slli s2, s5, 9 - add s4, a2, s2 - addiw s2, s6, -1632 - sw s3, 60(a1) - li s6, 1125 - sh2add s3, a0, s4 - lw s5, 0(s3) - sw s5, 64(a1) - add s5, a2, s2 - sh2add s4, a0, s5 - lw s3, 0(s4) - slli s4, s6, 6 - add s5, a2, s4 - sw s3, 68(a1) - sh2add s2, a0, s5 - lui s5, 19 - lw s3, 0(s2) - addiw s4, s5, -1824 - slli s5, s7, 7 - lui s7, 30 - sw s3, 72(a1) - add s3, a2, s4 - add s4, a2, s5 - sh2add s2, a0, s3 - sh2add s3, a0, s4 - lw s6, 0(s2) - sw s6, 76(a1) - lui s6, 21 - lw s2, 0(s3) - addiw s4, s6, -2016 - lui s6, 24 - add s3, a2, s4 - sh2add s5, a0, s3 - sw s2, 80(a1) - li s3, 1375 - lw s2, 0(s5) - slli s5, s3, 6 - sw s2, 84(a1) - add s2, a2, s5 - sh2add s4, a0, s2 - lui s2, 22 - lw s3, 0(s4) - addiw s4, s2, 1888 - add s5, a2, s4 - sw s3, 88(a1) - li s4, 375 - sh2add s3, a0, s5 - lw s2, 0(s3) - slli s3, s4, 8 - sw s2, 92(a1) - add s2, a2, s3 - addiw s3, s6, 1696 - sh2add s5, a0, s2 - li s6, 1625 - lw s4, 0(s5) - sw s4, 96(a1) - add s4, a2, s3 - slli s3, s6, 6 - sh2add s2, a0, s4 - li s6, 875 - add s4, a2, s3 - lw s5, 0(s2) - sw s5, 100(a1) - sh2add s5, a0, s4 - lui s4, 26 - lw s2, 0(s5) - addiw s5, s4, 1504 - sw s2, 104(a1) - add s2, a2, s5 - slli s5, s6, 7 - sh2add s3, a0, s2 - li s6, 1875 - lw s4, 0(s3) - sw s4, 108(a1) - add s4, a2, s5 - lui s5, 28 - sh2add s2, a0, s4 - lw s3, 0(s2) - addiw s2, s5, 1312 - sw s3, 112(a1) - add s3, a2, s2 - sh2add s4, a0, s3 - lw s5, 0(s4) - slli s4, s6, 6 - add s2, a2, s4 - sw s5, 116(a1) - sh2add s3, a0, s2 - addiw s5, s7, 1120 - lw s6, 0(s3) - lui s7, 33 - add s2, a2, s5 - li s5, 125 - sh2add s4, a0, s2 - sw s6, 120(a1) - slli s2, s5, 10 - lui s6, 32 - lw s3, 0(s4) - sw s3, 124(a1) - add s3, a2, s2 - sh2add s4, a0, s3 - addiw s3, s6, 928 - lw s5, 0(s4) - addiw s6, s7, 832 - add s4, a2, s3 - sh2add s2, a0, s4 - sw s5, 128(a1) - add s4, a2, s6 - lw s5, 0(s2) - lui s6, 38 - sh2add s3, a0, s4 - sw s5, 132(a1) - lui s5, 34 - lw s2, 0(s3) - addiw s4, s5, 736 - add s3, a2, s4 - sw s2, 136(a1) - sh2add s2, a0, s3 - li s3, 1125 - lw s5, 0(s2) - slli s2, s3, 7 - add s4, a2, s2 - sw s5, 140(a1) - sh2add s5, a0, s4 - lui s4, 36 - lw s3, 0(s5) - addiw s5, s4, 544 - sw s3, 144(a1) - add s3, a2, s5 - lui s5, 37 - sh2add s2, a0, s3 - lw s4, 0(s2) - addiw s2, s5, 448 - add s3, a2, s2 - sw s4, 148(a1) - sh2add s4, a0, s3 - addiw s3, s6, 352 - lw s5, 0(s4) - li s6, 625 - sw s5, 152(a1) - add s5, a2, s3 - sh2add s2, a0, s5 - slli s5, s6, 8 - lw s4, 0(s2) - lui s6, 40 - add s2, a2, s5 - sh2add s3, a0, s2 - sw s4, 156(a1) - lw s4, 0(s3) - addiw s3, s6, 160 - 
lui s6, 41 - add s2, a2, s3 - sw s4, 160(a1) - sh2add s5, a0, s2 - addiw s2, s6, 64 - lw s4, 0(s5) - lui s6, 42 - add s3, a2, s2 - addiw s2, s6, -32 - sh2add s5, a0, s3 - sw s4, 164(a1) - li s6, 1375 - add s3, a2, s2 - lw s4, 0(s5) - sh2add s5, a0, s3 - sw s4, 168(a1) - lw s4, 0(s5) - slli s5, s6, 7 - add s3, a2, s5 - sw s4, 172(a1) - sh2add s2, a0, s3 - lui s3, 44 - lw s6, 0(s2) - addiw s4, s3, -224 - add s2, a2, s4 - sh2add s5, a0, s2 - sw s6, 176(a1) - lui s2, 45 - li s6, 375 - addiw s4, s2, -320 - lw s3, 0(s5) - add s5, a2, s4 - sw s3, 180(a1) - sh2add s3, a0, s5 - lui s5, 46 - lw s2, 0(s3) - addiw s3, s5, -416 - sw s2, 184(a1) - add s2, a2, s3 - sh2add s4, a0, s2 - slli s2, s6, 9 - lw s5, 0(s4) - lui s6, 49 - sw s5, 188(a1) - add s5, a2, s2 - sh2add s3, a0, s5 - lui s5, 48 - lw s4, 0(s3) - addiw s2, s5, -608 - add s3, a2, s2 - sw s4, 192(a1) - sh2add s4, a0, s3 - addiw s3, s6, -704 - lw s5, 0(s4) - add s2, a2, s3 - lui s3, 50 - sh2add s4, a0, s2 - sw s5, 196(a1) - addiw s5, s3, -800 - lw s6, 0(s4) - add s4, a2, s5 - li s5, 1625 - sh2add s2, a0, s4 - sw s6, 200(a1) - lui s6, 52 - lw s3, 0(s2) - slli s2, s5, 7 - add s4, a2, s2 - sw s3, 204(a1) - sh2add s3, a0, s4 - addiw s4, s6, -992 - lw s5, 0(s3) - lui s6, 54 - add s2, a2, s4 - lui s4, 53 - sw s5, 208(a1) - sh2add s5, a0, s2 - lw s3, 0(s5) - addiw s5, s4, -1088 - add s2, a2, s5 - sw s3, 212(a1) - addiw s5, s6, -1184 - sh2add s3, a0, s2 - lui s6, 57 - add s2, a2, s5 - lw s4, 0(s3) - sw s4, 216(a1) - sh2add s4, a0, s2 + mul a6, t6, a3 li s2, 875 - lw s3, 0(s4) - slli s4, s2, 8 - sw s3, 220(a1) - add s3, a2, s4 - sh2add s5, a0, s3 - lui s3, 56 - lw s2, 0(s5) - addiw s5, s3, -1376 - add s4, a2, s5 - sw s2, 224(a1) - addiw s5, s6, -1472 - sh2add s2, a0, s4 - li s6, 1875 - lw s3, 0(s2) - sw s3, 228(a1) - add s3, a2, s5 - sh2add s4, a0, s3 - lui s3, 58 - lw s2, 0(s4) - addiw s5, s3, -1568 - sw s2, 232(a1) - add s2, a2, s5 - sh2add s4, a0, s2 - slli s2, s6, 7 - lw s3, 0(s4) - lui s6, 60 - add s4, a2, s2 - sh2add s5, a0, s4 - sw s3, 236(a1) - addiw s4, s6, -1760 - lw s3, 0(s5) - lui s6, 61 - add s5, a2, s4 - addiw s4, s6, -1856 - sh2add s2, a0, s5 - sw s3, 240(a1) - add s5, a2, s4 - lw s3, 0(s2) - lui s4, 62 - sh2add s2, a0, s5 - sw s3, 244(a1) - lw s3, 0(s2) - addiw s2, s4, -1952 - add s5, a2, s2 - sw s3, 248(a1) - li s2, 960 - sh2add s3, a0, s5 - lw s4, 0(s3) - sw s4, 252(a1) - bge s1, s2, label9 + addiw t6, t6, 64 + add a2, a4, a6 + sh2add a7, a0, a2 + add a6, a2, a3 + sh2add s0, a0, a6 + add a6, a2, a5 + lw s1, 0(a7) + sw s1, 0(a1) + lw a7, 0(s0) + sw a7, 4(a1) + sh2add a7, a0, a6 + add a6, a2, t0 + lw s0, 0(a7) + sh2add a7, a0, a6 + sw s0, 8(a1) + lw s0, 0(a7) + add a7, a2, t1 + sh2add a6, a0, a7 + sw s0, 12(a1) + add a7, a2, t2 + lw s0, 0(a6) + sh2add s1, a0, a7 + sw s0, 16(a1) + add s0, a2, t3 + lw a6, 0(s1) + sh2add a7, a0, s0 + li s1, 875 + sw a6, 20(a1) + lw a6, 0(a7) + slli a7, s1, 5 + li s1, 1125 + sw a6, 24(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + li a6, 125 + lw a7, 0(s0) + sw a7, 28(a1) + slli a7, a6, 8 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + sw a7, 32(a1) + slli a7, s1, 5 + add a6, a2, a7 + sh2add s0, a0, a6 + li a6, 625 + lw a7, 0(s0) + sw a7, 36(a1) + slli a7, a6, 6 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + li a6, 1375 + slli s1, a6, 5 + sw a7, 40(a1) + add a7, a2, s1 + li s1, 375 + sh2add a6, a0, a7 + lw s0, 0(a6) + slli a6, s1, 7 + li s1, 1625 + add a7, a2, a6 + sw s0, 44(a1) + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 48(a1) + slli a6, s1, 5 + add a7, a2, a6 + sh2add s0, a0, a7 + slli a7, s2, 6 
+ lw a6, 0(s0) + li s2, 125 + add s1, a2, a7 + sh2add s0, a0, s1 + sw a6, 52(a1) + li s1, 1875 + lw a6, 0(s0) + slli a7, s1, 5 + sw a6, 56(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + slli a6, s2, 9 + lw a7, 0(s0) + lui s2, 17 + add s1, a2, a6 + sh2add s0, a0, s1 + sw a7, 60(a1) + addiw s1, s2, -1632 + lw a7, 0(s0) + lui s2, 26 + add a6, a2, s1 + sw a7, 64(a1) + sh2add a7, a0, a6 + lw s0, 0(a7) + li a7, 1125 + slli a6, a7, 6 + sw s0, 68(a1) + add s0, a2, a6 + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 19 + sw a6, 72(a1) + addiw a6, a7, -1824 + add s0, a2, a6 + sh2add a7, a0, s0 + lw a6, 0(a7) + li a7, 625 + slli s1, a7, 7 + sw a6, 76(a1) + add a6, a2, s1 + sh2add a7, a0, a6 + lw s0, 0(a7) + lui a7, 21 + addiw a6, a7, -2016 + sw s0, 80(a1) + add s0, a2, a6 + li a6, 1375 + sh2add s1, a0, s0 + lw a7, 0(s1) + li s1, 375 + sw a7, 84(a1) + slli a7, a6, 6 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + lui a6, 22 + sw a7, 88(a1) + addiw a7, a6, 1888 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + slli a6, s1, 8 + lui s1, 24 + add s0, a2, a6 + sw a7, 92(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + sw a6, 96(a1) + addiw a6, s1, 1696 + add a7, a2, a6 + sh2add s0, a0, a7 + li a7, 1625 + lw a6, 0(s0) + sw a6, 100(a1) + slli a6, a7, 6 + add s0, a2, a6 + addiw a6, s2, 1504 + sh2add s1, a0, s0 + li s2, 125 + add s0, a2, a6 + lw a7, 0(s1) + sw a7, 104(a1) + sh2add a7, a0, s0 + li s0, 875 + lw a6, 0(a7) + slli s1, s0, 7 + add a7, a2, s1 + lui s1, 28 + sw a6, 108(a1) + sh2add a6, a0, a7 + lw s0, 0(a6) + addiw a6, s1, 1312 + li s1, 1875 + add a7, a2, a6 + sw s0, 112(a1) + sh2add s0, a0, a7 + slli a7, s1, 6 + lw a6, 0(s0) + lui s1, 30 + add s0, a2, a7 + sw a6, 116(a1) + sh2add a6, a0, s0 + addiw s0, s1, 1120 + lw a7, 0(a6) + add a6, a2, s0 + sw a7, 120(a1) + sh2add a7, a0, a6 + lw s1, 0(a7) + slli a7, s2, 10 + li s2, 1125 + add a6, a2, a7 + sw s1, 124(a1) + sh2add s0, a0, a6 + lui s1, 32 + lw a7, 0(s0) + addiw a6, s1, 928 + lui s1, 33 + sw a7, 128(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 132(a1) + addiw a6, s1, 832 + add a7, a2, a6 + sh2add s0, a0, a7 + lui a7, 34 + lw a6, 0(s0) + sw a6, 136(a1) + addiw a6, a7, 736 + add s1, a2, a6 + slli a6, s2, 7 + sh2add s0, a0, s1 + lui s2, 60 + add s1, a2, a6 + lw a7, 0(s0) + sh2add s0, a0, s1 + lui s1, 36 + sw a7, 140(a1) + addiw a6, s1, 544 + lw a7, 0(s0) + lui s1, 37 + sw a7, 144(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 148(a1) + addiw a6, s1, 448 + lui s1, 38 + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s1, 352 + lw a6, 0(s0) + li s1, 625 + sw a6, 152(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + lw a7, 0(s0) + sw a7, 156(a1) + slli a7, s1, 8 + lui s1, 40 + add a6, a2, a7 + sh2add s0, a0, a6 + lw a7, 0(s0) + sw a7, 160(a1) + addiw a7, s1, 160 + lui s1, 41 + add a6, a2, a7 + sh2add s0, a0, a6 + addiw a6, s1, 64 + lw a7, 0(s0) + lui s1, 42 + add s0, a2, a6 + sw a7, 164(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + sw a6, 168(a1) + addiw a6, s1, -32 + li s1, 1375 + add a7, a2, a6 + sh2add s0, a0, a7 + lw a6, 0(s0) + sw a6, 172(a1) + slli a6, s1, 7 + lui s1, 44 + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s1, -224 + lw a6, 0(s0) + lui s1, 46 + sw a6, 176(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + lui a6, 45 + lw a7, 0(s0) + sw a7, 180(a1) + addiw a7, a6, -320 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + sw a7, 184(a1) + addiw a7, s1, -416 + lui s1, 48 + add a6, a2, a7 + sh2add s0, a0, a6 + li a6, 375 + lw a7, 0(s0) + sw a7, 188(a1) + slli a7, a6, 9 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + addiw 
a6, s1, -608 + lui s1, 50 + add s0, a2, a6 + sw a7, 192(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + lui a7, 49 + sw a6, 196(a1) + addiw a6, a7, -704 + add s0, a2, a6 + sh2add a7, a0, s0 + lw a6, 0(a7) + addiw a7, s1, -800 + li s1, 1625 + sw a6, 200(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + slli a6, s1, 7 + lw a7, 0(s0) + lui s1, 52 + add s0, a2, a6 + sw a7, 204(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + sw a6, 208(a1) + addiw a6, s1, -992 + lui s1, 53 + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s1, -1088 + lw a6, 0(s0) + lui s1, 54 + sw a6, 212(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + addiw a6, s1, -1184 + lw a7, 0(s0) + li s1, 875 + sw a7, 216(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + slli a7, s1, 8 + lw a6, 0(s0) + lui s1, 57 + add s0, a2, a7 + sw a6, 220(a1) + sh2add a6, a0, s0 + lw a7, 0(a6) + lui a6, 56 + sw a7, 224(a1) + addiw a7, a6, -1376 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + sw a7, 228(a1) + addiw a7, s1, -1472 + li s1, 1875 + add a6, a2, a7 + sh2add s0, a0, a6 + lui a6, 58 + lw a7, 0(s0) + sw a7, 232(a1) + addiw a7, a6, -1568 + add s0, a2, a7 + sh2add a6, a0, s0 + lw a7, 0(a6) + slli a6, s1, 7 + sw a7, 236(a1) + add a7, a2, a6 + sh2add s0, a0, a7 + addiw a7, s2, -1760 + lw a6, 0(s0) + lui s2, 61 + add s0, a2, a7 + addiw a7, s2, -1856 + sh2add s1, a0, s0 + sw a6, 240(a1) + lw a6, 0(s1) + lui s1, 62 + sw a6, 244(a1) + add a6, a2, a7 + sh2add s0, a0, a6 + addiw a6, s1, -1952 + lw a7, 0(s0) + add s0, a2, a6 + sw a7, 248(a1) + sh2add a7, a0, s0 + lw a6, 0(a7) + li a7, 960 + sw a6, 252(a1) + bge t6, a7, label9 addi a1, a1, 256 j label5 label11: - ld s0, 0(sp) - ld s5, 8(sp) - ld s1, 16(sp) - ld s6, 24(sp) - ld s4, 32(sp) - ld s2, 40(sp) - ld s3, 48(sp) - ld s7, 56(sp) - addi sp, sp, 64 + ld s1, 0(sp) + ld s0, 8(sp) + ld s2, 16(sp) + addi sp, sp, 24 ret .p2align 2 cmmc_parallel_body_1: - addi sp, sp, -96 - mv a7, a1 -pcrel1095: - auipc a5, %pcrel_hi(c) - li a4, 125 - sd s1, 0(sp) - addi a3, a5, %pcrel_lo(pcrel1095) - slli a2, a4, 5 - mv s1, a0 - sd s6, 8(sp) - sh3add t5, a2, a2 + addi sp, sp, -72 + mv t4, a1 +pcrel1084: + auipc a4, %pcrel_hi(c) + li a5, 125 + mv t6, a0 + addi a3, a4, %pcrel_lo(pcrel1084) + slli a2, a5, 5 + sd s0, 0(sp) sh2add t1, a2, a2 + sh1add a5, a2, a2 slli a4, a2, 1 mul a1, a0, a2 - sd s0, 16(sp) - sh1add a5, a2, a2 + sd s5, 8(sp) slli t0, a4, 1 -pcrel1096: +pcrel1085: auipc a0, %pcrel_hi(a) - add a6, a3, a1 - sd s5, 24(sp) - slli t2, a5, 1 - slli t4, t0, 1 - addi s0, a0, %pcrel_lo(pcrel1096) -pcrel1097: + add t3, a3, a1 + addi t5, a0, %pcrel_lo(pcrel1085) + sd s1, 16(sp) +pcrel1086: auipc a1, %pcrel_hi(b) - li a0, 875 - sd s2, 32(sp) - addi a3, a1, %pcrel_lo(pcrel1097) - slli t3, a0, 5 - sd s4, 40(sp) - sd s8, 48(sp) - sd s3, 56(sp) - sd s7, 64(sp) - sd s9, 72(sp) - sd s10, 80(sp) - sd s11, 88(sp) - mul a1, s1, a2 - mv s2, a6 + sd s6, 24(sp) + addi a3, a1, %pcrel_lo(pcrel1086) + sd s3, 32(sp) + sd s2, 40(sp) + sd s4, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + mul a1, t6, a2 + mv a6, t3 mv a0, zero - add t6, s0, a1 - mv a1, t6 - mv s4, zero - mv s5, zero - j label788 + add t2, t5, a1 + mv a1, t2 + mv s0, zero + mv s1, zero + j label781 .p2align 2 -label792: - li s4, 125 - lui s10, 17 - slli s8, s4, 9 - lw s4, 64(a1) - add s6, s3, s8 - sh2add s7, a0, s6 - lw s8, 0(s7) - mulw s9, s4, s8 - addiw s8, s10, -1632 - addw s6, s5, s9 - li s10, 1125 - add s4, s3, s8 - lw s5, 68(a1) - sh2add s7, a0, s4 - lw s8, 0(s7) - mulw s9, s5, s8 - slli s8, s10, 6 - addw s4, s6, s9 - lui s10, 21 - add s5, s3, s8 - lw s6, 72(a1) - sh2add s7, a0, s5 - lw s8, 0(s7) - lui 
s7, 19 - mulw s9, s6, s8 - addiw s8, s7, -1824 - addw s5, s4, s9 - lw s7, 76(a1) - add s4, s3, s8 - sh2add s6, a0, s4 - lw s8, 0(s6) +label785: + li s0, 125 + lui s6, 17 + slli s4, s0, 9 + lw s0, 64(a1) + add s2, a7, s4 + sh2add s3, a0, s2 + lw s4, 0(s3) + mulw s5, s0, s4 + addiw s4, s6, -1632 + addw s2, s1, s5 + li s6, 1125 + add s0, a7, s4 + lw s1, 68(a1) + sh2add s3, a0, s0 + lw s5, 0(s3) + mulw s4, s1, s5 + slli s1, s6, 6 + addw s0, s2, s4 + lui s6, 19 + add s5, a7, s1 + lw s2, 72(a1) + sh2add s3, a0, s5 + lw s4, 0(s3) + mulw s5, s2, s4 + lw s2, 76(a1) + addiw s4, s6, -1824 + addw s1, s0, s5 li s6, 625 - mulw s9, s7, s8 - slli s8, s6, 7 - addw s4, s5, s9 - lw s6, 80(a1) - add s5, s3, s8 - sh2add s7, a0, s5 - lw s8, 0(s7) - mulw s9, s6, s8 - lw s6, 84(a1) - addiw s8, s10, -2016 - addw s5, s4, s9 - lui s10, 22 - add s4, s3, s8 - sh2add s7, a0, s4 - lw s9, 0(s7) - li s7, 1375 - mulw s8, s6, s9 - slli s9, s7, 6 - addw s4, s5, s8 - lw s7, 88(a1) - add s5, s3, s9 - sh2add s6, a0, s5 - lw s9, 0(s6) - mulw s8, s7, s9 - addiw s7, s10, 1888 - addw s5, s4, s8 - lw s4, 92(a1) - add s8, s3, s7 - sh2add s6, a0, s8 + add s0, a7, s4 + sh2add s3, a0, s0 + lw s4, 0(s3) + mulw s5, s2, s4 + slli s4, s6, 7 + addw s0, s1, s5 + lw s2, 80(a1) + lui s6, 21 + add s1, a7, s4 + sh2add s3, a0, s1 + lw s4, 0(s3) + mulw s5, s2, s4 + lw s2, 84(a1) + addiw s4, s6, -2016 + addw s1, s0, s5 + lui s6, 22 + add s0, a7, s4 + sh2add s3, a0, s0 + lw s4, 0(s3) + li s3, 1375 + mulw s5, s2, s4 + slli s4, s3, 6 + addw s0, s1, s5 + lw s3, 88(a1) + add s1, a7, s4 + sh2add s2, a0, s1 + lw s5, 0(s2) + addiw s2, s6, 1888 + mulw s4, s3, s5 + add s3, a7, s2 + addw s1, s0, s4 + lw a7, 92(a1) + sh2add s0, a0, s3 addiw a0, a0, 1 - lw s3, 0(s6) - mulw s7, s4, s3 - addw a1, s5, s7 - sw a1, 0(s2) - li a1, 1000 - bge a0, a1, label793 - addi s2, s2, 4 - mv a1, t6 - mv s4, zero - mv s5, zero + lw s2, 0(s0) + mulw s3, a7, s2 + li a7, 1000 + addw a1, s1, s3 + sw a1, 0(a6) + bge a0, a7, label786 + addi a6, a6, 4 + mv a1, t2 + mv s0, zero + mv s1, zero .p2align 2 -label788: - mul s8, s4, a2 - lw s6, 0(a1) - addiw s4, s4, 16 - add s3, a3, s8 - sh2add s7, a0, s3 - add s9, s3, a2 - sh2add s10, a0, s9 - lw s8, 0(s7) - lw s11, 4(a1) - lw s7, 0(s10) - mulw s10, s6, s8 - lw s8, 8(a1) - mulw s9, s11, s7 - add s11, s3, a4 - addw s7, s9, s10 - sh2add s10, a0, s11 - lw s9, 0(s10) - add s10, s3, a5 - mulw s11, s8, s9 - lw s9, 12(a1) - sh2add s8, a0, s10 - addw s6, s7, s11 - lw s11, 0(s8) - add s8, s3, t0 - mulw s10, s9, s11 - sh2add s9, a0, s8 - addw s7, s6, s10 - lw s6, 16(a1) - lw s10, 0(s9) - mulw s11, s6, s10 - add s10, s3, t1 - addw s8, s7, s11 - sh2add s9, a0, s10 - lw s7, 20(a1) - lw s11, 0(s9) - mulw s10, s7, s11 - add s11, s3, t3 - add s7, s3, t2 - addw s6, s8, s10 - sh2add s9, a0, s7 - lw s8, 24(a1) - lw s10, 0(s9) - mulw s9, s8, s10 - sh2add s8, a0, s11 - addw s7, s6, s9 - lw s9, 28(a1) - lw s10, 0(s8) - lw s8, 32(a1) - mulw s11, s9, s10 - add s10, s3, t4 - addw s6, s7, s11 - sh2add s9, a0, s10 - lw s10, 0(s9) - lw s9, 36(a1) - mulw s11, s8, s10 - add s10, s3, t5 - addw s7, s6, s11 - sh2add s8, a0, s10 - lw s11, 0(s8) +label781: + mul s3, s0, a2 + lw s2, 0(a1) + addiw s0, s0, 16 + add a7, a3, s3 + sh2add s5, a0, a7 + add s6, a7, a2 + sh2add s4, a0, s6 + lw s3, 0(s5) + lw s5, 4(a1) + lw s7, 0(s4) + mulw s8, s2, s3 + lw s2, 8(a1) + mulw s6, s5, s7 + add s7, a7, a4 + addw s4, s6, s8 + sh2add s5, a0, s7 + add s6, a7, a5 + lw s8, 0(s5) + sh2add s5, a0, s6 + add s6, a7, t0 + mulw s7, s2, s8 + addw s3, s4, s7 + lw s4, 12(a1) + lw s8, 0(s5) + lw s5, 16(a1) + 
mulw s7, s4, s8 + sh2add s4, a0, s6 + addw s2, s3, s7 + add s6, a7, t1 + lw s7, 0(s4) + lw s4, 20(a1) + mulw s8, s5, s7 + sh2add s5, a0, s6 + addw s3, s2, s8 + lw s7, 0(s5) + li s5, 375 + slli s6, s5, 6 + mulw s8, s4, s7 + lw s5, 24(a1) + addw s2, s3, s8 li s8, 625 - mulw s10, s9, s11 - lw s9, 40(a1) - addw s6, s7, s10 - slli s10, s8, 6 - add s7, s3, s10 - sh2add s8, a0, s7 - lw s10, 0(s8) + add s3, a7, s6 + sh2add s4, a0, s3 + lw s6, 0(s4) + li s4, 875 + mulw s7, s5, s6 + slli s6, s4, 5 + addw s3, s2, s7 + lw s4, 28(a1) + add s2, a7, s6 + sh2add s5, a0, s2 + lw s6, 0(s5) + li s5, 125 + mulw s7, s4, s6 + slli s6, s5, 8 + addw s2, s3, s7 + lw s5, 32(a1) + add s3, a7, s6 + sh2add s4, a0, s3 + lw s6, 0(s4) + li s4, 1125 + mulw s7, s5, s6 + slli s6, s4, 5 + addw s3, s2, s7 + lw s4, 36(a1) + add s2, a7, s6 + sh2add s5, a0, s2 + lw s7, 0(s5) + mulw s6, s4, s7 + lw s4, 40(a1) + slli s7, s8, 6 + addw s2, s3, s6 li s8, 1375 - mulw s11, s9, s10 - lw s9, 44(a1) - slli s10, s8, 5 - addw s7, s6, s11 - add s6, s3, s10 - sh2add s8, a0, s6 - lw s10, 0(s8) + add s3, a7, s7 + sh2add s5, a0, s3 + lw s6, 0(s5) + mulw s7, s4, s6 + slli s4, s8, 5 + addw s3, s2, s7 li s8, 375 - mulw s11, s9, s10 - lw s9, 48(a1) - slli s10, s8, 7 - addw s6, s7, s11 - add s7, s3, s10 - sh2add s8, a0, s7 - lw s10, 0(s8) - li s8, 1625 - mulw s11, s9, s10 - lw s9, 52(a1) - slli s10, s8, 5 - addw s7, s6, s11 - add s6, s3, s10 - sh2add s8, a0, s6 - lw s10, 0(s8) - li s8, 875 - mulw s11, s9, s10 - lw s9, 56(a1) - slli s10, s8, 6 - addw s6, s7, s11 - add s7, s3, s10 - sh2add s8, a0, s7 - lw s10, 0(s8) + add s6, a7, s4 + lw s2, 44(a1) + sh2add s5, a0, s6 + lw s7, 0(s5) + mulw s6, s2, s7 + slli s2, s8, 7 + addw s4, s3, s6 li s8, 1875 - mulw s11, s9, s10 - slli s10, s8, 5 - addw s7, s6, s11 - lw s8, 60(a1) - add s9, s3, s10 - sh2add s6, a0, s9 - lw s10, 0(s6) - li s6, 992 - mulw s9, s8, s10 - addw s11, s7, s9 - addw s5, s5, s11 - bge s4, s6, label792 + add s7, a7, s2 + lw s3, 48(a1) + sh2add s5, a0, s7 + lw s6, 0(s5) + li s5, 1625 + mulw s7, s3, s6 + slli s6, s5, 5 + addw s2, s4, s7 + lw s5, 52(a1) + add s3, a7, s6 + sh2add s4, a0, s3 + lw s7, 0(s4) + li s4, 875 + mulw s6, s5, s7 + slli s7, s4, 6 + addw s3, s2, s6 + lw s4, 56(a1) + add s2, a7, s7 + sh2add s5, a0, s2 + lw s7, 0(s5) + slli s5, s8, 5 + mulw s6, s4, s7 + lw s4, 60(a1) + addw s2, s3, s6 + add s6, a7, s5 + sh2add s3, a0, s6 + lw s7, 0(s3) + mulw s6, s4, s7 + addw s5, s2, s6 + li s2, 992 + addw s1, s1, s5 + bge s0, s2, label785 addi a1, a1, 64 - j label788 + j label781 .p2align 2 -label793: - addiw s1, s1, 1 - ble a7, s1, label795 - add a6, a6, a2 - mul a1, s1, a2 +label786: + addiw t6, t6, 1 + ble t4, t6, label788 + add t3, t3, a2 + mul a1, t6, a2 mv a0, zero - mv s4, zero - mv s5, zero - mv s2, a6 - add t6, s0, a1 - mv a1, t6 - j label788 -label795: - ld s1, 0(sp) - ld s6, 8(sp) - ld s0, 16(sp) - ld s5, 24(sp) - ld s2, 32(sp) - ld s4, 40(sp) - ld s8, 48(sp) - ld s3, 56(sp) - ld s7, 64(sp) - ld s9, 72(sp) - ld s10, 80(sp) - ld s11, 88(sp) - addi sp, sp, 96 + mv s0, zero + mv s1, zero + mv a6, t3 + add t2, t5, a1 + mv a1, t2 + j label781 +label788: + ld s0, 0(sp) + ld s5, 8(sp) + ld s1, 16(sp) + ld s6, 24(sp) + ld s3, 32(sp) + ld s2, 40(sp) + ld s4, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + addi sp, sp, 72 ret .p2align 2 cmmc_parallel_body_2: mv t0, a0 mv a2, a1 addiw a4, a0, 3 -pcrel1201: +pcrel1190: auipc a5, %pcrel_hi(cmmc_parallel_body_payload_2) - ld a3, %pcrel_lo(pcrel1201)(a5) - addi a1, a5, %pcrel_lo(pcrel1201) + ld a3, %pcrel_lo(pcrel1190)(a5) + addi a1, a5, 
%pcrel_lo(pcrel1190) lw a0, 8(a1) - ble a2, a4, label1099 + ble a2, a4, label1088 addiw t1, t0, 15 addiw a4, a2, -3 addiw a5, a2, -18 - bge t1, a4, label1150 + bge t1, a4, label1139 sh2add a1, t0, a3 - j label1109 + j label1098 .p2align 2 -label1112: +label1101: addi a1, a1, 64 .p2align 2 -label1109: +label1098: sw a0, 0(a1) addiw t0, t0, 16 sw a0, 4(a1) @@ -1518,59 +1494,59 @@ label1109: sw a0, 52(a1) sw a0, 56(a1) sw a0, 60(a1) - bgt a5, t0, label1112 + bgt a5, t0, label1101 mv a1, t0 -label1113: - ble a4, a1, label1099 +label1102: + ble a4, a1, label1088 sh2add a5, a1, a3 -label1117: +label1106: sw a0, 0(a5) addiw a1, a1, 4 sw a0, 4(a5) sw a0, 8(a5) sw a0, 12(a5) - ble a4, a1, label1187 + ble a4, a1, label1176 addi a5, a5, 16 - j label1117 -label1187: + j label1106 +label1176: mv t0, a1 -label1099: - ble a2, t0, label1106 +label1088: + ble a2, t0, label1095 sh2add a1, t0, a3 - j label1102 -label1105: + j label1091 +label1094: addi a1, a1, 4 -label1102: +label1091: addiw t0, t0, 1 sw a0, 0(a1) - bgt a2, t0, label1105 -label1106: + bgt a2, t0, label1094 +label1095: ret -label1150: +label1139: mv a1, t0 mv t0, zero - j label1113 + j label1102 .p2align 2 cmmc_parallel_body_3: mv t0, a0 addiw a5, a0, 3 -pcrel1356: +pcrel1345: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_3) - addi a2, a0, %pcrel_lo(pcrel1356) + addi a2, a0, %pcrel_lo(pcrel1345) ld a3, 8(a2) - ble a1, a5, label1244 + ble a1, a5, label1233 addiw a0, t0, 15 addiw a4, a1, -3 addiw a5, a1, -18 - bge a0, a4, label1251 + bge a0, a4, label1240 sh2add a0, t0, a3 mv t1, zero - j label1219 + j label1208 .p2align 2 -label1223: +label1212: addi a0, a0, 64 .p2align 2 -label1219: +label1208: lw t4, 0(a0) addiw t0, t0, 16 lw t5, 4(a0) @@ -1604,17 +1580,17 @@ label1219: lw t4, 60(a0) addw t2, t3, t6 addw t1, t2, t4 - bgt a5, t0, label1223 + bgt a5, t0, label1212 mv a5, t0 mv t2, t1 -label1204: - ble a4, a5, label1255 +label1193: + ble a4, a5, label1244 sh2add a0, a5, a3 mv t0, t2 - j label1213 -label1217: + j label1202 +label1206: addi a0, a0, 16 -label1213: +label1202: lw t1, 0(a0) addiw a5, a5, 4 lw t4, 4(a0) @@ -1624,39 +1600,39 @@ label1213: lw t3, 12(a0) addw t1, t2, t5 addw t0, t1, t3 - bgt a4, a5, label1217 + bgt a4, a5, label1206 mv a0, t0 mv a4, t0 mv t0, a5 -label1224: - ble a1, t0, label1333 +label1213: + ble a1, t0, label1322 sh2add a0, t0, a3 mv a3, a4 - j label1231 -label1235: + j label1220 +label1224: addi a0, a0, 4 -label1231: +label1220: lw a5, 0(a0) addiw t0, t0, 1 addw a3, a3, a5 - bgt a1, t0, label1235 -label1228: + bgt a1, t0, label1224 +label1217: amoadd.w.aqrl a1, a3, (a2) ret -label1255: +label1244: mv a0, t1 mv a4, t1 - j label1224 -label1251: + j label1213 +label1240: mv a5, t0 mv t2, zero mv t1, zero mv t0, zero - j label1204 -label1333: + j label1193 +label1322: mv a3, a0 - j label1228 -label1244: + j label1217 +label1233: mv a4, zero mv a0, zero - j label1224 + j label1213 diff --git a/tests/SysY2022/performance/median0.arm.s b/tests/SysY2022/performance/median0.arm.s index 8b748c490..9de850078 100644 --- a/tests/SysY2022/performance/median0.arm.s +++ b/tests/SysY2022/performance/median0.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 40000000 .text diff --git a/tests/SysY2022/performance/median0.riscv.s b/tests/SysY2022/performance/median0.riscv.s index 6f421ebf0..8ad4d2a77 100644 --- a/tests/SysY2022/performance/median0.riscv.s +++ b/tests/SysY2022/performance/median0.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" 
.data .bss -.align 8 +.p2align 3 a: .zero 40000000 .text diff --git a/tests/SysY2022/performance/median1.arm.s b/tests/SysY2022/performance/median1.arm.s index 8b748c490..9de850078 100644 --- a/tests/SysY2022/performance/median1.arm.s +++ b/tests/SysY2022/performance/median1.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 40000000 .text diff --git a/tests/SysY2022/performance/median1.riscv.s b/tests/SysY2022/performance/median1.riscv.s index 6f421ebf0..8ad4d2a77 100644 --- a/tests/SysY2022/performance/median1.riscv.s +++ b/tests/SysY2022/performance/median1.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 40000000 .text diff --git a/tests/SysY2022/performance/median2.arm.s b/tests/SysY2022/performance/median2.arm.s index 8b748c490..9de850078 100644 --- a/tests/SysY2022/performance/median2.arm.s +++ b/tests/SysY2022/performance/median2.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 a: .zero 40000000 .text diff --git a/tests/SysY2022/performance/median2.riscv.s b/tests/SysY2022/performance/median2.riscv.s index 6f421ebf0..8ad4d2a77 100644 --- a/tests/SysY2022/performance/median2.riscv.s +++ b/tests/SysY2022/performance/median2.riscv.s @@ -1,7 +1,7 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 a: .zero 40000000 .text diff --git a/tests/SysY2022/performance/recursion_fabonacci-1.arm.s b/tests/SysY2022/performance/recursion_fabonacci-1.arm.s index 7f0c8971a..dc6dcd9d0 100644 --- a/tests/SysY2022/performance/recursion_fabonacci-1.arm.s +++ b/tests/SysY2022/performance/recursion_fabonacci-1.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 lut_fibFP: .zero 16336 .text diff --git a/tests/SysY2022/performance/recursion_fabonacci-1.riscv.s b/tests/SysY2022/performance/recursion_fabonacci-1.riscv.s index 8f1e0da7b..483398d54 100644 --- a/tests/SysY2022/performance/recursion_fabonacci-1.riscv.s +++ b/tests/SysY2022/performance/recursion_fabonacci-1.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1282491304 .bss -.align 8 +.p2align 3 lut_fibFP: .zero 16336 .text diff --git a/tests/SysY2022/performance/recursion_fabonacci-2.arm.s b/tests/SysY2022/performance/recursion_fabonacci-2.arm.s index 47e7476cd..0df0d6588 100644 --- a/tests/SysY2022/performance/recursion_fabonacci-2.arm.s +++ b/tests/SysY2022/performance/recursion_fabonacci-2.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 lut_fibFP: .zero 16336 .text diff --git a/tests/SysY2022/performance/recursion_fabonacci-2.riscv.s b/tests/SysY2022/performance/recursion_fabonacci-2.riscv.s index 490ccd307..f8b69ad29 100644 --- a/tests/SysY2022/performance/recursion_fabonacci-2.riscv.s +++ b/tests/SysY2022/performance/recursion_fabonacci-2.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1276451850 .bss -.align 8 +.p2align 3 lut_fibFP: .zero 16336 .text diff --git a/tests/SysY2022/performance/recursion_fabonacci-3.arm.s b/tests/SysY2022/performance/recursion_fabonacci-3.arm.s index f1aaf4133..e7a0da656 100644 --- a/tests/SysY2022/performance/recursion_fabonacci-3.arm.s +++ 
b/tests/SysY2022/performance/recursion_fabonacci-3.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 lut_fibFP: .zero 16336 .text diff --git a/tests/SysY2022/performance/recursion_fabonacci-3.riscv.s b/tests/SysY2022/performance/recursion_fabonacci-3.riscv.s index dd99453d7..b20fb60dc 100644 --- a/tests/SysY2022/performance/recursion_fabonacci-3.riscv.s +++ b/tests/SysY2022/performance/recursion_fabonacci-3.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1293805782 .bss -.align 8 +.p2align 3 lut_fibFP: .zero 16336 .text diff --git a/tests/SysY2022/performance/recursive_call_1.arm.s b/tests/SysY2022/performance/recursive_call_1.arm.s index 4c50c0746..0b12e80cf 100644 --- a/tests/SysY2022/performance/recursive_call_1.arm.s +++ b/tests/SysY2022/performance/recursive_call_1.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 lut_func: .zero 16336 .text diff --git a/tests/SysY2022/performance/recursive_call_1.riscv.s b/tests/SysY2022/performance/recursive_call_1.riscv.s index a9c105cd2..e618cd27f 100644 --- a/tests/SysY2022/performance/recursive_call_1.riscv.s +++ b/tests/SysY2022/performance/recursive_call_1.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1065361605 .bss -.align 8 +.p2align 3 lut_func: .zero 16336 .text diff --git a/tests/SysY2022/performance/recursive_call_2.arm.s b/tests/SysY2022/performance/recursive_call_2.arm.s index 4c50c0746..0b12e80cf 100644 --- a/tests/SysY2022/performance/recursive_call_2.arm.s +++ b/tests/SysY2022/performance/recursive_call_2.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 lut_func: .zero 16336 .text diff --git a/tests/SysY2022/performance/recursive_call_2.riscv.s b/tests/SysY2022/performance/recursive_call_2.riscv.s index a9c105cd2..e618cd27f 100644 --- a/tests/SysY2022/performance/recursive_call_2.riscv.s +++ b/tests/SysY2022/performance/recursive_call_2.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1065361605 .bss -.align 8 +.p2align 3 lut_func: .zero 16336 .text diff --git a/tests/SysY2022/performance/recursive_call_3.arm.s b/tests/SysY2022/performance/recursive_call_3.arm.s index 7e90a9978..279170028 100644 --- a/tests/SysY2022/performance/recursive_call_3.arm.s +++ b/tests/SysY2022/performance/recursive_call_3.arm.s @@ -1,7 +1,7 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 lut_func: .zero 16336 .text diff --git a/tests/SysY2022/performance/recursive_call_3.riscv.s b/tests/SysY2022/performance/recursive_call_3.riscv.s index 18d673558..9ff87d124 100644 --- a/tests/SysY2022/performance/recursive_call_3.riscv.s +++ b/tests/SysY2022/performance/recursive_call_3.riscv.s @@ -1,11 +1,11 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 1065361605 .bss -.align 8 +.p2align 3 lut_func: .zero 16336 .text diff --git a/tests/SysY2022/performance/shuffle0.arm.s b/tests/SysY2022/performance/shuffle0.arm.s index 76d8409f2..23846d595 100644 --- a/tests/SysY2022/performance/shuffle0.arm.s +++ b/tests/SysY2022/performance/shuffle0.arm.s @@ -1,37 +1,37 @@ .arch 
armv7ve .data .bss -.align 4 +.p2align 2 hashmod: .zero 4 -.align 8 +.p2align 3 head: .zero 40000000 -.align 8 +.p2align 3 next: .zero 40000000 -.align 8 +.p2align 3 nextvalue: .zero 40000000 -.align 8 +.p2align 3 key: .zero 40000000 -.align 8 +.p2align 3 value: .zero 40000000 -.align 8 +.p2align 3 keys: .zero 40000000 -.align 8 +.p2align 3 values: .zero 40000000 -.align 8 +.p2align 3 requests: .zero 40000000 -.align 8 +.p2align 3 ans: .zero 40000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/shuffle0.riscv.s b/tests/SysY2022/performance/shuffle0.riscv.s index 7812b66e3..e0b455c1b 100644 --- a/tests/SysY2022/performance/shuffle0.riscv.s +++ b/tests/SysY2022/performance/shuffle0.riscv.s @@ -1,37 +1,37 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 hashmod: .zero 4 -.align 8 +.p2align 3 head: .zero 40000000 -.align 8 +.p2align 3 next: .zero 40000000 -.align 8 +.p2align 3 nextvalue: .zero 40000000 -.align 8 +.p2align 3 key: .zero 40000000 -.align 8 +.p2align 3 value: .zero 40000000 -.align 8 +.p2align 3 keys: .zero 40000000 -.align 8 +.p2align 3 values: .zero 40000000 -.align 8 +.p2align 3 requests: .zero 40000000 -.align 8 +.p2align 3 ans: .zero 40000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/shuffle1.arm.s b/tests/SysY2022/performance/shuffle1.arm.s index 76d8409f2..23846d595 100644 --- a/tests/SysY2022/performance/shuffle1.arm.s +++ b/tests/SysY2022/performance/shuffle1.arm.s @@ -1,37 +1,37 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 hashmod: .zero 4 -.align 8 +.p2align 3 head: .zero 40000000 -.align 8 +.p2align 3 next: .zero 40000000 -.align 8 +.p2align 3 nextvalue: .zero 40000000 -.align 8 +.p2align 3 key: .zero 40000000 -.align 8 +.p2align 3 value: .zero 40000000 -.align 8 +.p2align 3 keys: .zero 40000000 -.align 8 +.p2align 3 values: .zero 40000000 -.align 8 +.p2align 3 requests: .zero 40000000 -.align 8 +.p2align 3 ans: .zero 40000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/shuffle1.riscv.s b/tests/SysY2022/performance/shuffle1.riscv.s index 7812b66e3..e0b455c1b 100644 --- a/tests/SysY2022/performance/shuffle1.riscv.s +++ b/tests/SysY2022/performance/shuffle1.riscv.s @@ -1,37 +1,37 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 hashmod: .zero 4 -.align 8 +.p2align 3 head: .zero 40000000 -.align 8 +.p2align 3 next: .zero 40000000 -.align 8 +.p2align 3 nextvalue: .zero 40000000 -.align 8 +.p2align 3 key: .zero 40000000 -.align 8 +.p2align 3 value: .zero 40000000 -.align 8 +.p2align 3 keys: .zero 40000000 -.align 8 +.p2align 3 values: .zero 40000000 -.align 8 +.p2align 3 requests: .zero 40000000 -.align 8 +.p2align 3 ans: .zero 40000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/shuffle2.arm.s b/tests/SysY2022/performance/shuffle2.arm.s index 76d8409f2..23846d595 100644 --- a/tests/SysY2022/performance/shuffle2.arm.s +++ b/tests/SysY2022/performance/shuffle2.arm.s @@ -1,37 +1,37 @@ .arch armv7ve .data .bss -.align 4 +.p2align 2 hashmod: .zero 4 -.align 8 +.p2align 3 head: .zero 40000000 -.align 8 +.p2align 3 next: .zero 40000000 -.align 8 +.p2align 3 nextvalue: .zero 40000000 -.align 8 +.p2align 3 key: .zero 40000000 -.align 8 +.p2align 3 value: .zero 40000000 -.align 8 
+.p2align 3 keys: .zero 40000000 -.align 8 +.p2align 3 values: .zero 40000000 -.align 8 +.p2align 3 requests: .zero 40000000 -.align 8 +.p2align 3 ans: .zero 40000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/shuffle2.riscv.s b/tests/SysY2022/performance/shuffle2.riscv.s index 7812b66e3..e0b455c1b 100644 --- a/tests/SysY2022/performance/shuffle2.riscv.s +++ b/tests/SysY2022/performance/shuffle2.riscv.s @@ -1,37 +1,37 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 4 +.p2align 2 hashmod: .zero 4 -.align 8 +.p2align 3 head: .zero 40000000 -.align 8 +.p2align 3 next: .zero 40000000 -.align 8 +.p2align 3 nextvalue: .zero 40000000 -.align 8 +.p2align 3 key: .zero 40000000 -.align 8 +.p2align 3 value: .zero 40000000 -.align 8 +.p2align 3 keys: .zero 40000000 -.align 8 +.p2align 3 values: .zero 40000000 -.align 8 +.p2align 3 requests: .zero 40000000 -.align 8 +.p2align 3 ans: .zero 40000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/sl1.arm.s b/tests/SysY2022/performance/sl1.arm.s index 4b74a0ef0..5473b2c2f 100644 --- a/tests/SysY2022/performance/sl1.arm.s +++ b/tests/SysY2022/performance/sl1.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 x: .zero 864000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/sl1.riscv.s b/tests/SysY2022/performance/sl1.riscv.s index 2de6a784a..57daf2e06 100644 --- a/tests/SysY2022/performance/sl1.riscv.s +++ b/tests/SysY2022/performance/sl1.riscv.s @@ -1,1094 +1,247 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 x: .zero 864000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text .p2align 2 .globl main main: - addi sp, sp, -104 + addi sp, sp, -72 sd ra, 0(sp) - sd s6, 8(sp) - sd s1, 16(sp) - sd s0, 24(sp) - sd s5, 32(sp) - sd s2, 40(sp) - sd s3, 48(sp) - sd s4, 56(sp) + sd s5, 8(sp) + sd s0, 16(sp) + sd s1, 24(sp) + sd s6, 32(sp) + sd s4, 40(sp) + sd s2, 48(sp) + sd s3, 56(sp) sd s7, 64(sp) - sd s9, 72(sp) - sd s8, 80(sp) - sd s11, 88(sp) - sd s10, 96(sp) jal getint - mv s6, a0 + mv s5, a0 jal getint - mv s0, a0 + mv s1, a0 li a0, 13 jal _sysy_starttime - li a0, 75 - addiw s1, s6, -2 -pcrel1697: - auipc a1, %pcrel_hi(x) - addiw s2, s6, -1 - slli s3, a0, 5 - addi s5, a1, %pcrel_lo(pcrel1697) - sub s4, zero, s3 - ble s6, zero, label1452 -pcrel1698: + li a1, 75 +pcrel468: + auipc a0, %pcrel_hi(x) + addiw s0, s5, -1 + slli s2, a1, 5 + addi s4, a0, %pcrel_lo(pcrel468) + sub s3, zero, s2 + lui a0, 352 + addiw s6, a0, -1792 + ble s5, zero, label329 +pcrel469: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel1699: +pcrel470: auipc a3, %pcrel_hi(cmmc_parallel_body_0) - sw s6, %pcrel_lo(pcrel1698)(a0) - addi a2, a3, %pcrel_lo(pcrel1699) - mv a1, s6 + sw s5, %pcrel_lo(pcrel469)(a0) + addi a2, a3, %pcrel_lo(pcrel470) + mv a1, s5 mv a0, zero jal cmmcParallelFor -label1452: +label329: li a0, 1 - ble s2, a0, label1496 - lui a2, 352 - mv a3, s5 - addiw a0, a2, -1792 - li a2, 1 - add a1, s5, a0 - lui a5, 352 - addiw a2, a2, 1 - addiw a0, a5, -1792 - li a5, 2 - add a4, a1, a0 - bgt s2, a5, label1462 -.p2align 2 -label1458: - li t0, 601 - slli a0, t0, 2 - lui t0, 352 - add a5, a3, a0 - addiw a3, t0, 612 - lw a4, 0(a5) - add t2, a1, a3 - lw t1, 0(t2) - lw t0, 4(a1) - addw a5, a4, t1 - li a4, 1201 - addw a3, a5, t0 - slli 
t2, a4, 2 - add t0, a1, s3 - add a5, a1, t2 - addi t2, a0, 4 - lw t1, 0(a5) - lw a5, 0(t0) - addw a4, a3, t1 - add t1, a1, t2 - addw a3, a4, a5 - li t2, 2 - lw a5, 0(t1) - add t1, a1, a0 - addw t0, a3, a5 - divw a4, t0, s0 - sw a4, 0(t1) - ble s2, a2, label1526 -.p2align 2 -label1461: - lui a5, 352 - mv a3, a1 - addiw a2, a2, 1 - addiw a4, a5, -1792 - add a0, a1, a4 - mv a1, a0 - addiw a0, a5, -1792 - li a5, 2 - add a4, a1, a0 - ble s2, a5, label1458 -.p2align 2 -label1462: - add a0, a1, s3 + ble s0, a0, label360 + add t0, s4, s6 + mv a4, s4 + li t1, 1 + add a5, t0, s6 + addiw t1, t1, 1 + add a0, t0, s2 li t2, 1 - mul t5, t2, s3 - add t0, a0, s4 - add t1, a0, s3 - add a5, a4, t5 - add t3, a3, t5 - li t5, 1 - addi t4, t3, 4 - j label1466 -.p2align 2 -label1469: - bgt s2, t5, label1470 + mul t4, t2, s2 + add a2, a0, s3 addiw t2, t2, 1 - ble s2, t2, label1686 -.p2align 2 -label1476: - add a0, a0, s3 - mul t5, t2, s3 - add t1, a0, s3 - add t0, a0, s4 - add a5, a4, t5 - add t3, a3, t5 - li t5, 1 - addi t4, t3, 4 -.p2align 2 -label1466: - sh2add a6, t5, a5 - lw s7, 0(t4) - sh2add a7, t5, t0 - lw s9, 0(a6) - lw s8, 0(a7) - addw t6, s7, s9 - sh2add s7, t5, t1 - addw s9, t6, s8 - lw s11, 0(s7) - sh2add t6, t5, a0 - addw s8, s9, s11 - addiw t5, t5, 2 - lw s10, -4(t6) - addw s9, s8, s10 - lw s8, 4(t6) - addw s11, s9, s8 - divw s8, s11, s0 - sw s8, 0(t6) - lw s9, 4(t4) - lw s11, 4(a6) - lw a6, 4(a7) - addw s8, s9, s11 - lw a7, 4(s7) - addw s10, s8, a6 - lw s8, 0(t6) - addw a6, s10, a7 - lw s9, 8(t6) - addw a7, a6, s8 - addw s7, a7, s9 - divw a6, s7, s0 - sw a6, 4(t6) - ble s1, t5, label1469 - addi t4, t4, 8 - j label1466 -.p2align 2 -label1470: - sh2add t3, t5, t3 + add a1, a5, t4 + add a3, s4, t4 + li t4, 1 + addi t3, a3, 4 + add a3, a0, s2 + j label341 +.p2align 2 +label348: + addi t3, t3, 4 .p2align 2 -label1471: - sh2add t6, t5, a5 - lw a6, 0(t3) - sh2add a7, t5, t0 - lw s7, 0(t6) - addw t4, a6, s7 - sh2add s7, t5, t1 +label341: + sh2add a6, t4, a1 + lw t6, 0(t3) + sh2add a7, t4, a2 + lw s7, 0(a6) lw a6, 0(a7) + addw t5, t6, s7 + sh2add s7, t4, a3 + addw t6, t5, a6 lw a7, 0(s7) - addw t6, t4, a6 - sh2add t4, t5, a0 + sh2add t5, t4, a0 addw a6, t6, a7 - addiw t5, t5, 1 - lw s8, -4(t4) - lw a7, 4(t4) - addw t6, a6, s8 - addw s7, t6, a7 - divw a6, s7, s0 - sw a6, 0(t4) - ble s2, t5, label1613 - addi t3, t3, 4 - j label1471 -.p2align 2 -label1613: + addiw t4, t4, 1 + lw a7, -4(t5) + lw s7, 4(t5) + addw t6, a6, a7 + addw a7, t6, s7 + divw a6, a7, s1 + sw a6, 0(t5) + bgt s0, t4, label348 + ble s0, t2, label411 + add a0, a0, s2 + mul t4, t2, s2 + add a2, a0, s3 addiw t2, t2, 1 - bgt s2, t2, label1476 - bgt s2, a2, label1461 - j label1526 -.p2align 2 -label1686: - bgt s2, a2, label1461 -label1526: - mv s0, a1 + add a1, a5, t4 + add a3, a4, t4 + li t4, 1 + addi t3, a3, 4 + add a3, a0, s2 + j label341 +.p2align 2 +label411: + ble s0, t1, label463 + add a0, t0, s6 + mv a4, t0 + addiw t1, t1, 1 + li t2, 1 + add a5, a0, s6 + mv t0, a0 + mul t4, t2, s2 + add a0, a0, s2 + addiw t2, t2, 1 + add a1, a5, t4 + add a3, a4, t4 + add a2, a0, s3 + li t4, 1 + addi t3, a3, 4 + add a3, a0, s2 + j label341 +label463: + mv s0, t0 mv s1, t2 -label1478: +label330: li a0, 53 jal _sysy_stoptime - mv a0, s6 - mv a1, s5 + mv a0, s5 + mv a1, s4 jal putarray - lui a4, 352 - srliw a0, s6, 31 - addiw a5, a4, -1792 - add a1, s6, a0 + srliw a0, s5, 31 + add a1, s5, a0 + mv a0, s5 sraiw a2, a1, 1 - mul a4, a2, s3 - mul a0, a2, a5 - add a3, s5, a0 - mv a0, s6 + mul a4, a2, s2 + mul a5, a2, s6 + add a3, s4, a5 add a1, a3, a4 jal putarray - 
mv a0, s6 + mv a0, s5 addiw a3, s1, -1 - mul a2, a3, s3 + mul a2, a3, s2 add a1, s0, a2 jal putarray - mv a0, zero ld ra, 0(sp) - ld s6, 8(sp) - ld s1, 16(sp) - ld s0, 24(sp) - ld s5, 32(sp) - ld s2, 40(sp) - ld s3, 48(sp) - ld s4, 56(sp) + mv a0, zero + ld s5, 8(sp) + ld s0, 16(sp) + ld s1, 24(sp) + ld s6, 32(sp) + ld s4, 40(sp) + ld s2, 48(sp) + ld s3, 56(sp) ld s7, 64(sp) - ld s9, 72(sp) - ld s8, 80(sp) - ld s11, 88(sp) - ld s10, 96(sp) - addi sp, sp, 104 + addi sp, sp, 72 ret -label1496: - mv s0, s5 +label360: + mv s0, s4 li s1, 1 - j label1478 + j label330 .p2align 2 cmmc_parallel_body_0: - addi sp, sp, -24 - mv t6, a0 -pcrel1448: - auipc a4, %pcrel_hi(cmmc_parallel_body_payload_0) - li a3, 75 - sd s2, 0(sp) - li a0, 1 - sd s1, 8(sp) - sd s0, 16(sp) - lw a2, %pcrel_lo(pcrel1448)(a4) - slli a4, a3, 5 - ble a2, zero, label240 - addiw t4, a1, -57 - addiw t3, a1, -26 - addiw t2, a1, -11 - addiw t0, a1, -4 - addiw a5, a1, -1 -pcrel1449: - auipc a3, %pcrel_hi(x) - addi t1, a3, %pcrel_lo(pcrel1449) - bgt a2, a0, label3 - addiw a2, t6, 1 - ble a1, a2, label186 - addiw a3, t6, 3 - ble a5, a3, label693 - addiw a3, t6, 7 - ble t0, a3, label711 - addiw a3, t6, 15 - ble t2, a3, label731 - addiw a2, t6, 31 - ble t3, a2, label736 - lui t5, 352 - addiw a3, t5, -1792 - mul a4, t6, a3 - mv a3, t6 - add a2, t1, a4 -.p2align 2 -label216: - addiw a3, a3, 32 - lui t5, 352 - sw a0, 0(a2) - addiw a4, t5, -1792 - slli a6, a4, 1 - add t6, a2, a4 - add t5, a2, a6 - sw a0, 0(t6) - sh1add t6, a4, a4 - sw a0, 0(t5) - add a7, a2, t6 - slli t5, a6, 1 - sw a0, 0(a7) - sh2add a6, a4, a4 - add a7, a2, t5 - add s0, a2, a6 - sw a0, 0(a7) - slli a7, t6, 1 - sw a0, 0(s0) - add a6, a2, a7 - lui a7, 2461 - sw a0, 0(a6) - addiw t6, a7, -256 - slli a7, t5, 1 - add a6, a2, t6 - sh3add t5, a4, a4 - add t6, a2, a7 - sw a0, 0(a6) - lui a7, 5977 - add a6, a2, t5 - sw a0, 0(t6) - lui t6, 3516 - sw a0, 0(a6) - addiw t5, t6, -1536 - lui a6, 4570 - lui t6, 3867 - add a4, a2, t5 - addiw t5, t6, 768 - sw a0, 0(a4) - lui t6, 4219 - add a4, a2, t5 - addiw t5, t6, -1024 - sw a0, 0(a4) - addiw t6, a6, 1280 - add a4, a2, t5 - lui a6, 4922 - add t5, a2, t6 - sw a0, 0(a4) - lui t6, 5273 - addiw a4, a6, -512 - sw a0, 0(t5) - lui a6, 5625 - add t5, a2, a4 - addiw a4, t6, 1792 - sw a0, 0(t5) - add t6, a2, a6 - add t5, a2, a4 - addiw a4, a7, -1792 - sw a0, 0(t5) - add t5, a2, a4 - sw a0, 0(t6) - lui t6, 6328 - sw a0, 0(t5) - addiw a4, t6, 512 - lui t6, 6680 - add t5, a2, a4 - addiw a4, t6, -1280 - sw a0, 0(t5) - lui t6, 7031 - add t5, a2, a4 - addiw a4, t6, 1024 - sw a0, 0(t5) - lui t6, 7383 - add t5, a2, a4 - addiw a4, t6, -768 - sw a0, 0(t5) - lui t6, 7734 - add t5, a2, a4 - addiw a4, t6, 1536 - sw a0, 0(t5) - lui t6, 8086 - add t5, a2, a4 - addiw a4, t6, -256 - sw a0, 0(t5) - lui t6, 8438 - add t5, a2, a4 - addiw a4, t6, -2048 - sw a0, 0(t5) - lui t6, 8789 - add t5, a2, a4 - addiw a4, t6, 256 - sw a0, 0(t5) - lui t6, 9141 - add t5, a2, a4 - addiw a4, t6, -1536 - sw a0, 0(t5) - lui t6, 9492 - add t5, a2, a4 - addiw a4, t6, 768 - sw a0, 0(t5) - lui t6, 9844 - add t5, a2, a4 - addiw a4, t6, -1024 - sw a0, 0(t5) - lui t6, 10195 - add t5, a2, a4 - addiw a4, t6, 1280 - sw a0, 0(t5) - lui t6, 10547 - add t5, a2, a4 - addiw a4, t6, -512 - sw a0, 0(t5) - lui t6, 10898 - add t5, a2, a4 - addiw a4, t6, 1792 - sw a0, 0(t5) - add t5, a2, a4 - sw a0, 0(t5) - ble t4, a3, label774 - lui a4, 11250 - add a2, a2, a4 - j label216 -label240: - ld s2, 0(sp) - ld s1, 8(sp) - ld s0, 16(sp) - addi sp, sp, 24 + mv t4, a0 + mv a3, a1 +pcrel327: + auipc a2, 
%pcrel_hi(cmmc_parallel_body_payload_0) + li a5, 75 + lui t0, 352 + li t1, 1 + li a1, 1 + addiw a4, t0, -1792 + lw a0, %pcrel_lo(pcrel327)(a2) + slli a2, a5, 5 + slli a5, t1, 32 + bgt a0, zero, label3 +label2: ret -label774: - mv t6, a3 -label220: - ble t3, t6, label779 - lui t4, 352 - addiw a3, t4, -1792 - mul a4, t6, a3 - add a2, t1, a4 -label224: - addiw a3, t6, 16 - lui t5, 352 - sw a0, 0(a2) - addiw a4, t5, -1792 - sh1add t5, a4, a4 - slli t6, a4, 1 - add t4, a2, a4 - add a7, a2, t5 - add a6, a2, t6 - sw a0, 0(t4) - slli t4, t6, 1 - sw a0, 0(a6) - sh2add t6, a4, a4 - add a6, a2, t4 - sw a0, 0(a7) - add a7, a2, t6 - sw a0, 0(a6) - slli a6, t5, 1 - sw a0, 0(a7) - add t6, a2, a6 - lui a7, 2461 - slli a6, t4, 1 - sw a0, 0(t6) - addiw t5, a7, -256 - add t6, a2, t5 - add t5, a2, a6 - sw a0, 0(t6) - lui a6, 3516 - sh3add t6, a4, a4 - sw a0, 0(t5) - addiw a4, a6, -1536 - add t4, a2, t6 - add t5, a2, a4 - lui t6, 3867 - sw a0, 0(t4) - addiw t4, t6, 768 - sw a0, 0(t5) - lui t6, 4219 - add a4, a2, t4 - addiw t4, t6, -1024 - sw a0, 0(a4) - lui t6, 4570 - add t5, a2, t4 - addiw a4, t6, 1280 - sw a0, 0(t5) - lui t6, 4922 - add t4, a2, a4 - addiw a4, t6, -512 - sw a0, 0(t4) - lui t6, 5273 - add t5, a2, a4 - addiw t4, t6, 1792 - sw a0, 0(t5) - add a4, a2, t4 - sw a0, 0(a4) - bgt t3, a3, label227 -label779: - mv a2, a3 - mv t6, a3 -label230: - bgt t2, t6, label235 - mv t6, a2 - j label203 label3: - addiw a3, a2, -1 - li a5, 3 - bgt a3, a5, label4 - lui t3, 352 + auipc t0, %pcrel_hi(x) + li t1, 3 + addi t2, t0, %pcrel_lo(label3) + bgt a0, t1, label4 + mul t1, t4, a4 + mv t0, t4 + add a5, t2, t1 mv t4, zero - addiw t0, t3, -1792 - mul t2, t6, t0 - mv t0, t6 - add a5, t1, t2 - mv t2, zero mv t3, a5 + mv t2, zero mv t1, a5 - j label171 + j label61 .p2align 2 -label1347: +label65: addiw t0, t0, 1 - ble a1, t0, label240 + ble a3, t0, label2 .p2align 2 -label183: - lui t2, 352 - li t4, 2 - li a6, 1 - addiw t1, t2, -1792 - slli t5, a6, 32 +label66: + add a5, a5, a4 mv t2, zero - add a5, a5, t1 - addi t6, t5, 1 + li t4, 1 mv t3, a5 mv t1, a5 - sd t6, 0(a5) - ble a3, t4, label1385 + sw a1, 0(a5) + ble a0, t4, label309 .p2align 2 -label174: - addi t3, t3, 8 -.p2align 2 -label171: - addiw t4, t4, 2 - li a6, 1 - slli t5, a6, 32 - addi t6, t5, 1 - sd t6, 0(t3) - bgt a3, t4, label174 - bgt a2, t4, label176 -.p2align 2 -label1345: - addiw t2, t2, 1 - bgt a2, t2, label184 -.p2align 2 -label1366: - addiw t0, t0, 1 - bgt a1, t0, label183 - j label240 -.p2align 2 -label180: +label68: addi t3, t3, 4 - mv t4, t5 .p2align 2 -label177: - addiw t5, t4, 1 - sw a0, 0(t3) - bgt a2, t5, label180 +label61: + addiw t4, t4, 1 + sw a1, 0(t3) + bgt a0, t4, label68 addiw t2, t2, 1 - ble a2, t2, label1347 + ble a0, t2, label65 .p2align 2 -label184: - add t1, t1, a4 - li t4, 2 - li a6, 1 +label67: + add t1, t1, a2 + li t4, 1 + sw a1, 0(t1) mv t3, t1 - slli t5, a6, 32 - addi t6, t5, 1 - sd t6, 0(t1) - bgt a3, t4, label174 - bgt a2, t4, label176 + bgt a0, t4, label68 addiw t2, t2, 1 - bgt a2, t2, label184 - j label1366 -.p2align 2 -label1385: - ble a2, t4, label1345 -.p2align 2 -label176: - sh2add t3, t4, t1 - j label177 -label239: - lui t3, 2813 - mv t6, a3 - addiw a4, t3, -2048 - add a2, a2, a4 -label236: - addiw a3, t6, 8 - lui t4, 352 - sw a0, 0(a2) - addiw a4, t4, -1792 - slli t4, a4, 1 - add t3, a2, a4 - add t5, a2, t4 - sw a0, 0(t3) - sh1add t3, a4, a4 - sw a0, 0(t5) - add a6, a2, t3 - slli t5, t4, 1 - sw a0, 0(a6) - sh2add t4, a4, a4 - add t6, a2, t5 - slli a4, t3, 1 - add t5, a2, t4 - sw a0, 0(t6) - add t4, a2, a4 - sw a0, 0(t5) - 
lui t5, 2461 - sw a0, 0(t4) - addiw a4, t5, -256 - add t3, a2, a4 - sw a0, 0(t3) - bgt t2, a3, label239 - mv a2, a3 - mv t6, a3 -label203: - bgt t0, t6, label206 - mv t6, a2 -label194: - ble a5, t6, label697 - lui t0, 352 - addiw a3, t0, -1792 - mul a4, t6, a3 - add a2, t1, a4 -label198: - addiw t6, t6, 2 - lui t0, 352 - sw a0, 0(a2) - addiw a3, t0, -1792 - add a4, a2, a3 - sw a0, 0(a4) - bgt a5, t6, label201 - j label186 -label697: - mv t6, a2 -label186: - ble a1, t6, label240 - lui a5, 352 - addiw a4, a5, -1792 - mul a3, t6, a4 - add a2, t1, a3 -label189: - addiw t6, t6, 1 - sw a0, 0(a2) - ble a1, t6, label240 - lui a4, 352 - addiw a3, a4, -1792 - add a2, a2, a3 - j label189 -label206: - lui t2, 352 - addiw a3, t2, -1792 - mul a4, t6, a3 - add a2, t1, a4 -label207: - addiw a3, t6, 4 - lui t2, 352 - sw a0, 0(a2) - addiw a4, t2, -1792 - slli t2, a4, 1 - add t3, a2, a4 - add t4, a2, t2 - sw a0, 0(t3) - sh1add t3, a4, a4 - sw a0, 0(t4) - add t2, a2, t3 - sw a0, 0(t2) - ble t0, a3, label725 - lui t2, 1406 - mv t6, a3 - addiw a4, t2, 1024 - add a2, a2, a4 - j label207 + bgt a0, t2, label67 + addiw t0, t0, 1 + bgt a3, t0, label66 + j label2 label4: - addiw a5, a2, -4 - li t0, 7 - bgt a5, t0, label5 - ble a5, zero, label240 - lui t4, 352 - addiw t2, t4, -1792 - mul t3, t6, t2 - mv t2, t6 - add t0, t1, t3 - mv t1, t0 - mv t3, zero - mv t4, t0 - mv t5, zero - j label143 -.p2align 2 -label146: - addi t4, t4, 16 -.p2align 2 -label143: - addiw t5, t5, 4 - li a7, 1 - slli a6, a7, 32 - addi t6, a6, 1 - sd t6, 0(t4) - sd t6, 8(t4) - bgt a5, t5, label146 -.p2align 2 -label600: - bgt a3, t5, label159 - ble a2, t5, label1363 -.p2align 2 -label150: - sh2add t4, t5, t1 - j label151 -.p2align 2 -label154: - addi t4, t4, 4 - mv t5, t6 -.p2align 2 -label151: - addiw t6, t5, 1 - sw a0, 0(t4) - bgt a2, t6, label154 - addiw t3, t3, 1 - ble a2, t3, label156 -.p2align 2 -label158: - add t1, t1, a4 - li t5, 4 - li a7, 1 - mv t4, t1 - slli a6, a7, 32 - addi t6, a6, 1 - sd t6, 0(t1) - sd t6, 8(t1) - bgt a5, t5, label146 - ble a3, t5, label1383 -.p2align 2 -label159: - sh2add t4, t5, t1 -.p2align 2 -label160: - addiw t6, t5, 2 - li s0, 1 - slli a6, s0, 32 - addi a7, a6, 1 - sd a7, 0(t4) - ble a3, t6, label637 - addi t4, t4, 8 - mv t5, t6 - j label160 -.p2align 2 -label637: - mv t5, t6 - bgt a2, t6, label150 - addiw t3, t3, 1 - bgt a2, t3, label158 - j label156 -.p2align 2 -label1383: - bgt a2, t5, label150 - addiw t3, t3, 1 - bgt a2, t3, label158 -label156: - addiw t2, t2, 1 - ble a1, t2, label240 - lui t3, 352 - li t5, 4 - li a7, 1 - addiw t1, t3, -1792 - slli a6, a7, 32 - mv t3, zero - add t0, t0, t1 - addi t6, a6, 1 - mv t4, t0 - mv t1, t0 - sd t6, 0(t0) - sd t6, 8(t0) - bgt a5, t5, label146 - j label600 -.p2align 2 -label1363: - addiw t3, t3, 1 - bgt a2, t3, label158 - j label156 -label5: - addiw t0, a2, -11 - li t2, 15 - bgt t0, t2, label6 - ble t0, zero, label240 - lui t5, 352 - addiw t3, t5, -1792 - mul t4, t6, t3 - mv t3, t6 - add t2, t1, t4 - mv t1, t2 - mv t4, zero - mv t5, t2 - mv t6, zero - j label107 -.p2align 2 -label564: - addiw t4, t4, 1 - ble a2, t4, label116 -.p2align 2 -label118: - add t1, t1, a4 - li t6, 8 - li s0, 1 - mv t5, t1 - slli a7, s0, 32 - addi a6, a7, 1 - sd a6, 0(t1) - sd a6, 8(t1) - sd a6, 16(t1) - sd a6, 24(t1) - ble t0, t6, label1360 -.p2align 2 -label134: - addi t5, t5, 32 -.p2align 2 -label107: - addiw t6, t6, 8 - li s0, 1 - slli a7, s0, 32 - addi a6, a7, 1 - sd a6, 0(t5) - sd a6, 8(t5) - sd a6, 16(t5) - sd a6, 24(t5) - bgt t0, t6, label134 - ble a5, t6, label111 -.p2align 2 
-label129: - sh2add t5, t6, t1 -.p2align 2 -label130: - addiw a6, t6, 4 - li s0, 1 - slli t6, s0, 32 - addi a7, t6, 1 - sd a7, 0(t5) - sd a7, 8(t5) - bgt a5, a6, label133 - mv t6, a6 - ble a3, a6, label1340 -.p2align 2 -label124: - sh2add t5, t6, t1 - j label125 -.p2align 2 -label128: - addi t5, t5, 8 -.p2align 2 -label125: - addiw t6, t6, 2 - li s0, 1 - slli a7, s0, 32 - addi a6, a7, 1 - sd a6, 0(t5) - bgt a3, t6, label128 - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 - j label116 -.p2align 2 -label119: - sh2add t5, t6, t1 -.p2align 2 -label120: - addiw a6, t6, 1 - sw a0, 0(t5) - ble a2, a6, label564 - addi t5, t5, 4 - mv t6, a6 - j label120 -.p2align 2 -label111: - bgt a3, t6, label124 - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 - j label116 -.p2align 2 -label1360: - bgt a5, t6, label129 - bgt a3, t6, label124 - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 - j label116 -.p2align 2 -label1340: - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 -label116: - addiw t3, t3, 1 - ble a1, t3, label240 - lui t4, 352 - mv t6, zero - addiw t1, t4, -1792 - mv t4, zero - add t2, t2, t1 - mv t5, t2 - mv t1, t2 - j label107 -label6: - addiw t2, a2, -26 - addiw t3, a2, -57 - li t4, 31 - ble t2, t4, label56 - lui a7, 352 - addiw a6, a7, -1792 - mul t4, t6, a6 - add t5, t1, t4 - mv t4, t5 - mv a6, zero - mv t1, t5 - mv a7, zero - j label14 -.p2align 2 -label17: - addi t1, t1, 128 -.p2align 2 -label14: - addiw a7, a7, 32 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - sd s0, 16(t1) - sd s0, 24(t1) - sd s0, 32(t1) - sd s0, 40(t1) - sd s0, 48(t1) - sd s0, 56(t1) - sd s0, 64(t1) - sd s0, 72(t1) - sd s0, 80(t1) - sd s0, 88(t1) - sd s0, 96(t1) - sd s0, 104(t1) - sd s0, 112(t1) - sd s0, 120(t1) - bgt t3, a7, label17 - ble t2, a7, label1324 - sh2add t1, a7, t4 - j label20 -.p2align 2 -label23: - addi t1, t1, 64 -.p2align 2 -label20: - addiw a7, a7, 16 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - sd s0, 16(t1) - sd s0, 24(t1) - sd s0, 32(t1) - sd s0, 40(t1) - sd s0, 48(t1) - sd s0, 56(t1) - bgt t2, a7, label23 - ble t0, a7, label1326 -.p2align 2 -label51: - sh2add t1, a7, t4 - j label52 -.p2align 2 -label55: - addi t1, t1, 32 -.p2align 2 -label52: - addiw a7, a7, 8 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - sd s0, 16(t1) - sd s0, 24(t1) - bgt t0, a7, label55 - bgt a5, a7, label46 - bgt a3, a7, label41 - ble a2, a7, label1374 -.p2align 2 -label36: - sh2add t1, a7, t4 -.p2align 2 -label37: - addiw a7, a7, 1 - sw a0, 0(t1) - ble a2, a7, label381 - addi t1, t1, 4 - j label37 -.p2align 2 -label1326: - ble a5, a7, label1350 -.p2align 2 -label46: - sh2add t1, a7, t4 -.p2align 2 -label47: - addiw a7, a7, 4 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - bgt a5, a7, label50 - bgt a3, a7, label41 - bgt a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -.p2align 2 -label381: - addiw a6, a6, 1 - ble a2, a6, label33 -.p2align 2 -label35: - add t4, t4, a4 - mv a7, zero - mv t1, t4 - j label14 -.p2align 2 -label1324: - bgt t0, a7, label51 - bgt a5, a7, label46 - ble a3, a7, label1387 -.p2align 2 -label41: - sh2add t1, a7, t4 - j label42 -.p2align 2 -label45: - addi t1, t1, 8 -.p2align 2 -label42: - addiw a7, a7, 2 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - bgt a3, a7, label45 - bgt a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -.p2align 2 -label1350: - bgt a3, a7, label41 - bgt 
a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -.p2align 2 -label1387: - bgt a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 -label33: - addiw t6, t6, 1 - ble a1, t6, label240 - lui t4, 352 - mv a6, zero - mv a7, zero - addiw t1, t4, -1792 - add t5, t5, t1 - mv t1, t5 - mv t4, t5 - j label14 -.p2align 2 -label1374: - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -label56: - ble t2, zero, label240 - lui a6, 352 - addiw t4, a6, -1792 - mul t5, t6, t4 - mv t4, t6 - add t3, t1, t5 - mv t1, t3 + addiw t0, a0, -3 + addiw t1, a0, -18 + li t3, 15 + ble t0, t3, label88 + mul t5, t4, a4 + add t3, t2, t5 + mv t2, t3 mv t5, zero mv t6, t3 mv a6, zero - j label64 + j label12 .p2align 2 -label98: +label15: addi t6, t6, 64 .p2align 2 -label64: +label12: addiw a6, a6, 16 - li s1, 1 - slli s0, s1, 32 - addi a7, s0, 1 + ori a7, a5, 1 sd a7, 0(t6) sd a7, 8(t6) sd a7, 16(t6) @@ -1097,153 +250,147 @@ label64: sd a7, 40(t6) sd a7, 48(t6) sd a7, 56(t6) - bgt t2, a6, label98 - ble t0, a6, label447 - sh2add t6, a6, t1 - j label69 + bgt t1, a6, label15 + ble t0, a6, label294 + sh2add t6, a6, t2 + mv a7, a6 + j label18 .p2align 2 -label72: - addi t6, t6, 32 - mv a6, a7 +label21: + addi t6, t6, 16 .p2align 2 -label69: - addiw a7, a6, 8 - li s1, 1 - slli s0, s1, 32 - addi a6, s0, 1 +label18: + addiw a7, a7, 4 + ori a6, a5, 1 sd a6, 0(t6) sd a6, 8(t6) - sd a6, 16(t6) - sd a6, 24(t6) - bgt t0, a7, label72 + bgt t0, a7, label21 mv a6, a7 - bgt a5, a7, label75 - bgt a3, a7, label82 - bgt a2, a7, label93 - addiw t5, t5, 1 - bgt a2, t5, label92 - j label90 -.p2align 2 -label1334: - ble a2, a6, label1357 + ble a0, a7, label296 .p2align 2 -label93: - sh2add t6, a6, t1 +label28: + sh2add t6, a6, t2 .p2align 2 -label94: - addiw a7, a6, 1 - sw a0, 0(t6) - ble a2, a7, label513 +label29: + addiw a6, a6, 1 + sw a1, 0(t6) + ble a0, a6, label149 addi t6, t6, 4 - mv a6, a7 - j label94 -.p2align 2 -label75: - sh2add t6, a6, t1 -.p2align 2 -label76: - addiw a6, a6, 4 - li s1, 1 - slli s0, s1, 32 - addi a7, s0, 1 - sd a7, 0(t6) - sd a7, 8(t6) - ble a5, a6, label476 - addi t6, t6, 16 - j label76 + j label29 .p2align 2 -label513: +label149: addiw t5, t5, 1 - ble a2, t5, label90 + ble a0, t5, label297 .p2align 2 -label92: - add t1, t1, a4 +label27: + add t2, t2, a2 mv a6, zero - mv t6, t1 - j label64 -.p2align 2 -label476: - ble a3, a6, label1334 -.p2align 2 -label82: - sh2add t6, a6, t1 - j label83 -.p2align 2 -label86: - addi t6, t6, 8 + mv t6, t2 + j label12 .p2align 2 -label83: - addiw a6, a6, 2 - li s1, 1 - slli s0, s1, 32 - addi a7, s0, 1 - sd a7, 0(t6) - bgt a3, a6, label86 - bgt a2, a6, label93 +label294: + bgt a0, a6, label28 addiw t5, t5, 1 - bgt a2, t5, label92 - j label90 + bgt a0, t5, label27 + j label25 .p2align 2 -label447: - bgt a5, a6, label75 - bgt a3, a6, label82 - bgt a2, a6, label93 - addiw t5, t5, 1 - bgt a2, t5, label92 -label90: +label297: addiw t4, t4, 1 - ble a1, t4, label240 - lui t5, 352 - mv a6, zero - addiw t1, t5, -1792 + ble a3, t4, label2 +.p2align 2 +label26: + add t3, t3, a4 mv t5, zero - add t3, t3, t1 + mv a6, zero mv t6, t3 - mv t1, t3 - j label64 + mv t2, t3 + j label12 .p2align 2 -label1357: +label296: addiw t5, t5, 1 - bgt a2, t5, label92 - j label90 + bgt a0, t5, label27 +label25: + addiw t4, t4, 1 + bgt a3, t4, label26 + j label2 +.p2align 2 +label309: + addiw t2, t2, 1 + bgt a0, t2, label67 + j label65 +label88: + mul t3, t4, a4 + mv t6, zero + add t1, t2, t3 + mv t5, t1 + mv t3, t1 + mv t2, t4 + mv t4, zero + j label40 +.p2align 2 +label52: + addi t5, t5, 4 
+.p2align 2 +label49: + addiw a6, a6, 1 + sw a1, 0(t5) + bgt a0, a6, label52 + addiw t4, t4, 1 + ble a0, t4, label298 .p2align 2 -label133: +label47: + add t3, t3, a2 + li t6, 4 + ori a6, a5, 1 + mv t5, t3 + sd a6, 0(t3) + sd a6, 8(t3) + ble t0, t6, label303 +.p2align 2 +label53: addi t5, t5, 16 - mv t6, a6 - j label130 -label235: - lui t3, 352 - addiw a4, t3, -1792 - mul a3, t6, a4 - add a2, t1, a3 - j label236 -label693: - mv a2, zero - j label194 -label201: - lui a4, 703 - addiw a3, a4, 512 - add a2, a2, a3 - j label198 -label736: - mv a3, zero - j label220 .p2align 2 -label50: - addi t1, t1, 16 - j label47 -label711: - mv a2, zero - j label203 -label731: - mv a2, zero - j label230 -label227: - lui a4, 5625 - mv t6, a3 - add a2, a2, a4 - j label224 -label725: - mv a2, a3 - mv t6, a3 - j label194 +label40: + addiw t6, t6, 4 + ori a6, a5, 1 + sd a6, 0(t5) + sd a6, 8(t5) + bgt t0, t6, label53 +.p2align 2 +label43: + ble a0, t6, label44 +.p2align 2 +label48: + sh2add t5, t6, t3 + mv a6, t6 + j label49 +.p2align 2 +label44: + addiw t4, t4, 1 + bgt a0, t4, label47 +label45: + addiw t2, t2, 1 + bgt a3, t2, label46 + j label2 +.p2align 2 +label303: + bgt a0, t6, label48 + addiw t4, t4, 1 + bgt a0, t4, label47 + j label45 +.p2align 2 +label298: + addiw t2, t2, 1 + ble a3, t2, label2 +.p2align 2 +label46: + add t1, t1, a4 + mv t4, zero + li t6, 4 + ori a6, a5, 1 + mv t5, t1 + mv t3, t1 + sd a6, 0(t1) + sd a6, 8(t1) + bgt t0, t6, label53 + j label43 diff --git a/tests/SysY2022/performance/sl1.sy.ir b/tests/SysY2022/performance/sl1.sy.ir index 61160ca35..b3d55bac9 100644 --- a/tests/SysY2022/performance/sl1.sy.ir +++ b/tests/SysY2022/performance/sl1.sy.ir @@ -8,155 +8,84 @@ func @main() -> i32 { NoRecurse Entry } { i32 %0 = call () -> i32 @getint(); i32 %1 = call () -> i32 @getint(); call (i32) -> void @starttime(i32 13); - i32 %2 = add i32 %0, i32 -1; - i1 %3 = icmp sgt i32 %2, i32 2; - i1 %4 = icmp sgt i32 %0, i32 0; - i32 %5 = add i32 %0, i32 -2; - [600 * [600 * [600 * i32]]]* %6 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; - cbr i1 %4(prob = 0.984615), ^b, ^b1; + i1 %2 = icmp sgt i32 %0, i32 0; + i32 %3 = add i32 %0, i32 -1; + [600 * [600 * [600 * i32]]]* %4 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; + cbr i1 %2(prob = 0.984615), ^b, ^b1; ^b: - [4 * i8]* %7 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_0 to [4 * i8]*; - i32* %8 = ptradd [4 * i8]* %7, i32 0; - store i32* %8 with i32 %0; - i8* %9 = functionptr () -> void @cmmc_parallel_body_0 as i8*; - call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %0, i8* %9); + [4 * i8]* %5 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_0 to [4 * i8]*; + i32* %6 = ptradd [4 * i8]* %5, i32 0; + store i32* %6 with i32 %0; + i8* %7 = functionptr () -> void @cmmc_parallel_body_0 as i8*; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %0, i8* %7); ubr ^b1; ^b1: - i1 %10 = icmp sgt i32 %2, i32 1; - [600 * [600 * i32]]* %11 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i64 0]; - cbr i1 %10(prob = 0.984615), ^while.body, ^b2; + i1 %8 = icmp sgt i32 %3, i32 1; + [600 * [600 * i32]]* %9 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i64 0]; + cbr i1 %8(prob = 0.984615), ^while.body, ^b2; ^while.body: - [600 * [600 * i32]]* %12 = phi [^b1, [600 * [600 * i32]]* %11] [^b5, [600 * [600 * i32]]* %14]; - i32 %13 = phi [^b1, i32 1] [^b5, i32 %16]; - [600 * [600 * i32]]* %14 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i32 %13]; - [600 * [600 * i32]]* %15 = 
getelementptr &([600 * [600 * i32]]* %14)[i64 1]; - i32 %16 = add i32 %13, i32 1; - cbr i1 %3(prob = 0.5), ^b3, ^b4; + [600 * [600 * i32]]* %10 = phi [^b1, [600 * [600 * i32]]* %9] [^b5, [600 * [600 * i32]]* %12]; + i32 %11 = phi [^b1, i32 1] [^b5, i32 %14]; + [600 * [600 * i32]]* %12 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i32 %11]; + [600 * [600 * i32]]* %13 = getelementptr &([600 * [600 * i32]]* %12)[i64 1]; + i32 %14 = add i32 %11, i32 1; + ubr ^b3; ^b2: - [600 * [600 * i32]]* %17 = phi [^b1, [600 * [600 * i32]]* %11] [^b5, [600 * [600 * i32]]* %14]; - i32 %18 = phi [^b1, i32 1] [^b5, i32 %91]; - i32* %19 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i64 0][i64 0][i64 0]; + [600 * [600 * i32]]* %15 = phi [^b1, [600 * [600 * i32]]* %9] [^b5, [600 * [600 * i32]]* %12]; + i32 %16 = phi [^b1, i32 1] [^b5, i32 %31]; + i32* %17 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i64 0][i64 0][i64 0]; call (i32) -> void @stoptime(i32 53); - call (i32, i32*) -> void @putarray(i32 %0, i32* %19); - i32 %20 = sdiv i32 %0, i32 2; - [600 * [600 * i32]]* %21 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i32 %20]; - [600 * i32]* %22 = getelementptr &([600 * [600 * i32]]* %21)[i64 0][i32 %20]; - i32* %23 = getelementptr &([600 * i32]* %22)[i64 0][i64 0]; - call (i32, i32*) -> void @putarray(i32 %0, i32* %23); - i32 %24 = add i32 %18, i32 -1; - [600 * i32]* %25 = getelementptr &([600 * [600 * i32]]* %17)[i64 0][i32 %24]; - i32* %26 = getelementptr &([600 * i32]* %25)[i64 0][i64 0]; - call (i32, i32*) -> void @putarray(i32 %0, i32* %26); + call (i32, i32*) -> void @putarray(i32 %0, i32* %17); + i32 %18 = sdiv i32 %0, i32 2; + [600 * [600 * i32]]* %19 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i32 %18]; + [600 * i32]* %20 = getelementptr &([600 * [600 * i32]]* %19)[i64 0][i32 %18]; + i32* %21 = getelementptr &([600 * i32]* %20)[i64 0][i64 0]; + call (i32, i32*) -> void @putarray(i32 %0, i32* %21); + i32 %22 = add i32 %16, i32 -1; + [600 * i32]* %23 = getelementptr &([600 * [600 * i32]]* %15)[i64 0][i32 %22]; + i32* %24 = getelementptr &([600 * i32]* %23)[i64 0][i64 0]; + call (i32, i32*) -> void @putarray(i32 %0, i32* %24); ret i32 0; ^b3: - i32 %27 = phi [^while.body, i32 1] [^b6, i32 %116]; - [600 * i32]* %28 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i32 %27]; - [600 * i32]* %29 = getelementptr &([600 * i32]* %28)[i64 -1]; - [600 * i32]* %30 = getelementptr &([600 * i32]* %28)[i64 1]; - [600 * i32]* %31 = getelementptr &([600 * [600 * i32]]* %12)[i64 0][i32 %27]; - [600 * i32]* %32 = getelementptr &([600 * [600 * i32]]* %15)[i64 0][i32 %27]; + i32 %25 = phi [^while.body, i32 1] [^b4, i32 %31]; + [600 * i32]* %26 = getelementptr &([600 * [600 * i32]]* %12)[i64 0][i32 %25]; + [600 * i32]* %27 = getelementptr &([600 * i32]* %26)[i64 -1]; + [600 * i32]* %28 = getelementptr &([600 * i32]* %26)[i64 1]; + [600 * i32]* %29 = getelementptr &([600 * [600 * i32]]* %10)[i64 0][i32 %25]; + [600 * i32]* %30 = getelementptr &([600 * [600 * i32]]* %13)[i64 0][i32 %25]; + i32 %31 = add i32 %25, i32 1; ubr ^while.body1; - ^b4: - i32* %33 = getelementptr &([600 * [600 * i32]]* %12)[i64 0][i64 1][i64 1]; + ^while.body1: + i32 %32 = phi [^b3, i32 1] [^while.body1, i32 %52]; + i32* %33 = getelementptr &([600 * i32]* %29)[i64 0][i32 %32]; i32 %34 = load i32* %33; - i32* %35 = getelementptr &([600 * [600 * i32]]* %14)[i64 1][i64 1][i64 1]; + i32* %35 = getelementptr &([600 * i32]* %30)[i64 0][i32 %32]; i32 %36 = load i32* %35; i32 %37 = add i32 %34, i32 
%36; - i32* %38 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 0][i64 1]; + i32* %38 = getelementptr &([600 * i32]* %27)[i64 0][i32 %32]; i32 %39 = load i32* %38; i32 %40 = add i32 %37, i32 %39; - i32* %41 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 2][i64 1]; + i32* %41 = getelementptr &([600 * i32]* %28)[i64 0][i32 %32]; i32 %42 = load i32* %41; i32 %43 = add i32 %40, i32 %42; - i32* %44 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 1][i64 0]; - i32 %45 = load i32* %44; - i32 %46 = add i32 %43, i32 %45; - i32* %47 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 1][i64 2]; - i32 %48 = load i32* %47; - i32 %49 = add i32 %46, i32 %48; - i32 %50 = sdiv i32 %49, i32 %1; - i32* %51 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 1][i64 1]; - store i32* %51 with i32 %50; - ubr ^b5; - ^while.body1: - i32 %52 = phi [^b3, i32 1] [^while.body1, i32 %89]; - i32* %53 = getelementptr &([600 * i32]* %31)[i64 0][i32 %52]; - i32 %54 = load i32* %53; - i32* %55 = getelementptr &([600 * i32]* %32)[i64 0][i32 %52]; - i32 %56 = load i32* %55; - i32 %57 = add i32 %54, i32 %56; - i32* %58 = getelementptr &([600 * i32]* %29)[i64 0][i32 %52]; - i32 %59 = load i32* %58; - i32 %60 = add i32 %57, i32 %59; - i32* %61 = getelementptr &([600 * i32]* %30)[i64 0][i32 %52]; - i32 %62 = load i32* %61; - i32 %63 = add i32 %60, i32 %62; - i32* %64 = getelementptr &([600 * i32]* %28)[i64 0][i32 %52]; - i32* %65 = getelementptr &(i32* %64)[i64 -1]; - i32 %66 = load i32* %65; - i32 %67 = add i32 %63, i32 %66; - i32* %68 = getelementptr &(i32* %64)[i64 1]; - i32 %69 = load i32* %68; - i32 %70 = add i32 %67, i32 %69; - i32 %71 = sdiv i32 %70, i32 %1; - store i32* %64 with i32 %71; - i32* %72 = getelementptr &(i32* %53)[i64 1]; - i32 %73 = load i32* %72; - i32* %74 = getelementptr &(i32* %55)[i64 1]; - i32 %75 = load i32* %74; - i32 %76 = add i32 %73, i32 %75; - i32* %77 = getelementptr &(i32* %58)[i64 1]; - i32 %78 = load i32* %77; - i32 %79 = add i32 %76, i32 %78; - i32* %80 = getelementptr &(i32* %61)[i64 1]; - i32 %81 = load i32* %80; - i32 %82 = add i32 %79, i32 %81; - i32 %83 = load i32* %64; - i32 %84 = add i32 %82, i32 %83; - i32* %85 = getelementptr &(i32* %64)[i64 2]; - i32 %86 = load i32* %85; - i32 %87 = add i32 %84, i32 %86; - i32 %88 = sdiv i32 %87, i32 %1; - store i32* %68 with i32 %88; - i32 %89 = add i32 %52, i32 2; - i1 %90 = icmp sgt i32 %5, i32 %89; - cbr i1 %90(prob = 0.969697), ^while.body1, ^scalar.header; + i32* %44 = getelementptr &([600 * i32]* %26)[i64 0][i32 %32]; + i32* %45 = getelementptr &(i32* %44)[i64 -1]; + i32 %46 = load i32* %45; + i32 %47 = add i32 %43, i32 %46; + i32* %48 = getelementptr &(i32* %44)[i64 1]; + i32 %49 = load i32* %48; + i32 %50 = add i32 %47, i32 %49; + i32 %51 = sdiv i32 %50, i32 %1; + store i32* %44 with i32 %51; + i32 %52 = add i32 %32, i32 1; + i1 %53 = icmp sgt i32 %3, i32 %52; + cbr i1 %53(prob = 0.984615), ^while.body1, ^b4; + ^b4: + i1 %54 = icmp sgt i32 %3, i32 %31; + cbr i1 %54(prob = 0.984615), ^b3, ^b5; ^b5: - i32 %91 = phi [^b4, i32 2] [^b6, i32 %116]; - i1 %92 = icmp sgt i32 %2, i32 %16; - cbr i1 %92(prob = 0.984615), ^while.body, ^b2; - ^scalar.header: - i1 %93 = icmp sgt i32 %2, i32 %89; - cbr i1 %93(prob = 0.5), ^while.body2, ^b6; - ^while.body2 {scalar}: - i32 %94 = phi [^scalar.header, i32 %89] [^while.body2, i32 %114]; - i32* %95 = getelementptr &([600 * i32]* %31)[i64 0][i32 %94]; - i32 %96 = load i32* %95; - i32* %97 = getelementptr &([600 * i32]* %32)[i64 0][i32 %94]; - i32 %98 = load i32* %97; - i32 %99 = add 
i32 %96, i32 %98; - i32* %100 = getelementptr &([600 * i32]* %29)[i64 0][i32 %94]; - i32 %101 = load i32* %100; - i32 %102 = add i32 %99, i32 %101; - i32* %103 = getelementptr &([600 * i32]* %30)[i64 0][i32 %94]; - i32 %104 = load i32* %103; - i32 %105 = add i32 %102, i32 %104; - i32* %106 = getelementptr &([600 * i32]* %28)[i64 0][i32 %94]; - i32* %107 = getelementptr &(i32* %106)[i64 -1]; - i32 %108 = load i32* %107; - i32 %109 = add i32 %105, i32 %108; - i32* %110 = getelementptr &(i32* %106)[i64 1]; - i32 %111 = load i32* %110; - i32 %112 = add i32 %109, i32 %111; - i32 %113 = sdiv i32 %112, i32 %1; - store i32* %106 with i32 %113; - i32 %114 = add i32 %94, i32 1; - i1 %115 = icmp sgt i32 %2, i32 %114; - cbr i1 %115(prob = 0.5), ^while.body2, ^b6; - ^b6: - i32 %116 = add i32 %27, i32 1; - i1 %117 = icmp sgt i32 %2, i32 %116; - cbr i1 %117(prob = 0.984615), ^b3, ^b5; + i1 %55 = icmp sgt i32 %3, i32 %14; + cbr i1 %55(prob = 0.984615), ^while.body, ^b2; } internal func @cmmcParallelFor(i32, i32, i8*) -> void { NoRecurse }; internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -167,729 +96,157 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %5 = icmp sgt i32 %4, i32 0; cbr i1 %5(prob = 0.5), ^cond, ^b1; ^cond: - i1 %6 = icmp sgt i32 %4, i32 1; - i32 %7 = add i32 %1, i32 -57; - i32 %8 = add i32 %1, i32 -26; - i32 %9 = add i32 %1, i32 -11; - i32 %10 = add i32 %1, i32 -4; - i32 %11 = add i32 %1, i32 -1; - [600 * [600 * [600 * i32]]]* %12 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; - cbr i1 %6(prob = 0.5), ^cond1, ^super.header; + i1 %6 = icmp sgt i32 %4, i32 3; + [600 * [600 * [600 * i32]]]* %7 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; + cbr i1 %6(prob = 0.5), ^cond1, ^b2; ^b1: ret; - ^cond1: - i32 %13 = add i32 %4, i32 -1; - i1 %14 = icmp sgt i32 %13, i32 3; - cbr i1 %14(prob = 0.5), ^cond2, ^b2; - ^super.header: - i32 %15 = add i32 %0, i32 1; - i1 %16 = icmp sgt i32 %1, i32 %15; - cbr i1 %16(prob = 0.969697), ^super.header1, ^scalar.header; ^b2: - i32 %17 = phi [^cond1, i32 %0] [^b13, i32 %225]; - [600 * [600 * i32]]* %18 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %17]; + i32 %8 = phi [^cond, i32 %0] [^b5, i32 %57]; + [600 * [600 * i32]]* %9 = getelementptr &([600 * [600 * [600 * i32]]]* %7)[i64 0][i32 %8]; ubr ^while.body; - ^cond2: - i32 %19 = add i32 %4, i32 -4; - i1 %20 = icmp sgt i32 %19, i32 7; - cbr i1 %20(prob = 0.5), ^cond3, ^cond4; - ^super.header1: - i32 %21 = add i32 %0, i32 3; - i1 %22 = icmp sgt i32 %11, i32 %21; - cbr i1 %22(prob = 0.969697), ^super.header2, ^scalar.header1; - ^scalar.header: - i32 %23 = phi [^super.header, i32 %0] [^scalar.header1, i32 %33] [^b5, i32 %61]; - i1 %24 = icmp sgt i32 %1, i32 %23; - cbr i1 %24(prob = 0.5), ^b3, ^b1; - ^while.body: - i32 %25 = phi [^b2, i32 0] [^scalar.final3, i32 %153]; - [600 * i32]* %26 = getelementptr &([600 * [600 * i32]]* %18)[i64 0][i32 %25]; + ^cond1: + i32 %10 = add i32 %4, i32 -3; + i1 %11 = icmp sgt i32 %10, i32 15; + i32 %12 = add i32 %4, i32 -18; + cbr i1 %11(prob = 0.5), ^b3, ^b4; + ^b3: + i32 %13 = phi [^cond1, i32 %0] [^b7, i32 %82]; + [600 * [600 * i32]]* %14 = getelementptr &([600 * [600 * [600 * i32]]]* %7)[i64 0][i32 %13]; ubr ^while.body1; - ^cond3: - i32 %27 = add i32 %4, i32 -11; - i1 %28 = icmp sgt i32 %27, i32 15; - cbr i1 %28(prob = 0.5), ^cond5, ^cond6; - ^super.header2: - i32 %29 = add i32 %0, i32 7; - i1 %30 = icmp sgt i32 
%10, i32 %29; - cbr i1 %30(prob = 0.969697), ^super.header3, ^scalar.header2; - ^cond4: - i1 %31 = icmp sgt i32 %19, i32 0; - cbr i1 %31(prob = 0.5), ^b4, ^b1; - ^scalar.header1: - i32 %32 = phi [^super.header1, i32 %0] [^scalar.final, i32 %71]; - i32 %33 = phi [^super.header1, i32 undef] [^scalar.final, i32 %71]; - i1 %34 = icmp sgt i32 %11, i32 %32; - cbr i1 %34(prob = 0.5), ^b5, ^scalar.header; - ^b3 {scalar}: - i32 %35 = phi [^scalar.header, i32 %23] [^b3, i32 %38]; - [600 * [600 * i32]]* %36 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %35]; - i32* %37 = getelementptr &([600 * [600 * i32]]* %36)[i64 0][i64 0][i64 0]; - store i32* %37 with i32 1; - i32 %38 = add i32 %35, i32 1; - i1 %39 = icmp sgt i32 %1, i32 %38; - cbr i1 %39(prob = 0.5), ^b3, ^b1; + ^while.body: + i32 %15 = phi [^b2, i32 0] [^scalar.final, i32 %53]; + [600 * i32]* %16 = getelementptr &([600 * [600 * i32]]* %9)[i64 0][i32 %15]; + ubr ^while.body3; ^b4: - i32 %40 = phi [^cond4, i32 %0] [^b14, i32 %325]; - [600 * [600 * i32]]* %41 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %40]; + i32 %17 = phi [^cond1, i32 %0] [^b6, i32 %80]; + [600 * [600 * i32]]* %18 = getelementptr &([600 * [600 * [600 * i32]]]* %7)[i64 0][i32 %17]; ubr ^while.body2; - ^cond5: - i32 %42 = add i32 %4, i32 -26; - i1 %43 = icmp sgt i32 %42, i32 0; - i1 %44 = icmp sgt i32 %42, i32 31; - i32 %45 = add i32 %4, i32 -57; - cbr i1 %44(prob = 0.5), ^b6, ^cond7; - ^super.header3: - i32 %46 = add i32 %0, i32 15; - i1 %47 = icmp sgt i32 %9, i32 %46; - cbr i1 %47(prob = 0.969697), ^super.header4, ^scalar.header3; - ^while.body1 {scalar}: - i32 %48 = phi [^while.body, i32 0] [^while.body1, i32 %51]; - i32* %49 = getelementptr &([600 * i32]* %26)[i64 0][i32 %48]; - store i32* %49 with i32 1; - i32* %50 = getelementptr &(i32* %49)[i64 1]; - store i32* %50 with i32 1; - i32 %51 = add i32 %48, i32 2; - i1 %52 = icmp sgt i32 %13, i32 %51; - cbr i1 %52(prob = 0.5), ^while.body1, ^scalar.final1; - ^cond6: - i1 %53 = icmp sgt i32 %27, i32 0; - cbr i1 %53(prob = 0.5), ^b7, ^b1; - ^scalar.header2: - i32 %54 = phi [^super.header2, i32 %0] [^scalar.final2, i32 %126]; - i32 %55 = phi [^super.header2, i32 undef] [^scalar.final2, i32 %126]; - i1 %56 = icmp sgt i32 %10, i32 %54; - cbr i1 %56(prob = 0.5), ^b8, ^scalar.final; - ^b5 {scalar}: - i32 %57 = phi [^scalar.header1, i32 %32] [^b5, i32 %61]; - [600 * [600 * i32]]* %58 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %57]; - i32* %59 = getelementptr &([600 * [600 * i32]]* %58)[i64 0][i64 0][i64 0]; - store i32* %59 with i32 1; - i32* %60 = getelementptr &([600 * [600 * i32]]* %58)[i64 1][i64 0][i64 0]; - store i32* %60 with i32 1; - i32 %61 = add i32 %57, i32 2; - i1 %62 = icmp sgt i32 %11, i32 %61; - cbr i1 %62(prob = 0.5), ^b5, ^scalar.header; - ^b6: - i32 %63 = phi [^cond5, i32 %0] [^b16, i32 %379]; - [600 * [600 * i32]]* %64 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %63]; - ubr ^while.body3; + ^while.body1: + i32 %19 = phi [^b3, i32 0] [^scalar.final4, i32 %78]; + [600 * i32]* %20 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i32 %19]; + ubr ^while.body4; ^while.body2: - i32 %65 = phi [^b4, i32 0] [^scalar.final11, i32 %290]; - [600 * i32]* %66 = getelementptr &([600 * [600 * i32]]* %41)[i64 0][i32 %65]; + i32 %21 = phi [^b4, i32 0] [^scalar.final3, i32 %72]; + [600 * i32]* %22 = getelementptr &([600 * [600 * i32]]* %18)[i64 0][i32 %21]; ubr ^while.body5; - ^b7: - i32 %67 = phi [^cond6, i32 %0] [^b15, i32 %365]; - [600 * [600 * i32]]* %68 = 
getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %67]; - ubr ^while.body4; - ^super.header4: - i32 %69 = add i32 %0, i32 31; - i1 %70 = icmp sgt i32 %8, i32 %69; - cbr i1 %70(prob = 0.969697), ^b10, ^scalar.header4; - ^scalar.final: - i32 %71 = phi [^scalar.header2, i32 %55] [^b8, i32 %81]; - ubr ^scalar.header1; - ^cond7: - cbr i1 %43(prob = 0.5), ^b9, ^b1; - ^scalar.header3: - i32 %72 = phi [^super.header3, i32 %0] [^scalar.final4, i32 %192]; - i32 %73 = phi [^super.header3, i32 undef] [^scalar.final4, i32 %192]; - i1 %74 = icmp sgt i32 %9, i32 %72; - cbr i1 %74(prob = 0.5), ^b11, ^scalar.final2; - ^b8 {scalar}: - i32 %75 = phi [^scalar.header2, i32 %54] [^b8, i32 %81]; - [600 * [600 * i32]]* %76 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %75]; - i32* %77 = getelementptr &([600 * [600 * i32]]* %76)[i64 0][i64 0][i64 0]; - store i32* %77 with i32 1; - i32* %78 = getelementptr &([600 * [600 * i32]]* %76)[i64 1][i64 0][i64 0]; - store i32* %78 with i32 1; - i32* %79 = getelementptr &([600 * [600 * i32]]* %76)[i64 2][i64 0][i64 0]; - store i32* %79 with i32 1; - i32* %80 = getelementptr &([600 * [600 * i32]]* %76)[i64 3][i64 0][i64 0]; - store i32* %80 with i32 1; - i32 %81 = add i32 %75, i32 4; - i1 %82 = icmp sgt i32 %10, i32 %81; - cbr i1 %82(prob = 0.5), ^b8, ^scalar.final; - ^scalar.final1: - i1 %83 = icmp sgt i32 %4, i32 %51; - cbr i1 %83(prob = 0.5), ^while.body6, ^scalar.final3; - ^while.body3: - i32 %84 = phi [^b6, i32 0] [^scalar.final20, i32 %375]; - [600 * i32]* %85 = getelementptr &([600 * [600 * i32]]* %64)[i64 0][i32 %84]; - ubr ^while.body8; + ^while.body3 {scalar}: + i32 %23 = phi [^while.body, i32 0] [^while.body3, i32 %25]; + i32* %24 = getelementptr &([600 * i32]* %16)[i64 0][i32 %23]; + store i32* %24 with i32 1; + i32 %25 = add i32 %23, i32 1; + i1 %26 = icmp sgt i32 %4, i32 %25; + cbr i1 %26(prob = 0.75), ^while.body3, ^scalar.final; ^while.body4: - i32 %86 = phi [^b7, i32 0] [^scalar.final17, i32 %349]; - [600 * i32]* %87 = getelementptr &([600 * [600 * i32]]* %68)[i64 0][i32 %86]; - ubr ^while.body9; - ^b9: - i32 %88 = phi [^cond7, i32 %0] [^b17, i32 %381]; - [600 * [600 * i32]]* %89 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %88]; - ubr ^while.body7; - ^b10: - i32 %90 = phi [^super.header4, i32 %0] [^b10, i32 %124]; - [600 * [600 * i32]]* %91 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %90]; - i32* %92 = getelementptr &([600 * [600 * i32]]* %91)[i64 0][i64 0][i64 0]; - store i32* %92 with i32 1; - i32* %93 = getelementptr &([600 * [600 * i32]]* %91)[i64 1][i64 0][i64 0]; - store i32* %93 with i32 1; - i32* %94 = getelementptr &([600 * [600 * i32]]* %91)[i64 2][i64 0][i64 0]; - store i32* %94 with i32 1; - i32* %95 = getelementptr &([600 * [600 * i32]]* %91)[i64 3][i64 0][i64 0]; - store i32* %95 with i32 1; - i32* %96 = getelementptr &([600 * [600 * i32]]* %91)[i64 4][i64 0][i64 0]; - store i32* %96 with i32 1; - i32* %97 = getelementptr &([600 * [600 * i32]]* %91)[i64 5][i64 0][i64 0]; - store i32* %97 with i32 1; - i32* %98 = getelementptr &([600 * [600 * i32]]* %91)[i64 6][i64 0][i64 0]; - store i32* %98 with i32 1; - i32* %99 = getelementptr &([600 * [600 * i32]]* %91)[i64 7][i64 0][i64 0]; - store i32* %99 with i32 1; - i32* %100 = getelementptr &([600 * [600 * i32]]* %91)[i64 8][i64 0][i64 0]; - store i32* %100 with i32 1; - i32* %101 = getelementptr &([600 * [600 * i32]]* %91)[i64 9][i64 0][i64 0]; - store i32* %101 with i32 1; - i32* %102 = getelementptr &([600 * [600 * i32]]* %91)[i64 
10][i64 0][i64 0]; - store i32* %102 with i32 1; - i32* %103 = getelementptr &([600 * [600 * i32]]* %91)[i64 11][i64 0][i64 0]; - store i32* %103 with i32 1; - i32* %104 = getelementptr &([600 * [600 * i32]]* %91)[i64 12][i64 0][i64 0]; - store i32* %104 with i32 1; - i32* %105 = getelementptr &([600 * [600 * i32]]* %91)[i64 13][i64 0][i64 0]; - store i32* %105 with i32 1; - i32* %106 = getelementptr &([600 * [600 * i32]]* %91)[i64 14][i64 0][i64 0]; - store i32* %106 with i32 1; - i32* %107 = getelementptr &([600 * [600 * i32]]* %91)[i64 15][i64 0][i64 0]; - store i32* %107 with i32 1; - i32* %108 = getelementptr &([600 * [600 * i32]]* %91)[i64 16][i64 0][i64 0]; - store i32* %108 with i32 1; - i32* %109 = getelementptr &([600 * [600 * i32]]* %91)[i64 17][i64 0][i64 0]; - store i32* %109 with i32 1; - i32* %110 = getelementptr &([600 * [600 * i32]]* %91)[i64 18][i64 0][i64 0]; - store i32* %110 with i32 1; - i32* %111 = getelementptr &([600 * [600 * i32]]* %91)[i64 19][i64 0][i64 0]; - store i32* %111 with i32 1; - i32* %112 = getelementptr &([600 * [600 * i32]]* %91)[i64 20][i64 0][i64 0]; - store i32* %112 with i32 1; - i32* %113 = getelementptr &([600 * [600 * i32]]* %91)[i64 21][i64 0][i64 0]; - store i32* %113 with i32 1; - i32* %114 = getelementptr &([600 * [600 * i32]]* %91)[i64 22][i64 0][i64 0]; - store i32* %114 with i32 1; - i32* %115 = getelementptr &([600 * [600 * i32]]* %91)[i64 23][i64 0][i64 0]; - store i32* %115 with i32 1; - i32* %116 = getelementptr &([600 * [600 * i32]]* %91)[i64 24][i64 0][i64 0]; - store i32* %116 with i32 1; - i32* %117 = getelementptr &([600 * [600 * i32]]* %91)[i64 25][i64 0][i64 0]; - store i32* %117 with i32 1; - i32* %118 = getelementptr &([600 * [600 * i32]]* %91)[i64 26][i64 0][i64 0]; - store i32* %118 with i32 1; - i32* %119 = getelementptr &([600 * [600 * i32]]* %91)[i64 27][i64 0][i64 0]; - store i32* %119 with i32 1; - i32* %120 = getelementptr &([600 * [600 * i32]]* %91)[i64 28][i64 0][i64 0]; - store i32* %120 with i32 1; - i32* %121 = getelementptr &([600 * [600 * i32]]* %91)[i64 29][i64 0][i64 0]; - store i32* %121 with i32 1; - i32* %122 = getelementptr &([600 * [600 * i32]]* %91)[i64 30][i64 0][i64 0]; - store i32* %122 with i32 1; - i32* %123 = getelementptr &([600 * [600 * i32]]* %91)[i64 31][i64 0][i64 0]; - store i32* %123 with i32 1; - i32 %124 = add i32 %90, i32 32; - i1 %125 = icmp sgt i32 %7, i32 %124; - cbr i1 %125(prob = 0.969697), ^b10, ^scalar.header4; - ^scalar.final2: - i32 %126 = phi [^scalar.header3, i32 %73] [^b11, i32 %147]; - ubr ^scalar.header2; + i32 %27 = phi [^while.body1, i32 0] [^while.body4, i32 %44]; + i32* %28 = getelementptr &([600 * i32]* %20)[i64 0][i32 %27]; + store i32* %28 with i32 1; + i32* %29 = getelementptr &(i32* %28)[i64 1]; + store i32* %29 with i32 1; + i32* %30 = getelementptr &(i32* %28)[i64 2]; + store i32* %30 with i32 1; + i32* %31 = getelementptr &(i32* %28)[i64 3]; + store i32* %31 with i32 1; + i32* %32 = getelementptr &(i32* %28)[i64 4]; + store i32* %32 with i32 1; + i32* %33 = getelementptr &(i32* %28)[i64 5]; + store i32* %33 with i32 1; + i32* %34 = getelementptr &(i32* %28)[i64 6]; + store i32* %34 with i32 1; + i32* %35 = getelementptr &(i32* %28)[i64 7]; + store i32* %35 with i32 1; + i32* %36 = getelementptr &(i32* %28)[i64 8]; + store i32* %36 with i32 1; + i32* %37 = getelementptr &(i32* %28)[i64 9]; + store i32* %37 with i32 1; + i32* %38 = getelementptr &(i32* %28)[i64 10]; + store i32* %38 with i32 1; + i32* %39 = getelementptr &(i32* %28)[i64 11]; + store i32* %39 
with i32 1; + i32* %40 = getelementptr &(i32* %28)[i64 12]; + store i32* %40 with i32 1; + i32* %41 = getelementptr &(i32* %28)[i64 13]; + store i32* %41 with i32 1; + i32* %42 = getelementptr &(i32* %28)[i64 14]; + store i32* %42 with i32 1; + i32* %43 = getelementptr &(i32* %28)[i64 15]; + store i32* %43 with i32 1; + i32 %44 = add i32 %27, i32 16; + i1 %45 = icmp sgt i32 %12, i32 %44; + cbr i1 %45(prob = 0.941176), ^while.body4, ^scalar.header; ^while.body5 {scalar}: - i32 %127 = phi [^while.body2, i32 0] [^while.body5, i32 %132]; - i32* %128 = getelementptr &([600 * i32]* %66)[i64 0][i32 %127]; - store i32* %128 with i32 1; - i32* %129 = getelementptr &(i32* %128)[i64 1]; - store i32* %129 with i32 1; - i32* %130 = getelementptr &(i32* %128)[i64 2]; - store i32* %130 with i32 1; - i32* %131 = getelementptr &(i32* %128)[i64 3]; - store i32* %131 with i32 1; - i32 %132 = add i32 %127, i32 4; - i1 %133 = icmp sgt i32 %19, i32 %132; - cbr i1 %133(prob = 0.5), ^while.body5, ^scalar.final5; - ^scalar.header4: - i32 %134 = phi [^super.header4, i32 %0] [^b10, i32 %124]; - i32 %135 = phi [^super.header4, i32 undef] [^b10, i32 %124]; - i1 %136 = icmp sgt i32 %8, i32 %134; - cbr i1 %136(prob = 0.5), ^b12, ^scalar.final4; - ^b11 {scalar}: - i32 %137 = phi [^scalar.header3, i32 %72] [^b11, i32 %147]; - [600 * [600 * i32]]* %138 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %137]; - i32* %139 = getelementptr &([600 * [600 * i32]]* %138)[i64 0][i64 0][i64 0]; - store i32* %139 with i32 1; - i32* %140 = getelementptr &([600 * [600 * i32]]* %138)[i64 1][i64 0][i64 0]; - store i32* %140 with i32 1; - i32* %141 = getelementptr &([600 * [600 * i32]]* %138)[i64 2][i64 0][i64 0]; - store i32* %141 with i32 1; - i32* %142 = getelementptr &([600 * [600 * i32]]* %138)[i64 3][i64 0][i64 0]; - store i32* %142 with i32 1; - i32* %143 = getelementptr &([600 * [600 * i32]]* %138)[i64 4][i64 0][i64 0]; - store i32* %143 with i32 1; - i32* %144 = getelementptr &([600 * [600 * i32]]* %138)[i64 5][i64 0][i64 0]; - store i32* %144 with i32 1; - i32* %145 = getelementptr &([600 * [600 * i32]]* %138)[i64 6][i64 0][i64 0]; - store i32* %145 with i32 1; - i32* %146 = getelementptr &([600 * [600 * i32]]* %138)[i64 7][i64 0][i64 0]; - store i32* %146 with i32 1; - i32 %147 = add i32 %137, i32 8; - i1 %148 = icmp sgt i32 %9, i32 %147; - cbr i1 %148(prob = 0.5), ^b11, ^scalar.final2; + i32 %46 = phi [^while.body2, i32 0] [^while.body5, i32 %51]; + i32* %47 = getelementptr &([600 * i32]* %22)[i64 0][i32 %46]; + store i32* %47 with i32 1; + i32* %48 = getelementptr &(i32* %47)[i64 1]; + store i32* %48 with i32 1; + i32* %49 = getelementptr &(i32* %47)[i64 2]; + store i32* %49 with i32 1; + i32* %50 = getelementptr &(i32* %47)[i64 3]; + store i32* %50 with i32 1; + i32 %51 = add i32 %46, i32 4; + i1 %52 = icmp sgt i32 %10, i32 %51; + cbr i1 %52(prob = 0.75), ^while.body5, ^scalar.final1; + ^scalar.final: + i32 %53 = add i32 %15, i32 1; + i1 %54 = icmp sgt i32 %4, i32 %53; + cbr i1 %54(prob = 0.984615), ^while.body, ^b5; + ^scalar.header: + i1 %55 = icmp sgt i32 %10, i32 %44; + cbr i1 %55(prob = 0.75), ^while.body6, ^scalar.final2; + ^scalar.final1: + i1 %56 = icmp sgt i32 %4, i32 %51; + cbr i1 %56(prob = 0.75), ^while.body7, ^scalar.final3; + ^b5: + i32 %57 = add i32 %8, i32 1; + i1 %58 = icmp sgt i32 %1, i32 %57; + cbr i1 %58(prob = 0.984615), ^b2, ^b1; ^while.body6 {scalar}: - i32 %149 = phi [^scalar.final1, i32 %51] [^while.body6, i32 %151]; - i32* %150 = getelementptr &([600 * i32]* %26)[i64 0][i32 %149]; - 
store i32* %150 with i32 1; - i32 %151 = add i32 %149, i32 1; - i1 %152 = icmp sgt i32 %4, i32 %151; - cbr i1 %152(prob = 0.5), ^while.body6, ^scalar.final3; + i32 %59 = phi [^scalar.header, i32 %44] [^while.body6, i32 %64]; + i32* %60 = getelementptr &([600 * i32]* %20)[i64 0][i32 %59]; + store i32* %60 with i32 1; + i32* %61 = getelementptr &(i32* %60)[i64 1]; + store i32* %61 with i32 1; + i32* %62 = getelementptr &(i32* %60)[i64 2]; + store i32* %62 with i32 1; + i32* %63 = getelementptr &(i32* %60)[i64 3]; + store i32* %63 with i32 1; + i32 %64 = add i32 %59, i32 4; + i1 %65 = icmp sgt i32 %10, i32 %64; + cbr i1 %65(prob = 0.75), ^while.body6, ^scalar.final2; + ^scalar.final2: + i32 %66 = phi [^scalar.header, i32 %44] [^while.body6, i32 %64]; + i1 %67 = icmp sgt i32 %4, i32 %66; + cbr i1 %67(prob = 0.75), ^while.body8, ^scalar.final4; + ^while.body7 {scalar}: + i32 %68 = phi [^scalar.final1, i32 %51] [^while.body7, i32 %70]; + i32* %69 = getelementptr &([600 * i32]* %22)[i64 0][i32 %68]; + store i32* %69 with i32 1; + i32 %70 = add i32 %68, i32 1; + i1 %71 = icmp sgt i32 %4, i32 %70; + cbr i1 %71(prob = 0.75), ^while.body7, ^scalar.final3; ^scalar.final3: - i32 %153 = add i32 %25, i32 1; - i1 %154 = icmp sgt i32 %4, i32 %153; - cbr i1 %154(prob = 0.984615), ^while.body, ^b13; - ^while.body7: - i32 %155 = phi [^b9, i32 0] [^scalar.final21, i32 %377]; - [600 * i32]* %156 = getelementptr &([600 * [600 * i32]]* %89)[i64 0][i32 %155]; - ubr ^while.body10; - ^while.body8: - i32 %157 = phi [^while.body3, i32 0] [^while.body8, i32 %190]; - i32* %158 = getelementptr &([600 * i32]* %85)[i64 0][i32 %157]; - store i32* %158 with i32 1; - i32* %159 = getelementptr &(i32* %158)[i64 1]; - store i32* %159 with i32 1; - i32* %160 = getelementptr &(i32* %158)[i64 2]; - store i32* %160 with i32 1; - i32* %161 = getelementptr &(i32* %158)[i64 3]; - store i32* %161 with i32 1; - i32* %162 = getelementptr &(i32* %158)[i64 4]; - store i32* %162 with i32 1; - i32* %163 = getelementptr &(i32* %158)[i64 5]; - store i32* %163 with i32 1; - i32* %164 = getelementptr &(i32* %158)[i64 6]; - store i32* %164 with i32 1; - i32* %165 = getelementptr &(i32* %158)[i64 7]; - store i32* %165 with i32 1; - i32* %166 = getelementptr &(i32* %158)[i64 8]; - store i32* %166 with i32 1; - i32* %167 = getelementptr &(i32* %158)[i64 9]; - store i32* %167 with i32 1; - i32* %168 = getelementptr &(i32* %158)[i64 10]; - store i32* %168 with i32 1; - i32* %169 = getelementptr &(i32* %158)[i64 11]; - store i32* %169 with i32 1; - i32* %170 = getelementptr &(i32* %158)[i64 12]; - store i32* %170 with i32 1; - i32* %171 = getelementptr &(i32* %158)[i64 13]; - store i32* %171 with i32 1; - i32* %172 = getelementptr &(i32* %158)[i64 14]; - store i32* %172 with i32 1; - i32* %173 = getelementptr &(i32* %158)[i64 15]; - store i32* %173 with i32 1; - i32* %174 = getelementptr &(i32* %158)[i64 16]; - store i32* %174 with i32 1; - i32* %175 = getelementptr &(i32* %158)[i64 17]; - store i32* %175 with i32 1; - i32* %176 = getelementptr &(i32* %158)[i64 18]; - store i32* %176 with i32 1; - i32* %177 = getelementptr &(i32* %158)[i64 19]; - store i32* %177 with i32 1; - i32* %178 = getelementptr &(i32* %158)[i64 20]; - store i32* %178 with i32 1; - i32* %179 = getelementptr &(i32* %158)[i64 21]; - store i32* %179 with i32 1; - i32* %180 = getelementptr &(i32* %158)[i64 22]; - store i32* %180 with i32 1; - i32* %181 = getelementptr &(i32* %158)[i64 23]; - store i32* %181 with i32 1; - i32* %182 = getelementptr &(i32* %158)[i64 24]; - store i32* 
%182 with i32 1; - i32* %183 = getelementptr &(i32* %158)[i64 25]; - store i32* %183 with i32 1; - i32* %184 = getelementptr &(i32* %158)[i64 26]; - store i32* %184 with i32 1; - i32* %185 = getelementptr &(i32* %158)[i64 27]; - store i32* %185 with i32 1; - i32* %186 = getelementptr &(i32* %158)[i64 28]; - store i32* %186 with i32 1; - i32* %187 = getelementptr &(i32* %158)[i64 29]; - store i32* %187 with i32 1; - i32* %188 = getelementptr &(i32* %158)[i64 30]; - store i32* %188 with i32 1; - i32* %189 = getelementptr &(i32* %158)[i64 31]; - store i32* %189 with i32 1; - i32 %190 = add i32 %157, i32 32; - i1 %191 = icmp sgt i32 %45, i32 %190; - cbr i1 %191(prob = 0.969697), ^while.body8, ^scalar.header5; + i32 %72 = add i32 %21, i32 1; + i1 %73 = icmp sgt i32 %4, i32 %72; + cbr i1 %73(prob = 0.984615), ^while.body2, ^b6; + ^while.body8 {scalar}: + i32 %74 = phi [^scalar.final2, i32 %66] [^while.body8, i32 %76]; + i32* %75 = getelementptr &([600 * i32]* %20)[i64 0][i32 %74]; + store i32* %75 with i32 1; + i32 %76 = add i32 %74, i32 1; + i1 %77 = icmp sgt i32 %4, i32 %76; + cbr i1 %77(prob = 0.75), ^while.body8, ^scalar.final4; ^scalar.final4: - i32 %192 = phi [^scalar.header4, i32 %135] [^b12, i32 %222]; - ubr ^scalar.header3; - ^while.body9 {scalar}: - i32 %193 = phi [^while.body4, i32 0] [^while.body9, i32 %202]; - i32* %194 = getelementptr &([600 * i32]* %87)[i64 0][i32 %193]; - store i32* %194 with i32 1; - i32* %195 = getelementptr &(i32* %194)[i64 1]; - store i32* %195 with i32 1; - i32* %196 = getelementptr &(i32* %194)[i64 2]; - store i32* %196 with i32 1; - i32* %197 = getelementptr &(i32* %194)[i64 3]; - store i32* %197 with i32 1; - i32* %198 = getelementptr &(i32* %194)[i64 4]; - store i32* %198 with i32 1; - i32* %199 = getelementptr &(i32* %194)[i64 5]; - store i32* %199 with i32 1; - i32* %200 = getelementptr &(i32* %194)[i64 6]; - store i32* %200 with i32 1; - i32* %201 = getelementptr &(i32* %194)[i64 7]; - store i32* %201 with i32 1; - i32 %202 = add i32 %193, i32 8; - i1 %203 = icmp sgt i32 %27, i32 %202; - cbr i1 %203(prob = 0.5), ^while.body9, ^scalar.final6; - ^b12 {scalar}: - i32 %204 = phi [^scalar.header4, i32 %134] [^b12, i32 %222]; - [600 * [600 * i32]]* %205 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %204]; - i32* %206 = getelementptr &([600 * [600 * i32]]* %205)[i64 0][i64 0][i64 0]; - store i32* %206 with i32 1; - i32* %207 = getelementptr &([600 * [600 * i32]]* %205)[i64 1][i64 0][i64 0]; - store i32* %207 with i32 1; - i32* %208 = getelementptr &([600 * [600 * i32]]* %205)[i64 2][i64 0][i64 0]; - store i32* %208 with i32 1; - i32* %209 = getelementptr &([600 * [600 * i32]]* %205)[i64 3][i64 0][i64 0]; - store i32* %209 with i32 1; - i32* %210 = getelementptr &([600 * [600 * i32]]* %205)[i64 4][i64 0][i64 0]; - store i32* %210 with i32 1; - i32* %211 = getelementptr &([600 * [600 * i32]]* %205)[i64 5][i64 0][i64 0]; - store i32* %211 with i32 1; - i32* %212 = getelementptr &([600 * [600 * i32]]* %205)[i64 6][i64 0][i64 0]; - store i32* %212 with i32 1; - i32* %213 = getelementptr &([600 * [600 * i32]]* %205)[i64 7][i64 0][i64 0]; - store i32* %213 with i32 1; - i32* %214 = getelementptr &([600 * [600 * i32]]* %205)[i64 8][i64 0][i64 0]; - store i32* %214 with i32 1; - i32* %215 = getelementptr &([600 * [600 * i32]]* %205)[i64 9][i64 0][i64 0]; - store i32* %215 with i32 1; - i32* %216 = getelementptr &([600 * [600 * i32]]* %205)[i64 10][i64 0][i64 0]; - store i32* %216 with i32 1; - i32* %217 = getelementptr &([600 * [600 * i32]]* 
%205)[i64 11][i64 0][i64 0]; - store i32* %217 with i32 1; - i32* %218 = getelementptr &([600 * [600 * i32]]* %205)[i64 12][i64 0][i64 0]; - store i32* %218 with i32 1; - i32* %219 = getelementptr &([600 * [600 * i32]]* %205)[i64 13][i64 0][i64 0]; - store i32* %219 with i32 1; - i32* %220 = getelementptr &([600 * [600 * i32]]* %205)[i64 14][i64 0][i64 0]; - store i32* %220 with i32 1; - i32* %221 = getelementptr &([600 * [600 * i32]]* %205)[i64 15][i64 0][i64 0]; - store i32* %221 with i32 1; - i32 %222 = add i32 %204, i32 16; - i1 %223 = icmp sgt i32 %8, i32 %222; - cbr i1 %223(prob = 0.5), ^b12, ^scalar.final4; - ^scalar.final5: - i1 %224 = icmp sgt i32 %13, i32 %132; - cbr i1 %224(prob = 0.5), ^while.body11, ^scalar.final7; - ^b13: - i32 %225 = add i32 %17, i32 1; - i1 %226 = icmp sgt i32 %1, i32 %225; - cbr i1 %226(prob = 0.984615), ^b2, ^b1; - ^scalar.header5: - i1 %227 = icmp sgt i32 %42, i32 %190; - cbr i1 %227(prob = 0.5), ^while.body12, ^scalar.final8; - ^while.body10 {scalar}: - i32 %228 = phi [^while.body7, i32 0] [^while.body10, i32 %245]; - i32* %229 = getelementptr &([600 * i32]* %156)[i64 0][i32 %228]; - store i32* %229 with i32 1; - i32* %230 = getelementptr &(i32* %229)[i64 1]; - store i32* %230 with i32 1; - i32* %231 = getelementptr &(i32* %229)[i64 2]; - store i32* %231 with i32 1; - i32* %232 = getelementptr &(i32* %229)[i64 3]; - store i32* %232 with i32 1; - i32* %233 = getelementptr &(i32* %229)[i64 4]; - store i32* %233 with i32 1; - i32* %234 = getelementptr &(i32* %229)[i64 5]; - store i32* %234 with i32 1; - i32* %235 = getelementptr &(i32* %229)[i64 6]; - store i32* %235 with i32 1; - i32* %236 = getelementptr &(i32* %229)[i64 7]; - store i32* %236 with i32 1; - i32* %237 = getelementptr &(i32* %229)[i64 8]; - store i32* %237 with i32 1; - i32* %238 = getelementptr &(i32* %229)[i64 9]; - store i32* %238 with i32 1; - i32* %239 = getelementptr &(i32* %229)[i64 10]; - store i32* %239 with i32 1; - i32* %240 = getelementptr &(i32* %229)[i64 11]; - store i32* %240 with i32 1; - i32* %241 = getelementptr &(i32* %229)[i64 12]; - store i32* %241 with i32 1; - i32* %242 = getelementptr &(i32* %229)[i64 13]; - store i32* %242 with i32 1; - i32* %243 = getelementptr &(i32* %229)[i64 14]; - store i32* %243 with i32 1; - i32* %244 = getelementptr &(i32* %229)[i64 15]; - store i32* %244 with i32 1; - i32 %245 = add i32 %228, i32 16; - i1 %246 = icmp sgt i32 %42, i32 %245; - cbr i1 %246(prob = 0.5), ^while.body10, ^scalar.final9; - ^while.body11 {scalar}: - i32 %247 = phi [^scalar.final5, i32 %132] [^while.body11, i32 %250]; - i32* %248 = getelementptr &([600 * i32]* %66)[i64 0][i32 %247]; - store i32* %248 with i32 1; - i32* %249 = getelementptr &(i32* %248)[i64 1]; - store i32* %249 with i32 1; - i32 %250 = add i32 %247, i32 2; - i1 %251 = icmp sgt i32 %13, i32 %250; - cbr i1 %251(prob = 0.5), ^while.body11, ^scalar.final7; - ^scalar.final6: - i1 %252 = icmp sgt i32 %19, i32 %202; - cbr i1 %252(prob = 0.5), ^while.body13, ^scalar.final10; - ^scalar.final7: - i32 %253 = phi [^scalar.final5, i32 %132] [^while.body11, i32 %250]; - i1 %254 = icmp sgt i32 %4, i32 %253; - cbr i1 %254(prob = 0.5), ^while.body14, ^scalar.final11; - ^while.body12 {scalar}: - i32 %255 = phi [^scalar.header5, i32 %190] [^while.body12, i32 %272]; - i32* %256 = getelementptr &([600 * i32]* %85)[i64 0][i32 %255]; - store i32* %256 with i32 1; - i32* %257 = getelementptr &(i32* %256)[i64 1]; - store i32* %257 with i32 1; - i32* %258 = getelementptr &(i32* %256)[i64 2]; - store i32* %258 with i32 1; - 
i32* %259 = getelementptr &(i32* %256)[i64 3]; - store i32* %259 with i32 1; - i32* %260 = getelementptr &(i32* %256)[i64 4]; - store i32* %260 with i32 1; - i32* %261 = getelementptr &(i32* %256)[i64 5]; - store i32* %261 with i32 1; - i32* %262 = getelementptr &(i32* %256)[i64 6]; - store i32* %262 with i32 1; - i32* %263 = getelementptr &(i32* %256)[i64 7]; - store i32* %263 with i32 1; - i32* %264 = getelementptr &(i32* %256)[i64 8]; - store i32* %264 with i32 1; - i32* %265 = getelementptr &(i32* %256)[i64 9]; - store i32* %265 with i32 1; - i32* %266 = getelementptr &(i32* %256)[i64 10]; - store i32* %266 with i32 1; - i32* %267 = getelementptr &(i32* %256)[i64 11]; - store i32* %267 with i32 1; - i32* %268 = getelementptr &(i32* %256)[i64 12]; - store i32* %268 with i32 1; - i32* %269 = getelementptr &(i32* %256)[i64 13]; - store i32* %269 with i32 1; - i32* %270 = getelementptr &(i32* %256)[i64 14]; - store i32* %270 with i32 1; - i32* %271 = getelementptr &(i32* %256)[i64 15]; - store i32* %271 with i32 1; - i32 %272 = add i32 %255, i32 16; - i1 %273 = icmp sgt i32 %42, i32 %272; - cbr i1 %273(prob = 0.5), ^while.body12, ^scalar.final8; - ^while.body13 {scalar}: - i32 %274 = phi [^scalar.final6, i32 %202] [^while.body13, i32 %279]; - i32* %275 = getelementptr &([600 * i32]* %87)[i64 0][i32 %274]; - store i32* %275 with i32 1; - i32* %276 = getelementptr &(i32* %275)[i64 1]; - store i32* %276 with i32 1; - i32* %277 = getelementptr &(i32* %275)[i64 2]; - store i32* %277 with i32 1; - i32* %278 = getelementptr &(i32* %275)[i64 3]; - store i32* %278 with i32 1; - i32 %279 = add i32 %274, i32 4; - i1 %280 = icmp sgt i32 %19, i32 %279; - cbr i1 %280(prob = 0.5), ^while.body13, ^scalar.final10; - ^scalar.final8: - i32 %281 = phi [^scalar.header5, i32 %190] [^while.body12, i32 %272]; - i1 %282 = icmp sgt i32 %27, i32 %281; - cbr i1 %282(prob = 0.5), ^while.body15, ^scalar.final12; - ^scalar.final9: - i1 %283 = icmp sgt i32 %27, i32 %245; - cbr i1 %283(prob = 0.5), ^while.body16, ^scalar.final13; - ^while.body14 {scalar}: - i32 %284 = phi [^scalar.final7, i32 %253] [^while.body14, i32 %286]; - i32* %285 = getelementptr &([600 * i32]* %66)[i64 0][i32 %284]; - store i32* %285 with i32 1; - i32 %286 = add i32 %284, i32 1; - i1 %287 = icmp sgt i32 %4, i32 %286; - cbr i1 %287(prob = 0.5), ^while.body14, ^scalar.final11; - ^scalar.final10: - i32 %288 = phi [^scalar.final6, i32 %202] [^while.body13, i32 %279]; - i1 %289 = icmp sgt i32 %13, i32 %288; - cbr i1 %289(prob = 0.5), ^while.body17, ^scalar.final14; - ^scalar.final11: - i32 %290 = add i32 %65, i32 1; - i1 %291 = icmp sgt i32 %4, i32 %290; - cbr i1 %291(prob = 0.984615), ^while.body2, ^b14; - ^while.body15 {scalar}: - i32 %292 = phi [^scalar.final8, i32 %281] [^while.body15, i32 %301]; - i32* %293 = getelementptr &([600 * i32]* %85)[i64 0][i32 %292]; - store i32* %293 with i32 1; - i32* %294 = getelementptr &(i32* %293)[i64 1]; - store i32* %294 with i32 1; - i32* %295 = getelementptr &(i32* %293)[i64 2]; - store i32* %295 with i32 1; - i32* %296 = getelementptr &(i32* %293)[i64 3]; - store i32* %296 with i32 1; - i32* %297 = getelementptr &(i32* %293)[i64 4]; - store i32* %297 with i32 1; - i32* %298 = getelementptr &(i32* %293)[i64 5]; - store i32* %298 with i32 1; - i32* %299 = getelementptr &(i32* %293)[i64 6]; - store i32* %299 with i32 1; - i32* %300 = getelementptr &(i32* %293)[i64 7]; - store i32* %300 with i32 1; - i32 %301 = add i32 %292, i32 8; - i1 %302 = icmp sgt i32 %27, i32 %301; - cbr i1 %302(prob = 0.5), ^while.body15, 
^scalar.final12; - ^while.body16 {scalar}: - i32 %303 = phi [^scalar.final9, i32 %245] [^while.body16, i32 %312]; - i32* %304 = getelementptr &([600 * i32]* %156)[i64 0][i32 %303]; - store i32* %304 with i32 1; - i32* %305 = getelementptr &(i32* %304)[i64 1]; - store i32* %305 with i32 1; - i32* %306 = getelementptr &(i32* %304)[i64 2]; - store i32* %306 with i32 1; - i32* %307 = getelementptr &(i32* %304)[i64 3]; - store i32* %307 with i32 1; - i32* %308 = getelementptr &(i32* %304)[i64 4]; - store i32* %308 with i32 1; - i32* %309 = getelementptr &(i32* %304)[i64 5]; - store i32* %309 with i32 1; - i32* %310 = getelementptr &(i32* %304)[i64 6]; - store i32* %310 with i32 1; - i32* %311 = getelementptr &(i32* %304)[i64 7]; - store i32* %311 with i32 1; - i32 %312 = add i32 %303, i32 8; - i1 %313 = icmp sgt i32 %27, i32 %312; - cbr i1 %313(prob = 0.5), ^while.body16, ^scalar.final13; - ^while.body17 {scalar}: - i32 %314 = phi [^scalar.final10, i32 %288] [^while.body17, i32 %317]; - i32* %315 = getelementptr &([600 * i32]* %87)[i64 0][i32 %314]; - store i32* %315 with i32 1; - i32* %316 = getelementptr &(i32* %315)[i64 1]; - store i32* %316 with i32 1; - i32 %317 = add i32 %314, i32 2; - i1 %318 = icmp sgt i32 %13, i32 %317; - cbr i1 %318(prob = 0.5), ^while.body17, ^scalar.final14; - ^scalar.final12: - i32 %319 = phi [^scalar.final8, i32 %281] [^while.body15, i32 %301]; - i1 %320 = icmp sgt i32 %19, i32 %319; - cbr i1 %320(prob = 0.5), ^while.body18, ^scalar.final15; - ^scalar.final13: - i32 %321 = phi [^scalar.final9, i32 %245] [^while.body16, i32 %312]; - i1 %322 = icmp sgt i32 %19, i32 %321; - cbr i1 %322(prob = 0.5), ^while.body19, ^scalar.final16; - ^scalar.final14: - i32 %323 = phi [^scalar.final10, i32 %288] [^while.body17, i32 %317]; - i1 %324 = icmp sgt i32 %4, i32 %323; - cbr i1 %324(prob = 0.5), ^while.body20, ^scalar.final17; - ^b14: - i32 %325 = add i32 %40, i32 1; - i1 %326 = icmp sgt i32 %1, i32 %325; - cbr i1 %326(prob = 0.984615), ^b4, ^b1; - ^while.body18 {scalar}: - i32 %327 = phi [^scalar.final12, i32 %319] [^while.body18, i32 %332]; - i32* %328 = getelementptr &([600 * i32]* %85)[i64 0][i32 %327]; - store i32* %328 with i32 1; - i32* %329 = getelementptr &(i32* %328)[i64 1]; - store i32* %329 with i32 1; - i32* %330 = getelementptr &(i32* %328)[i64 2]; - store i32* %330 with i32 1; - i32* %331 = getelementptr &(i32* %328)[i64 3]; - store i32* %331 with i32 1; - i32 %332 = add i32 %327, i32 4; - i1 %333 = icmp sgt i32 %19, i32 %332; - cbr i1 %333(prob = 0.5), ^while.body18, ^scalar.final15; - ^while.body19 {scalar}: - i32 %334 = phi [^scalar.final13, i32 %321] [^while.body19, i32 %339]; - i32* %335 = getelementptr &([600 * i32]* %156)[i64 0][i32 %334]; - store i32* %335 with i32 1; - i32* %336 = getelementptr &(i32* %335)[i64 1]; - store i32* %336 with i32 1; - i32* %337 = getelementptr &(i32* %335)[i64 2]; - store i32* %337 with i32 1; - i32* %338 = getelementptr &(i32* %335)[i64 3]; - store i32* %338 with i32 1; - i32 %339 = add i32 %334, i32 4; - i1 %340 = icmp sgt i32 %19, i32 %339; - cbr i1 %340(prob = 0.5), ^while.body19, ^scalar.final16; - ^while.body20 {scalar}: - i32 %341 = phi [^scalar.final14, i32 %323] [^while.body20, i32 %343]; - i32* %342 = getelementptr &([600 * i32]* %87)[i64 0][i32 %341]; - store i32* %342 with i32 1; - i32 %343 = add i32 %341, i32 1; - i1 %344 = icmp sgt i32 %4, i32 %343; - cbr i1 %344(prob = 0.5), ^while.body20, ^scalar.final17; - ^scalar.final15: - i32 %345 = phi [^scalar.final12, i32 %319] [^while.body18, i32 %332]; - i1 %346 = icmp 
sgt i32 %13, i32 %345; - cbr i1 %346(prob = 0.5), ^while.body21, ^scalar.final18; - ^scalar.final16: - i32 %347 = phi [^scalar.final13, i32 %321] [^while.body19, i32 %339]; - i1 %348 = icmp sgt i32 %13, i32 %347; - cbr i1 %348(prob = 0.5), ^while.body22, ^scalar.final19; - ^scalar.final17: - i32 %349 = add i32 %86, i32 1; - i1 %350 = icmp sgt i32 %4, i32 %349; - cbr i1 %350(prob = 0.984615), ^while.body4, ^b15; - ^while.body21 {scalar}: - i32 %351 = phi [^scalar.final15, i32 %345] [^while.body21, i32 %354]; - i32* %352 = getelementptr &([600 * i32]* %85)[i64 0][i32 %351]; - store i32* %352 with i32 1; - i32* %353 = getelementptr &(i32* %352)[i64 1]; - store i32* %353 with i32 1; - i32 %354 = add i32 %351, i32 2; - i1 %355 = icmp sgt i32 %13, i32 %354; - cbr i1 %355(prob = 0.5), ^while.body21, ^scalar.final18; - ^while.body22 {scalar}: - i32 %356 = phi [^scalar.final16, i32 %347] [^while.body22, i32 %359]; - i32* %357 = getelementptr &([600 * i32]* %156)[i64 0][i32 %356]; - store i32* %357 with i32 1; - i32* %358 = getelementptr &(i32* %357)[i64 1]; - store i32* %358 with i32 1; - i32 %359 = add i32 %356, i32 2; - i1 %360 = icmp sgt i32 %13, i32 %359; - cbr i1 %360(prob = 0.5), ^while.body22, ^scalar.final19; - ^scalar.final18: - i32 %361 = phi [^scalar.final15, i32 %345] [^while.body21, i32 %354]; - i1 %362 = icmp sgt i32 %4, i32 %361; - cbr i1 %362(prob = 0.5), ^while.body23, ^scalar.final20; - ^scalar.final19: - i32 %363 = phi [^scalar.final16, i32 %347] [^while.body22, i32 %359]; - i1 %364 = icmp sgt i32 %4, i32 %363; - cbr i1 %364(prob = 0.5), ^while.body24, ^scalar.final21; - ^b15: - i32 %365 = add i32 %67, i32 1; - i1 %366 = icmp sgt i32 %1, i32 %365; - cbr i1 %366(prob = 0.984615), ^b7, ^b1; - ^while.body23 {scalar}: - i32 %367 = phi [^scalar.final18, i32 %361] [^while.body23, i32 %369]; - i32* %368 = getelementptr &([600 * i32]* %85)[i64 0][i32 %367]; - store i32* %368 with i32 1; - i32 %369 = add i32 %367, i32 1; - i1 %370 = icmp sgt i32 %4, i32 %369; - cbr i1 %370(prob = 0.5), ^while.body23, ^scalar.final20; - ^while.body24 {scalar}: - i32 %371 = phi [^scalar.final19, i32 %363] [^while.body24, i32 %373]; - i32* %372 = getelementptr &([600 * i32]* %156)[i64 0][i32 %371]; - store i32* %372 with i32 1; - i32 %373 = add i32 %371, i32 1; - i1 %374 = icmp sgt i32 %4, i32 %373; - cbr i1 %374(prob = 0.5), ^while.body24, ^scalar.final21; - ^scalar.final20: - i32 %375 = add i32 %84, i32 1; - i1 %376 = icmp sgt i32 %4, i32 %375; - cbr i1 %376(prob = 0.984615), ^while.body3, ^b16; - ^scalar.final21: - i32 %377 = add i32 %155, i32 1; - i1 %378 = icmp sgt i32 %4, i32 %377; - cbr i1 %378(prob = 0.984615), ^while.body7, ^b17; - ^b16: - i32 %379 = add i32 %63, i32 1; - i1 %380 = icmp sgt i32 %1, i32 %379; - cbr i1 %380(prob = 0.984615), ^b6, ^b1; - ^b17: - i32 %381 = add i32 %88, i32 1; - i1 %382 = icmp sgt i32 %1, i32 %381; - cbr i1 %382(prob = 0.984615), ^b9, ^b1; + i32 %78 = add i32 %19, i32 1; + i1 %79 = icmp sgt i32 %4, i32 %78; + cbr i1 %79(prob = 0.984615), ^while.body1, ^b7; + ^b6: + i32 %80 = add i32 %17, i32 1; + i1 %81 = icmp sgt i32 %1, i32 %80; + cbr i1 %81(prob = 0.984615), ^b4, ^b1; + ^b7: + i32 %82 = add i32 %13, i32 1; + i1 %83 = icmp sgt i32 %1, i32 %82; + cbr i1 %83(prob = 0.984615), ^b3, ^b1; } internal [4 * i8]* @cmmc_parallel_body_payload_0, align 8; diff --git a/tests/SysY2022/performance/sl2.arm.s b/tests/SysY2022/performance/sl2.arm.s index 4b74a0ef0..5473b2c2f 100644 --- a/tests/SysY2022/performance/sl2.arm.s +++ b/tests/SysY2022/performance/sl2.arm.s @@ -1,10 +1,10 @@ 
.arch armv7ve .data .bss -.align 8 +.p2align 3 x: .zero 864000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/sl2.riscv.s b/tests/SysY2022/performance/sl2.riscv.s index 2de6a784a..57daf2e06 100644 --- a/tests/SysY2022/performance/sl2.riscv.s +++ b/tests/SysY2022/performance/sl2.riscv.s @@ -1,1094 +1,247 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 x: .zero 864000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text .p2align 2 .globl main main: - addi sp, sp, -104 + addi sp, sp, -72 sd ra, 0(sp) - sd s6, 8(sp) - sd s1, 16(sp) - sd s0, 24(sp) - sd s5, 32(sp) - sd s2, 40(sp) - sd s3, 48(sp) - sd s4, 56(sp) + sd s5, 8(sp) + sd s0, 16(sp) + sd s1, 24(sp) + sd s6, 32(sp) + sd s4, 40(sp) + sd s2, 48(sp) + sd s3, 56(sp) sd s7, 64(sp) - sd s9, 72(sp) - sd s8, 80(sp) - sd s11, 88(sp) - sd s10, 96(sp) jal getint - mv s6, a0 + mv s5, a0 jal getint - mv s0, a0 + mv s1, a0 li a0, 13 jal _sysy_starttime - li a0, 75 - addiw s1, s6, -2 -pcrel1697: - auipc a1, %pcrel_hi(x) - addiw s2, s6, -1 - slli s3, a0, 5 - addi s5, a1, %pcrel_lo(pcrel1697) - sub s4, zero, s3 - ble s6, zero, label1452 -pcrel1698: + li a1, 75 +pcrel468: + auipc a0, %pcrel_hi(x) + addiw s0, s5, -1 + slli s2, a1, 5 + addi s4, a0, %pcrel_lo(pcrel468) + sub s3, zero, s2 + lui a0, 352 + addiw s6, a0, -1792 + ble s5, zero, label329 +pcrel469: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel1699: +pcrel470: auipc a3, %pcrel_hi(cmmc_parallel_body_0) - sw s6, %pcrel_lo(pcrel1698)(a0) - addi a2, a3, %pcrel_lo(pcrel1699) - mv a1, s6 + sw s5, %pcrel_lo(pcrel469)(a0) + addi a2, a3, %pcrel_lo(pcrel470) + mv a1, s5 mv a0, zero jal cmmcParallelFor -label1452: +label329: li a0, 1 - ble s2, a0, label1496 - lui a2, 352 - mv a3, s5 - addiw a0, a2, -1792 - li a2, 1 - add a1, s5, a0 - lui a5, 352 - addiw a2, a2, 1 - addiw a0, a5, -1792 - li a5, 2 - add a4, a1, a0 - bgt s2, a5, label1462 -.p2align 2 -label1458: - li t0, 601 - slli a0, t0, 2 - lui t0, 352 - add a5, a3, a0 - addiw a3, t0, 612 - lw a4, 0(a5) - add t2, a1, a3 - lw t1, 0(t2) - lw t0, 4(a1) - addw a5, a4, t1 - li a4, 1201 - addw a3, a5, t0 - slli t2, a4, 2 - add t0, a1, s3 - add a5, a1, t2 - addi t2, a0, 4 - lw t1, 0(a5) - lw a5, 0(t0) - addw a4, a3, t1 - add t1, a1, t2 - addw a3, a4, a5 - li t2, 2 - lw a5, 0(t1) - add t1, a1, a0 - addw t0, a3, a5 - divw a4, t0, s0 - sw a4, 0(t1) - ble s2, a2, label1526 -.p2align 2 -label1461: - lui a5, 352 - mv a3, a1 - addiw a2, a2, 1 - addiw a4, a5, -1792 - add a0, a1, a4 - mv a1, a0 - addiw a0, a5, -1792 - li a5, 2 - add a4, a1, a0 - ble s2, a5, label1458 -.p2align 2 -label1462: - add a0, a1, s3 + ble s0, a0, label360 + add t0, s4, s6 + mv a4, s4 + li t1, 1 + add a5, t0, s6 + addiw t1, t1, 1 + add a0, t0, s2 li t2, 1 - mul t5, t2, s3 - add t0, a0, s4 - add t1, a0, s3 - add a5, a4, t5 - add t3, a3, t5 - li t5, 1 - addi t4, t3, 4 - j label1466 -.p2align 2 -label1469: - bgt s2, t5, label1470 + mul t4, t2, s2 + add a2, a0, s3 addiw t2, t2, 1 - ble s2, t2, label1686 -.p2align 2 -label1476: - add a0, a0, s3 - mul t5, t2, s3 - add t1, a0, s3 - add t0, a0, s4 - add a5, a4, t5 - add t3, a3, t5 - li t5, 1 - addi t4, t3, 4 -.p2align 2 -label1466: - sh2add a6, t5, a5 - lw s7, 0(t4) - sh2add a7, t5, t0 - lw s9, 0(a6) - lw s8, 0(a7) - addw t6, s7, s9 - sh2add s7, t5, t1 - addw s9, t6, s8 - lw s11, 0(s7) - sh2add t6, t5, a0 - addw s8, s9, s11 - addiw t5, t5, 2 - lw s10, -4(t6) - addw s9, s8, s10 - lw s8, 4(t6) - 
addw s11, s9, s8 - divw s8, s11, s0 - sw s8, 0(t6) - lw s9, 4(t4) - lw s11, 4(a6) - lw a6, 4(a7) - addw s8, s9, s11 - lw a7, 4(s7) - addw s10, s8, a6 - lw s8, 0(t6) - addw a6, s10, a7 - lw s9, 8(t6) - addw a7, a6, s8 - addw s7, a7, s9 - divw a6, s7, s0 - sw a6, 4(t6) - ble s1, t5, label1469 - addi t4, t4, 8 - j label1466 -.p2align 2 -label1470: - sh2add t3, t5, t3 + add a1, a5, t4 + add a3, s4, t4 + li t4, 1 + addi t3, a3, 4 + add a3, a0, s2 + j label341 +.p2align 2 +label348: + addi t3, t3, 4 .p2align 2 -label1471: - sh2add t6, t5, a5 - lw a6, 0(t3) - sh2add a7, t5, t0 - lw s7, 0(t6) - addw t4, a6, s7 - sh2add s7, t5, t1 +label341: + sh2add a6, t4, a1 + lw t6, 0(t3) + sh2add a7, t4, a2 + lw s7, 0(a6) lw a6, 0(a7) + addw t5, t6, s7 + sh2add s7, t4, a3 + addw t6, t5, a6 lw a7, 0(s7) - addw t6, t4, a6 - sh2add t4, t5, a0 + sh2add t5, t4, a0 addw a6, t6, a7 - addiw t5, t5, 1 - lw s8, -4(t4) - lw a7, 4(t4) - addw t6, a6, s8 - addw s7, t6, a7 - divw a6, s7, s0 - sw a6, 0(t4) - ble s2, t5, label1613 - addi t3, t3, 4 - j label1471 -.p2align 2 -label1613: + addiw t4, t4, 1 + lw a7, -4(t5) + lw s7, 4(t5) + addw t6, a6, a7 + addw a7, t6, s7 + divw a6, a7, s1 + sw a6, 0(t5) + bgt s0, t4, label348 + ble s0, t2, label411 + add a0, a0, s2 + mul t4, t2, s2 + add a2, a0, s3 addiw t2, t2, 1 - bgt s2, t2, label1476 - bgt s2, a2, label1461 - j label1526 -.p2align 2 -label1686: - bgt s2, a2, label1461 -label1526: - mv s0, a1 + add a1, a5, t4 + add a3, a4, t4 + li t4, 1 + addi t3, a3, 4 + add a3, a0, s2 + j label341 +.p2align 2 +label411: + ble s0, t1, label463 + add a0, t0, s6 + mv a4, t0 + addiw t1, t1, 1 + li t2, 1 + add a5, a0, s6 + mv t0, a0 + mul t4, t2, s2 + add a0, a0, s2 + addiw t2, t2, 1 + add a1, a5, t4 + add a3, a4, t4 + add a2, a0, s3 + li t4, 1 + addi t3, a3, 4 + add a3, a0, s2 + j label341 +label463: + mv s0, t0 mv s1, t2 -label1478: +label330: li a0, 53 jal _sysy_stoptime - mv a0, s6 - mv a1, s5 + mv a0, s5 + mv a1, s4 jal putarray - lui a4, 352 - srliw a0, s6, 31 - addiw a5, a4, -1792 - add a1, s6, a0 + srliw a0, s5, 31 + add a1, s5, a0 + mv a0, s5 sraiw a2, a1, 1 - mul a4, a2, s3 - mul a0, a2, a5 - add a3, s5, a0 - mv a0, s6 + mul a4, a2, s2 + mul a5, a2, s6 + add a3, s4, a5 add a1, a3, a4 jal putarray - mv a0, s6 + mv a0, s5 addiw a3, s1, -1 - mul a2, a3, s3 + mul a2, a3, s2 add a1, s0, a2 jal putarray - mv a0, zero ld ra, 0(sp) - ld s6, 8(sp) - ld s1, 16(sp) - ld s0, 24(sp) - ld s5, 32(sp) - ld s2, 40(sp) - ld s3, 48(sp) - ld s4, 56(sp) + mv a0, zero + ld s5, 8(sp) + ld s0, 16(sp) + ld s1, 24(sp) + ld s6, 32(sp) + ld s4, 40(sp) + ld s2, 48(sp) + ld s3, 56(sp) ld s7, 64(sp) - ld s9, 72(sp) - ld s8, 80(sp) - ld s11, 88(sp) - ld s10, 96(sp) - addi sp, sp, 104 + addi sp, sp, 72 ret -label1496: - mv s0, s5 +label360: + mv s0, s4 li s1, 1 - j label1478 + j label330 .p2align 2 cmmc_parallel_body_0: - addi sp, sp, -24 - mv t6, a0 -pcrel1448: - auipc a4, %pcrel_hi(cmmc_parallel_body_payload_0) - li a3, 75 - sd s2, 0(sp) - li a0, 1 - sd s1, 8(sp) - sd s0, 16(sp) - lw a2, %pcrel_lo(pcrel1448)(a4) - slli a4, a3, 5 - ble a2, zero, label240 - addiw t4, a1, -57 - addiw t3, a1, -26 - addiw t2, a1, -11 - addiw t0, a1, -4 - addiw a5, a1, -1 -pcrel1449: - auipc a3, %pcrel_hi(x) - addi t1, a3, %pcrel_lo(pcrel1449) - bgt a2, a0, label3 - addiw a2, t6, 1 - ble a1, a2, label186 - addiw a3, t6, 3 - ble a5, a3, label693 - addiw a3, t6, 7 - ble t0, a3, label711 - addiw a3, t6, 15 - ble t2, a3, label731 - addiw a2, t6, 31 - ble t3, a2, label736 - lui t5, 352 - addiw a3, t5, -1792 - mul a4, t6, a3 - mv a3, t6 - add 
a2, t1, a4 -.p2align 2 -label216: - addiw a3, a3, 32 - lui t5, 352 - sw a0, 0(a2) - addiw a4, t5, -1792 - slli a6, a4, 1 - add t6, a2, a4 - add t5, a2, a6 - sw a0, 0(t6) - sh1add t6, a4, a4 - sw a0, 0(t5) - add a7, a2, t6 - slli t5, a6, 1 - sw a0, 0(a7) - sh2add a6, a4, a4 - add a7, a2, t5 - add s0, a2, a6 - sw a0, 0(a7) - slli a7, t6, 1 - sw a0, 0(s0) - add a6, a2, a7 - lui a7, 2461 - sw a0, 0(a6) - addiw t6, a7, -256 - slli a7, t5, 1 - add a6, a2, t6 - sh3add t5, a4, a4 - add t6, a2, a7 - sw a0, 0(a6) - lui a7, 5977 - add a6, a2, t5 - sw a0, 0(t6) - lui t6, 3516 - sw a0, 0(a6) - addiw t5, t6, -1536 - lui a6, 4570 - lui t6, 3867 - add a4, a2, t5 - addiw t5, t6, 768 - sw a0, 0(a4) - lui t6, 4219 - add a4, a2, t5 - addiw t5, t6, -1024 - sw a0, 0(a4) - addiw t6, a6, 1280 - add a4, a2, t5 - lui a6, 4922 - add t5, a2, t6 - sw a0, 0(a4) - lui t6, 5273 - addiw a4, a6, -512 - sw a0, 0(t5) - lui a6, 5625 - add t5, a2, a4 - addiw a4, t6, 1792 - sw a0, 0(t5) - add t6, a2, a6 - add t5, a2, a4 - addiw a4, a7, -1792 - sw a0, 0(t5) - add t5, a2, a4 - sw a0, 0(t6) - lui t6, 6328 - sw a0, 0(t5) - addiw a4, t6, 512 - lui t6, 6680 - add t5, a2, a4 - addiw a4, t6, -1280 - sw a0, 0(t5) - lui t6, 7031 - add t5, a2, a4 - addiw a4, t6, 1024 - sw a0, 0(t5) - lui t6, 7383 - add t5, a2, a4 - addiw a4, t6, -768 - sw a0, 0(t5) - lui t6, 7734 - add t5, a2, a4 - addiw a4, t6, 1536 - sw a0, 0(t5) - lui t6, 8086 - add t5, a2, a4 - addiw a4, t6, -256 - sw a0, 0(t5) - lui t6, 8438 - add t5, a2, a4 - addiw a4, t6, -2048 - sw a0, 0(t5) - lui t6, 8789 - add t5, a2, a4 - addiw a4, t6, 256 - sw a0, 0(t5) - lui t6, 9141 - add t5, a2, a4 - addiw a4, t6, -1536 - sw a0, 0(t5) - lui t6, 9492 - add t5, a2, a4 - addiw a4, t6, 768 - sw a0, 0(t5) - lui t6, 9844 - add t5, a2, a4 - addiw a4, t6, -1024 - sw a0, 0(t5) - lui t6, 10195 - add t5, a2, a4 - addiw a4, t6, 1280 - sw a0, 0(t5) - lui t6, 10547 - add t5, a2, a4 - addiw a4, t6, -512 - sw a0, 0(t5) - lui t6, 10898 - add t5, a2, a4 - addiw a4, t6, 1792 - sw a0, 0(t5) - add t5, a2, a4 - sw a0, 0(t5) - ble t4, a3, label774 - lui a4, 11250 - add a2, a2, a4 - j label216 -label240: - ld s2, 0(sp) - ld s1, 8(sp) - ld s0, 16(sp) - addi sp, sp, 24 + mv t4, a0 + mv a3, a1 +pcrel327: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_0) + li a5, 75 + lui t0, 352 + li t1, 1 + li a1, 1 + addiw a4, t0, -1792 + lw a0, %pcrel_lo(pcrel327)(a2) + slli a2, a5, 5 + slli a5, t1, 32 + bgt a0, zero, label3 +label2: ret -label774: - mv t6, a3 -label220: - ble t3, t6, label779 - lui t4, 352 - addiw a3, t4, -1792 - mul a4, t6, a3 - add a2, t1, a4 -label224: - addiw a3, t6, 16 - lui t5, 352 - sw a0, 0(a2) - addiw a4, t5, -1792 - sh1add t5, a4, a4 - slli t6, a4, 1 - add t4, a2, a4 - add a7, a2, t5 - add a6, a2, t6 - sw a0, 0(t4) - slli t4, t6, 1 - sw a0, 0(a6) - sh2add t6, a4, a4 - add a6, a2, t4 - sw a0, 0(a7) - add a7, a2, t6 - sw a0, 0(a6) - slli a6, t5, 1 - sw a0, 0(a7) - add t6, a2, a6 - lui a7, 2461 - slli a6, t4, 1 - sw a0, 0(t6) - addiw t5, a7, -256 - add t6, a2, t5 - add t5, a2, a6 - sw a0, 0(t6) - lui a6, 3516 - sh3add t6, a4, a4 - sw a0, 0(t5) - addiw a4, a6, -1536 - add t4, a2, t6 - add t5, a2, a4 - lui t6, 3867 - sw a0, 0(t4) - addiw t4, t6, 768 - sw a0, 0(t5) - lui t6, 4219 - add a4, a2, t4 - addiw t4, t6, -1024 - sw a0, 0(a4) - lui t6, 4570 - add t5, a2, t4 - addiw a4, t6, 1280 - sw a0, 0(t5) - lui t6, 4922 - add t4, a2, a4 - addiw a4, t6, -512 - sw a0, 0(t4) - lui t6, 5273 - add t5, a2, a4 - addiw t4, t6, 1792 - sw a0, 0(t5) - add a4, a2, t4 - sw a0, 0(a4) - bgt t3, a3, label227 -label779: - mv a2, 
a3 - mv t6, a3 -label230: - bgt t2, t6, label235 - mv t6, a2 - j label203 label3: - addiw a3, a2, -1 - li a5, 3 - bgt a3, a5, label4 - lui t3, 352 + auipc t0, %pcrel_hi(x) + li t1, 3 + addi t2, t0, %pcrel_lo(label3) + bgt a0, t1, label4 + mul t1, t4, a4 + mv t0, t4 + add a5, t2, t1 mv t4, zero - addiw t0, t3, -1792 - mul t2, t6, t0 - mv t0, t6 - add a5, t1, t2 - mv t2, zero mv t3, a5 + mv t2, zero mv t1, a5 - j label171 + j label61 .p2align 2 -label1347: +label65: addiw t0, t0, 1 - ble a1, t0, label240 + ble a3, t0, label2 .p2align 2 -label183: - lui t2, 352 - li t4, 2 - li a6, 1 - addiw t1, t2, -1792 - slli t5, a6, 32 +label66: + add a5, a5, a4 mv t2, zero - add a5, a5, t1 - addi t6, t5, 1 + li t4, 1 mv t3, a5 mv t1, a5 - sd t6, 0(a5) - ble a3, t4, label1385 + sw a1, 0(a5) + ble a0, t4, label309 .p2align 2 -label174: - addi t3, t3, 8 -.p2align 2 -label171: - addiw t4, t4, 2 - li a6, 1 - slli t5, a6, 32 - addi t6, t5, 1 - sd t6, 0(t3) - bgt a3, t4, label174 - bgt a2, t4, label176 -.p2align 2 -label1345: - addiw t2, t2, 1 - bgt a2, t2, label184 -.p2align 2 -label1366: - addiw t0, t0, 1 - bgt a1, t0, label183 - j label240 -.p2align 2 -label180: +label68: addi t3, t3, 4 - mv t4, t5 .p2align 2 -label177: - addiw t5, t4, 1 - sw a0, 0(t3) - bgt a2, t5, label180 +label61: + addiw t4, t4, 1 + sw a1, 0(t3) + bgt a0, t4, label68 addiw t2, t2, 1 - ble a2, t2, label1347 + ble a0, t2, label65 .p2align 2 -label184: - add t1, t1, a4 - li t4, 2 - li a6, 1 +label67: + add t1, t1, a2 + li t4, 1 + sw a1, 0(t1) mv t3, t1 - slli t5, a6, 32 - addi t6, t5, 1 - sd t6, 0(t1) - bgt a3, t4, label174 - bgt a2, t4, label176 + bgt a0, t4, label68 addiw t2, t2, 1 - bgt a2, t2, label184 - j label1366 -.p2align 2 -label1385: - ble a2, t4, label1345 -.p2align 2 -label176: - sh2add t3, t4, t1 - j label177 -label239: - lui t3, 2813 - mv t6, a3 - addiw a4, t3, -2048 - add a2, a2, a4 -label236: - addiw a3, t6, 8 - lui t4, 352 - sw a0, 0(a2) - addiw a4, t4, -1792 - slli t4, a4, 1 - add t3, a2, a4 - add t5, a2, t4 - sw a0, 0(t3) - sh1add t3, a4, a4 - sw a0, 0(t5) - add a6, a2, t3 - slli t5, t4, 1 - sw a0, 0(a6) - sh2add t4, a4, a4 - add t6, a2, t5 - slli a4, t3, 1 - add t5, a2, t4 - sw a0, 0(t6) - add t4, a2, a4 - sw a0, 0(t5) - lui t5, 2461 - sw a0, 0(t4) - addiw a4, t5, -256 - add t3, a2, a4 - sw a0, 0(t3) - bgt t2, a3, label239 - mv a2, a3 - mv t6, a3 -label203: - bgt t0, t6, label206 - mv t6, a2 -label194: - ble a5, t6, label697 - lui t0, 352 - addiw a3, t0, -1792 - mul a4, t6, a3 - add a2, t1, a4 -label198: - addiw t6, t6, 2 - lui t0, 352 - sw a0, 0(a2) - addiw a3, t0, -1792 - add a4, a2, a3 - sw a0, 0(a4) - bgt a5, t6, label201 - j label186 -label697: - mv t6, a2 -label186: - ble a1, t6, label240 - lui a5, 352 - addiw a4, a5, -1792 - mul a3, t6, a4 - add a2, t1, a3 -label189: - addiw t6, t6, 1 - sw a0, 0(a2) - ble a1, t6, label240 - lui a4, 352 - addiw a3, a4, -1792 - add a2, a2, a3 - j label189 -label206: - lui t2, 352 - addiw a3, t2, -1792 - mul a4, t6, a3 - add a2, t1, a4 -label207: - addiw a3, t6, 4 - lui t2, 352 - sw a0, 0(a2) - addiw a4, t2, -1792 - slli t2, a4, 1 - add t3, a2, a4 - add t4, a2, t2 - sw a0, 0(t3) - sh1add t3, a4, a4 - sw a0, 0(t4) - add t2, a2, t3 - sw a0, 0(t2) - ble t0, a3, label725 - lui t2, 1406 - mv t6, a3 - addiw a4, t2, 1024 - add a2, a2, a4 - j label207 + bgt a0, t2, label67 + addiw t0, t0, 1 + bgt a3, t0, label66 + j label2 label4: - addiw a5, a2, -4 - li t0, 7 - bgt a5, t0, label5 - ble a5, zero, label240 - lui t4, 352 - addiw t2, t4, -1792 - mul t3, t6, t2 - mv t2, t6 - add t0, t1, t3 - mv 
t1, t0 - mv t3, zero - mv t4, t0 - mv t5, zero - j label143 -.p2align 2 -label146: - addi t4, t4, 16 -.p2align 2 -label143: - addiw t5, t5, 4 - li a7, 1 - slli a6, a7, 32 - addi t6, a6, 1 - sd t6, 0(t4) - sd t6, 8(t4) - bgt a5, t5, label146 -.p2align 2 -label600: - bgt a3, t5, label159 - ble a2, t5, label1363 -.p2align 2 -label150: - sh2add t4, t5, t1 - j label151 -.p2align 2 -label154: - addi t4, t4, 4 - mv t5, t6 -.p2align 2 -label151: - addiw t6, t5, 1 - sw a0, 0(t4) - bgt a2, t6, label154 - addiw t3, t3, 1 - ble a2, t3, label156 -.p2align 2 -label158: - add t1, t1, a4 - li t5, 4 - li a7, 1 - mv t4, t1 - slli a6, a7, 32 - addi t6, a6, 1 - sd t6, 0(t1) - sd t6, 8(t1) - bgt a5, t5, label146 - ble a3, t5, label1383 -.p2align 2 -label159: - sh2add t4, t5, t1 -.p2align 2 -label160: - addiw t6, t5, 2 - li s0, 1 - slli a6, s0, 32 - addi a7, a6, 1 - sd a7, 0(t4) - ble a3, t6, label637 - addi t4, t4, 8 - mv t5, t6 - j label160 -.p2align 2 -label637: - mv t5, t6 - bgt a2, t6, label150 - addiw t3, t3, 1 - bgt a2, t3, label158 - j label156 -.p2align 2 -label1383: - bgt a2, t5, label150 - addiw t3, t3, 1 - bgt a2, t3, label158 -label156: - addiw t2, t2, 1 - ble a1, t2, label240 - lui t3, 352 - li t5, 4 - li a7, 1 - addiw t1, t3, -1792 - slli a6, a7, 32 - mv t3, zero - add t0, t0, t1 - addi t6, a6, 1 - mv t4, t0 - mv t1, t0 - sd t6, 0(t0) - sd t6, 8(t0) - bgt a5, t5, label146 - j label600 -.p2align 2 -label1363: - addiw t3, t3, 1 - bgt a2, t3, label158 - j label156 -label5: - addiw t0, a2, -11 - li t2, 15 - bgt t0, t2, label6 - ble t0, zero, label240 - lui t5, 352 - addiw t3, t5, -1792 - mul t4, t6, t3 - mv t3, t6 - add t2, t1, t4 - mv t1, t2 - mv t4, zero - mv t5, t2 - mv t6, zero - j label107 -.p2align 2 -label564: - addiw t4, t4, 1 - ble a2, t4, label116 -.p2align 2 -label118: - add t1, t1, a4 - li t6, 8 - li s0, 1 - mv t5, t1 - slli a7, s0, 32 - addi a6, a7, 1 - sd a6, 0(t1) - sd a6, 8(t1) - sd a6, 16(t1) - sd a6, 24(t1) - ble t0, t6, label1360 -.p2align 2 -label134: - addi t5, t5, 32 -.p2align 2 -label107: - addiw t6, t6, 8 - li s0, 1 - slli a7, s0, 32 - addi a6, a7, 1 - sd a6, 0(t5) - sd a6, 8(t5) - sd a6, 16(t5) - sd a6, 24(t5) - bgt t0, t6, label134 - ble a5, t6, label111 -.p2align 2 -label129: - sh2add t5, t6, t1 -.p2align 2 -label130: - addiw a6, t6, 4 - li s0, 1 - slli t6, s0, 32 - addi a7, t6, 1 - sd a7, 0(t5) - sd a7, 8(t5) - bgt a5, a6, label133 - mv t6, a6 - ble a3, a6, label1340 -.p2align 2 -label124: - sh2add t5, t6, t1 - j label125 -.p2align 2 -label128: - addi t5, t5, 8 -.p2align 2 -label125: - addiw t6, t6, 2 - li s0, 1 - slli a7, s0, 32 - addi a6, a7, 1 - sd a6, 0(t5) - bgt a3, t6, label128 - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 - j label116 -.p2align 2 -label119: - sh2add t5, t6, t1 -.p2align 2 -label120: - addiw a6, t6, 1 - sw a0, 0(t5) - ble a2, a6, label564 - addi t5, t5, 4 - mv t6, a6 - j label120 -.p2align 2 -label111: - bgt a3, t6, label124 - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 - j label116 -.p2align 2 -label1360: - bgt a5, t6, label129 - bgt a3, t6, label124 - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 - j label116 -.p2align 2 -label1340: - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 -label116: - addiw t3, t3, 1 - ble a1, t3, label240 - lui t4, 352 - mv t6, zero - addiw t1, t4, -1792 - mv t4, zero - add t2, t2, t1 - mv t5, t2 - mv t1, t2 - j label107 -label6: - addiw t2, a2, -26 - addiw t3, a2, -57 - li t4, 31 - ble t2, t4, label56 - lui a7, 352 - addiw a6, a7, -1792 - mul t4, t6, a6 - 
add t5, t1, t4 - mv t4, t5 - mv a6, zero - mv t1, t5 - mv a7, zero - j label14 -.p2align 2 -label17: - addi t1, t1, 128 -.p2align 2 -label14: - addiw a7, a7, 32 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - sd s0, 16(t1) - sd s0, 24(t1) - sd s0, 32(t1) - sd s0, 40(t1) - sd s0, 48(t1) - sd s0, 56(t1) - sd s0, 64(t1) - sd s0, 72(t1) - sd s0, 80(t1) - sd s0, 88(t1) - sd s0, 96(t1) - sd s0, 104(t1) - sd s0, 112(t1) - sd s0, 120(t1) - bgt t3, a7, label17 - ble t2, a7, label1324 - sh2add t1, a7, t4 - j label20 -.p2align 2 -label23: - addi t1, t1, 64 -.p2align 2 -label20: - addiw a7, a7, 16 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - sd s0, 16(t1) - sd s0, 24(t1) - sd s0, 32(t1) - sd s0, 40(t1) - sd s0, 48(t1) - sd s0, 56(t1) - bgt t2, a7, label23 - ble t0, a7, label1326 -.p2align 2 -label51: - sh2add t1, a7, t4 - j label52 -.p2align 2 -label55: - addi t1, t1, 32 -.p2align 2 -label52: - addiw a7, a7, 8 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - sd s0, 16(t1) - sd s0, 24(t1) - bgt t0, a7, label55 - bgt a5, a7, label46 - bgt a3, a7, label41 - ble a2, a7, label1374 -.p2align 2 -label36: - sh2add t1, a7, t4 -.p2align 2 -label37: - addiw a7, a7, 1 - sw a0, 0(t1) - ble a2, a7, label381 - addi t1, t1, 4 - j label37 -.p2align 2 -label1326: - ble a5, a7, label1350 -.p2align 2 -label46: - sh2add t1, a7, t4 -.p2align 2 -label47: - addiw a7, a7, 4 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - bgt a5, a7, label50 - bgt a3, a7, label41 - bgt a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -.p2align 2 -label381: - addiw a6, a6, 1 - ble a2, a6, label33 -.p2align 2 -label35: - add t4, t4, a4 - mv a7, zero - mv t1, t4 - j label14 -.p2align 2 -label1324: - bgt t0, a7, label51 - bgt a5, a7, label46 - ble a3, a7, label1387 -.p2align 2 -label41: - sh2add t1, a7, t4 - j label42 -.p2align 2 -label45: - addi t1, t1, 8 -.p2align 2 -label42: - addiw a7, a7, 2 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - bgt a3, a7, label45 - bgt a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -.p2align 2 -label1350: - bgt a3, a7, label41 - bgt a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -.p2align 2 -label1387: - bgt a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 -label33: - addiw t6, t6, 1 - ble a1, t6, label240 - lui t4, 352 - mv a6, zero - mv a7, zero - addiw t1, t4, -1792 - add t5, t5, t1 - mv t1, t5 - mv t4, t5 - j label14 -.p2align 2 -label1374: - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -label56: - ble t2, zero, label240 - lui a6, 352 - addiw t4, a6, -1792 - mul t5, t6, t4 - mv t4, t6 - add t3, t1, t5 - mv t1, t3 + addiw t0, a0, -3 + addiw t1, a0, -18 + li t3, 15 + ble t0, t3, label88 + mul t5, t4, a4 + add t3, t2, t5 + mv t2, t3 mv t5, zero mv t6, t3 mv a6, zero - j label64 + j label12 .p2align 2 -label98: +label15: addi t6, t6, 64 .p2align 2 -label64: +label12: addiw a6, a6, 16 - li s1, 1 - slli s0, s1, 32 - addi a7, s0, 1 + ori a7, a5, 1 sd a7, 0(t6) sd a7, 8(t6) sd a7, 16(t6) @@ -1097,153 +250,147 @@ label64: sd a7, 40(t6) sd a7, 48(t6) sd a7, 56(t6) - bgt t2, a6, label98 - ble t0, a6, label447 - sh2add t6, a6, t1 - j label69 + bgt t1, a6, label15 + ble t0, a6, label294 + sh2add t6, a6, t2 + mv a7, a6 + j label18 .p2align 2 -label72: - addi t6, t6, 32 - mv a6, a7 +label21: + addi t6, t6, 16 .p2align 2 -label69: - addiw a7, a6, 8 - li s1, 1 - slli s0, s1, 32 - addi a6, s0, 1 +label18: + addiw a7, a7, 4 
+ ori a6, a5, 1 sd a6, 0(t6) sd a6, 8(t6) - sd a6, 16(t6) - sd a6, 24(t6) - bgt t0, a7, label72 + bgt t0, a7, label21 mv a6, a7 - bgt a5, a7, label75 - bgt a3, a7, label82 - bgt a2, a7, label93 - addiw t5, t5, 1 - bgt a2, t5, label92 - j label90 -.p2align 2 -label1334: - ble a2, a6, label1357 + ble a0, a7, label296 .p2align 2 -label93: - sh2add t6, a6, t1 +label28: + sh2add t6, a6, t2 .p2align 2 -label94: - addiw a7, a6, 1 - sw a0, 0(t6) - ble a2, a7, label513 +label29: + addiw a6, a6, 1 + sw a1, 0(t6) + ble a0, a6, label149 addi t6, t6, 4 - mv a6, a7 - j label94 -.p2align 2 -label75: - sh2add t6, a6, t1 -.p2align 2 -label76: - addiw a6, a6, 4 - li s1, 1 - slli s0, s1, 32 - addi a7, s0, 1 - sd a7, 0(t6) - sd a7, 8(t6) - ble a5, a6, label476 - addi t6, t6, 16 - j label76 + j label29 .p2align 2 -label513: +label149: addiw t5, t5, 1 - ble a2, t5, label90 + ble a0, t5, label297 .p2align 2 -label92: - add t1, t1, a4 +label27: + add t2, t2, a2 mv a6, zero - mv t6, t1 - j label64 -.p2align 2 -label476: - ble a3, a6, label1334 -.p2align 2 -label82: - sh2add t6, a6, t1 - j label83 -.p2align 2 -label86: - addi t6, t6, 8 + mv t6, t2 + j label12 .p2align 2 -label83: - addiw a6, a6, 2 - li s1, 1 - slli s0, s1, 32 - addi a7, s0, 1 - sd a7, 0(t6) - bgt a3, a6, label86 - bgt a2, a6, label93 +label294: + bgt a0, a6, label28 addiw t5, t5, 1 - bgt a2, t5, label92 - j label90 + bgt a0, t5, label27 + j label25 .p2align 2 -label447: - bgt a5, a6, label75 - bgt a3, a6, label82 - bgt a2, a6, label93 - addiw t5, t5, 1 - bgt a2, t5, label92 -label90: +label297: addiw t4, t4, 1 - ble a1, t4, label240 - lui t5, 352 - mv a6, zero - addiw t1, t5, -1792 + ble a3, t4, label2 +.p2align 2 +label26: + add t3, t3, a4 mv t5, zero - add t3, t3, t1 + mv a6, zero mv t6, t3 - mv t1, t3 - j label64 + mv t2, t3 + j label12 .p2align 2 -label1357: +label296: addiw t5, t5, 1 - bgt a2, t5, label92 - j label90 + bgt a0, t5, label27 +label25: + addiw t4, t4, 1 + bgt a3, t4, label26 + j label2 +.p2align 2 +label309: + addiw t2, t2, 1 + bgt a0, t2, label67 + j label65 +label88: + mul t3, t4, a4 + mv t6, zero + add t1, t2, t3 + mv t5, t1 + mv t3, t1 + mv t2, t4 + mv t4, zero + j label40 +.p2align 2 +label52: + addi t5, t5, 4 +.p2align 2 +label49: + addiw a6, a6, 1 + sw a1, 0(t5) + bgt a0, a6, label52 + addiw t4, t4, 1 + ble a0, t4, label298 .p2align 2 -label133: +label47: + add t3, t3, a2 + li t6, 4 + ori a6, a5, 1 + mv t5, t3 + sd a6, 0(t3) + sd a6, 8(t3) + ble t0, t6, label303 +.p2align 2 +label53: addi t5, t5, 16 - mv t6, a6 - j label130 -label235: - lui t3, 352 - addiw a4, t3, -1792 - mul a3, t6, a4 - add a2, t1, a3 - j label236 -label693: - mv a2, zero - j label194 -label201: - lui a4, 703 - addiw a3, a4, 512 - add a2, a2, a3 - j label198 -label736: - mv a3, zero - j label220 .p2align 2 -label50: - addi t1, t1, 16 - j label47 -label711: - mv a2, zero - j label203 -label731: - mv a2, zero - j label230 -label227: - lui a4, 5625 - mv t6, a3 - add a2, a2, a4 - j label224 -label725: - mv a2, a3 - mv t6, a3 - j label194 +label40: + addiw t6, t6, 4 + ori a6, a5, 1 + sd a6, 0(t5) + sd a6, 8(t5) + bgt t0, t6, label53 +.p2align 2 +label43: + ble a0, t6, label44 +.p2align 2 +label48: + sh2add t5, t6, t3 + mv a6, t6 + j label49 +.p2align 2 +label44: + addiw t4, t4, 1 + bgt a0, t4, label47 +label45: + addiw t2, t2, 1 + bgt a3, t2, label46 + j label2 +.p2align 2 +label303: + bgt a0, t6, label48 + addiw t4, t4, 1 + bgt a0, t4, label47 + j label45 +.p2align 2 +label298: + addiw t2, t2, 1 + ble a3, t2, label2 +.p2align 2 +label46: + add t1, t1, a4 + mv 
t4, zero + li t6, 4 + ori a6, a5, 1 + mv t5, t1 + mv t3, t1 + sd a6, 0(t1) + sd a6, 8(t1) + bgt t0, t6, label53 + j label43 diff --git a/tests/SysY2022/performance/sl2.sy.ir b/tests/SysY2022/performance/sl2.sy.ir index 61160ca35..b3d55bac9 100644 --- a/tests/SysY2022/performance/sl2.sy.ir +++ b/tests/SysY2022/performance/sl2.sy.ir @@ -8,155 +8,84 @@ func @main() -> i32 { NoRecurse Entry } { i32 %0 = call () -> i32 @getint(); i32 %1 = call () -> i32 @getint(); call (i32) -> void @starttime(i32 13); - i32 %2 = add i32 %0, i32 -1; - i1 %3 = icmp sgt i32 %2, i32 2; - i1 %4 = icmp sgt i32 %0, i32 0; - i32 %5 = add i32 %0, i32 -2; - [600 * [600 * [600 * i32]]]* %6 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; - cbr i1 %4(prob = 0.984615), ^b, ^b1; + i1 %2 = icmp sgt i32 %0, i32 0; + i32 %3 = add i32 %0, i32 -1; + [600 * [600 * [600 * i32]]]* %4 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; + cbr i1 %2(prob = 0.984615), ^b, ^b1; ^b: - [4 * i8]* %7 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_0 to [4 * i8]*; - i32* %8 = ptradd [4 * i8]* %7, i32 0; - store i32* %8 with i32 %0; - i8* %9 = functionptr () -> void @cmmc_parallel_body_0 as i8*; - call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %0, i8* %9); + [4 * i8]* %5 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_0 to [4 * i8]*; + i32* %6 = ptradd [4 * i8]* %5, i32 0; + store i32* %6 with i32 %0; + i8* %7 = functionptr () -> void @cmmc_parallel_body_0 as i8*; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %0, i8* %7); ubr ^b1; ^b1: - i1 %10 = icmp sgt i32 %2, i32 1; - [600 * [600 * i32]]* %11 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i64 0]; - cbr i1 %10(prob = 0.984615), ^while.body, ^b2; + i1 %8 = icmp sgt i32 %3, i32 1; + [600 * [600 * i32]]* %9 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i64 0]; + cbr i1 %8(prob = 0.984615), ^while.body, ^b2; ^while.body: - [600 * [600 * i32]]* %12 = phi [^b1, [600 * [600 * i32]]* %11] [^b5, [600 * [600 * i32]]* %14]; - i32 %13 = phi [^b1, i32 1] [^b5, i32 %16]; - [600 * [600 * i32]]* %14 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i32 %13]; - [600 * [600 * i32]]* %15 = getelementptr &([600 * [600 * i32]]* %14)[i64 1]; - i32 %16 = add i32 %13, i32 1; - cbr i1 %3(prob = 0.5), ^b3, ^b4; + [600 * [600 * i32]]* %10 = phi [^b1, [600 * [600 * i32]]* %9] [^b5, [600 * [600 * i32]]* %12]; + i32 %11 = phi [^b1, i32 1] [^b5, i32 %14]; + [600 * [600 * i32]]* %12 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i32 %11]; + [600 * [600 * i32]]* %13 = getelementptr &([600 * [600 * i32]]* %12)[i64 1]; + i32 %14 = add i32 %11, i32 1; + ubr ^b3; ^b2: - [600 * [600 * i32]]* %17 = phi [^b1, [600 * [600 * i32]]* %11] [^b5, [600 * [600 * i32]]* %14]; - i32 %18 = phi [^b1, i32 1] [^b5, i32 %91]; - i32* %19 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i64 0][i64 0][i64 0]; + [600 * [600 * i32]]* %15 = phi [^b1, [600 * [600 * i32]]* %9] [^b5, [600 * [600 * i32]]* %12]; + i32 %16 = phi [^b1, i32 1] [^b5, i32 %31]; + i32* %17 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i64 0][i64 0][i64 0]; call (i32) -> void @stoptime(i32 53); - call (i32, i32*) -> void @putarray(i32 %0, i32* %19); - i32 %20 = sdiv i32 %0, i32 2; - [600 * [600 * i32]]* %21 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i32 %20]; - [600 * i32]* %22 = getelementptr &([600 * [600 * i32]]* %21)[i64 0][i32 %20]; - i32* %23 = getelementptr &([600 * i32]* %22)[i64 0][i64 0]; - call (i32, i32*) -> void 
@putarray(i32 %0, i32* %23); - i32 %24 = add i32 %18, i32 -1; - [600 * i32]* %25 = getelementptr &([600 * [600 * i32]]* %17)[i64 0][i32 %24]; - i32* %26 = getelementptr &([600 * i32]* %25)[i64 0][i64 0]; - call (i32, i32*) -> void @putarray(i32 %0, i32* %26); + call (i32, i32*) -> void @putarray(i32 %0, i32* %17); + i32 %18 = sdiv i32 %0, i32 2; + [600 * [600 * i32]]* %19 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i32 %18]; + [600 * i32]* %20 = getelementptr &([600 * [600 * i32]]* %19)[i64 0][i32 %18]; + i32* %21 = getelementptr &([600 * i32]* %20)[i64 0][i64 0]; + call (i32, i32*) -> void @putarray(i32 %0, i32* %21); + i32 %22 = add i32 %16, i32 -1; + [600 * i32]* %23 = getelementptr &([600 * [600 * i32]]* %15)[i64 0][i32 %22]; + i32* %24 = getelementptr &([600 * i32]* %23)[i64 0][i64 0]; + call (i32, i32*) -> void @putarray(i32 %0, i32* %24); ret i32 0; ^b3: - i32 %27 = phi [^while.body, i32 1] [^b6, i32 %116]; - [600 * i32]* %28 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i32 %27]; - [600 * i32]* %29 = getelementptr &([600 * i32]* %28)[i64 -1]; - [600 * i32]* %30 = getelementptr &([600 * i32]* %28)[i64 1]; - [600 * i32]* %31 = getelementptr &([600 * [600 * i32]]* %12)[i64 0][i32 %27]; - [600 * i32]* %32 = getelementptr &([600 * [600 * i32]]* %15)[i64 0][i32 %27]; + i32 %25 = phi [^while.body, i32 1] [^b4, i32 %31]; + [600 * i32]* %26 = getelementptr &([600 * [600 * i32]]* %12)[i64 0][i32 %25]; + [600 * i32]* %27 = getelementptr &([600 * i32]* %26)[i64 -1]; + [600 * i32]* %28 = getelementptr &([600 * i32]* %26)[i64 1]; + [600 * i32]* %29 = getelementptr &([600 * [600 * i32]]* %10)[i64 0][i32 %25]; + [600 * i32]* %30 = getelementptr &([600 * [600 * i32]]* %13)[i64 0][i32 %25]; + i32 %31 = add i32 %25, i32 1; ubr ^while.body1; - ^b4: - i32* %33 = getelementptr &([600 * [600 * i32]]* %12)[i64 0][i64 1][i64 1]; + ^while.body1: + i32 %32 = phi [^b3, i32 1] [^while.body1, i32 %52]; + i32* %33 = getelementptr &([600 * i32]* %29)[i64 0][i32 %32]; i32 %34 = load i32* %33; - i32* %35 = getelementptr &([600 * [600 * i32]]* %14)[i64 1][i64 1][i64 1]; + i32* %35 = getelementptr &([600 * i32]* %30)[i64 0][i32 %32]; i32 %36 = load i32* %35; i32 %37 = add i32 %34, i32 %36; - i32* %38 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 0][i64 1]; + i32* %38 = getelementptr &([600 * i32]* %27)[i64 0][i32 %32]; i32 %39 = load i32* %38; i32 %40 = add i32 %37, i32 %39; - i32* %41 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 2][i64 1]; + i32* %41 = getelementptr &([600 * i32]* %28)[i64 0][i32 %32]; i32 %42 = load i32* %41; i32 %43 = add i32 %40, i32 %42; - i32* %44 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 1][i64 0]; - i32 %45 = load i32* %44; - i32 %46 = add i32 %43, i32 %45; - i32* %47 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 1][i64 2]; - i32 %48 = load i32* %47; - i32 %49 = add i32 %46, i32 %48; - i32 %50 = sdiv i32 %49, i32 %1; - i32* %51 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 1][i64 1]; - store i32* %51 with i32 %50; - ubr ^b5; - ^while.body1: - i32 %52 = phi [^b3, i32 1] [^while.body1, i32 %89]; - i32* %53 = getelementptr &([600 * i32]* %31)[i64 0][i32 %52]; - i32 %54 = load i32* %53; - i32* %55 = getelementptr &([600 * i32]* %32)[i64 0][i32 %52]; - i32 %56 = load i32* %55; - i32 %57 = add i32 %54, i32 %56; - i32* %58 = getelementptr &([600 * i32]* %29)[i64 0][i32 %52]; - i32 %59 = load i32* %58; - i32 %60 = add i32 %57, i32 %59; - i32* %61 = getelementptr &([600 * i32]* %30)[i64 0][i32 %52]; - i32 %62 = load i32* %61; - i32 
%63 = add i32 %60, i32 %62; - i32* %64 = getelementptr &([600 * i32]* %28)[i64 0][i32 %52]; - i32* %65 = getelementptr &(i32* %64)[i64 -1]; - i32 %66 = load i32* %65; - i32 %67 = add i32 %63, i32 %66; - i32* %68 = getelementptr &(i32* %64)[i64 1]; - i32 %69 = load i32* %68; - i32 %70 = add i32 %67, i32 %69; - i32 %71 = sdiv i32 %70, i32 %1; - store i32* %64 with i32 %71; - i32* %72 = getelementptr &(i32* %53)[i64 1]; - i32 %73 = load i32* %72; - i32* %74 = getelementptr &(i32* %55)[i64 1]; - i32 %75 = load i32* %74; - i32 %76 = add i32 %73, i32 %75; - i32* %77 = getelementptr &(i32* %58)[i64 1]; - i32 %78 = load i32* %77; - i32 %79 = add i32 %76, i32 %78; - i32* %80 = getelementptr &(i32* %61)[i64 1]; - i32 %81 = load i32* %80; - i32 %82 = add i32 %79, i32 %81; - i32 %83 = load i32* %64; - i32 %84 = add i32 %82, i32 %83; - i32* %85 = getelementptr &(i32* %64)[i64 2]; - i32 %86 = load i32* %85; - i32 %87 = add i32 %84, i32 %86; - i32 %88 = sdiv i32 %87, i32 %1; - store i32* %68 with i32 %88; - i32 %89 = add i32 %52, i32 2; - i1 %90 = icmp sgt i32 %5, i32 %89; - cbr i1 %90(prob = 0.969697), ^while.body1, ^scalar.header; + i32* %44 = getelementptr &([600 * i32]* %26)[i64 0][i32 %32]; + i32* %45 = getelementptr &(i32* %44)[i64 -1]; + i32 %46 = load i32* %45; + i32 %47 = add i32 %43, i32 %46; + i32* %48 = getelementptr &(i32* %44)[i64 1]; + i32 %49 = load i32* %48; + i32 %50 = add i32 %47, i32 %49; + i32 %51 = sdiv i32 %50, i32 %1; + store i32* %44 with i32 %51; + i32 %52 = add i32 %32, i32 1; + i1 %53 = icmp sgt i32 %3, i32 %52; + cbr i1 %53(prob = 0.984615), ^while.body1, ^b4; + ^b4: + i1 %54 = icmp sgt i32 %3, i32 %31; + cbr i1 %54(prob = 0.984615), ^b3, ^b5; ^b5: - i32 %91 = phi [^b4, i32 2] [^b6, i32 %116]; - i1 %92 = icmp sgt i32 %2, i32 %16; - cbr i1 %92(prob = 0.984615), ^while.body, ^b2; - ^scalar.header: - i1 %93 = icmp sgt i32 %2, i32 %89; - cbr i1 %93(prob = 0.5), ^while.body2, ^b6; - ^while.body2 {scalar}: - i32 %94 = phi [^scalar.header, i32 %89] [^while.body2, i32 %114]; - i32* %95 = getelementptr &([600 * i32]* %31)[i64 0][i32 %94]; - i32 %96 = load i32* %95; - i32* %97 = getelementptr &([600 * i32]* %32)[i64 0][i32 %94]; - i32 %98 = load i32* %97; - i32 %99 = add i32 %96, i32 %98; - i32* %100 = getelementptr &([600 * i32]* %29)[i64 0][i32 %94]; - i32 %101 = load i32* %100; - i32 %102 = add i32 %99, i32 %101; - i32* %103 = getelementptr &([600 * i32]* %30)[i64 0][i32 %94]; - i32 %104 = load i32* %103; - i32 %105 = add i32 %102, i32 %104; - i32* %106 = getelementptr &([600 * i32]* %28)[i64 0][i32 %94]; - i32* %107 = getelementptr &(i32* %106)[i64 -1]; - i32 %108 = load i32* %107; - i32 %109 = add i32 %105, i32 %108; - i32* %110 = getelementptr &(i32* %106)[i64 1]; - i32 %111 = load i32* %110; - i32 %112 = add i32 %109, i32 %111; - i32 %113 = sdiv i32 %112, i32 %1; - store i32* %106 with i32 %113; - i32 %114 = add i32 %94, i32 1; - i1 %115 = icmp sgt i32 %2, i32 %114; - cbr i1 %115(prob = 0.5), ^while.body2, ^b6; - ^b6: - i32 %116 = add i32 %27, i32 1; - i1 %117 = icmp sgt i32 %2, i32 %116; - cbr i1 %117(prob = 0.984615), ^b3, ^b5; + i1 %55 = icmp sgt i32 %3, i32 %14; + cbr i1 %55(prob = 0.984615), ^while.body, ^b2; } internal func @cmmcParallelFor(i32, i32, i8*) -> void { NoRecurse }; internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -167,729 +96,157 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %5 = icmp sgt i32 %4, i32 0; cbr i1 %5(prob = 0.5), ^cond, ^b1; ^cond: - i1 %6 = 
icmp sgt i32 %4, i32 1; - i32 %7 = add i32 %1, i32 -57; - i32 %8 = add i32 %1, i32 -26; - i32 %9 = add i32 %1, i32 -11; - i32 %10 = add i32 %1, i32 -4; - i32 %11 = add i32 %1, i32 -1; - [600 * [600 * [600 * i32]]]* %12 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; - cbr i1 %6(prob = 0.5), ^cond1, ^super.header; + i1 %6 = icmp sgt i32 %4, i32 3; + [600 * [600 * [600 * i32]]]* %7 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; + cbr i1 %6(prob = 0.5), ^cond1, ^b2; ^b1: ret; - ^cond1: - i32 %13 = add i32 %4, i32 -1; - i1 %14 = icmp sgt i32 %13, i32 3; - cbr i1 %14(prob = 0.5), ^cond2, ^b2; - ^super.header: - i32 %15 = add i32 %0, i32 1; - i1 %16 = icmp sgt i32 %1, i32 %15; - cbr i1 %16(prob = 0.969697), ^super.header1, ^scalar.header; ^b2: - i32 %17 = phi [^cond1, i32 %0] [^b13, i32 %225]; - [600 * [600 * i32]]* %18 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %17]; + i32 %8 = phi [^cond, i32 %0] [^b5, i32 %57]; + [600 * [600 * i32]]* %9 = getelementptr &([600 * [600 * [600 * i32]]]* %7)[i64 0][i32 %8]; ubr ^while.body; - ^cond2: - i32 %19 = add i32 %4, i32 -4; - i1 %20 = icmp sgt i32 %19, i32 7; - cbr i1 %20(prob = 0.5), ^cond3, ^cond4; - ^super.header1: - i32 %21 = add i32 %0, i32 3; - i1 %22 = icmp sgt i32 %11, i32 %21; - cbr i1 %22(prob = 0.969697), ^super.header2, ^scalar.header1; - ^scalar.header: - i32 %23 = phi [^super.header, i32 %0] [^scalar.header1, i32 %33] [^b5, i32 %61]; - i1 %24 = icmp sgt i32 %1, i32 %23; - cbr i1 %24(prob = 0.5), ^b3, ^b1; - ^while.body: - i32 %25 = phi [^b2, i32 0] [^scalar.final3, i32 %153]; - [600 * i32]* %26 = getelementptr &([600 * [600 * i32]]* %18)[i64 0][i32 %25]; + ^cond1: + i32 %10 = add i32 %4, i32 -3; + i1 %11 = icmp sgt i32 %10, i32 15; + i32 %12 = add i32 %4, i32 -18; + cbr i1 %11(prob = 0.5), ^b3, ^b4; + ^b3: + i32 %13 = phi [^cond1, i32 %0] [^b7, i32 %82]; + [600 * [600 * i32]]* %14 = getelementptr &([600 * [600 * [600 * i32]]]* %7)[i64 0][i32 %13]; ubr ^while.body1; - ^cond3: - i32 %27 = add i32 %4, i32 -11; - i1 %28 = icmp sgt i32 %27, i32 15; - cbr i1 %28(prob = 0.5), ^cond5, ^cond6; - ^super.header2: - i32 %29 = add i32 %0, i32 7; - i1 %30 = icmp sgt i32 %10, i32 %29; - cbr i1 %30(prob = 0.969697), ^super.header3, ^scalar.header2; - ^cond4: - i1 %31 = icmp sgt i32 %19, i32 0; - cbr i1 %31(prob = 0.5), ^b4, ^b1; - ^scalar.header1: - i32 %32 = phi [^super.header1, i32 %0] [^scalar.final, i32 %71]; - i32 %33 = phi [^super.header1, i32 undef] [^scalar.final, i32 %71]; - i1 %34 = icmp sgt i32 %11, i32 %32; - cbr i1 %34(prob = 0.5), ^b5, ^scalar.header; - ^b3 {scalar}: - i32 %35 = phi [^scalar.header, i32 %23] [^b3, i32 %38]; - [600 * [600 * i32]]* %36 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %35]; - i32* %37 = getelementptr &([600 * [600 * i32]]* %36)[i64 0][i64 0][i64 0]; - store i32* %37 with i32 1; - i32 %38 = add i32 %35, i32 1; - i1 %39 = icmp sgt i32 %1, i32 %38; - cbr i1 %39(prob = 0.5), ^b3, ^b1; + ^while.body: + i32 %15 = phi [^b2, i32 0] [^scalar.final, i32 %53]; + [600 * i32]* %16 = getelementptr &([600 * [600 * i32]]* %9)[i64 0][i32 %15]; + ubr ^while.body3; ^b4: - i32 %40 = phi [^cond4, i32 %0] [^b14, i32 %325]; - [600 * [600 * i32]]* %41 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %40]; + i32 %17 = phi [^cond1, i32 %0] [^b6, i32 %80]; + [600 * [600 * i32]]* %18 = getelementptr &([600 * [600 * [600 * i32]]]* %7)[i64 0][i32 %17]; ubr ^while.body2; - ^cond5: - i32 %42 = add i32 %4, i32 -26; - i1 %43 = icmp sgt i32 %42, 
i32 0; - i1 %44 = icmp sgt i32 %42, i32 31; - i32 %45 = add i32 %4, i32 -57; - cbr i1 %44(prob = 0.5), ^b6, ^cond7; - ^super.header3: - i32 %46 = add i32 %0, i32 15; - i1 %47 = icmp sgt i32 %9, i32 %46; - cbr i1 %47(prob = 0.969697), ^super.header4, ^scalar.header3; - ^while.body1 {scalar}: - i32 %48 = phi [^while.body, i32 0] [^while.body1, i32 %51]; - i32* %49 = getelementptr &([600 * i32]* %26)[i64 0][i32 %48]; - store i32* %49 with i32 1; - i32* %50 = getelementptr &(i32* %49)[i64 1]; - store i32* %50 with i32 1; - i32 %51 = add i32 %48, i32 2; - i1 %52 = icmp sgt i32 %13, i32 %51; - cbr i1 %52(prob = 0.5), ^while.body1, ^scalar.final1; - ^cond6: - i1 %53 = icmp sgt i32 %27, i32 0; - cbr i1 %53(prob = 0.5), ^b7, ^b1; - ^scalar.header2: - i32 %54 = phi [^super.header2, i32 %0] [^scalar.final2, i32 %126]; - i32 %55 = phi [^super.header2, i32 undef] [^scalar.final2, i32 %126]; - i1 %56 = icmp sgt i32 %10, i32 %54; - cbr i1 %56(prob = 0.5), ^b8, ^scalar.final; - ^b5 {scalar}: - i32 %57 = phi [^scalar.header1, i32 %32] [^b5, i32 %61]; - [600 * [600 * i32]]* %58 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %57]; - i32* %59 = getelementptr &([600 * [600 * i32]]* %58)[i64 0][i64 0][i64 0]; - store i32* %59 with i32 1; - i32* %60 = getelementptr &([600 * [600 * i32]]* %58)[i64 1][i64 0][i64 0]; - store i32* %60 with i32 1; - i32 %61 = add i32 %57, i32 2; - i1 %62 = icmp sgt i32 %11, i32 %61; - cbr i1 %62(prob = 0.5), ^b5, ^scalar.header; - ^b6: - i32 %63 = phi [^cond5, i32 %0] [^b16, i32 %379]; - [600 * [600 * i32]]* %64 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %63]; - ubr ^while.body3; + ^while.body1: + i32 %19 = phi [^b3, i32 0] [^scalar.final4, i32 %78]; + [600 * i32]* %20 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i32 %19]; + ubr ^while.body4; ^while.body2: - i32 %65 = phi [^b4, i32 0] [^scalar.final11, i32 %290]; - [600 * i32]* %66 = getelementptr &([600 * [600 * i32]]* %41)[i64 0][i32 %65]; + i32 %21 = phi [^b4, i32 0] [^scalar.final3, i32 %72]; + [600 * i32]* %22 = getelementptr &([600 * [600 * i32]]* %18)[i64 0][i32 %21]; ubr ^while.body5; - ^b7: - i32 %67 = phi [^cond6, i32 %0] [^b15, i32 %365]; - [600 * [600 * i32]]* %68 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %67]; - ubr ^while.body4; - ^super.header4: - i32 %69 = add i32 %0, i32 31; - i1 %70 = icmp sgt i32 %8, i32 %69; - cbr i1 %70(prob = 0.969697), ^b10, ^scalar.header4; - ^scalar.final: - i32 %71 = phi [^scalar.header2, i32 %55] [^b8, i32 %81]; - ubr ^scalar.header1; - ^cond7: - cbr i1 %43(prob = 0.5), ^b9, ^b1; - ^scalar.header3: - i32 %72 = phi [^super.header3, i32 %0] [^scalar.final4, i32 %192]; - i32 %73 = phi [^super.header3, i32 undef] [^scalar.final4, i32 %192]; - i1 %74 = icmp sgt i32 %9, i32 %72; - cbr i1 %74(prob = 0.5), ^b11, ^scalar.final2; - ^b8 {scalar}: - i32 %75 = phi [^scalar.header2, i32 %54] [^b8, i32 %81]; - [600 * [600 * i32]]* %76 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %75]; - i32* %77 = getelementptr &([600 * [600 * i32]]* %76)[i64 0][i64 0][i64 0]; - store i32* %77 with i32 1; - i32* %78 = getelementptr &([600 * [600 * i32]]* %76)[i64 1][i64 0][i64 0]; - store i32* %78 with i32 1; - i32* %79 = getelementptr &([600 * [600 * i32]]* %76)[i64 2][i64 0][i64 0]; - store i32* %79 with i32 1; - i32* %80 = getelementptr &([600 * [600 * i32]]* %76)[i64 3][i64 0][i64 0]; - store i32* %80 with i32 1; - i32 %81 = add i32 %75, i32 4; - i1 %82 = icmp sgt i32 %10, i32 %81; - cbr i1 %82(prob = 0.5), ^b8, ^scalar.final; - 
^scalar.final1: - i1 %83 = icmp sgt i32 %4, i32 %51; - cbr i1 %83(prob = 0.5), ^while.body6, ^scalar.final3; - ^while.body3: - i32 %84 = phi [^b6, i32 0] [^scalar.final20, i32 %375]; - [600 * i32]* %85 = getelementptr &([600 * [600 * i32]]* %64)[i64 0][i32 %84]; - ubr ^while.body8; + ^while.body3 {scalar}: + i32 %23 = phi [^while.body, i32 0] [^while.body3, i32 %25]; + i32* %24 = getelementptr &([600 * i32]* %16)[i64 0][i32 %23]; + store i32* %24 with i32 1; + i32 %25 = add i32 %23, i32 1; + i1 %26 = icmp sgt i32 %4, i32 %25; + cbr i1 %26(prob = 0.75), ^while.body3, ^scalar.final; ^while.body4: - i32 %86 = phi [^b7, i32 0] [^scalar.final17, i32 %349]; - [600 * i32]* %87 = getelementptr &([600 * [600 * i32]]* %68)[i64 0][i32 %86]; - ubr ^while.body9; - ^b9: - i32 %88 = phi [^cond7, i32 %0] [^b17, i32 %381]; - [600 * [600 * i32]]* %89 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %88]; - ubr ^while.body7; - ^b10: - i32 %90 = phi [^super.header4, i32 %0] [^b10, i32 %124]; - [600 * [600 * i32]]* %91 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %90]; - i32* %92 = getelementptr &([600 * [600 * i32]]* %91)[i64 0][i64 0][i64 0]; - store i32* %92 with i32 1; - i32* %93 = getelementptr &([600 * [600 * i32]]* %91)[i64 1][i64 0][i64 0]; - store i32* %93 with i32 1; - i32* %94 = getelementptr &([600 * [600 * i32]]* %91)[i64 2][i64 0][i64 0]; - store i32* %94 with i32 1; - i32* %95 = getelementptr &([600 * [600 * i32]]* %91)[i64 3][i64 0][i64 0]; - store i32* %95 with i32 1; - i32* %96 = getelementptr &([600 * [600 * i32]]* %91)[i64 4][i64 0][i64 0]; - store i32* %96 with i32 1; - i32* %97 = getelementptr &([600 * [600 * i32]]* %91)[i64 5][i64 0][i64 0]; - store i32* %97 with i32 1; - i32* %98 = getelementptr &([600 * [600 * i32]]* %91)[i64 6][i64 0][i64 0]; - store i32* %98 with i32 1; - i32* %99 = getelementptr &([600 * [600 * i32]]* %91)[i64 7][i64 0][i64 0]; - store i32* %99 with i32 1; - i32* %100 = getelementptr &([600 * [600 * i32]]* %91)[i64 8][i64 0][i64 0]; - store i32* %100 with i32 1; - i32* %101 = getelementptr &([600 * [600 * i32]]* %91)[i64 9][i64 0][i64 0]; - store i32* %101 with i32 1; - i32* %102 = getelementptr &([600 * [600 * i32]]* %91)[i64 10][i64 0][i64 0]; - store i32* %102 with i32 1; - i32* %103 = getelementptr &([600 * [600 * i32]]* %91)[i64 11][i64 0][i64 0]; - store i32* %103 with i32 1; - i32* %104 = getelementptr &([600 * [600 * i32]]* %91)[i64 12][i64 0][i64 0]; - store i32* %104 with i32 1; - i32* %105 = getelementptr &([600 * [600 * i32]]* %91)[i64 13][i64 0][i64 0]; - store i32* %105 with i32 1; - i32* %106 = getelementptr &([600 * [600 * i32]]* %91)[i64 14][i64 0][i64 0]; - store i32* %106 with i32 1; - i32* %107 = getelementptr &([600 * [600 * i32]]* %91)[i64 15][i64 0][i64 0]; - store i32* %107 with i32 1; - i32* %108 = getelementptr &([600 * [600 * i32]]* %91)[i64 16][i64 0][i64 0]; - store i32* %108 with i32 1; - i32* %109 = getelementptr &([600 * [600 * i32]]* %91)[i64 17][i64 0][i64 0]; - store i32* %109 with i32 1; - i32* %110 = getelementptr &([600 * [600 * i32]]* %91)[i64 18][i64 0][i64 0]; - store i32* %110 with i32 1; - i32* %111 = getelementptr &([600 * [600 * i32]]* %91)[i64 19][i64 0][i64 0]; - store i32* %111 with i32 1; - i32* %112 = getelementptr &([600 * [600 * i32]]* %91)[i64 20][i64 0][i64 0]; - store i32* %112 with i32 1; - i32* %113 = getelementptr &([600 * [600 * i32]]* %91)[i64 21][i64 0][i64 0]; - store i32* %113 with i32 1; - i32* %114 = getelementptr &([600 * [600 * i32]]* %91)[i64 22][i64 0][i64 0]; - 
store i32* %114 with i32 1; - i32* %115 = getelementptr &([600 * [600 * i32]]* %91)[i64 23][i64 0][i64 0]; - store i32* %115 with i32 1; - i32* %116 = getelementptr &([600 * [600 * i32]]* %91)[i64 24][i64 0][i64 0]; - store i32* %116 with i32 1; - i32* %117 = getelementptr &([600 * [600 * i32]]* %91)[i64 25][i64 0][i64 0]; - store i32* %117 with i32 1; - i32* %118 = getelementptr &([600 * [600 * i32]]* %91)[i64 26][i64 0][i64 0]; - store i32* %118 with i32 1; - i32* %119 = getelementptr &([600 * [600 * i32]]* %91)[i64 27][i64 0][i64 0]; - store i32* %119 with i32 1; - i32* %120 = getelementptr &([600 * [600 * i32]]* %91)[i64 28][i64 0][i64 0]; - store i32* %120 with i32 1; - i32* %121 = getelementptr &([600 * [600 * i32]]* %91)[i64 29][i64 0][i64 0]; - store i32* %121 with i32 1; - i32* %122 = getelementptr &([600 * [600 * i32]]* %91)[i64 30][i64 0][i64 0]; - store i32* %122 with i32 1; - i32* %123 = getelementptr &([600 * [600 * i32]]* %91)[i64 31][i64 0][i64 0]; - store i32* %123 with i32 1; - i32 %124 = add i32 %90, i32 32; - i1 %125 = icmp sgt i32 %7, i32 %124; - cbr i1 %125(prob = 0.969697), ^b10, ^scalar.header4; - ^scalar.final2: - i32 %126 = phi [^scalar.header3, i32 %73] [^b11, i32 %147]; - ubr ^scalar.header2; + i32 %27 = phi [^while.body1, i32 0] [^while.body4, i32 %44]; + i32* %28 = getelementptr &([600 * i32]* %20)[i64 0][i32 %27]; + store i32* %28 with i32 1; + i32* %29 = getelementptr &(i32* %28)[i64 1]; + store i32* %29 with i32 1; + i32* %30 = getelementptr &(i32* %28)[i64 2]; + store i32* %30 with i32 1; + i32* %31 = getelementptr &(i32* %28)[i64 3]; + store i32* %31 with i32 1; + i32* %32 = getelementptr &(i32* %28)[i64 4]; + store i32* %32 with i32 1; + i32* %33 = getelementptr &(i32* %28)[i64 5]; + store i32* %33 with i32 1; + i32* %34 = getelementptr &(i32* %28)[i64 6]; + store i32* %34 with i32 1; + i32* %35 = getelementptr &(i32* %28)[i64 7]; + store i32* %35 with i32 1; + i32* %36 = getelementptr &(i32* %28)[i64 8]; + store i32* %36 with i32 1; + i32* %37 = getelementptr &(i32* %28)[i64 9]; + store i32* %37 with i32 1; + i32* %38 = getelementptr &(i32* %28)[i64 10]; + store i32* %38 with i32 1; + i32* %39 = getelementptr &(i32* %28)[i64 11]; + store i32* %39 with i32 1; + i32* %40 = getelementptr &(i32* %28)[i64 12]; + store i32* %40 with i32 1; + i32* %41 = getelementptr &(i32* %28)[i64 13]; + store i32* %41 with i32 1; + i32* %42 = getelementptr &(i32* %28)[i64 14]; + store i32* %42 with i32 1; + i32* %43 = getelementptr &(i32* %28)[i64 15]; + store i32* %43 with i32 1; + i32 %44 = add i32 %27, i32 16; + i1 %45 = icmp sgt i32 %12, i32 %44; + cbr i1 %45(prob = 0.941176), ^while.body4, ^scalar.header; ^while.body5 {scalar}: - i32 %127 = phi [^while.body2, i32 0] [^while.body5, i32 %132]; - i32* %128 = getelementptr &([600 * i32]* %66)[i64 0][i32 %127]; - store i32* %128 with i32 1; - i32* %129 = getelementptr &(i32* %128)[i64 1]; - store i32* %129 with i32 1; - i32* %130 = getelementptr &(i32* %128)[i64 2]; - store i32* %130 with i32 1; - i32* %131 = getelementptr &(i32* %128)[i64 3]; - store i32* %131 with i32 1; - i32 %132 = add i32 %127, i32 4; - i1 %133 = icmp sgt i32 %19, i32 %132; - cbr i1 %133(prob = 0.5), ^while.body5, ^scalar.final5; - ^scalar.header4: - i32 %134 = phi [^super.header4, i32 %0] [^b10, i32 %124]; - i32 %135 = phi [^super.header4, i32 undef] [^b10, i32 %124]; - i1 %136 = icmp sgt i32 %8, i32 %134; - cbr i1 %136(prob = 0.5), ^b12, ^scalar.final4; - ^b11 {scalar}: - i32 %137 = phi [^scalar.header3, i32 %72] [^b11, i32 %147]; - [600 * [600 * 
i32]]* %138 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %137]; - i32* %139 = getelementptr &([600 * [600 * i32]]* %138)[i64 0][i64 0][i64 0]; - store i32* %139 with i32 1; - i32* %140 = getelementptr &([600 * [600 * i32]]* %138)[i64 1][i64 0][i64 0]; - store i32* %140 with i32 1; - i32* %141 = getelementptr &([600 * [600 * i32]]* %138)[i64 2][i64 0][i64 0]; - store i32* %141 with i32 1; - i32* %142 = getelementptr &([600 * [600 * i32]]* %138)[i64 3][i64 0][i64 0]; - store i32* %142 with i32 1; - i32* %143 = getelementptr &([600 * [600 * i32]]* %138)[i64 4][i64 0][i64 0]; - store i32* %143 with i32 1; - i32* %144 = getelementptr &([600 * [600 * i32]]* %138)[i64 5][i64 0][i64 0]; - store i32* %144 with i32 1; - i32* %145 = getelementptr &([600 * [600 * i32]]* %138)[i64 6][i64 0][i64 0]; - store i32* %145 with i32 1; - i32* %146 = getelementptr &([600 * [600 * i32]]* %138)[i64 7][i64 0][i64 0]; - store i32* %146 with i32 1; - i32 %147 = add i32 %137, i32 8; - i1 %148 = icmp sgt i32 %9, i32 %147; - cbr i1 %148(prob = 0.5), ^b11, ^scalar.final2; + i32 %46 = phi [^while.body2, i32 0] [^while.body5, i32 %51]; + i32* %47 = getelementptr &([600 * i32]* %22)[i64 0][i32 %46]; + store i32* %47 with i32 1; + i32* %48 = getelementptr &(i32* %47)[i64 1]; + store i32* %48 with i32 1; + i32* %49 = getelementptr &(i32* %47)[i64 2]; + store i32* %49 with i32 1; + i32* %50 = getelementptr &(i32* %47)[i64 3]; + store i32* %50 with i32 1; + i32 %51 = add i32 %46, i32 4; + i1 %52 = icmp sgt i32 %10, i32 %51; + cbr i1 %52(prob = 0.75), ^while.body5, ^scalar.final1; + ^scalar.final: + i32 %53 = add i32 %15, i32 1; + i1 %54 = icmp sgt i32 %4, i32 %53; + cbr i1 %54(prob = 0.984615), ^while.body, ^b5; + ^scalar.header: + i1 %55 = icmp sgt i32 %10, i32 %44; + cbr i1 %55(prob = 0.75), ^while.body6, ^scalar.final2; + ^scalar.final1: + i1 %56 = icmp sgt i32 %4, i32 %51; + cbr i1 %56(prob = 0.75), ^while.body7, ^scalar.final3; + ^b5: + i32 %57 = add i32 %8, i32 1; + i1 %58 = icmp sgt i32 %1, i32 %57; + cbr i1 %58(prob = 0.984615), ^b2, ^b1; ^while.body6 {scalar}: - i32 %149 = phi [^scalar.final1, i32 %51] [^while.body6, i32 %151]; - i32* %150 = getelementptr &([600 * i32]* %26)[i64 0][i32 %149]; - store i32* %150 with i32 1; - i32 %151 = add i32 %149, i32 1; - i1 %152 = icmp sgt i32 %4, i32 %151; - cbr i1 %152(prob = 0.5), ^while.body6, ^scalar.final3; + i32 %59 = phi [^scalar.header, i32 %44] [^while.body6, i32 %64]; + i32* %60 = getelementptr &([600 * i32]* %20)[i64 0][i32 %59]; + store i32* %60 with i32 1; + i32* %61 = getelementptr &(i32* %60)[i64 1]; + store i32* %61 with i32 1; + i32* %62 = getelementptr &(i32* %60)[i64 2]; + store i32* %62 with i32 1; + i32* %63 = getelementptr &(i32* %60)[i64 3]; + store i32* %63 with i32 1; + i32 %64 = add i32 %59, i32 4; + i1 %65 = icmp sgt i32 %10, i32 %64; + cbr i1 %65(prob = 0.75), ^while.body6, ^scalar.final2; + ^scalar.final2: + i32 %66 = phi [^scalar.header, i32 %44] [^while.body6, i32 %64]; + i1 %67 = icmp sgt i32 %4, i32 %66; + cbr i1 %67(prob = 0.75), ^while.body8, ^scalar.final4; + ^while.body7 {scalar}: + i32 %68 = phi [^scalar.final1, i32 %51] [^while.body7, i32 %70]; + i32* %69 = getelementptr &([600 * i32]* %22)[i64 0][i32 %68]; + store i32* %69 with i32 1; + i32 %70 = add i32 %68, i32 1; + i1 %71 = icmp sgt i32 %4, i32 %70; + cbr i1 %71(prob = 0.75), ^while.body7, ^scalar.final3; ^scalar.final3: - i32 %153 = add i32 %25, i32 1; - i1 %154 = icmp sgt i32 %4, i32 %153; - cbr i1 %154(prob = 0.984615), ^while.body, ^b13; - ^while.body7: - i32 %155 = 
phi [^b9, i32 0] [^scalar.final21, i32 %377]; - [600 * i32]* %156 = getelementptr &([600 * [600 * i32]]* %89)[i64 0][i32 %155]; - ubr ^while.body10; - ^while.body8: - i32 %157 = phi [^while.body3, i32 0] [^while.body8, i32 %190]; - i32* %158 = getelementptr &([600 * i32]* %85)[i64 0][i32 %157]; - store i32* %158 with i32 1; - i32* %159 = getelementptr &(i32* %158)[i64 1]; - store i32* %159 with i32 1; - i32* %160 = getelementptr &(i32* %158)[i64 2]; - store i32* %160 with i32 1; - i32* %161 = getelementptr &(i32* %158)[i64 3]; - store i32* %161 with i32 1; - i32* %162 = getelementptr &(i32* %158)[i64 4]; - store i32* %162 with i32 1; - i32* %163 = getelementptr &(i32* %158)[i64 5]; - store i32* %163 with i32 1; - i32* %164 = getelementptr &(i32* %158)[i64 6]; - store i32* %164 with i32 1; - i32* %165 = getelementptr &(i32* %158)[i64 7]; - store i32* %165 with i32 1; - i32* %166 = getelementptr &(i32* %158)[i64 8]; - store i32* %166 with i32 1; - i32* %167 = getelementptr &(i32* %158)[i64 9]; - store i32* %167 with i32 1; - i32* %168 = getelementptr &(i32* %158)[i64 10]; - store i32* %168 with i32 1; - i32* %169 = getelementptr &(i32* %158)[i64 11]; - store i32* %169 with i32 1; - i32* %170 = getelementptr &(i32* %158)[i64 12]; - store i32* %170 with i32 1; - i32* %171 = getelementptr &(i32* %158)[i64 13]; - store i32* %171 with i32 1; - i32* %172 = getelementptr &(i32* %158)[i64 14]; - store i32* %172 with i32 1; - i32* %173 = getelementptr &(i32* %158)[i64 15]; - store i32* %173 with i32 1; - i32* %174 = getelementptr &(i32* %158)[i64 16]; - store i32* %174 with i32 1; - i32* %175 = getelementptr &(i32* %158)[i64 17]; - store i32* %175 with i32 1; - i32* %176 = getelementptr &(i32* %158)[i64 18]; - store i32* %176 with i32 1; - i32* %177 = getelementptr &(i32* %158)[i64 19]; - store i32* %177 with i32 1; - i32* %178 = getelementptr &(i32* %158)[i64 20]; - store i32* %178 with i32 1; - i32* %179 = getelementptr &(i32* %158)[i64 21]; - store i32* %179 with i32 1; - i32* %180 = getelementptr &(i32* %158)[i64 22]; - store i32* %180 with i32 1; - i32* %181 = getelementptr &(i32* %158)[i64 23]; - store i32* %181 with i32 1; - i32* %182 = getelementptr &(i32* %158)[i64 24]; - store i32* %182 with i32 1; - i32* %183 = getelementptr &(i32* %158)[i64 25]; - store i32* %183 with i32 1; - i32* %184 = getelementptr &(i32* %158)[i64 26]; - store i32* %184 with i32 1; - i32* %185 = getelementptr &(i32* %158)[i64 27]; - store i32* %185 with i32 1; - i32* %186 = getelementptr &(i32* %158)[i64 28]; - store i32* %186 with i32 1; - i32* %187 = getelementptr &(i32* %158)[i64 29]; - store i32* %187 with i32 1; - i32* %188 = getelementptr &(i32* %158)[i64 30]; - store i32* %188 with i32 1; - i32* %189 = getelementptr &(i32* %158)[i64 31]; - store i32* %189 with i32 1; - i32 %190 = add i32 %157, i32 32; - i1 %191 = icmp sgt i32 %45, i32 %190; - cbr i1 %191(prob = 0.969697), ^while.body8, ^scalar.header5; + i32 %72 = add i32 %21, i32 1; + i1 %73 = icmp sgt i32 %4, i32 %72; + cbr i1 %73(prob = 0.984615), ^while.body2, ^b6; + ^while.body8 {scalar}: + i32 %74 = phi [^scalar.final2, i32 %66] [^while.body8, i32 %76]; + i32* %75 = getelementptr &([600 * i32]* %20)[i64 0][i32 %74]; + store i32* %75 with i32 1; + i32 %76 = add i32 %74, i32 1; + i1 %77 = icmp sgt i32 %4, i32 %76; + cbr i1 %77(prob = 0.75), ^while.body8, ^scalar.final4; ^scalar.final4: - i32 %192 = phi [^scalar.header4, i32 %135] [^b12, i32 %222]; - ubr ^scalar.header3; - ^while.body9 {scalar}: - i32 %193 = phi [^while.body4, i32 0] [^while.body9, i32 
%202]; - i32* %194 = getelementptr &([600 * i32]* %87)[i64 0][i32 %193]; - store i32* %194 with i32 1; - i32* %195 = getelementptr &(i32* %194)[i64 1]; - store i32* %195 with i32 1; - i32* %196 = getelementptr &(i32* %194)[i64 2]; - store i32* %196 with i32 1; - i32* %197 = getelementptr &(i32* %194)[i64 3]; - store i32* %197 with i32 1; - i32* %198 = getelementptr &(i32* %194)[i64 4]; - store i32* %198 with i32 1; - i32* %199 = getelementptr &(i32* %194)[i64 5]; - store i32* %199 with i32 1; - i32* %200 = getelementptr &(i32* %194)[i64 6]; - store i32* %200 with i32 1; - i32* %201 = getelementptr &(i32* %194)[i64 7]; - store i32* %201 with i32 1; - i32 %202 = add i32 %193, i32 8; - i1 %203 = icmp sgt i32 %27, i32 %202; - cbr i1 %203(prob = 0.5), ^while.body9, ^scalar.final6; - ^b12 {scalar}: - i32 %204 = phi [^scalar.header4, i32 %134] [^b12, i32 %222]; - [600 * [600 * i32]]* %205 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %204]; - i32* %206 = getelementptr &([600 * [600 * i32]]* %205)[i64 0][i64 0][i64 0]; - store i32* %206 with i32 1; - i32* %207 = getelementptr &([600 * [600 * i32]]* %205)[i64 1][i64 0][i64 0]; - store i32* %207 with i32 1; - i32* %208 = getelementptr &([600 * [600 * i32]]* %205)[i64 2][i64 0][i64 0]; - store i32* %208 with i32 1; - i32* %209 = getelementptr &([600 * [600 * i32]]* %205)[i64 3][i64 0][i64 0]; - store i32* %209 with i32 1; - i32* %210 = getelementptr &([600 * [600 * i32]]* %205)[i64 4][i64 0][i64 0]; - store i32* %210 with i32 1; - i32* %211 = getelementptr &([600 * [600 * i32]]* %205)[i64 5][i64 0][i64 0]; - store i32* %211 with i32 1; - i32* %212 = getelementptr &([600 * [600 * i32]]* %205)[i64 6][i64 0][i64 0]; - store i32* %212 with i32 1; - i32* %213 = getelementptr &([600 * [600 * i32]]* %205)[i64 7][i64 0][i64 0]; - store i32* %213 with i32 1; - i32* %214 = getelementptr &([600 * [600 * i32]]* %205)[i64 8][i64 0][i64 0]; - store i32* %214 with i32 1; - i32* %215 = getelementptr &([600 * [600 * i32]]* %205)[i64 9][i64 0][i64 0]; - store i32* %215 with i32 1; - i32* %216 = getelementptr &([600 * [600 * i32]]* %205)[i64 10][i64 0][i64 0]; - store i32* %216 with i32 1; - i32* %217 = getelementptr &([600 * [600 * i32]]* %205)[i64 11][i64 0][i64 0]; - store i32* %217 with i32 1; - i32* %218 = getelementptr &([600 * [600 * i32]]* %205)[i64 12][i64 0][i64 0]; - store i32* %218 with i32 1; - i32* %219 = getelementptr &([600 * [600 * i32]]* %205)[i64 13][i64 0][i64 0]; - store i32* %219 with i32 1; - i32* %220 = getelementptr &([600 * [600 * i32]]* %205)[i64 14][i64 0][i64 0]; - store i32* %220 with i32 1; - i32* %221 = getelementptr &([600 * [600 * i32]]* %205)[i64 15][i64 0][i64 0]; - store i32* %221 with i32 1; - i32 %222 = add i32 %204, i32 16; - i1 %223 = icmp sgt i32 %8, i32 %222; - cbr i1 %223(prob = 0.5), ^b12, ^scalar.final4; - ^scalar.final5: - i1 %224 = icmp sgt i32 %13, i32 %132; - cbr i1 %224(prob = 0.5), ^while.body11, ^scalar.final7; - ^b13: - i32 %225 = add i32 %17, i32 1; - i1 %226 = icmp sgt i32 %1, i32 %225; - cbr i1 %226(prob = 0.984615), ^b2, ^b1; - ^scalar.header5: - i1 %227 = icmp sgt i32 %42, i32 %190; - cbr i1 %227(prob = 0.5), ^while.body12, ^scalar.final8; - ^while.body10 {scalar}: - i32 %228 = phi [^while.body7, i32 0] [^while.body10, i32 %245]; - i32* %229 = getelementptr &([600 * i32]* %156)[i64 0][i32 %228]; - store i32* %229 with i32 1; - i32* %230 = getelementptr &(i32* %229)[i64 1]; - store i32* %230 with i32 1; - i32* %231 = getelementptr &(i32* %229)[i64 2]; - store i32* %231 with i32 1; - i32* %232 = 
getelementptr &(i32* %229)[i64 3]; - store i32* %232 with i32 1; - i32* %233 = getelementptr &(i32* %229)[i64 4]; - store i32* %233 with i32 1; - i32* %234 = getelementptr &(i32* %229)[i64 5]; - store i32* %234 with i32 1; - i32* %235 = getelementptr &(i32* %229)[i64 6]; - store i32* %235 with i32 1; - i32* %236 = getelementptr &(i32* %229)[i64 7]; - store i32* %236 with i32 1; - i32* %237 = getelementptr &(i32* %229)[i64 8]; - store i32* %237 with i32 1; - i32* %238 = getelementptr &(i32* %229)[i64 9]; - store i32* %238 with i32 1; - i32* %239 = getelementptr &(i32* %229)[i64 10]; - store i32* %239 with i32 1; - i32* %240 = getelementptr &(i32* %229)[i64 11]; - store i32* %240 with i32 1; - i32* %241 = getelementptr &(i32* %229)[i64 12]; - store i32* %241 with i32 1; - i32* %242 = getelementptr &(i32* %229)[i64 13]; - store i32* %242 with i32 1; - i32* %243 = getelementptr &(i32* %229)[i64 14]; - store i32* %243 with i32 1; - i32* %244 = getelementptr &(i32* %229)[i64 15]; - store i32* %244 with i32 1; - i32 %245 = add i32 %228, i32 16; - i1 %246 = icmp sgt i32 %42, i32 %245; - cbr i1 %246(prob = 0.5), ^while.body10, ^scalar.final9; - ^while.body11 {scalar}: - i32 %247 = phi [^scalar.final5, i32 %132] [^while.body11, i32 %250]; - i32* %248 = getelementptr &([600 * i32]* %66)[i64 0][i32 %247]; - store i32* %248 with i32 1; - i32* %249 = getelementptr &(i32* %248)[i64 1]; - store i32* %249 with i32 1; - i32 %250 = add i32 %247, i32 2; - i1 %251 = icmp sgt i32 %13, i32 %250; - cbr i1 %251(prob = 0.5), ^while.body11, ^scalar.final7; - ^scalar.final6: - i1 %252 = icmp sgt i32 %19, i32 %202; - cbr i1 %252(prob = 0.5), ^while.body13, ^scalar.final10; - ^scalar.final7: - i32 %253 = phi [^scalar.final5, i32 %132] [^while.body11, i32 %250]; - i1 %254 = icmp sgt i32 %4, i32 %253; - cbr i1 %254(prob = 0.5), ^while.body14, ^scalar.final11; - ^while.body12 {scalar}: - i32 %255 = phi [^scalar.header5, i32 %190] [^while.body12, i32 %272]; - i32* %256 = getelementptr &([600 * i32]* %85)[i64 0][i32 %255]; - store i32* %256 with i32 1; - i32* %257 = getelementptr &(i32* %256)[i64 1]; - store i32* %257 with i32 1; - i32* %258 = getelementptr &(i32* %256)[i64 2]; - store i32* %258 with i32 1; - i32* %259 = getelementptr &(i32* %256)[i64 3]; - store i32* %259 with i32 1; - i32* %260 = getelementptr &(i32* %256)[i64 4]; - store i32* %260 with i32 1; - i32* %261 = getelementptr &(i32* %256)[i64 5]; - store i32* %261 with i32 1; - i32* %262 = getelementptr &(i32* %256)[i64 6]; - store i32* %262 with i32 1; - i32* %263 = getelementptr &(i32* %256)[i64 7]; - store i32* %263 with i32 1; - i32* %264 = getelementptr &(i32* %256)[i64 8]; - store i32* %264 with i32 1; - i32* %265 = getelementptr &(i32* %256)[i64 9]; - store i32* %265 with i32 1; - i32* %266 = getelementptr &(i32* %256)[i64 10]; - store i32* %266 with i32 1; - i32* %267 = getelementptr &(i32* %256)[i64 11]; - store i32* %267 with i32 1; - i32* %268 = getelementptr &(i32* %256)[i64 12]; - store i32* %268 with i32 1; - i32* %269 = getelementptr &(i32* %256)[i64 13]; - store i32* %269 with i32 1; - i32* %270 = getelementptr &(i32* %256)[i64 14]; - store i32* %270 with i32 1; - i32* %271 = getelementptr &(i32* %256)[i64 15]; - store i32* %271 with i32 1; - i32 %272 = add i32 %255, i32 16; - i1 %273 = icmp sgt i32 %42, i32 %272; - cbr i1 %273(prob = 0.5), ^while.body12, ^scalar.final8; - ^while.body13 {scalar}: - i32 %274 = phi [^scalar.final6, i32 %202] [^while.body13, i32 %279]; - i32* %275 = getelementptr &([600 * i32]* %87)[i64 0][i32 %274]; - store i32* 
%275 with i32 1; - i32* %276 = getelementptr &(i32* %275)[i64 1]; - store i32* %276 with i32 1; - i32* %277 = getelementptr &(i32* %275)[i64 2]; - store i32* %277 with i32 1; - i32* %278 = getelementptr &(i32* %275)[i64 3]; - store i32* %278 with i32 1; - i32 %279 = add i32 %274, i32 4; - i1 %280 = icmp sgt i32 %19, i32 %279; - cbr i1 %280(prob = 0.5), ^while.body13, ^scalar.final10; - ^scalar.final8: - i32 %281 = phi [^scalar.header5, i32 %190] [^while.body12, i32 %272]; - i1 %282 = icmp sgt i32 %27, i32 %281; - cbr i1 %282(prob = 0.5), ^while.body15, ^scalar.final12; - ^scalar.final9: - i1 %283 = icmp sgt i32 %27, i32 %245; - cbr i1 %283(prob = 0.5), ^while.body16, ^scalar.final13; - ^while.body14 {scalar}: - i32 %284 = phi [^scalar.final7, i32 %253] [^while.body14, i32 %286]; - i32* %285 = getelementptr &([600 * i32]* %66)[i64 0][i32 %284]; - store i32* %285 with i32 1; - i32 %286 = add i32 %284, i32 1; - i1 %287 = icmp sgt i32 %4, i32 %286; - cbr i1 %287(prob = 0.5), ^while.body14, ^scalar.final11; - ^scalar.final10: - i32 %288 = phi [^scalar.final6, i32 %202] [^while.body13, i32 %279]; - i1 %289 = icmp sgt i32 %13, i32 %288; - cbr i1 %289(prob = 0.5), ^while.body17, ^scalar.final14; - ^scalar.final11: - i32 %290 = add i32 %65, i32 1; - i1 %291 = icmp sgt i32 %4, i32 %290; - cbr i1 %291(prob = 0.984615), ^while.body2, ^b14; - ^while.body15 {scalar}: - i32 %292 = phi [^scalar.final8, i32 %281] [^while.body15, i32 %301]; - i32* %293 = getelementptr &([600 * i32]* %85)[i64 0][i32 %292]; - store i32* %293 with i32 1; - i32* %294 = getelementptr &(i32* %293)[i64 1]; - store i32* %294 with i32 1; - i32* %295 = getelementptr &(i32* %293)[i64 2]; - store i32* %295 with i32 1; - i32* %296 = getelementptr &(i32* %293)[i64 3]; - store i32* %296 with i32 1; - i32* %297 = getelementptr &(i32* %293)[i64 4]; - store i32* %297 with i32 1; - i32* %298 = getelementptr &(i32* %293)[i64 5]; - store i32* %298 with i32 1; - i32* %299 = getelementptr &(i32* %293)[i64 6]; - store i32* %299 with i32 1; - i32* %300 = getelementptr &(i32* %293)[i64 7]; - store i32* %300 with i32 1; - i32 %301 = add i32 %292, i32 8; - i1 %302 = icmp sgt i32 %27, i32 %301; - cbr i1 %302(prob = 0.5), ^while.body15, ^scalar.final12; - ^while.body16 {scalar}: - i32 %303 = phi [^scalar.final9, i32 %245] [^while.body16, i32 %312]; - i32* %304 = getelementptr &([600 * i32]* %156)[i64 0][i32 %303]; - store i32* %304 with i32 1; - i32* %305 = getelementptr &(i32* %304)[i64 1]; - store i32* %305 with i32 1; - i32* %306 = getelementptr &(i32* %304)[i64 2]; - store i32* %306 with i32 1; - i32* %307 = getelementptr &(i32* %304)[i64 3]; - store i32* %307 with i32 1; - i32* %308 = getelementptr &(i32* %304)[i64 4]; - store i32* %308 with i32 1; - i32* %309 = getelementptr &(i32* %304)[i64 5]; - store i32* %309 with i32 1; - i32* %310 = getelementptr &(i32* %304)[i64 6]; - store i32* %310 with i32 1; - i32* %311 = getelementptr &(i32* %304)[i64 7]; - store i32* %311 with i32 1; - i32 %312 = add i32 %303, i32 8; - i1 %313 = icmp sgt i32 %27, i32 %312; - cbr i1 %313(prob = 0.5), ^while.body16, ^scalar.final13; - ^while.body17 {scalar}: - i32 %314 = phi [^scalar.final10, i32 %288] [^while.body17, i32 %317]; - i32* %315 = getelementptr &([600 * i32]* %87)[i64 0][i32 %314]; - store i32* %315 with i32 1; - i32* %316 = getelementptr &(i32* %315)[i64 1]; - store i32* %316 with i32 1; - i32 %317 = add i32 %314, i32 2; - i1 %318 = icmp sgt i32 %13, i32 %317; - cbr i1 %318(prob = 0.5), ^while.body17, ^scalar.final14; - ^scalar.final12: - i32 %319 = phi 
[^scalar.final8, i32 %281] [^while.body15, i32 %301]; - i1 %320 = icmp sgt i32 %19, i32 %319; - cbr i1 %320(prob = 0.5), ^while.body18, ^scalar.final15; - ^scalar.final13: - i32 %321 = phi [^scalar.final9, i32 %245] [^while.body16, i32 %312]; - i1 %322 = icmp sgt i32 %19, i32 %321; - cbr i1 %322(prob = 0.5), ^while.body19, ^scalar.final16; - ^scalar.final14: - i32 %323 = phi [^scalar.final10, i32 %288] [^while.body17, i32 %317]; - i1 %324 = icmp sgt i32 %4, i32 %323; - cbr i1 %324(prob = 0.5), ^while.body20, ^scalar.final17; - ^b14: - i32 %325 = add i32 %40, i32 1; - i1 %326 = icmp sgt i32 %1, i32 %325; - cbr i1 %326(prob = 0.984615), ^b4, ^b1; - ^while.body18 {scalar}: - i32 %327 = phi [^scalar.final12, i32 %319] [^while.body18, i32 %332]; - i32* %328 = getelementptr &([600 * i32]* %85)[i64 0][i32 %327]; - store i32* %328 with i32 1; - i32* %329 = getelementptr &(i32* %328)[i64 1]; - store i32* %329 with i32 1; - i32* %330 = getelementptr &(i32* %328)[i64 2]; - store i32* %330 with i32 1; - i32* %331 = getelementptr &(i32* %328)[i64 3]; - store i32* %331 with i32 1; - i32 %332 = add i32 %327, i32 4; - i1 %333 = icmp sgt i32 %19, i32 %332; - cbr i1 %333(prob = 0.5), ^while.body18, ^scalar.final15; - ^while.body19 {scalar}: - i32 %334 = phi [^scalar.final13, i32 %321] [^while.body19, i32 %339]; - i32* %335 = getelementptr &([600 * i32]* %156)[i64 0][i32 %334]; - store i32* %335 with i32 1; - i32* %336 = getelementptr &(i32* %335)[i64 1]; - store i32* %336 with i32 1; - i32* %337 = getelementptr &(i32* %335)[i64 2]; - store i32* %337 with i32 1; - i32* %338 = getelementptr &(i32* %335)[i64 3]; - store i32* %338 with i32 1; - i32 %339 = add i32 %334, i32 4; - i1 %340 = icmp sgt i32 %19, i32 %339; - cbr i1 %340(prob = 0.5), ^while.body19, ^scalar.final16; - ^while.body20 {scalar}: - i32 %341 = phi [^scalar.final14, i32 %323] [^while.body20, i32 %343]; - i32* %342 = getelementptr &([600 * i32]* %87)[i64 0][i32 %341]; - store i32* %342 with i32 1; - i32 %343 = add i32 %341, i32 1; - i1 %344 = icmp sgt i32 %4, i32 %343; - cbr i1 %344(prob = 0.5), ^while.body20, ^scalar.final17; - ^scalar.final15: - i32 %345 = phi [^scalar.final12, i32 %319] [^while.body18, i32 %332]; - i1 %346 = icmp sgt i32 %13, i32 %345; - cbr i1 %346(prob = 0.5), ^while.body21, ^scalar.final18; - ^scalar.final16: - i32 %347 = phi [^scalar.final13, i32 %321] [^while.body19, i32 %339]; - i1 %348 = icmp sgt i32 %13, i32 %347; - cbr i1 %348(prob = 0.5), ^while.body22, ^scalar.final19; - ^scalar.final17: - i32 %349 = add i32 %86, i32 1; - i1 %350 = icmp sgt i32 %4, i32 %349; - cbr i1 %350(prob = 0.984615), ^while.body4, ^b15; - ^while.body21 {scalar}: - i32 %351 = phi [^scalar.final15, i32 %345] [^while.body21, i32 %354]; - i32* %352 = getelementptr &([600 * i32]* %85)[i64 0][i32 %351]; - store i32* %352 with i32 1; - i32* %353 = getelementptr &(i32* %352)[i64 1]; - store i32* %353 with i32 1; - i32 %354 = add i32 %351, i32 2; - i1 %355 = icmp sgt i32 %13, i32 %354; - cbr i1 %355(prob = 0.5), ^while.body21, ^scalar.final18; - ^while.body22 {scalar}: - i32 %356 = phi [^scalar.final16, i32 %347] [^while.body22, i32 %359]; - i32* %357 = getelementptr &([600 * i32]* %156)[i64 0][i32 %356]; - store i32* %357 with i32 1; - i32* %358 = getelementptr &(i32* %357)[i64 1]; - store i32* %358 with i32 1; - i32 %359 = add i32 %356, i32 2; - i1 %360 = icmp sgt i32 %13, i32 %359; - cbr i1 %360(prob = 0.5), ^while.body22, ^scalar.final19; - ^scalar.final18: - i32 %361 = phi [^scalar.final15, i32 %345] [^while.body21, i32 %354]; - i1 %362 = icmp 
sgt i32 %4, i32 %361; - cbr i1 %362(prob = 0.5), ^while.body23, ^scalar.final20; - ^scalar.final19: - i32 %363 = phi [^scalar.final16, i32 %347] [^while.body22, i32 %359]; - i1 %364 = icmp sgt i32 %4, i32 %363; - cbr i1 %364(prob = 0.5), ^while.body24, ^scalar.final21; - ^b15: - i32 %365 = add i32 %67, i32 1; - i1 %366 = icmp sgt i32 %1, i32 %365; - cbr i1 %366(prob = 0.984615), ^b7, ^b1; - ^while.body23 {scalar}: - i32 %367 = phi [^scalar.final18, i32 %361] [^while.body23, i32 %369]; - i32* %368 = getelementptr &([600 * i32]* %85)[i64 0][i32 %367]; - store i32* %368 with i32 1; - i32 %369 = add i32 %367, i32 1; - i1 %370 = icmp sgt i32 %4, i32 %369; - cbr i1 %370(prob = 0.5), ^while.body23, ^scalar.final20; - ^while.body24 {scalar}: - i32 %371 = phi [^scalar.final19, i32 %363] [^while.body24, i32 %373]; - i32* %372 = getelementptr &([600 * i32]* %156)[i64 0][i32 %371]; - store i32* %372 with i32 1; - i32 %373 = add i32 %371, i32 1; - i1 %374 = icmp sgt i32 %4, i32 %373; - cbr i1 %374(prob = 0.5), ^while.body24, ^scalar.final21; - ^scalar.final20: - i32 %375 = add i32 %84, i32 1; - i1 %376 = icmp sgt i32 %4, i32 %375; - cbr i1 %376(prob = 0.984615), ^while.body3, ^b16; - ^scalar.final21: - i32 %377 = add i32 %155, i32 1; - i1 %378 = icmp sgt i32 %4, i32 %377; - cbr i1 %378(prob = 0.984615), ^while.body7, ^b17; - ^b16: - i32 %379 = add i32 %63, i32 1; - i1 %380 = icmp sgt i32 %1, i32 %379; - cbr i1 %380(prob = 0.984615), ^b6, ^b1; - ^b17: - i32 %381 = add i32 %88, i32 1; - i1 %382 = icmp sgt i32 %1, i32 %381; - cbr i1 %382(prob = 0.984615), ^b9, ^b1; + i32 %78 = add i32 %19, i32 1; + i1 %79 = icmp sgt i32 %4, i32 %78; + cbr i1 %79(prob = 0.984615), ^while.body1, ^b7; + ^b6: + i32 %80 = add i32 %17, i32 1; + i1 %81 = icmp sgt i32 %1, i32 %80; + cbr i1 %81(prob = 0.984615), ^b4, ^b1; + ^b7: + i32 %82 = add i32 %13, i32 1; + i1 %83 = icmp sgt i32 %1, i32 %82; + cbr i1 %83(prob = 0.984615), ^b3, ^b1; } internal [4 * i8]* @cmmc_parallel_body_payload_0, align 8; diff --git a/tests/SysY2022/performance/sl3.arm.s b/tests/SysY2022/performance/sl3.arm.s index 4b74a0ef0..5473b2c2f 100644 --- a/tests/SysY2022/performance/sl3.arm.s +++ b/tests/SysY2022/performance/sl3.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 x: .zero 864000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/sl3.riscv.s b/tests/SysY2022/performance/sl3.riscv.s index 2de6a784a..57daf2e06 100644 --- a/tests/SysY2022/performance/sl3.riscv.s +++ b/tests/SysY2022/performance/sl3.riscv.s @@ -1,1094 +1,247 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 x: .zero 864000000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text .p2align 2 .globl main main: - addi sp, sp, -104 + addi sp, sp, -72 sd ra, 0(sp) - sd s6, 8(sp) - sd s1, 16(sp) - sd s0, 24(sp) - sd s5, 32(sp) - sd s2, 40(sp) - sd s3, 48(sp) - sd s4, 56(sp) + sd s5, 8(sp) + sd s0, 16(sp) + sd s1, 24(sp) + sd s6, 32(sp) + sd s4, 40(sp) + sd s2, 48(sp) + sd s3, 56(sp) sd s7, 64(sp) - sd s9, 72(sp) - sd s8, 80(sp) - sd s11, 88(sp) - sd s10, 96(sp) jal getint - mv s6, a0 + mv s5, a0 jal getint - mv s0, a0 + mv s1, a0 li a0, 13 jal _sysy_starttime - li a0, 75 - addiw s1, s6, -2 -pcrel1697: - auipc a1, %pcrel_hi(x) - addiw s2, s6, -1 - slli s3, a0, 5 - addi s5, a1, %pcrel_lo(pcrel1697) - sub s4, zero, s3 - ble s6, zero, label1452 -pcrel1698: + li a1, 75 +pcrel468: + auipc a0, %pcrel_hi(x) + addiw s0, s5, -1 + slli s2, a1, 
5 + addi s4, a0, %pcrel_lo(pcrel468) + sub s3, zero, s2 + lui a0, 352 + addiw s6, a0, -1792 + ble s5, zero, label329 +pcrel469: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel1699: +pcrel470: auipc a3, %pcrel_hi(cmmc_parallel_body_0) - sw s6, %pcrel_lo(pcrel1698)(a0) - addi a2, a3, %pcrel_lo(pcrel1699) - mv a1, s6 + sw s5, %pcrel_lo(pcrel469)(a0) + addi a2, a3, %pcrel_lo(pcrel470) + mv a1, s5 mv a0, zero jal cmmcParallelFor -label1452: +label329: li a0, 1 - ble s2, a0, label1496 - lui a2, 352 - mv a3, s5 - addiw a0, a2, -1792 - li a2, 1 - add a1, s5, a0 - lui a5, 352 - addiw a2, a2, 1 - addiw a0, a5, -1792 - li a5, 2 - add a4, a1, a0 - bgt s2, a5, label1462 -.p2align 2 -label1458: - li t0, 601 - slli a0, t0, 2 - lui t0, 352 - add a5, a3, a0 - addiw a3, t0, 612 - lw a4, 0(a5) - add t2, a1, a3 - lw t1, 0(t2) - lw t0, 4(a1) - addw a5, a4, t1 - li a4, 1201 - addw a3, a5, t0 - slli t2, a4, 2 - add t0, a1, s3 - add a5, a1, t2 - addi t2, a0, 4 - lw t1, 0(a5) - lw a5, 0(t0) - addw a4, a3, t1 - add t1, a1, t2 - addw a3, a4, a5 - li t2, 2 - lw a5, 0(t1) - add t1, a1, a0 - addw t0, a3, a5 - divw a4, t0, s0 - sw a4, 0(t1) - ble s2, a2, label1526 -.p2align 2 -label1461: - lui a5, 352 - mv a3, a1 - addiw a2, a2, 1 - addiw a4, a5, -1792 - add a0, a1, a4 - mv a1, a0 - addiw a0, a5, -1792 - li a5, 2 - add a4, a1, a0 - ble s2, a5, label1458 -.p2align 2 -label1462: - add a0, a1, s3 + ble s0, a0, label360 + add t0, s4, s6 + mv a4, s4 + li t1, 1 + add a5, t0, s6 + addiw t1, t1, 1 + add a0, t0, s2 li t2, 1 - mul t5, t2, s3 - add t0, a0, s4 - add t1, a0, s3 - add a5, a4, t5 - add t3, a3, t5 - li t5, 1 - addi t4, t3, 4 - j label1466 -.p2align 2 -label1469: - bgt s2, t5, label1470 + mul t4, t2, s2 + add a2, a0, s3 addiw t2, t2, 1 - ble s2, t2, label1686 -.p2align 2 -label1476: - add a0, a0, s3 - mul t5, t2, s3 - add t1, a0, s3 - add t0, a0, s4 - add a5, a4, t5 - add t3, a3, t5 - li t5, 1 - addi t4, t3, 4 -.p2align 2 -label1466: - sh2add a6, t5, a5 - lw s7, 0(t4) - sh2add a7, t5, t0 - lw s9, 0(a6) - lw s8, 0(a7) - addw t6, s7, s9 - sh2add s7, t5, t1 - addw s9, t6, s8 - lw s11, 0(s7) - sh2add t6, t5, a0 - addw s8, s9, s11 - addiw t5, t5, 2 - lw s10, -4(t6) - addw s9, s8, s10 - lw s8, 4(t6) - addw s11, s9, s8 - divw s8, s11, s0 - sw s8, 0(t6) - lw s9, 4(t4) - lw s11, 4(a6) - lw a6, 4(a7) - addw s8, s9, s11 - lw a7, 4(s7) - addw s10, s8, a6 - lw s8, 0(t6) - addw a6, s10, a7 - lw s9, 8(t6) - addw a7, a6, s8 - addw s7, a7, s9 - divw a6, s7, s0 - sw a6, 4(t6) - ble s1, t5, label1469 - addi t4, t4, 8 - j label1466 -.p2align 2 -label1470: - sh2add t3, t5, t3 + add a1, a5, t4 + add a3, s4, t4 + li t4, 1 + addi t3, a3, 4 + add a3, a0, s2 + j label341 +.p2align 2 +label348: + addi t3, t3, 4 .p2align 2 -label1471: - sh2add t6, t5, a5 - lw a6, 0(t3) - sh2add a7, t5, t0 - lw s7, 0(t6) - addw t4, a6, s7 - sh2add s7, t5, t1 +label341: + sh2add a6, t4, a1 + lw t6, 0(t3) + sh2add a7, t4, a2 + lw s7, 0(a6) lw a6, 0(a7) + addw t5, t6, s7 + sh2add s7, t4, a3 + addw t6, t5, a6 lw a7, 0(s7) - addw t6, t4, a6 - sh2add t4, t5, a0 + sh2add t5, t4, a0 addw a6, t6, a7 - addiw t5, t5, 1 - lw s8, -4(t4) - lw a7, 4(t4) - addw t6, a6, s8 - addw s7, t6, a7 - divw a6, s7, s0 - sw a6, 0(t4) - ble s2, t5, label1613 - addi t3, t3, 4 - j label1471 -.p2align 2 -label1613: + addiw t4, t4, 1 + lw a7, -4(t5) + lw s7, 4(t5) + addw t6, a6, a7 + addw a7, t6, s7 + divw a6, a7, s1 + sw a6, 0(t5) + bgt s0, t4, label348 + ble s0, t2, label411 + add a0, a0, s2 + mul t4, t2, s2 + add a2, a0, s3 addiw t2, t2, 1 - bgt s2, t2, label1476 - bgt s2, a2, 
label1461 - j label1526 -.p2align 2 -label1686: - bgt s2, a2, label1461 -label1526: - mv s0, a1 + add a1, a5, t4 + add a3, a4, t4 + li t4, 1 + addi t3, a3, 4 + add a3, a0, s2 + j label341 +.p2align 2 +label411: + ble s0, t1, label463 + add a0, t0, s6 + mv a4, t0 + addiw t1, t1, 1 + li t2, 1 + add a5, a0, s6 + mv t0, a0 + mul t4, t2, s2 + add a0, a0, s2 + addiw t2, t2, 1 + add a1, a5, t4 + add a3, a4, t4 + add a2, a0, s3 + li t4, 1 + addi t3, a3, 4 + add a3, a0, s2 + j label341 +label463: + mv s0, t0 mv s1, t2 -label1478: +label330: li a0, 53 jal _sysy_stoptime - mv a0, s6 - mv a1, s5 + mv a0, s5 + mv a1, s4 jal putarray - lui a4, 352 - srliw a0, s6, 31 - addiw a5, a4, -1792 - add a1, s6, a0 + srliw a0, s5, 31 + add a1, s5, a0 + mv a0, s5 sraiw a2, a1, 1 - mul a4, a2, s3 - mul a0, a2, a5 - add a3, s5, a0 - mv a0, s6 + mul a4, a2, s2 + mul a5, a2, s6 + add a3, s4, a5 add a1, a3, a4 jal putarray - mv a0, s6 + mv a0, s5 addiw a3, s1, -1 - mul a2, a3, s3 + mul a2, a3, s2 add a1, s0, a2 jal putarray - mv a0, zero ld ra, 0(sp) - ld s6, 8(sp) - ld s1, 16(sp) - ld s0, 24(sp) - ld s5, 32(sp) - ld s2, 40(sp) - ld s3, 48(sp) - ld s4, 56(sp) + mv a0, zero + ld s5, 8(sp) + ld s0, 16(sp) + ld s1, 24(sp) + ld s6, 32(sp) + ld s4, 40(sp) + ld s2, 48(sp) + ld s3, 56(sp) ld s7, 64(sp) - ld s9, 72(sp) - ld s8, 80(sp) - ld s11, 88(sp) - ld s10, 96(sp) - addi sp, sp, 104 + addi sp, sp, 72 ret -label1496: - mv s0, s5 +label360: + mv s0, s4 li s1, 1 - j label1478 + j label330 .p2align 2 cmmc_parallel_body_0: - addi sp, sp, -24 - mv t6, a0 -pcrel1448: - auipc a4, %pcrel_hi(cmmc_parallel_body_payload_0) - li a3, 75 - sd s2, 0(sp) - li a0, 1 - sd s1, 8(sp) - sd s0, 16(sp) - lw a2, %pcrel_lo(pcrel1448)(a4) - slli a4, a3, 5 - ble a2, zero, label240 - addiw t4, a1, -57 - addiw t3, a1, -26 - addiw t2, a1, -11 - addiw t0, a1, -4 - addiw a5, a1, -1 -pcrel1449: - auipc a3, %pcrel_hi(x) - addi t1, a3, %pcrel_lo(pcrel1449) - bgt a2, a0, label3 - addiw a2, t6, 1 - ble a1, a2, label186 - addiw a3, t6, 3 - ble a5, a3, label693 - addiw a3, t6, 7 - ble t0, a3, label711 - addiw a3, t6, 15 - ble t2, a3, label731 - addiw a2, t6, 31 - ble t3, a2, label736 - lui t5, 352 - addiw a3, t5, -1792 - mul a4, t6, a3 - mv a3, t6 - add a2, t1, a4 -.p2align 2 -label216: - addiw a3, a3, 32 - lui t5, 352 - sw a0, 0(a2) - addiw a4, t5, -1792 - slli a6, a4, 1 - add t6, a2, a4 - add t5, a2, a6 - sw a0, 0(t6) - sh1add t6, a4, a4 - sw a0, 0(t5) - add a7, a2, t6 - slli t5, a6, 1 - sw a0, 0(a7) - sh2add a6, a4, a4 - add a7, a2, t5 - add s0, a2, a6 - sw a0, 0(a7) - slli a7, t6, 1 - sw a0, 0(s0) - add a6, a2, a7 - lui a7, 2461 - sw a0, 0(a6) - addiw t6, a7, -256 - slli a7, t5, 1 - add a6, a2, t6 - sh3add t5, a4, a4 - add t6, a2, a7 - sw a0, 0(a6) - lui a7, 5977 - add a6, a2, t5 - sw a0, 0(t6) - lui t6, 3516 - sw a0, 0(a6) - addiw t5, t6, -1536 - lui a6, 4570 - lui t6, 3867 - add a4, a2, t5 - addiw t5, t6, 768 - sw a0, 0(a4) - lui t6, 4219 - add a4, a2, t5 - addiw t5, t6, -1024 - sw a0, 0(a4) - addiw t6, a6, 1280 - add a4, a2, t5 - lui a6, 4922 - add t5, a2, t6 - sw a0, 0(a4) - lui t6, 5273 - addiw a4, a6, -512 - sw a0, 0(t5) - lui a6, 5625 - add t5, a2, a4 - addiw a4, t6, 1792 - sw a0, 0(t5) - add t6, a2, a6 - add t5, a2, a4 - addiw a4, a7, -1792 - sw a0, 0(t5) - add t5, a2, a4 - sw a0, 0(t6) - lui t6, 6328 - sw a0, 0(t5) - addiw a4, t6, 512 - lui t6, 6680 - add t5, a2, a4 - addiw a4, t6, -1280 - sw a0, 0(t5) - lui t6, 7031 - add t5, a2, a4 - addiw a4, t6, 1024 - sw a0, 0(t5) - lui t6, 7383 - add t5, a2, a4 - addiw a4, t6, -768 - sw a0, 0(t5) - lui t6, 7734 
- add t5, a2, a4 - addiw a4, t6, 1536 - sw a0, 0(t5) - lui t6, 8086 - add t5, a2, a4 - addiw a4, t6, -256 - sw a0, 0(t5) - lui t6, 8438 - add t5, a2, a4 - addiw a4, t6, -2048 - sw a0, 0(t5) - lui t6, 8789 - add t5, a2, a4 - addiw a4, t6, 256 - sw a0, 0(t5) - lui t6, 9141 - add t5, a2, a4 - addiw a4, t6, -1536 - sw a0, 0(t5) - lui t6, 9492 - add t5, a2, a4 - addiw a4, t6, 768 - sw a0, 0(t5) - lui t6, 9844 - add t5, a2, a4 - addiw a4, t6, -1024 - sw a0, 0(t5) - lui t6, 10195 - add t5, a2, a4 - addiw a4, t6, 1280 - sw a0, 0(t5) - lui t6, 10547 - add t5, a2, a4 - addiw a4, t6, -512 - sw a0, 0(t5) - lui t6, 10898 - add t5, a2, a4 - addiw a4, t6, 1792 - sw a0, 0(t5) - add t5, a2, a4 - sw a0, 0(t5) - ble t4, a3, label774 - lui a4, 11250 - add a2, a2, a4 - j label216 -label240: - ld s2, 0(sp) - ld s1, 8(sp) - ld s0, 16(sp) - addi sp, sp, 24 + mv t4, a0 + mv a3, a1 +pcrel327: + auipc a2, %pcrel_hi(cmmc_parallel_body_payload_0) + li a5, 75 + lui t0, 352 + li t1, 1 + li a1, 1 + addiw a4, t0, -1792 + lw a0, %pcrel_lo(pcrel327)(a2) + slli a2, a5, 5 + slli a5, t1, 32 + bgt a0, zero, label3 +label2: ret -label774: - mv t6, a3 -label220: - ble t3, t6, label779 - lui t4, 352 - addiw a3, t4, -1792 - mul a4, t6, a3 - add a2, t1, a4 -label224: - addiw a3, t6, 16 - lui t5, 352 - sw a0, 0(a2) - addiw a4, t5, -1792 - sh1add t5, a4, a4 - slli t6, a4, 1 - add t4, a2, a4 - add a7, a2, t5 - add a6, a2, t6 - sw a0, 0(t4) - slli t4, t6, 1 - sw a0, 0(a6) - sh2add t6, a4, a4 - add a6, a2, t4 - sw a0, 0(a7) - add a7, a2, t6 - sw a0, 0(a6) - slli a6, t5, 1 - sw a0, 0(a7) - add t6, a2, a6 - lui a7, 2461 - slli a6, t4, 1 - sw a0, 0(t6) - addiw t5, a7, -256 - add t6, a2, t5 - add t5, a2, a6 - sw a0, 0(t6) - lui a6, 3516 - sh3add t6, a4, a4 - sw a0, 0(t5) - addiw a4, a6, -1536 - add t4, a2, t6 - add t5, a2, a4 - lui t6, 3867 - sw a0, 0(t4) - addiw t4, t6, 768 - sw a0, 0(t5) - lui t6, 4219 - add a4, a2, t4 - addiw t4, t6, -1024 - sw a0, 0(a4) - lui t6, 4570 - add t5, a2, t4 - addiw a4, t6, 1280 - sw a0, 0(t5) - lui t6, 4922 - add t4, a2, a4 - addiw a4, t6, -512 - sw a0, 0(t4) - lui t6, 5273 - add t5, a2, a4 - addiw t4, t6, 1792 - sw a0, 0(t5) - add a4, a2, t4 - sw a0, 0(a4) - bgt t3, a3, label227 -label779: - mv a2, a3 - mv t6, a3 -label230: - bgt t2, t6, label235 - mv t6, a2 - j label203 label3: - addiw a3, a2, -1 - li a5, 3 - bgt a3, a5, label4 - lui t3, 352 + auipc t0, %pcrel_hi(x) + li t1, 3 + addi t2, t0, %pcrel_lo(label3) + bgt a0, t1, label4 + mul t1, t4, a4 + mv t0, t4 + add a5, t2, t1 mv t4, zero - addiw t0, t3, -1792 - mul t2, t6, t0 - mv t0, t6 - add a5, t1, t2 - mv t2, zero mv t3, a5 + mv t2, zero mv t1, a5 - j label171 + j label61 .p2align 2 -label1347: +label65: addiw t0, t0, 1 - ble a1, t0, label240 + ble a3, t0, label2 .p2align 2 -label183: - lui t2, 352 - li t4, 2 - li a6, 1 - addiw t1, t2, -1792 - slli t5, a6, 32 +label66: + add a5, a5, a4 mv t2, zero - add a5, a5, t1 - addi t6, t5, 1 + li t4, 1 mv t3, a5 mv t1, a5 - sd t6, 0(a5) - ble a3, t4, label1385 + sw a1, 0(a5) + ble a0, t4, label309 .p2align 2 -label174: - addi t3, t3, 8 -.p2align 2 -label171: - addiw t4, t4, 2 - li a6, 1 - slli t5, a6, 32 - addi t6, t5, 1 - sd t6, 0(t3) - bgt a3, t4, label174 - bgt a2, t4, label176 -.p2align 2 -label1345: - addiw t2, t2, 1 - bgt a2, t2, label184 -.p2align 2 -label1366: - addiw t0, t0, 1 - bgt a1, t0, label183 - j label240 -.p2align 2 -label180: +label68: addi t3, t3, 4 - mv t4, t5 .p2align 2 -label177: - addiw t5, t4, 1 - sw a0, 0(t3) - bgt a2, t5, label180 +label61: + addiw t4, t4, 1 + sw a1, 0(t3) + bgt a0, t4, 
label68 addiw t2, t2, 1 - ble a2, t2, label1347 + ble a0, t2, label65 .p2align 2 -label184: - add t1, t1, a4 - li t4, 2 - li a6, 1 +label67: + add t1, t1, a2 + li t4, 1 + sw a1, 0(t1) mv t3, t1 - slli t5, a6, 32 - addi t6, t5, 1 - sd t6, 0(t1) - bgt a3, t4, label174 - bgt a2, t4, label176 + bgt a0, t4, label68 addiw t2, t2, 1 - bgt a2, t2, label184 - j label1366 -.p2align 2 -label1385: - ble a2, t4, label1345 -.p2align 2 -label176: - sh2add t3, t4, t1 - j label177 -label239: - lui t3, 2813 - mv t6, a3 - addiw a4, t3, -2048 - add a2, a2, a4 -label236: - addiw a3, t6, 8 - lui t4, 352 - sw a0, 0(a2) - addiw a4, t4, -1792 - slli t4, a4, 1 - add t3, a2, a4 - add t5, a2, t4 - sw a0, 0(t3) - sh1add t3, a4, a4 - sw a0, 0(t5) - add a6, a2, t3 - slli t5, t4, 1 - sw a0, 0(a6) - sh2add t4, a4, a4 - add t6, a2, t5 - slli a4, t3, 1 - add t5, a2, t4 - sw a0, 0(t6) - add t4, a2, a4 - sw a0, 0(t5) - lui t5, 2461 - sw a0, 0(t4) - addiw a4, t5, -256 - add t3, a2, a4 - sw a0, 0(t3) - bgt t2, a3, label239 - mv a2, a3 - mv t6, a3 -label203: - bgt t0, t6, label206 - mv t6, a2 -label194: - ble a5, t6, label697 - lui t0, 352 - addiw a3, t0, -1792 - mul a4, t6, a3 - add a2, t1, a4 -label198: - addiw t6, t6, 2 - lui t0, 352 - sw a0, 0(a2) - addiw a3, t0, -1792 - add a4, a2, a3 - sw a0, 0(a4) - bgt a5, t6, label201 - j label186 -label697: - mv t6, a2 -label186: - ble a1, t6, label240 - lui a5, 352 - addiw a4, a5, -1792 - mul a3, t6, a4 - add a2, t1, a3 -label189: - addiw t6, t6, 1 - sw a0, 0(a2) - ble a1, t6, label240 - lui a4, 352 - addiw a3, a4, -1792 - add a2, a2, a3 - j label189 -label206: - lui t2, 352 - addiw a3, t2, -1792 - mul a4, t6, a3 - add a2, t1, a4 -label207: - addiw a3, t6, 4 - lui t2, 352 - sw a0, 0(a2) - addiw a4, t2, -1792 - slli t2, a4, 1 - add t3, a2, a4 - add t4, a2, t2 - sw a0, 0(t3) - sh1add t3, a4, a4 - sw a0, 0(t4) - add t2, a2, t3 - sw a0, 0(t2) - ble t0, a3, label725 - lui t2, 1406 - mv t6, a3 - addiw a4, t2, 1024 - add a2, a2, a4 - j label207 + bgt a0, t2, label67 + addiw t0, t0, 1 + bgt a3, t0, label66 + j label2 label4: - addiw a5, a2, -4 - li t0, 7 - bgt a5, t0, label5 - ble a5, zero, label240 - lui t4, 352 - addiw t2, t4, -1792 - mul t3, t6, t2 - mv t2, t6 - add t0, t1, t3 - mv t1, t0 - mv t3, zero - mv t4, t0 - mv t5, zero - j label143 -.p2align 2 -label146: - addi t4, t4, 16 -.p2align 2 -label143: - addiw t5, t5, 4 - li a7, 1 - slli a6, a7, 32 - addi t6, a6, 1 - sd t6, 0(t4) - sd t6, 8(t4) - bgt a5, t5, label146 -.p2align 2 -label600: - bgt a3, t5, label159 - ble a2, t5, label1363 -.p2align 2 -label150: - sh2add t4, t5, t1 - j label151 -.p2align 2 -label154: - addi t4, t4, 4 - mv t5, t6 -.p2align 2 -label151: - addiw t6, t5, 1 - sw a0, 0(t4) - bgt a2, t6, label154 - addiw t3, t3, 1 - ble a2, t3, label156 -.p2align 2 -label158: - add t1, t1, a4 - li t5, 4 - li a7, 1 - mv t4, t1 - slli a6, a7, 32 - addi t6, a6, 1 - sd t6, 0(t1) - sd t6, 8(t1) - bgt a5, t5, label146 - ble a3, t5, label1383 -.p2align 2 -label159: - sh2add t4, t5, t1 -.p2align 2 -label160: - addiw t6, t5, 2 - li s0, 1 - slli a6, s0, 32 - addi a7, a6, 1 - sd a7, 0(t4) - ble a3, t6, label637 - addi t4, t4, 8 - mv t5, t6 - j label160 -.p2align 2 -label637: - mv t5, t6 - bgt a2, t6, label150 - addiw t3, t3, 1 - bgt a2, t3, label158 - j label156 -.p2align 2 -label1383: - bgt a2, t5, label150 - addiw t3, t3, 1 - bgt a2, t3, label158 -label156: - addiw t2, t2, 1 - ble a1, t2, label240 - lui t3, 352 - li t5, 4 - li a7, 1 - addiw t1, t3, -1792 - slli a6, a7, 32 - mv t3, zero - add t0, t0, t1 - addi t6, a6, 1 - mv t4, t0 - mv t1, 
t0 - sd t6, 0(t0) - sd t6, 8(t0) - bgt a5, t5, label146 - j label600 -.p2align 2 -label1363: - addiw t3, t3, 1 - bgt a2, t3, label158 - j label156 -label5: - addiw t0, a2, -11 - li t2, 15 - bgt t0, t2, label6 - ble t0, zero, label240 - lui t5, 352 - addiw t3, t5, -1792 - mul t4, t6, t3 - mv t3, t6 - add t2, t1, t4 - mv t1, t2 - mv t4, zero - mv t5, t2 - mv t6, zero - j label107 -.p2align 2 -label564: - addiw t4, t4, 1 - ble a2, t4, label116 -.p2align 2 -label118: - add t1, t1, a4 - li t6, 8 - li s0, 1 - mv t5, t1 - slli a7, s0, 32 - addi a6, a7, 1 - sd a6, 0(t1) - sd a6, 8(t1) - sd a6, 16(t1) - sd a6, 24(t1) - ble t0, t6, label1360 -.p2align 2 -label134: - addi t5, t5, 32 -.p2align 2 -label107: - addiw t6, t6, 8 - li s0, 1 - slli a7, s0, 32 - addi a6, a7, 1 - sd a6, 0(t5) - sd a6, 8(t5) - sd a6, 16(t5) - sd a6, 24(t5) - bgt t0, t6, label134 - ble a5, t6, label111 -.p2align 2 -label129: - sh2add t5, t6, t1 -.p2align 2 -label130: - addiw a6, t6, 4 - li s0, 1 - slli t6, s0, 32 - addi a7, t6, 1 - sd a7, 0(t5) - sd a7, 8(t5) - bgt a5, a6, label133 - mv t6, a6 - ble a3, a6, label1340 -.p2align 2 -label124: - sh2add t5, t6, t1 - j label125 -.p2align 2 -label128: - addi t5, t5, 8 -.p2align 2 -label125: - addiw t6, t6, 2 - li s0, 1 - slli a7, s0, 32 - addi a6, a7, 1 - sd a6, 0(t5) - bgt a3, t6, label128 - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 - j label116 -.p2align 2 -label119: - sh2add t5, t6, t1 -.p2align 2 -label120: - addiw a6, t6, 1 - sw a0, 0(t5) - ble a2, a6, label564 - addi t5, t5, 4 - mv t6, a6 - j label120 -.p2align 2 -label111: - bgt a3, t6, label124 - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 - j label116 -.p2align 2 -label1360: - bgt a5, t6, label129 - bgt a3, t6, label124 - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 - j label116 -.p2align 2 -label1340: - bgt a2, t6, label119 - addiw t4, t4, 1 - bgt a2, t4, label118 -label116: - addiw t3, t3, 1 - ble a1, t3, label240 - lui t4, 352 - mv t6, zero - addiw t1, t4, -1792 - mv t4, zero - add t2, t2, t1 - mv t5, t2 - mv t1, t2 - j label107 -label6: - addiw t2, a2, -26 - addiw t3, a2, -57 - li t4, 31 - ble t2, t4, label56 - lui a7, 352 - addiw a6, a7, -1792 - mul t4, t6, a6 - add t5, t1, t4 - mv t4, t5 - mv a6, zero - mv t1, t5 - mv a7, zero - j label14 -.p2align 2 -label17: - addi t1, t1, 128 -.p2align 2 -label14: - addiw a7, a7, 32 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - sd s0, 16(t1) - sd s0, 24(t1) - sd s0, 32(t1) - sd s0, 40(t1) - sd s0, 48(t1) - sd s0, 56(t1) - sd s0, 64(t1) - sd s0, 72(t1) - sd s0, 80(t1) - sd s0, 88(t1) - sd s0, 96(t1) - sd s0, 104(t1) - sd s0, 112(t1) - sd s0, 120(t1) - bgt t3, a7, label17 - ble t2, a7, label1324 - sh2add t1, a7, t4 - j label20 -.p2align 2 -label23: - addi t1, t1, 64 -.p2align 2 -label20: - addiw a7, a7, 16 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - sd s0, 16(t1) - sd s0, 24(t1) - sd s0, 32(t1) - sd s0, 40(t1) - sd s0, 48(t1) - sd s0, 56(t1) - bgt t2, a7, label23 - ble t0, a7, label1326 -.p2align 2 -label51: - sh2add t1, a7, t4 - j label52 -.p2align 2 -label55: - addi t1, t1, 32 -.p2align 2 -label52: - addiw a7, a7, 8 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - sd s0, 16(t1) - sd s0, 24(t1) - bgt t0, a7, label55 - bgt a5, a7, label46 - bgt a3, a7, label41 - ble a2, a7, label1374 -.p2align 2 -label36: - sh2add t1, a7, t4 -.p2align 2 -label37: - addiw a7, a7, 1 - sw a0, 0(t1) - ble a2, a7, label381 - addi t1, t1, 4 - j label37 -.p2align 
2 -label1326: - ble a5, a7, label1350 -.p2align 2 -label46: - sh2add t1, a7, t4 -.p2align 2 -label47: - addiw a7, a7, 4 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - sd s0, 8(t1) - bgt a5, a7, label50 - bgt a3, a7, label41 - bgt a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -.p2align 2 -label381: - addiw a6, a6, 1 - ble a2, a6, label33 -.p2align 2 -label35: - add t4, t4, a4 - mv a7, zero - mv t1, t4 - j label14 -.p2align 2 -label1324: - bgt t0, a7, label51 - bgt a5, a7, label46 - ble a3, a7, label1387 -.p2align 2 -label41: - sh2add t1, a7, t4 - j label42 -.p2align 2 -label45: - addi t1, t1, 8 -.p2align 2 -label42: - addiw a7, a7, 2 - li s2, 1 - slli s1, s2, 32 - addi s0, s1, 1 - sd s0, 0(t1) - bgt a3, a7, label45 - bgt a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -.p2align 2 -label1350: - bgt a3, a7, label41 - bgt a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -.p2align 2 -label1387: - bgt a2, a7, label36 - addiw a6, a6, 1 - bgt a2, a6, label35 -label33: - addiw t6, t6, 1 - ble a1, t6, label240 - lui t4, 352 - mv a6, zero - mv a7, zero - addiw t1, t4, -1792 - add t5, t5, t1 - mv t1, t5 - mv t4, t5 - j label14 -.p2align 2 -label1374: - addiw a6, a6, 1 - bgt a2, a6, label35 - j label33 -label56: - ble t2, zero, label240 - lui a6, 352 - addiw t4, a6, -1792 - mul t5, t6, t4 - mv t4, t6 - add t3, t1, t5 - mv t1, t3 + addiw t0, a0, -3 + addiw t1, a0, -18 + li t3, 15 + ble t0, t3, label88 + mul t5, t4, a4 + add t3, t2, t5 + mv t2, t3 mv t5, zero mv t6, t3 mv a6, zero - j label64 + j label12 .p2align 2 -label98: +label15: addi t6, t6, 64 .p2align 2 -label64: +label12: addiw a6, a6, 16 - li s1, 1 - slli s0, s1, 32 - addi a7, s0, 1 + ori a7, a5, 1 sd a7, 0(t6) sd a7, 8(t6) sd a7, 16(t6) @@ -1097,153 +250,147 @@ label64: sd a7, 40(t6) sd a7, 48(t6) sd a7, 56(t6) - bgt t2, a6, label98 - ble t0, a6, label447 - sh2add t6, a6, t1 - j label69 + bgt t1, a6, label15 + ble t0, a6, label294 + sh2add t6, a6, t2 + mv a7, a6 + j label18 .p2align 2 -label72: - addi t6, t6, 32 - mv a6, a7 +label21: + addi t6, t6, 16 .p2align 2 -label69: - addiw a7, a6, 8 - li s1, 1 - slli s0, s1, 32 - addi a6, s0, 1 +label18: + addiw a7, a7, 4 + ori a6, a5, 1 sd a6, 0(t6) sd a6, 8(t6) - sd a6, 16(t6) - sd a6, 24(t6) - bgt t0, a7, label72 + bgt t0, a7, label21 mv a6, a7 - bgt a5, a7, label75 - bgt a3, a7, label82 - bgt a2, a7, label93 - addiw t5, t5, 1 - bgt a2, t5, label92 - j label90 -.p2align 2 -label1334: - ble a2, a6, label1357 + ble a0, a7, label296 .p2align 2 -label93: - sh2add t6, a6, t1 +label28: + sh2add t6, a6, t2 .p2align 2 -label94: - addiw a7, a6, 1 - sw a0, 0(t6) - ble a2, a7, label513 +label29: + addiw a6, a6, 1 + sw a1, 0(t6) + ble a0, a6, label149 addi t6, t6, 4 - mv a6, a7 - j label94 -.p2align 2 -label75: - sh2add t6, a6, t1 -.p2align 2 -label76: - addiw a6, a6, 4 - li s1, 1 - slli s0, s1, 32 - addi a7, s0, 1 - sd a7, 0(t6) - sd a7, 8(t6) - ble a5, a6, label476 - addi t6, t6, 16 - j label76 + j label29 .p2align 2 -label513: +label149: addiw t5, t5, 1 - ble a2, t5, label90 + ble a0, t5, label297 .p2align 2 -label92: - add t1, t1, a4 +label27: + add t2, t2, a2 mv a6, zero - mv t6, t1 - j label64 -.p2align 2 -label476: - ble a3, a6, label1334 -.p2align 2 -label82: - sh2add t6, a6, t1 - j label83 -.p2align 2 -label86: - addi t6, t6, 8 + mv t6, t2 + j label12 .p2align 2 -label83: - addiw a6, a6, 2 - li s1, 1 - slli s0, s1, 32 - addi a7, s0, 1 - sd a7, 0(t6) - bgt a3, a6, label86 - bgt a2, a6, label93 +label294: + bgt a0, a6, label28 
addiw t5, t5, 1 - bgt a2, t5, label92 - j label90 + bgt a0, t5, label27 + j label25 .p2align 2 -label447: - bgt a5, a6, label75 - bgt a3, a6, label82 - bgt a2, a6, label93 - addiw t5, t5, 1 - bgt a2, t5, label92 -label90: +label297: addiw t4, t4, 1 - ble a1, t4, label240 - lui t5, 352 - mv a6, zero - addiw t1, t5, -1792 + ble a3, t4, label2 +.p2align 2 +label26: + add t3, t3, a4 mv t5, zero - add t3, t3, t1 + mv a6, zero mv t6, t3 - mv t1, t3 - j label64 + mv t2, t3 + j label12 .p2align 2 -label1357: +label296: addiw t5, t5, 1 - bgt a2, t5, label92 - j label90 + bgt a0, t5, label27 +label25: + addiw t4, t4, 1 + bgt a3, t4, label26 + j label2 +.p2align 2 +label309: + addiw t2, t2, 1 + bgt a0, t2, label67 + j label65 +label88: + mul t3, t4, a4 + mv t6, zero + add t1, t2, t3 + mv t5, t1 + mv t3, t1 + mv t2, t4 + mv t4, zero + j label40 +.p2align 2 +label52: + addi t5, t5, 4 +.p2align 2 +label49: + addiw a6, a6, 1 + sw a1, 0(t5) + bgt a0, a6, label52 + addiw t4, t4, 1 + ble a0, t4, label298 .p2align 2 -label133: +label47: + add t3, t3, a2 + li t6, 4 + ori a6, a5, 1 + mv t5, t3 + sd a6, 0(t3) + sd a6, 8(t3) + ble t0, t6, label303 +.p2align 2 +label53: addi t5, t5, 16 - mv t6, a6 - j label130 -label235: - lui t3, 352 - addiw a4, t3, -1792 - mul a3, t6, a4 - add a2, t1, a3 - j label236 -label693: - mv a2, zero - j label194 -label201: - lui a4, 703 - addiw a3, a4, 512 - add a2, a2, a3 - j label198 -label736: - mv a3, zero - j label220 .p2align 2 -label50: - addi t1, t1, 16 - j label47 -label711: - mv a2, zero - j label203 -label731: - mv a2, zero - j label230 -label227: - lui a4, 5625 - mv t6, a3 - add a2, a2, a4 - j label224 -label725: - mv a2, a3 - mv t6, a3 - j label194 +label40: + addiw t6, t6, 4 + ori a6, a5, 1 + sd a6, 0(t5) + sd a6, 8(t5) + bgt t0, t6, label53 +.p2align 2 +label43: + ble a0, t6, label44 +.p2align 2 +label48: + sh2add t5, t6, t3 + mv a6, t6 + j label49 +.p2align 2 +label44: + addiw t4, t4, 1 + bgt a0, t4, label47 +label45: + addiw t2, t2, 1 + bgt a3, t2, label46 + j label2 +.p2align 2 +label303: + bgt a0, t6, label48 + addiw t4, t4, 1 + bgt a0, t4, label47 + j label45 +.p2align 2 +label298: + addiw t2, t2, 1 + ble a3, t2, label2 +.p2align 2 +label46: + add t1, t1, a4 + mv t4, zero + li t6, 4 + ori a6, a5, 1 + mv t5, t1 + mv t3, t1 + sd a6, 0(t1) + sd a6, 8(t1) + bgt t0, t6, label53 + j label43 diff --git a/tests/SysY2022/performance/sl3.sy.ir b/tests/SysY2022/performance/sl3.sy.ir index 61160ca35..b3d55bac9 100644 --- a/tests/SysY2022/performance/sl3.sy.ir +++ b/tests/SysY2022/performance/sl3.sy.ir @@ -8,155 +8,84 @@ func @main() -> i32 { NoRecurse Entry } { i32 %0 = call () -> i32 @getint(); i32 %1 = call () -> i32 @getint(); call (i32) -> void @starttime(i32 13); - i32 %2 = add i32 %0, i32 -1; - i1 %3 = icmp sgt i32 %2, i32 2; - i1 %4 = icmp sgt i32 %0, i32 0; - i32 %5 = add i32 %0, i32 -2; - [600 * [600 * [600 * i32]]]* %6 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; - cbr i1 %4(prob = 0.984615), ^b, ^b1; + i1 %2 = icmp sgt i32 %0, i32 0; + i32 %3 = add i32 %0, i32 -1; + [600 * [600 * [600 * i32]]]* %4 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; + cbr i1 %2(prob = 0.984615), ^b, ^b1; ^b: - [4 * i8]* %7 = ptrcast [4 * i8]* @cmmc_parallel_body_payload_0 to [4 * i8]*; - i32* %8 = ptradd [4 * i8]* %7, i32 0; - store i32* %8 with i32 %0; - i8* %9 = functionptr () -> void @cmmc_parallel_body_0 as i8*; - call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %0, i8* %9); + [4 * i8]* %5 = ptrcast [4 * i8]* 
@cmmc_parallel_body_payload_0 to [4 * i8]*; + i32* %6 = ptradd [4 * i8]* %5, i32 0; + store i32* %6 with i32 %0; + i8* %7 = functionptr () -> void @cmmc_parallel_body_0 as i8*; + call (i32, i32, i8*) -> void @cmmcParallelFor(i32 0, i32 %0, i8* %7); ubr ^b1; ^b1: - i1 %10 = icmp sgt i32 %2, i32 1; - [600 * [600 * i32]]* %11 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i64 0]; - cbr i1 %10(prob = 0.984615), ^while.body, ^b2; + i1 %8 = icmp sgt i32 %3, i32 1; + [600 * [600 * i32]]* %9 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i64 0]; + cbr i1 %8(prob = 0.984615), ^while.body, ^b2; ^while.body: - [600 * [600 * i32]]* %12 = phi [^b1, [600 * [600 * i32]]* %11] [^b5, [600 * [600 * i32]]* %14]; - i32 %13 = phi [^b1, i32 1] [^b5, i32 %16]; - [600 * [600 * i32]]* %14 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i32 %13]; - [600 * [600 * i32]]* %15 = getelementptr &([600 * [600 * i32]]* %14)[i64 1]; - i32 %16 = add i32 %13, i32 1; - cbr i1 %3(prob = 0.5), ^b3, ^b4; + [600 * [600 * i32]]* %10 = phi [^b1, [600 * [600 * i32]]* %9] [^b5, [600 * [600 * i32]]* %12]; + i32 %11 = phi [^b1, i32 1] [^b5, i32 %14]; + [600 * [600 * i32]]* %12 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i32 %11]; + [600 * [600 * i32]]* %13 = getelementptr &([600 * [600 * i32]]* %12)[i64 1]; + i32 %14 = add i32 %11, i32 1; + ubr ^b3; ^b2: - [600 * [600 * i32]]* %17 = phi [^b1, [600 * [600 * i32]]* %11] [^b5, [600 * [600 * i32]]* %14]; - i32 %18 = phi [^b1, i32 1] [^b5, i32 %91]; - i32* %19 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i64 0][i64 0][i64 0]; + [600 * [600 * i32]]* %15 = phi [^b1, [600 * [600 * i32]]* %9] [^b5, [600 * [600 * i32]]* %12]; + i32 %16 = phi [^b1, i32 1] [^b5, i32 %31]; + i32* %17 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i64 0][i64 0][i64 0]; call (i32) -> void @stoptime(i32 53); - call (i32, i32*) -> void @putarray(i32 %0, i32* %19); - i32 %20 = sdiv i32 %0, i32 2; - [600 * [600 * i32]]* %21 = getelementptr &([600 * [600 * [600 * i32]]]* %6)[i64 0][i32 %20]; - [600 * i32]* %22 = getelementptr &([600 * [600 * i32]]* %21)[i64 0][i32 %20]; - i32* %23 = getelementptr &([600 * i32]* %22)[i64 0][i64 0]; - call (i32, i32*) -> void @putarray(i32 %0, i32* %23); - i32 %24 = add i32 %18, i32 -1; - [600 * i32]* %25 = getelementptr &([600 * [600 * i32]]* %17)[i64 0][i32 %24]; - i32* %26 = getelementptr &([600 * i32]* %25)[i64 0][i64 0]; - call (i32, i32*) -> void @putarray(i32 %0, i32* %26); + call (i32, i32*) -> void @putarray(i32 %0, i32* %17); + i32 %18 = sdiv i32 %0, i32 2; + [600 * [600 * i32]]* %19 = getelementptr &([600 * [600 * [600 * i32]]]* %4)[i64 0][i32 %18]; + [600 * i32]* %20 = getelementptr &([600 * [600 * i32]]* %19)[i64 0][i32 %18]; + i32* %21 = getelementptr &([600 * i32]* %20)[i64 0][i64 0]; + call (i32, i32*) -> void @putarray(i32 %0, i32* %21); + i32 %22 = add i32 %16, i32 -1; + [600 * i32]* %23 = getelementptr &([600 * [600 * i32]]* %15)[i64 0][i32 %22]; + i32* %24 = getelementptr &([600 * i32]* %23)[i64 0][i64 0]; + call (i32, i32*) -> void @putarray(i32 %0, i32* %24); ret i32 0; ^b3: - i32 %27 = phi [^while.body, i32 1] [^b6, i32 %116]; - [600 * i32]* %28 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i32 %27]; - [600 * i32]* %29 = getelementptr &([600 * i32]* %28)[i64 -1]; - [600 * i32]* %30 = getelementptr &([600 * i32]* %28)[i64 1]; - [600 * i32]* %31 = getelementptr &([600 * [600 * i32]]* %12)[i64 0][i32 %27]; - [600 * i32]* %32 = getelementptr &([600 * [600 * i32]]* %15)[i64 0][i32 %27]; + i32 %25 = 
phi [^while.body, i32 1] [^b4, i32 %31]; + [600 * i32]* %26 = getelementptr &([600 * [600 * i32]]* %12)[i64 0][i32 %25]; + [600 * i32]* %27 = getelementptr &([600 * i32]* %26)[i64 -1]; + [600 * i32]* %28 = getelementptr &([600 * i32]* %26)[i64 1]; + [600 * i32]* %29 = getelementptr &([600 * [600 * i32]]* %10)[i64 0][i32 %25]; + [600 * i32]* %30 = getelementptr &([600 * [600 * i32]]* %13)[i64 0][i32 %25]; + i32 %31 = add i32 %25, i32 1; ubr ^while.body1; - ^b4: - i32* %33 = getelementptr &([600 * [600 * i32]]* %12)[i64 0][i64 1][i64 1]; + ^while.body1: + i32 %32 = phi [^b3, i32 1] [^while.body1, i32 %52]; + i32* %33 = getelementptr &([600 * i32]* %29)[i64 0][i32 %32]; i32 %34 = load i32* %33; - i32* %35 = getelementptr &([600 * [600 * i32]]* %14)[i64 1][i64 1][i64 1]; + i32* %35 = getelementptr &([600 * i32]* %30)[i64 0][i32 %32]; i32 %36 = load i32* %35; i32 %37 = add i32 %34, i32 %36; - i32* %38 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 0][i64 1]; + i32* %38 = getelementptr &([600 * i32]* %27)[i64 0][i32 %32]; i32 %39 = load i32* %38; i32 %40 = add i32 %37, i32 %39; - i32* %41 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 2][i64 1]; + i32* %41 = getelementptr &([600 * i32]* %28)[i64 0][i32 %32]; i32 %42 = load i32* %41; i32 %43 = add i32 %40, i32 %42; - i32* %44 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 1][i64 0]; - i32 %45 = load i32* %44; - i32 %46 = add i32 %43, i32 %45; - i32* %47 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 1][i64 2]; - i32 %48 = load i32* %47; - i32 %49 = add i32 %46, i32 %48; - i32 %50 = sdiv i32 %49, i32 %1; - i32* %51 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i64 1][i64 1]; - store i32* %51 with i32 %50; - ubr ^b5; - ^while.body1: - i32 %52 = phi [^b3, i32 1] [^while.body1, i32 %89]; - i32* %53 = getelementptr &([600 * i32]* %31)[i64 0][i32 %52]; - i32 %54 = load i32* %53; - i32* %55 = getelementptr &([600 * i32]* %32)[i64 0][i32 %52]; - i32 %56 = load i32* %55; - i32 %57 = add i32 %54, i32 %56; - i32* %58 = getelementptr &([600 * i32]* %29)[i64 0][i32 %52]; - i32 %59 = load i32* %58; - i32 %60 = add i32 %57, i32 %59; - i32* %61 = getelementptr &([600 * i32]* %30)[i64 0][i32 %52]; - i32 %62 = load i32* %61; - i32 %63 = add i32 %60, i32 %62; - i32* %64 = getelementptr &([600 * i32]* %28)[i64 0][i32 %52]; - i32* %65 = getelementptr &(i32* %64)[i64 -1]; - i32 %66 = load i32* %65; - i32 %67 = add i32 %63, i32 %66; - i32* %68 = getelementptr &(i32* %64)[i64 1]; - i32 %69 = load i32* %68; - i32 %70 = add i32 %67, i32 %69; - i32 %71 = sdiv i32 %70, i32 %1; - store i32* %64 with i32 %71; - i32* %72 = getelementptr &(i32* %53)[i64 1]; - i32 %73 = load i32* %72; - i32* %74 = getelementptr &(i32* %55)[i64 1]; - i32 %75 = load i32* %74; - i32 %76 = add i32 %73, i32 %75; - i32* %77 = getelementptr &(i32* %58)[i64 1]; - i32 %78 = load i32* %77; - i32 %79 = add i32 %76, i32 %78; - i32* %80 = getelementptr &(i32* %61)[i64 1]; - i32 %81 = load i32* %80; - i32 %82 = add i32 %79, i32 %81; - i32 %83 = load i32* %64; - i32 %84 = add i32 %82, i32 %83; - i32* %85 = getelementptr &(i32* %64)[i64 2]; - i32 %86 = load i32* %85; - i32 %87 = add i32 %84, i32 %86; - i32 %88 = sdiv i32 %87, i32 %1; - store i32* %68 with i32 %88; - i32 %89 = add i32 %52, i32 2; - i1 %90 = icmp sgt i32 %5, i32 %89; - cbr i1 %90(prob = 0.969697), ^while.body1, ^scalar.header; + i32* %44 = getelementptr &([600 * i32]* %26)[i64 0][i32 %32]; + i32* %45 = getelementptr &(i32* %44)[i64 -1]; + i32 %46 = load i32* %45; + i32 %47 = add i32 %43, i32 %46; + i32* %48 = 
getelementptr &(i32* %44)[i64 1]; + i32 %49 = load i32* %48; + i32 %50 = add i32 %47, i32 %49; + i32 %51 = sdiv i32 %50, i32 %1; + store i32* %44 with i32 %51; + i32 %52 = add i32 %32, i32 1; + i1 %53 = icmp sgt i32 %3, i32 %52; + cbr i1 %53(prob = 0.984615), ^while.body1, ^b4; + ^b4: + i1 %54 = icmp sgt i32 %3, i32 %31; + cbr i1 %54(prob = 0.984615), ^b3, ^b5; ^b5: - i32 %91 = phi [^b4, i32 2] [^b6, i32 %116]; - i1 %92 = icmp sgt i32 %2, i32 %16; - cbr i1 %92(prob = 0.984615), ^while.body, ^b2; - ^scalar.header: - i1 %93 = icmp sgt i32 %2, i32 %89; - cbr i1 %93(prob = 0.5), ^while.body2, ^b6; - ^while.body2 {scalar}: - i32 %94 = phi [^scalar.header, i32 %89] [^while.body2, i32 %114]; - i32* %95 = getelementptr &([600 * i32]* %31)[i64 0][i32 %94]; - i32 %96 = load i32* %95; - i32* %97 = getelementptr &([600 * i32]* %32)[i64 0][i32 %94]; - i32 %98 = load i32* %97; - i32 %99 = add i32 %96, i32 %98; - i32* %100 = getelementptr &([600 * i32]* %29)[i64 0][i32 %94]; - i32 %101 = load i32* %100; - i32 %102 = add i32 %99, i32 %101; - i32* %103 = getelementptr &([600 * i32]* %30)[i64 0][i32 %94]; - i32 %104 = load i32* %103; - i32 %105 = add i32 %102, i32 %104; - i32* %106 = getelementptr &([600 * i32]* %28)[i64 0][i32 %94]; - i32* %107 = getelementptr &(i32* %106)[i64 -1]; - i32 %108 = load i32* %107; - i32 %109 = add i32 %105, i32 %108; - i32* %110 = getelementptr &(i32* %106)[i64 1]; - i32 %111 = load i32* %110; - i32 %112 = add i32 %109, i32 %111; - i32 %113 = sdiv i32 %112, i32 %1; - store i32* %106 with i32 %113; - i32 %114 = add i32 %94, i32 1; - i1 %115 = icmp sgt i32 %2, i32 %114; - cbr i1 %115(prob = 0.5), ^while.body2, ^b6; - ^b6: - i32 %116 = add i32 %27, i32 1; - i1 %117 = icmp sgt i32 %2, i32 %116; - cbr i1 %117(prob = 0.984615), ^b3, ^b5; + i1 %55 = icmp sgt i32 %3, i32 %14; + cbr i1 %55(prob = 0.984615), ^while.body, ^b2; } internal func @cmmcParallelFor(i32, i32, i8*) -> void { NoRecurse }; internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse ParallelBody AlignedParallelBody } { @@ -167,729 +96,157 @@ internal func @cmmc_parallel_body_0(i32 %0, i32 %1) -> void { NoRecurse Parallel i1 %5 = icmp sgt i32 %4, i32 0; cbr i1 %5(prob = 0.5), ^cond, ^b1; ^cond: - i1 %6 = icmp sgt i32 %4, i32 1; - i32 %7 = add i32 %1, i32 -57; - i32 %8 = add i32 %1, i32 -26; - i32 %9 = add i32 %1, i32 -11; - i32 %10 = add i32 %1, i32 -4; - i32 %11 = add i32 %1, i32 -1; - [600 * [600 * [600 * i32]]]* %12 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; - cbr i1 %6(prob = 0.5), ^cond1, ^super.header; + i1 %6 = icmp sgt i32 %4, i32 3; + [600 * [600 * [600 * i32]]]* %7 = ptrcast [600 * [600 * [600 * i32]]]* @x to [600 * [600 * [600 * i32]]]*; + cbr i1 %6(prob = 0.5), ^cond1, ^b2; ^b1: ret; - ^cond1: - i32 %13 = add i32 %4, i32 -1; - i1 %14 = icmp sgt i32 %13, i32 3; - cbr i1 %14(prob = 0.5), ^cond2, ^b2; - ^super.header: - i32 %15 = add i32 %0, i32 1; - i1 %16 = icmp sgt i32 %1, i32 %15; - cbr i1 %16(prob = 0.969697), ^super.header1, ^scalar.header; ^b2: - i32 %17 = phi [^cond1, i32 %0] [^b13, i32 %225]; - [600 * [600 * i32]]* %18 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %17]; + i32 %8 = phi [^cond, i32 %0] [^b5, i32 %57]; + [600 * [600 * i32]]* %9 = getelementptr &([600 * [600 * [600 * i32]]]* %7)[i64 0][i32 %8]; ubr ^while.body; - ^cond2: - i32 %19 = add i32 %4, i32 -4; - i1 %20 = icmp sgt i32 %19, i32 7; - cbr i1 %20(prob = 0.5), ^cond3, ^cond4; - ^super.header1: - i32 %21 = add i32 %0, i32 3; - i1 %22 = icmp sgt i32 %11, i32 %21; - cbr i1 
%22(prob = 0.969697), ^super.header2, ^scalar.header1; - ^scalar.header: - i32 %23 = phi [^super.header, i32 %0] [^scalar.header1, i32 %33] [^b5, i32 %61]; - i1 %24 = icmp sgt i32 %1, i32 %23; - cbr i1 %24(prob = 0.5), ^b3, ^b1; - ^while.body: - i32 %25 = phi [^b2, i32 0] [^scalar.final3, i32 %153]; - [600 * i32]* %26 = getelementptr &([600 * [600 * i32]]* %18)[i64 0][i32 %25]; + ^cond1: + i32 %10 = add i32 %4, i32 -3; + i1 %11 = icmp sgt i32 %10, i32 15; + i32 %12 = add i32 %4, i32 -18; + cbr i1 %11(prob = 0.5), ^b3, ^b4; + ^b3: + i32 %13 = phi [^cond1, i32 %0] [^b7, i32 %82]; + [600 * [600 * i32]]* %14 = getelementptr &([600 * [600 * [600 * i32]]]* %7)[i64 0][i32 %13]; ubr ^while.body1; - ^cond3: - i32 %27 = add i32 %4, i32 -11; - i1 %28 = icmp sgt i32 %27, i32 15; - cbr i1 %28(prob = 0.5), ^cond5, ^cond6; - ^super.header2: - i32 %29 = add i32 %0, i32 7; - i1 %30 = icmp sgt i32 %10, i32 %29; - cbr i1 %30(prob = 0.969697), ^super.header3, ^scalar.header2; - ^cond4: - i1 %31 = icmp sgt i32 %19, i32 0; - cbr i1 %31(prob = 0.5), ^b4, ^b1; - ^scalar.header1: - i32 %32 = phi [^super.header1, i32 %0] [^scalar.final, i32 %71]; - i32 %33 = phi [^super.header1, i32 undef] [^scalar.final, i32 %71]; - i1 %34 = icmp sgt i32 %11, i32 %32; - cbr i1 %34(prob = 0.5), ^b5, ^scalar.header; - ^b3 {scalar}: - i32 %35 = phi [^scalar.header, i32 %23] [^b3, i32 %38]; - [600 * [600 * i32]]* %36 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %35]; - i32* %37 = getelementptr &([600 * [600 * i32]]* %36)[i64 0][i64 0][i64 0]; - store i32* %37 with i32 1; - i32 %38 = add i32 %35, i32 1; - i1 %39 = icmp sgt i32 %1, i32 %38; - cbr i1 %39(prob = 0.5), ^b3, ^b1; + ^while.body: + i32 %15 = phi [^b2, i32 0] [^scalar.final, i32 %53]; + [600 * i32]* %16 = getelementptr &([600 * [600 * i32]]* %9)[i64 0][i32 %15]; + ubr ^while.body3; ^b4: - i32 %40 = phi [^cond4, i32 %0] [^b14, i32 %325]; - [600 * [600 * i32]]* %41 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %40]; + i32 %17 = phi [^cond1, i32 %0] [^b6, i32 %80]; + [600 * [600 * i32]]* %18 = getelementptr &([600 * [600 * [600 * i32]]]* %7)[i64 0][i32 %17]; ubr ^while.body2; - ^cond5: - i32 %42 = add i32 %4, i32 -26; - i1 %43 = icmp sgt i32 %42, i32 0; - i1 %44 = icmp sgt i32 %42, i32 31; - i32 %45 = add i32 %4, i32 -57; - cbr i1 %44(prob = 0.5), ^b6, ^cond7; - ^super.header3: - i32 %46 = add i32 %0, i32 15; - i1 %47 = icmp sgt i32 %9, i32 %46; - cbr i1 %47(prob = 0.969697), ^super.header4, ^scalar.header3; - ^while.body1 {scalar}: - i32 %48 = phi [^while.body, i32 0] [^while.body1, i32 %51]; - i32* %49 = getelementptr &([600 * i32]* %26)[i64 0][i32 %48]; - store i32* %49 with i32 1; - i32* %50 = getelementptr &(i32* %49)[i64 1]; - store i32* %50 with i32 1; - i32 %51 = add i32 %48, i32 2; - i1 %52 = icmp sgt i32 %13, i32 %51; - cbr i1 %52(prob = 0.5), ^while.body1, ^scalar.final1; - ^cond6: - i1 %53 = icmp sgt i32 %27, i32 0; - cbr i1 %53(prob = 0.5), ^b7, ^b1; - ^scalar.header2: - i32 %54 = phi [^super.header2, i32 %0] [^scalar.final2, i32 %126]; - i32 %55 = phi [^super.header2, i32 undef] [^scalar.final2, i32 %126]; - i1 %56 = icmp sgt i32 %10, i32 %54; - cbr i1 %56(prob = 0.5), ^b8, ^scalar.final; - ^b5 {scalar}: - i32 %57 = phi [^scalar.header1, i32 %32] [^b5, i32 %61]; - [600 * [600 * i32]]* %58 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %57]; - i32* %59 = getelementptr &([600 * [600 * i32]]* %58)[i64 0][i64 0][i64 0]; - store i32* %59 with i32 1; - i32* %60 = getelementptr &([600 * [600 * i32]]* %58)[i64 1][i64 0][i64 
0]; - store i32* %60 with i32 1; - i32 %61 = add i32 %57, i32 2; - i1 %62 = icmp sgt i32 %11, i32 %61; - cbr i1 %62(prob = 0.5), ^b5, ^scalar.header; - ^b6: - i32 %63 = phi [^cond5, i32 %0] [^b16, i32 %379]; - [600 * [600 * i32]]* %64 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %63]; - ubr ^while.body3; + ^while.body1: + i32 %19 = phi [^b3, i32 0] [^scalar.final4, i32 %78]; + [600 * i32]* %20 = getelementptr &([600 * [600 * i32]]* %14)[i64 0][i32 %19]; + ubr ^while.body4; ^while.body2: - i32 %65 = phi [^b4, i32 0] [^scalar.final11, i32 %290]; - [600 * i32]* %66 = getelementptr &([600 * [600 * i32]]* %41)[i64 0][i32 %65]; + i32 %21 = phi [^b4, i32 0] [^scalar.final3, i32 %72]; + [600 * i32]* %22 = getelementptr &([600 * [600 * i32]]* %18)[i64 0][i32 %21]; ubr ^while.body5; - ^b7: - i32 %67 = phi [^cond6, i32 %0] [^b15, i32 %365]; - [600 * [600 * i32]]* %68 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %67]; - ubr ^while.body4; - ^super.header4: - i32 %69 = add i32 %0, i32 31; - i1 %70 = icmp sgt i32 %8, i32 %69; - cbr i1 %70(prob = 0.969697), ^b10, ^scalar.header4; - ^scalar.final: - i32 %71 = phi [^scalar.header2, i32 %55] [^b8, i32 %81]; - ubr ^scalar.header1; - ^cond7: - cbr i1 %43(prob = 0.5), ^b9, ^b1; - ^scalar.header3: - i32 %72 = phi [^super.header3, i32 %0] [^scalar.final4, i32 %192]; - i32 %73 = phi [^super.header3, i32 undef] [^scalar.final4, i32 %192]; - i1 %74 = icmp sgt i32 %9, i32 %72; - cbr i1 %74(prob = 0.5), ^b11, ^scalar.final2; - ^b8 {scalar}: - i32 %75 = phi [^scalar.header2, i32 %54] [^b8, i32 %81]; - [600 * [600 * i32]]* %76 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %75]; - i32* %77 = getelementptr &([600 * [600 * i32]]* %76)[i64 0][i64 0][i64 0]; - store i32* %77 with i32 1; - i32* %78 = getelementptr &([600 * [600 * i32]]* %76)[i64 1][i64 0][i64 0]; - store i32* %78 with i32 1; - i32* %79 = getelementptr &([600 * [600 * i32]]* %76)[i64 2][i64 0][i64 0]; - store i32* %79 with i32 1; - i32* %80 = getelementptr &([600 * [600 * i32]]* %76)[i64 3][i64 0][i64 0]; - store i32* %80 with i32 1; - i32 %81 = add i32 %75, i32 4; - i1 %82 = icmp sgt i32 %10, i32 %81; - cbr i1 %82(prob = 0.5), ^b8, ^scalar.final; - ^scalar.final1: - i1 %83 = icmp sgt i32 %4, i32 %51; - cbr i1 %83(prob = 0.5), ^while.body6, ^scalar.final3; - ^while.body3: - i32 %84 = phi [^b6, i32 0] [^scalar.final20, i32 %375]; - [600 * i32]* %85 = getelementptr &([600 * [600 * i32]]* %64)[i64 0][i32 %84]; - ubr ^while.body8; + ^while.body3 {scalar}: + i32 %23 = phi [^while.body, i32 0] [^while.body3, i32 %25]; + i32* %24 = getelementptr &([600 * i32]* %16)[i64 0][i32 %23]; + store i32* %24 with i32 1; + i32 %25 = add i32 %23, i32 1; + i1 %26 = icmp sgt i32 %4, i32 %25; + cbr i1 %26(prob = 0.75), ^while.body3, ^scalar.final; ^while.body4: - i32 %86 = phi [^b7, i32 0] [^scalar.final17, i32 %349]; - [600 * i32]* %87 = getelementptr &([600 * [600 * i32]]* %68)[i64 0][i32 %86]; - ubr ^while.body9; - ^b9: - i32 %88 = phi [^cond7, i32 %0] [^b17, i32 %381]; - [600 * [600 * i32]]* %89 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %88]; - ubr ^while.body7; - ^b10: - i32 %90 = phi [^super.header4, i32 %0] [^b10, i32 %124]; - [600 * [600 * i32]]* %91 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %90]; - i32* %92 = getelementptr &([600 * [600 * i32]]* %91)[i64 0][i64 0][i64 0]; - store i32* %92 with i32 1; - i32* %93 = getelementptr &([600 * [600 * i32]]* %91)[i64 1][i64 0][i64 0]; - store i32* %93 with i32 1; - i32* %94 = 
getelementptr &([600 * [600 * i32]]* %91)[i64 2][i64 0][i64 0]; - store i32* %94 with i32 1; - i32* %95 = getelementptr &([600 * [600 * i32]]* %91)[i64 3][i64 0][i64 0]; - store i32* %95 with i32 1; - i32* %96 = getelementptr &([600 * [600 * i32]]* %91)[i64 4][i64 0][i64 0]; - store i32* %96 with i32 1; - i32* %97 = getelementptr &([600 * [600 * i32]]* %91)[i64 5][i64 0][i64 0]; - store i32* %97 with i32 1; - i32* %98 = getelementptr &([600 * [600 * i32]]* %91)[i64 6][i64 0][i64 0]; - store i32* %98 with i32 1; - i32* %99 = getelementptr &([600 * [600 * i32]]* %91)[i64 7][i64 0][i64 0]; - store i32* %99 with i32 1; - i32* %100 = getelementptr &([600 * [600 * i32]]* %91)[i64 8][i64 0][i64 0]; - store i32* %100 with i32 1; - i32* %101 = getelementptr &([600 * [600 * i32]]* %91)[i64 9][i64 0][i64 0]; - store i32* %101 with i32 1; - i32* %102 = getelementptr &([600 * [600 * i32]]* %91)[i64 10][i64 0][i64 0]; - store i32* %102 with i32 1; - i32* %103 = getelementptr &([600 * [600 * i32]]* %91)[i64 11][i64 0][i64 0]; - store i32* %103 with i32 1; - i32* %104 = getelementptr &([600 * [600 * i32]]* %91)[i64 12][i64 0][i64 0]; - store i32* %104 with i32 1; - i32* %105 = getelementptr &([600 * [600 * i32]]* %91)[i64 13][i64 0][i64 0]; - store i32* %105 with i32 1; - i32* %106 = getelementptr &([600 * [600 * i32]]* %91)[i64 14][i64 0][i64 0]; - store i32* %106 with i32 1; - i32* %107 = getelementptr &([600 * [600 * i32]]* %91)[i64 15][i64 0][i64 0]; - store i32* %107 with i32 1; - i32* %108 = getelementptr &([600 * [600 * i32]]* %91)[i64 16][i64 0][i64 0]; - store i32* %108 with i32 1; - i32* %109 = getelementptr &([600 * [600 * i32]]* %91)[i64 17][i64 0][i64 0]; - store i32* %109 with i32 1; - i32* %110 = getelementptr &([600 * [600 * i32]]* %91)[i64 18][i64 0][i64 0]; - store i32* %110 with i32 1; - i32* %111 = getelementptr &([600 * [600 * i32]]* %91)[i64 19][i64 0][i64 0]; - store i32* %111 with i32 1; - i32* %112 = getelementptr &([600 * [600 * i32]]* %91)[i64 20][i64 0][i64 0]; - store i32* %112 with i32 1; - i32* %113 = getelementptr &([600 * [600 * i32]]* %91)[i64 21][i64 0][i64 0]; - store i32* %113 with i32 1; - i32* %114 = getelementptr &([600 * [600 * i32]]* %91)[i64 22][i64 0][i64 0]; - store i32* %114 with i32 1; - i32* %115 = getelementptr &([600 * [600 * i32]]* %91)[i64 23][i64 0][i64 0]; - store i32* %115 with i32 1; - i32* %116 = getelementptr &([600 * [600 * i32]]* %91)[i64 24][i64 0][i64 0]; - store i32* %116 with i32 1; - i32* %117 = getelementptr &([600 * [600 * i32]]* %91)[i64 25][i64 0][i64 0]; - store i32* %117 with i32 1; - i32* %118 = getelementptr &([600 * [600 * i32]]* %91)[i64 26][i64 0][i64 0]; - store i32* %118 with i32 1; - i32* %119 = getelementptr &([600 * [600 * i32]]* %91)[i64 27][i64 0][i64 0]; - store i32* %119 with i32 1; - i32* %120 = getelementptr &([600 * [600 * i32]]* %91)[i64 28][i64 0][i64 0]; - store i32* %120 with i32 1; - i32* %121 = getelementptr &([600 * [600 * i32]]* %91)[i64 29][i64 0][i64 0]; - store i32* %121 with i32 1; - i32* %122 = getelementptr &([600 * [600 * i32]]* %91)[i64 30][i64 0][i64 0]; - store i32* %122 with i32 1; - i32* %123 = getelementptr &([600 * [600 * i32]]* %91)[i64 31][i64 0][i64 0]; - store i32* %123 with i32 1; - i32 %124 = add i32 %90, i32 32; - i1 %125 = icmp sgt i32 %7, i32 %124; - cbr i1 %125(prob = 0.969697), ^b10, ^scalar.header4; - ^scalar.final2: - i32 %126 = phi [^scalar.header3, i32 %73] [^b11, i32 %147]; - ubr ^scalar.header2; + i32 %27 = phi [^while.body1, i32 0] [^while.body4, i32 %44]; + i32* %28 = 
getelementptr &([600 * i32]* %20)[i64 0][i32 %27]; + store i32* %28 with i32 1; + i32* %29 = getelementptr &(i32* %28)[i64 1]; + store i32* %29 with i32 1; + i32* %30 = getelementptr &(i32* %28)[i64 2]; + store i32* %30 with i32 1; + i32* %31 = getelementptr &(i32* %28)[i64 3]; + store i32* %31 with i32 1; + i32* %32 = getelementptr &(i32* %28)[i64 4]; + store i32* %32 with i32 1; + i32* %33 = getelementptr &(i32* %28)[i64 5]; + store i32* %33 with i32 1; + i32* %34 = getelementptr &(i32* %28)[i64 6]; + store i32* %34 with i32 1; + i32* %35 = getelementptr &(i32* %28)[i64 7]; + store i32* %35 with i32 1; + i32* %36 = getelementptr &(i32* %28)[i64 8]; + store i32* %36 with i32 1; + i32* %37 = getelementptr &(i32* %28)[i64 9]; + store i32* %37 with i32 1; + i32* %38 = getelementptr &(i32* %28)[i64 10]; + store i32* %38 with i32 1; + i32* %39 = getelementptr &(i32* %28)[i64 11]; + store i32* %39 with i32 1; + i32* %40 = getelementptr &(i32* %28)[i64 12]; + store i32* %40 with i32 1; + i32* %41 = getelementptr &(i32* %28)[i64 13]; + store i32* %41 with i32 1; + i32* %42 = getelementptr &(i32* %28)[i64 14]; + store i32* %42 with i32 1; + i32* %43 = getelementptr &(i32* %28)[i64 15]; + store i32* %43 with i32 1; + i32 %44 = add i32 %27, i32 16; + i1 %45 = icmp sgt i32 %12, i32 %44; + cbr i1 %45(prob = 0.941176), ^while.body4, ^scalar.header; ^while.body5 {scalar}: - i32 %127 = phi [^while.body2, i32 0] [^while.body5, i32 %132]; - i32* %128 = getelementptr &([600 * i32]* %66)[i64 0][i32 %127]; - store i32* %128 with i32 1; - i32* %129 = getelementptr &(i32* %128)[i64 1]; - store i32* %129 with i32 1; - i32* %130 = getelementptr &(i32* %128)[i64 2]; - store i32* %130 with i32 1; - i32* %131 = getelementptr &(i32* %128)[i64 3]; - store i32* %131 with i32 1; - i32 %132 = add i32 %127, i32 4; - i1 %133 = icmp sgt i32 %19, i32 %132; - cbr i1 %133(prob = 0.5), ^while.body5, ^scalar.final5; - ^scalar.header4: - i32 %134 = phi [^super.header4, i32 %0] [^b10, i32 %124]; - i32 %135 = phi [^super.header4, i32 undef] [^b10, i32 %124]; - i1 %136 = icmp sgt i32 %8, i32 %134; - cbr i1 %136(prob = 0.5), ^b12, ^scalar.final4; - ^b11 {scalar}: - i32 %137 = phi [^scalar.header3, i32 %72] [^b11, i32 %147]; - [600 * [600 * i32]]* %138 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %137]; - i32* %139 = getelementptr &([600 * [600 * i32]]* %138)[i64 0][i64 0][i64 0]; - store i32* %139 with i32 1; - i32* %140 = getelementptr &([600 * [600 * i32]]* %138)[i64 1][i64 0][i64 0]; - store i32* %140 with i32 1; - i32* %141 = getelementptr &([600 * [600 * i32]]* %138)[i64 2][i64 0][i64 0]; - store i32* %141 with i32 1; - i32* %142 = getelementptr &([600 * [600 * i32]]* %138)[i64 3][i64 0][i64 0]; - store i32* %142 with i32 1; - i32* %143 = getelementptr &([600 * [600 * i32]]* %138)[i64 4][i64 0][i64 0]; - store i32* %143 with i32 1; - i32* %144 = getelementptr &([600 * [600 * i32]]* %138)[i64 5][i64 0][i64 0]; - store i32* %144 with i32 1; - i32* %145 = getelementptr &([600 * [600 * i32]]* %138)[i64 6][i64 0][i64 0]; - store i32* %145 with i32 1; - i32* %146 = getelementptr &([600 * [600 * i32]]* %138)[i64 7][i64 0][i64 0]; - store i32* %146 with i32 1; - i32 %147 = add i32 %137, i32 8; - i1 %148 = icmp sgt i32 %9, i32 %147; - cbr i1 %148(prob = 0.5), ^b11, ^scalar.final2; + i32 %46 = phi [^while.body2, i32 0] [^while.body5, i32 %51]; + i32* %47 = getelementptr &([600 * i32]* %22)[i64 0][i32 %46]; + store i32* %47 with i32 1; + i32* %48 = getelementptr &(i32* %47)[i64 1]; + store i32* %48 with i32 1; + i32* %49 
= getelementptr &(i32* %47)[i64 2]; + store i32* %49 with i32 1; + i32* %50 = getelementptr &(i32* %47)[i64 3]; + store i32* %50 with i32 1; + i32 %51 = add i32 %46, i32 4; + i1 %52 = icmp sgt i32 %10, i32 %51; + cbr i1 %52(prob = 0.75), ^while.body5, ^scalar.final1; + ^scalar.final: + i32 %53 = add i32 %15, i32 1; + i1 %54 = icmp sgt i32 %4, i32 %53; + cbr i1 %54(prob = 0.984615), ^while.body, ^b5; + ^scalar.header: + i1 %55 = icmp sgt i32 %10, i32 %44; + cbr i1 %55(prob = 0.75), ^while.body6, ^scalar.final2; + ^scalar.final1: + i1 %56 = icmp sgt i32 %4, i32 %51; + cbr i1 %56(prob = 0.75), ^while.body7, ^scalar.final3; + ^b5: + i32 %57 = add i32 %8, i32 1; + i1 %58 = icmp sgt i32 %1, i32 %57; + cbr i1 %58(prob = 0.984615), ^b2, ^b1; ^while.body6 {scalar}: - i32 %149 = phi [^scalar.final1, i32 %51] [^while.body6, i32 %151]; - i32* %150 = getelementptr &([600 * i32]* %26)[i64 0][i32 %149]; - store i32* %150 with i32 1; - i32 %151 = add i32 %149, i32 1; - i1 %152 = icmp sgt i32 %4, i32 %151; - cbr i1 %152(prob = 0.5), ^while.body6, ^scalar.final3; + i32 %59 = phi [^scalar.header, i32 %44] [^while.body6, i32 %64]; + i32* %60 = getelementptr &([600 * i32]* %20)[i64 0][i32 %59]; + store i32* %60 with i32 1; + i32* %61 = getelementptr &(i32* %60)[i64 1]; + store i32* %61 with i32 1; + i32* %62 = getelementptr &(i32* %60)[i64 2]; + store i32* %62 with i32 1; + i32* %63 = getelementptr &(i32* %60)[i64 3]; + store i32* %63 with i32 1; + i32 %64 = add i32 %59, i32 4; + i1 %65 = icmp sgt i32 %10, i32 %64; + cbr i1 %65(prob = 0.75), ^while.body6, ^scalar.final2; + ^scalar.final2: + i32 %66 = phi [^scalar.header, i32 %44] [^while.body6, i32 %64]; + i1 %67 = icmp sgt i32 %4, i32 %66; + cbr i1 %67(prob = 0.75), ^while.body8, ^scalar.final4; + ^while.body7 {scalar}: + i32 %68 = phi [^scalar.final1, i32 %51] [^while.body7, i32 %70]; + i32* %69 = getelementptr &([600 * i32]* %22)[i64 0][i32 %68]; + store i32* %69 with i32 1; + i32 %70 = add i32 %68, i32 1; + i1 %71 = icmp sgt i32 %4, i32 %70; + cbr i1 %71(prob = 0.75), ^while.body7, ^scalar.final3; ^scalar.final3: - i32 %153 = add i32 %25, i32 1; - i1 %154 = icmp sgt i32 %4, i32 %153; - cbr i1 %154(prob = 0.984615), ^while.body, ^b13; - ^while.body7: - i32 %155 = phi [^b9, i32 0] [^scalar.final21, i32 %377]; - [600 * i32]* %156 = getelementptr &([600 * [600 * i32]]* %89)[i64 0][i32 %155]; - ubr ^while.body10; - ^while.body8: - i32 %157 = phi [^while.body3, i32 0] [^while.body8, i32 %190]; - i32* %158 = getelementptr &([600 * i32]* %85)[i64 0][i32 %157]; - store i32* %158 with i32 1; - i32* %159 = getelementptr &(i32* %158)[i64 1]; - store i32* %159 with i32 1; - i32* %160 = getelementptr &(i32* %158)[i64 2]; - store i32* %160 with i32 1; - i32* %161 = getelementptr &(i32* %158)[i64 3]; - store i32* %161 with i32 1; - i32* %162 = getelementptr &(i32* %158)[i64 4]; - store i32* %162 with i32 1; - i32* %163 = getelementptr &(i32* %158)[i64 5]; - store i32* %163 with i32 1; - i32* %164 = getelementptr &(i32* %158)[i64 6]; - store i32* %164 with i32 1; - i32* %165 = getelementptr &(i32* %158)[i64 7]; - store i32* %165 with i32 1; - i32* %166 = getelementptr &(i32* %158)[i64 8]; - store i32* %166 with i32 1; - i32* %167 = getelementptr &(i32* %158)[i64 9]; - store i32* %167 with i32 1; - i32* %168 = getelementptr &(i32* %158)[i64 10]; - store i32* %168 with i32 1; - i32* %169 = getelementptr &(i32* %158)[i64 11]; - store i32* %169 with i32 1; - i32* %170 = getelementptr &(i32* %158)[i64 12]; - store i32* %170 with i32 1; - i32* %171 = getelementptr &(i32* 
%158)[i64 13]; - store i32* %171 with i32 1; - i32* %172 = getelementptr &(i32* %158)[i64 14]; - store i32* %172 with i32 1; - i32* %173 = getelementptr &(i32* %158)[i64 15]; - store i32* %173 with i32 1; - i32* %174 = getelementptr &(i32* %158)[i64 16]; - store i32* %174 with i32 1; - i32* %175 = getelementptr &(i32* %158)[i64 17]; - store i32* %175 with i32 1; - i32* %176 = getelementptr &(i32* %158)[i64 18]; - store i32* %176 with i32 1; - i32* %177 = getelementptr &(i32* %158)[i64 19]; - store i32* %177 with i32 1; - i32* %178 = getelementptr &(i32* %158)[i64 20]; - store i32* %178 with i32 1; - i32* %179 = getelementptr &(i32* %158)[i64 21]; - store i32* %179 with i32 1; - i32* %180 = getelementptr &(i32* %158)[i64 22]; - store i32* %180 with i32 1; - i32* %181 = getelementptr &(i32* %158)[i64 23]; - store i32* %181 with i32 1; - i32* %182 = getelementptr &(i32* %158)[i64 24]; - store i32* %182 with i32 1; - i32* %183 = getelementptr &(i32* %158)[i64 25]; - store i32* %183 with i32 1; - i32* %184 = getelementptr &(i32* %158)[i64 26]; - store i32* %184 with i32 1; - i32* %185 = getelementptr &(i32* %158)[i64 27]; - store i32* %185 with i32 1; - i32* %186 = getelementptr &(i32* %158)[i64 28]; - store i32* %186 with i32 1; - i32* %187 = getelementptr &(i32* %158)[i64 29]; - store i32* %187 with i32 1; - i32* %188 = getelementptr &(i32* %158)[i64 30]; - store i32* %188 with i32 1; - i32* %189 = getelementptr &(i32* %158)[i64 31]; - store i32* %189 with i32 1; - i32 %190 = add i32 %157, i32 32; - i1 %191 = icmp sgt i32 %45, i32 %190; - cbr i1 %191(prob = 0.969697), ^while.body8, ^scalar.header5; + i32 %72 = add i32 %21, i32 1; + i1 %73 = icmp sgt i32 %4, i32 %72; + cbr i1 %73(prob = 0.984615), ^while.body2, ^b6; + ^while.body8 {scalar}: + i32 %74 = phi [^scalar.final2, i32 %66] [^while.body8, i32 %76]; + i32* %75 = getelementptr &([600 * i32]* %20)[i64 0][i32 %74]; + store i32* %75 with i32 1; + i32 %76 = add i32 %74, i32 1; + i1 %77 = icmp sgt i32 %4, i32 %76; + cbr i1 %77(prob = 0.75), ^while.body8, ^scalar.final4; ^scalar.final4: - i32 %192 = phi [^scalar.header4, i32 %135] [^b12, i32 %222]; - ubr ^scalar.header3; - ^while.body9 {scalar}: - i32 %193 = phi [^while.body4, i32 0] [^while.body9, i32 %202]; - i32* %194 = getelementptr &([600 * i32]* %87)[i64 0][i32 %193]; - store i32* %194 with i32 1; - i32* %195 = getelementptr &(i32* %194)[i64 1]; - store i32* %195 with i32 1; - i32* %196 = getelementptr &(i32* %194)[i64 2]; - store i32* %196 with i32 1; - i32* %197 = getelementptr &(i32* %194)[i64 3]; - store i32* %197 with i32 1; - i32* %198 = getelementptr &(i32* %194)[i64 4]; - store i32* %198 with i32 1; - i32* %199 = getelementptr &(i32* %194)[i64 5]; - store i32* %199 with i32 1; - i32* %200 = getelementptr &(i32* %194)[i64 6]; - store i32* %200 with i32 1; - i32* %201 = getelementptr &(i32* %194)[i64 7]; - store i32* %201 with i32 1; - i32 %202 = add i32 %193, i32 8; - i1 %203 = icmp sgt i32 %27, i32 %202; - cbr i1 %203(prob = 0.5), ^while.body9, ^scalar.final6; - ^b12 {scalar}: - i32 %204 = phi [^scalar.header4, i32 %134] [^b12, i32 %222]; - [600 * [600 * i32]]* %205 = getelementptr &([600 * [600 * [600 * i32]]]* %12)[i64 0][i32 %204]; - i32* %206 = getelementptr &([600 * [600 * i32]]* %205)[i64 0][i64 0][i64 0]; - store i32* %206 with i32 1; - i32* %207 = getelementptr &([600 * [600 * i32]]* %205)[i64 1][i64 0][i64 0]; - store i32* %207 with i32 1; - i32* %208 = getelementptr &([600 * [600 * i32]]* %205)[i64 2][i64 0][i64 0]; - store i32* %208 with i32 1; - i32* %209 = 
getelementptr &([600 * [600 * i32]]* %205)[i64 3][i64 0][i64 0]; - store i32* %209 with i32 1; - i32* %210 = getelementptr &([600 * [600 * i32]]* %205)[i64 4][i64 0][i64 0]; - store i32* %210 with i32 1; - i32* %211 = getelementptr &([600 * [600 * i32]]* %205)[i64 5][i64 0][i64 0]; - store i32* %211 with i32 1; - i32* %212 = getelementptr &([600 * [600 * i32]]* %205)[i64 6][i64 0][i64 0]; - store i32* %212 with i32 1; - i32* %213 = getelementptr &([600 * [600 * i32]]* %205)[i64 7][i64 0][i64 0]; - store i32* %213 with i32 1; - i32* %214 = getelementptr &([600 * [600 * i32]]* %205)[i64 8][i64 0][i64 0]; - store i32* %214 with i32 1; - i32* %215 = getelementptr &([600 * [600 * i32]]* %205)[i64 9][i64 0][i64 0]; - store i32* %215 with i32 1; - i32* %216 = getelementptr &([600 * [600 * i32]]* %205)[i64 10][i64 0][i64 0]; - store i32* %216 with i32 1; - i32* %217 = getelementptr &([600 * [600 * i32]]* %205)[i64 11][i64 0][i64 0]; - store i32* %217 with i32 1; - i32* %218 = getelementptr &([600 * [600 * i32]]* %205)[i64 12][i64 0][i64 0]; - store i32* %218 with i32 1; - i32* %219 = getelementptr &([600 * [600 * i32]]* %205)[i64 13][i64 0][i64 0]; - store i32* %219 with i32 1; - i32* %220 = getelementptr &([600 * [600 * i32]]* %205)[i64 14][i64 0][i64 0]; - store i32* %220 with i32 1; - i32* %221 = getelementptr &([600 * [600 * i32]]* %205)[i64 15][i64 0][i64 0]; - store i32* %221 with i32 1; - i32 %222 = add i32 %204, i32 16; - i1 %223 = icmp sgt i32 %8, i32 %222; - cbr i1 %223(prob = 0.5), ^b12, ^scalar.final4; - ^scalar.final5: - i1 %224 = icmp sgt i32 %13, i32 %132; - cbr i1 %224(prob = 0.5), ^while.body11, ^scalar.final7; - ^b13: - i32 %225 = add i32 %17, i32 1; - i1 %226 = icmp sgt i32 %1, i32 %225; - cbr i1 %226(prob = 0.984615), ^b2, ^b1; - ^scalar.header5: - i1 %227 = icmp sgt i32 %42, i32 %190; - cbr i1 %227(prob = 0.5), ^while.body12, ^scalar.final8; - ^while.body10 {scalar}: - i32 %228 = phi [^while.body7, i32 0] [^while.body10, i32 %245]; - i32* %229 = getelementptr &([600 * i32]* %156)[i64 0][i32 %228]; - store i32* %229 with i32 1; - i32* %230 = getelementptr &(i32* %229)[i64 1]; - store i32* %230 with i32 1; - i32* %231 = getelementptr &(i32* %229)[i64 2]; - store i32* %231 with i32 1; - i32* %232 = getelementptr &(i32* %229)[i64 3]; - store i32* %232 with i32 1; - i32* %233 = getelementptr &(i32* %229)[i64 4]; - store i32* %233 with i32 1; - i32* %234 = getelementptr &(i32* %229)[i64 5]; - store i32* %234 with i32 1; - i32* %235 = getelementptr &(i32* %229)[i64 6]; - store i32* %235 with i32 1; - i32* %236 = getelementptr &(i32* %229)[i64 7]; - store i32* %236 with i32 1; - i32* %237 = getelementptr &(i32* %229)[i64 8]; - store i32* %237 with i32 1; - i32* %238 = getelementptr &(i32* %229)[i64 9]; - store i32* %238 with i32 1; - i32* %239 = getelementptr &(i32* %229)[i64 10]; - store i32* %239 with i32 1; - i32* %240 = getelementptr &(i32* %229)[i64 11]; - store i32* %240 with i32 1; - i32* %241 = getelementptr &(i32* %229)[i64 12]; - store i32* %241 with i32 1; - i32* %242 = getelementptr &(i32* %229)[i64 13]; - store i32* %242 with i32 1; - i32* %243 = getelementptr &(i32* %229)[i64 14]; - store i32* %243 with i32 1; - i32* %244 = getelementptr &(i32* %229)[i64 15]; - store i32* %244 with i32 1; - i32 %245 = add i32 %228, i32 16; - i1 %246 = icmp sgt i32 %42, i32 %245; - cbr i1 %246(prob = 0.5), ^while.body10, ^scalar.final9; - ^while.body11 {scalar}: - i32 %247 = phi [^scalar.final5, i32 %132] [^while.body11, i32 %250]; - i32* %248 = getelementptr &([600 * i32]* %66)[i64 
0][i32 %247]; - store i32* %248 with i32 1; - i32* %249 = getelementptr &(i32* %248)[i64 1]; - store i32* %249 with i32 1; - i32 %250 = add i32 %247, i32 2; - i1 %251 = icmp sgt i32 %13, i32 %250; - cbr i1 %251(prob = 0.5), ^while.body11, ^scalar.final7; - ^scalar.final6: - i1 %252 = icmp sgt i32 %19, i32 %202; - cbr i1 %252(prob = 0.5), ^while.body13, ^scalar.final10; - ^scalar.final7: - i32 %253 = phi [^scalar.final5, i32 %132] [^while.body11, i32 %250]; - i1 %254 = icmp sgt i32 %4, i32 %253; - cbr i1 %254(prob = 0.5), ^while.body14, ^scalar.final11; - ^while.body12 {scalar}: - i32 %255 = phi [^scalar.header5, i32 %190] [^while.body12, i32 %272]; - i32* %256 = getelementptr &([600 * i32]* %85)[i64 0][i32 %255]; - store i32* %256 with i32 1; - i32* %257 = getelementptr &(i32* %256)[i64 1]; - store i32* %257 with i32 1; - i32* %258 = getelementptr &(i32* %256)[i64 2]; - store i32* %258 with i32 1; - i32* %259 = getelementptr &(i32* %256)[i64 3]; - store i32* %259 with i32 1; - i32* %260 = getelementptr &(i32* %256)[i64 4]; - store i32* %260 with i32 1; - i32* %261 = getelementptr &(i32* %256)[i64 5]; - store i32* %261 with i32 1; - i32* %262 = getelementptr &(i32* %256)[i64 6]; - store i32* %262 with i32 1; - i32* %263 = getelementptr &(i32* %256)[i64 7]; - store i32* %263 with i32 1; - i32* %264 = getelementptr &(i32* %256)[i64 8]; - store i32* %264 with i32 1; - i32* %265 = getelementptr &(i32* %256)[i64 9]; - store i32* %265 with i32 1; - i32* %266 = getelementptr &(i32* %256)[i64 10]; - store i32* %266 with i32 1; - i32* %267 = getelementptr &(i32* %256)[i64 11]; - store i32* %267 with i32 1; - i32* %268 = getelementptr &(i32* %256)[i64 12]; - store i32* %268 with i32 1; - i32* %269 = getelementptr &(i32* %256)[i64 13]; - store i32* %269 with i32 1; - i32* %270 = getelementptr &(i32* %256)[i64 14]; - store i32* %270 with i32 1; - i32* %271 = getelementptr &(i32* %256)[i64 15]; - store i32* %271 with i32 1; - i32 %272 = add i32 %255, i32 16; - i1 %273 = icmp sgt i32 %42, i32 %272; - cbr i1 %273(prob = 0.5), ^while.body12, ^scalar.final8; - ^while.body13 {scalar}: - i32 %274 = phi [^scalar.final6, i32 %202] [^while.body13, i32 %279]; - i32* %275 = getelementptr &([600 * i32]* %87)[i64 0][i32 %274]; - store i32* %275 with i32 1; - i32* %276 = getelementptr &(i32* %275)[i64 1]; - store i32* %276 with i32 1; - i32* %277 = getelementptr &(i32* %275)[i64 2]; - store i32* %277 with i32 1; - i32* %278 = getelementptr &(i32* %275)[i64 3]; - store i32* %278 with i32 1; - i32 %279 = add i32 %274, i32 4; - i1 %280 = icmp sgt i32 %19, i32 %279; - cbr i1 %280(prob = 0.5), ^while.body13, ^scalar.final10; - ^scalar.final8: - i32 %281 = phi [^scalar.header5, i32 %190] [^while.body12, i32 %272]; - i1 %282 = icmp sgt i32 %27, i32 %281; - cbr i1 %282(prob = 0.5), ^while.body15, ^scalar.final12; - ^scalar.final9: - i1 %283 = icmp sgt i32 %27, i32 %245; - cbr i1 %283(prob = 0.5), ^while.body16, ^scalar.final13; - ^while.body14 {scalar}: - i32 %284 = phi [^scalar.final7, i32 %253] [^while.body14, i32 %286]; - i32* %285 = getelementptr &([600 * i32]* %66)[i64 0][i32 %284]; - store i32* %285 with i32 1; - i32 %286 = add i32 %284, i32 1; - i1 %287 = icmp sgt i32 %4, i32 %286; - cbr i1 %287(prob = 0.5), ^while.body14, ^scalar.final11; - ^scalar.final10: - i32 %288 = phi [^scalar.final6, i32 %202] [^while.body13, i32 %279]; - i1 %289 = icmp sgt i32 %13, i32 %288; - cbr i1 %289(prob = 0.5), ^while.body17, ^scalar.final14; - ^scalar.final11: - i32 %290 = add i32 %65, i32 1; - i1 %291 = icmp sgt i32 %4, i32 %290; - 
cbr i1 %291(prob = 0.984615), ^while.body2, ^b14; - ^while.body15 {scalar}: - i32 %292 = phi [^scalar.final8, i32 %281] [^while.body15, i32 %301]; - i32* %293 = getelementptr &([600 * i32]* %85)[i64 0][i32 %292]; - store i32* %293 with i32 1; - i32* %294 = getelementptr &(i32* %293)[i64 1]; - store i32* %294 with i32 1; - i32* %295 = getelementptr &(i32* %293)[i64 2]; - store i32* %295 with i32 1; - i32* %296 = getelementptr &(i32* %293)[i64 3]; - store i32* %296 with i32 1; - i32* %297 = getelementptr &(i32* %293)[i64 4]; - store i32* %297 with i32 1; - i32* %298 = getelementptr &(i32* %293)[i64 5]; - store i32* %298 with i32 1; - i32* %299 = getelementptr &(i32* %293)[i64 6]; - store i32* %299 with i32 1; - i32* %300 = getelementptr &(i32* %293)[i64 7]; - store i32* %300 with i32 1; - i32 %301 = add i32 %292, i32 8; - i1 %302 = icmp sgt i32 %27, i32 %301; - cbr i1 %302(prob = 0.5), ^while.body15, ^scalar.final12; - ^while.body16 {scalar}: - i32 %303 = phi [^scalar.final9, i32 %245] [^while.body16, i32 %312]; - i32* %304 = getelementptr &([600 * i32]* %156)[i64 0][i32 %303]; - store i32* %304 with i32 1; - i32* %305 = getelementptr &(i32* %304)[i64 1]; - store i32* %305 with i32 1; - i32* %306 = getelementptr &(i32* %304)[i64 2]; - store i32* %306 with i32 1; - i32* %307 = getelementptr &(i32* %304)[i64 3]; - store i32* %307 with i32 1; - i32* %308 = getelementptr &(i32* %304)[i64 4]; - store i32* %308 with i32 1; - i32* %309 = getelementptr &(i32* %304)[i64 5]; - store i32* %309 with i32 1; - i32* %310 = getelementptr &(i32* %304)[i64 6]; - store i32* %310 with i32 1; - i32* %311 = getelementptr &(i32* %304)[i64 7]; - store i32* %311 with i32 1; - i32 %312 = add i32 %303, i32 8; - i1 %313 = icmp sgt i32 %27, i32 %312; - cbr i1 %313(prob = 0.5), ^while.body16, ^scalar.final13; - ^while.body17 {scalar}: - i32 %314 = phi [^scalar.final10, i32 %288] [^while.body17, i32 %317]; - i32* %315 = getelementptr &([600 * i32]* %87)[i64 0][i32 %314]; - store i32* %315 with i32 1; - i32* %316 = getelementptr &(i32* %315)[i64 1]; - store i32* %316 with i32 1; - i32 %317 = add i32 %314, i32 2; - i1 %318 = icmp sgt i32 %13, i32 %317; - cbr i1 %318(prob = 0.5), ^while.body17, ^scalar.final14; - ^scalar.final12: - i32 %319 = phi [^scalar.final8, i32 %281] [^while.body15, i32 %301]; - i1 %320 = icmp sgt i32 %19, i32 %319; - cbr i1 %320(prob = 0.5), ^while.body18, ^scalar.final15; - ^scalar.final13: - i32 %321 = phi [^scalar.final9, i32 %245] [^while.body16, i32 %312]; - i1 %322 = icmp sgt i32 %19, i32 %321; - cbr i1 %322(prob = 0.5), ^while.body19, ^scalar.final16; - ^scalar.final14: - i32 %323 = phi [^scalar.final10, i32 %288] [^while.body17, i32 %317]; - i1 %324 = icmp sgt i32 %4, i32 %323; - cbr i1 %324(prob = 0.5), ^while.body20, ^scalar.final17; - ^b14: - i32 %325 = add i32 %40, i32 1; - i1 %326 = icmp sgt i32 %1, i32 %325; - cbr i1 %326(prob = 0.984615), ^b4, ^b1; - ^while.body18 {scalar}: - i32 %327 = phi [^scalar.final12, i32 %319] [^while.body18, i32 %332]; - i32* %328 = getelementptr &([600 * i32]* %85)[i64 0][i32 %327]; - store i32* %328 with i32 1; - i32* %329 = getelementptr &(i32* %328)[i64 1]; - store i32* %329 with i32 1; - i32* %330 = getelementptr &(i32* %328)[i64 2]; - store i32* %330 with i32 1; - i32* %331 = getelementptr &(i32* %328)[i64 3]; - store i32* %331 with i32 1; - i32 %332 = add i32 %327, i32 4; - i1 %333 = icmp sgt i32 %19, i32 %332; - cbr i1 %333(prob = 0.5), ^while.body18, ^scalar.final15; - ^while.body19 {scalar}: - i32 %334 = phi [^scalar.final13, i32 %321] [^while.body19, 
i32 %339]; - i32* %335 = getelementptr &([600 * i32]* %156)[i64 0][i32 %334]; - store i32* %335 with i32 1; - i32* %336 = getelementptr &(i32* %335)[i64 1]; - store i32* %336 with i32 1; - i32* %337 = getelementptr &(i32* %335)[i64 2]; - store i32* %337 with i32 1; - i32* %338 = getelementptr &(i32* %335)[i64 3]; - store i32* %338 with i32 1; - i32 %339 = add i32 %334, i32 4; - i1 %340 = icmp sgt i32 %19, i32 %339; - cbr i1 %340(prob = 0.5), ^while.body19, ^scalar.final16; - ^while.body20 {scalar}: - i32 %341 = phi [^scalar.final14, i32 %323] [^while.body20, i32 %343]; - i32* %342 = getelementptr &([600 * i32]* %87)[i64 0][i32 %341]; - store i32* %342 with i32 1; - i32 %343 = add i32 %341, i32 1; - i1 %344 = icmp sgt i32 %4, i32 %343; - cbr i1 %344(prob = 0.5), ^while.body20, ^scalar.final17; - ^scalar.final15: - i32 %345 = phi [^scalar.final12, i32 %319] [^while.body18, i32 %332]; - i1 %346 = icmp sgt i32 %13, i32 %345; - cbr i1 %346(prob = 0.5), ^while.body21, ^scalar.final18; - ^scalar.final16: - i32 %347 = phi [^scalar.final13, i32 %321] [^while.body19, i32 %339]; - i1 %348 = icmp sgt i32 %13, i32 %347; - cbr i1 %348(prob = 0.5), ^while.body22, ^scalar.final19; - ^scalar.final17: - i32 %349 = add i32 %86, i32 1; - i1 %350 = icmp sgt i32 %4, i32 %349; - cbr i1 %350(prob = 0.984615), ^while.body4, ^b15; - ^while.body21 {scalar}: - i32 %351 = phi [^scalar.final15, i32 %345] [^while.body21, i32 %354]; - i32* %352 = getelementptr &([600 * i32]* %85)[i64 0][i32 %351]; - store i32* %352 with i32 1; - i32* %353 = getelementptr &(i32* %352)[i64 1]; - store i32* %353 with i32 1; - i32 %354 = add i32 %351, i32 2; - i1 %355 = icmp sgt i32 %13, i32 %354; - cbr i1 %355(prob = 0.5), ^while.body21, ^scalar.final18; - ^while.body22 {scalar}: - i32 %356 = phi [^scalar.final16, i32 %347] [^while.body22, i32 %359]; - i32* %357 = getelementptr &([600 * i32]* %156)[i64 0][i32 %356]; - store i32* %357 with i32 1; - i32* %358 = getelementptr &(i32* %357)[i64 1]; - store i32* %358 with i32 1; - i32 %359 = add i32 %356, i32 2; - i1 %360 = icmp sgt i32 %13, i32 %359; - cbr i1 %360(prob = 0.5), ^while.body22, ^scalar.final19; - ^scalar.final18: - i32 %361 = phi [^scalar.final15, i32 %345] [^while.body21, i32 %354]; - i1 %362 = icmp sgt i32 %4, i32 %361; - cbr i1 %362(prob = 0.5), ^while.body23, ^scalar.final20; - ^scalar.final19: - i32 %363 = phi [^scalar.final16, i32 %347] [^while.body22, i32 %359]; - i1 %364 = icmp sgt i32 %4, i32 %363; - cbr i1 %364(prob = 0.5), ^while.body24, ^scalar.final21; - ^b15: - i32 %365 = add i32 %67, i32 1; - i1 %366 = icmp sgt i32 %1, i32 %365; - cbr i1 %366(prob = 0.984615), ^b7, ^b1; - ^while.body23 {scalar}: - i32 %367 = phi [^scalar.final18, i32 %361] [^while.body23, i32 %369]; - i32* %368 = getelementptr &([600 * i32]* %85)[i64 0][i32 %367]; - store i32* %368 with i32 1; - i32 %369 = add i32 %367, i32 1; - i1 %370 = icmp sgt i32 %4, i32 %369; - cbr i1 %370(prob = 0.5), ^while.body23, ^scalar.final20; - ^while.body24 {scalar}: - i32 %371 = phi [^scalar.final19, i32 %363] [^while.body24, i32 %373]; - i32* %372 = getelementptr &([600 * i32]* %156)[i64 0][i32 %371]; - store i32* %372 with i32 1; - i32 %373 = add i32 %371, i32 1; - i1 %374 = icmp sgt i32 %4, i32 %373; - cbr i1 %374(prob = 0.5), ^while.body24, ^scalar.final21; - ^scalar.final20: - i32 %375 = add i32 %84, i32 1; - i1 %376 = icmp sgt i32 %4, i32 %375; - cbr i1 %376(prob = 0.984615), ^while.body3, ^b16; - ^scalar.final21: - i32 %377 = add i32 %155, i32 1; - i1 %378 = icmp sgt i32 %4, i32 %377; - cbr i1 %378(prob = 
0.984615), ^while.body7, ^b17; - ^b16: - i32 %379 = add i32 %63, i32 1; - i1 %380 = icmp sgt i32 %1, i32 %379; - cbr i1 %380(prob = 0.984615), ^b6, ^b1; - ^b17: - i32 %381 = add i32 %88, i32 1; - i1 %382 = icmp sgt i32 %1, i32 %381; - cbr i1 %382(prob = 0.984615), ^b9, ^b1; + i32 %78 = add i32 %19, i32 1; + i1 %79 = icmp sgt i32 %4, i32 %78; + cbr i1 %79(prob = 0.984615), ^while.body1, ^b7; + ^b6: + i32 %80 = add i32 %17, i32 1; + i1 %81 = icmp sgt i32 %1, i32 %80; + cbr i1 %81(prob = 0.984615), ^b4, ^b1; + ^b7: + i32 %82 = add i32 %13, i32 1; + i1 %83 = icmp sgt i32 %1, i32 %82; + cbr i1 %83(prob = 0.984615), ^b3, ^b1; } internal [4 * i8]* @cmmc_parallel_body_payload_0, align 8; diff --git a/tests/SysY2022/performance/stencil0.arm.s b/tests/SysY2022/performance/stencil0.arm.s index daf9a9aa6..0c882be0a 100644 --- a/tests/SysY2022/performance/stencil0.arm.s +++ b/tests/SysY2022/performance/stencil0.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 image_in: .zero 2097152 -.align 8 +.p2align 3 image_out: .zero 2097152 .text diff --git a/tests/SysY2022/performance/stencil0.riscv.s b/tests/SysY2022/performance/stencil0.riscv.s index 629bab0f6..95e32c5fd 100644 --- a/tests/SysY2022/performance/stencil0.riscv.s +++ b/tests/SysY2022/performance/stencil0.riscv.s @@ -1,10 +1,10 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 image_in: .zero 2097152 -.align 8 +.p2align 3 image_out: .zero 2097152 .text diff --git a/tests/SysY2022/performance/stencil1.arm.s b/tests/SysY2022/performance/stencil1.arm.s index 1bb196b49..ea21857ae 100644 --- a/tests/SysY2022/performance/stencil1.arm.s +++ b/tests/SysY2022/performance/stencil1.arm.s @@ -1,10 +1,10 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 image_in: .zero 4194304 -.align 8 +.p2align 3 image_out: .zero 4194304 .text diff --git a/tests/SysY2022/performance/stencil1.riscv.s b/tests/SysY2022/performance/stencil1.riscv.s index 6fdfc0f94..ee466106c 100644 --- a/tests/SysY2022/performance/stencil1.riscv.s +++ b/tests/SysY2022/performance/stencil1.riscv.s @@ -1,10 +1,10 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 image_in: .zero 4194304 -.align 8 +.p2align 3 image_out: .zero 4194304 .text diff --git a/tests/SysY2022/performance/transpose0.arm.s b/tests/SysY2022/performance/transpose0.arm.s index 19721bba0..8190f203f 100644 --- a/tests/SysY2022/performance/transpose0.arm.s +++ b/tests/SysY2022/performance/transpose0.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 matrix: .zero 80000000 -.align 8 +.p2align 3 a: .zero 400000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/transpose0.riscv.s b/tests/SysY2022/performance/transpose0.riscv.s index 31c8d1092..7f9d07ce6 100644 --- a/tests/SysY2022/performance/transpose0.riscv.s +++ b/tests/SysY2022/performance/transpose0.riscv.s @@ -1,13 +1,13 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 matrix: .zero 80000000 -.align 8 +.p2align 3 a: .zero 400000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/transpose1.arm.s b/tests/SysY2022/performance/transpose1.arm.s index 19721bba0..8190f203f 100644 --- a/tests/SysY2022/performance/transpose1.arm.s +++ b/tests/SysY2022/performance/transpose1.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data 
.bss -.align 8 +.p2align 3 matrix: .zero 80000000 -.align 8 +.p2align 3 a: .zero 400000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/transpose1.riscv.s b/tests/SysY2022/performance/transpose1.riscv.s index 31c8d1092..7f9d07ce6 100644 --- a/tests/SysY2022/performance/transpose1.riscv.s +++ b/tests/SysY2022/performance/transpose1.riscv.s @@ -1,13 +1,13 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 matrix: .zero 80000000 -.align 8 +.p2align 3 a: .zero 400000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/transpose2.arm.s b/tests/SysY2022/performance/transpose2.arm.s index 19721bba0..8190f203f 100644 --- a/tests/SysY2022/performance/transpose2.arm.s +++ b/tests/SysY2022/performance/transpose2.arm.s @@ -1,13 +1,13 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 matrix: .zero 80000000 -.align 8 +.p2align 3 a: .zero 400000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/transpose2.riscv.s b/tests/SysY2022/performance/transpose2.riscv.s index 31c8d1092..7f9d07ce6 100644 --- a/tests/SysY2022/performance/transpose2.riscv.s +++ b/tests/SysY2022/performance/transpose2.riscv.s @@ -1,13 +1,13 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .bss -.align 8 +.p2align 3 matrix: .zero 80000000 -.align 8 +.p2align 3 a: .zero 400000 -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 .text diff --git a/tests/SysY2022/performance/vector_mul1.arm.s b/tests/SysY2022/performance/vector_mul1.arm.s index a739fb531..934b20d85 100644 --- a/tests/SysY2022/performance/vector_mul1.arm.s +++ b/tests/SysY2022/performance/vector_mul1.arm.s @@ -1,16 +1,16 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 Vectortm: .zero 400000 -.align 8 +.p2align 3 vectorB: .zero 400000 -.align 8 +.p2align 3 vectorA: .zero 400000 .text @@ -28,11 +28,11 @@ main: movt r1, #1 movw r2, #:lower16:cmmc_parallel_body_0 movt r2, #:upper16:cmmc_parallel_body_0 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA movw r0, #:lower16:cmmc_parallel_body_payload_0 movt r0, #:upper16:cmmc_parallel_body_payload_0 - str r4, [r0, #0] + str r3, [r0, #0] mov r0, #0 bl cmmcParallelFor mov r2, #0 @@ -49,208 +49,120 @@ label108: movt r5, #:upper16:Vectortm mov r0, r1 vmov s0, r1 - mov r3, r5 + mov r4, r5 .p2align 4 label110: add r5, r1, #1 - movw r4, #34464 - movt r4, #1 - cmp r0, r4 + movw r3, #34464 + movt r3, #1 + cmp r0, r3 bge label115 - add r4, r0, #3 + add r3, r0, #3 movw r6, #34464 movt r6, #1 - cmp r4, r6 + cmp r3, r6 bge label752 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA add r6, r1, #2 add r7, r1, #3 add r8, r1, #4 - add r4, r4, r0, lsl #2 + add r3, r3, r0, lsl #2 b label231 -.p2align 4 -label235: - add r4, r4, #16 -.p2align 4 -label231: - add r10, r1, r0 - add r9, r5, r0 - vldr s2, [r4, #0] - mul r10, r10, r9 - add r10, r10, r10, lsr #31 - asr r10, r10, #1 - add r10, r5, r10 - vmov s1, r10 - add r10, r6, r0 - mul r9, r9, r10 - vcvt.f32.s32 s1, s1 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s1, s2, s1 - add r9, r5, r9 - vldr s2, [r4, #4] - vadd.f32 s1, s0, s1 - vmov s0, r9 - add r9, r7, r0 - mul r10, r10, r9 - vcvt.f32.s32 s0, s0 - add r10, 
r10, r10, lsr #31 - asr r10, r10, #1 - vdiv.f32 s0, s2, s0 - add r10, r5, r10 - vadd.f32 s2, s1, s0 - vldr s1, [r4, #8] - vmov s0, r10 - add r10, r8, r0 - add r0, r0, #4 - mul r9, r9, r10 - vcvt.f32.s32 s0, s0 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s0, s1, s0 - add r9, r5, r9 - vadd.f32 s1, s2, s0 - vldr s2, [r4, #12] - vmov s0, r9 - movw r9, #34461 - movt r9, #1 - cmp r0, r9 - vcvt.f32.s32 s0, s0 - vdiv.f32 s0, s2, s0 - vadd.f32 s0, s1, s0 - blt label235 - vmov.f32 s1, s0 -.p2align 4 -label220: - movw r4, #34464 - movt r4, #1 - cmp r0, r4 - bge label758 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA - add r4, r4, r0, lsl #2 -.p2align 4 -label225: - add r6, r1, r0 - add r7, r5, r0 - add r0, r0, #1 - mul r6, r6, r7 - add r6, r6, r6, lsr #31 - asr r6, r6, #1 - add r6, r5, r6 - vmov s0, r6 - movw r6, #34464 - movt r6, #1 - cmp r0, r6 - vcvt.f32.s32 s2, s0 - vldr s0, [r4, #0] - vdiv.f32 s0, s0, s2 - vadd.f32 s0, s1, s0 - bge label758 - add r4, r4, #4 - vmov.f32 s1, s0 - b label225 -.p2align 4 -label115: - vstr s0, [r3, #0] - movw r1, #34464 - movt r1, #1 - cmp r5, r1 - bge label254 - add r3, r3, #4 - mov r1, r5 - b label110 label182: mov r0, #76 bl _sysy_stoptime - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA - mov r0, #0 - mov r1, r4 - vmov s0, r0 + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA + mov r1, #0 + mov r0, r3 + vmov s0, r1 b label183 .p2align 4 label187: - add r1, r1, #64 + add r0, r0, #64 vmov.f32 s0, s1 .p2align 4 label183: movw r3, #:lower16:vectorB movt r3, #:upper16:vectorB - vldr s2, [r1, #0] - add r2, r3, r0, lsl #2 - add r0, r0, #16 - vldr s1, [r2, #0] - vmul.f32 s1, s2, s1 - vadd.f32 s2, s0, s1 - vldr s1, [r1, #4] - vldr s0, [r2, #4] - vmul.f32 s0, s1, s0 - vadd.f32 s0, s2, s0 - vldr s2, [r1, #8] - vldr s1, [r2, #8] - vmul.f32 s1, s2, s1 - vadd.f32 s2, s0, s1 - vldr s0, [r1, #12] - vldr s1, [r2, #12] - vmul.f32 s0, s0, s1 - vadd.f32 s1, s2, s0 - vldr s2, [r1, #16] - vldr s0, [r2, #16] - vmul.f32 s0, s2, s0 - vldr s2, [r1, #20] - vadd.f32 s1, s1, s0 - vldr s0, [r2, #20] - vmul.f32 s0, s2, s0 - vldr s2, [r1, #24] - vadd.f32 s1, s1, s0 - vldr s0, [r2, #24] - vmul.f32 s0, s2, s0 - vadd.f32 s2, s1, s0 - vldr s1, [r1, #28] - vldr s0, [r2, #28] - vmul.f32 s0, s1, s0 - vadd.f32 s1, s2, s0 - vldr s2, [r1, #32] - vldr s0, [r2, #32] - vmul.f32 s0, s2, s0 - vadd.f32 s2, s1, s0 - vldr s0, [r1, #36] - vldr s1, [r2, #36] - vmul.f32 s0, s0, s1 - vldr s1, [r1, #40] - vadd.f32 s2, s2, s0 - vldr s0, [r2, #40] - vmul.f32 s0, s1, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r1, #44] - vldr s1, [r2, #44] - vmul.f32 s0, s0, s1 - vldr s1, [r1, #48] - vadd.f32 s2, s2, s0 - vldr s0, [r2, #48] - vmul.f32 s0, s1, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r1, #52] - vldr s1, [r2, #52] - vmul.f32 s0, s0, s1 - vadd.f32 s1, s2, s0 - vldr s0, [r1, #56] + vldr s1, [r0, #0] + add r2, r3, r1, lsl #2 + add r1, r1, #16 + vldr s2, [r2, #0] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #4] + vldr s2, [r2, #4] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #8] + vldr s2, [r2, #8] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #12] + vldr s2, [r2, #12] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #16] + vldr s2, [r2, #16] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #20] + vldr s2, [r2, #20] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #24] + vldr s2, [r2, #24] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #28] + vldr s2, [r2, #28] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr 
s1, [r0, #32] + vldr s2, [r2, #32] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #36] + vldr s2, [r2, #36] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #40] + vldr s2, [r2, #40] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #44] + vldr s2, [r2, #44] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #48] + vldr s2, [r2, #48] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #52] + vldr s2, [r2, #52] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #56] vldr s2, [r2, #56] - vmul.f32 s0, s0, s2 - vadd.f32 s2, s1, s0 - vldr s0, [r1, #60] - vldr s1, [r2, #60] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #60] + vldr s2, [r2, #60] movw r2, #34464 movt r2, #1 - cmp r0, r2 - vmul.f32 s0, s0, s1 - vadd.f32 s1, s2, s0 + cmp r1, r2 + vmul.f32 s1, s1, s2 + vadd.f32 s1, s0, s1 blt label187 mov r1, #0 vmov s0, r1 mov r0, r3 + b label189 +.p2align 4 +label193: + add r0, r0, #64 .p2align 4 label189: vldr s2, [r0, #0] @@ -259,135 +171,53 @@ label189: movt r2, #1 cmp r1, r2 vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #4] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #4] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #8] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #12] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #16] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #20] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #12] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #16] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #20] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #24] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #28] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #32] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #36] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #28] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #32] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #36] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #40] vmul.f32 s2, s2, s2 vadd.f32 s0, s0, s2 vldr s2, [r0, #44] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #48] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #48] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #52] vmul.f32 s2, s2, s2 vadd.f32 s0, s0, s2 vldr s2, [r0, #56] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #60] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 - bge label194 - add r0, r0, #64 - b label189 -.p2align 4 -label254: - mov r1, #0 - movw r3, #:lower16:vectorB - movt r3, #:upper16:vectorB - mov r0, r1 - vmov s0, r1 - mov r4, r3 -.p2align 4 -label120: - add r6, r1, #1 - movw r3, #34464 - movt r3, #1 - cmp r0, r3 - bge label142 - add r3, r0, #3 - movw r5, #34464 - movt r5, #1 - cmp r3, r5 - bge label267 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r7, r1, #3 - add r8, r1, #4 - add r3, r5, r0, lsl #2 - add r5, r1, #2 -.p2align 4 -label127: - add r9, r1, r0 - add r10, r6, r0 - vldr s2, [r3, #0] - add r11, r0, #1 - mul r9, r9, r10 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - add r9, r9, r11 - add r11, r0, #2 - vmov s1, r9 - add r9, r5, r0 - mul r10, r10, r9 - vcvt.f32.s32 s1, s1 - add r10, r10, r10, lsr #31 - asr r10, r10, #1 - vdiv.f32 s1, s2, s1 - add r10, r10, r11 - vldr s2, [r3, #4] - add r11, r0, #3 - vadd.f32 s0, 
s0, s1 - vmov s1, r10 - add r10, r7, r0 - mul r9, r9, r10 - vcvt.f32.s32 s1, s1 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s1, s2, s1 - add r9, r9, r11 - vldr s2, [r3, #8] - vadd.f32 s0, s0, s1 - vmov s1, r9 - add r9, r8, r0 - add r0, r0, #4 - mul r9, r10, r9 - vcvt.f32.s32 s1, s1 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s1, s2, s1 - add r9, r9, r0 - vldr s2, [r3, #12] - vadd.f32 s0, s0, s1 - vmov s1, r9 - movw r9, #34461 - movt r9, #1 - cmp r0, r9 - vcvt.f32.s32 s1, s1 - vdiv.f32 s1, s2, s1 - vadd.f32 s0, s0, s1 - bge label318 - add r3, r3, #16 - b label127 -label194: + vadd.f32 s0, s0, s2 + vldr s2, [r0, #60] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + blt label193 vdiv.f32 s2, s1, s0 mov r0, #1065353216 movw r1, #14269 @@ -395,25 +225,25 @@ label194: vmov s0, r0 movw r0, #14269 movt r0, #13702 - vsub.f32 s1, s0, s2 - vmov s0, r0 + vmov s1, r0 mov r0, #0 - vcmp.f32 s1, s0 - vmov s0, r1 + vsub.f32 s0, s0, s2 + vcmp.f32 s0, s1 + vmov s1, r1 mov r1, #0 vmrs APSR_nzcv, FPSCR movwgt r0, #1 - vcmp.f32 s1, s0 + vcmp.f32 s0, s1 vmrs APSR_nzcv, FPSCR movwmi r1, #1 orrs r0, r0, r1 beq label636 mov r0, #1065353216 - vmov.f32 s0, s2 - vmov s1, r0 + vmov.f32 s1, s2 + vmov s0, r0 .p2align 4 label197: - vadd.f32 s0, s1, s0 + vadd.f32 s0, s0, s1 mov r0, #1056964608 movw r1, #14269 movt r1, #46470 @@ -422,9 +252,9 @@ label197: movt r0, #13702 vmov s4, r0 mov r0, #0 - vmul.f32 s1, s0, s1 - vdiv.f32 s0, s2, s1 - vsub.f32 s3, s1, s0 + vmul.f32 s0, s0, s1 + vdiv.f32 s1, s2, s0 + vsub.f32 s3, s0, s1 vcmp.f32 s3, s4 vmov s4, r1 mov r1, #0 @@ -435,7 +265,6 @@ label197: movwmi r1, #1 orrs r0, r0, r1 bne label197 - vmov.f32 s0, s1 label195: mov r0, #1065353216 movw r1, #14269 @@ -443,15 +272,15 @@ label195: vmov s1, r0 movw r0, #14269 movt r0, #13702 - vsub.f32 s1, s0, s1 - vmov s0, r0 + vsub.f32 s0, s0, s1 + vmov s1, r0 mov r0, #0 - vcmp.f32 s1, s0 - vmov s0, r1 + vcmp.f32 s0, s1 + vmov s1, r1 mov r1, #0 vmrs APSR_nzcv, FPSCR movwls r0, #1 - vcmp.f32 s1, s0 + vcmp.f32 s0, s1 vmrs APSR_nzcv, FPSCR movwge r1, #1 and r0, r0, r1 @@ -462,68 +291,92 @@ label195: mov r0, #0 pop { r4, r5, r6, r7, r8, r9, r10, r11, pc } .p2align 4 -label267: +label752: mov r3, #0 vmov.f32 s1, s0 vmov s2, r3 vmov.f32 s0, s2 .p2align 4 -label132: +label220: movw r3, #34464 movt r3, #1 cmp r0, r3 - bge label323 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r3, r5, r0, lsl #2 + bge label758 + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA + add r3, r3, r0, lsl #2 .p2align 4 -label137: - add r5, r1, r0 - add r7, r6, r0 - vldr s2, [r3, #0] +label225: + add r6, r1, r0 + add r7, r5, r0 add r0, r0, #1 - mul r5, r5, r7 - add r5, r5, r5, lsr #31 - asr r5, r5, #1 - add r5, r5, r0 - vmov s0, r5 - movw r5, #34464 - movt r5, #1 - cmp r0, r5 - vcvt.f32.s32 s0, s0 - vdiv.f32 s0, s2, s0 + mul r6, r6, r7 + add r6, r6, r6, lsr #31 + asr r6, r6, #1 + add r6, r5, r6 + vmov s0, r6 + movw r6, #34464 + movt r6, #1 + cmp r0, r6 + vcvt.f32.s32 s2, s0 + vldr s0, [r3, #0] + vdiv.f32 s0, s0, s2 vadd.f32 s0, s1, s0 - bge label323 + bge label758 add r3, r3, #4 vmov.f32 s1, s0 - b label137 + b label225 .p2align 4 -label323: +label758: movw r0, #34464 movt r0, #1 .p2align 4 -label142: +label115: vstr s0, [r4, #0] movw r1, #34464 movt r1, #1 + cmp r5, r1 + bge label254 + add r4, r4, #4 + mov r1, r5 + b label110 +.p2align 4 +label254: + mov r1, #0 + movw r3, #:lower16:vectorB + movt r3, #:upper16:vectorB + mov r0, r1 + vmov s0, r1 +.p2align 4 +label120: + add r6, r1, #1 + movw r4, #34464 + movt r4, #1 + cmp r0, r4 + 
bge label142 + add r4, r0, #3 + movw r5, #34464 + movt r5, #1 + cmp r4, r5 + bge label267 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r7, r1, #3 + add r8, r1, #4 + add r4, r5, r0, lsl #2 + add r5, r1, #2 + b label127 +.p2align 4 +label142: + vstr s0, [r3, #0] + movw r1, #34464 + movt r1, #1 cmp r6, r1 bge label344 - add r4, r4, #4 + add r3, r3, #4 mov r1, r6 b label120 .p2align 4 -label758: - movw r0, #34464 - movt r0, #1 - b label115 -.p2align 4 -label752: - mov r4, #0 - vmov.f32 s1, s0 - vmov s2, r4 - vmov.f32 s0, s2 - b label220 -.p2align 4 label344: mov r1, #0 movw r5, #:lower16:Vectortm @@ -531,16 +384,6 @@ label344: mov r0, r1 vmov s0, r1 mov r4, r5 - b label147 -.p2align 4 -label169: - vstr s0, [r4, #0] - movw r1, #34464 - movt r1, #1 - cmp r5, r1 - bge label172 - add r4, r4, #4 - mov r1, r5 .p2align 4 label147: add r5, r1, #1 @@ -560,7 +403,103 @@ label147: add r8, r1, #4 add r3, r3, r0, lsl #2 .p2align 4 -label154: +label154: + add r10, r1, r0 + add r9, r5, r0 + vldr s2, [r3, #0] + mul r10, r10, r9 + add r10, r10, r10, lsr #31 + asr r10, r10, #1 + add r10, r5, r10 + vmov s1, r10 + add r10, r6, r0 + mul r9, r9, r10 + vcvt.f32.s32 s1, s1 + add r9, r9, r9, lsr #31 + asr r9, r9, #1 + vdiv.f32 s1, s2, s1 + add r9, r5, r9 + vldr s2, [r3, #4] + vadd.f32 s0, s0, s1 + vmov s1, r9 + add r9, r7, r0 + mul r10, r10, r9 + vcvt.f32.s32 s1, s1 + add r10, r10, r10, lsr #31 + asr r10, r10, #1 + vdiv.f32 s1, s2, s1 + add r10, r5, r10 + vldr s2, [r3, #8] + vadd.f32 s0, s0, s1 + vmov s1, r10 + add r10, r8, r0 + add r0, r0, #4 + mul r9, r9, r10 + vcvt.f32.s32 s1, s1 + add r9, r9, r9, lsr #31 + asr r9, r9, #1 + vdiv.f32 s1, s2, s1 + add r9, r5, r9 + vldr s2, [r3, #12] + vadd.f32 s0, s0, s1 + vmov s1, r9 + movw r9, #34461 + movt r9, #1 + cmp r0, r9 + vcvt.f32.s32 s1, s1 + vdiv.f32 s1, s2, s1 + vadd.f32 s0, s0, s1 + bge label405 + add r3, r3, #16 + b label154 +.p2align 4 +label267: + mov r4, #0 + vmov.f32 s1, s0 + vmov s2, r4 + vmov.f32 s0, s2 +.p2align 4 +label132: + movw r4, #34464 + movt r4, #1 + cmp r0, r4 + bge label323 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r4, r5, r0, lsl #2 + b label137 +.p2align 4 +label141: + add r4, r4, #4 + vmov.f32 s1, s0 +.p2align 4 +label137: + add r5, r1, r0 + add r7, r6, r0 + vldr s2, [r4, #0] + add r0, r0, #1 + mul r5, r5, r7 + add r5, r5, r5, lsr #31 + asr r5, r5, #1 + add r5, r5, r0 + vmov s0, r5 + movw r5, #34464 + movt r5, #1 + cmp r0, r5 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + blt label141 +.p2align 4 +label323: + movw r0, #34464 + movt r0, #1 + b label142 +.p2align 4 +label235: + add r3, r3, #16 +.p2align 4 +label231: add r10, r1, r0 add r9, r5, r0 vldr s2, [r3, #0] @@ -577,69 +516,46 @@ label154: vdiv.f32 s1, s2, s1 add r9, r5, r9 vldr s2, [r3, #4] - vadd.f32 s0, s0, s1 - vmov s1, r9 + vadd.f32 s1, s0, s1 + vmov s0, r9 add r9, r7, r0 mul r10, r10, r9 - vcvt.f32.s32 s1, s1 + vcvt.f32.s32 s0, s0 add r10, r10, r10, lsr #31 asr r10, r10, #1 - vdiv.f32 s1, s2, s1 + vdiv.f32 s0, s2, s0 add r10, r5, r10 - vldr s2, [r3, #8] - vadd.f32 s0, s0, s1 - vmov s1, r10 + vadd.f32 s2, s1, s0 + vldr s1, [r3, #8] + vmov s0, r10 add r10, r8, r0 add r0, r0, #4 mul r9, r9, r10 - vcvt.f32.s32 s1, s1 + vcvt.f32.s32 s0, s0 add r9, r9, r9, lsr #31 asr r9, r9, #1 - vdiv.f32 s1, s2, s1 + vdiv.f32 s0, s1, s0 add r9, r5, r9 + vadd.f32 s1, s2, s0 vldr s2, [r3, #12] - vadd.f32 s0, s0, s1 - vmov s1, r9 + vmov s0, r9 movw r9, #34461 movt r9, #1 cmp r0, r9 - vcvt.f32.s32 s1, s1 - vdiv.f32 s1, s2, s1 - vadd.f32 s1, s0, s1 - 
bge label405 - add r3, r3, #16 - vmov.f32 s0, s1 - b label154 -.p2align 4 -label172: - mov r1, #0 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA - mov r0, r1 - vmov s0, r1 - mov r3, r4 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + blt label235 + vmov.f32 s1, s0 + b label220 .p2align 4 -label173: - add r6, r1, #1 - movw r4, #34464 - movt r4, #1 - cmp r0, r4 - bge label178 - add r4, r0, #3 - movw r5, #34464 - movt r5, #1 - cmp r4, r5 - bge label673 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r7, r1, #3 - add r8, r1, #4 - add r4, r5, r0, lsl #2 - add r5, r1, #2 +label131: + add r4, r4, #16 .p2align 4 -label213: +label127: add r10, r1, r0 add r9, r6, r0 + vldr s2, [r4, #0] add r11, r0, #1 mul r10, r10, r9 add r10, r10, r10, lsr #31 @@ -649,49 +565,51 @@ label213: vmov s1, r10 add r10, r5, r0 mul r9, r9, r10 - vcvt.f32.s32 s2, s1 + vcvt.f32.s32 s1, s1 add r9, r9, r9, lsr #31 - vldr s1, [r4, #0] asr r9, r9, #1 + vdiv.f32 s1, s2, s1 add r9, r9, r11 + vldr s2, [r4, #4] add r11, r0, #3 - vdiv.f32 s1, s1, s2 - vadd.f32 s2, s0, s1 - vldr s1, [r4, #4] - vmov s0, r9 + vadd.f32 s0, s0, s1 + vmov s1, r9 add r9, r7, r0 mul r10, r10, r9 - vcvt.f32.s32 s0, s0 + vcvt.f32.s32 s1, s1 add r10, r10, r10, lsr #31 asr r10, r10, #1 - vdiv.f32 s0, s1, s0 + vdiv.f32 s1, s2, s1 add r10, r10, r11 - vadd.f32 s1, s2, s0 vldr s2, [r4, #8] - vmov s0, r10 + vadd.f32 s0, s0, s1 + vmov s1, r10 add r10, r8, r0 add r0, r0, #4 mul r9, r9, r10 - vcvt.f32.s32 s0, s0 + vcvt.f32.s32 s1, s1 add r9, r9, r9, lsr #31 asr r9, r9, #1 - vdiv.f32 s0, s2, s0 + vdiv.f32 s1, s2, s1 add r9, r9, r0 vldr s2, [r4, #12] - vadd.f32 s1, s1, s0 - vmov s0, r9 + vadd.f32 s0, s0, s1 + vmov s1, r9 movw r9, #34461 movt r9, #1 cmp r0, r9 - vcvt.f32.s32 s0, s0 - vdiv.f32 s0, s2, s0 - vadd.f32 s0, s1, s0 - bge label745 - add r4, r4, #16 - b label213 + vcvt.f32.s32 s1, s1 + vdiv.f32 s1, s2, s1 + vadd.f32 s0, s0, s1 + blt label131 + vmov.f32 s1, s0 + b label132 .p2align 4 -label405: - vmov.f32 s0, s1 +label357: + mov r3, #0 + vmov.f32 s1, s0 + vmov s2, r3 + vmov.f32 s0, s2 .p2align 4 label159: movw r3, #34464 @@ -701,6 +619,11 @@ label159: movw r3, #:lower16:vectorB movt r3, #:upper16:vectorB add r3, r3, r0, lsl #2 + b label164 +.p2align 4 +label168: + add r3, r3, #4 + vmov.f32 s1, s0 .p2align 4 label164: add r6, r1, r0 @@ -711,32 +634,40 @@ label164: add r6, r6, r6, lsr #31 asr r6, r6, #1 add r6, r5, r6 - vmov s1, r6 + vmov s0, r6 movw r6, #34464 movt r6, #1 cmp r0, r6 - vcvt.f32.s32 s1, s1 - vdiv.f32 s1, s2, s1 - vadd.f32 s0, s0, s1 - bge label426 - add r3, r3, #4 - b label164 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + blt label168 .p2align 4 -label745: - vmov.f32 s1, s0 +label410: + movw r0, #34464 + movt r0, #1 .p2align 4 -label202: - movw r4, #34464 - movt r4, #1 - cmp r0, r4 - bge label679 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r4, r5, r0, lsl #2 - b label207 +label169: + vstr s0, [r4, #0] + movw r1, #34464 + movt r1, #1 + cmp r5, r1 + bge label172 + add r4, r4, #4 + mov r1, r5 + b label147 +.p2align 4 +label172: + mov r1, #0 + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA + mov r0, r1 + vmov s0, r1 + mov r4, r3 + b label173 .p2align 4 label211: - add r4, r4, #4 + add r3, r3, #4 vmov.f32 s1, s0 .p2align 4 label207: @@ -752,7 +683,7 @@ label207: movt r5, #1 cmp r0, r5 vcvt.f32.s32 s2, s0 - vldr s0, [r4, #0] + vldr s0, [r3, #0] vdiv.f32 s0, s0, s2 vadd.f32 s0, s1, s0 blt label211 @@ -762,44 +693,111 @@ label679: movt r0, #1 .p2align 4 
label178: - vstr s0, [r3, #0] + vstr s0, [r4, #0] movw r1, #34464 movt r1, #1 cmp r6, r1 bge label181 - add r3, r3, #4 + add r4, r4, #4 mov r1, r6 - b label173 +.p2align 4 +label173: + add r6, r1, #1 + movw r3, #34464 + movt r3, #1 + cmp r0, r3 + bge label178 + add r3, r0, #3 + movw r5, #34464 + movt r5, #1 + cmp r3, r5 + bge label673 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r7, r1, #3 + add r8, r1, #4 + add r3, r5, r0, lsl #2 + add r5, r1, #2 +.p2align 4 +label213: + add r9, r1, r0 + add r10, r6, r0 + add r11, r0, #1 + mul r9, r9, r10 + add r9, r9, r9, lsr #31 + asr r9, r9, #1 + add r9, r9, r11 + add r11, r0, #2 + vmov s1, r9 + add r9, r5, r0 + mul r10, r10, r9 + vcvt.f32.s32 s2, s1 + add r10, r10, r10, lsr #31 + vldr s1, [r3, #0] + asr r10, r10, #1 + add r10, r10, r11 + add r11, r0, #3 + vdiv.f32 s1, s1, s2 + vadd.f32 s2, s0, s1 + vmov s0, r10 + add r10, r7, r0 + mul r9, r9, r10 + vcvt.f32.s32 s1, s0 + add r9, r9, r9, lsr #31 + vldr s0, [r3, #4] + asr r9, r9, #1 + add r9, r9, r11 + vdiv.f32 s0, s0, s1 + vldr s1, [r3, #8] + vadd.f32 s2, s2, s0 + vmov s0, r9 + add r9, r8, r0 + add r0, r0, #4 + mul r9, r10, r9 + vcvt.f32.s32 s0, s0 + add r9, r9, r9, lsr #31 + asr r9, r9, #1 + vdiv.f32 s0, s1, s0 + add r9, r9, r0 + vadd.f32 s1, s2, s0 + vldr s2, [r3, #12] + vmov s0, r9 + movw r9, #34461 + movt r9, #1 + cmp r0, r9 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + bge label745 + add r3, r3, #16 + b label213 +.p2align 4 +label745: + vmov.f32 s1, s0 +.p2align 4 +label202: + movw r3, #34464 + movt r3, #1 + cmp r0, r3 + bge label679 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r3, r5, r0, lsl #2 + b label207 .p2align 4 label673: - mov r4, #0 + mov r3, #0 vmov.f32 s1, s0 - vmov s2, r4 + vmov s2, r3 vmov.f32 s0, s2 b label202 -.p2align 4 -label426: - movw r0, #34464 - movt r0, #1 - b label169 -.p2align 4 -label410: - movw r0, #34464 - movt r0, #1 - vmov.f32 s0, s1 - b label169 label636: mov r0, #1065353216 vmov s0, r0 b label195 .p2align 4 -label318: +label405: vmov.f32 s1, s0 - b label132 -.p2align 4 -label357: - mov r3, #0 - vmov s1, r3 b label159 .p2align 4 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/vector_mul1.riscv.s b/tests/SysY2022/performance/vector_mul1.riscv.s index 3198622f7..97bf7f282 100644 --- a/tests/SysY2022/performance/vector_mul1.riscv.s +++ b/tests/SysY2022/performance/vector_mul1.riscv.s @@ -1,668 +1,805 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 897988541 .4byte 3045472189 .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 Vectortm: .zero 400000 -.align 8 +.p2align 3 vectorB: .zero 400000 -.align 8 +.p2align 3 vectorA: .zero 400000 .text .p2align 2 .globl main main: - addi sp, sp, -72 + addi sp, sp, -56 li a0, 62 sd ra, 0(sp) - sd s2, 8(sp) - sd s1, 16(sp) - sd s6, 24(sp) - sd s4, 32(sp) - sd s3, 40(sp) - sd s0, 48(sp) - sd s5, 56(sp) - sd s7, 64(sp) + sd s1, 8(sp) + sd s0, 16(sp) + sd s5, 24(sp) + sd s3, 32(sp) + sd s2, 40(sp) + sd s4, 48(sp) jal _sysy_starttime lui a3, 24 -pcrel1032: +pcrel1055: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel1033: +pcrel1056: auipc a1, %pcrel_hi(vectorA) - addi s2, a1, %pcrel_lo(pcrel1033) -pcrel1034: + addi s1, a1, %pcrel_lo(pcrel1056) +pcrel1057: auipc a1, %pcrel_hi(cmmc_parallel_body_0) - sd s2, %pcrel_lo(pcrel1032)(a0) - addi a2, a1, %pcrel_lo(pcrel1034) + sd s1, %pcrel_lo(pcrel1055)(a0) + 
addi a2, a1, %pcrel_lo(pcrel1057) mv a0, zero addiw a1, a3, 1696 jal cmmcParallelFor + lui s3, 258048 + mv a2, zero lui a4, 24 - lui s4, 258048 - mv a3, zero -pcrel1035: - auipc a2, %pcrel_hi(Vectortm) -pcrel1036: +pcrel1058: + auipc a3, %pcrel_hi(Vectortm) +pcrel1059: auipc a0, %pcrel_hi(vectorB) - addi a1, a2, %pcrel_lo(pcrel1035) - addi s1, a0, %pcrel_lo(pcrel1036) -pcrel1037: - auipc a2, %pcrel_hi(__cmmc_fp_constant_pool) + addi a1, a3, %pcrel_lo(pcrel1058) + addi s0, a0, %pcrel_lo(pcrel1059) +pcrel1060: + auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) addiw a0, a4, 1693 - addi s3, a2, %pcrel_lo(pcrel1037) - addi s0, a0, 3 - li a2, 1000 + addi s2, a3, %pcrel_lo(pcrel1060) j label110 .p2align 2 -label958: - addiw a3, a3, 1 - bge a3, a2, label184 +label981: + addiw a2, a2, 1 + li a3, 1000 + bge a2, a3, label184 .p2align 2 label110: fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label117 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label117 fsw f10, 0(a1) - blt a4, s0, label237 - mv a5, s1 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label237 + mv a4, s0 + mv a5, zero + addiw t0, t1, 1696 + blt zero, t0, label220 j label143 .p2align 2 label277: - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label953 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label976 .p2align 2 label237: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label970 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label993 .p2align 2 label117: - addiw t1, t5, 3 - bge t1, s0, label256 - sh2add t1, t5, s2 - addiw t2, t0, 2 - addiw t3, t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge t0, t1, label256 + sh2add t0, t4, s1 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 j label129 .p2align 2 label133: - addi t1, t1, 16 + addi t0, t0, 16 .p2align 2 label129: - addw a7, t0, t5 - addw a6, a4, t5 - flw f14, 0(t1) - mulw t6, a7, a6 - srliw s6, t6, 31 - add s5, t6, s6 - sraiw a7, s5, 1 - addw t6, a4, a7 + addw a6, a5, t4 + addw t5, a3, t4 + flw f14, 0(t0) + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + sraiw a6, s4, 1 + addw t6, a3, a6 fcvt.s.w f12, t6 - addw t6, t2, t5 + addw t6, t1, t4 fdiv.s f13, f14, f12 - mulw a7, a6, t6 - srliw s6, a7, 31 - add s7, a7, s6 - sraiw s5, s7, 1 - addw a6, a4, s5 + mulw a6, t5, t6 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw s5, a7, 1 + addw t5, a3, s5 + fcvt.s.w f12, t5 + addw t5, t2, t4 + mulw a6, t6, t5 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw t6, a7, 1 + addw a6, a3, t6 fadd.s f11, f10, f13 - flw f13, 4(t1) - fcvt.s.w f10, a6 - addw a6, t3, t5 - fdiv.s f14, f13, f10 - mulw a7, t6, a6 - flw f13, 8(t1) - srliw s6, a7, 31 - add s5, a7, s6 - addw a7, t4, t5 - sraiw s7, s5, 1 - addiw t5, t5, 4 - addw t6, a4, s7 - fadd.s f12, f11, f14 - fcvt.s.w f11, t6 - mulw t6, a6, a7 - fdiv.s f14, f13, f11 - srliw s5, t6, 31 - add a7, t6, s5 - sraiw s6, a7, 1 - addw a6, a4, s6 - fcvt.s.w f11, a6 - fadd.s f10, f12, f14 - flw f12, 12(t1) - fdiv.s f13, f12, f11 - fadd.s f10, f10, f13 - blt t5, a0, label133 + flw f13, 4(t0) + fdiv.s f14, f13, f12 + flw f13, 8(t0) + fcvt.s.w f12, a6 + addw a6, t3, t4 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw a7, t6, 31 + add s4, t6, a7 + sraiw t5, s4, 1 + addw a6, a3, t5 + fadd.s f10, f11, f14 + fdiv.s f14, f13, f12 + flw f13, 12(t0) + fcvt.s.w f12, a6 + fadd.s f11, f10, f14 + fdiv.s f14, f13, f12 + fadd.s f10, f11, f14 + blt t4, a0, 
label133 fmv.s f11, f10 - bge t5, s0, label954 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label977 .p2align 2 label122: - sh2add t1, t5, s2 - mv t2, t5 + sh2add t0, t4, s1 + mv t1, t4 fmv.s f10, f11 .p2align 2 label123: - addw t4, t0, t2 - addw a6, a4, t2 - flw f12, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, a6 - srliw t5, t3, 31 - add t6, t3, t5 + addw t3, a5, t1 + addw t4, a3, t1 + flw f12, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t4 + srliw t5, t2, 31 + add t6, t2, t5 sraiw t4, t6, 1 - addw a6, a4, t4 - fcvt.s.w f11, a6 + addw t3, a3, t4 + fcvt.s.w f11, t3 + lui t3, 24 fdiv.s f13, f12, f11 + addiw t2, t3, 1696 fadd.s f10, f10, f13 - bge t2, s0, label277 - addi t1, t1, 4 + bge t1, t2, label277 + addi t0, t0, 4 j label123 .p2align 2 label768: - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label968 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label991 .p2align 2 label146: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label955 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label978 .p2align 2 label220: - addiw t1, t5, 3 - bge t1, s0, label747 - sh2add t1, t5, a1 - addiw t2, t0, 2 - addiw t3, t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge t0, t1, label747 + sh2add t0, t4, a1 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 j label232 .p2align 2 label236: - addi t1, t1, 16 + addi t0, t0, 16 .p2align 2 label232: - addw a7, t0, t5 - addw a6, a4, t5 - flw f14, 0(t1) - addiw s7, t5, 3 - mulw t6, a7, a6 - srliw s6, t6, 31 - add s5, t6, s6 - addiw t6, t5, 1 - sraiw a7, s5, 1 - addw s6, a7, t6 - addw t6, t2, t5 - mulw a7, a6, t6 - fcvt.s.w f12, s6 - srliw s5, a7, 31 - add s6, a7, s5 + addw a6, a5, t4 + addw t5, a3, t4 + flw f14, 0(t0) + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + addiw t6, t4, 1 + sraiw a6, s4, 1 + addw a7, a6, t6 + addw t6, t1, t4 + mulw a6, t5, t6 + fcvt.s.w f12, a7 + srliw a7, a6, 31 fdiv.s f13, f14, f12 - addiw a7, t5, 2 - sraiw a6, s6, 1 - flw f14, 4(t1) - addw s5, a6, a7 - addw a6, t3, t5 - mulw a7, t6, a6 - fcvt.s.w f12, s5 - srliw s6, a7, 31 - add s5, a7, s6 - sraiw t6, s5, 1 - addw a7, t6, s7 + add s4, a6, a7 + flw f14, 4(t0) + addiw a7, t4, 2 + sraiw t5, s4, 1 + addw a6, t5, a7 + addw t5, t2, t4 + fcvt.s.w f12, a6 + mulw a6, t6, t5 + srliw a7, a6, 31 + add s4, a6, a7 + addiw a7, t4, 3 + sraiw t6, s4, 1 + addw a6, t6, a7 fadd.s f11, f10, f13 fdiv.s f13, f14, f12 - flw f14, 8(t1) - fcvt.s.w f12, a7 - addw a7, t4, t5 - addiw t5, t5, 4 - mulw t6, a6, a7 - srliw s5, t6, 31 - add a7, t6, s5 - sraiw a6, a7, 1 - addw t6, a6, t5 + flw f14, 8(t0) + fcvt.s.w f12, a6 + addw a6, t3, t4 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw a7, t6, 31 + add a6, t6, a7 + sraiw t5, a6, 1 + addw t6, t5, t4 fadd.s f10, f11, f13 fdiv.s f13, f14, f12 - flw f14, 12(t1) + flw f14, 12(t0) fcvt.s.w f12, t6 fadd.s f11, f10, f13 fdiv.s f13, f14, f12 fadd.s f10, f11, f13 - blt t5, a0, label236 + blt t4, a0, label236 fmv.s f12, f10 - bge t5, s0, label969 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label992 .p2align 2 label225: - sh2add t1, t5, a1 - mv t2, t5 + sh2add t0, t4, a1 + mv t1, t4 fmv.s f10, f12 .p2align 2 label226: - addw t4, t0, t2 - addw t5, a4, t2 - flw f13, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, t5 - srliw t6, t3, 31 - add a6, t3, t6 - sraiw t4, a6, 1 - addw t3, t4, t2 - fcvt.s.w f11, t3 + addw t3, a5, t1 + addw t4, a3, t1 + flw f13, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t4 + srliw t6, t2, 31 + add t5, t2, t6 + sraiw t3, t5, 1 + addw 
t2, t3, t1 + lui t3, 24 + fcvt.s.w f11, t2 + addiw t2, t3, 1696 fdiv.s f12, f13, f11 fadd.s f10, f10, f12 - bge t2, s0, label768 - addi t1, t1, 4 + bge t1, t2, label768 + addi t0, t0, 4 j label226 .p2align 2 label256: fmv.w.x f12, zero fmv.s f11, f10 fmv.s f10, f12 - blt t5, s0, label122 - mv t5, s0 - fsw f12, 0(a5) - blt a4, s0, label237 + lui t1, 24 + addiw t0, t1, 1696 + blt t4, t0, label122 + lui a5, 24 + lui t0, 24 + fsw f12, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label237 fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + addiw t0, t1, 1696 + blt zero, t0, label220 j label143 .p2align 2 label747: fmv.w.x f11, zero fmv.s f12, f10 fmv.s f10, f11 - blt t5, s0, label225 - mv t5, s0 - fsw f11, 0(a5) - blt a4, s0, label146 + lui t1, 24 + addiw t0, t1, 1696 + blt t4, t0, label225 + lui a5, 24 + lui t0, 24 + fsw f11, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label146 fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + addiw t0, t1, 1696 + blt zero, t0, label203 fsw f10, 0(a1) - blt a4, s0, label202 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 j label156 .p2align 2 label219: - addi t1, t1, 4 + addi t0, t0, 4 .p2align 2 label215: - addw t4, t0, t2 - addw t6, a4, t2 - flw f13, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, t6 - srliw a6, t3, 31 - add t5, t3, a6 - sraiw t4, t5, 1 - addw t6, a4, t4 - fcvt.s.w f11, t6 + addw t3, a5, t1 + addw t4, a3, t1 + flw f13, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t4 + srliw t6, t2, 31 + add t5, t2, t6 + sraiw t3, t5, 1 + addw t4, a3, t3 + lui t3, 24 + addiw t2, t3, 1696 + fcvt.s.w f11, t4 fdiv.s f12, f13, f11 fadd.s f10, f10, f12 - blt t2, s0, label219 - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label966 + blt t1, t2, label219 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label989 .p2align 2 label202: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label962 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label985 .p2align 2 label203: - addiw t1, t5, 3 - bge t1, s0, label673 - sh2add t1, t5, s1 - addiw t2, t0, 2 - addiw t3, t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge t0, t1, label673 + sh2add t0, t4, s0 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 .p2align 2 label205: - addw a7, t0, t5 - addw t6, a4, t5 - flw f13, 0(t1) - mulw a6, a7, t6 - srliw s5, a6, 31 - add a7, a6, s5 - sraiw s6, a7, 1 - addw a6, a4, s6 - fcvt.s.w f11, a6 - addw a6, t2, t5 - fdiv.s f14, f13, f11 - mulw a7, t6, a6 - flw f13, 4(t1) - srliw s5, a7, 31 - add s6, a7, s5 - sraiw t6, s6, 1 - addw a7, a4, t6 - addw t6, t3, t5 - fadd.s f12, f10, f14 - fcvt.s.w f10, a7 - mulw a7, a6, t6 - fdiv.s f14, f13, f10 - srliw s5, a7, 31 - add s6, a7, s5 - sraiw a6, s6, 1 - addw a7, a4, a6 - fadd.s f11, f12, f14 - flw f14, 8(t1) - fcvt.s.w f12, a7 - addw a7, t4, t5 + addw a6, a5, t4 + addw t5, a3, t4 + flw f13, 0(t0) + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + sraiw a6, s4, 1 + addw t6, a3, a6 + fcvt.s.w f12, t6 + addw t6, t1, t4 + fdiv.s f14, f13, f12 + mulw a6, t5, t6 + flw f13, 4(t0) + srliw a7, a6, 31 + add t5, a6, a7 + sraiw s4, t5, 1 + addw t5, t2, t4 + addw a6, a3, s4 + fcvt.s.w f12, a6 + mulw a6, t6, t5 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw t6, a7, 1 + addw a6, a3, t6 + fadd.s f11, f10, f14 + 
fdiv.s f14, f13, f12 + flw f13, 8(t0) + fcvt.s.w f12, a6 + addw a6, t3, t4 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw a7, t6, 31 + add t5, t6, a7 + sraiw s4, t5, 1 + addw a6, a3, s4 + fadd.s f10, f11, f14 + fdiv.s f14, f13, f12 + fcvt.s.w f12, a6 + fadd.s f11, f10, f14 + flw f14, 12(t0) fdiv.s f13, f14, f12 - addiw t5, t5, 4 - mulw a6, t6, a7 - srliw s6, a6, 31 - add a7, a6, s6 - sraiw s5, a7, 1 - addw t6, a4, s5 fadd.s f10, f11, f13 - flw f13, 12(t1) - fcvt.s.w f11, t6 - fdiv.s f12, f13, f11 - fadd.s f10, f10, f12 - bge t5, a0, label720 - addi t1, t1, 16 + bge t4, a0, label720 + addi t0, t0, 16 j label205 .p2align 2 label720: fmv.s f11, f10 - bge t5, s0, label964 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label987 .p2align 2 label214: - sh2add t1, t5, s1 - mv t2, t5 + sh2add t0, t4, s0 + mv t1, t4 fmv.s f10, f11 j label215 .p2align 2 label673: fmv.w.x f12, zero fmv.s f11, f10 + lui t1, 24 fmv.s f10, f12 - blt t5, s0, label214 - mv t5, s0 - fsw f12, 0(a5) - blt a4, s0, label202 + addiw t0, t1, 1696 + blt t4, t0, label214 + lui a5, 24 + lui t0, 24 + fsw f12, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label202 fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + addiw t0, t1, 1696 + blt zero, t0, label162 j label356 .p2align 2 label172: - addi t1, t1, 4 + addi t0, t0, 4 .p2align 2 label168: - addw t4, t0, t2 - addw t6, a4, t2 - flw f12, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, t6 - srliw a6, t3, 31 - add t5, t3, a6 - sraiw t4, t5, 1 - addw t3, t4, t2 - fcvt.s.w f11, t3 + addw t3, a5, t1 + addw t6, a3, t1 + flw f12, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t6 + srliw t4, t2, 31 + add t5, t2, t4 + sraiw t3, t5, 1 + addw t2, t3, t1 + lui t3, 24 + fcvt.s.w f11, t2 + addiw t2, t3, 1696 fdiv.s f13, f12, f11 fadd.s f10, f10, f13 - blt t2, s0, label172 - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label958 + blt t1, t2, label172 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label981 .p2align 2 label182: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label961 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label984 .p2align 2 label162: - addiw t1, t5, 3 - bge t1, s0, label361 - sh2add t1, t5, a1 - addiw t2, t0, 2 - addiw t3, t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge t0, t1, label361 + sh2add t0, t4, a1 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 .p2align 2 label174: - addw a7, t0, t5 - addw a6, a4, t5 - addiw s6, t5, 1 - mulw t6, a7, a6 - flw f13, 0(t1) - srliw s5, t6, 31 - add s7, t6, s5 - sraiw a7, s7, 1 - addiw s7, t5, 3 - addw t6, a7, s6 + addw a6, a5, t4 + addw t5, a3, t4 + flw f13, 0(t0) + addiw s5, t4, 3 + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + addiw a7, t4, 1 + sraiw a6, s4, 1 + addw t6, a6, a7 fcvt.s.w f12, t6 - addw t6, t2, t5 + addw t6, t1, t4 fdiv.s f14, f13, f12 - mulw a7, a6, t6 - flw f13, 4(t1) - srliw s6, a7, 31 - add s5, a7, s6 - addiw a7, t5, 2 - sraiw a6, s5, 1 - addw s6, a6, a7 - addw a6, t3, t5 - mulw a7, t6, a6 - fcvt.s.w f12, s6 - srliw s6, a7, 31 - add s5, a7, s6 - sraiw t6, s5, 1 - addw a7, t6, s7 + mulw a6, t5, t6 + flw f13, 4(t0) + srliw a7, a6, 31 + add s4, a6, a7 + addiw a7, t4, 2 + sraiw t5, s4, 1 + addw a6, t5, a7 + addw t5, t2, t4 fadd.s f11, f10, f14 - fdiv.s f14, f13, f12 - fcvt.s.w f12, a7 - addw a7, t4, t5 - addiw t5, t5, 4 - mulw t6, a6, a7 - srliw s5, t6, 31 - add a7, t6, s5 - 
sraiw a6, a7, 1 - addw t6, a6, t5 - fadd.s f10, f11, f14 - flw f14, 8(t1) - fdiv.s f13, f14, f12 - fcvt.s.w f12, t6 - fadd.s f11, f10, f13 - flw f13, 12(t1) - fdiv.s f14, f13, f12 - fadd.s f10, f11, f14 - bge t5, a0, label432 - addi t1, t1, 16 + fcvt.s.w f10, a6 + mulw a6, t6, t5 + fdiv.s f14, f13, f10 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw t6, a7, 1 + addw a6, t6, s5 + fadd.s f12, f11, f14 + flw f14, 8(t0) + fcvt.s.w f11, a6 + addw a6, t3, t4 + fdiv.s f13, f14, f11 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw s4, t6, 31 + add a7, t6, s4 + sraiw t5, a7, 1 + addw t6, t5, t4 + fcvt.s.w f11, t6 + fadd.s f10, f12, f13 + flw f12, 12(t0) + fdiv.s f13, f12, f11 + fadd.s f10, f10, f13 + bge t4, a0, label432 + addi t0, t0, 16 j label174 .p2align 2 label432: - fmv.s f12, f10 - bge t5, s0, label959 + fmv.s f11, f10 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label982 .p2align 2 label167: - sh2add t1, t5, a1 - mv t2, t5 - fmv.s f10, f12 + sh2add t0, t4, a1 + mv t1, t4 + fmv.s f10, f11 j label168 .p2align 2 label361: - fmv.w.x f11, zero - fmv.s f12, f10 - fmv.s f10, f11 - blt t5, s0, label167 - mv t5, s0 - fsw f11, 0(a5) - blt a4, s0, label182 - addiw a3, a3, 1 - blt a3, a2, label110 + fmv.w.x f12, zero + fmv.s f11, f10 + fmv.s f10, f12 + lui t1, 24 + addiw t0, t1, 1696 + blt t4, t0, label167 + lui a5, 24 + lui t0, 24 + fsw f12, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label182 + addiw a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label959: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label182 - addiw a3, a3, 1 - blt a3, a2, label110 +label982: + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label182 + addiw a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label954: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label237 +label977: + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label237 fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 - fsw f10, 0(s1) - blt a4, s0, label146 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label220 + fsw f10, 0(s0) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 j label339 .p2align 2 -label964: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label202 +label987: + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label202 fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 - fsw f10, 0(s2) - blt a4, s0, label182 - j label956 -.p2align 2 -label969: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label146 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label162 + fsw f10, 0(s1) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 + j label979 +.p2align 2 +label992: + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label146 fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 fsw f10, 0(a1) - blt a4, s0, label202 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 j label156 .p2align 2 -label961: - fsw f10, 0(a5) - blt a4, s0, label182 - addiw a3, a3, 1 - blt a3, a2, label110 +label984: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 + addiw 
a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label970: - fsw f10, 0(a5) - blt a4, s0, label237 +label993: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label237 fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label220 label143: - fsw f10, 0(a5) - blt a4, s0, label146 + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 j label339 .p2align 2 -label962: - fsw f10, 0(a5) - blt a4, s0, label202 +label985: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label162 label356: - fsw f10, 0(a5) - blt a4, s0, label182 - j label956 + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 + j label979 .p2align 2 -label955: - fsw f10, 0(a5) - blt a4, s0, label146 +label978: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 label153: - fsw f10, 0(a5) - blt a4, s0, label202 + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 j label156 .p2align 2 -label953: +label976: fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 - fsw f10, 0(s1) - blt a4, s0, label146 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label220 + fsw f10, 0(s0) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 label339: fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 j label153 .p2align 2 -label966: +label989: fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 - fsw f10, 0(s2) - blt a4, s0, label182 -label956: - addiw a3, a3, 1 - blt a3, a2, label110 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label162 + fsw f10, 0(s1) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 +label979: + addiw a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label968: +label991: fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 fsw f10, 0(a1) - blt a4, s0, label202 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 label156: fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label162 j label356 label184: li a0, 76 @@ -672,154 +809,158 @@ label184: j label185 .p2align 2 label189: - addi s2, s2, 64 + addi s1, s1, 64 .p2align 2 label185: - sh2add a1, a0, s1 - flw f12, 0(s2) + sh2add a1, a0, s0 + flw f12, 0(s1) + lui a2, 24 addiw a0, a0, 16 flw f14, 0(a1) - flw f13, 4(s2) + flw f13, 4(s1) fmul.s f15, f12, f14 flw f14, 4(a1) - flw f12, 8(s2) + flw f12, 8(s1) flw f0, 8(a1) fadd.s f11, f10, f15 fmul.s 
f15, f13, f14 - flw f13, 12(s2) + flw f13, 12(s1) fmul.s f14, f12, f0 fadd.s f10, f11, f15 flw f15, 12(a1) - flw f12, 16(s2) + flw f12, 16(s1) fadd.s f11, f10, f14 fmul.s f14, f13, f15 flw f13, 16(a1) fmul.s f15, f12, f13 fadd.s f10, f11, f14 - flw f14, 20(s2) + flw f14, 20(s1) flw f12, 20(a1) - flw f13, 24(s2) + flw f13, 24(s1) fadd.s f11, f10, f15 fmul.s f15, f14, f12 flw f14, 24(a1) - flw f12, 28(s2) + flw f12, 28(s1) fadd.s f10, f11, f15 fmul.s f15, f13, f14 flw f14, 28(a1) - flw f13, 32(s2) + flw f13, 32(s1) fmul.s f0, f12, f14 fadd.s f11, f10, f15 flw f15, 32(a1) - flw f14, 36(s2) + flw f14, 36(s1) fadd.s f10, f11, f0 fmul.s f11, f13, f15 flw f15, 36(a1) - flw f13, 40(s2) + flw f13, 40(s1) fadd.s f12, f10, f11 fmul.s f10, f14, f15 flw f14, 40(a1) fmul.s f0, f13, f14 fadd.s f11, f12, f10 - flw f12, 44(s2) + flw f12, 44(s1) flw f15, 44(a1) - flw f13, 48(s2) + flw f13, 48(s1) fmul.s f14, f12, f15 fadd.s f10, f11, f0 flw f15, 48(a1) fmul.s f12, f13, f15 fadd.s f11, f10, f14 - flw f14, 52(s2) + flw f14, 52(s1) flw f15, 52(a1) - flw f13, 56(s2) + flw f13, 56(s1) fadd.s f10, f11, f12 fmul.s f11, f14, f15 flw f15, 56(a1) - flw f14, 60(s2) + flw f14, 60(s1) fadd.s f12, f10, f11 fmul.s f10, f13, f15 flw f13, 60(a1) + addiw a1, a2, 1696 fadd.s f11, f12, f10 fmul.s f12, f14, f13 fadd.s f10, f11, f12 - blt a0, s0, label189 + blt a0, a1, label189 fmv.w.x f11, zero mv a0, zero j label191 .p2align 2 label195: - addi s1, s1, 64 + addi s0, s0, 64 .p2align 2 label191: - flw f13, 0(s1) + flw f13, 0(s0) addiw a0, a0, 16 - flw f14, 4(s1) + lui a2, 24 + flw f14, 4(s0) fmul.s f15, f13, f13 + addiw a1, a2, 1696 fmul.s f1, f14, f14 fadd.s f12, f11, f15 - flw f15, 8(s1) - flw f14, 12(s1) + flw f15, 8(s0) + flw f14, 12(s0) fmul.s f0, f15, f15 fadd.s f13, f12, f1 fmul.s f1, f14, f14 fadd.s f11, f13, f0 - flw f13, 16(s1) - flw f14, 20(s1) + flw f13, 16(s0) + flw f14, 20(s0) fmul.s f15, f13, f13 fmul.s f0, f14, f14 - flw f13, 24(s1) + flw f13, 24(s0) fadd.s f12, f11, f1 - flw f14, 28(s1) + flw f14, 28(s0) fadd.s f11, f12, f15 fmul.s f15, f13, f13 fadd.s f12, f11, f0 fmul.s f0, f14, f14 fadd.s f11, f12, f15 - flw f15, 32(s1) - flw f14, 36(s1) + flw f15, 32(s0) + flw f14, 36(s0) fmul.s f1, f15, f15 fadd.s f13, f11, f0 fmul.s f0, f14, f14 fadd.s f12, f13, f1 - flw f13, 40(s1) - flw f14, 44(s1) + flw f13, 40(s0) + flw f14, 44(s0) fmul.s f15, f13, f13 - flw f13, 48(s1) + flw f13, 48(s0) fadd.s f11, f12, f0 fmul.s f0, f14, f14 - flw f14, 52(s1) + flw f14, 52(s0) fadd.s f12, f11, f15 fmul.s f15, f13, f13 - flw f13, 56(s1) + flw f13, 56(s0) fadd.s f11, f12, f0 fmul.s f0, f14, f14 - flw f14, 60(s1) + flw f14, 60(s0) fadd.s f12, f11, f15 fmul.s f15, f13, f13 fmul.s f13, f14, f14 fadd.s f11, f12, f0 fadd.s f12, f11, f15 fadd.s f11, f12, f13 - blt a0, s0, label195 + blt a0, a1, label195 fdiv.s f10, f10, f11 lui a0, 260096 - flw f13, 0(s3) + flw f13, 0(s2) fmv.w.x f12, a0 fsub.s f11, f12, f10 - flw f12, 4(s3) + flw f12, 4(s2) flt.s a0, f13, f11 - flt.s a2, f11, f12 - or a1, a0, a2 - beq a1, zero, label632 + flt.s a1, f11, f12 + or a2, a0, a1 + beq a2, zero, label632 lui a0, 260096 fmv.s f12, f10 fmv.w.x f11, a0 .p2align 2 label197: fadd.s f14, f11, f12 - fmv.w.x f15, s4 + fmv.w.x f15, s3 fmul.s f11, f14, f15 - flw f14, 0(s3) - flw f15, 4(s3) + flw f14, 0(s2) + flw f15, 4(s2) fdiv.s f12, f10, f11 fsub.s f13, f11, f12 flt.s a2, f13, f15 @@ -828,10 +969,10 @@ label197: bne a1, zero, label197 label200: lui a0, 260096 - flw f13, 0(s3) + flw f13, 0(s2) fmv.w.x f12, a0 fsub.s f10, f11, f12 - flw f11, 4(s3) + flw f11, 4(s2) fle.s a1, 
f10, f13 fle.s a2, f11, f10 and a0, a1, a2 @@ -840,15 +981,13 @@ label200: jal putch ld ra, 0(sp) mv a0, zero - ld s2, 8(sp) - ld s1, 16(sp) - ld s6, 24(sp) - ld s4, 32(sp) - ld s3, 40(sp) - ld s0, 48(sp) - ld s5, 56(sp) - ld s7, 64(sp) - addi sp, sp, 72 + ld s1, 8(sp) + ld s0, 16(sp) + ld s5, 24(sp) + ld s3, 32(sp) + ld s2, 40(sp) + ld s4, 48(sp) + addi sp, sp, 56 ret label632: lui a3, 260096 diff --git a/tests/SysY2022/performance/vector_mul2.arm.s b/tests/SysY2022/performance/vector_mul2.arm.s index a739fb531..934b20d85 100644 --- a/tests/SysY2022/performance/vector_mul2.arm.s +++ b/tests/SysY2022/performance/vector_mul2.arm.s @@ -1,16 +1,16 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 Vectortm: .zero 400000 -.align 8 +.p2align 3 vectorB: .zero 400000 -.align 8 +.p2align 3 vectorA: .zero 400000 .text @@ -28,11 +28,11 @@ main: movt r1, #1 movw r2, #:lower16:cmmc_parallel_body_0 movt r2, #:upper16:cmmc_parallel_body_0 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA movw r0, #:lower16:cmmc_parallel_body_payload_0 movt r0, #:upper16:cmmc_parallel_body_payload_0 - str r4, [r0, #0] + str r3, [r0, #0] mov r0, #0 bl cmmcParallelFor mov r2, #0 @@ -49,208 +49,120 @@ label108: movt r5, #:upper16:Vectortm mov r0, r1 vmov s0, r1 - mov r3, r5 + mov r4, r5 .p2align 4 label110: add r5, r1, #1 - movw r4, #34464 - movt r4, #1 - cmp r0, r4 + movw r3, #34464 + movt r3, #1 + cmp r0, r3 bge label115 - add r4, r0, #3 + add r3, r0, #3 movw r6, #34464 movt r6, #1 - cmp r4, r6 + cmp r3, r6 bge label752 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA add r6, r1, #2 add r7, r1, #3 add r8, r1, #4 - add r4, r4, r0, lsl #2 + add r3, r3, r0, lsl #2 b label231 -.p2align 4 -label235: - add r4, r4, #16 -.p2align 4 -label231: - add r10, r1, r0 - add r9, r5, r0 - vldr s2, [r4, #0] - mul r10, r10, r9 - add r10, r10, r10, lsr #31 - asr r10, r10, #1 - add r10, r5, r10 - vmov s1, r10 - add r10, r6, r0 - mul r9, r9, r10 - vcvt.f32.s32 s1, s1 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s1, s2, s1 - add r9, r5, r9 - vldr s2, [r4, #4] - vadd.f32 s1, s0, s1 - vmov s0, r9 - add r9, r7, r0 - mul r10, r10, r9 - vcvt.f32.s32 s0, s0 - add r10, r10, r10, lsr #31 - asr r10, r10, #1 - vdiv.f32 s0, s2, s0 - add r10, r5, r10 - vadd.f32 s2, s1, s0 - vldr s1, [r4, #8] - vmov s0, r10 - add r10, r8, r0 - add r0, r0, #4 - mul r9, r9, r10 - vcvt.f32.s32 s0, s0 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s0, s1, s0 - add r9, r5, r9 - vadd.f32 s1, s2, s0 - vldr s2, [r4, #12] - vmov s0, r9 - movw r9, #34461 - movt r9, #1 - cmp r0, r9 - vcvt.f32.s32 s0, s0 - vdiv.f32 s0, s2, s0 - vadd.f32 s0, s1, s0 - blt label235 - vmov.f32 s1, s0 -.p2align 4 -label220: - movw r4, #34464 - movt r4, #1 - cmp r0, r4 - bge label758 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA - add r4, r4, r0, lsl #2 -.p2align 4 -label225: - add r6, r1, r0 - add r7, r5, r0 - add r0, r0, #1 - mul r6, r6, r7 - add r6, r6, r6, lsr #31 - asr r6, r6, #1 - add r6, r5, r6 - vmov s0, r6 - movw r6, #34464 - movt r6, #1 - cmp r0, r6 - vcvt.f32.s32 s2, s0 - vldr s0, [r4, #0] - vdiv.f32 s0, s0, s2 - vadd.f32 s0, s1, s0 - bge label758 - add r4, r4, #4 - vmov.f32 s1, s0 - b label225 -.p2align 4 -label115: - vstr s0, [r3, #0] - movw r1, #34464 - movt r1, #1 - cmp r5, r1 - bge label254 - add r3, r3, #4 - mov r1, r5 - b label110 label182: mov r0, #76 bl _sysy_stoptime - 
movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA - mov r0, #0 - mov r1, r4 - vmov s0, r0 + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA + mov r1, #0 + mov r0, r3 + vmov s0, r1 b label183 .p2align 4 label187: - add r1, r1, #64 + add r0, r0, #64 vmov.f32 s0, s1 .p2align 4 label183: movw r3, #:lower16:vectorB movt r3, #:upper16:vectorB - vldr s2, [r1, #0] - add r2, r3, r0, lsl #2 - add r0, r0, #16 - vldr s1, [r2, #0] - vmul.f32 s1, s2, s1 - vadd.f32 s2, s0, s1 - vldr s1, [r1, #4] - vldr s0, [r2, #4] - vmul.f32 s0, s1, s0 - vadd.f32 s0, s2, s0 - vldr s2, [r1, #8] - vldr s1, [r2, #8] - vmul.f32 s1, s2, s1 - vadd.f32 s2, s0, s1 - vldr s0, [r1, #12] - vldr s1, [r2, #12] - vmul.f32 s0, s0, s1 - vadd.f32 s1, s2, s0 - vldr s2, [r1, #16] - vldr s0, [r2, #16] - vmul.f32 s0, s2, s0 - vldr s2, [r1, #20] - vadd.f32 s1, s1, s0 - vldr s0, [r2, #20] - vmul.f32 s0, s2, s0 - vldr s2, [r1, #24] - vadd.f32 s1, s1, s0 - vldr s0, [r2, #24] - vmul.f32 s0, s2, s0 - vadd.f32 s2, s1, s0 - vldr s1, [r1, #28] - vldr s0, [r2, #28] - vmul.f32 s0, s1, s0 - vadd.f32 s1, s2, s0 - vldr s2, [r1, #32] - vldr s0, [r2, #32] - vmul.f32 s0, s2, s0 - vadd.f32 s2, s1, s0 - vldr s0, [r1, #36] - vldr s1, [r2, #36] - vmul.f32 s0, s0, s1 - vldr s1, [r1, #40] - vadd.f32 s2, s2, s0 - vldr s0, [r2, #40] - vmul.f32 s0, s1, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r1, #44] - vldr s1, [r2, #44] - vmul.f32 s0, s0, s1 - vldr s1, [r1, #48] - vadd.f32 s2, s2, s0 - vldr s0, [r2, #48] - vmul.f32 s0, s1, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r1, #52] - vldr s1, [r2, #52] - vmul.f32 s0, s0, s1 - vadd.f32 s1, s2, s0 - vldr s0, [r1, #56] + vldr s1, [r0, #0] + add r2, r3, r1, lsl #2 + add r1, r1, #16 + vldr s2, [r2, #0] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #4] + vldr s2, [r2, #4] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #8] + vldr s2, [r2, #8] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #12] + vldr s2, [r2, #12] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #16] + vldr s2, [r2, #16] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #20] + vldr s2, [r2, #20] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #24] + vldr s2, [r2, #24] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #28] + vldr s2, [r2, #28] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #32] + vldr s2, [r2, #32] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #36] + vldr s2, [r2, #36] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #40] + vldr s2, [r2, #40] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #44] + vldr s2, [r2, #44] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #48] + vldr s2, [r2, #48] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #52] + vldr s2, [r2, #52] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #56] vldr s2, [r2, #56] - vmul.f32 s0, s0, s2 - vadd.f32 s2, s1, s0 - vldr s0, [r1, #60] - vldr s1, [r2, #60] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #60] + vldr s2, [r2, #60] movw r2, #34464 movt r2, #1 - cmp r0, r2 - vmul.f32 s0, s0, s1 - vadd.f32 s1, s2, s0 + cmp r1, r2 + vmul.f32 s1, s1, s2 + vadd.f32 s1, s0, s1 blt label187 mov r1, #0 vmov s0, r1 mov r0, r3 + b label189 +.p2align 4 +label193: + add r0, r0, #64 .p2align 4 label189: vldr s2, [r0, #0] @@ -259,135 +171,53 @@ label189: movt r2, #1 cmp r1, r2 vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #4] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #4] 
+ vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #8] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #12] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #16] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #20] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #12] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #16] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #20] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #24] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #28] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #32] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #36] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #28] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #32] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #36] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #40] vmul.f32 s2, s2, s2 vadd.f32 s0, s0, s2 vldr s2, [r0, #44] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #48] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #48] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #52] vmul.f32 s2, s2, s2 vadd.f32 s0, s0, s2 vldr s2, [r0, #56] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #60] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 - bge label194 - add r0, r0, #64 - b label189 -.p2align 4 -label254: - mov r1, #0 - movw r3, #:lower16:vectorB - movt r3, #:upper16:vectorB - mov r0, r1 - vmov s0, r1 - mov r4, r3 -.p2align 4 -label120: - add r6, r1, #1 - movw r3, #34464 - movt r3, #1 - cmp r0, r3 - bge label142 - add r3, r0, #3 - movw r5, #34464 - movt r5, #1 - cmp r3, r5 - bge label267 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r7, r1, #3 - add r8, r1, #4 - add r3, r5, r0, lsl #2 - add r5, r1, #2 -.p2align 4 -label127: - add r9, r1, r0 - add r10, r6, r0 - vldr s2, [r3, #0] - add r11, r0, #1 - mul r9, r9, r10 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - add r9, r9, r11 - add r11, r0, #2 - vmov s1, r9 - add r9, r5, r0 - mul r10, r10, r9 - vcvt.f32.s32 s1, s1 - add r10, r10, r10, lsr #31 - asr r10, r10, #1 - vdiv.f32 s1, s2, s1 - add r10, r10, r11 - vldr s2, [r3, #4] - add r11, r0, #3 - vadd.f32 s0, s0, s1 - vmov s1, r10 - add r10, r7, r0 - mul r9, r9, r10 - vcvt.f32.s32 s1, s1 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s1, s2, s1 - add r9, r9, r11 - vldr s2, [r3, #8] - vadd.f32 s0, s0, s1 - vmov s1, r9 - add r9, r8, r0 - add r0, r0, #4 - mul r9, r10, r9 - vcvt.f32.s32 s1, s1 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s1, s2, s1 - add r9, r9, r0 - vldr s2, [r3, #12] - vadd.f32 s0, s0, s1 - vmov s1, r9 - movw r9, #34461 - movt r9, #1 - cmp r0, r9 - vcvt.f32.s32 s1, s1 - vdiv.f32 s1, s2, s1 - vadd.f32 s0, s0, s1 - bge label318 - add r3, r3, #16 - b label127 -label194: + vadd.f32 s0, s0, s2 + vldr s2, [r0, #60] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + blt label193 vdiv.f32 s2, s1, s0 mov r0, #1065353216 movw r1, #14269 @@ -395,25 +225,25 @@ label194: vmov s0, r0 movw r0, #14269 movt r0, #13702 - vsub.f32 s1, s0, s2 - vmov s0, r0 + vmov s1, r0 mov r0, #0 - vcmp.f32 s1, s0 - vmov s0, r1 + vsub.f32 s0, s0, s2 + vcmp.f32 s0, s1 + vmov s1, r1 mov r1, #0 vmrs APSR_nzcv, FPSCR movwgt r0, #1 - vcmp.f32 s1, s0 + vcmp.f32 s0, s1 vmrs APSR_nzcv, FPSCR movwmi r1, #1 orrs r0, r0, r1 beq label636 mov r0, #1065353216 - vmov.f32 s0, s2 - vmov s1, r0 + vmov.f32 s1, s2 + vmov 
s0, r0 .p2align 4 label197: - vadd.f32 s0, s1, s0 + vadd.f32 s0, s0, s1 mov r0, #1056964608 movw r1, #14269 movt r1, #46470 @@ -422,9 +252,9 @@ label197: movt r0, #13702 vmov s4, r0 mov r0, #0 - vmul.f32 s1, s0, s1 - vdiv.f32 s0, s2, s1 - vsub.f32 s3, s1, s0 + vmul.f32 s0, s0, s1 + vdiv.f32 s1, s2, s0 + vsub.f32 s3, s0, s1 vcmp.f32 s3, s4 vmov s4, r1 mov r1, #0 @@ -435,7 +265,6 @@ label197: movwmi r1, #1 orrs r0, r0, r1 bne label197 - vmov.f32 s0, s1 label195: mov r0, #1065353216 movw r1, #14269 @@ -443,15 +272,15 @@ label195: vmov s1, r0 movw r0, #14269 movt r0, #13702 - vsub.f32 s1, s0, s1 - vmov s0, r0 + vsub.f32 s0, s0, s1 + vmov s1, r0 mov r0, #0 - vcmp.f32 s1, s0 - vmov s0, r1 + vcmp.f32 s0, s1 + vmov s1, r1 mov r1, #0 vmrs APSR_nzcv, FPSCR movwls r0, #1 - vcmp.f32 s1, s0 + vcmp.f32 s0, s1 vmrs APSR_nzcv, FPSCR movwge r1, #1 and r0, r0, r1 @@ -462,68 +291,92 @@ label195: mov r0, #0 pop { r4, r5, r6, r7, r8, r9, r10, r11, pc } .p2align 4 -label267: +label752: mov r3, #0 vmov.f32 s1, s0 vmov s2, r3 vmov.f32 s0, s2 .p2align 4 -label132: +label220: movw r3, #34464 movt r3, #1 cmp r0, r3 - bge label323 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r3, r5, r0, lsl #2 + bge label758 + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA + add r3, r3, r0, lsl #2 .p2align 4 -label137: - add r5, r1, r0 - add r7, r6, r0 - vldr s2, [r3, #0] +label225: + add r6, r1, r0 + add r7, r5, r0 add r0, r0, #1 - mul r5, r5, r7 - add r5, r5, r5, lsr #31 - asr r5, r5, #1 - add r5, r5, r0 - vmov s0, r5 - movw r5, #34464 - movt r5, #1 - cmp r0, r5 - vcvt.f32.s32 s0, s0 - vdiv.f32 s0, s2, s0 + mul r6, r6, r7 + add r6, r6, r6, lsr #31 + asr r6, r6, #1 + add r6, r5, r6 + vmov s0, r6 + movw r6, #34464 + movt r6, #1 + cmp r0, r6 + vcvt.f32.s32 s2, s0 + vldr s0, [r3, #0] + vdiv.f32 s0, s0, s2 vadd.f32 s0, s1, s0 - bge label323 + bge label758 add r3, r3, #4 vmov.f32 s1, s0 - b label137 + b label225 .p2align 4 -label323: +label758: movw r0, #34464 movt r0, #1 .p2align 4 -label142: +label115: vstr s0, [r4, #0] movw r1, #34464 movt r1, #1 + cmp r5, r1 + bge label254 + add r4, r4, #4 + mov r1, r5 + b label110 +.p2align 4 +label254: + mov r1, #0 + movw r3, #:lower16:vectorB + movt r3, #:upper16:vectorB + mov r0, r1 + vmov s0, r1 +.p2align 4 +label120: + add r6, r1, #1 + movw r4, #34464 + movt r4, #1 + cmp r0, r4 + bge label142 + add r4, r0, #3 + movw r5, #34464 + movt r5, #1 + cmp r4, r5 + bge label267 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r7, r1, #3 + add r8, r1, #4 + add r4, r5, r0, lsl #2 + add r5, r1, #2 + b label127 +.p2align 4 +label142: + vstr s0, [r3, #0] + movw r1, #34464 + movt r1, #1 cmp r6, r1 bge label344 - add r4, r4, #4 + add r3, r3, #4 mov r1, r6 b label120 .p2align 4 -label758: - movw r0, #34464 - movt r0, #1 - b label115 -.p2align 4 -label752: - mov r4, #0 - vmov.f32 s1, s0 - vmov s2, r4 - vmov.f32 s0, s2 - b label220 -.p2align 4 label344: mov r1, #0 movw r5, #:lower16:Vectortm @@ -531,16 +384,6 @@ label344: mov r0, r1 vmov s0, r1 mov r4, r5 - b label147 -.p2align 4 -label169: - vstr s0, [r4, #0] - movw r1, #34464 - movt r1, #1 - cmp r5, r1 - bge label172 - add r4, r4, #4 - mov r1, r5 .p2align 4 label147: add r5, r1, #1 @@ -560,7 +403,103 @@ label147: add r8, r1, #4 add r3, r3, r0, lsl #2 .p2align 4 -label154: +label154: + add r10, r1, r0 + add r9, r5, r0 + vldr s2, [r3, #0] + mul r10, r10, r9 + add r10, r10, r10, lsr #31 + asr r10, r10, #1 + add r10, r5, r10 + vmov s1, r10 + add r10, r6, r0 + mul r9, r9, r10 + vcvt.f32.s32 s1, s1 + add r9, r9, r9, lsr 
#31 + asr r9, r9, #1 + vdiv.f32 s1, s2, s1 + add r9, r5, r9 + vldr s2, [r3, #4] + vadd.f32 s0, s0, s1 + vmov s1, r9 + add r9, r7, r0 + mul r10, r10, r9 + vcvt.f32.s32 s1, s1 + add r10, r10, r10, lsr #31 + asr r10, r10, #1 + vdiv.f32 s1, s2, s1 + add r10, r5, r10 + vldr s2, [r3, #8] + vadd.f32 s0, s0, s1 + vmov s1, r10 + add r10, r8, r0 + add r0, r0, #4 + mul r9, r9, r10 + vcvt.f32.s32 s1, s1 + add r9, r9, r9, lsr #31 + asr r9, r9, #1 + vdiv.f32 s1, s2, s1 + add r9, r5, r9 + vldr s2, [r3, #12] + vadd.f32 s0, s0, s1 + vmov s1, r9 + movw r9, #34461 + movt r9, #1 + cmp r0, r9 + vcvt.f32.s32 s1, s1 + vdiv.f32 s1, s2, s1 + vadd.f32 s0, s0, s1 + bge label405 + add r3, r3, #16 + b label154 +.p2align 4 +label267: + mov r4, #0 + vmov.f32 s1, s0 + vmov s2, r4 + vmov.f32 s0, s2 +.p2align 4 +label132: + movw r4, #34464 + movt r4, #1 + cmp r0, r4 + bge label323 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r4, r5, r0, lsl #2 + b label137 +.p2align 4 +label141: + add r4, r4, #4 + vmov.f32 s1, s0 +.p2align 4 +label137: + add r5, r1, r0 + add r7, r6, r0 + vldr s2, [r4, #0] + add r0, r0, #1 + mul r5, r5, r7 + add r5, r5, r5, lsr #31 + asr r5, r5, #1 + add r5, r5, r0 + vmov s0, r5 + movw r5, #34464 + movt r5, #1 + cmp r0, r5 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + blt label141 +.p2align 4 +label323: + movw r0, #34464 + movt r0, #1 + b label142 +.p2align 4 +label235: + add r3, r3, #16 +.p2align 4 +label231: add r10, r1, r0 add r9, r5, r0 vldr s2, [r3, #0] @@ -577,69 +516,46 @@ label154: vdiv.f32 s1, s2, s1 add r9, r5, r9 vldr s2, [r3, #4] - vadd.f32 s0, s0, s1 - vmov s1, r9 + vadd.f32 s1, s0, s1 + vmov s0, r9 add r9, r7, r0 mul r10, r10, r9 - vcvt.f32.s32 s1, s1 + vcvt.f32.s32 s0, s0 add r10, r10, r10, lsr #31 asr r10, r10, #1 - vdiv.f32 s1, s2, s1 + vdiv.f32 s0, s2, s0 add r10, r5, r10 - vldr s2, [r3, #8] - vadd.f32 s0, s0, s1 - vmov s1, r10 + vadd.f32 s2, s1, s0 + vldr s1, [r3, #8] + vmov s0, r10 add r10, r8, r0 add r0, r0, #4 mul r9, r9, r10 - vcvt.f32.s32 s1, s1 + vcvt.f32.s32 s0, s0 add r9, r9, r9, lsr #31 asr r9, r9, #1 - vdiv.f32 s1, s2, s1 + vdiv.f32 s0, s1, s0 add r9, r5, r9 + vadd.f32 s1, s2, s0 vldr s2, [r3, #12] - vadd.f32 s0, s0, s1 - vmov s1, r9 + vmov s0, r9 movw r9, #34461 movt r9, #1 cmp r0, r9 - vcvt.f32.s32 s1, s1 - vdiv.f32 s1, s2, s1 - vadd.f32 s1, s0, s1 - bge label405 - add r3, r3, #16 - vmov.f32 s0, s1 - b label154 -.p2align 4 -label172: - mov r1, #0 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA - mov r0, r1 - vmov s0, r1 - mov r3, r4 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + blt label235 + vmov.f32 s1, s0 + b label220 .p2align 4 -label173: - add r6, r1, #1 - movw r4, #34464 - movt r4, #1 - cmp r0, r4 - bge label178 - add r4, r0, #3 - movw r5, #34464 - movt r5, #1 - cmp r4, r5 - bge label673 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r7, r1, #3 - add r8, r1, #4 - add r4, r5, r0, lsl #2 - add r5, r1, #2 +label131: + add r4, r4, #16 .p2align 4 -label213: +label127: add r10, r1, r0 add r9, r6, r0 + vldr s2, [r4, #0] add r11, r0, #1 mul r10, r10, r9 add r10, r10, r10, lsr #31 @@ -649,49 +565,51 @@ label213: vmov s1, r10 add r10, r5, r0 mul r9, r9, r10 - vcvt.f32.s32 s2, s1 + vcvt.f32.s32 s1, s1 add r9, r9, r9, lsr #31 - vldr s1, [r4, #0] asr r9, r9, #1 + vdiv.f32 s1, s2, s1 add r9, r9, r11 + vldr s2, [r4, #4] add r11, r0, #3 - vdiv.f32 s1, s1, s2 - vadd.f32 s2, s0, s1 - vldr s1, [r4, #4] - vmov s0, r9 + vadd.f32 s0, s0, s1 + vmov s1, r9 add r9, r7, r0 mul r10, r10, r9 - vcvt.f32.s32 
s0, s0 + vcvt.f32.s32 s1, s1 add r10, r10, r10, lsr #31 asr r10, r10, #1 - vdiv.f32 s0, s1, s0 + vdiv.f32 s1, s2, s1 add r10, r10, r11 - vadd.f32 s1, s2, s0 vldr s2, [r4, #8] - vmov s0, r10 + vadd.f32 s0, s0, s1 + vmov s1, r10 add r10, r8, r0 add r0, r0, #4 mul r9, r9, r10 - vcvt.f32.s32 s0, s0 + vcvt.f32.s32 s1, s1 add r9, r9, r9, lsr #31 asr r9, r9, #1 - vdiv.f32 s0, s2, s0 + vdiv.f32 s1, s2, s1 add r9, r9, r0 vldr s2, [r4, #12] - vadd.f32 s1, s1, s0 - vmov s0, r9 + vadd.f32 s0, s0, s1 + vmov s1, r9 movw r9, #34461 movt r9, #1 cmp r0, r9 - vcvt.f32.s32 s0, s0 - vdiv.f32 s0, s2, s0 - vadd.f32 s0, s1, s0 - bge label745 - add r4, r4, #16 - b label213 + vcvt.f32.s32 s1, s1 + vdiv.f32 s1, s2, s1 + vadd.f32 s0, s0, s1 + blt label131 + vmov.f32 s1, s0 + b label132 .p2align 4 -label405: - vmov.f32 s0, s1 +label357: + mov r3, #0 + vmov.f32 s1, s0 + vmov s2, r3 + vmov.f32 s0, s2 .p2align 4 label159: movw r3, #34464 @@ -701,6 +619,11 @@ label159: movw r3, #:lower16:vectorB movt r3, #:upper16:vectorB add r3, r3, r0, lsl #2 + b label164 +.p2align 4 +label168: + add r3, r3, #4 + vmov.f32 s1, s0 .p2align 4 label164: add r6, r1, r0 @@ -711,32 +634,40 @@ label164: add r6, r6, r6, lsr #31 asr r6, r6, #1 add r6, r5, r6 - vmov s1, r6 + vmov s0, r6 movw r6, #34464 movt r6, #1 cmp r0, r6 - vcvt.f32.s32 s1, s1 - vdiv.f32 s1, s2, s1 - vadd.f32 s0, s0, s1 - bge label426 - add r3, r3, #4 - b label164 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + blt label168 .p2align 4 -label745: - vmov.f32 s1, s0 +label410: + movw r0, #34464 + movt r0, #1 .p2align 4 -label202: - movw r4, #34464 - movt r4, #1 - cmp r0, r4 - bge label679 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r4, r5, r0, lsl #2 - b label207 +label169: + vstr s0, [r4, #0] + movw r1, #34464 + movt r1, #1 + cmp r5, r1 + bge label172 + add r4, r4, #4 + mov r1, r5 + b label147 +.p2align 4 +label172: + mov r1, #0 + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA + mov r0, r1 + vmov s0, r1 + mov r4, r3 + b label173 .p2align 4 label211: - add r4, r4, #4 + add r3, r3, #4 vmov.f32 s1, s0 .p2align 4 label207: @@ -752,7 +683,7 @@ label207: movt r5, #1 cmp r0, r5 vcvt.f32.s32 s2, s0 - vldr s0, [r4, #0] + vldr s0, [r3, #0] vdiv.f32 s0, s0, s2 vadd.f32 s0, s1, s0 blt label211 @@ -762,44 +693,111 @@ label679: movt r0, #1 .p2align 4 label178: - vstr s0, [r3, #0] + vstr s0, [r4, #0] movw r1, #34464 movt r1, #1 cmp r6, r1 bge label181 - add r3, r3, #4 + add r4, r4, #4 mov r1, r6 - b label173 +.p2align 4 +label173: + add r6, r1, #1 + movw r3, #34464 + movt r3, #1 + cmp r0, r3 + bge label178 + add r3, r0, #3 + movw r5, #34464 + movt r5, #1 + cmp r3, r5 + bge label673 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r7, r1, #3 + add r8, r1, #4 + add r3, r5, r0, lsl #2 + add r5, r1, #2 +.p2align 4 +label213: + add r9, r1, r0 + add r10, r6, r0 + add r11, r0, #1 + mul r9, r9, r10 + add r9, r9, r9, lsr #31 + asr r9, r9, #1 + add r9, r9, r11 + add r11, r0, #2 + vmov s1, r9 + add r9, r5, r0 + mul r10, r10, r9 + vcvt.f32.s32 s2, s1 + add r10, r10, r10, lsr #31 + vldr s1, [r3, #0] + asr r10, r10, #1 + add r10, r10, r11 + add r11, r0, #3 + vdiv.f32 s1, s1, s2 + vadd.f32 s2, s0, s1 + vmov s0, r10 + add r10, r7, r0 + mul r9, r9, r10 + vcvt.f32.s32 s1, s0 + add r9, r9, r9, lsr #31 + vldr s0, [r3, #4] + asr r9, r9, #1 + add r9, r9, r11 + vdiv.f32 s0, s0, s1 + vldr s1, [r3, #8] + vadd.f32 s2, s2, s0 + vmov s0, r9 + add r9, r8, r0 + add r0, r0, #4 + mul r9, r10, r9 + vcvt.f32.s32 s0, s0 + add r9, r9, r9, lsr #31 + asr r9, r9, 
#1 + vdiv.f32 s0, s1, s0 + add r9, r9, r0 + vadd.f32 s1, s2, s0 + vldr s2, [r3, #12] + vmov s0, r9 + movw r9, #34461 + movt r9, #1 + cmp r0, r9 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + bge label745 + add r3, r3, #16 + b label213 +.p2align 4 +label745: + vmov.f32 s1, s0 +.p2align 4 +label202: + movw r3, #34464 + movt r3, #1 + cmp r0, r3 + bge label679 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r3, r5, r0, lsl #2 + b label207 .p2align 4 label673: - mov r4, #0 + mov r3, #0 vmov.f32 s1, s0 - vmov s2, r4 + vmov s2, r3 vmov.f32 s0, s2 b label202 -.p2align 4 -label426: - movw r0, #34464 - movt r0, #1 - b label169 -.p2align 4 -label410: - movw r0, #34464 - movt r0, #1 - vmov.f32 s0, s1 - b label169 label636: mov r0, #1065353216 vmov s0, r0 b label195 .p2align 4 -label318: +label405: vmov.f32 s1, s0 - b label132 -.p2align 4 -label357: - mov r3, #0 - vmov s1, r3 b label159 .p2align 4 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/vector_mul2.riscv.s b/tests/SysY2022/performance/vector_mul2.riscv.s index 3198622f7..97bf7f282 100644 --- a/tests/SysY2022/performance/vector_mul2.riscv.s +++ b/tests/SysY2022/performance/vector_mul2.riscv.s @@ -1,668 +1,805 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 897988541 .4byte 3045472189 .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 Vectortm: .zero 400000 -.align 8 +.p2align 3 vectorB: .zero 400000 -.align 8 +.p2align 3 vectorA: .zero 400000 .text .p2align 2 .globl main main: - addi sp, sp, -72 + addi sp, sp, -56 li a0, 62 sd ra, 0(sp) - sd s2, 8(sp) - sd s1, 16(sp) - sd s6, 24(sp) - sd s4, 32(sp) - sd s3, 40(sp) - sd s0, 48(sp) - sd s5, 56(sp) - sd s7, 64(sp) + sd s1, 8(sp) + sd s0, 16(sp) + sd s5, 24(sp) + sd s3, 32(sp) + sd s2, 40(sp) + sd s4, 48(sp) jal _sysy_starttime lui a3, 24 -pcrel1032: +pcrel1055: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel1033: +pcrel1056: auipc a1, %pcrel_hi(vectorA) - addi s2, a1, %pcrel_lo(pcrel1033) -pcrel1034: + addi s1, a1, %pcrel_lo(pcrel1056) +pcrel1057: auipc a1, %pcrel_hi(cmmc_parallel_body_0) - sd s2, %pcrel_lo(pcrel1032)(a0) - addi a2, a1, %pcrel_lo(pcrel1034) + sd s1, %pcrel_lo(pcrel1055)(a0) + addi a2, a1, %pcrel_lo(pcrel1057) mv a0, zero addiw a1, a3, 1696 jal cmmcParallelFor + lui s3, 258048 + mv a2, zero lui a4, 24 - lui s4, 258048 - mv a3, zero -pcrel1035: - auipc a2, %pcrel_hi(Vectortm) -pcrel1036: +pcrel1058: + auipc a3, %pcrel_hi(Vectortm) +pcrel1059: auipc a0, %pcrel_hi(vectorB) - addi a1, a2, %pcrel_lo(pcrel1035) - addi s1, a0, %pcrel_lo(pcrel1036) -pcrel1037: - auipc a2, %pcrel_hi(__cmmc_fp_constant_pool) + addi a1, a3, %pcrel_lo(pcrel1058) + addi s0, a0, %pcrel_lo(pcrel1059) +pcrel1060: + auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) addiw a0, a4, 1693 - addi s3, a2, %pcrel_lo(pcrel1037) - addi s0, a0, 3 - li a2, 1000 + addi s2, a3, %pcrel_lo(pcrel1060) j label110 .p2align 2 -label958: - addiw a3, a3, 1 - bge a3, a2, label184 +label981: + addiw a2, a2, 1 + li a3, 1000 + bge a2, a3, label184 .p2align 2 label110: fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label117 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label117 fsw f10, 0(a1) - blt a4, s0, label237 - mv a5, s1 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label237 + mv a4, s0 + mv a5, zero + addiw t0, t1, 1696 + blt zero, 
t0, label220 j label143 .p2align 2 label277: - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label953 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label976 .p2align 2 label237: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label970 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label993 .p2align 2 label117: - addiw t1, t5, 3 - bge t1, s0, label256 - sh2add t1, t5, s2 - addiw t2, t0, 2 - addiw t3, t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge t0, t1, label256 + sh2add t0, t4, s1 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 j label129 .p2align 2 label133: - addi t1, t1, 16 + addi t0, t0, 16 .p2align 2 label129: - addw a7, t0, t5 - addw a6, a4, t5 - flw f14, 0(t1) - mulw t6, a7, a6 - srliw s6, t6, 31 - add s5, t6, s6 - sraiw a7, s5, 1 - addw t6, a4, a7 + addw a6, a5, t4 + addw t5, a3, t4 + flw f14, 0(t0) + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + sraiw a6, s4, 1 + addw t6, a3, a6 fcvt.s.w f12, t6 - addw t6, t2, t5 + addw t6, t1, t4 fdiv.s f13, f14, f12 - mulw a7, a6, t6 - srliw s6, a7, 31 - add s7, a7, s6 - sraiw s5, s7, 1 - addw a6, a4, s5 + mulw a6, t5, t6 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw s5, a7, 1 + addw t5, a3, s5 + fcvt.s.w f12, t5 + addw t5, t2, t4 + mulw a6, t6, t5 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw t6, a7, 1 + addw a6, a3, t6 fadd.s f11, f10, f13 - flw f13, 4(t1) - fcvt.s.w f10, a6 - addw a6, t3, t5 - fdiv.s f14, f13, f10 - mulw a7, t6, a6 - flw f13, 8(t1) - srliw s6, a7, 31 - add s5, a7, s6 - addw a7, t4, t5 - sraiw s7, s5, 1 - addiw t5, t5, 4 - addw t6, a4, s7 - fadd.s f12, f11, f14 - fcvt.s.w f11, t6 - mulw t6, a6, a7 - fdiv.s f14, f13, f11 - srliw s5, t6, 31 - add a7, t6, s5 - sraiw s6, a7, 1 - addw a6, a4, s6 - fcvt.s.w f11, a6 - fadd.s f10, f12, f14 - flw f12, 12(t1) - fdiv.s f13, f12, f11 - fadd.s f10, f10, f13 - blt t5, a0, label133 + flw f13, 4(t0) + fdiv.s f14, f13, f12 + flw f13, 8(t0) + fcvt.s.w f12, a6 + addw a6, t3, t4 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw a7, t6, 31 + add s4, t6, a7 + sraiw t5, s4, 1 + addw a6, a3, t5 + fadd.s f10, f11, f14 + fdiv.s f14, f13, f12 + flw f13, 12(t0) + fcvt.s.w f12, a6 + fadd.s f11, f10, f14 + fdiv.s f14, f13, f12 + fadd.s f10, f11, f14 + blt t4, a0, label133 fmv.s f11, f10 - bge t5, s0, label954 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label977 .p2align 2 label122: - sh2add t1, t5, s2 - mv t2, t5 + sh2add t0, t4, s1 + mv t1, t4 fmv.s f10, f11 .p2align 2 label123: - addw t4, t0, t2 - addw a6, a4, t2 - flw f12, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, a6 - srliw t5, t3, 31 - add t6, t3, t5 + addw t3, a5, t1 + addw t4, a3, t1 + flw f12, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t4 + srliw t5, t2, 31 + add t6, t2, t5 sraiw t4, t6, 1 - addw a6, a4, t4 - fcvt.s.w f11, a6 + addw t3, a3, t4 + fcvt.s.w f11, t3 + lui t3, 24 fdiv.s f13, f12, f11 + addiw t2, t3, 1696 fadd.s f10, f10, f13 - bge t2, s0, label277 - addi t1, t1, 4 + bge t1, t2, label277 + addi t0, t0, 4 j label123 .p2align 2 label768: - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label968 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label991 .p2align 2 label146: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label955 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label978 .p2align 2 label220: - addiw t1, t5, 3 - bge t1, s0, label747 - sh2add t1, t5, a1 - addiw t2, t0, 2 - addiw t3, 
t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge t0, t1, label747 + sh2add t0, t4, a1 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 j label232 .p2align 2 label236: - addi t1, t1, 16 + addi t0, t0, 16 .p2align 2 label232: - addw a7, t0, t5 - addw a6, a4, t5 - flw f14, 0(t1) - addiw s7, t5, 3 - mulw t6, a7, a6 - srliw s6, t6, 31 - add s5, t6, s6 - addiw t6, t5, 1 - sraiw a7, s5, 1 - addw s6, a7, t6 - addw t6, t2, t5 - mulw a7, a6, t6 - fcvt.s.w f12, s6 - srliw s5, a7, 31 - add s6, a7, s5 + addw a6, a5, t4 + addw t5, a3, t4 + flw f14, 0(t0) + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + addiw t6, t4, 1 + sraiw a6, s4, 1 + addw a7, a6, t6 + addw t6, t1, t4 + mulw a6, t5, t6 + fcvt.s.w f12, a7 + srliw a7, a6, 31 fdiv.s f13, f14, f12 - addiw a7, t5, 2 - sraiw a6, s6, 1 - flw f14, 4(t1) - addw s5, a6, a7 - addw a6, t3, t5 - mulw a7, t6, a6 - fcvt.s.w f12, s5 - srliw s6, a7, 31 - add s5, a7, s6 - sraiw t6, s5, 1 - addw a7, t6, s7 + add s4, a6, a7 + flw f14, 4(t0) + addiw a7, t4, 2 + sraiw t5, s4, 1 + addw a6, t5, a7 + addw t5, t2, t4 + fcvt.s.w f12, a6 + mulw a6, t6, t5 + srliw a7, a6, 31 + add s4, a6, a7 + addiw a7, t4, 3 + sraiw t6, s4, 1 + addw a6, t6, a7 fadd.s f11, f10, f13 fdiv.s f13, f14, f12 - flw f14, 8(t1) - fcvt.s.w f12, a7 - addw a7, t4, t5 - addiw t5, t5, 4 - mulw t6, a6, a7 - srliw s5, t6, 31 - add a7, t6, s5 - sraiw a6, a7, 1 - addw t6, a6, t5 + flw f14, 8(t0) + fcvt.s.w f12, a6 + addw a6, t3, t4 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw a7, t6, 31 + add a6, t6, a7 + sraiw t5, a6, 1 + addw t6, t5, t4 fadd.s f10, f11, f13 fdiv.s f13, f14, f12 - flw f14, 12(t1) + flw f14, 12(t0) fcvt.s.w f12, t6 fadd.s f11, f10, f13 fdiv.s f13, f14, f12 fadd.s f10, f11, f13 - blt t5, a0, label236 + blt t4, a0, label236 fmv.s f12, f10 - bge t5, s0, label969 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label992 .p2align 2 label225: - sh2add t1, t5, a1 - mv t2, t5 + sh2add t0, t4, a1 + mv t1, t4 fmv.s f10, f12 .p2align 2 label226: - addw t4, t0, t2 - addw t5, a4, t2 - flw f13, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, t5 - srliw t6, t3, 31 - add a6, t3, t6 - sraiw t4, a6, 1 - addw t3, t4, t2 - fcvt.s.w f11, t3 + addw t3, a5, t1 + addw t4, a3, t1 + flw f13, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t4 + srliw t6, t2, 31 + add t5, t2, t6 + sraiw t3, t5, 1 + addw t2, t3, t1 + lui t3, 24 + fcvt.s.w f11, t2 + addiw t2, t3, 1696 fdiv.s f12, f13, f11 fadd.s f10, f10, f12 - bge t2, s0, label768 - addi t1, t1, 4 + bge t1, t2, label768 + addi t0, t0, 4 j label226 .p2align 2 label256: fmv.w.x f12, zero fmv.s f11, f10 fmv.s f10, f12 - blt t5, s0, label122 - mv t5, s0 - fsw f12, 0(a5) - blt a4, s0, label237 + lui t1, 24 + addiw t0, t1, 1696 + blt t4, t0, label122 + lui a5, 24 + lui t0, 24 + fsw f12, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label237 fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + addiw t0, t1, 1696 + blt zero, t0, label220 j label143 .p2align 2 label747: fmv.w.x f11, zero fmv.s f12, f10 fmv.s f10, f11 - blt t5, s0, label225 - mv t5, s0 - fsw f11, 0(a5) - blt a4, s0, label146 + lui t1, 24 + addiw t0, t1, 1696 + blt t4, t0, label225 + lui a5, 24 + lui t0, 24 + fsw f11, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label146 fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + addiw t0, t1, 1696 + blt zero, t0, label203 fsw f10, 
0(a1) - blt a4, s0, label202 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 j label156 .p2align 2 label219: - addi t1, t1, 4 + addi t0, t0, 4 .p2align 2 label215: - addw t4, t0, t2 - addw t6, a4, t2 - flw f13, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, t6 - srliw a6, t3, 31 - add t5, t3, a6 - sraiw t4, t5, 1 - addw t6, a4, t4 - fcvt.s.w f11, t6 + addw t3, a5, t1 + addw t4, a3, t1 + flw f13, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t4 + srliw t6, t2, 31 + add t5, t2, t6 + sraiw t3, t5, 1 + addw t4, a3, t3 + lui t3, 24 + addiw t2, t3, 1696 + fcvt.s.w f11, t4 fdiv.s f12, f13, f11 fadd.s f10, f10, f12 - blt t2, s0, label219 - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label966 + blt t1, t2, label219 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label989 .p2align 2 label202: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label962 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label985 .p2align 2 label203: - addiw t1, t5, 3 - bge t1, s0, label673 - sh2add t1, t5, s1 - addiw t2, t0, 2 - addiw t3, t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge t0, t1, label673 + sh2add t0, t4, s0 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 .p2align 2 label205: - addw a7, t0, t5 - addw t6, a4, t5 - flw f13, 0(t1) - mulw a6, a7, t6 - srliw s5, a6, 31 - add a7, a6, s5 - sraiw s6, a7, 1 - addw a6, a4, s6 - fcvt.s.w f11, a6 - addw a6, t2, t5 - fdiv.s f14, f13, f11 - mulw a7, t6, a6 - flw f13, 4(t1) - srliw s5, a7, 31 - add s6, a7, s5 - sraiw t6, s6, 1 - addw a7, a4, t6 - addw t6, t3, t5 - fadd.s f12, f10, f14 - fcvt.s.w f10, a7 - mulw a7, a6, t6 - fdiv.s f14, f13, f10 - srliw s5, a7, 31 - add s6, a7, s5 - sraiw a6, s6, 1 - addw a7, a4, a6 - fadd.s f11, f12, f14 - flw f14, 8(t1) - fcvt.s.w f12, a7 - addw a7, t4, t5 + addw a6, a5, t4 + addw t5, a3, t4 + flw f13, 0(t0) + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + sraiw a6, s4, 1 + addw t6, a3, a6 + fcvt.s.w f12, t6 + addw t6, t1, t4 + fdiv.s f14, f13, f12 + mulw a6, t5, t6 + flw f13, 4(t0) + srliw a7, a6, 31 + add t5, a6, a7 + sraiw s4, t5, 1 + addw t5, t2, t4 + addw a6, a3, s4 + fcvt.s.w f12, a6 + mulw a6, t6, t5 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw t6, a7, 1 + addw a6, a3, t6 + fadd.s f11, f10, f14 + fdiv.s f14, f13, f12 + flw f13, 8(t0) + fcvt.s.w f12, a6 + addw a6, t3, t4 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw a7, t6, 31 + add t5, t6, a7 + sraiw s4, t5, 1 + addw a6, a3, s4 + fadd.s f10, f11, f14 + fdiv.s f14, f13, f12 + fcvt.s.w f12, a6 + fadd.s f11, f10, f14 + flw f14, 12(t0) fdiv.s f13, f14, f12 - addiw t5, t5, 4 - mulw a6, t6, a7 - srliw s6, a6, 31 - add a7, a6, s6 - sraiw s5, a7, 1 - addw t6, a4, s5 fadd.s f10, f11, f13 - flw f13, 12(t1) - fcvt.s.w f11, t6 - fdiv.s f12, f13, f11 - fadd.s f10, f10, f12 - bge t5, a0, label720 - addi t1, t1, 16 + bge t4, a0, label720 + addi t0, t0, 16 j label205 .p2align 2 label720: fmv.s f11, f10 - bge t5, s0, label964 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label987 .p2align 2 label214: - sh2add t1, t5, s1 - mv t2, t5 + sh2add t0, t4, s0 + mv t1, t4 fmv.s f10, f11 j label215 .p2align 2 label673: fmv.w.x f12, zero fmv.s f11, f10 + lui t1, 24 fmv.s f10, f12 - blt t5, s0, label214 - mv t5, s0 - fsw f12, 0(a5) - blt a4, s0, label202 + addiw t0, t1, 1696 + blt t4, t0, label214 + lui a5, 24 + lui t0, 24 + fsw f12, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label202 fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li 
a4, 1 - blt zero, s0, label162 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + addiw t0, t1, 1696 + blt zero, t0, label162 j label356 .p2align 2 label172: - addi t1, t1, 4 + addi t0, t0, 4 .p2align 2 label168: - addw t4, t0, t2 - addw t6, a4, t2 - flw f12, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, t6 - srliw a6, t3, 31 - add t5, t3, a6 - sraiw t4, t5, 1 - addw t3, t4, t2 - fcvt.s.w f11, t3 + addw t3, a5, t1 + addw t6, a3, t1 + flw f12, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t6 + srliw t4, t2, 31 + add t5, t2, t4 + sraiw t3, t5, 1 + addw t2, t3, t1 + lui t3, 24 + fcvt.s.w f11, t2 + addiw t2, t3, 1696 fdiv.s f13, f12, f11 fadd.s f10, f10, f13 - blt t2, s0, label172 - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label958 + blt t1, t2, label172 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label981 .p2align 2 label182: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label961 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label984 .p2align 2 label162: - addiw t1, t5, 3 - bge t1, s0, label361 - sh2add t1, t5, a1 - addiw t2, t0, 2 - addiw t3, t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge t0, t1, label361 + sh2add t0, t4, a1 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 .p2align 2 label174: - addw a7, t0, t5 - addw a6, a4, t5 - addiw s6, t5, 1 - mulw t6, a7, a6 - flw f13, 0(t1) - srliw s5, t6, 31 - add s7, t6, s5 - sraiw a7, s7, 1 - addiw s7, t5, 3 - addw t6, a7, s6 + addw a6, a5, t4 + addw t5, a3, t4 + flw f13, 0(t0) + addiw s5, t4, 3 + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + addiw a7, t4, 1 + sraiw a6, s4, 1 + addw t6, a6, a7 fcvt.s.w f12, t6 - addw t6, t2, t5 + addw t6, t1, t4 fdiv.s f14, f13, f12 - mulw a7, a6, t6 - flw f13, 4(t1) - srliw s6, a7, 31 - add s5, a7, s6 - addiw a7, t5, 2 - sraiw a6, s5, 1 - addw s6, a6, a7 - addw a6, t3, t5 - mulw a7, t6, a6 - fcvt.s.w f12, s6 - srliw s6, a7, 31 - add s5, a7, s6 - sraiw t6, s5, 1 - addw a7, t6, s7 + mulw a6, t5, t6 + flw f13, 4(t0) + srliw a7, a6, 31 + add s4, a6, a7 + addiw a7, t4, 2 + sraiw t5, s4, 1 + addw a6, t5, a7 + addw t5, t2, t4 fadd.s f11, f10, f14 - fdiv.s f14, f13, f12 - fcvt.s.w f12, a7 - addw a7, t4, t5 - addiw t5, t5, 4 - mulw t6, a6, a7 - srliw s5, t6, 31 - add a7, t6, s5 - sraiw a6, a7, 1 - addw t6, a6, t5 - fadd.s f10, f11, f14 - flw f14, 8(t1) - fdiv.s f13, f14, f12 - fcvt.s.w f12, t6 - fadd.s f11, f10, f13 - flw f13, 12(t1) - fdiv.s f14, f13, f12 - fadd.s f10, f11, f14 - bge t5, a0, label432 - addi t1, t1, 16 + fcvt.s.w f10, a6 + mulw a6, t6, t5 + fdiv.s f14, f13, f10 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw t6, a7, 1 + addw a6, t6, s5 + fadd.s f12, f11, f14 + flw f14, 8(t0) + fcvt.s.w f11, a6 + addw a6, t3, t4 + fdiv.s f13, f14, f11 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw s4, t6, 31 + add a7, t6, s4 + sraiw t5, a7, 1 + addw t6, t5, t4 + fcvt.s.w f11, t6 + fadd.s f10, f12, f13 + flw f12, 12(t0) + fdiv.s f13, f12, f11 + fadd.s f10, f10, f13 + bge t4, a0, label432 + addi t0, t0, 16 j label174 .p2align 2 label432: - fmv.s f12, f10 - bge t5, s0, label959 + fmv.s f11, f10 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label982 .p2align 2 label167: - sh2add t1, t5, a1 - mv t2, t5 - fmv.s f10, f12 + sh2add t0, t4, a1 + mv t1, t4 + fmv.s f10, f11 j label168 .p2align 2 label361: - fmv.w.x f11, zero - fmv.s f12, f10 - fmv.s f10, f11 - blt t5, s0, label167 - mv t5, s0 - fsw f11, 0(a5) - blt a4, s0, label182 - addiw a3, a3, 1 - blt a3, a2, label110 + fmv.w.x f12, 
zero + fmv.s f11, f10 + fmv.s f10, f12 + lui t1, 24 + addiw t0, t1, 1696 + blt t4, t0, label167 + lui a5, 24 + lui t0, 24 + fsw f12, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label182 + addiw a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label959: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label182 - addiw a3, a3, 1 - blt a3, a2, label110 +label982: + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label182 + addiw a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label954: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label237 +label977: + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label237 fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 - fsw f10, 0(s1) - blt a4, s0, label146 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label220 + fsw f10, 0(s0) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 j label339 .p2align 2 -label964: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label202 +label987: + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label202 fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 - fsw f10, 0(s2) - blt a4, s0, label182 - j label956 -.p2align 2 -label969: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label146 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label162 + fsw f10, 0(s1) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 + j label979 +.p2align 2 +label992: + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label146 fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 fsw f10, 0(a1) - blt a4, s0, label202 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 j label156 .p2align 2 -label961: - fsw f10, 0(a5) - blt a4, s0, label182 - addiw a3, a3, 1 - blt a3, a2, label110 +label984: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 + addiw a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label970: - fsw f10, 0(a5) - blt a4, s0, label237 +label993: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label237 fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label220 label143: - fsw f10, 0(a5) - blt a4, s0, label146 + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 j label339 .p2align 2 -label962: - fsw f10, 0(a5) - blt a4, s0, label202 +label985: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label162 label356: - fsw f10, 0(a5) - blt a4, s0, label182 - j label956 + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 + j label979 .p2align 2 -label955: - fsw f10, 0(a5) - blt a4, s0, label146 +label978: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 fmv.w.x f10, zero - 
mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 label153: - fsw f10, 0(a5) - blt a4, s0, label202 + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 j label156 .p2align 2 -label953: +label976: fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 - fsw f10, 0(s1) - blt a4, s0, label146 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label220 + fsw f10, 0(s0) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 label339: fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 j label153 .p2align 2 -label966: +label989: fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 - fsw f10, 0(s2) - blt a4, s0, label182 -label956: - addiw a3, a3, 1 - blt a3, a2, label110 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label162 + fsw f10, 0(s1) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 +label979: + addiw a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label968: +label991: fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 fsw f10, 0(a1) - blt a4, s0, label202 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 label156: fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label162 j label356 label184: li a0, 76 @@ -672,154 +809,158 @@ label184: j label185 .p2align 2 label189: - addi s2, s2, 64 + addi s1, s1, 64 .p2align 2 label185: - sh2add a1, a0, s1 - flw f12, 0(s2) + sh2add a1, a0, s0 + flw f12, 0(s1) + lui a2, 24 addiw a0, a0, 16 flw f14, 0(a1) - flw f13, 4(s2) + flw f13, 4(s1) fmul.s f15, f12, f14 flw f14, 4(a1) - flw f12, 8(s2) + flw f12, 8(s1) flw f0, 8(a1) fadd.s f11, f10, f15 fmul.s f15, f13, f14 - flw f13, 12(s2) + flw f13, 12(s1) fmul.s f14, f12, f0 fadd.s f10, f11, f15 flw f15, 12(a1) - flw f12, 16(s2) + flw f12, 16(s1) fadd.s f11, f10, f14 fmul.s f14, f13, f15 flw f13, 16(a1) fmul.s f15, f12, f13 fadd.s f10, f11, f14 - flw f14, 20(s2) + flw f14, 20(s1) flw f12, 20(a1) - flw f13, 24(s2) + flw f13, 24(s1) fadd.s f11, f10, f15 fmul.s f15, f14, f12 flw f14, 24(a1) - flw f12, 28(s2) + flw f12, 28(s1) fadd.s f10, f11, f15 fmul.s f15, f13, f14 flw f14, 28(a1) - flw f13, 32(s2) + flw f13, 32(s1) fmul.s f0, f12, f14 fadd.s f11, f10, f15 flw f15, 32(a1) - flw f14, 36(s2) + flw f14, 36(s1) fadd.s f10, f11, f0 fmul.s f11, f13, f15 flw f15, 36(a1) - flw f13, 40(s2) + flw f13, 40(s1) fadd.s f12, f10, f11 fmul.s f10, f14, f15 flw f14, 40(a1) fmul.s f0, f13, f14 fadd.s f11, f12, f10 - flw f12, 44(s2) + flw f12, 44(s1) flw f15, 44(a1) - flw f13, 48(s2) + flw f13, 48(s1) fmul.s f14, f12, f15 fadd.s f10, f11, f0 flw f15, 48(a1) fmul.s f12, f13, f15 fadd.s f11, f10, f14 - flw f14, 52(s2) + flw f14, 52(s1) flw f15, 52(a1) - flw f13, 56(s2) + flw f13, 56(s1) fadd.s f10, f11, f12 fmul.s f11, f14, f15 flw f15, 56(a1) - flw f14, 60(s2) + flw f14, 60(s1) fadd.s f12, f10, f11 fmul.s f10, f13, f15 
flw f13, 60(a1) + addiw a1, a2, 1696 fadd.s f11, f12, f10 fmul.s f12, f14, f13 fadd.s f10, f11, f12 - blt a0, s0, label189 + blt a0, a1, label189 fmv.w.x f11, zero mv a0, zero j label191 .p2align 2 label195: - addi s1, s1, 64 + addi s0, s0, 64 .p2align 2 label191: - flw f13, 0(s1) + flw f13, 0(s0) addiw a0, a0, 16 - flw f14, 4(s1) + lui a2, 24 + flw f14, 4(s0) fmul.s f15, f13, f13 + addiw a1, a2, 1696 fmul.s f1, f14, f14 fadd.s f12, f11, f15 - flw f15, 8(s1) - flw f14, 12(s1) + flw f15, 8(s0) + flw f14, 12(s0) fmul.s f0, f15, f15 fadd.s f13, f12, f1 fmul.s f1, f14, f14 fadd.s f11, f13, f0 - flw f13, 16(s1) - flw f14, 20(s1) + flw f13, 16(s0) + flw f14, 20(s0) fmul.s f15, f13, f13 fmul.s f0, f14, f14 - flw f13, 24(s1) + flw f13, 24(s0) fadd.s f12, f11, f1 - flw f14, 28(s1) + flw f14, 28(s0) fadd.s f11, f12, f15 fmul.s f15, f13, f13 fadd.s f12, f11, f0 fmul.s f0, f14, f14 fadd.s f11, f12, f15 - flw f15, 32(s1) - flw f14, 36(s1) + flw f15, 32(s0) + flw f14, 36(s0) fmul.s f1, f15, f15 fadd.s f13, f11, f0 fmul.s f0, f14, f14 fadd.s f12, f13, f1 - flw f13, 40(s1) - flw f14, 44(s1) + flw f13, 40(s0) + flw f14, 44(s0) fmul.s f15, f13, f13 - flw f13, 48(s1) + flw f13, 48(s0) fadd.s f11, f12, f0 fmul.s f0, f14, f14 - flw f14, 52(s1) + flw f14, 52(s0) fadd.s f12, f11, f15 fmul.s f15, f13, f13 - flw f13, 56(s1) + flw f13, 56(s0) fadd.s f11, f12, f0 fmul.s f0, f14, f14 - flw f14, 60(s1) + flw f14, 60(s0) fadd.s f12, f11, f15 fmul.s f15, f13, f13 fmul.s f13, f14, f14 fadd.s f11, f12, f0 fadd.s f12, f11, f15 fadd.s f11, f12, f13 - blt a0, s0, label195 + blt a0, a1, label195 fdiv.s f10, f10, f11 lui a0, 260096 - flw f13, 0(s3) + flw f13, 0(s2) fmv.w.x f12, a0 fsub.s f11, f12, f10 - flw f12, 4(s3) + flw f12, 4(s2) flt.s a0, f13, f11 - flt.s a2, f11, f12 - or a1, a0, a2 - beq a1, zero, label632 + flt.s a1, f11, f12 + or a2, a0, a1 + beq a2, zero, label632 lui a0, 260096 fmv.s f12, f10 fmv.w.x f11, a0 .p2align 2 label197: fadd.s f14, f11, f12 - fmv.w.x f15, s4 + fmv.w.x f15, s3 fmul.s f11, f14, f15 - flw f14, 0(s3) - flw f15, 4(s3) + flw f14, 0(s2) + flw f15, 4(s2) fdiv.s f12, f10, f11 fsub.s f13, f11, f12 flt.s a2, f13, f15 @@ -828,10 +969,10 @@ label197: bne a1, zero, label197 label200: lui a0, 260096 - flw f13, 0(s3) + flw f13, 0(s2) fmv.w.x f12, a0 fsub.s f10, f11, f12 - flw f11, 4(s3) + flw f11, 4(s2) fle.s a1, f10, f13 fle.s a2, f11, f10 and a0, a1, a2 @@ -840,15 +981,13 @@ label200: jal putch ld ra, 0(sp) mv a0, zero - ld s2, 8(sp) - ld s1, 16(sp) - ld s6, 24(sp) - ld s4, 32(sp) - ld s3, 40(sp) - ld s0, 48(sp) - ld s5, 56(sp) - ld s7, 64(sp) - addi sp, sp, 72 + ld s1, 8(sp) + ld s0, 16(sp) + ld s5, 24(sp) + ld s3, 32(sp) + ld s2, 40(sp) + ld s4, 48(sp) + addi sp, sp, 56 ret label632: lui a3, 260096 diff --git a/tests/SysY2022/performance/vector_mul3.arm.s b/tests/SysY2022/performance/vector_mul3.arm.s index a739fb531..934b20d85 100644 --- a/tests/SysY2022/performance/vector_mul3.arm.s +++ b/tests/SysY2022/performance/vector_mul3.arm.s @@ -1,16 +1,16 @@ .arch armv7ve .data .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 4 -.align 8 +.p2align 3 Vectortm: .zero 400000 -.align 8 +.p2align 3 vectorB: .zero 400000 -.align 8 +.p2align 3 vectorA: .zero 400000 .text @@ -28,11 +28,11 @@ main: movt r1, #1 movw r2, #:lower16:cmmc_parallel_body_0 movt r2, #:upper16:cmmc_parallel_body_0 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA movw r0, #:lower16:cmmc_parallel_body_payload_0 movt r0, #:upper16:cmmc_parallel_body_payload_0 - str 
r4, [r0, #0] + str r3, [r0, #0] mov r0, #0 bl cmmcParallelFor mov r2, #0 @@ -49,208 +49,120 @@ label108: movt r5, #:upper16:Vectortm mov r0, r1 vmov s0, r1 - mov r3, r5 + mov r4, r5 .p2align 4 label110: add r5, r1, #1 - movw r4, #34464 - movt r4, #1 - cmp r0, r4 + movw r3, #34464 + movt r3, #1 + cmp r0, r3 bge label115 - add r4, r0, #3 + add r3, r0, #3 movw r6, #34464 movt r6, #1 - cmp r4, r6 + cmp r3, r6 bge label752 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA add r6, r1, #2 add r7, r1, #3 add r8, r1, #4 - add r4, r4, r0, lsl #2 + add r3, r3, r0, lsl #2 b label231 -.p2align 4 -label235: - add r4, r4, #16 -.p2align 4 -label231: - add r10, r1, r0 - add r9, r5, r0 - vldr s2, [r4, #0] - mul r10, r10, r9 - add r10, r10, r10, lsr #31 - asr r10, r10, #1 - add r10, r5, r10 - vmov s1, r10 - add r10, r6, r0 - mul r9, r9, r10 - vcvt.f32.s32 s1, s1 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s1, s2, s1 - add r9, r5, r9 - vldr s2, [r4, #4] - vadd.f32 s1, s0, s1 - vmov s0, r9 - add r9, r7, r0 - mul r10, r10, r9 - vcvt.f32.s32 s0, s0 - add r10, r10, r10, lsr #31 - asr r10, r10, #1 - vdiv.f32 s0, s2, s0 - add r10, r5, r10 - vadd.f32 s2, s1, s0 - vldr s1, [r4, #8] - vmov s0, r10 - add r10, r8, r0 - add r0, r0, #4 - mul r9, r9, r10 - vcvt.f32.s32 s0, s0 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s0, s1, s0 - add r9, r5, r9 - vadd.f32 s1, s2, s0 - vldr s2, [r4, #12] - vmov s0, r9 - movw r9, #34461 - movt r9, #1 - cmp r0, r9 - vcvt.f32.s32 s0, s0 - vdiv.f32 s0, s2, s0 - vadd.f32 s0, s1, s0 - blt label235 - vmov.f32 s1, s0 -.p2align 4 -label220: - movw r4, #34464 - movt r4, #1 - cmp r0, r4 - bge label758 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA - add r4, r4, r0, lsl #2 -.p2align 4 -label225: - add r6, r1, r0 - add r7, r5, r0 - add r0, r0, #1 - mul r6, r6, r7 - add r6, r6, r6, lsr #31 - asr r6, r6, #1 - add r6, r5, r6 - vmov s0, r6 - movw r6, #34464 - movt r6, #1 - cmp r0, r6 - vcvt.f32.s32 s2, s0 - vldr s0, [r4, #0] - vdiv.f32 s0, s0, s2 - vadd.f32 s0, s1, s0 - bge label758 - add r4, r4, #4 - vmov.f32 s1, s0 - b label225 -.p2align 4 -label115: - vstr s0, [r3, #0] - movw r1, #34464 - movt r1, #1 - cmp r5, r1 - bge label254 - add r3, r3, #4 - mov r1, r5 - b label110 label182: mov r0, #76 bl _sysy_stoptime - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA - mov r0, #0 - mov r1, r4 - vmov s0, r0 + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA + mov r1, #0 + mov r0, r3 + vmov s0, r1 b label183 .p2align 4 label187: - add r1, r1, #64 + add r0, r0, #64 vmov.f32 s0, s1 .p2align 4 label183: movw r3, #:lower16:vectorB movt r3, #:upper16:vectorB - vldr s2, [r1, #0] - add r2, r3, r0, lsl #2 - add r0, r0, #16 - vldr s1, [r2, #0] - vmul.f32 s1, s2, s1 - vadd.f32 s2, s0, s1 - vldr s1, [r1, #4] - vldr s0, [r2, #4] - vmul.f32 s0, s1, s0 - vadd.f32 s0, s2, s0 - vldr s2, [r1, #8] - vldr s1, [r2, #8] - vmul.f32 s1, s2, s1 - vadd.f32 s2, s0, s1 - vldr s0, [r1, #12] - vldr s1, [r2, #12] - vmul.f32 s0, s0, s1 - vadd.f32 s1, s2, s0 - vldr s2, [r1, #16] - vldr s0, [r2, #16] - vmul.f32 s0, s2, s0 - vldr s2, [r1, #20] - vadd.f32 s1, s1, s0 - vldr s0, [r2, #20] - vmul.f32 s0, s2, s0 - vldr s2, [r1, #24] - vadd.f32 s1, s1, s0 - vldr s0, [r2, #24] - vmul.f32 s0, s2, s0 - vadd.f32 s2, s1, s0 - vldr s1, [r1, #28] - vldr s0, [r2, #28] - vmul.f32 s0, s1, s0 - vadd.f32 s1, s2, s0 - vldr s2, [r1, #32] - vldr s0, [r2, #32] - vmul.f32 s0, s2, s0 - vadd.f32 s2, s1, s0 - vldr s0, [r1, #36] - vldr s1, [r2, #36] - vmul.f32 
s0, s0, s1 - vldr s1, [r1, #40] - vadd.f32 s2, s2, s0 - vldr s0, [r2, #40] - vmul.f32 s0, s1, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r1, #44] - vldr s1, [r2, #44] - vmul.f32 s0, s0, s1 - vldr s1, [r1, #48] - vadd.f32 s2, s2, s0 - vldr s0, [r2, #48] - vmul.f32 s0, s1, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r1, #52] - vldr s1, [r2, #52] - vmul.f32 s0, s0, s1 - vadd.f32 s1, s2, s0 - vldr s0, [r1, #56] + vldr s1, [r0, #0] + add r2, r3, r1, lsl #2 + add r1, r1, #16 + vldr s2, [r2, #0] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #4] + vldr s2, [r2, #4] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #8] + vldr s2, [r2, #8] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #12] + vldr s2, [r2, #12] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #16] + vldr s2, [r2, #16] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #20] + vldr s2, [r2, #20] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #24] + vldr s2, [r2, #24] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #28] + vldr s2, [r2, #28] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #32] + vldr s2, [r2, #32] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #36] + vldr s2, [r2, #36] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #40] + vldr s2, [r2, #40] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #44] + vldr s2, [r2, #44] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #48] + vldr s2, [r2, #48] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #52] + vldr s2, [r2, #52] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #56] vldr s2, [r2, #56] - vmul.f32 s0, s0, s2 - vadd.f32 s2, s1, s0 - vldr s0, [r1, #60] - vldr s1, [r2, #60] + vmul.f32 s1, s1, s2 + vadd.f32 s0, s0, s1 + vldr s1, [r0, #60] + vldr s2, [r2, #60] movw r2, #34464 movt r2, #1 - cmp r0, r2 - vmul.f32 s0, s0, s1 - vadd.f32 s1, s2, s0 + cmp r1, r2 + vmul.f32 s1, s1, s2 + vadd.f32 s1, s0, s1 blt label187 mov r1, #0 vmov s0, r1 mov r0, r3 + b label189 +.p2align 4 +label193: + add r0, r0, #64 .p2align 4 label189: vldr s2, [r0, #0] @@ -259,135 +171,53 @@ label189: movt r2, #1 cmp r1, r2 vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #4] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #4] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #8] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #12] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #16] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #20] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #12] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #16] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #20] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #24] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #28] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #32] - vmul.f32 s0, s0, s0 - vadd.f32 s2, s2, s0 - vldr s0, [r0, #36] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #28] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #32] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #36] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #40] vmul.f32 s2, s2, s2 vadd.f32 s0, s0, s2 vldr s2, [r0, #44] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #48] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 + vadd.f32 s0, s0, s2 + vldr s2, [r0, #48] 
+ vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 vldr s2, [r0, #52] vmul.f32 s2, s2, s2 vadd.f32 s0, s0, s2 vldr s2, [r0, #56] vmul.f32 s2, s2, s2 - vadd.f32 s2, s0, s2 - vldr s0, [r0, #60] - vmul.f32 s0, s0, s0 - vadd.f32 s0, s2, s0 - bge label194 - add r0, r0, #64 - b label189 -.p2align 4 -label254: - mov r1, #0 - movw r3, #:lower16:vectorB - movt r3, #:upper16:vectorB - mov r0, r1 - vmov s0, r1 - mov r4, r3 -.p2align 4 -label120: - add r6, r1, #1 - movw r3, #34464 - movt r3, #1 - cmp r0, r3 - bge label142 - add r3, r0, #3 - movw r5, #34464 - movt r5, #1 - cmp r3, r5 - bge label267 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r7, r1, #3 - add r8, r1, #4 - add r3, r5, r0, lsl #2 - add r5, r1, #2 -.p2align 4 -label127: - add r9, r1, r0 - add r10, r6, r0 - vldr s2, [r3, #0] - add r11, r0, #1 - mul r9, r9, r10 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - add r9, r9, r11 - add r11, r0, #2 - vmov s1, r9 - add r9, r5, r0 - mul r10, r10, r9 - vcvt.f32.s32 s1, s1 - add r10, r10, r10, lsr #31 - asr r10, r10, #1 - vdiv.f32 s1, s2, s1 - add r10, r10, r11 - vldr s2, [r3, #4] - add r11, r0, #3 - vadd.f32 s0, s0, s1 - vmov s1, r10 - add r10, r7, r0 - mul r9, r9, r10 - vcvt.f32.s32 s1, s1 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s1, s2, s1 - add r9, r9, r11 - vldr s2, [r3, #8] - vadd.f32 s0, s0, s1 - vmov s1, r9 - add r9, r8, r0 - add r0, r0, #4 - mul r9, r10, r9 - vcvt.f32.s32 s1, s1 - add r9, r9, r9, lsr #31 - asr r9, r9, #1 - vdiv.f32 s1, s2, s1 - add r9, r9, r0 - vldr s2, [r3, #12] - vadd.f32 s0, s0, s1 - vmov s1, r9 - movw r9, #34461 - movt r9, #1 - cmp r0, r9 - vcvt.f32.s32 s1, s1 - vdiv.f32 s1, s2, s1 - vadd.f32 s0, s0, s1 - bge label318 - add r3, r3, #16 - b label127 -label194: + vadd.f32 s0, s0, s2 + vldr s2, [r0, #60] + vmul.f32 s2, s2, s2 + vadd.f32 s0, s0, s2 + blt label193 vdiv.f32 s2, s1, s0 mov r0, #1065353216 movw r1, #14269 @@ -395,25 +225,25 @@ label194: vmov s0, r0 movw r0, #14269 movt r0, #13702 - vsub.f32 s1, s0, s2 - vmov s0, r0 + vmov s1, r0 mov r0, #0 - vcmp.f32 s1, s0 - vmov s0, r1 + vsub.f32 s0, s0, s2 + vcmp.f32 s0, s1 + vmov s1, r1 mov r1, #0 vmrs APSR_nzcv, FPSCR movwgt r0, #1 - vcmp.f32 s1, s0 + vcmp.f32 s0, s1 vmrs APSR_nzcv, FPSCR movwmi r1, #1 orrs r0, r0, r1 beq label636 mov r0, #1065353216 - vmov.f32 s0, s2 - vmov s1, r0 + vmov.f32 s1, s2 + vmov s0, r0 .p2align 4 label197: - vadd.f32 s0, s1, s0 + vadd.f32 s0, s0, s1 mov r0, #1056964608 movw r1, #14269 movt r1, #46470 @@ -422,9 +252,9 @@ label197: movt r0, #13702 vmov s4, r0 mov r0, #0 - vmul.f32 s1, s0, s1 - vdiv.f32 s0, s2, s1 - vsub.f32 s3, s1, s0 + vmul.f32 s0, s0, s1 + vdiv.f32 s1, s2, s0 + vsub.f32 s3, s0, s1 vcmp.f32 s3, s4 vmov s4, r1 mov r1, #0 @@ -435,7 +265,6 @@ label197: movwmi r1, #1 orrs r0, r0, r1 bne label197 - vmov.f32 s0, s1 label195: mov r0, #1065353216 movw r1, #14269 @@ -443,15 +272,15 @@ label195: vmov s1, r0 movw r0, #14269 movt r0, #13702 - vsub.f32 s1, s0, s1 - vmov s0, r0 + vsub.f32 s0, s0, s1 + vmov s1, r0 mov r0, #0 - vcmp.f32 s1, s0 - vmov s0, r1 + vcmp.f32 s0, s1 + vmov s1, r1 mov r1, #0 vmrs APSR_nzcv, FPSCR movwls r0, #1 - vcmp.f32 s1, s0 + vcmp.f32 s0, s1 vmrs APSR_nzcv, FPSCR movwge r1, #1 and r0, r0, r1 @@ -462,68 +291,92 @@ label195: mov r0, #0 pop { r4, r5, r6, r7, r8, r9, r10, r11, pc } .p2align 4 -label267: +label752: mov r3, #0 vmov.f32 s1, s0 vmov s2, r3 vmov.f32 s0, s2 .p2align 4 -label132: +label220: movw r3, #34464 movt r3, #1 cmp r0, r3 - bge label323 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r3, r5, r0, lsl #2 + bge 
label758 + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA + add r3, r3, r0, lsl #2 .p2align 4 -label137: - add r5, r1, r0 - add r7, r6, r0 - vldr s2, [r3, #0] +label225: + add r6, r1, r0 + add r7, r5, r0 add r0, r0, #1 - mul r5, r5, r7 - add r5, r5, r5, lsr #31 - asr r5, r5, #1 - add r5, r5, r0 - vmov s0, r5 - movw r5, #34464 - movt r5, #1 - cmp r0, r5 - vcvt.f32.s32 s0, s0 - vdiv.f32 s0, s2, s0 + mul r6, r6, r7 + add r6, r6, r6, lsr #31 + asr r6, r6, #1 + add r6, r5, r6 + vmov s0, r6 + movw r6, #34464 + movt r6, #1 + cmp r0, r6 + vcvt.f32.s32 s2, s0 + vldr s0, [r3, #0] + vdiv.f32 s0, s0, s2 vadd.f32 s0, s1, s0 - bge label323 + bge label758 add r3, r3, #4 vmov.f32 s1, s0 - b label137 + b label225 .p2align 4 -label323: +label758: movw r0, #34464 movt r0, #1 .p2align 4 -label142: +label115: vstr s0, [r4, #0] movw r1, #34464 movt r1, #1 + cmp r5, r1 + bge label254 + add r4, r4, #4 + mov r1, r5 + b label110 +.p2align 4 +label254: + mov r1, #0 + movw r3, #:lower16:vectorB + movt r3, #:upper16:vectorB + mov r0, r1 + vmov s0, r1 +.p2align 4 +label120: + add r6, r1, #1 + movw r4, #34464 + movt r4, #1 + cmp r0, r4 + bge label142 + add r4, r0, #3 + movw r5, #34464 + movt r5, #1 + cmp r4, r5 + bge label267 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r7, r1, #3 + add r8, r1, #4 + add r4, r5, r0, lsl #2 + add r5, r1, #2 + b label127 +.p2align 4 +label142: + vstr s0, [r3, #0] + movw r1, #34464 + movt r1, #1 cmp r6, r1 bge label344 - add r4, r4, #4 + add r3, r3, #4 mov r1, r6 b label120 .p2align 4 -label758: - movw r0, #34464 - movt r0, #1 - b label115 -.p2align 4 -label752: - mov r4, #0 - vmov.f32 s1, s0 - vmov s2, r4 - vmov.f32 s0, s2 - b label220 -.p2align 4 label344: mov r1, #0 movw r5, #:lower16:Vectortm @@ -531,16 +384,6 @@ label344: mov r0, r1 vmov s0, r1 mov r4, r5 - b label147 -.p2align 4 -label169: - vstr s0, [r4, #0] - movw r1, #34464 - movt r1, #1 - cmp r5, r1 - bge label172 - add r4, r4, #4 - mov r1, r5 .p2align 4 label147: add r5, r1, #1 @@ -560,7 +403,103 @@ label147: add r8, r1, #4 add r3, r3, r0, lsl #2 .p2align 4 -label154: +label154: + add r10, r1, r0 + add r9, r5, r0 + vldr s2, [r3, #0] + mul r10, r10, r9 + add r10, r10, r10, lsr #31 + asr r10, r10, #1 + add r10, r5, r10 + vmov s1, r10 + add r10, r6, r0 + mul r9, r9, r10 + vcvt.f32.s32 s1, s1 + add r9, r9, r9, lsr #31 + asr r9, r9, #1 + vdiv.f32 s1, s2, s1 + add r9, r5, r9 + vldr s2, [r3, #4] + vadd.f32 s0, s0, s1 + vmov s1, r9 + add r9, r7, r0 + mul r10, r10, r9 + vcvt.f32.s32 s1, s1 + add r10, r10, r10, lsr #31 + asr r10, r10, #1 + vdiv.f32 s1, s2, s1 + add r10, r5, r10 + vldr s2, [r3, #8] + vadd.f32 s0, s0, s1 + vmov s1, r10 + add r10, r8, r0 + add r0, r0, #4 + mul r9, r9, r10 + vcvt.f32.s32 s1, s1 + add r9, r9, r9, lsr #31 + asr r9, r9, #1 + vdiv.f32 s1, s2, s1 + add r9, r5, r9 + vldr s2, [r3, #12] + vadd.f32 s0, s0, s1 + vmov s1, r9 + movw r9, #34461 + movt r9, #1 + cmp r0, r9 + vcvt.f32.s32 s1, s1 + vdiv.f32 s1, s2, s1 + vadd.f32 s0, s0, s1 + bge label405 + add r3, r3, #16 + b label154 +.p2align 4 +label267: + mov r4, #0 + vmov.f32 s1, s0 + vmov s2, r4 + vmov.f32 s0, s2 +.p2align 4 +label132: + movw r4, #34464 + movt r4, #1 + cmp r0, r4 + bge label323 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r4, r5, r0, lsl #2 + b label137 +.p2align 4 +label141: + add r4, r4, #4 + vmov.f32 s1, s0 +.p2align 4 +label137: + add r5, r1, r0 + add r7, r6, r0 + vldr s2, [r4, #0] + add r0, r0, #1 + mul r5, r5, r7 + add r5, r5, r5, lsr #31 + asr r5, r5, #1 + add r5, r5, r0 + vmov s0, r5 + movw r5, 
#34464 + movt r5, #1 + cmp r0, r5 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + blt label141 +.p2align 4 +label323: + movw r0, #34464 + movt r0, #1 + b label142 +.p2align 4 +label235: + add r3, r3, #16 +.p2align 4 +label231: add r10, r1, r0 add r9, r5, r0 vldr s2, [r3, #0] @@ -577,69 +516,46 @@ label154: vdiv.f32 s1, s2, s1 add r9, r5, r9 vldr s2, [r3, #4] - vadd.f32 s0, s0, s1 - vmov s1, r9 + vadd.f32 s1, s0, s1 + vmov s0, r9 add r9, r7, r0 mul r10, r10, r9 - vcvt.f32.s32 s1, s1 + vcvt.f32.s32 s0, s0 add r10, r10, r10, lsr #31 asr r10, r10, #1 - vdiv.f32 s1, s2, s1 + vdiv.f32 s0, s2, s0 add r10, r5, r10 - vldr s2, [r3, #8] - vadd.f32 s0, s0, s1 - vmov s1, r10 + vadd.f32 s2, s1, s0 + vldr s1, [r3, #8] + vmov s0, r10 add r10, r8, r0 add r0, r0, #4 mul r9, r9, r10 - vcvt.f32.s32 s1, s1 + vcvt.f32.s32 s0, s0 add r9, r9, r9, lsr #31 asr r9, r9, #1 - vdiv.f32 s1, s2, s1 + vdiv.f32 s0, s1, s0 add r9, r5, r9 + vadd.f32 s1, s2, s0 vldr s2, [r3, #12] - vadd.f32 s0, s0, s1 - vmov s1, r9 + vmov s0, r9 movw r9, #34461 movt r9, #1 cmp r0, r9 - vcvt.f32.s32 s1, s1 - vdiv.f32 s1, s2, s1 - vadd.f32 s1, s0, s1 - bge label405 - add r3, r3, #16 - vmov.f32 s0, s1 - b label154 -.p2align 4 -label172: - mov r1, #0 - movw r4, #:lower16:vectorA - movt r4, #:upper16:vectorA - mov r0, r1 - vmov s0, r1 - mov r3, r4 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + blt label235 + vmov.f32 s1, s0 + b label220 .p2align 4 -label173: - add r6, r1, #1 - movw r4, #34464 - movt r4, #1 - cmp r0, r4 - bge label178 - add r4, r0, #3 - movw r5, #34464 - movt r5, #1 - cmp r4, r5 - bge label673 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r7, r1, #3 - add r8, r1, #4 - add r4, r5, r0, lsl #2 - add r5, r1, #2 +label131: + add r4, r4, #16 .p2align 4 -label213: +label127: add r10, r1, r0 add r9, r6, r0 + vldr s2, [r4, #0] add r11, r0, #1 mul r10, r10, r9 add r10, r10, r10, lsr #31 @@ -649,49 +565,51 @@ label213: vmov s1, r10 add r10, r5, r0 mul r9, r9, r10 - vcvt.f32.s32 s2, s1 + vcvt.f32.s32 s1, s1 add r9, r9, r9, lsr #31 - vldr s1, [r4, #0] asr r9, r9, #1 + vdiv.f32 s1, s2, s1 add r9, r9, r11 + vldr s2, [r4, #4] add r11, r0, #3 - vdiv.f32 s1, s1, s2 - vadd.f32 s2, s0, s1 - vldr s1, [r4, #4] - vmov s0, r9 + vadd.f32 s0, s0, s1 + vmov s1, r9 add r9, r7, r0 mul r10, r10, r9 - vcvt.f32.s32 s0, s0 + vcvt.f32.s32 s1, s1 add r10, r10, r10, lsr #31 asr r10, r10, #1 - vdiv.f32 s0, s1, s0 + vdiv.f32 s1, s2, s1 add r10, r10, r11 - vadd.f32 s1, s2, s0 vldr s2, [r4, #8] - vmov s0, r10 + vadd.f32 s0, s0, s1 + vmov s1, r10 add r10, r8, r0 add r0, r0, #4 mul r9, r9, r10 - vcvt.f32.s32 s0, s0 + vcvt.f32.s32 s1, s1 add r9, r9, r9, lsr #31 asr r9, r9, #1 - vdiv.f32 s0, s2, s0 + vdiv.f32 s1, s2, s1 add r9, r9, r0 vldr s2, [r4, #12] - vadd.f32 s1, s1, s0 - vmov s0, r9 + vadd.f32 s0, s0, s1 + vmov s1, r9 movw r9, #34461 movt r9, #1 cmp r0, r9 - vcvt.f32.s32 s0, s0 - vdiv.f32 s0, s2, s0 - vadd.f32 s0, s1, s0 - bge label745 - add r4, r4, #16 - b label213 + vcvt.f32.s32 s1, s1 + vdiv.f32 s1, s2, s1 + vadd.f32 s0, s0, s1 + blt label131 + vmov.f32 s1, s0 + b label132 .p2align 4 -label405: - vmov.f32 s0, s1 +label357: + mov r3, #0 + vmov.f32 s1, s0 + vmov s2, r3 + vmov.f32 s0, s2 .p2align 4 label159: movw r3, #34464 @@ -701,6 +619,11 @@ label159: movw r3, #:lower16:vectorB movt r3, #:upper16:vectorB add r3, r3, r0, lsl #2 + b label164 +.p2align 4 +label168: + add r3, r3, #4 + vmov.f32 s1, s0 .p2align 4 label164: add r6, r1, r0 @@ -711,32 +634,40 @@ label164: add r6, r6, r6, lsr #31 asr r6, r6, #1 add 
r6, r5, r6 - vmov s1, r6 + vmov s0, r6 movw r6, #34464 movt r6, #1 cmp r0, r6 - vcvt.f32.s32 s1, s1 - vdiv.f32 s1, s2, s1 - vadd.f32 s0, s0, s1 - bge label426 - add r3, r3, #4 - b label164 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + blt label168 .p2align 4 -label745: - vmov.f32 s1, s0 +label410: + movw r0, #34464 + movt r0, #1 .p2align 4 -label202: - movw r4, #34464 - movt r4, #1 - cmp r0, r4 - bge label679 - movw r5, #:lower16:Vectortm - movt r5, #:upper16:Vectortm - add r4, r5, r0, lsl #2 - b label207 +label169: + vstr s0, [r4, #0] + movw r1, #34464 + movt r1, #1 + cmp r5, r1 + bge label172 + add r4, r4, #4 + mov r1, r5 + b label147 +.p2align 4 +label172: + mov r1, #0 + movw r3, #:lower16:vectorA + movt r3, #:upper16:vectorA + mov r0, r1 + vmov s0, r1 + mov r4, r3 + b label173 .p2align 4 label211: - add r4, r4, #4 + add r3, r3, #4 vmov.f32 s1, s0 .p2align 4 label207: @@ -752,7 +683,7 @@ label207: movt r5, #1 cmp r0, r5 vcvt.f32.s32 s2, s0 - vldr s0, [r4, #0] + vldr s0, [r3, #0] vdiv.f32 s0, s0, s2 vadd.f32 s0, s1, s0 blt label211 @@ -762,44 +693,111 @@ label679: movt r0, #1 .p2align 4 label178: - vstr s0, [r3, #0] + vstr s0, [r4, #0] movw r1, #34464 movt r1, #1 cmp r6, r1 bge label181 - add r3, r3, #4 + add r4, r4, #4 mov r1, r6 - b label173 +.p2align 4 +label173: + add r6, r1, #1 + movw r3, #34464 + movt r3, #1 + cmp r0, r3 + bge label178 + add r3, r0, #3 + movw r5, #34464 + movt r5, #1 + cmp r3, r5 + bge label673 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r7, r1, #3 + add r8, r1, #4 + add r3, r5, r0, lsl #2 + add r5, r1, #2 +.p2align 4 +label213: + add r9, r1, r0 + add r10, r6, r0 + add r11, r0, #1 + mul r9, r9, r10 + add r9, r9, r9, lsr #31 + asr r9, r9, #1 + add r9, r9, r11 + add r11, r0, #2 + vmov s1, r9 + add r9, r5, r0 + mul r10, r10, r9 + vcvt.f32.s32 s2, s1 + add r10, r10, r10, lsr #31 + vldr s1, [r3, #0] + asr r10, r10, #1 + add r10, r10, r11 + add r11, r0, #3 + vdiv.f32 s1, s1, s2 + vadd.f32 s2, s0, s1 + vmov s0, r10 + add r10, r7, r0 + mul r9, r9, r10 + vcvt.f32.s32 s1, s0 + add r9, r9, r9, lsr #31 + vldr s0, [r3, #4] + asr r9, r9, #1 + add r9, r9, r11 + vdiv.f32 s0, s0, s1 + vldr s1, [r3, #8] + vadd.f32 s2, s2, s0 + vmov s0, r9 + add r9, r8, r0 + add r0, r0, #4 + mul r9, r10, r9 + vcvt.f32.s32 s0, s0 + add r9, r9, r9, lsr #31 + asr r9, r9, #1 + vdiv.f32 s0, s1, s0 + add r9, r9, r0 + vadd.f32 s1, s2, s0 + vldr s2, [r3, #12] + vmov s0, r9 + movw r9, #34461 + movt r9, #1 + cmp r0, r9 + vcvt.f32.s32 s0, s0 + vdiv.f32 s0, s2, s0 + vadd.f32 s0, s1, s0 + bge label745 + add r3, r3, #16 + b label213 +.p2align 4 +label745: + vmov.f32 s1, s0 +.p2align 4 +label202: + movw r3, #34464 + movt r3, #1 + cmp r0, r3 + bge label679 + movw r5, #:lower16:Vectortm + movt r5, #:upper16:Vectortm + add r3, r5, r0, lsl #2 + b label207 .p2align 4 label673: - mov r4, #0 + mov r3, #0 vmov.f32 s1, s0 - vmov s2, r4 + vmov s2, r3 vmov.f32 s0, s2 b label202 -.p2align 4 -label426: - movw r0, #34464 - movt r0, #1 - b label169 -.p2align 4 -label410: - movw r0, #34464 - movt r0, #1 - vmov.f32 s0, s1 - b label169 label636: mov r0, #1065353216 vmov s0, r0 b label195 .p2align 4 -label318: +label405: vmov.f32 s1, s0 - b label132 -.p2align 4 -label357: - mov r3, #0 - vmov s1, r3 b label159 .p2align 4 cmmc_parallel_body_0: diff --git a/tests/SysY2022/performance/vector_mul3.riscv.s b/tests/SysY2022/performance/vector_mul3.riscv.s index 3198622f7..97bf7f282 100644 --- a/tests/SysY2022/performance/vector_mul3.riscv.s +++ b/tests/SysY2022/performance/vector_mul3.riscv.s @@ 
-1,668 +1,805 @@ .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zba1p0_zbb1p0" .data .section .rodata -.align 4 +.p2align 2 __cmmc_fp_constant_pool: .4byte 897988541 .4byte 3045472189 .bss -.align 8 +.p2align 3 cmmc_parallel_body_payload_0: .zero 8 -.align 8 +.p2align 3 Vectortm: .zero 400000 -.align 8 +.p2align 3 vectorB: .zero 400000 -.align 8 +.p2align 3 vectorA: .zero 400000 .text .p2align 2 .globl main main: - addi sp, sp, -72 + addi sp, sp, -56 li a0, 62 sd ra, 0(sp) - sd s2, 8(sp) - sd s1, 16(sp) - sd s6, 24(sp) - sd s4, 32(sp) - sd s3, 40(sp) - sd s0, 48(sp) - sd s5, 56(sp) - sd s7, 64(sp) + sd s1, 8(sp) + sd s0, 16(sp) + sd s5, 24(sp) + sd s3, 32(sp) + sd s2, 40(sp) + sd s4, 48(sp) jal _sysy_starttime lui a3, 24 -pcrel1032: +pcrel1055: auipc a0, %pcrel_hi(cmmc_parallel_body_payload_0) -pcrel1033: +pcrel1056: auipc a1, %pcrel_hi(vectorA) - addi s2, a1, %pcrel_lo(pcrel1033) -pcrel1034: + addi s1, a1, %pcrel_lo(pcrel1056) +pcrel1057: auipc a1, %pcrel_hi(cmmc_parallel_body_0) - sd s2, %pcrel_lo(pcrel1032)(a0) - addi a2, a1, %pcrel_lo(pcrel1034) + sd s1, %pcrel_lo(pcrel1055)(a0) + addi a2, a1, %pcrel_lo(pcrel1057) mv a0, zero addiw a1, a3, 1696 jal cmmcParallelFor + lui s3, 258048 + mv a2, zero lui a4, 24 - lui s4, 258048 - mv a3, zero -pcrel1035: - auipc a2, %pcrel_hi(Vectortm) -pcrel1036: +pcrel1058: + auipc a3, %pcrel_hi(Vectortm) +pcrel1059: auipc a0, %pcrel_hi(vectorB) - addi a1, a2, %pcrel_lo(pcrel1035) - addi s1, a0, %pcrel_lo(pcrel1036) -pcrel1037: - auipc a2, %pcrel_hi(__cmmc_fp_constant_pool) + addi a1, a3, %pcrel_lo(pcrel1058) + addi s0, a0, %pcrel_lo(pcrel1059) +pcrel1060: + auipc a3, %pcrel_hi(__cmmc_fp_constant_pool) addiw a0, a4, 1693 - addi s3, a2, %pcrel_lo(pcrel1037) - addi s0, a0, 3 - li a2, 1000 + addi s2, a3, %pcrel_lo(pcrel1060) j label110 .p2align 2 -label958: - addiw a3, a3, 1 - bge a3, a2, label184 +label981: + addiw a2, a2, 1 + li a3, 1000 + bge a2, a3, label184 .p2align 2 label110: fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label117 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label117 fsw f10, 0(a1) - blt a4, s0, label237 - mv a5, s1 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label237 + mv a4, s0 + mv a5, zero + addiw t0, t1, 1696 + blt zero, t0, label220 j label143 .p2align 2 label277: - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label953 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label976 .p2align 2 label237: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label970 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label993 .p2align 2 label117: - addiw t1, t5, 3 - bge t1, s0, label256 - sh2add t1, t5, s2 - addiw t2, t0, 2 - addiw t3, t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge t0, t1, label256 + sh2add t0, t4, s1 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 j label129 .p2align 2 label133: - addi t1, t1, 16 + addi t0, t0, 16 .p2align 2 label129: - addw a7, t0, t5 - addw a6, a4, t5 - flw f14, 0(t1) - mulw t6, a7, a6 - srliw s6, t6, 31 - add s5, t6, s6 - sraiw a7, s5, 1 - addw t6, a4, a7 + addw a6, a5, t4 + addw t5, a3, t4 + flw f14, 0(t0) + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + sraiw a6, s4, 1 + addw t6, a3, a6 fcvt.s.w f12, t6 - addw t6, t2, t5 + addw t6, t1, t4 fdiv.s f13, f14, f12 - mulw a7, a6, t6 - srliw s6, a7, 31 - add s7, a7, s6 - sraiw s5, s7, 1 - addw 
a6, a4, s5 + mulw a6, t5, t6 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw s5, a7, 1 + addw t5, a3, s5 + fcvt.s.w f12, t5 + addw t5, t2, t4 + mulw a6, t6, t5 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw t6, a7, 1 + addw a6, a3, t6 fadd.s f11, f10, f13 - flw f13, 4(t1) - fcvt.s.w f10, a6 - addw a6, t3, t5 - fdiv.s f14, f13, f10 - mulw a7, t6, a6 - flw f13, 8(t1) - srliw s6, a7, 31 - add s5, a7, s6 - addw a7, t4, t5 - sraiw s7, s5, 1 - addiw t5, t5, 4 - addw t6, a4, s7 - fadd.s f12, f11, f14 - fcvt.s.w f11, t6 - mulw t6, a6, a7 - fdiv.s f14, f13, f11 - srliw s5, t6, 31 - add a7, t6, s5 - sraiw s6, a7, 1 - addw a6, a4, s6 - fcvt.s.w f11, a6 - fadd.s f10, f12, f14 - flw f12, 12(t1) - fdiv.s f13, f12, f11 - fadd.s f10, f10, f13 - blt t5, a0, label133 + flw f13, 4(t0) + fdiv.s f14, f13, f12 + flw f13, 8(t0) + fcvt.s.w f12, a6 + addw a6, t3, t4 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw a7, t6, 31 + add s4, t6, a7 + sraiw t5, s4, 1 + addw a6, a3, t5 + fadd.s f10, f11, f14 + fdiv.s f14, f13, f12 + flw f13, 12(t0) + fcvt.s.w f12, a6 + fadd.s f11, f10, f14 + fdiv.s f14, f13, f12 + fadd.s f10, f11, f14 + blt t4, a0, label133 fmv.s f11, f10 - bge t5, s0, label954 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label977 .p2align 2 label122: - sh2add t1, t5, s2 - mv t2, t5 + sh2add t0, t4, s1 + mv t1, t4 fmv.s f10, f11 .p2align 2 label123: - addw t4, t0, t2 - addw a6, a4, t2 - flw f12, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, a6 - srliw t5, t3, 31 - add t6, t3, t5 + addw t3, a5, t1 + addw t4, a3, t1 + flw f12, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t4 + srliw t5, t2, 31 + add t6, t2, t5 sraiw t4, t6, 1 - addw a6, a4, t4 - fcvt.s.w f11, a6 + addw t3, a3, t4 + fcvt.s.w f11, t3 + lui t3, 24 fdiv.s f13, f12, f11 + addiw t2, t3, 1696 fadd.s f10, f10, f13 - bge t2, s0, label277 - addi t1, t1, 4 + bge t1, t2, label277 + addi t0, t0, 4 j label123 .p2align 2 label768: - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label968 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label991 .p2align 2 label146: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label955 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label978 .p2align 2 label220: - addiw t1, t5, 3 - bge t1, s0, label747 - sh2add t1, t5, a1 - addiw t2, t0, 2 - addiw t3, t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge t0, t1, label747 + sh2add t0, t4, a1 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 j label232 .p2align 2 label236: - addi t1, t1, 16 + addi t0, t0, 16 .p2align 2 label232: - addw a7, t0, t5 - addw a6, a4, t5 - flw f14, 0(t1) - addiw s7, t5, 3 - mulw t6, a7, a6 - srliw s6, t6, 31 - add s5, t6, s6 - addiw t6, t5, 1 - sraiw a7, s5, 1 - addw s6, a7, t6 - addw t6, t2, t5 - mulw a7, a6, t6 - fcvt.s.w f12, s6 - srliw s5, a7, 31 - add s6, a7, s5 + addw a6, a5, t4 + addw t5, a3, t4 + flw f14, 0(t0) + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + addiw t6, t4, 1 + sraiw a6, s4, 1 + addw a7, a6, t6 + addw t6, t1, t4 + mulw a6, t5, t6 + fcvt.s.w f12, a7 + srliw a7, a6, 31 fdiv.s f13, f14, f12 - addiw a7, t5, 2 - sraiw a6, s6, 1 - flw f14, 4(t1) - addw s5, a6, a7 - addw a6, t3, t5 - mulw a7, t6, a6 - fcvt.s.w f12, s5 - srliw s6, a7, 31 - add s5, a7, s6 - sraiw t6, s5, 1 - addw a7, t6, s7 + add s4, a6, a7 + flw f14, 4(t0) + addiw a7, t4, 2 + sraiw t5, s4, 1 + addw a6, t5, a7 + addw t5, t2, t4 + fcvt.s.w f12, a6 + mulw a6, t6, t5 + srliw a7, a6, 31 + add s4, a6, a7 + addiw a7, t4, 3 + sraiw t6, s4, 1 + addw a6, 
t6, a7 fadd.s f11, f10, f13 fdiv.s f13, f14, f12 - flw f14, 8(t1) - fcvt.s.w f12, a7 - addw a7, t4, t5 - addiw t5, t5, 4 - mulw t6, a6, a7 - srliw s5, t6, 31 - add a7, t6, s5 - sraiw a6, a7, 1 - addw t6, a6, t5 + flw f14, 8(t0) + fcvt.s.w f12, a6 + addw a6, t3, t4 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw a7, t6, 31 + add a6, t6, a7 + sraiw t5, a6, 1 + addw t6, t5, t4 fadd.s f10, f11, f13 fdiv.s f13, f14, f12 - flw f14, 12(t1) + flw f14, 12(t0) fcvt.s.w f12, t6 fadd.s f11, f10, f13 fdiv.s f13, f14, f12 fadd.s f10, f11, f13 - blt t5, a0, label236 + blt t4, a0, label236 fmv.s f12, f10 - bge t5, s0, label969 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label992 .p2align 2 label225: - sh2add t1, t5, a1 - mv t2, t5 + sh2add t0, t4, a1 + mv t1, t4 fmv.s f10, f12 .p2align 2 label226: - addw t4, t0, t2 - addw t5, a4, t2 - flw f13, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, t5 - srliw t6, t3, 31 - add a6, t3, t6 - sraiw t4, a6, 1 - addw t3, t4, t2 - fcvt.s.w f11, t3 + addw t3, a5, t1 + addw t4, a3, t1 + flw f13, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t4 + srliw t6, t2, 31 + add t5, t2, t6 + sraiw t3, t5, 1 + addw t2, t3, t1 + lui t3, 24 + fcvt.s.w f11, t2 + addiw t2, t3, 1696 fdiv.s f12, f13, f11 fadd.s f10, f10, f12 - bge t2, s0, label768 - addi t1, t1, 4 + bge t1, t2, label768 + addi t0, t0, 4 j label226 .p2align 2 label256: fmv.w.x f12, zero fmv.s f11, f10 fmv.s f10, f12 - blt t5, s0, label122 - mv t5, s0 - fsw f12, 0(a5) - blt a4, s0, label237 + lui t1, 24 + addiw t0, t1, 1696 + blt t4, t0, label122 + lui a5, 24 + lui t0, 24 + fsw f12, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label237 fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + addiw t0, t1, 1696 + blt zero, t0, label220 j label143 .p2align 2 label747: fmv.w.x f11, zero fmv.s f12, f10 fmv.s f10, f11 - blt t5, s0, label225 - mv t5, s0 - fsw f11, 0(a5) - blt a4, s0, label146 + lui t1, 24 + addiw t0, t1, 1696 + blt t4, t0, label225 + lui a5, 24 + lui t0, 24 + fsw f11, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label146 fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + addiw t0, t1, 1696 + blt zero, t0, label203 fsw f10, 0(a1) - blt a4, s0, label202 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 j label156 .p2align 2 label219: - addi t1, t1, 4 + addi t0, t0, 4 .p2align 2 label215: - addw t4, t0, t2 - addw t6, a4, t2 - flw f13, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, t6 - srliw a6, t3, 31 - add t5, t3, a6 - sraiw t4, t5, 1 - addw t6, a4, t4 - fcvt.s.w f11, t6 + addw t3, a5, t1 + addw t4, a3, t1 + flw f13, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t4 + srliw t6, t2, 31 + add t5, t2, t6 + sraiw t3, t5, 1 + addw t4, a3, t3 + lui t3, 24 + addiw t2, t3, 1696 + fcvt.s.w f11, t4 fdiv.s f12, f13, f11 fadd.s f10, f10, f12 - blt t2, s0, label219 - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label966 + blt t1, t2, label219 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label989 .p2align 2 label202: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label962 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label985 .p2align 2 label203: - addiw t1, t5, 3 - bge t1, s0, label673 - sh2add t1, t5, s1 - addiw t2, t0, 2 - addiw t3, t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui t2, 24 + addiw t1, t2, 1696 + bge t0, t1, 
label673 + sh2add t0, t4, s0 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 .p2align 2 label205: - addw a7, t0, t5 - addw t6, a4, t5 - flw f13, 0(t1) - mulw a6, a7, t6 - srliw s5, a6, 31 - add a7, a6, s5 - sraiw s6, a7, 1 - addw a6, a4, s6 - fcvt.s.w f11, a6 - addw a6, t2, t5 - fdiv.s f14, f13, f11 - mulw a7, t6, a6 - flw f13, 4(t1) - srliw s5, a7, 31 - add s6, a7, s5 - sraiw t6, s6, 1 - addw a7, a4, t6 - addw t6, t3, t5 - fadd.s f12, f10, f14 - fcvt.s.w f10, a7 - mulw a7, a6, t6 - fdiv.s f14, f13, f10 - srliw s5, a7, 31 - add s6, a7, s5 - sraiw a6, s6, 1 - addw a7, a4, a6 - fadd.s f11, f12, f14 - flw f14, 8(t1) - fcvt.s.w f12, a7 - addw a7, t4, t5 + addw a6, a5, t4 + addw t5, a3, t4 + flw f13, 0(t0) + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + sraiw a6, s4, 1 + addw t6, a3, a6 + fcvt.s.w f12, t6 + addw t6, t1, t4 + fdiv.s f14, f13, f12 + mulw a6, t5, t6 + flw f13, 4(t0) + srliw a7, a6, 31 + add t5, a6, a7 + sraiw s4, t5, 1 + addw t5, t2, t4 + addw a6, a3, s4 + fcvt.s.w f12, a6 + mulw a6, t6, t5 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw t6, a7, 1 + addw a6, a3, t6 + fadd.s f11, f10, f14 + fdiv.s f14, f13, f12 + flw f13, 8(t0) + fcvt.s.w f12, a6 + addw a6, t3, t4 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw a7, t6, 31 + add t5, t6, a7 + sraiw s4, t5, 1 + addw a6, a3, s4 + fadd.s f10, f11, f14 + fdiv.s f14, f13, f12 + fcvt.s.w f12, a6 + fadd.s f11, f10, f14 + flw f14, 12(t0) fdiv.s f13, f14, f12 - addiw t5, t5, 4 - mulw a6, t6, a7 - srliw s6, a6, 31 - add a7, a6, s6 - sraiw s5, a7, 1 - addw t6, a4, s5 fadd.s f10, f11, f13 - flw f13, 12(t1) - fcvt.s.w f11, t6 - fdiv.s f12, f13, f11 - fadd.s f10, f10, f12 - bge t5, a0, label720 - addi t1, t1, 16 + bge t4, a0, label720 + addi t0, t0, 16 j label205 .p2align 2 label720: fmv.s f11, f10 - bge t5, s0, label964 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label987 .p2align 2 label214: - sh2add t1, t5, s1 - mv t2, t5 + sh2add t0, t4, s0 + mv t1, t4 fmv.s f10, f11 j label215 .p2align 2 label673: fmv.w.x f12, zero fmv.s f11, f10 + lui t1, 24 fmv.s f10, f12 - blt t5, s0, label214 - mv t5, s0 - fsw f12, 0(a5) - blt a4, s0, label202 + addiw t0, t1, 1696 + blt t4, t0, label214 + lui a5, 24 + lui t0, 24 + fsw f12, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label202 fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + addiw t0, t1, 1696 + blt zero, t0, label162 j label356 .p2align 2 label172: - addi t1, t1, 4 + addi t0, t0, 4 .p2align 2 label168: - addw t4, t0, t2 - addw t6, a4, t2 - flw f12, 0(t1) - addiw t2, t2, 1 - mulw t3, t4, t6 - srliw a6, t3, 31 - add t5, t3, a6 - sraiw t4, t5, 1 - addw t3, t4, t2 - fcvt.s.w f11, t3 + addw t3, a5, t1 + addw t6, a3, t1 + flw f12, 0(t0) + addiw t1, t1, 1 + mulw t2, t3, t6 + srliw t4, t2, 31 + add t5, t2, t4 + sraiw t3, t5, 1 + addw t2, t3, t1 + lui t3, 24 + fcvt.s.w f11, t2 + addiw t2, t3, 1696 fdiv.s f13, f12, f11 fadd.s f10, f10, f13 - blt t2, s0, label172 - mv t5, s0 - fsw f10, 0(a5) - bge a4, s0, label958 + blt t1, t2, label172 + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + bge a3, a5, label981 .p2align 2 label182: - addi a5, a5, 4 - mv t0, a4 - addiw a4, a4, 1 - bge t5, s0, label961 + addi a4, a4, 4 + mv a5, a3 + lui t1, 24 + addiw a3, a3, 1 + addiw t0, t1, 1696 + bge t4, t0, label984 .p2align 2 label162: - addiw t1, t5, 3 - bge t1, s0, label361 - sh2add t1, t5, a1 - addiw t2, t0, 2 - addiw t3, t0, 3 - addiw t4, t0, 4 + addiw t0, t4, 3 + lui 
t2, 24 + addiw t1, t2, 1696 + bge t0, t1, label361 + sh2add t0, t4, a1 + addiw t1, a5, 2 + addiw t2, a5, 3 + addiw t3, a5, 4 .p2align 2 label174: - addw a7, t0, t5 - addw a6, a4, t5 - addiw s6, t5, 1 - mulw t6, a7, a6 - flw f13, 0(t1) - srliw s5, t6, 31 - add s7, t6, s5 - sraiw a7, s7, 1 - addiw s7, t5, 3 - addw t6, a7, s6 + addw a6, a5, t4 + addw t5, a3, t4 + flw f13, 0(t0) + addiw s5, t4, 3 + mulw t6, a6, t5 + srliw a7, t6, 31 + add s4, t6, a7 + addiw a7, t4, 1 + sraiw a6, s4, 1 + addw t6, a6, a7 fcvt.s.w f12, t6 - addw t6, t2, t5 + addw t6, t1, t4 fdiv.s f14, f13, f12 - mulw a7, a6, t6 - flw f13, 4(t1) - srliw s6, a7, 31 - add s5, a7, s6 - addiw a7, t5, 2 - sraiw a6, s5, 1 - addw s6, a6, a7 - addw a6, t3, t5 - mulw a7, t6, a6 - fcvt.s.w f12, s6 - srliw s6, a7, 31 - add s5, a7, s6 - sraiw t6, s5, 1 - addw a7, t6, s7 + mulw a6, t5, t6 + flw f13, 4(t0) + srliw a7, a6, 31 + add s4, a6, a7 + addiw a7, t4, 2 + sraiw t5, s4, 1 + addw a6, t5, a7 + addw t5, t2, t4 fadd.s f11, f10, f14 - fdiv.s f14, f13, f12 - fcvt.s.w f12, a7 - addw a7, t4, t5 - addiw t5, t5, 4 - mulw t6, a6, a7 - srliw s5, t6, 31 - add a7, t6, s5 - sraiw a6, a7, 1 - addw t6, a6, t5 - fadd.s f10, f11, f14 - flw f14, 8(t1) - fdiv.s f13, f14, f12 - fcvt.s.w f12, t6 - fadd.s f11, f10, f13 - flw f13, 12(t1) - fdiv.s f14, f13, f12 - fadd.s f10, f11, f14 - bge t5, a0, label432 - addi t1, t1, 16 + fcvt.s.w f10, a6 + mulw a6, t6, t5 + fdiv.s f14, f13, f10 + srliw s4, a6, 31 + add a7, a6, s4 + sraiw t6, a7, 1 + addw a6, t6, s5 + fadd.s f12, f11, f14 + flw f14, 8(t0) + fcvt.s.w f11, a6 + addw a6, t3, t4 + fdiv.s f13, f14, f11 + addiw t4, t4, 4 + mulw t6, t5, a6 + srliw s4, t6, 31 + add a7, t6, s4 + sraiw t5, a7, 1 + addw t6, t5, t4 + fcvt.s.w f11, t6 + fadd.s f10, f12, f13 + flw f12, 12(t0) + fdiv.s f13, f12, f11 + fadd.s f10, f10, f13 + bge t4, a0, label432 + addi t0, t0, 16 j label174 .p2align 2 label432: - fmv.s f12, f10 - bge t5, s0, label959 + fmv.s f11, f10 + lui t1, 24 + addiw t0, t1, 1696 + bge t4, t0, label982 .p2align 2 label167: - sh2add t1, t5, a1 - mv t2, t5 - fmv.s f10, f12 + sh2add t0, t4, a1 + mv t1, t4 + fmv.s f10, f11 j label168 .p2align 2 label361: - fmv.w.x f11, zero - fmv.s f12, f10 - fmv.s f10, f11 - blt t5, s0, label167 - mv t5, s0 - fsw f11, 0(a5) - blt a4, s0, label182 - addiw a3, a3, 1 - blt a3, a2, label110 + fmv.w.x f12, zero + fmv.s f11, f10 + fmv.s f10, f12 + lui t1, 24 + addiw t0, t1, 1696 + blt t4, t0, label167 + lui a5, 24 + lui t0, 24 + fsw f12, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label182 + addiw a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label959: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label182 - addiw a3, a3, 1 - blt a3, a2, label110 +label982: + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label182 + addiw a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label954: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label237 +label977: + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label237 fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 - fsw f10, 0(s1) - blt a4, s0, label146 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label220 + fsw f10, 0(s0) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 j label339 .p2align 2 -label964: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label202 +label987: + lui a5, 24 + lui t0, 24 + fsw f10, 
0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label202 fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 - fsw f10, 0(s2) - blt a4, s0, label182 - j label956 -.p2align 2 -label969: - mv t5, s0 - fsw f10, 0(a5) - blt a4, s0, label146 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label162 + fsw f10, 0(s1) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 + j label979 +.p2align 2 +label992: + lui a5, 24 + lui t0, 24 + fsw f10, 0(a4) + addiw t4, a5, 1696 + addiw a5, t0, 1696 + blt a3, a5, label146 fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 fsw f10, 0(a1) - blt a4, s0, label202 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 j label156 .p2align 2 -label961: - fsw f10, 0(a5) - blt a4, s0, label182 - addiw a3, a3, 1 - blt a3, a2, label110 +label984: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 + addiw a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label970: - fsw f10, 0(a5) - blt a4, s0, label237 +label993: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label237 fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label220 label143: - fsw f10, 0(a5) - blt a4, s0, label146 + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 j label339 .p2align 2 -label962: - fsw f10, 0(a5) - blt a4, s0, label202 +label985: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label162 label356: - fsw f10, 0(a5) - blt a4, s0, label182 - j label956 + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 + j label979 .p2align 2 -label955: - fsw f10, 0(a5) - blt a4, s0, label146 +label978: + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 label153: - fsw f10, 0(a5) - blt a4, s0, label202 + fsw f10, 0(a4) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 j label156 .p2align 2 -label953: +label976: fmv.w.x f10, zero - mv a5, s1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label220 - fsw f10, 0(s1) - blt a4, s0, label146 + mv a4, s0 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label220 + fsw f10, 0(s0) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label146 label339: fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 j label153 .p2align 2 -label966: +label989: fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 - fsw f10, 0(s2) - blt a4, s0, label182 -label956: - addiw a3, a3, 1 - blt a3, a2, label110 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, 
label162 + fsw f10, 0(s1) + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label182 +label979: + addiw a2, a2, 1 + li a3, 1000 + blt a2, a3, label110 j label184 .p2align 2 -label968: +label991: fmv.w.x f10, zero - mv a5, a1 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label203 + mv a4, a1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label203 fsw f10, 0(a1) - blt a4, s0, label202 + lui t0, 24 + addiw a5, t0, 1696 + blt a3, a5, label202 label156: fmv.w.x f10, zero - mv a5, s2 - mv t0, zero - mv t5, zero - li a4, 1 - blt zero, s0, label162 + mv a4, s1 + mv a5, zero + mv t4, zero + li a3, 1 + lui t1, 24 + addiw t0, t1, 1696 + blt zero, t0, label162 j label356 label184: li a0, 76 @@ -672,154 +809,158 @@ label184: j label185 .p2align 2 label189: - addi s2, s2, 64 + addi s1, s1, 64 .p2align 2 label185: - sh2add a1, a0, s1 - flw f12, 0(s2) + sh2add a1, a0, s0 + flw f12, 0(s1) + lui a2, 24 addiw a0, a0, 16 flw f14, 0(a1) - flw f13, 4(s2) + flw f13, 4(s1) fmul.s f15, f12, f14 flw f14, 4(a1) - flw f12, 8(s2) + flw f12, 8(s1) flw f0, 8(a1) fadd.s f11, f10, f15 fmul.s f15, f13, f14 - flw f13, 12(s2) + flw f13, 12(s1) fmul.s f14, f12, f0 fadd.s f10, f11, f15 flw f15, 12(a1) - flw f12, 16(s2) + flw f12, 16(s1) fadd.s f11, f10, f14 fmul.s f14, f13, f15 flw f13, 16(a1) fmul.s f15, f12, f13 fadd.s f10, f11, f14 - flw f14, 20(s2) + flw f14, 20(s1) flw f12, 20(a1) - flw f13, 24(s2) + flw f13, 24(s1) fadd.s f11, f10, f15 fmul.s f15, f14, f12 flw f14, 24(a1) - flw f12, 28(s2) + flw f12, 28(s1) fadd.s f10, f11, f15 fmul.s f15, f13, f14 flw f14, 28(a1) - flw f13, 32(s2) + flw f13, 32(s1) fmul.s f0, f12, f14 fadd.s f11, f10, f15 flw f15, 32(a1) - flw f14, 36(s2) + flw f14, 36(s1) fadd.s f10, f11, f0 fmul.s f11, f13, f15 flw f15, 36(a1) - flw f13, 40(s2) + flw f13, 40(s1) fadd.s f12, f10, f11 fmul.s f10, f14, f15 flw f14, 40(a1) fmul.s f0, f13, f14 fadd.s f11, f12, f10 - flw f12, 44(s2) + flw f12, 44(s1) flw f15, 44(a1) - flw f13, 48(s2) + flw f13, 48(s1) fmul.s f14, f12, f15 fadd.s f10, f11, f0 flw f15, 48(a1) fmul.s f12, f13, f15 fadd.s f11, f10, f14 - flw f14, 52(s2) + flw f14, 52(s1) flw f15, 52(a1) - flw f13, 56(s2) + flw f13, 56(s1) fadd.s f10, f11, f12 fmul.s f11, f14, f15 flw f15, 56(a1) - flw f14, 60(s2) + flw f14, 60(s1) fadd.s f12, f10, f11 fmul.s f10, f13, f15 flw f13, 60(a1) + addiw a1, a2, 1696 fadd.s f11, f12, f10 fmul.s f12, f14, f13 fadd.s f10, f11, f12 - blt a0, s0, label189 + blt a0, a1, label189 fmv.w.x f11, zero mv a0, zero j label191 .p2align 2 label195: - addi s1, s1, 64 + addi s0, s0, 64 .p2align 2 label191: - flw f13, 0(s1) + flw f13, 0(s0) addiw a0, a0, 16 - flw f14, 4(s1) + lui a2, 24 + flw f14, 4(s0) fmul.s f15, f13, f13 + addiw a1, a2, 1696 fmul.s f1, f14, f14 fadd.s f12, f11, f15 - flw f15, 8(s1) - flw f14, 12(s1) + flw f15, 8(s0) + flw f14, 12(s0) fmul.s f0, f15, f15 fadd.s f13, f12, f1 fmul.s f1, f14, f14 fadd.s f11, f13, f0 - flw f13, 16(s1) - flw f14, 20(s1) + flw f13, 16(s0) + flw f14, 20(s0) fmul.s f15, f13, f13 fmul.s f0, f14, f14 - flw f13, 24(s1) + flw f13, 24(s0) fadd.s f12, f11, f1 - flw f14, 28(s1) + flw f14, 28(s0) fadd.s f11, f12, f15 fmul.s f15, f13, f13 fadd.s f12, f11, f0 fmul.s f0, f14, f14 fadd.s f11, f12, f15 - flw f15, 32(s1) - flw f14, 36(s1) + flw f15, 32(s0) + flw f14, 36(s0) fmul.s f1, f15, f15 fadd.s f13, f11, f0 fmul.s f0, f14, f14 fadd.s f12, f13, f1 - flw f13, 40(s1) - flw f14, 44(s1) + flw f13, 40(s0) + flw f14, 44(s0) fmul.s f15, f13, f13 - flw f13, 48(s1) + flw f13, 48(s0) fadd.s f11, f12, f0 fmul.s 
f0, f14, f14 - flw f14, 52(s1) + flw f14, 52(s0) fadd.s f12, f11, f15 fmul.s f15, f13, f13 - flw f13, 56(s1) + flw f13, 56(s0) fadd.s f11, f12, f0 fmul.s f0, f14, f14 - flw f14, 60(s1) + flw f14, 60(s0) fadd.s f12, f11, f15 fmul.s f15, f13, f13 fmul.s f13, f14, f14 fadd.s f11, f12, f0 fadd.s f12, f11, f15 fadd.s f11, f12, f13 - blt a0, s0, label195 + blt a0, a1, label195 fdiv.s f10, f10, f11 lui a0, 260096 - flw f13, 0(s3) + flw f13, 0(s2) fmv.w.x f12, a0 fsub.s f11, f12, f10 - flw f12, 4(s3) + flw f12, 4(s2) flt.s a0, f13, f11 - flt.s a2, f11, f12 - or a1, a0, a2 - beq a1, zero, label632 + flt.s a1, f11, f12 + or a2, a0, a1 + beq a2, zero, label632 lui a0, 260096 fmv.s f12, f10 fmv.w.x f11, a0 .p2align 2 label197: fadd.s f14, f11, f12 - fmv.w.x f15, s4 + fmv.w.x f15, s3 fmul.s f11, f14, f15 - flw f14, 0(s3) - flw f15, 4(s3) + flw f14, 0(s2) + flw f15, 4(s2) fdiv.s f12, f10, f11 fsub.s f13, f11, f12 flt.s a2, f13, f15 @@ -828,10 +969,10 @@ label197: bne a1, zero, label197 label200: lui a0, 260096 - flw f13, 0(s3) + flw f13, 0(s2) fmv.w.x f12, a0 fsub.s f10, f11, f12 - flw f11, 4(s3) + flw f11, 4(s2) fle.s a1, f10, f13 fle.s a2, f11, f10 and a0, a1, a2 @@ -840,15 +981,13 @@ label200: jal putch ld ra, 0(sp) mv a0, zero - ld s2, 8(sp) - ld s1, 16(sp) - ld s6, 24(sp) - ld s4, 32(sp) - ld s3, 40(sp) - ld s0, 48(sp) - ld s5, 56(sp) - ld s7, 64(sp) - addi sp, sp, 72 + ld s1, 8(sp) + ld s0, 16(sp) + ld s5, 24(sp) + ld s3, 32(sp) + ld s2, 40(sp) + ld s4, 48(sp) + addi sp, sp, 56 ret label632: lui a3, 260096