How to get llc to optimize an ll file?
I am learning how LLVM IR and llc work. I got started by looking at the output of clang. I thought llc could be used to optimize .ll files further, but I can't get it to make any changes. I used clang to generate a .ll file of a tail-call version of factorial, using -O0. Then I handed that .ll file to llc with -O3, and then to opt with -O3. The output was identical. Not even the allocas were turned into registers, and --mem2reg made no difference. Is there something in the .ll file created by clang that prevents llc from optimizing further, or am I just totally off base?
The original .c:
long factorialtail(long n, long fact) {
if (n == 1)
return fact;
else
return factorialtail(n - 1L, n * fact);
}
long factorial(long x) {
return factorialtail(x, 1);
}
The output from clang -S -emit-llvm -O0 factorial.c:
; ModuleID = 'factorial.c'
source_filename = "factorial.c"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-macosx11.0.0"
; Function Attrs: noinline nounwind optnone ssp uwtable
define i64 @factorialtail(i64 %0, i64 %1) #0 {
%3 = alloca i64, align 8
%4 = alloca i64, align 8
%5 = alloca i64, align 8
store i64 %0, i64* %4, align 8
store i64 %1, i64* %5, align 8
%6 = load i64, i64* %4, align 8
%7 = icmp eq i64 %6, 1
br i1 %7, label %8, label %10
8: ; preds = %2
%9 = load i64, i64* %5, align 8
store i64 %9, i64* %3, align 8
br label %17
10: ; preds = %2
%11 = load i64, i64* %4, align 8
%12 = sub nsw i64 %11, 1
%13 = load i64, i64* %4, align 8
%14 = load i64, i64* %5, align 8
%15 = mul nsw i64 %13, %14
%16 = call i64 @factorialtail(i64 %12, i64 %15)
store i64 %16, i64* %3, align 8
br label %17
17: ; preds = %10, %8
%18 = load i64, i64* %3, align 8
ret i64 %18
}
; Function Attrs: noinline nounwind optnone ssp uwtable
define i64 @factorial(i64 %0) #0 {
%2 = alloca i64, align 8
store i64 %0, i64* %2, align 8
%3 = load i64, i64* %2, align 8
%4 = call i64 @factorialtail(i64 %3, i64 1)
ret i64 %4
}
attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-a7" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2,+zcm,+zcz" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 11.1.0"}
And the output from llc -O3 factorial.ll:
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 11, 0
.globl _factorialtail ; -- Begin function factorialtail
.p2align 2
_factorialtail: ; @factorialtail
.cfi_startproc
; %bb.0:
sub sp, sp, #48 ; =48
stp x29, x30, [sp, #32] ; 16-byte Folded Spill
add x29, sp, #32 ; =32
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
str x0, [sp, #16]
str x1, [sp, #8]
ldr x8, [sp, #16]
cmp x8, #1 ; =1
b.ne LBB0_2
; %bb.1:
ldr x8, [sp, #8]
stur x8, [x29, #-8]
b LBB0_3
LBB0_2:
ldr x8, [sp, #16]
sub x0, x8, #1 ; =1
ldr x8, [sp, #16]
ldr x9, [sp, #8]
mul x1, x8, x9
bl _factorialtail
stur x0, [x29, #-8]
LBB0_3:
ldur x0, [x29, #-8]
ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
add sp, sp, #48 ; =48
ret
.cfi_endproc
; -- End function
.globl _factorial ; -- Begin function factorial
.p2align 2
_factorial: ; @factorial
.cfi_startproc
; %bb.0:
sub sp, sp, #32 ; =32
stp x29, x30, [sp, #16] ; 16-byte Folded Spill
add x29, sp, #16 ; =16
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
str x0, [sp, #8]
ldr x0, [sp, #8]
mov x1, #1
bl _factorialtail
ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
add sp, sp, #32 ; =32
ret
.cfi_endproc
; -- End function
I was at least expecting the allocas would get converted to registers.
Solution 1:[1]
Have you checked these: https://www.incredibuild.com/blog/compiling-with-clang-optimization-flags and https://groups.google.com/g/llvm-dev/c/b1ckC_hini4?pli=1
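clang -O0 does not simply disable all the optimization passes that modify the IR. In fact, it causes most functions to get tagged with noinline to prevent inlining (and, as the attributes in the question's IR show, with optnone, which tells the optimizer to skip those functions).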
What you really need to do is
clang -O3 -c -emit-llvm source.c -o source.bc -v
Find the -cc1 command line in that output. Execute that command with -disable-llvm-passes added. Leave the -O3 and everything else in place.
You should be able to feed the output from that command to opt/llc and get consistent results.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 | Cody Gray |