How to get llc to optimize a .ll file?

I am learning how IR and llc work. I got started by looking at the output of clang. I thought llc could be used to optimize .ll files further, but I can't get it to make any changes. I used clang to generate a .ll file of a tail-call version of factorial, using -O0. Then I took that .ll and handed it to llc with -O3, and then to opt -O3. The output was identical. Not even the allocas were turned into registers, and --mem2reg made no difference. Is there something in the .ll file created by clang that prevents llc from optimizing further, or am I just totally off base?

The original .c:

/*
 * Accumulator-style factorial helper.
 *
 * n    - remaining multiplier; the recursion stops when it equals 1.
 * fact - product accumulated so far.
 *
 * Returns fact when n == 1; otherwise recurses with (n - 1, n * fact).
 * The recursive call is the last thing the function does (tail position).
 * NOTE(review): n == 1 is the only stopping condition checked here, so
 * callers are presumably expected to pass n >= 1 - confirm.
 */
long factorialtail(long n, long fact) {
    if (n == 1)
        return fact;
    else
        /* Tail call: nothing remains to be done after it returns. */
        return factorialtail(n - 1L, n * fact);
}

/*
 * Entry point: calls factorialtail with x and an initial accumulator of 1,
 * and returns its result.
 */
long factorial(long  x) {
    return factorialtail(x, 1);
}

The output from clang -S -emit-llvm -O0 factorial.c:

; ModuleID = 'factorial.c'
source_filename = "factorial.c"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-macosx11.0.0"

; Function Attrs: noinline nounwind optnone ssp uwtable
; Annotation (not clang output): IR for the C function factorialtail.
; %0 and %1 are the two arguments. Each value is kept in its own stack slot
; (alloca) and reloaded before every use; nothing is held in an SSA register
; across statements. Attribute group #0 includes noinline and optnone.
define i64 @factorialtail(i64 %0, i64 %1) #0 {
  ; %3 = slot for the return value, %4 = slot for %0, %5 = slot for %1.
  %3 = alloca i64, align 8
  %4 = alloca i64, align 8
  %5 = alloca i64, align 8
  store i64 %0, i64* %4, align 8
  store i64 %1, i64* %5, align 8
  ; Branch on whether the first argument equals 1.
  %6 = load i64, i64* %4, align 8
  %7 = icmp eq i64 %6, 1
  br i1 %7, label %8, label %10

8:                                                ; preds = %2
  ; Equal to 1: the second argument becomes the return value.
  %9 = load i64, i64* %5, align 8
  store i64 %9, i64* %3, align 8
  br label %17

10:                                               ; preds = %2
  ; Otherwise: call factorialtail(%0 - 1, %0 * %1) and store its result.
  %11 = load i64, i64* %4, align 8
  %12 = sub nsw i64 %11, 1
  %13 = load i64, i64* %4, align 8
  %14 = load i64, i64* %5, align 8
  %15 = mul nsw i64 %13, %14
  ; The call carries no "tail" marker in this IR.
  %16 = call i64 @factorialtail(i64 %12, i64 %15)
  store i64 %16, i64* %3, align 8
  br label %17

17:                                               ; preds = %10, %8
  ; Common exit: reload the return-value slot and return it.
  %18 = load i64, i64* %3, align 8
  ret i64 %18
}

; Function Attrs: noinline nounwind optnone ssp uwtable
; Annotation (not clang output): IR for the C function factorial.
; The argument %0 is stored to a stack slot, reloaded, and passed to
; @factorialtail together with the constant 1; the call result is returned.
define i64 @factorial(i64 %0) #0 {
  %2 = alloca i64, align 8
  store i64 %0, i64* %2, align 8
  %3 = load i64, i64* %2, align 8
  %4 = call i64 @factorialtail(i64 %3, i64 1)
  ret i64 %4
}

attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-a7" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2,+zcm,+zcz" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 11.1.0"}

And the output from llc -O3 factorial.ll:

    .section    __TEXT,__text,regular,pure_instructions
    .build_version macos, 11, 0
    .globl  _factorialtail                  ; -- Begin function factorialtail
    .p2align    2
_factorialtail:                         ; @factorialtail
; Annotation (not llc output): x0 and x1 hold the two arguments on entry.
; Both are spilled to the stack at once and reloaded for every use, and the
; result goes through a stack slot at [x29, #-8] before being returned in x0.
    .cfi_startproc
; %bb.0:
; Prologue: reserve 48 bytes, save x29/x30 at [sp, #32], set x29 = sp + 32.
    sub sp, sp, #48                     ; =48
    stp x29, x30, [sp, #32]             ; 16-byte Folded Spill
    add x29, sp, #32                    ; =32
    .cfi_def_cfa w29, 16
    .cfi_offset w30, -8
    .cfi_offset w29, -16
; Spill the arguments: x0 -> [sp, #16], x1 -> [sp, #8].
    str x0, [sp, #16]
    str x1, [sp, #8]
; Reload the first argument and compare it with 1.
    ldr x8, [sp, #16]
    cmp x8, #1                          ; =1
    b.ne    LBB0_2
; %bb.1:
; Equal to 1: copy the second argument into the return slot.
    ldr x8, [sp, #8]
    stur    x8, [x29, #-8]
    b   LBB0_3
LBB0_2:
; Not equal to 1: x0 = first - 1, x1 = first * second, then recurse.
    ldr x8, [sp, #16]
    sub x0, x8, #1                      ; =1
    ldr x8, [sp, #16]
    ldr x9, [sp, #8]
    mul x1, x8, x9
; Recursion is done with bl (branch with link), i.e. a regular call.
    bl  _factorialtail
    stur    x0, [x29, #-8]
LBB0_3:
; Common exit: load the return slot into x0, restore x29/x30, free the frame.
    ldur    x0, [x29, #-8]
    ldp x29, x30, [sp, #32]             ; 16-byte Folded Reload
    add sp, sp, #48                     ; =48
    ret
    .cfi_endproc
                                        ; -- End function
    .globl  _factorial                      ; -- Begin function factorial
    .p2align    2
_factorial:                             ; @factorial
; Annotation (not llc output): x0 holds the argument on entry. It is stored
; to the stack and reloaded, x1 is set to 1, and _factorialtail is called.
; x0 is not written after the call, so the callee's result is returned.
    .cfi_startproc
; %bb.0:
; Prologue: reserve 32 bytes, save x29/x30 at [sp, #16], set x29 = sp + 16.
    sub sp, sp, #32                     ; =32
    stp x29, x30, [sp, #16]             ; 16-byte Folded Spill
    add x29, sp, #16                    ; =16
    .cfi_def_cfa w29, 16
    .cfi_offset w30, -8
    .cfi_offset w29, -16
; Store the argument to [sp, #8] and reload it straight back into x0.
    str x0, [sp, #8]
    ldr x0, [sp, #8]
    mov x1, #1
    bl  _factorialtail
; Epilogue: restore x29/x30 and free the frame.
    ldp x29, x30, [sp, #16]             ; 16-byte Folded Reload
    add sp, sp, #32                     ; =32
    ret
    .cfi_endproc
                                        ; -- End function

I was at least expecting the allocas would get converted to registers.



Solution 1:[1]

Have you checked these: https://www.incredibuild.com/blog/compiling-with-clang-optimization-flags and https://groups.google.com/g/llvm-dev/c/b1ckC_hini4?pli=1

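clang -O0 does more than just skip the optimization passes that modify the IR. In fact, it causes most functions to get tagged with noinline to prevent inlining. In the IR shown in the question, both functions also carry the optnone attribute, which makes most optimization passes skip them; that is why opt and llc at -O3 leave the code unchanged.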

What you really need to do is

clang -O3 -c -emit-llvm source.c -o source.bc -v

Find the -cc1 command line in that output. Execute that command with --disable-llvm-passes added. Leave the -O3 and everything else unchanged.

You should be able to feed the output from that command to opt/llc and get consistent results.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Cody Gray