'terminated by signal SIGKILL (Forced quit) DIFFERENT ERROR

I faced another problem here again with terminated by signal SIGSEGV (Address boundary error) But I found that there's an output here : Sequential elapsedTime: 97906.976232 ms It shows that only my parallel function got wrong. Can anyone tell me why? Here's how function work: This program compared a parallel and a normal large matrix multiplying. I use a strassen function hope to make it faster than only parallel. But now I faced problem which I can't understand right now. Can some help?

/*Author: Sayak Bera*/
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <time.h>
#define N 2000
void Madd(int* A, int* B, int* C, int n, int x){
    //   printf("AAAAddd\n\n\n");
    int i,j, m=x>0?n/2:n;
//parallel
    //#pragma omp parallel for private(i, j) 
    for (i=0;i<m;i++)
        for (j=0;j<m;j++)
            *(C+i*m+j) = *(A+i*n+j) + *(B+i*n+j);
}

void Msub(int* A, int* B, int* C, int n, int x){
     // printf("AAAAsss\n\n\n");
//parallel
    //#pragma omp parallel for private(i, j) 
    int i,j, m=x>0?n/2:n;
    for (i=0;i<m;i++)
        for (j=0;j<m;j++)
            *(C+i*m+j) = *(A+i*n+j) - *(B+i*n+j);
}

void strassen(int* A, int* B, int* C, int n){
    int i,j;
//NOT possible for 2000*2000
    if(n==2){

        int P=(*A+*(A+n+1))*(*B+*(B+n+1));  //P=(A[0][0]+A[1][1])*(B[0][0]+B[1][1])
        int Q=(*(A+n)+*(A+n+1))*(*B);   //Q=(A[1][0]+A[1][1])*B[0][0]
        int R=(*A)*(*(B+1)-*(B+n+1));   //R=A[0][0]*(B[0][1]-B[1][1])
        int S=(*(A+n+1))*(*(B+n)-*B);   //S=A[1][1]*(B[1][0]-B[0][0])
        int T=(*A+*(A+1))*(*(B+n+1));   //T=(A[0][0]+A[0][1])*B[1][1]
        int U=(*(A+n)-*A)*(*B+*(B+1));  //U=(A[1][0]-A[0][0])*(B[0][0]+B[0][1])
        int V=(*(A+1)-*(A+n+1))*(*(B+n)+*(B+n+1));  //V=(A[0][1]-A[1][1])*(B[1][0]+B[1][1])
// master parallel??
        *C=P+S-T+V;
        *(C+1)=R+T;
        *(C+n)=Q+S;
        *(C+n+1)=P+R-Q+U;
    }
/////////////////////////////////
    else{
        
        int m=n/2, i, j;
//      printf("else m is %d, n is %d\n\n", m, n);
        int *x=calloc(m*m, sizeof(int*)), *y=calloc(m*m, sizeof(int*)), *o=calloc( n*n, sizeof(int) );


        int *P=malloc(sizeof(int*)*N*N), *Q=malloc(sizeof(int*)*n*n), *R=malloc(sizeof(int*)*n*n), *S=malloc(sizeof(int*)*n*n), *T=malloc(sizeof(int*)*n*n),*U=malloc(sizeof(int*)*n*n), *V=malloc(sizeof(int*)*n*n);


//malloc not yet
        /*P=(A[0][0]+A[1][1])*(B[0][0]+B[1][1])*/
  //      printf("P1");
        Madd(A, A+m*(n+1), x, n, 1);
        Madd(B, B+m*(n+1), y, n, 1);
        strassen(x, y, P, m);
//      printf("P2\n\n");
        /*Q=(A[1][0]+A[1][1])*B[0][0]*/
//      printf("Q1");
        Madd(A+m*n, A+m*(n+1), x, n, 1);
        Madd(B, o, y, n, 1);
        strassen(x, y, Q, m);
//      printf("Q2\n\n");
        /*R=A[0][0]*(B[0][1]-B[1][1])*/
//      printf("R1");
        Madd(A, o, x, n, 1);
        Msub(B+m, B+m*(n+1), y, n, 1);
        strassen(x, y, R, m);
    //  printf("R2\n\n");
        /*S=A[1][1]*(B[1][0]-B[0][0])*/
//      printf("S1");
        Madd(A+m*(n+1), o, x, n, 1);
        Msub(B+m*n, B, y, n, 1);
        strassen(x, y, S, m);
//      printf("S2\n\n");
        /*T=(A[0][0]+A[0][1])*B[1][1]*/
//      printf("T1");
        Madd(A, A+m, x, n, 1);
        Madd(B+m*(n+1), o, y, n, 1);
        strassen(x, y, T, m);
//      printf("T2\n\n");
        /*U=(A[1][0]-A[0][0])*(B[0][0]+B[0][1])*/
        
 //       printf("U1");
        Msub(A+m*n, A, x, n, 1);
        Madd(B, B+m, y, n, 1);
        strassen(x, y, U, m);
 //       printf("U2\n\n");
        /*V=(A[0][1]-A[1][1])*(B[1][0]+B[1][1])*/
//      printf("V1");
        Msub(A+m, A+m*(n+1), x, n, 1);
        Madd(B+m*n, B+m*(n+1), y, n, 1);
        strassen(x, y, V, m);
//      printf("V2\n\n");

        /*Calculating the 4 parts for the result matrix*/
//        int W[m][m], X[m][m], Y[m][m], Z[m][m];
        int *W=calloc(m*m, sizeof(int*)), *X=calloc(m*m, sizeof(int*)), *Y=calloc(m*m, sizeof(int*)), *Z=calloc(m*m, sizeof(int*));
   //     printf("Final RUN\n\n");
        Msub(V,T,x,m,0);
        Madd(S,x,y,m,0);
        Madd(P,y,W,m,0); // W=P+S-T+V
        Madd(R,T,X,m,0); // X==R+T
        Madd(Q,S,Y,m,0); // Y=Q+S
        Msub(U,Q,x,m,0);
        Madd(R,x,y,m,0);
        Madd(P,y,Z,m,0); // Z=P+R-Q+U

        /*Conquering 4 parts in the result matrix*/
        for (i=0;i<m;i++)
            for (j=0;j<m;j++){
                *(C+i*n+j) = *(W+i*m+j); //C[0][0]=W
                *(C+i*n+j+m) = *(X+i*m+j); //C[0][1]=X
                *(C+(i+m)*n+j) = *(Y+i*m+j); //C[1][0]=Y
                *(C+(i+m)*n+j+m) = *(Z+i*m+j); //C[1][1]=Z
            }
    

    }

    /*free(P);
    free(Q);
    free(R);
    free(S);
    free(T);
    free(U);
    free(V);
    free(W);
    free(X);
    free(Y);
    free(Z);*/
}

void main()
{
    struct timespec t_start, t_end;
    double elapsedTime;
    int i,j,k,m=2048,o=0;
    int *A=malloc(sizeof(int*)*N*N), *B=malloc(sizeof(int*)*N*N),*a=calloc(2048*2048,sizeof(int*)), *b=calloc(2048*2048,sizeof(int*)), **CC=NULL, *C=calloc(2048*2048,sizeof(int*));
    
   

    for( i=0; i<N; i++ )
     for( j=0; j<N; j++ ) {
        *(A+N*i+j) = rand();
        *(B+N*i+j) = rand();
      }

    //printf("led m is %d",m );
   // a=(int**)malloc(sizeof(int*)*m);
  //  b=(int**)malloc(sizeof(int*)*m);
    CC=calloc(N, sizeof(int*));
    for (i=0;i<m;i++){
        //a[i]=(int *)malloc(m*sizeof(int));
        //b[i]=(int *)malloc(m*sizeof(int));
        CC[i]=calloc(N, sizeof(int));
    }        
    /*for(i=0;i<N;i++)
        for(j=0;j<N;j++)
            a[i][j]=A[i][j];
    for(i=0;i<N;i++)
        for(j=0;j<N;j++)
            b[i][j]=B[i][j];*/

    clock_gettime( CLOCK_REALTIME, &t_start);  
    for( i=0; i<N; i++ ){
        for( j=0; j<N; j++ ) {
            for( k=0; k<N; k++ ){
                CC[i][j] += *(A+N*i+k)*(*(B+N*k+j));
            }
        }
    }
    // stop time
   clock_gettime( CLOCK_REALTIME, &t_end);   

   // compute and print the elapsed time in millisec
   elapsedTime = (t_end.tv_sec - t_start.tv_sec) * 1000.0;
   elapsedTime += (t_end.tv_nsec - t_start.tv_nsec) / 1000000.0;
   printf("Sequential elapsedTime: %lf ms\n", elapsedTime);  
    // start time
  clock_gettime( CLOCK_REALTIME, &t_start);
 //  printf("StarA: mm is %d", m);
    strassen(A,B,C,m);
//   printf("StarB");;
   clock_gettime( CLOCK_REALTIME, &t_end);   

   // compute and print the elapsed time in millisec
   elapsedTime = (t_end.tv_sec - t_start.tv_sec) * 1000.0;
   elapsedTime += (t_end.tv_nsec - t_start.tv_nsec) / 1000000.0;
   printf("Parallel elapsedTime: %lf ms\n", elapsedTime);
 
   for(i=0; i<N; i++){
        for(j=0; j<N; j++){
            if(*(C+i*N+j) != CC[i][j])
                break;
        }
    }
   
    if(i==N && j==N)
        printf("Test pass!!!\n"); 
    else
        printf("Test failed!!!\n");
    for(i=0;i<N;i++){
        free(CC[i]);

    }
//1 - dimension
    free(A);
    free(B);
    free(C);
    
    
    free(CC);
   //return 0;
}

This function got problem when it was running

strassen(a,b,C,m);

in int main(){} function

c


Solution 1:[1]

Your program gets to a strassen arg n of 4 and seems to oscillate there with n of 2.

I've added a bunch of cross checking and array limit checking.

I think that the very first Madd call in strassen exceeds the array limits.

Also, by not releasing the temp memory, it consumes a lot.

Note that in your revised version, when doing malloc, you're using sizeof(int *) but you want sizeof(int).

I've fixed those.

I've had to heavily refactor the code to add debugging.

I've added a debug/control struct for each array.

I rewrote Madd and Msub arguments so that array bounds checking could be done.

I could have made a zillion transcription mistakes, so you may have to do considerable desk checking.

Here's the fault output. I ran this under gdb and did a stack traceback and it's the first Madd call in strassen:

Sequential elapsedTime: 1.136087689
Sequential elapsedTime: 0.608301088
mtxcmp: PASS Corig Cmtx
strassen: ENTER A=A B=B C=Cstrass n=512 depth=0
P1
_geofault: curmax -- off=262400 geo_wid=500 geo_hgt=500 geomax=250000 geo_sym=A

Here's all the source. I split things into multiple files. To preserve that, the following is an extractable archive.

Save the following to a temp file (e.g. /tmp/archive). Do cd /tmp. Then, do perl /tmp/archive -go and it will created a strassen subdirectory. cd to that directory (/tmp/strassen) and type make.

You may have to modify the Makefile to add things to CFLAGS:

make COPTS="-DDEBUG -DCHECK"

Caveat: It may take more effort to understand the debug stuff I've added than to debug it yourself -- YMMV

#!/usr/bin/perl
# FILE: ovcbin/ovcext.pm 755
# ovcbin/ovcext.pm -- ovrcat archive extractor
#
# this is a self extracting archive
# after the __DATA__ line, files are separated by:
#   % filename

ovcext_cmd(@ARGV);
exit(0);

sub ovcext_cmd
{
    my(@argv) = @_;
    local($xfdata);
    local($xfdiv,$divcur,%ovcdiv_lookup);

    $pgmtail = "ovcext";
    ovcinit();
    ovcopt(\@argv,qw(opt_go opt_f opt_t));

    $xfdata = "ovrcat::DATA";
    $xfdata = \*$xfdata;

    ovceval($xfdata);

    ovcfifo($zipflg_all);

    ovcline($xfdata);

    $code = ovcwait();

    ovcclose(\$xfdata);

    ovcdiv();

    ovczipd_spl()
        if ($zipflg_spl);
}

sub ovceval
{
    my($xfdata) = @_;
    my($buf,$err);

    {
        $buf = <$xfdata>;
        chomp($buf);

        last unless ($buf =~ s/^%\s+([\@\$;])/$1/);

        eval($buf);

        $err = $@;
        unless ($err) {
            undef($buf);
            last;
        }

        chomp($err);
        $err = " (" . $err . ")"
    }

    sysfault("ovceval: bad options line -- '%s'%s\n",$buf,$err)
        if (defined($buf));
}

sub ovcline
{
    my($xfdata) = @_;
    my($buf);
    my($tail);

    while ($buf = <$xfdata>) {
        chomp($buf);

        if ($buf =~ /^%\s+(.+)$/) {
            $tail = $1;
            ovcdiv($tail);
            next;
        }

        print($xfdiv $buf,"\n")
            if (ref($xfdiv));
    }

}

sub ovcdiv
{
    my($ofile) = @_;
    my($mode);
    my($xfcur);
    my($err,$prt);

    ($ofile,$mode) = split(" ",$ofile);

    $mode = oct($mode);
    $mode &= 0777;

    {
        unless (defined($ofile)) {
            while ((undef,$divcur) = each(%ovcdiv_lookup)) {
                close($divcur->{div_xfdst});
            }
            last;
        }

        $ofile = ovctail($ofile);

        $divcur = $ovcdiv_lookup{$ofile};
        if (ref($divcur)) {
            $xfdiv = $divcur->{div_xfdst};
            last;
        }
        undef($xfdiv);

        if (-e $ofile) {
            msg("ovcdiv: file '%s' already exists -- ",$ofile);

            unless ($opt_f) {
                msg("rerun with -f to force\n");
                last;
            }

            msg("overwriting!\n");
        }

        unless (defined($err)) {
            ovcmkdir($1)
                if ($ofile =~ m,^(.+)/[^/]+$,);
        }

        msg("$pgmtail: %s %s",ovcnogo("extracting"),$ofile);
        msg(" chmod %3.3o",$mode)
            if ($mode);
        msg("\n");

        last unless ($opt_go);
        last if (defined($err));

        $xfcur = ovcopen(">$ofile");

        $divcur = {};
        $ovcdiv_lookup{$ofile} = $divcur;

        if ($mode) {
            chmod($mode,$xfcur);
            $divcur->{div_mode} = $mode;
        }

        $divcur->{div_xfdst} = $xfcur;
        $xfdiv = $xfcur;
    }
}

sub ovcinit
{

    {
        last if (defined($ztmp));
        $ztmp = "/tmp/ovrcat_zip";

        $PWD = $ENV{PWD};

        $quo_2 = '"';

        $ztmp_inp = $ztmp . "_0";
        $ztmp_out = $ztmp . "_1";
        $ztmp_perl = $ztmp . "_perl";

        ovcunlink();

        $ovcdbg = ($ENV{"ZPXHOWOVC"} != 0);
    }
}

sub ovcunlink
{

    _ovcunlink($ztmp_inp,1);
    _ovcunlink($ztmp_out,1);
    _ovcunlink($ztmp_perl,($pgmtail ne "ovcext") || $opt_go);
}

sub _ovcunlink
{
    my($file,$rmflg) = @_;
    my($found,$tag);

    {
        last unless (defined($file));

        $found = (-e $file);

        $tag //= "notfound"
            unless ($found);
        $tag //= $rmflg ? "cleaning" : "keeping";

        msg("ovcunlink: %s %s ...\n",$tag,$file)
            if (($found or $ovcdbg) and (! $ovcunlink_quiet));

        unlink($file)
            if ($rmflg and $found);
    }
}

sub ovcopt
{
    my($argv) = @_;
    my($opt);

    while (1) {
        $opt = $argv->[0];
        last unless ($opt =~ s/^-/opt_/);

        shift(@$argv);
        $$opt = 1;
    }
}

sub ovctail
{
    my($file,$sub) = @_;
    my(@file);

    $file =~ s,^/,,;
    @file = split("/",$file);

    $sub //= 2;

    @file = splice(@file,-$sub)
        if (@file >= $sub);

    $file = join("/",@file);

    $file;
}

sub ovcmkdir
{
    my($odir) = @_;
    my(@lhs,@rhs);

    @rhs = split("/",$odir);

    foreach $rhs (@rhs) {
        push(@lhs,$rhs);

        $odir = join("/",@lhs);

        if ($opt_go) {
            next if (-d $odir);
        }
        else {
            next if ($ovcmkdir{$odir});
            $ovcmkdir{$odir} = 1;
        }

        msg("$pgmtail: %s %s ...\n",ovcnogo("mkdir"),$odir);

        next unless ($opt_go);

        mkdir($odir) or
            sysfault("$pgmtail: unable to mkdir '%s' -- $!\n",$odir);
    }
}

sub ovcopen
{
    my($file,$who) = @_;
    my($xf);

    $who //= $pgmtail;
    $who //= "ovcopen";

    open($xf,$file) or
        sysfault("$who: unable to open '%s' -- $!\n",$file);

    $xf;
}

sub ovcclose
{
    my($xfp) = @_;
    my($ref);
    my($xf);

    {
        $ref = ref($xfp);
        last unless ($ref);

        if ($ref eq "GLOB") {
            close($xfp);
            last;
        }

        if ($ref eq "REF") {
            $xf = $$xfp;
            if (ref($xf) eq "GLOB") {
                close($xf);
                undef($$xfp);
            }
        }
    }

    undef($xf);

    $xf;
}

sub ovcnogo
{
    my($str) = @_;

    unless ($opt_go) {
        $str = "NOGO-$str";
        $nogo_msg = 1;
    }

    $str;
}

sub ovcdbg
{

    if ($ovcdbg) {
        printf(STDERR "[%d] ",$$);
        printf(STDERR @_);
    }
}

sub msg
{

    printf(STDERR @_);
}

sub msgv
{

    $_ = join(" ",@_);
    print(STDERR $_,"\n");
}

sub sysfault
{

    printf(STDERR @_);
    exit(1);
}

sub ovcfifo
{
}

sub ovcwait
{
    my($code);

    if ($pid_fifo) {
        waitpid($pid_fifo,0);
        $code = $? >> 8;
    }

    $code;
}

sub prtstr
{
    my($val,$fmtpos,$fmtneg) = @_;

    {
        unless (defined($val)) {
            $val = "undef";
            last;
        }

        if (ref($val)) {
            $val = sprintf("(%s)",$val);
            last;
        }

        $fmtpos //= "'%s'";

        if (defined($fmtneg) && ($val <= 0)) {
            $val = sprintf($fmtneg,$val);
            last;
        }

        $val = sprintf($fmtpos,$val);
    }

    $val;
}

sub prtnum
{
    my($val) = @_;

    $val = prtstr($val,"%d");

    $val;
}

END {
    msg("$pgmtail: rerun with -go to actually do it\n")
        if ($nogo_msg);
    ovcunlink();
}

1;
package ovrcat;
__DATA__
% ;
% strassen/strassen.h
// Strassen's Algorithm for Matrix Multiplication using Divide & Conquer

#ifndef _strassen_strassen_h_
#define _strassen_strassen_h_

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <time.h>

#ifdef _STRASSEN_GLO_
#define EXTRN_STRASSEN      /**/
#else
#define EXTRN_STRASSEN      extern
#endif

#ifndef CHECK
#define CHECK       1
#endif
#ifndef SMALL
#define SMALL       1
#endif

#define inline_always       static inline __attribute__((__always_inline__))

#define sysfault(_fmt...) \
    do { \
        printf(_fmt); \
        _sysfault(); \
    } while (0)

#if DEBUG || _USE_ZPRT_
#define dbgprt(_fmt...) \
    do { \
        printf(_fmt); \
        fflush(stdout); \
    } while (0)
#else
#define dbgprt(_fmt...) \
    do { \
    } while (0)
#endif

typedef int *dat_p;
typedef unsigned int u32;

#if SMALL
#define N       (2000 / 4)
#define BIG     (2048 / 4)
#else
#define N       2000
#define BIG     2048
#endif

typedef struct mtxgeo {
    u32 geo_magic;
    struct mtxgeo *geo_next;
    const char *geo_sym;
    int geo_hgt;
    int geo_wid;
} mtxgeo_t;

#define GEOMAGIC    0xDEADBEEF

EXTRN_STRASSEN mtxgeo_t *geofree;
EXTRN_STRASSEN mtxgeo_t *geobusy;

EXTRN_STRASSEN double t_start;
EXTRN_STRASSEN double t_end;

#include <strassen/strassen.proto>

inline_always mtxgeo_t *
mtxgeo(int *mtx)
{
    void *vp;
    mtxgeo_t *geo;

    vp = mtx;
    vp -= sizeof(mtxgeo_t);

    geo = vp;

#if CHECK
    if (geo->geo_magic != GEOMAGIC)
        sysfault("mtxgeo: bad magic\n");
#endif

    return geo;
}

inline_always int *
mtxloc(int *mtx,int y,int x)
{
    mtxgeo_t *geo = mtxgeo(mtx);

    mtx += (y * geo->geo_wid);
    mtx += x;

    return mtx;
}

inline_always const char *
mtxsym(int *mtx)
{
    mtxgeo_t *geo = mtxgeo(mtx);
    return geo->geo_sym;
}

inline_always int *
mtxoffchk(int *mtx,int off,int m,int n)
{

#if CHECK
    mtx = _mtxoffchk(mtx,off,m,n);
#else
    mtx += off;
#endif

    return mtx;
}

#endif
% strassen/arith.c
// strassen/arith.c -- basic add/sub

#include <strassen/strassen.h>

void
Madd_orig(int *A, int *B, int *C, int n, int x)
{
    int i,
     j,
     m = x > 0 ? n / 2 : n;

//parallel
    for (i = 0; i < m; i++)
        for (j = 0; j < m; j++)
            *(C + i * m + j) = *(A + i * n + j) + *(B + i * n + j);
}

void
MaddX(int *A, int *B, int *C, int n, int m)
{
    int i, j;

//parallel
    for (i = 0; i < m; i++) {
        int iN = i * n;

        int *Ap = &A[iN];
        int *Bp = &B[iN];
        int *Cp = &C[i * m];

        for (j = 0; j < m; j++) {
#if 0
            *(C + i * m + j) = *(A + i * n + j) + *(B + i * n + j);
#else
            Cp[j] = Ap[j] + Bp[j];
#endif
        }
    }
}

void
Msub_orig(int *A, int *B, int *C, int n, int x)
{
    int i, j;
    int m = x > 0 ? n / 2 : n;

//parallel
    for (i = 0; i < m; i++)
        for (j = 0; j < m; j++)
            *(C + i * m + j) = *(A + i * n + j) - *(B + i * n + j);
}

void
MsubX(int *A, int *B, int *C, int n, int m)
{
    int i, j;

//parallel
    for (i = 0; i < m; i++) {
        int iN = i * n;

        int *Ap = &A[iN];
        int *Bp = &B[iN];
        int *Cp = &C[i * m];

        for (j = 0; j < m; j++) {
#if 0
            *(C + i * m + j) = *(A + i * n + j) + *(B + i * n + j);
#else
            Cp[j] = Ap[j] - Bp[j];
#endif
        }
    }
}

void
Madd0(int *A, int *B, int *C, int n)
{
    int m = n;

    A = mtxoffchk(A,0,n,m);
    B = mtxoffchk(B,0,n,m);
    C = mtxoffchk(C,0,m,m);

    MaddX(A,B,C,n,m);
}

void
Madd1(int *A,int Aoff, int *B,int Boff, int *C, int n)
{
    int m = n / 2;

    A = mtxoffchk(A,Aoff,n,m);
    B = mtxoffchk(B,Boff,n,m);
    C = mtxoffchk(C,0,m,m);

    MaddX(A,B,C,n,m);
}

void
Msub0(int *A, int *B, int *C, int n)
{
    int m = n;

    A = mtxoffchk(A,0,n,m);
    B = mtxoffchk(B,0,n,m);
    C = mtxoffchk(C,0,m,m);

    MsubX(A,B,C,n,m);
}

void
Msub1(int *A,int Aoff, int *B,int Boff, int *C, int n)
{
    int m = n / 2;

    A = mtxoffchk(A,Aoff,n,m);
    B = mtxoffchk(B,Boff,n,m);
    C = mtxoffchk(C,0,m,m);

    MsubX(A,B,C,n,m);
}
% strassen/core.c
// strassen/core.c -- strassen function

#include <strassen/strassen.h>

void
strassen(int *A, int *B, int *C, int n)
{
    static int depth = 0;

    dbgprt("strassen: ENTER A=%s B=%s C=%s n=%d depth=%d\n",
        mtxsym(A),mtxsym(B),mtxsym(C),n,depth);

    ++depth;
    if (depth > 40)
        sysfault("strassen: depth overflow n=%d depth=%d\n",n,depth);

    t_end = tscgetf();
    t_end -= t_start;
    if (t_end > 2.0)
        sysfault("strassen: timeout\n");

//NOT possible for 2000*2000
    if (n == 2) {
        // P=(A[0][0]+A[1][1])*(B[0][0]+B[1][1])
        int P = (*A + *(A + n + 1)) * (*B + *(B + n + 1));

        // Q=(A[1][0]+A[1][1])*B[0][0]
        int Q = (*(A + n) + *(A + n + 1)) * (*B);

        // R=A[0][0]*(B[0][1]-B[1][1])
        int R = (*A) * (*(B + 1) - *(B + n + 1));

        // S=A[1][1]*(B[1][0]-B[0][0])
        int S = (*(A + n + 1)) * (*(B + n) - *B);

        // T=(A[0][0]+A[0][1])*B[1][1]
        int T = (*A + *(A + 1)) * (*(B + n + 1));

        // U=(A[1][0]-A[0][0])*(B[0][0]+B[0][1])
        int U = (*(A + n) - *A) * (*B + *(B + 1));

        // V=(A[0][1]-A[1][1])*(B[1][0]+B[1][1])
        int V = (*(A + 1) - *(A + n + 1)) * (*(B + n) + *(B + n + 1));

        // master parallel??
        *C = P + S - T + V;
        *(C + 1) = R + T;
        *(C + n) = Q + S;
        *(C + n + 1) = P + R - Q + U;
    }

/////////////////////////////////
    else {
        int m = n / 2;
        int i, j;

//      printf("else m is %d, n is %d\n\n", m, n);
#if 0
        int *x = calloc(m * m, sizeof(int *)),
            *y = calloc(m * m, sizeof(int *)),
            *o = calloc(n * n, sizeof(int));
#else
        int *x = mtxacq(m,m,"x");
        int *y = mtxacq(m,m,"y");
        int *o = mtxacq(n,n,"o");
#endif

#if 0
        int *P = malloc(sizeof(int *) * N * N),
            *Q = malloc(sizeof(int *) * n * n),
            *R = malloc(sizeof(int *) * n * n),
            *S = malloc(sizeof(int *) * n * n),
            *T = malloc(sizeof(int *) * n * n),
            *U = malloc(sizeof(int *) * n * n),
            *V = malloc(sizeof(int *) * n * n);
#else
        int *P = mtxacq(N,N,"P");
        int *Q = mtxacq(N,N,"Q");
        int *R = mtxacq(N,N,"R");
        int *S = mtxacq(N,N,"S");
        int *T = mtxacq(N,N,"T");
        int *U = mtxacq(N,N,"U");
        int *V = mtxacq(N,N,"V");
#endif

//malloc not yet
        // P=(A[0][0]+A[1][1])*(B[0][0]+B[1][1])

        dbgprt("P1\n");
        Madd1(A, 0, A,m * (n + 1), x, n);
        Madd1(B, 0, B,m * (n + 1), y, n);
        strassen(x, y, P, m);
        dbgprt("P2\n");
        // Q=(A[1][0]+A[1][1])*B[0][0]

        dbgprt("Q1\n");
        Madd1(A,m * n, A,m * (n + 1), x, n);
        Madd1(B,0, o,0, y, n);
        strassen(x, y, Q, m);
        dbgprt("Q2\n");
        // R=A[0][0]*(B[0][1]-B[1][1])

        dbgprt("R1\n");
        Madd1(A,0, o,0, x, n);
        Msub1(B,m, B,m * (n + 1), y, n);
        strassen(x, y, R, m);
        dbgprt("R2\n");

        // S=A[1][1]*(B[1][0]-B[0][0])

        dbgprt("S1\n");
        Madd1(A,m * (n + 1), o, 0, x, n);
        Msub1(B,m * n, B,0, y, n);
        strassen(x, y, S, m);
        dbgprt("S2\n");
        // T=(A[0][0]+A[0][1])*B[1][1]

        dbgprt("T1\n");
        Madd1(A,0, A,m, x, n);
        Madd1(B,m * (n + 1), o,0, y, n);
        strassen(x, y, T, m);
        dbgprt("T2\n");

        // U=(A[1][0]-A[0][0])*(B[0][0]+B[0][1])
        dbgprt("U1\n");

        Msub1(A,m * n, A,0, x, n);
        Madd1(B,0, B,m, y, n);
        strassen(x, y, U, m);
        dbgprt("U2\n");

        // V=(A[0][1]-A[1][1])*(B[1][0]+B[1][1])

        dbgprt("V1\n");
        Msub1(A,m, A,m * (n + 1), x, n);
        Madd1(B,m * n, B,m * (n + 1), y, n);
        strassen(x, y, V, m);
        dbgprt("V2\n");

        // Calculating the 4 parts for the result matrix

//        int W[m][m], X[m][m], Y[m][m], Z[m][m];
#if 0
        int *W = calloc(m * m, sizeof(int *)),
            *X = calloc(m * m, sizeof(int *)),
            *Y = calloc(m * m, sizeof(int *)),
            *Z = calloc(m * m, sizeof(int *));
#else
        int *W = mtxacq(m,m,"W");
        int *X = mtxacq(m,m,"X");
        int *Y = mtxacq(m,m,"Y");
        int *Z = mtxacq(m,m,"Z");
#endif

        dbgprt("Final RUN\n");

        Msub0(V, T, x, m);
        Madd0(S, x, y, m);

        // W=P+S-T+V
        Madd0(P, y, W, m);

        // X==R+T
        Madd0(R, T, X, m);

        // Y=Q+S
        Madd0(Q, S, Y, m);
        Msub0(U, Q, x, m);
        Madd0(R, x, y, m);

        // Z=P+R-Q+U
        Madd0(P, y, Z, m);

        // Conquering 4 parts in the result matrix

        for (i = 0; i < m; i++) {
            for (j = 0; j < m; j++) {
                // C[0][0]=W
                *(C + i * n + j) = *(W + i * m + j);

                // C[0][1]=X
                *(C + i * n + j + m) = *(X + i * m + j);

                // C[1][0]=Y
                *(C + (i + m) * n + j) = *(Y + i * m + j);

                // C[1][1]=Z
                *(C + (i + m) * n + j + m) = *(Z + i * m + j);
            }
        }

        mtxrls(x);
        mtxrls(y);
        mtxrls(o);

        mtxrls(P);
        mtxrls(Q);
        mtxrls(R);
        mtxrls(S);
        mtxrls(T);
        mtxrls(U);
        mtxrls(V);

        mtxrls(W);
        mtxrls(X);
        mtxrls(Y);
        mtxrls(Z);
    }

    --depth;

    dbgprt("strassen: EXIT n=%d depth=%d\n",n,depth);
}
% strassen/mtx.c
// strassen/mtx.c -- matrix allocation

#define _STRASSEN_GLO_
#include <strassen/strassen.h>

int *
mtxacq(int hgt,int wid,const char *sym)
{
    size_t len = 0;
    mtxgeo_t *geo;
    int *mtx = NULL;

    len += sizeof(mtxgeo_t);
    len += sizeof(int) * hgt * wid;

    while (1) {
        void *vp;

        // find existing matrix
        mtxgeo_t *prev = NULL;
        for (geo = geofree;  geo != NULL;  geo = geo->geo_next) {
            if ((geo->geo_hgt == hgt) && (geo->geo_wid == wid))
                break;
            prev = geo;
        }

        // use existing matrix
        if (geo != NULL) {
            if (prev != NULL)
                prev->geo_next = geo->geo_next;
            else
                geofree = geo->geo_next;

            vp = geo;
            vp += sizeof(mtxgeo_t);
            mtx = vp;
            break;
        }

        vp = malloc(len);
        geo = vp;

        geo->geo_hgt = hgt;
        geo->geo_wid = wid;

        // point to matrix data
        vp += sizeof(mtxgeo_t);
        mtx = vp;

        // link it -- we'll find it immediately above
        geo->geo_next = geofree;
        geofree = geo;
    }

    geo->geo_sym = sym;
    geo->geo_magic = GEOMAGIC;

    geo->geo_next = geobusy;
    geobusy = geo;

    return mtx;
}

void
mtxrls(int *mtx)
{
    mtxgeo_t *geo = mtxgeo(mtx);
    mtxgeo_t *prev = NULL;
    mtxgeo_t *next = NULL;
    mtxgeo_t *cur;

    for (cur = geobusy;  cur != NULL;  cur = next) {
        next = cur->geo_next;
        if (cur == geo)
            break;
        prev = cur;
    }
    if (prev != NULL)
        prev->geo_next = next;
    else
        geobusy = next;

    geo->geo_next = geofree;
    geofree = geo;
}

mtxgeo_t *
mtxfree(mtxgeo_t *list)
{
    mtxgeo_t *next;
    mtxgeo_t *cur;

    for (cur = list;  cur != NULL;  cur = next) {
        next = cur->geo_next;
        free(cur);
    }

    list = NULL;

    return list;
}

void
mtxzero(int *mtx)
{
    mtxgeo_t *geo = mtxgeo(mtx);

    memset(mtx,0,sizeof(int) * geo->geo_hgt * geo->geo_wid);
}

void
mtxcmp(int *lhsbase,int *rhsbase)
{
    int pass = 1;

    for (int i = 0; i < N; i++) {
        int *lhs = mtxloc(lhsbase,i,0);
        int *rhs = mtxloc(rhsbase,i,0);

        for (int j = 0; j < N; j++) {
            if (lhs[j] != rhs[j]) {
                printf("mtxcmp: MISMATCH [%d,%d] lhs=%d rhs=%d\n",
                    i,j,lhs[j],rhs[j]);
                pass = 0;
                break;
            }
        }

        if (! pass)
            break;
    }

    printf("mtxcmp: %s %s %s\n",
        pass ? "PASS" : "FAIL",
        mtxgeo(lhsbase)->geo_sym,mtxgeo(rhsbase)->geo_sym);

    if (! pass)
        sysfault("mtxcmp: aborting ...\n");
}

int *
_mtxoffchk(int *mtx,int off,int m,int n)
{
    mtxgeo_t *geo = mtxgeo(mtx);
    int geomax = geo->geo_wid * geo->geo_hgt;

    // check starting index
    if (off >= geomax)
        _geofault(geo,off,"off");

    // check ending index
    int curmax = off + (m * n);
    if (curmax > geomax)
        _geofault(geo,curmax,"curmax");

    // point to start
    mtx += off;

    return mtx;
}

void
_geofault(mtxgeo_t *geo,int off,const char *why)
{
    int geomax = geo->geo_wid * geo->geo_hgt;

    sysfault("_geofault: %s -- off=%d geo_wid=%d geo_hgt=%d geomax=%d geo_sym=%s\n",
        why,off,geo->geo_wid,geo->geo_hgt,geomax,geo->geo_sym);
}
% strassen/strassen.c
// strassen/strassen.c -- strassen test

#include <strassen/strassen.h>

int
main(void)
{
    int i, j, k;
    int m = BIG;

#if 0
    int *A = malloc(sizeof(int *) * N * N),
        *B = malloc(sizeof(int *) * N * N),
        *a = calloc(2048 * 2048, sizeof(int *)),
        *b = calloc(2048 * 2048, sizeof(int *)),
        **CC = NULL,
        *C = calloc(2048 * 2048, sizeof(int *));
#else
    dat_p A = mtxacq(N,N,"A");
    dat_p B = mtxacq(N,N,"B");
    //dat_p a = mtxacq(BIG,BIG,"a");
    //dat_p b = mtxacq(BIG,BIG,"b");
    dat_p C = mtxacq(BIG,BIG,"Cstrass");
    dat_p Corig = mtxacq(BIG,BIG,"Corig");
    dat_p Cmtx = mtxacq(BIG,BIG,"Cmtx");
#endif
    int *Cij;

#if 0
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            *(A + N * i + j) = rand();
            *(B + N * i + j) = rand();
        }
    }
#else
    for (i = 0; i < N; i++) {
        int *pA = mtxloc(A,i,0);
        int *pB = mtxloc(B,i,0);
        for (j = 0; j < N; j++) {
            pA[j] = rand();
            pB[j] = rand();
        }
    }
#endif

    // printf("led m is %d",m );
    // a=(int**)malloc(sizeof(int*)*m);
    // b=(int**)malloc(sizeof(int*)*m);

    // for(i=0;i<N;i++) for(j=0;j<N;j++) a[i][j]=A[i][j]; for(i=0;i<N;i++)
    // for(j=0;j<N;j++) b[i][j]=B[i][j];

    mtxzero(Corig);
    t_start = tscgetf();
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++)
                Corig[(i * N) + j] += *(A + N * i + k) * (*(B + N * k + j));
        }
    }
    t_end = tscgetf();
    printf("Sequential elapsedTime: %.9f\n", t_end - t_start);

    t_start = tscgetf();
    Cij = mtxloc(Cmtx,0,0);
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++, ++Cij) {
            int *Ap = mtxloc(A,i,0);
            int *Bp = mtxloc(B,0,j);

            int sum = 0;
            for (k = 0; k < N; k++, Ap++, Bp += N)
                sum += Ap[0] * Bp[0];

            *Cij = sum;
        }
    }
    t_end = tscgetf();
    printf("Sequential elapsedTime: %.9f\n", t_end - t_start);
    mtxcmp(Corig,Cmtx);

    t_start = tscgetf();
    strassen(A, B, C, m);
    t_end = tscgetf();
    printf("Parallel elapsedTime: %.9f\n", t_end - t_start);
    mtxcmp(Cmtx,C);

    mtxfree(geobusy);
    mtxfree(geofree);

    return 0;
}
% strassen/tsc.c
// strassen/mtx.c -- matrix allocation

#include <strassen/strassen.h>

double
tscgetf(void)
{
    struct timespec ts;
    double sec;

    clock_gettime(CLOCK_MONOTONIC,&ts);
    sec = ts.tv_nsec;
    sec /= 1e9;
    sec += ts.tv_sec;

    return sec;
}

void
_sysfault(void)
{

    __asm__ __volatile__("" ::: "memory");
    exit(1);
}
% strassen/strassen.proto
// /home/cae/OBJ/ovrgen/strassen/strassen.proto -- prototypes

// FILE: /home/cae/preserve/ovrbnc/strassen/arith.c
// strassen/arith.c -- basic add/sub

    void
    Madd_orig(int *A, int *B, int *C, int n, int x);

    void
    MaddX(int *A, int *B, int *C, int n, int m);

    void
    Msub_orig(int *A, int *B, int *C, int n, int x);

    void
    MsubX(int *A, int *B, int *C, int n, int m);

    void
    Madd0(int *A, int *B, int *C, int n);

    void
    Madd1(int *A,int Aoff, int *B,int Boff, int *C, int n);

    void
    Msub0(int *A, int *B, int *C, int n);

    void
    Msub1(int *A,int Aoff, int *B,int Boff, int *C, int n);

// FILE: /home/cae/preserve/ovrbnc/strassen/core.c
// strassen/core.c -- strassen function

    void
    strassen(int *A, int *B, int *C, int n);

// FILE: /home/cae/preserve/ovrbnc/strassen/mtx.c
// strassen/mtx.c -- matrix allocation

    int *
    mtxacq(int hgt,int wid,const char *sym);

    void
    mtxrls(int *mtx);

    mtxgeo_t *
    mtxfree(mtxgeo_t *list);

    void
    mtxzero(int *mtx);

    void
    mtxcmp(int *lhsbase,int *rhsbase);

    int *
    _mtxoffchk(int *mtx,int off,int m,int n);

    void
    _geofault(mtxgeo_t *geo,int off,const char *why);

// FILE: /home/cae/preserve/ovrbnc/strassen/strassen.c
// strassen/strassen.c -- strassen test

    int
    main(void);

// FILE: /home/cae/preserve/ovrbnc/strassen/tsc.c
// strassen/mtx.c -- matrix allocation

    double
    tscgetf(void);

    void
    _sysfault(void);
% strassen/Makefile
# /home/cae/preserve/ovrbnc/strassen -- makefile
PGMTGT += strassen
CURSRC += arith.c
CURSRC += core.c
CURSRC += mtx.c
CURSRC += tsc.c
ifndef COPTS
    COPTS += -O2
endif
CFLAGS += $(COPTS)
CFLAGS += -g
CFLAGS += -Wall
CFLAGS += -Werror
CFLAGS += -I..
all: $(PGMTGT)
strassen: strassen.c $(CURSRC) $(LIBSRC)
    cc -o strassen $(CFLAGS) strassen.c $(CURSRC) $(LIBSRC)
clean:
    rm -f $(PGMTGT)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1