'terminated by signal SIGKILL (Forced quit) DIFFERENT ERROR
I faced another problem here again with terminated by signal SIGSEGV (Address boundary error) But I found that there's an output here : Sequential elapsedTime: 97906.976232 ms It shows that only my parallel function got wrong. Can anyone tell me why? Here's how function work: This program compared a parallel and a normal large matrix multiplying. I use a strassen function hope to make it faster than only parallel. But now I faced problem which I can't understand right now. Can some help?
/*Author: Sayak Bera*/
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <time.h>
#define N 2000
void Madd(int* A, int* B, int* C, int n, int x){
// printf("AAAAddd\n\n\n");
int i,j, m=x>0?n/2:n;
//parallel
//#pragma omp parallel for private(i, j)
for (i=0;i<m;i++)
for (j=0;j<m;j++)
*(C+i*m+j) = *(A+i*n+j) + *(B+i*n+j);
}
void Msub(int* A, int* B, int* C, int n, int x){
// printf("AAAAsss\n\n\n");
//parallel
//#pragma omp parallel for private(i, j)
int i,j, m=x>0?n/2:n;
for (i=0;i<m;i++)
for (j=0;j<m;j++)
*(C+i*m+j) = *(A+i*n+j) - *(B+i*n+j);
}
void strassen(int* A, int* B, int* C, int n){
int i,j;
//NOT possible for 2000*2000
if(n==2){
int P=(*A+*(A+n+1))*(*B+*(B+n+1)); //P=(A[0][0]+A[1][1])*(B[0][0]+B[1][1])
int Q=(*(A+n)+*(A+n+1))*(*B); //Q=(A[1][0]+A[1][1])*B[0][0]
int R=(*A)*(*(B+1)-*(B+n+1)); //R=A[0][0]*(B[0][1]-B[1][1])
int S=(*(A+n+1))*(*(B+n)-*B); //S=A[1][1]*(B[1][0]-B[0][0])
int T=(*A+*(A+1))*(*(B+n+1)); //T=(A[0][0]+A[0][1])*B[1][1]
int U=(*(A+n)-*A)*(*B+*(B+1)); //U=(A[1][0]-A[0][0])*(B[0][0]+B[0][1])
int V=(*(A+1)-*(A+n+1))*(*(B+n)+*(B+n+1)); //V=(A[0][1]-A[1][1])*(B[1][0]+B[1][1])
// master parallel??
*C=P+S-T+V;
*(C+1)=R+T;
*(C+n)=Q+S;
*(C+n+1)=P+R-Q+U;
}
/////////////////////////////////
else{
int m=n/2, i, j;
// printf("else m is %d, n is %d\n\n", m, n);
int *x=calloc(m*m, sizeof(int*)), *y=calloc(m*m, sizeof(int*)), *o=calloc( n*n, sizeof(int) );
int *P=malloc(sizeof(int*)*N*N), *Q=malloc(sizeof(int*)*n*n), *R=malloc(sizeof(int*)*n*n), *S=malloc(sizeof(int*)*n*n), *T=malloc(sizeof(int*)*n*n),*U=malloc(sizeof(int*)*n*n), *V=malloc(sizeof(int*)*n*n);
//malloc not yet
/*P=(A[0][0]+A[1][1])*(B[0][0]+B[1][1])*/
// printf("P1");
Madd(A, A+m*(n+1), x, n, 1);
Madd(B, B+m*(n+1), y, n, 1);
strassen(x, y, P, m);
// printf("P2\n\n");
/*Q=(A[1][0]+A[1][1])*B[0][0]*/
// printf("Q1");
Madd(A+m*n, A+m*(n+1), x, n, 1);
Madd(B, o, y, n, 1);
strassen(x, y, Q, m);
// printf("Q2\n\n");
/*R=A[0][0]*(B[0][1]-B[1][1])*/
// printf("R1");
Madd(A, o, x, n, 1);
Msub(B+m, B+m*(n+1), y, n, 1);
strassen(x, y, R, m);
// printf("R2\n\n");
/*S=A[1][1]*(B[1][0]-B[0][0])*/
// printf("S1");
Madd(A+m*(n+1), o, x, n, 1);
Msub(B+m*n, B, y, n, 1);
strassen(x, y, S, m);
// printf("S2\n\n");
/*T=(A[0][0]+A[0][1])*B[1][1]*/
// printf("T1");
Madd(A, A+m, x, n, 1);
Madd(B+m*(n+1), o, y, n, 1);
strassen(x, y, T, m);
// printf("T2\n\n");
/*U=(A[1][0]-A[0][0])*(B[0][0]+B[0][1])*/
// printf("U1");
Msub(A+m*n, A, x, n, 1);
Madd(B, B+m, y, n, 1);
strassen(x, y, U, m);
// printf("U2\n\n");
/*V=(A[0][1]-A[1][1])*(B[1][0]+B[1][1])*/
// printf("V1");
Msub(A+m, A+m*(n+1), x, n, 1);
Madd(B+m*n, B+m*(n+1), y, n, 1);
strassen(x, y, V, m);
// printf("V2\n\n");
/*Calculating the 4 parts for the result matrix*/
// int W[m][m], X[m][m], Y[m][m], Z[m][m];
int *W=calloc(m*m, sizeof(int*)), *X=calloc(m*m, sizeof(int*)), *Y=calloc(m*m, sizeof(int*)), *Z=calloc(m*m, sizeof(int*));
// printf("Final RUN\n\n");
Msub(V,T,x,m,0);
Madd(S,x,y,m,0);
Madd(P,y,W,m,0); // W=P+S-T+V
Madd(R,T,X,m,0); // X==R+T
Madd(Q,S,Y,m,0); // Y=Q+S
Msub(U,Q,x,m,0);
Madd(R,x,y,m,0);
Madd(P,y,Z,m,0); // Z=P+R-Q+U
/*Conquering 4 parts in the result matrix*/
for (i=0;i<m;i++)
for (j=0;j<m;j++){
*(C+i*n+j) = *(W+i*m+j); //C[0][0]=W
*(C+i*n+j+m) = *(X+i*m+j); //C[0][1]=X
*(C+(i+m)*n+j) = *(Y+i*m+j); //C[1][0]=Y
*(C+(i+m)*n+j+m) = *(Z+i*m+j); //C[1][1]=Z
}
}
/*free(P);
free(Q);
free(R);
free(S);
free(T);
free(U);
free(V);
free(W);
free(X);
free(Y);
free(Z);*/
}
void main()
{
struct timespec t_start, t_end;
double elapsedTime;
int i,j,k,m=2048,o=0;
int *A=malloc(sizeof(int*)*N*N), *B=malloc(sizeof(int*)*N*N),*a=calloc(2048*2048,sizeof(int*)), *b=calloc(2048*2048,sizeof(int*)), **CC=NULL, *C=calloc(2048*2048,sizeof(int*));
for( i=0; i<N; i++ )
for( j=0; j<N; j++ ) {
*(A+N*i+j) = rand();
*(B+N*i+j) = rand();
}
//printf("led m is %d",m );
// a=(int**)malloc(sizeof(int*)*m);
// b=(int**)malloc(sizeof(int*)*m);
CC=calloc(N, sizeof(int*));
for (i=0;i<m;i++){
//a[i]=(int *)malloc(m*sizeof(int));
//b[i]=(int *)malloc(m*sizeof(int));
CC[i]=calloc(N, sizeof(int));
}
/*for(i=0;i<N;i++)
for(j=0;j<N;j++)
a[i][j]=A[i][j];
for(i=0;i<N;i++)
for(j=0;j<N;j++)
b[i][j]=B[i][j];*/
clock_gettime( CLOCK_REALTIME, &t_start);
for( i=0; i<N; i++ ){
for( j=0; j<N; j++ ) {
for( k=0; k<N; k++ ){
CC[i][j] += *(A+N*i+k)*(*(B+N*k+j));
}
}
}
// stop time
clock_gettime( CLOCK_REALTIME, &t_end);
// compute and print the elapsed time in millisec
elapsedTime = (t_end.tv_sec - t_start.tv_sec) * 1000.0;
elapsedTime += (t_end.tv_nsec - t_start.tv_nsec) / 1000000.0;
printf("Sequential elapsedTime: %lf ms\n", elapsedTime);
// start time
clock_gettime( CLOCK_REALTIME, &t_start);
// printf("StarA: mm is %d", m);
strassen(A,B,C,m);
// printf("StarB");;
clock_gettime( CLOCK_REALTIME, &t_end);
// compute and print the elapsed time in millisec
elapsedTime = (t_end.tv_sec - t_start.tv_sec) * 1000.0;
elapsedTime += (t_end.tv_nsec - t_start.tv_nsec) / 1000000.0;
printf("Parallel elapsedTime: %lf ms\n", elapsedTime);
for(i=0; i<N; i++){
for(j=0; j<N; j++){
if(*(C+i*N+j) != CC[i][j])
break;
}
}
if(i==N && j==N)
printf("Test pass!!!\n");
else
printf("Test failed!!!\n");
for(i=0;i<N;i++){
free(CC[i]);
}
//1 - dimension
free(A);
free(B);
free(C);
free(CC);
//return 0;
}
This function got problem when it was running
strassen(a,b,C,m);
in int main(){} function
Solution 1:[1]
Your program gets to a strassen
arg n
of 4 and seems to oscillate there with n
of 2.
I've added a bunch of cross checking and array limit checking.
I think that the very first Madd
call in strassen
exceeds the array limits.
Also, by not releasing the temp memory, it consumes a lot.
Note that in your revised version, when doing malloc
, you're using sizeof(int *)
but you want sizeof(int)
.
I've fixed those.
I've had to heavily refactor the code to add debugging.
I've added a debug/control struct
for each array.
I rewrote Madd
and Msub
arguments so that array bounds checking could be done.
I could have made a zillion transcription mistakes, so you may have to do considerable desk checking.
Here's the fault output. I ran this under gdb
and did a stack traceback and it's the first Madd
call in strassen
:
Sequential elapsedTime: 1.136087689
Sequential elapsedTime: 0.608301088
mtxcmp: PASS Corig Cmtx
strassen: ENTER A=A B=B C=Cstrass n=512 depth=0
P1
_geofault: curmax -- off=262400 geo_wid=500 geo_hgt=500 geomax=250000 geo_sym=A
Here's all the source. I split things into multiple files. To preserve that, the following is an extractable archive.
Save the following to a temp file (e.g. /tmp/archive
). Do cd /tmp
. Then, do perl /tmp/archive -go
and it will created a strassen
subdirectory. cd to that directory (/tmp/strassen
) and type make
.
You may have to modify the Makefile
to add things to CFLAGS
:
make COPTS="-DDEBUG -DCHECK"
Caveat: It may take more effort to understand the debug stuff I've added than to debug it yourself -- YMMV
#!/usr/bin/perl
# FILE: ovcbin/ovcext.pm 755
# ovcbin/ovcext.pm -- ovrcat archive extractor
#
# this is a self extracting archive
# after the __DATA__ line, files are separated by:
# % filename
ovcext_cmd(@ARGV);
exit(0);
sub ovcext_cmd
{
my(@argv) = @_;
local($xfdata);
local($xfdiv,$divcur,%ovcdiv_lookup);
$pgmtail = "ovcext";
ovcinit();
ovcopt(\@argv,qw(opt_go opt_f opt_t));
$xfdata = "ovrcat::DATA";
$xfdata = \*$xfdata;
ovceval($xfdata);
ovcfifo($zipflg_all);
ovcline($xfdata);
$code = ovcwait();
ovcclose(\$xfdata);
ovcdiv();
ovczipd_spl()
if ($zipflg_spl);
}
sub ovceval
{
my($xfdata) = @_;
my($buf,$err);
{
$buf = <$xfdata>;
chomp($buf);
last unless ($buf =~ s/^%\s+([\@\$;])/$1/);
eval($buf);
$err = $@;
unless ($err) {
undef($buf);
last;
}
chomp($err);
$err = " (" . $err . ")"
}
sysfault("ovceval: bad options line -- '%s'%s\n",$buf,$err)
if (defined($buf));
}
sub ovcline
{
my($xfdata) = @_;
my($buf);
my($tail);
while ($buf = <$xfdata>) {
chomp($buf);
if ($buf =~ /^%\s+(.+)$/) {
$tail = $1;
ovcdiv($tail);
next;
}
print($xfdiv $buf,"\n")
if (ref($xfdiv));
}
}
sub ovcdiv
{
my($ofile) = @_;
my($mode);
my($xfcur);
my($err,$prt);
($ofile,$mode) = split(" ",$ofile);
$mode = oct($mode);
$mode &= 0777;
{
unless (defined($ofile)) {
while ((undef,$divcur) = each(%ovcdiv_lookup)) {
close($divcur->{div_xfdst});
}
last;
}
$ofile = ovctail($ofile);
$divcur = $ovcdiv_lookup{$ofile};
if (ref($divcur)) {
$xfdiv = $divcur->{div_xfdst};
last;
}
undef($xfdiv);
if (-e $ofile) {
msg("ovcdiv: file '%s' already exists -- ",$ofile);
unless ($opt_f) {
msg("rerun with -f to force\n");
last;
}
msg("overwriting!\n");
}
unless (defined($err)) {
ovcmkdir($1)
if ($ofile =~ m,^(.+)/[^/]+$,);
}
msg("$pgmtail: %s %s",ovcnogo("extracting"),$ofile);
msg(" chmod %3.3o",$mode)
if ($mode);
msg("\n");
last unless ($opt_go);
last if (defined($err));
$xfcur = ovcopen(">$ofile");
$divcur = {};
$ovcdiv_lookup{$ofile} = $divcur;
if ($mode) {
chmod($mode,$xfcur);
$divcur->{div_mode} = $mode;
}
$divcur->{div_xfdst} = $xfcur;
$xfdiv = $xfcur;
}
}
sub ovcinit
{
{
last if (defined($ztmp));
$ztmp = "/tmp/ovrcat_zip";
$PWD = $ENV{PWD};
$quo_2 = '"';
$ztmp_inp = $ztmp . "_0";
$ztmp_out = $ztmp . "_1";
$ztmp_perl = $ztmp . "_perl";
ovcunlink();
$ovcdbg = ($ENV{"ZPXHOWOVC"} != 0);
}
}
sub ovcunlink
{
_ovcunlink($ztmp_inp,1);
_ovcunlink($ztmp_out,1);
_ovcunlink($ztmp_perl,($pgmtail ne "ovcext") || $opt_go);
}
sub _ovcunlink
{
my($file,$rmflg) = @_;
my($found,$tag);
{
last unless (defined($file));
$found = (-e $file);
$tag //= "notfound"
unless ($found);
$tag //= $rmflg ? "cleaning" : "keeping";
msg("ovcunlink: %s %s ...\n",$tag,$file)
if (($found or $ovcdbg) and (! $ovcunlink_quiet));
unlink($file)
if ($rmflg and $found);
}
}
sub ovcopt
{
my($argv) = @_;
my($opt);
while (1) {
$opt = $argv->[0];
last unless ($opt =~ s/^-/opt_/);
shift(@$argv);
$$opt = 1;
}
}
sub ovctail
{
my($file,$sub) = @_;
my(@file);
$file =~ s,^/,,;
@file = split("/",$file);
$sub //= 2;
@file = splice(@file,-$sub)
if (@file >= $sub);
$file = join("/",@file);
$file;
}
sub ovcmkdir
{
my($odir) = @_;
my(@lhs,@rhs);
@rhs = split("/",$odir);
foreach $rhs (@rhs) {
push(@lhs,$rhs);
$odir = join("/",@lhs);
if ($opt_go) {
next if (-d $odir);
}
else {
next if ($ovcmkdir{$odir});
$ovcmkdir{$odir} = 1;
}
msg("$pgmtail: %s %s ...\n",ovcnogo("mkdir"),$odir);
next unless ($opt_go);
mkdir($odir) or
sysfault("$pgmtail: unable to mkdir '%s' -- $!\n",$odir);
}
}
sub ovcopen
{
my($file,$who) = @_;
my($xf);
$who //= $pgmtail;
$who //= "ovcopen";
open($xf,$file) or
sysfault("$who: unable to open '%s' -- $!\n",$file);
$xf;
}
sub ovcclose
{
my($xfp) = @_;
my($ref);
my($xf);
{
$ref = ref($xfp);
last unless ($ref);
if ($ref eq "GLOB") {
close($xfp);
last;
}
if ($ref eq "REF") {
$xf = $$xfp;
if (ref($xf) eq "GLOB") {
close($xf);
undef($$xfp);
}
}
}
undef($xf);
$xf;
}
sub ovcnogo
{
my($str) = @_;
unless ($opt_go) {
$str = "NOGO-$str";
$nogo_msg = 1;
}
$str;
}
sub ovcdbg
{
if ($ovcdbg) {
printf(STDERR "[%d] ",$$);
printf(STDERR @_);
}
}
sub msg
{
printf(STDERR @_);
}
sub msgv
{
$_ = join(" ",@_);
print(STDERR $_,"\n");
}
sub sysfault
{
printf(STDERR @_);
exit(1);
}
sub ovcfifo
{
}
sub ovcwait
{
my($code);
if ($pid_fifo) {
waitpid($pid_fifo,0);
$code = $? >> 8;
}
$code;
}
sub prtstr
{
my($val,$fmtpos,$fmtneg) = @_;
{
unless (defined($val)) {
$val = "undef";
last;
}
if (ref($val)) {
$val = sprintf("(%s)",$val);
last;
}
$fmtpos //= "'%s'";
if (defined($fmtneg) && ($val <= 0)) {
$val = sprintf($fmtneg,$val);
last;
}
$val = sprintf($fmtpos,$val);
}
$val;
}
sub prtnum
{
my($val) = @_;
$val = prtstr($val,"%d");
$val;
}
END {
msg("$pgmtail: rerun with -go to actually do it\n")
if ($nogo_msg);
ovcunlink();
}
1;
package ovrcat;
__DATA__
% ;
% strassen/strassen.h
// Strassen's Algorithm for Matrix Multiplication using Divide & Conquer
#ifndef _strassen_strassen_h_
#define _strassen_strassen_h_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <time.h>
#ifdef _STRASSEN_GLO_
#define EXTRN_STRASSEN /**/
#else
#define EXTRN_STRASSEN extern
#endif
#ifndef CHECK
#define CHECK 1
#endif
#ifndef SMALL
#define SMALL 1
#endif
#define inline_always static inline __attribute__((__always_inline__))
#define sysfault(_fmt...) \
do { \
printf(_fmt); \
_sysfault(); \
} while (0)
#if DEBUG || _USE_ZPRT_
#define dbgprt(_fmt...) \
do { \
printf(_fmt); \
fflush(stdout); \
} while (0)
#else
#define dbgprt(_fmt...) \
do { \
} while (0)
#endif
typedef int *dat_p;
typedef unsigned int u32;
#if SMALL
#define N (2000 / 4)
#define BIG (2048 / 4)
#else
#define N 2000
#define BIG 2048
#endif
typedef struct mtxgeo {
u32 geo_magic;
struct mtxgeo *geo_next;
const char *geo_sym;
int geo_hgt;
int geo_wid;
} mtxgeo_t;
#define GEOMAGIC 0xDEADBEEF
EXTRN_STRASSEN mtxgeo_t *geofree;
EXTRN_STRASSEN mtxgeo_t *geobusy;
EXTRN_STRASSEN double t_start;
EXTRN_STRASSEN double t_end;
#include <strassen/strassen.proto>
inline_always mtxgeo_t *
mtxgeo(int *mtx)
{
void *vp;
mtxgeo_t *geo;
vp = mtx;
vp -= sizeof(mtxgeo_t);
geo = vp;
#if CHECK
if (geo->geo_magic != GEOMAGIC)
sysfault("mtxgeo: bad magic\n");
#endif
return geo;
}
inline_always int *
mtxloc(int *mtx,int y,int x)
{
mtxgeo_t *geo = mtxgeo(mtx);
mtx += (y * geo->geo_wid);
mtx += x;
return mtx;
}
inline_always const char *
mtxsym(int *mtx)
{
mtxgeo_t *geo = mtxgeo(mtx);
return geo->geo_sym;
}
inline_always int *
mtxoffchk(int *mtx,int off,int m,int n)
{
#if CHECK
mtx = _mtxoffchk(mtx,off,m,n);
#else
mtx += off;
#endif
return mtx;
}
#endif
% strassen/arith.c
// strassen/arith.c -- basic add/sub
#include <strassen/strassen.h>
void
Madd_orig(int *A, int *B, int *C, int n, int x)
{
int i,
j,
m = x > 0 ? n / 2 : n;
//parallel
for (i = 0; i < m; i++)
for (j = 0; j < m; j++)
*(C + i * m + j) = *(A + i * n + j) + *(B + i * n + j);
}
void
MaddX(int *A, int *B, int *C, int n, int m)
{
int i, j;
//parallel
for (i = 0; i < m; i++) {
int iN = i * n;
int *Ap = &A[iN];
int *Bp = &B[iN];
int *Cp = &C[i * m];
for (j = 0; j < m; j++) {
#if 0
*(C + i * m + j) = *(A + i * n + j) + *(B + i * n + j);
#else
Cp[j] = Ap[j] + Bp[j];
#endif
}
}
}
void
Msub_orig(int *A, int *B, int *C, int n, int x)
{
int i, j;
int m = x > 0 ? n / 2 : n;
//parallel
for (i = 0; i < m; i++)
for (j = 0; j < m; j++)
*(C + i * m + j) = *(A + i * n + j) - *(B + i * n + j);
}
void
MsubX(int *A, int *B, int *C, int n, int m)
{
int i, j;
//parallel
for (i = 0; i < m; i++) {
int iN = i * n;
int *Ap = &A[iN];
int *Bp = &B[iN];
int *Cp = &C[i * m];
for (j = 0; j < m; j++) {
#if 0
*(C + i * m + j) = *(A + i * n + j) + *(B + i * n + j);
#else
Cp[j] = Ap[j] - Bp[j];
#endif
}
}
}
void
Madd0(int *A, int *B, int *C, int n)
{
int m = n;
A = mtxoffchk(A,0,n,m);
B = mtxoffchk(B,0,n,m);
C = mtxoffchk(C,0,m,m);
MaddX(A,B,C,n,m);
}
void
Madd1(int *A,int Aoff, int *B,int Boff, int *C, int n)
{
int m = n / 2;
A = mtxoffchk(A,Aoff,n,m);
B = mtxoffchk(B,Boff,n,m);
C = mtxoffchk(C,0,m,m);
MaddX(A,B,C,n,m);
}
void
Msub0(int *A, int *B, int *C, int n)
{
int m = n;
A = mtxoffchk(A,0,n,m);
B = mtxoffchk(B,0,n,m);
C = mtxoffchk(C,0,m,m);
MsubX(A,B,C,n,m);
}
void
Msub1(int *A,int Aoff, int *B,int Boff, int *C, int n)
{
int m = n / 2;
A = mtxoffchk(A,Aoff,n,m);
B = mtxoffchk(B,Boff,n,m);
C = mtxoffchk(C,0,m,m);
MsubX(A,B,C,n,m);
}
% strassen/core.c
// strassen/core.c -- strassen function
#include <strassen/strassen.h>
void
strassen(int *A, int *B, int *C, int n)
{
static int depth = 0;
dbgprt("strassen: ENTER A=%s B=%s C=%s n=%d depth=%d\n",
mtxsym(A),mtxsym(B),mtxsym(C),n,depth);
++depth;
if (depth > 40)
sysfault("strassen: depth overflow n=%d depth=%d\n",n,depth);
t_end = tscgetf();
t_end -= t_start;
if (t_end > 2.0)
sysfault("strassen: timeout\n");
//NOT possible for 2000*2000
if (n == 2) {
// P=(A[0][0]+A[1][1])*(B[0][0]+B[1][1])
int P = (*A + *(A + n + 1)) * (*B + *(B + n + 1));
// Q=(A[1][0]+A[1][1])*B[0][0]
int Q = (*(A + n) + *(A + n + 1)) * (*B);
// R=A[0][0]*(B[0][1]-B[1][1])
int R = (*A) * (*(B + 1) - *(B + n + 1));
// S=A[1][1]*(B[1][0]-B[0][0])
int S = (*(A + n + 1)) * (*(B + n) - *B);
// T=(A[0][0]+A[0][1])*B[1][1]
int T = (*A + *(A + 1)) * (*(B + n + 1));
// U=(A[1][0]-A[0][0])*(B[0][0]+B[0][1])
int U = (*(A + n) - *A) * (*B + *(B + 1));
// V=(A[0][1]-A[1][1])*(B[1][0]+B[1][1])
int V = (*(A + 1) - *(A + n + 1)) * (*(B + n) + *(B + n + 1));
// master parallel??
*C = P + S - T + V;
*(C + 1) = R + T;
*(C + n) = Q + S;
*(C + n + 1) = P + R - Q + U;
}
/////////////////////////////////
else {
int m = n / 2;
int i, j;
// printf("else m is %d, n is %d\n\n", m, n);
#if 0
int *x = calloc(m * m, sizeof(int *)),
*y = calloc(m * m, sizeof(int *)),
*o = calloc(n * n, sizeof(int));
#else
int *x = mtxacq(m,m,"x");
int *y = mtxacq(m,m,"y");
int *o = mtxacq(n,n,"o");
#endif
#if 0
int *P = malloc(sizeof(int *) * N * N),
*Q = malloc(sizeof(int *) * n * n),
*R = malloc(sizeof(int *) * n * n),
*S = malloc(sizeof(int *) * n * n),
*T = malloc(sizeof(int *) * n * n),
*U = malloc(sizeof(int *) * n * n),
*V = malloc(sizeof(int *) * n * n);
#else
int *P = mtxacq(N,N,"P");
int *Q = mtxacq(N,N,"Q");
int *R = mtxacq(N,N,"R");
int *S = mtxacq(N,N,"S");
int *T = mtxacq(N,N,"T");
int *U = mtxacq(N,N,"U");
int *V = mtxacq(N,N,"V");
#endif
//malloc not yet
// P=(A[0][0]+A[1][1])*(B[0][0]+B[1][1])
dbgprt("P1\n");
Madd1(A, 0, A,m * (n + 1), x, n);
Madd1(B, 0, B,m * (n + 1), y, n);
strassen(x, y, P, m);
dbgprt("P2\n");
// Q=(A[1][0]+A[1][1])*B[0][0]
dbgprt("Q1\n");
Madd1(A,m * n, A,m * (n + 1), x, n);
Madd1(B,0, o,0, y, n);
strassen(x, y, Q, m);
dbgprt("Q2\n");
// R=A[0][0]*(B[0][1]-B[1][1])
dbgprt("R1\n");
Madd1(A,0, o,0, x, n);
Msub1(B,m, B,m * (n + 1), y, n);
strassen(x, y, R, m);
dbgprt("R2\n");
// S=A[1][1]*(B[1][0]-B[0][0])
dbgprt("S1\n");
Madd1(A,m * (n + 1), o, 0, x, n);
Msub1(B,m * n, B,0, y, n);
strassen(x, y, S, m);
dbgprt("S2\n");
// T=(A[0][0]+A[0][1])*B[1][1]
dbgprt("T1\n");
Madd1(A,0, A,m, x, n);
Madd1(B,m * (n + 1), o,0, y, n);
strassen(x, y, T, m);
dbgprt("T2\n");
// U=(A[1][0]-A[0][0])*(B[0][0]+B[0][1])
dbgprt("U1\n");
Msub1(A,m * n, A,0, x, n);
Madd1(B,0, B,m, y, n);
strassen(x, y, U, m);
dbgprt("U2\n");
// V=(A[0][1]-A[1][1])*(B[1][0]+B[1][1])
dbgprt("V1\n");
Msub1(A,m, A,m * (n + 1), x, n);
Madd1(B,m * n, B,m * (n + 1), y, n);
strassen(x, y, V, m);
dbgprt("V2\n");
// Calculating the 4 parts for the result matrix
// int W[m][m], X[m][m], Y[m][m], Z[m][m];
#if 0
int *W = calloc(m * m, sizeof(int *)),
*X = calloc(m * m, sizeof(int *)),
*Y = calloc(m * m, sizeof(int *)),
*Z = calloc(m * m, sizeof(int *));
#else
int *W = mtxacq(m,m,"W");
int *X = mtxacq(m,m,"X");
int *Y = mtxacq(m,m,"Y");
int *Z = mtxacq(m,m,"Z");
#endif
dbgprt("Final RUN\n");
Msub0(V, T, x, m);
Madd0(S, x, y, m);
// W=P+S-T+V
Madd0(P, y, W, m);
// X==R+T
Madd0(R, T, X, m);
// Y=Q+S
Madd0(Q, S, Y, m);
Msub0(U, Q, x, m);
Madd0(R, x, y, m);
// Z=P+R-Q+U
Madd0(P, y, Z, m);
// Conquering 4 parts in the result matrix
for (i = 0; i < m; i++) {
for (j = 0; j < m; j++) {
// C[0][0]=W
*(C + i * n + j) = *(W + i * m + j);
// C[0][1]=X
*(C + i * n + j + m) = *(X + i * m + j);
// C[1][0]=Y
*(C + (i + m) * n + j) = *(Y + i * m + j);
// C[1][1]=Z
*(C + (i + m) * n + j + m) = *(Z + i * m + j);
}
}
mtxrls(x);
mtxrls(y);
mtxrls(o);
mtxrls(P);
mtxrls(Q);
mtxrls(R);
mtxrls(S);
mtxrls(T);
mtxrls(U);
mtxrls(V);
mtxrls(W);
mtxrls(X);
mtxrls(Y);
mtxrls(Z);
}
--depth;
dbgprt("strassen: EXIT n=%d depth=%d\n",n,depth);
}
% strassen/mtx.c
// strassen/mtx.c -- matrix allocation
#define _STRASSEN_GLO_
#include <strassen/strassen.h>
int *
mtxacq(int hgt,int wid,const char *sym)
{
size_t len = 0;
mtxgeo_t *geo;
int *mtx = NULL;
len += sizeof(mtxgeo_t);
len += sizeof(int) * hgt * wid;
while (1) {
void *vp;
// find existing matrix
mtxgeo_t *prev = NULL;
for (geo = geofree; geo != NULL; geo = geo->geo_next) {
if ((geo->geo_hgt == hgt) && (geo->geo_wid == wid))
break;
prev = geo;
}
// use existing matrix
if (geo != NULL) {
if (prev != NULL)
prev->geo_next = geo->geo_next;
else
geofree = geo->geo_next;
vp = geo;
vp += sizeof(mtxgeo_t);
mtx = vp;
break;
}
vp = malloc(len);
geo = vp;
geo->geo_hgt = hgt;
geo->geo_wid = wid;
// point to matrix data
vp += sizeof(mtxgeo_t);
mtx = vp;
// link it -- we'll find it immediately above
geo->geo_next = geofree;
geofree = geo;
}
geo->geo_sym = sym;
geo->geo_magic = GEOMAGIC;
geo->geo_next = geobusy;
geobusy = geo;
return mtx;
}
void
mtxrls(int *mtx)
{
mtxgeo_t *geo = mtxgeo(mtx);
mtxgeo_t *prev = NULL;
mtxgeo_t *next = NULL;
mtxgeo_t *cur;
for (cur = geobusy; cur != NULL; cur = next) {
next = cur->geo_next;
if (cur == geo)
break;
prev = cur;
}
if (prev != NULL)
prev->geo_next = next;
else
geobusy = next;
geo->geo_next = geofree;
geofree = geo;
}
mtxgeo_t *
mtxfree(mtxgeo_t *list)
{
mtxgeo_t *next;
mtxgeo_t *cur;
for (cur = list; cur != NULL; cur = next) {
next = cur->geo_next;
free(cur);
}
list = NULL;
return list;
}
void
mtxzero(int *mtx)
{
mtxgeo_t *geo = mtxgeo(mtx);
memset(mtx,0,sizeof(int) * geo->geo_hgt * geo->geo_wid);
}
void
mtxcmp(int *lhsbase,int *rhsbase)
{
int pass = 1;
for (int i = 0; i < N; i++) {
int *lhs = mtxloc(lhsbase,i,0);
int *rhs = mtxloc(rhsbase,i,0);
for (int j = 0; j < N; j++) {
if (lhs[j] != rhs[j]) {
printf("mtxcmp: MISMATCH [%d,%d] lhs=%d rhs=%d\n",
i,j,lhs[j],rhs[j]);
pass = 0;
break;
}
}
if (! pass)
break;
}
printf("mtxcmp: %s %s %s\n",
pass ? "PASS" : "FAIL",
mtxgeo(lhsbase)->geo_sym,mtxgeo(rhsbase)->geo_sym);
if (! pass)
sysfault("mtxcmp: aborting ...\n");
}
int *
_mtxoffchk(int *mtx,int off,int m,int n)
{
mtxgeo_t *geo = mtxgeo(mtx);
int geomax = geo->geo_wid * geo->geo_hgt;
// check starting index
if (off >= geomax)
_geofault(geo,off,"off");
// check ending index
int curmax = off + (m * n);
if (curmax > geomax)
_geofault(geo,curmax,"curmax");
// point to start
mtx += off;
return mtx;
}
void
_geofault(mtxgeo_t *geo,int off,const char *why)
{
int geomax = geo->geo_wid * geo->geo_hgt;
sysfault("_geofault: %s -- off=%d geo_wid=%d geo_hgt=%d geomax=%d geo_sym=%s\n",
why,off,geo->geo_wid,geo->geo_hgt,geomax,geo->geo_sym);
}
% strassen/strassen.c
// strassen/strassen.c -- strassen test
#include <strassen/strassen.h>
int
main(void)
{
int i, j, k;
int m = BIG;
#if 0
int *A = malloc(sizeof(int *) * N * N),
*B = malloc(sizeof(int *) * N * N),
*a = calloc(2048 * 2048, sizeof(int *)),
*b = calloc(2048 * 2048, sizeof(int *)),
**CC = NULL,
*C = calloc(2048 * 2048, sizeof(int *));
#else
dat_p A = mtxacq(N,N,"A");
dat_p B = mtxacq(N,N,"B");
//dat_p a = mtxacq(BIG,BIG,"a");
//dat_p b = mtxacq(BIG,BIG,"b");
dat_p C = mtxacq(BIG,BIG,"Cstrass");
dat_p Corig = mtxacq(BIG,BIG,"Corig");
dat_p Cmtx = mtxacq(BIG,BIG,"Cmtx");
#endif
int *Cij;
#if 0
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
*(A + N * i + j) = rand();
*(B + N * i + j) = rand();
}
}
#else
for (i = 0; i < N; i++) {
int *pA = mtxloc(A,i,0);
int *pB = mtxloc(B,i,0);
for (j = 0; j < N; j++) {
pA[j] = rand();
pB[j] = rand();
}
}
#endif
// printf("led m is %d",m );
// a=(int**)malloc(sizeof(int*)*m);
// b=(int**)malloc(sizeof(int*)*m);
// for(i=0;i<N;i++) for(j=0;j<N;j++) a[i][j]=A[i][j]; for(i=0;i<N;i++)
// for(j=0;j<N;j++) b[i][j]=B[i][j];
mtxzero(Corig);
t_start = tscgetf();
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
for (k = 0; k < N; k++)
Corig[(i * N) + j] += *(A + N * i + k) * (*(B + N * k + j));
}
}
t_end = tscgetf();
printf("Sequential elapsedTime: %.9f\n", t_end - t_start);
t_start = tscgetf();
Cij = mtxloc(Cmtx,0,0);
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++, ++Cij) {
int *Ap = mtxloc(A,i,0);
int *Bp = mtxloc(B,0,j);
int sum = 0;
for (k = 0; k < N; k++, Ap++, Bp += N)
sum += Ap[0] * Bp[0];
*Cij = sum;
}
}
t_end = tscgetf();
printf("Sequential elapsedTime: %.9f\n", t_end - t_start);
mtxcmp(Corig,Cmtx);
t_start = tscgetf();
strassen(A, B, C, m);
t_end = tscgetf();
printf("Parallel elapsedTime: %.9f\n", t_end - t_start);
mtxcmp(Cmtx,C);
mtxfree(geobusy);
mtxfree(geofree);
return 0;
}
% strassen/tsc.c
// strassen/mtx.c -- matrix allocation
#include <strassen/strassen.h>
double
tscgetf(void)
{
struct timespec ts;
double sec;
clock_gettime(CLOCK_MONOTONIC,&ts);
sec = ts.tv_nsec;
sec /= 1e9;
sec += ts.tv_sec;
return sec;
}
void
_sysfault(void)
{
__asm__ __volatile__("" ::: "memory");
exit(1);
}
% strassen/strassen.proto
// /home/cae/OBJ/ovrgen/strassen/strassen.proto -- prototypes
// FILE: /home/cae/preserve/ovrbnc/strassen/arith.c
// strassen/arith.c -- basic add/sub
void
Madd_orig(int *A, int *B, int *C, int n, int x);
void
MaddX(int *A, int *B, int *C, int n, int m);
void
Msub_orig(int *A, int *B, int *C, int n, int x);
void
MsubX(int *A, int *B, int *C, int n, int m);
void
Madd0(int *A, int *B, int *C, int n);
void
Madd1(int *A,int Aoff, int *B,int Boff, int *C, int n);
void
Msub0(int *A, int *B, int *C, int n);
void
Msub1(int *A,int Aoff, int *B,int Boff, int *C, int n);
// FILE: /home/cae/preserve/ovrbnc/strassen/core.c
// strassen/core.c -- strassen function
void
strassen(int *A, int *B, int *C, int n);
// FILE: /home/cae/preserve/ovrbnc/strassen/mtx.c
// strassen/mtx.c -- matrix allocation
int *
mtxacq(int hgt,int wid,const char *sym);
void
mtxrls(int *mtx);
mtxgeo_t *
mtxfree(mtxgeo_t *list);
void
mtxzero(int *mtx);
void
mtxcmp(int *lhsbase,int *rhsbase);
int *
_mtxoffchk(int *mtx,int off,int m,int n);
void
_geofault(mtxgeo_t *geo,int off,const char *why);
// FILE: /home/cae/preserve/ovrbnc/strassen/strassen.c
// strassen/strassen.c -- strassen test
int
main(void);
// FILE: /home/cae/preserve/ovrbnc/strassen/tsc.c
// strassen/mtx.c -- matrix allocation
double
tscgetf(void);
void
_sysfault(void);
% strassen/Makefile
# /home/cae/preserve/ovrbnc/strassen -- makefile
PGMTGT += strassen
CURSRC += arith.c
CURSRC += core.c
CURSRC += mtx.c
CURSRC += tsc.c
ifndef COPTS
COPTS += -O2
endif
CFLAGS += $(COPTS)
CFLAGS += -g
CFLAGS += -Wall
CFLAGS += -Werror
CFLAGS += -I..
all: $(PGMTGT)
strassen: strassen.c $(CURSRC) $(LIBSRC)
cc -o strassen $(CFLAGS) strassen.c $(CURSRC) $(LIBSRC)
clean:
rm -f $(PGMTGT)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 |