'PThreads is not providing a program speedup over serial code

I am creating 2 programs to test the differences in run time of serial matrix multiply vs that of parallel matrix multiply. The parallel code that I have written is actually running slower than serial code, and running the program with additional cores enabled provides no speedup at all... using more cores actually seems to slow down the parallel program.

What is going on here? This is my parallel code: to use this pass in matrix size and thread number (see my useage below)

#include <stdio.h>
#include <stdlib.h>  // rand(), srand()
#include <unistd.h>
#include <time.h>
#include <pthread.h>

// Time struct + prototypes
struct timespec time1, time2, diffTime;
struct timespec timespecDifference(struct timespec start, struct timespec end); // For timing
double** reserveMatrix(int nRows, int nCols);
void printMat(double** mat1, int rows, int cols);
void* matMult(void* arg);

// Argstruct
typedef struct {
    double** result;
    int tid;
    int size;
    int s;
    int e;
} argStr;

// global variables for use by all threads
int size;           // Size of a row and column. 
int numThreads;     // Number of pThreads to do work

double** mat1;
double** mat2;
double** mat3;

// Main function
int main(int argc, char *argv[]) {
    size = atoi(argv[1]);
    numThreads = atoi(argv[2]);

    mat1 = reserveMatrix(size, size);
    mat2 = reserveMatrix(size, size);
    mat3 = reserveMatrix(size, size);

    if (size == 0) {
        //printf("Matrix cannot be size 0\n");
        return -1;
    }

    //Start timer
    clock_gettime(CLOCK_MONOTONIC, &time1);

    // *********** Begin main operation *********** //
    //                                              //

    // declare necessary local variables
    pthread_t theThreads[numThreads];
    argStr data[numThreads];            // Create numThreads # of argStr objects
    for (int i = 0; i < numThreads; i++) {
        data[i].result = reserveMatrix(size, size);
        data[i].tid = i;       // Self-assigned threadID
        data[i].size = size;    // Size of a block
        data[i].s = size * i / numThreads;
        data[i].e = size * (i + 1) / numThreads - 1;
        //printf("I handle operations from %d to %d\n", data[i].s, data[i].e);
    }
    
    // Start the threads
    for (int i = 0; i < numThreads; i++) {
        pthread_create(&theThreads[i], NULL, matMult, (void*) (&data[i]));
    }

    // await all threads being done.
    for (int i = 0; i < numThreads; i++) {
        pthread_join(theThreads[i], NULL);
    }

    // rejoin received data
    //printMat(data[1].result, size, size);

    //                                              //
    // *********** End main operation ***********   //

    // Stop timer and find time taken
    clock_gettime(CLOCK_MONOTONIC, &time2);
    diffTime = timespecDifference(time1, time2);
    double cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
    //Print Time
    printf("Pthread Matrix Multiply, %d, %d, %lf\n", size, numThreads, cpuTimeUsed);

}


// Struct Timer
struct timespec timespecDifference(struct timespec start, struct timespec end)
{
    struct timespec temp;
    if ((end.tv_nsec - start.tv_nsec) < 0) {
        temp.tv_sec = end.tv_sec - start.tv_sec - 1;
        temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
    }
    else {
        temp.tv_sec = end.tv_sec - start.tv_sec;
        temp.tv_nsec = end.tv_nsec - start.tv_nsec;
    }
    return temp;
}

// Reserve matrix function
double** reserveMatrix(int nRows, int nCols) { 
    double** matrix1 = (double**)malloc(nRows * sizeof(double*));
    matrix1[0] = (double*)malloc(nRows * nCols * sizeof(double));
    
    // Assign row pointers to "segment" out the data
    for (int r = 1; r < nRows; ++r) {
        matrix1[r] = &(matrix1[0][r * nCols]);
    }

    // Give values to the array
    for(int i = 0; i < nRows * nCols; i++) {
        matrix1[0][i] = i; 
    }

    return matrix1;
}

// Print matrix function
void printMat(double** mat1, int rows, int cols) {
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("%f, ", mat1[i][j]);
        }
        printf("\n");
    }
    printf("End of array print\n");
}

void* matMult(void* arg) {

    //printf("Begin an operation\n");

    argStr* args = (argStr*)arg;
    double** result = args->result;
    int tid = args->tid;
    int size = args->size;  // Size of the matrix
    long s = args->s;    // Start
    long e = args->e;    // End

    // Print message to confirm data is getting stored
    //printf("Hello from operation %d! \n", tid);
    //printf("I am working from number %ld to %ld\n", s, e);

    for(int r = s; r <= e; r++) {        // May need to declare out of loop
        for(int c = 0; c < size; c++) {
            result[r][c] = 0.0;
            for(int i = 0; i < size; i++) {
                result[r][c] += mat1[r][i] * mat2[i][c];
            }
        }
    }

    // Print multipled matrix values
    //printMat(mat3, size, size);
    return NULL;
}

This is my serial code: To use this pass in the same sized row and column (see my useage below)

#include <stdio.h>
#include <stdlib.h>  // rand(), srand()
#include <unistd.h>
#include <time.h>
// Matrix multiply code

// **** Time struct **** //
struct timespec time1, time2, diffTime;

// Prototypes
struct timespec timespecDifference(struct timespec start, struct timespec end); // For timing
double** matrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols);
double** transMatrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols);
double** reserveMatrix(int nRows, int nCols);
double matrixProduct(double** mat1, double** mat2, int nRows, int nCols);
void printMat(double** mat1, int rows, int cols);

// Begin main
int main(int argc, char *argv[])
{
    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);

    // Declare the ARRAYS and populate them
    double** arr1 = reserveMatrix(rows, cols);
    double** arr2 = reserveMatrix(rows, cols);
    double** arr3 = reserveMatrix(rows, cols);
    double** arr4 = reserveMatrix(rows, cols);

    double prod1 = matrixProduct(arr1, arr2, rows, cols);
    
    //Start Clock
    clock_gettime(CLOCK_MONOTONIC, &time1);
        
        arr3 = matrixMultiply(arr1, arr2, arr3, rows, cols);
    
    // Stop timer and find time taken
    clock_gettime(CLOCK_MONOTONIC, &time2);
    diffTime = timespecDifference(time1, time2);
    double cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
    //Print Time
    printf("Matrix Multiply, %d, %lf\n", rows, cpuTimeUsed);
  

    // Print input matrix values. Used to test that matrix multiply works - it does
   


    // Perform a transposition of matrix 2

    for (int r = 0; r < rows; ++r) {
        for (int c = r + 1; c < cols; ++c) {
            double val = arr2[r][c];
            arr2[r][c] = arr2[c][r];
            arr2[c][r] = val;
        }
    }


    // Run matrix multiply again on the newly transposed data.
    //Start Clock
    clock_gettime(CLOCK_MONOTONIC, &time1);

        arr4 = transMatrixMultiply(arr1, arr2, arr4, rows, cols);
   
   // Stop timer and find time taken
    clock_gettime(CLOCK_MONOTONIC, &time2);
    diffTime = timespecDifference(time1, time2);
    cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
    //Print Time
    printf("Trans Matrix Multiply, %d, %lf\n", rows, cpuTimeUsed);
  


    //double prod2 = matrixProduct(arr3, arr4, rows, cols);
    //printf("The matrix product of m3 and m4 is: %f\n", prod2);

    //printMat(mat3, rows, cols);
    return 0;
} 

// Struct Timer
struct timespec timespecDifference(struct timespec start, struct timespec end)
{
    struct timespec temp;
    if ((end.tv_nsec - start.tv_nsec) < 0) {
        temp.tv_sec = end.tv_sec - start.tv_sec - 1;
        temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
    }
    else {
        temp.tv_sec = end.tv_sec - start.tv_sec;
        temp.tv_nsec = end.tv_nsec - start.tv_nsec;
    }
    return temp;
}

// standard matrix multiply
double** matrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols) {
    for (int r = 0; r < nRows; ++r) {
        for (int c = 0; c < nCols; ++c) {
            result[r][c] = 0.0;
            for (int i = 0; i < nRows; ++i) {
                result[r][c] += matrix1[r][i] * matrix2[i][c]; 
            }
        }
    }
    return result;
}

// Transpose matrix multiply
double** transMatrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols) {
    for (int c = 0; c < nCols; ++c) {
        for (int r = 0; r < nRows; ++r) {
            result[c][r] = 0.0;
            for (int i = 0; i < nCols; ++i) {
                result[c][r] += matrix1[c][i] * matrix2[r][i]; 
            }
        }
    }
    return result;
}

// Reserve data function. Reserves and populates array data
double** reserveMatrix(int nRows, int nCols) {
    double** matrix1 = (double**)malloc(nRows * sizeof(double*));
    matrix1[0] = (double*)malloc(nRows * nCols * sizeof(double));
    
    // Assign row pointers to "segment" out the data
    for (int r = 1; r < nRows; ++r) {
        matrix1[r] = &(matrix1[0][r * nCols]);
    }

    // Give values to the array
    for(int i = 0; i < nRows * nCols; i++) {
        matrix1[0][i] = i; 
    }

    return matrix1;
}

// Check that matrix1 and matrix2 are the same
double matrixProduct(double** mat1, double** mat2, int nRows, int nCols) {
    double sum = 0.0;
    for(int i = 0; i < nRows * nCols; i++) {
        sum += (mat1[0][i] - mat2[0][i]) * (mat1[0][i] - mat2[0][i]);
        //printf("matrix product pos: %i, sum: %f\n", i, sum);
    }
    return sum;
}

// Print matrix function
void printMat(double** mat1, int rows, int cols) {
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("%f, ", mat1[i][j]);
        }
        printf("\n");
    }
    printf("End of array print\n");
}

Here is the linux output of me compiling and running this code. At matrix size 1200 x 1200 the run time differences are not that pronounced, but the serial code ends up being significantly faster than the parallel at sizes above 1500 x 1500:

  • MYPC:~/Projects/matrixMultiply/phase3$ gcc matrixMult.c -o MM
  • MYPC:~/Projects/matrixMultiply/phase3$ gcc pMatMult.c -lpthread -o PMM
  • MYPC:~/Projects/matrixMultiply/phase3$ ./MM 1200 1200
  • Matrix Multiply, 1200, 25.487388
  • Trans Matrix Multiply, 1200, 16.452777
  • MYPC:~/Projects/matrixMultiply/phase3$ ./PMM 1200 2
  • Pthread Matrix Multiply, 1200, 2, 22.495115
  • MYPC:~/Projects/matrixMultiply/phase3$ ./PMM 1200 4
  • Pthread Matrix Multiply, 1200, 4, 22.181686

The sections in bold contain the meaningful output. It reads

  • name of the process
  • matrix size
  • number of threads spawned (in pThread program only)
  • run time

Any help would be appreciated. I will be instantly replying to questions for the next 2 hours.



Solution 1:[1]

The solution was to terminate extra processes that were running on my ubuntu machine. The code worked perfectly fine as a few users pointed out. Killing all other processes on the machine, then running my parallel code provided the expected speedups.

I am not sure of the precise technical reason this is going on other than the machine wasn't prioritizing my program when it had others running, resulting in slower times.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 oXeru