Fast and Slow Matrix Multiplication by James Pate Williams, Jr. © January 13, 2024, All Applicable Rights Reserved

Note: For comparison with a paper on matrix operations, we use column major order also known as column major echelon order. Column major order is used in Fortran and row major order is utilized in C computations.

Here is the supposedly slow C source code:

void ColMajorSlowMatMul(double** a, double** b, double** c, int n)
{
    for (int j = 1; j <= n; j++)
        for (int k = 1; k <= n; k++)
            c[j][k] = 0.0;

    for (int j = 1; j <= n; j++)
    {
        for (int i = 1; i <= n; i++)
        {
            double t = 0;

            for (int k = 1; k <= n; k++)
                t += a[i][k] * b[k][j];

            c[i][j] = t;
        }
    }
}

And now we present the allegedly fast multiplication code:

void ColMajorFastMatMul(double** a, double** b, double** c, int n)
{
    for (int j = 1; j <= n; j++)
        for (int k = 1; k <= n; k++)
            c[j][k] = 0.0;
    
    for (int j = 1; j <= n; j++)
    {
        for (int k = 1; k <= n; k++)
        {
            double t = b[k][j];

            for (int i = 1; i <= n; i++)
                c[i][j] += a[i][k] * t;
        }
    }
}

Finally, we have another algorithm for matrix multiplication:

void ColMajorSemiMatMul(double** a, double** b, double** c, int n)
{
    for (int j = 1; j <= n; j++)
        for (int k = 1; k <= n; k++)
            c[j][k] = 0.0;

    for (int j = 1; j <= n; j++)
    {
        for (int i = 1; i <= n; i++)
        {
            for (int k = 1; k <= n; k++)
                c[i][j] += a[i][k] * b[k][j];
        }
    }
}

We performed the following experiments.


slow runtime in microseconds: 2736

semi runtime in microseconds: 4001
fast runtime in microseconds: 5293
n * n = 10000
equal = 10000
n * n = 10000
equal = 10000
slow runtime in microseconds: 21962
semi runtime in microseconds: 29585
fast runtime in microseconds: 28701
n * n = 40000
equal = 40000
n * n = 40000
equal = 40000
slow runtime in microseconds: 67256
semi runtime in microseconds: 101554
fast runtime in microseconds: 107969
n * n = 90000
equal = 90000
n * n = 90000
equal = 90000
slow runtime in microseconds: 183839
semi runtime in microseconds: 268616
fast runtime in microseconds: 244832
n * n = 160000
equal = 160000
n * n = 160000
equal = 160000
slow runtime in microseconds: 428263
semi runtime in microseconds: 638721
fast runtime in microseconds: 650535
n * n = 250000
equal = 250000
n * n = 250000
equal = 250000

C:\Users\james\source\repos\FastMatrixMultiplication\Debug\FastMatrixMultiplication.exe (process 29552) exited with code 0.
Press any key to close this window . . .















// FastMatrixMultiplication.cpp : This file contains the 'main' function. Program execution begins and ends there.
// https://www-users.york.ac.uk/~mijp1/teaching/grad_HPC_for_MatSci/Lecture4.pdf
// https://stackoverflow.com/questions/25483620/how-to-measure-running-time-of-specific-function-in-c-very-accurate

#include <stdlib.h>
#include <chrono>
#include <iostream>
using namespace std;

typedef chrono::high_resolution_clock Clock;

void ColMajorFastMatMul(double** a, double** b, double** c, int n)
{
    for (int j = 1; j <= n; j++)
        for (int k = 1; k <= n; k++)
            c[j][k] = 0.0;
    
    for (int j = 1; j <= n; j++)
    {
        for (int k = 1; k <= n; k++)
        {
            double t = b[k][j];

            for (int i = 1; i <= n; i++)
                c[i][j] += a[i][k] * t;
        }
    }
}

void ColMajorSemiMatMul(double** a, double** b, double** c, int n)
{
    for (int j = 1; j <= n; j++)
        for (int k = 1; k <= n; k++)
            c[j][k] = 0.0;

    for (int j = 1; j <= n; j++)
    {
        for (int i = 1; i <= n; i++)
        {
            for (int k = 1; k <= n; k++)
                c[i][j] += a[i][k] * b[k][j];
        }
    }
}

void ColMajorSlowMatMul(double** a, double** b, double** c, int n)
{
    for (int j = 1; j <= n; j++)
        for (int k = 1; k <= n; k++)
            c[j][k] = 0.0;

    for (int j = 1; j <= n; j++)
    {
        for (int i = 1; i <= n; i++)
        {
            double t = 0;

            for (int k = 1; k <= n; k++)
                t += a[i][k] * b[k][j];

            c[i][j] = t;
        }
    }
}

void GenerateMatrix(double** a, int n, int seed)
{
    srand(seed);

    for (int j = 1; j <= n; j++)
    {
        for (int k = 1; k <= n; k++)
        {
            a[j][k] = (double)rand() / RAND_MAX;
        }
    }
}

int main()
{
    for (int n = 100; n <= 500; n += 100)
    {
        double** a = new double* [n + 1];
        double** b = new double* [n + 1];
        double** c = new double* [n + 1];
        double** d = new double* [n + 1];
        double** e = new double* [n + 1];

        for (int j = 0; j < n + 1; j++)
        {
            a[j] = new double[n + 1];
            b[j] = new double[n + 1];
            c[j] = new double[n + 1];
            d[j] = new double[n + 1];
            e[j] = new double[n + 1];
        }

        GenerateMatrix(a, n, 1);
        GenerateMatrix(b, n, 2);

        auto clock1 = Clock::now();
        ColMajorSlowMatMul(a, b, c, n);
        auto clock2 = Clock::now();
        long long microseconds1 = chrono::duration_cast<chrono::microseconds>
            (clock2 - clock1).count();
    
        auto clock3 = Clock::now();
        ColMajorSemiMatMul(a, b, d, n);
        auto clock4 = Clock::now();
        long long microseconds2 = chrono::duration_cast<chrono::microseconds>
            (clock4 - clock3).count();

        auto clock5 = Clock::now();
        ColMajorFastMatMul(a, b, e, n);
        auto clock6 = Clock::now();
        long long microseconds3 = chrono::duration_cast<chrono::microseconds>
            (clock6 - clock5).count();

        cout << "slow runtime in microseconds: " << microseconds1 << endl;
        cout << "semi runtime in microseconds: " << microseconds2 << endl;
        cout << "fast runtime in microseconds: " << microseconds3 << endl;

        long long equal = 0;

        for (int j = 1; j <= n; j++)
            for (int k = 1; k <= n; k++)
                if (c[j][k] == d[j][k])
                    equal++;

        cout << "n * n = " << n * n << endl;
        cout << "equal = " << equal << endl;

        equal = 0;

        for (int j = 1; j <= n; j++)
            for (int k = 1; k <= n; k++)
                if (c[j][k] == e[j][k])
                    equal++;

        cout << "n * n = " << n * n << endl;
        cout << "equal = " << equal << endl;

        for (int j = 0; j < n + 1; j++)
        {
            delete[] a[j];
            delete[] b[j];
            delete[] c[j];
            delete[] d[j];
            delete[] e[j];
        }

        delete[] a;
        delete[] b;
        delete[] c;
        delete[] d;
        delete[] e;
    }
}
Unknown's avatar

Author: jamespatewilliamsjr

My whole legal name is James Pate Williams, Jr. I was born in LaGrange, Georgia approximately 70 years ago. I barely graduated from LaGrange High School with low marks in June 1971. Later in June 1979, I graduated from LaGrange College with a Bachelor of Arts in Chemistry with a little over a 3 out 4 Grade Point Average (GPA). In the Spring Quarter of 1978, I taught myself how to program a Texas Instruments desktop programmable calculator and in the Summer Quarter of 1978 I taught myself Dayton BASIC (Beginner's All-purpose Symbolic Instruction Code) on LaGrange College's Data General Eclipse minicomputer. I took courses in BASIC in the Fall Quarter of 1978 and FORTRAN IV (Formula Translator IV) in the Winter Quarter of 1979. Professor Kenneth Cooper, a genius poly-scientist taught me a course in the Intel 8085 microprocessor architecture and assembly and machine language. We would hand assemble our programs and insert the resulting machine code into our crude wooden box computer which was designed and built by Professor Cooper. From 1990 to 1994 I earned a Bachelor of Science in Computer Science from LaGrange College. I had a 4 out of 4 GPA in the period 1990 to 1994. I took courses in C, COBOL, and Pascal during my BS work. After graduating from LaGrange College a second time in May 1994, I taught myself C++. In December 1995, I started using the Internet and taught myself client-server programming. I created a website in 1997 which had C and C# implementations of algorithms from the "Handbook of Applied Cryptography" by Alfred J. Menezes, et. al., and some other cryptography and number theory textbooks and treatises.

Leave a comment