Note: For comparison with a paper on matrix operations, we use column major order also known as column major echelon order. Column major order is used in Fortran and row major order is utilized in C computations.
Here is the supposedly slow C source code:
void ColMajorSlowMatMul(double** a, double** b, double** c, int n)
{
for (int j = 1; j <= n; j++)
for (int k = 1; k <= n; k++)
c[j][k] = 0.0;
for (int j = 1; j <= n; j++)
{
for (int i = 1; i <= n; i++)
{
double t = 0;
for (int k = 1; k <= n; k++)
t += a[i][k] * b[k][j];
c[i][j] = t;
}
}
}
And now we present the allegedly fast multiplication code:
void ColMajorFastMatMul(double** a, double** b, double** c, int n)
{
for (int j = 1; j <= n; j++)
for (int k = 1; k <= n; k++)
c[j][k] = 0.0;
for (int j = 1; j <= n; j++)
{
for (int k = 1; k <= n; k++)
{
double t = b[k][j];
for (int i = 1; i <= n; i++)
c[i][j] += a[i][k] * t;
}
}
}
Finally, we have another algorithm for matrix multiplication:
void ColMajorSemiMatMul(double** a, double** b, double** c, int n)
{
for (int j = 1; j <= n; j++)
for (int k = 1; k <= n; k++)
c[j][k] = 0.0;
for (int j = 1; j <= n; j++)
{
for (int i = 1; i <= n; i++)
{
for (int k = 1; k <= n; k++)
c[i][j] += a[i][k] * b[k][j];
}
}
}
We performed the following experiments.
slow runtime in microseconds: 2736
semi runtime in microseconds: 4001
fast runtime in microseconds: 5293
n * n = 10000
equal = 10000
n * n = 10000
equal = 10000
slow runtime in microseconds: 21962
semi runtime in microseconds: 29585
fast runtime in microseconds: 28701
n * n = 40000
equal = 40000
n * n = 40000
equal = 40000
slow runtime in microseconds: 67256
semi runtime in microseconds: 101554
fast runtime in microseconds: 107969
n * n = 90000
equal = 90000
n * n = 90000
equal = 90000
slow runtime in microseconds: 183839
semi runtime in microseconds: 268616
fast runtime in microseconds: 244832
n * n = 160000
equal = 160000
n * n = 160000
equal = 160000
slow runtime in microseconds: 428263
semi runtime in microseconds: 638721
fast runtime in microseconds: 650535
n * n = 250000
equal = 250000
n * n = 250000
equal = 250000
C:\Users\james\source\repos\FastMatrixMultiplication\Debug\FastMatrixMultiplication.exe (process 29552) exited with code 0.
Press any key to close this window . . .
// FastMatrixMultiplication.cpp : This file contains the 'main' function. Program execution begins and ends there.
// https://www-users.york.ac.uk/~mijp1/teaching/grad_HPC_for_MatSci/Lecture4.pdf
// https://stackoverflow.com/questions/25483620/how-to-measure-running-time-of-specific-function-in-c-very-accurate
#include <stdlib.h>
#include <chrono>
#include <iostream>
using namespace std;
typedef chrono::high_resolution_clock Clock;
void ColMajorFastMatMul(double** a, double** b, double** c, int n)
{
for (int j = 1; j <= n; j++)
for (int k = 1; k <= n; k++)
c[j][k] = 0.0;
for (int j = 1; j <= n; j++)
{
for (int k = 1; k <= n; k++)
{
double t = b[k][j];
for (int i = 1; i <= n; i++)
c[i][j] += a[i][k] * t;
}
}
}
void ColMajorSemiMatMul(double** a, double** b, double** c, int n)
{
for (int j = 1; j <= n; j++)
for (int k = 1; k <= n; k++)
c[j][k] = 0.0;
for (int j = 1; j <= n; j++)
{
for (int i = 1; i <= n; i++)
{
for (int k = 1; k <= n; k++)
c[i][j] += a[i][k] * b[k][j];
}
}
}
void ColMajorSlowMatMul(double** a, double** b, double** c, int n)
{
for (int j = 1; j <= n; j++)
for (int k = 1; k <= n; k++)
c[j][k] = 0.0;
for (int j = 1; j <= n; j++)
{
for (int i = 1; i <= n; i++)
{
double t = 0;
for (int k = 1; k <= n; k++)
t += a[i][k] * b[k][j];
c[i][j] = t;
}
}
}
void GenerateMatrix(double** a, int n, int seed)
{
srand(seed);
for (int j = 1; j <= n; j++)
{
for (int k = 1; k <= n; k++)
{
a[j][k] = (double)rand() / RAND_MAX;
}
}
}
int main()
{
for (int n = 100; n <= 500; n += 100)
{
double** a = new double* [n + 1];
double** b = new double* [n + 1];
double** c = new double* [n + 1];
double** d = new double* [n + 1];
double** e = new double* [n + 1];
for (int j = 0; j < n + 1; j++)
{
a[j] = new double[n + 1];
b[j] = new double[n + 1];
c[j] = new double[n + 1];
d[j] = new double[n + 1];
e[j] = new double[n + 1];
}
GenerateMatrix(a, n, 1);
GenerateMatrix(b, n, 2);
auto clock1 = Clock::now();
ColMajorSlowMatMul(a, b, c, n);
auto clock2 = Clock::now();
long long microseconds1 = chrono::duration_cast<chrono::microseconds>
(clock2 - clock1).count();
auto clock3 = Clock::now();
ColMajorSemiMatMul(a, b, d, n);
auto clock4 = Clock::now();
long long microseconds2 = chrono::duration_cast<chrono::microseconds>
(clock4 - clock3).count();
auto clock5 = Clock::now();
ColMajorFastMatMul(a, b, e, n);
auto clock6 = Clock::now();
long long microseconds3 = chrono::duration_cast<chrono::microseconds>
(clock6 - clock5).count();
cout << "slow runtime in microseconds: " << microseconds1 << endl;
cout << "semi runtime in microseconds: " << microseconds2 << endl;
cout << "fast runtime in microseconds: " << microseconds3 << endl;
long long equal = 0;
for (int j = 1; j <= n; j++)
for (int k = 1; k <= n; k++)
if (c[j][k] == d[j][k])
equal++;
cout << "n * n = " << n * n << endl;
cout << "equal = " << equal << endl;
equal = 0;
for (int j = 1; j <= n; j++)
for (int k = 1; k <= n; k++)
if (c[j][k] == e[j][k])
equal++;
cout << "n * n = " << n * n << endl;
cout << "equal = " << equal << endl;
for (int j = 0; j < n + 1; j++)
{
delete[] a[j];
delete[] b[j];
delete[] c[j];
delete[] d[j];
delete[] e[j];
}
delete[] a;
delete[] b;
delete[] c;
delete[] d;
delete[] e;
}
}