#include <iostream>
#include <cmath>
#include <fstream>
#include <vector>
#include <cstdlib>
#include <limits>
#include <mpi.h>
#include <stdio.h>
#include <assert.h>
typedef std::vector<double> Vec;
typedef double(*Function)(const size_t&, const Vec&);
typedef double(*Deriv)(const size_t&, const size_t&, const Vec&);
typedef std::vector<Function> Sys_;
typedef std::vector<std::vector<Deriv> > Derivatives;
typedef std::vector<std::vector<double> > Matrix;
typedef Vec(*LSSolver)(const Matrix&, const Vec&);
typedef Vec(*VecSum)(const Vec&, const Vec&);
typedef double(*VecDiff)(const Vec&, const Vec&);
struct LS {
Matrix A;
Vec b;
};
size_t system_size = 4096;
double func(const size_t& i, const Vec& v) {
return cos(v[i]) - 1;
}
double derivative(const size_t& i, const size_t& j, const Vec& v) {
if (i == j) {
return -sin(v[i]);
}
return 0.0;
}
static void printVec(const Vec& v) {
std::cout << "size: " << v.size() << std::endl;
for (size_t i = 0; i < v.size(); ++i) {
std::cout << v[i] << " ";
}
std::cout << std::endl;
}
Vec gauss(const Matrix& A, const Vec& b) {
MPI_Status status;
int nSize, nRowsBloc, nRows, nCols;
int Numprocs, MyRank, Root = 0;
int irow, jrow, icol, index, ColofPivot, neigh_proc;
double *Input_A, *Input_B, *ARecv, *BRecv;
double *Output, Pivot;
double *X_buffer, *Y_buffer;
double *Buffer_Pivot, *Buffer_bksub;
double tmp;
MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
MPI_Comm_size(MPI_COMM_WORLD, &Numprocs);
if (MyRank == 0) {
nRows = A.size();
nCols = A[0].size();
nSize = nRows;
Input_B = (double *)malloc(nSize * sizeof(double));
for (irow = 0; irow < nSize; irow++) {
Input_B[irow] = b[irow];
}
Input_A = (double *)malloc(nSize*nSize * sizeof(double));
index = 0;
for (irow = 0; irow < nSize; irow++)
for (icol = 0; icol < nSize; icol++)
Input_A[index++] = A[irow][icol];
}
MPI_Bcast(&nRows, 1, MPI_INT, Root, MPI_COMM_WORLD);
MPI_Bcast(&nCols, 1, MPI_INT, Root, MPI_COMM_WORLD);
/* .... Broad cast the size of the matrix to all ....*/
MPI_Bcast(&nSize, 1, MPI_INT, 0, MPI_COMM_WORLD);
nRowsBloc = nSize / Numprocs;
/*......Memory of input matrix and vector on each process .....*/
ARecv = (double *)malloc(nRowsBloc * nSize * sizeof(double));
BRecv = (double *)malloc(nRowsBloc * sizeof(double));
/*......Scatter the Input Data to all process ......*/
MPI_Scatter(Input_A, nRowsBloc * nSize, MPI_DOUBLE, ARecv, nRowsBloc * nSize, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Scatter(Input_B, nRowsBloc, MPI_DOUBLE, BRecv, nRowsBloc, MPI_DOUBLE, 0, MPI_COMM_WORLD);
/* ....Memory allocation of ray Buffer_Pivot .....*/
Buffer_Pivot = (double *)malloc((nRowsBloc + 1 + nSize * nRowsBloc) * sizeof(double));
/* Receive data from all processors (i=0 to k-1) above my processor (k).... */
for (neigh_proc = 0; neigh_proc < MyRank; neigh_proc++) {
MPI_Recv(Buffer_Pivot, nRowsBloc * nSize + 1 + nRowsBloc, MPI_DOUBLE, neigh_proc, neigh_proc, MPI_COMM_WORLD, &status);
for (irow = 0; irow < nRowsBloc; irow++) {
/* .... Buffer_Pivot[0] : locate the rank of the processor received */
/* .... Index is used to reduce the matrix to traingular matrix */
/* .... Buffer_Pivot[0] is used to determine the starting value of
pivot in each row of the matrix, on each processor */
ColofPivot = ((int)Buffer_Pivot[0]) * nRowsBloc + irow;
for (jrow = 0; jrow < nRowsBloc; jrow++) {
index = jrow*nSize;
tmp = ARecv[index + ColofPivot];
for (icol = ColofPivot; icol < nSize; icol++) {
ARecv[index + icol] -= tmp * Buffer_Pivot[irow*nSize + icol + 1 + nRowsBloc];
}
BRecv[jrow] -= tmp * Buffer_Pivot[1 + irow];
ARecv[index + ColofPivot] = 0.0;
}
}
}
Y_buffer = (double *)malloc(nRowsBloc * sizeof(double));
/* ....Modification of row entries on each processor ...*/
/* ....Division by pivot value and modification ...*/
for (irow = 0; irow < nRowsBloc; irow++) {
ColofPivot = MyRank * nRowsBloc + irow;
index = irow*nSize;
Pivot = ARecv[index + ColofPivot];
assert(Pivot != 0);
for (icol = ColofPivot; icol < nSize; icol++) {
ARecv[index + icol] = ARecv[index + icol] / Pivot;
Buffer_Pivot[index + icol + 1 + nRowsBloc] = ARecv[index + icol];
}
Y_buffer[irow] = BRecv[irow] / Pivot;
Buffer_Pivot[irow + 1] = Y_buffer[irow];
for (jrow = irow + 1; jrow < nRowsBloc; jrow++) {
tmp = ARecv[jrow*nSize + ColofPivot];
for (icol = ColofPivot + 1; icol < nSize; icol++) {
ARecv[jrow*nSize + icol] -= tmp * Buffer_Pivot[index + icol + 1 + nRowsBloc];
}
BRecv[jrow] -= tmp * Y_buffer[irow];
ARecv[jrow*nSize + irow] = 0;
}
}
/*....Send data to all processors below the current processors */
for (neigh_proc = MyRank + 1; neigh_proc < Numprocs; neigh_proc++) {
/* ...... Rank is stored in first location of Buffer_Pivot and
this is used in reduction to triangular form ....*/
Buffer_Pivot[0] = (double)MyRank;
MPI_Send(Buffer_Pivot, nRowsBloc*nSize + 1 + nRowsBloc, MPI_DOUBLE, neigh_proc, MyRank, MPI_COMM_WORLD);
}
/*.... Back Substitution starts from here ........*/
/*.... Receive from all higher processors ......*/
Buffer_bksub = (double *)malloc(nRowsBloc * 2 * sizeof(double));
X_buffer = (double *)malloc(nRowsBloc * sizeof(double));
for (neigh_proc = MyRank + 1; neigh_proc < Numprocs; ++neigh_proc) {
MPI_Recv(Buffer_bksub, 2 * nRowsBloc, MPI_DOUBLE, neigh_proc, neigh_proc,
MPI_COMM_WORLD, &status);
for (irow = nRowsBloc - 1; irow >= 0; irow--) {
for (icol = nRowsBloc - 1; icol >= 0; icol--) {
/* ... Pick up starting Index .....*/
index = (int)Buffer_bksub[icol];
Y_buffer[irow] -= Buffer_bksub[nRowsBloc + icol] * ARecv[irow*nSize + index];
}
}
}
for (irow = nRowsBloc - 1; irow >= 0; irow--) {
index = MyRank*nRowsBloc + irow;
Buffer_bksub[irow] = (double)index;
Buffer_bksub[nRowsBloc + irow] = X_buffer[irow] = Y_buffer[irow];
for (jrow = irow - 1; jrow >= 0; jrow--)
Y_buffer[jrow] -= X_buffer[irow] * ARecv[jrow*nSize + index];
}
/*.... Send to all lower processes...*/
for (neigh_proc = 0; neigh_proc < MyRank; neigh_proc++) {
MPI_Send(Buffer_bksub, 2 * nRowsBloc, MPI_DOUBLE, neigh_proc, MyRank, MPI_COMM_WORLD);
}
/*.... Gather the result on the processor 0 ....*/
Output = (double *)malloc(nSize * sizeof(double));
MPI_Gather(X_buffer, nRowsBloc, MPI_DOUBLE, Output, nRowsBloc, MPI_DOUBLE, 0, MPI_COMM_WORLD);
/* .......Output vector .....*/
Vec res(nSize);
if (MyRank == 0) {
for (irow = 0; irow < nSize; irow++)
res[irow] = Output[irow];
}
return res;
}
Vec sum(const Vec& lhs, const Vec& rhs) {
Vec res(lhs.size());
if (lhs.size() != rhs.size()) {
return res;
}
for (size_t i = 0; i < lhs.size(); ++i) {
res[i] = lhs[i] + rhs[i];
}
return res;
}
double diff(const Vec& lhs, const Vec& rhs) {
double res = 0.;
for (size_t i = 0; i < lhs.size(); ++i) {
res += lhs[i] - rhs[i];
}
return fabs(res);
}
class Newtone {
public:
Newtone(int rank) : m_rank(rank) {
m_h = std::numeric_limits<double>::epsilon();
}
Vec find_solution(const Sys_& sys, const Vec& start, const Derivatives& d, LSSolver solver,
VecSum vec_summator, VecDiff vec_differ, const double& eps, const size_t& max_iter) {
size_t iter_count = 1;
double diff = 0.;
Vec sys_val(sys.size(), 0);
if (m_rank == 0) {
m_jac.reserve(sys.size());
for (size_t i = 0; i < sys.size(); ++i) {
Vec v(sys.size());
m_jac.push_back(v);
}
compute_jacobian(sys, d, start);
for (size_t i = 0; i < sys.size(); ++i) {
sys_val[i] = -sys[i](i, start);
}
}
MPI_Barrier(MPI_COMM_WORLD);
Vec delta = solver(m_jac, sys_val);
Vec new_sol(sys.size()), old_sol(sys.size());
if (m_rank == 0) {
new_sol = vec_summator(start, delta);
old_sol = start;
}
MPI_Bcast(new_sol.data(), new_sol.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(old_sol.data(), old_sol.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
diff = vec_differ(new_sol, old_sol);
while (diff > eps && iter_count <= max_iter) {
old_sol = new_sol;
if (m_rank == 0) {
compute_jacobian(sys, d, old_sol);
for (size_t i = 0; i < sys.size(); ++i) {
sys_val[i] = -sys[i](i, old_sol);
}
}
MPI_Barrier(MPI_COMM_WORLD);
delta = solver(m_jac, sys_val);
if (m_rank == 0) {
new_sol = vec_summator(old_sol, delta);
}
MPI_Bcast(new_sol.data(), new_sol.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
iter_count++;
diff = vec_differ(new_sol, old_sol);
}
return new_sol;
}
double compute_derivative(const size_t& pos, Function func, const size_t& var_num, const Vec& point) {
Vec left_point(point), right_point(point);
left_point[var_num] -= m_h;
right_point[var_num] += m_h;
double left = func(pos, left_point), right = func(pos, right_point);
return (right - left) / (2 * m_h);
}
private:
void compute_jacobian(const Sys_& sys, const Derivatives& d, const Vec& point) {
for (size_t i = 0; i < sys.size(); ++i) {
for (size_t j = 0; j < sys.size(); ++j) {
double res_val;
res_val = d[i][j](i, j, point);
m_jac[i][j] = res_val;
}
}
}
double m_h;
int m_rank;
Matrix m_jac;
Vec m_right_part;
};
int main(int argc, char** argv) {
int rank, size;
Sys_ s;
Derivatives d(system_size);
Vec start(system_size, 0.87);
for (size_t i = 0; i < system_size; ++i) {
s.push_back(&func);
}
for (size_t i = 0; i < system_size; ++i) {
for (size_t j = 0; j < system_size; ++j)
d[i].push_back(&derivative);
}
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
Newtone n(rank);
MPI_Barrier(MPI_COMM_WORLD);
double time = MPI_Wtime();
Vec sol = n.find_solution(s, start, d, &gauss, &sum, &diff, 0.0001, 100);
MPI_Barrier(MPI_COMM_WORLD);
time = MPI_Wtime() - time;
double max_time;
MPI_Reduce(&time, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (rank == 0) {
std::ofstream myfile;
char filename[32];
snprintf(filename, 32, "out_%ld_%d.txt", system_size, size);
myfile.open(filename);
for (size_t i = 0; i < sol.size(); ++i) {
myfile << sol[i] << " ";
}
myfile << std::endl;
myfile << "Time: " << time << " " << max_time << std::endl;
myfile.close();
}
MPI_Finalize();
return 0;
}