This commit is contained in:
youxie
2019-12-01 22:57:43 +01:00
parent 56a95ec733
commit 2324790537
3 changed files with 160 additions and 0 deletions

750
Simulations/pcgsolver.h Normal file
View File

@@ -0,0 +1,750 @@
//
// Preconditioned conjugate gradient solver
//
// Created by Robert Bridson, Ryoichi Ando and Nils Thuerey
//
#ifndef RCMATRIX3_H
#define RCMATRIX3_H
#include <iterator>
#include <cassert>
#include <vector>
#include <fstream>
#include <cmath>
#include <functional>
// index type
#define int_index long long
// parallelization disabled
#define parallel_for(size) { int thread_number = 0; int_index parallel_index=0; for( int_index parallel_index=0; parallel_index<(int_index)size; parallel_index++ ) {
#define parallel_end } thread_number=parallel_index=0; }
#define parallel_block
#define do_parallel
#define do_end
#define block_end
#include "vectorbase.h"
// note - "Int" instead of "N" here, the latter is size!
template<class Int, class T>
struct InstantBLAS {
static inline Int offset(Int N, Int incX) { return ((incX) > 0 ? 0 : ((N) - 1) * (-(incX))); }
static T cblas_ddot( const Int N, const T *X, const Int incX, const T *Y, const Int incY) {
double r = 0.0; // always use double precision internally here...
Int i;
Int ix = offset(N,incX);
Int iy = offset(N,incY);
for (i = 0; i < N; i++) {
r += X[ix] * Y[iy];
ix += incX;
iy += incY;
}
return (T)r;
}
static void cblas_daxpy( const Int N, const T alpha, const T *X, const Int incX, T *Y, const Int incY) {
Int i;
if (N <= 0 ) return;
if (alpha == 0.0) return;
if (incX == 1 && incY == 1) {
const Int m = N % 4;
for (i = 0; i < m; i++)
Y[i] += alpha * X[i];
for (i = m; i + 3 < N; i += 4) {
Y[i ] += alpha * X[i ];
Y[i + 1] += alpha * X[i + 1];
Y[i + 2] += alpha * X[i + 2];
Y[i + 3] += alpha * X[i + 3];
}
} else {
Int ix = offset(N, incX);
Int iy = offset(N, incY);
for (i = 0; i < N; i++) {
Y[iy] += alpha * X[ix];
ix += incX;
iy += incY;
}
}
}
// dot products ==============================================================
static inline T dot(const std::vector<T> &x, const std::vector<T> &y) {
return cblas_ddot((int)x.size(), &x[0], 1, &y[0], 1);
}
// inf-norm (maximum absolute value: index of max returned) ==================
static inline Int index_abs_max(const std::vector<T> &x) {
int maxind = 0;
T maxvalue = 0;
for(Int i = 0; i < (Int)x.size(); ++i) {
if(std::abs(x[i]) > maxvalue) {
maxvalue = fabs(x[i]);
maxind = i;
}
}
return maxind;
}
// inf-norm (maximum absolute value) =========================================
// technically not part of BLAS, but useful
static inline T abs_max(const std::vector<T> &x)
{ return std::abs(x[index_abs_max(x)]); }
// saxpy (y=alpha*x+y) =======================================================
static inline void add_scaled(T alpha, const std::vector<T> &x, std::vector<T> &y) {
cblas_daxpy((Int)x.size(), alpha, &x[0], 1, &y[0], 1);
}
};
template<class T>
void zero(std::vector<T> &v)
{ for(int i=(int)v.size()-1; i>=0; --i) v[i]=0; }
template<class T>
void insert(std::vector<T> &a, unsigned int index, T e)
{
a.push_back(a.back());
for(unsigned int i=(unsigned int)a.size()-1; i>index; --i)
a[i]=a[i-1];
a[index]=e;
}
template<class T>
void erase(std::vector<T> &a, unsigned int index)
{
for(unsigned int i=index; i<a.size()-1; ++i)
a[i]=a[i+1];
a.pop_back();
}
//============================================================================
// Dynamic compressed sparse row matrix.
template<class T>
struct SparseMatrix
{
int n; // dimension
std::vector<std::vector<int> > index; // for each row, a list of all column indices (sorted)
std::vector<std::vector<T> > value; // values corresponding to index
explicit SparseMatrix(int n_=0, int expected_nonzeros_per_row=7)
: n(n_), index(n_), value(n_)
{
for(int i=0; i<n; ++i){
index[i].reserve(expected_nonzeros_per_row);
value[i].reserve(expected_nonzeros_per_row);
}
}
void clear(void)
{
n=0;
index.clear();
value.clear();
}
void zero(void)
{
for(int i=0; i<n; ++i){
index[i].resize(0);
value[i].resize(0);
}
}
void resize(int n_)
{
n=n_;
index.resize(n);
value.resize(n);
}
T operator()(int i, int j) const
{
for(int k=0; k<(int)index[i].size(); ++k){
if(index[i][k]==j) return value[i][k];
else if(index[i][k]>j) return 0;
}
return 0;
}
void set_element(int i, int j, T new_value)
{
int k=0;
for(; k<(int)index[i].size(); ++k){
if(index[i][k]==j){
value[i][k]=new_value;
return;
}else if(index[i][k]>j){
insert(index[i], k, j);
insert(value[i], k, new_value);
return;
}
}
index[i].push_back(j);
value[i].push_back(new_value);
}
void add_to_element(int i, int j, T increment_value)
{
int k=0;
for(; k<(int)index[i].size(); ++k){
if(index[i][k]==j){
value[i][k]+=increment_value;
return;
}else if(index[i][k]>j){
insert(index[i], k, j);
insert(value[i], k, increment_value);
return;
}
}
index[i].push_back(j);
value[i].push_back(increment_value);
}
// assumes indices is already sorted
void add_sparse_row(int i, const std::vector<int> &indices, const std::vector<T> &values)
{
int j=0, k=0;
while(j<indices.size() && k<(int)index[i].size()){
if(index[i][k]<indices[j]){
++k;
}else if(index[i][k]>indices[j]){
insert(index[i], k, indices[j]);
insert(value[i], k, values[j]);
++j;
}else{
value[i][k]+=values[j];
++j;
++k;
}
}
for(;j<indices.size(); ++j){
index[i].push_back(indices[j]);
value[i].push_back(values[j]);
}
}
// assumes matrix has symmetric structure - so the indices in row i tell us which columns to delete i from
void symmetric_remove_row_and_column(int i)
{
for(int a=0; a<index[i].size(); ++a){
int j=index[i][a]; //
for(int b=0; b<index[j].size(); ++b){
if(index[j][b]==i){
erase(index[j], b);
erase(value[j], b);
break;
}
}
}
index[i].resize(0);
value[i].resize(0);
}
void write_matlab(std::ostream &output, const char *variable_name)
{
output<<variable_name<<"=sparse([";
for(int i=0; i<n; ++i){
for(int j=0; j<index[i].size(); ++j){
output<<i+1<<" ";
}
}
output<<"],...\n [";
for(int i=0; i<n; ++i){
for(int j=0; j<index[i].size(); ++j){
output<<index[i][j]+1<<" ";
}
}
output<<"],...\n [";
for(int i=0; i<n; ++i){
for(int j=0; j<value[i].size(); ++j){
output<<value[i][j]<<" ";
}
}
output<<"], "<<n<<", "<<n<<");"<<std::endl;
}
};
typedef SparseMatrix<float> SparseMatrixf;
typedef SparseMatrix<double> SparseMatrixd;
// perform result=matrix*x
template<class T>
void multiply(const SparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
{
assert(matrix.n==x.size());
result.resize(matrix.n);
//for(int i=0; i<matrix.n; ++i)
parallel_for(matrix.n) {
unsigned i (parallel_index);
T value=0;
for(int j=0; j<(int)matrix.index[i].size(); ++j){
value+=matrix.value[i][j]*x[matrix.index[i][j]];
}
result[i]=value;
} parallel_end
}
// perform result=result-matrix*x
template<class T>
void multiply_and_subtract(const SparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
{
assert(matrix.n==x.size());
result.resize(matrix.n);
for(int i=0; i<(int)matrix.n; ++i){
for(int j=0; j<(int)matrix.index[i].size(); ++j){
result[i]-=matrix.value[i][j]*x[matrix.index[i][j]];
}
}
}
//============================================================================
// Fixed version of SparseMatrix. This is not a good structure for dynamically
// modifying the matrix, but can be significantly faster for matrix-vector
// multiplies due to better data locality.
template<class T>
struct FixedSparseMatrix
{
int n; // dimension
std::vector<T> value; // nonzero values row by row
std::vector<int> colindex; // corresponding column indices
std::vector<int> rowstart; // where each row starts in value and colindex (and last entry is one past the end, the number of nonzeros)
explicit FixedSparseMatrix(int n_=0)
: n(n_), value(0), colindex(0), rowstart(n_+1)
{}
void clear(void)
{
n=0;
value.clear();
colindex.clear();
rowstart.clear();
}
void resize(int n_)
{
n=n_;
rowstart.resize(n+1);
}
void construct_from_matrix(const SparseMatrix<T> &matrix)
{
resize(matrix.n);
rowstart[0]=0;
for(int i=0; i<n; ++i){
rowstart[i+1]=rowstart[i]+matrix.index[i].size();
}
value.resize(rowstart[n]);
colindex.resize(rowstart[n]);
int j=0;
for(int i=0; i<n; ++i){
for(int k=0; k<(int)matrix.index[i].size(); ++k){
value[j]=matrix.value[i][k];
colindex[j]=matrix.index[i][k];
++j;
}
}
}
void write_matlab(std::ostream &output, const char *variable_name)
{
output<<variable_name<<"=sparse([";
for(int i=0; i<n; ++i){
for(int j=rowstart[i]; j<rowstart[i+1]; ++j){
output<<i+1<<" ";
}
}
output<<"],...\n [";
for(int i=0; i<n; ++i){
for(int j=rowstart[i]; j<rowstart[i+1]; ++j){
output<<colindex[j]+1<<" ";
}
}
output<<"],...\n [";
for(int i=0; i<n; ++i){
for(int j=rowstart[i]; j<rowstart[i+1]; ++j){
output<<value[j]<<" ";
}
}
output<<"], "<<n<<", "<<n<<");"<<std::endl;
}
};
// perform result=matrix*x
template<class T>
void multiply(const FixedSparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
{
assert(matrix.n==x.size());
result.resize(matrix.n);
//for(int i=0; i<matrix.n; ++i)
parallel_for(matrix.n) {
unsigned i (parallel_index);
T value=0;
for(int j=matrix.rowstart[i]; j<matrix.rowstart[i+1]; ++j){
value+=matrix.value[j]*x[matrix.colindex[j]];
}
result[i]=value;
} parallel_end
}
// perform result=result-matrix*x
template<class T>
void multiply_and_subtract(const FixedSparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
{
assert(matrix.n==x.size());
result.resize(matrix.n);
for(int i=0; i<matrix.n; ++i){
for(int j=matrix.rowstart[i]; j<matrix.rowstart[i+1]; ++j){
result[i]-=matrix.value[j]*x[matrix.colindex[j]];
}
}
}
//============================================================================
// A simple compressed sparse column data structure (with separate diagonal)
// for lower triangular matrices
template<class T>
struct SparseColumnLowerFactor
{
int n;
std::vector<T> invdiag; // reciprocals of diagonal elements
std::vector<T> value; // values below the diagonal, listed column by column
std::vector<int> rowindex; // a list of all row indices, for each column in turn
std::vector<int> colstart; // where each column begins in rowindex (plus an extra entry at the end, of #nonzeros)
std::vector<T> adiag; // just used in factorization: minimum "safe" diagonal entry allowed
explicit SparseColumnLowerFactor(int n_=0)
: n(n_), invdiag(n_), colstart(n_+1), adiag(n_)
{}
void clear(void)
{
n=0;
invdiag.clear();
value.clear();
rowindex.clear();
colstart.clear();
adiag.clear();
}
void resize(int n_)
{
n=n_;
invdiag.resize(n);
colstart.resize(n+1);
adiag.resize(n);
}
void write_matlab(std::ostream &output, const char *variable_name)
{
output<<variable_name<<"=sparse([";
for(int i=0; i<n; ++i){
output<<" "<<i+1;
for(int j=colstart[i]; j<colstart[i+1]; ++j){
output<<" "<<rowindex[j]+1;
}
}
output<<"],...\n [";
for(int i=0; i<n; ++i){
output<<" "<<i+1;
for(int j=colstart[i]; j<colstart[i+1]; ++j){
output<<" "<<i+1;
}
}
output<<"],...\n [";
for(int i=0; i<n; ++i){
output<<" "<<(invdiag[i]!=0 ? 1/invdiag[i] : 0);
for(int j=colstart[i]; j<colstart[i+1]; ++j){
output<<" "<<value[j];
}
}
output<<"], "<<n<<", "<<n<<");"<<std::endl;
}
};
//============================================================================
// Incomplete Cholesky factorization, level zero, with option for modified version.
// Set modification_parameter between zero (regular incomplete Cholesky) and
// one (fully modified version), with values close to one usually giving the best
// results. The min_diagonal_ratio parameter is used to detect and correct
// problems in factorization: if a pivot is this much less than the diagonal
// entry from the original matrix, the original matrix entry is used instead.
template<class T>
void factor_modified_incomplete_cholesky0(const SparseMatrix<T> &matrix, SparseColumnLowerFactor<T> &factor,
T modification_parameter=0.97, T min_diagonal_ratio=0.25)
{
// first copy lower triangle of matrix into factor (Note: assuming A is symmetric of course!)
factor.resize(matrix.n);
zero(factor.invdiag); // important: eliminate old values from previous solves!
factor.value.resize(0);
factor.rowindex.resize(0);
zero(factor.adiag);
for(int i=0; i<matrix.n; ++i){
factor.colstart[i]=(int)factor.rowindex.size();
for(int j=0; j<(int)matrix.index[i].size(); ++j){
if(matrix.index[i][j]>i){
factor.rowindex.push_back(matrix.index[i][j]);
factor.value.push_back(matrix.value[i][j]);
}else if(matrix.index[i][j]==i){
factor.invdiag[i]=factor.adiag[i]=matrix.value[i][j];
}
}
}
factor.colstart[matrix.n]=(int)factor.rowindex.size();
// now do the incomplete factorization (figure out numerical values)
// MATLAB code:
// L=tril(A);
// for k=1:size(L,2)
// L(k,k)=sqrt(L(k,k));
// L(k+1:end,k)=L(k+1:end,k)/L(k,k);
// for j=find(L(:,k))'
// if j>k
// fullupdate=L(:,k)*L(j,k);
// incompleteupdate=fullupdate.*(A(:,j)~=0);
// missing=sum(fullupdate-incompleteupdate);
// L(j:end,j)=L(j:end,j)-incompleteupdate(j:end);
// L(j,j)=L(j,j)-omega*missing;
// end
// end
// end
for(int k=0; k<matrix.n; ++k){
if(factor.adiag[k]==0) continue; // null row/column
// figure out the final L(k,k) entry
if(factor.invdiag[k]<min_diagonal_ratio*factor.adiag[k])
factor.invdiag[k]=1/sqrt(factor.adiag[k]); // drop to Gauss-Seidel here if the pivot looks dangerously small
else
factor.invdiag[k]=1/sqrt(factor.invdiag[k]);
// finalize the k'th column L(:,k)
for(int p=factor.colstart[k]; p<factor.colstart[k+1]; ++p){
factor.value[p]*=factor.invdiag[k];
}
// incompletely eliminate L(:,k) from future columns, modifying diagonals
for(int p=factor.colstart[k]; p<factor.colstart[k+1]; ++p){
int j=factor.rowindex[p]; // work on column j
T multiplier=factor.value[p];
T missing=0;
int a=factor.colstart[k];
// first look for contributions to missing from dropped entries above the diagonal in column j
int b=0;
while(a<factor.colstart[k+1] && factor.rowindex[a]<j){
// look for factor.rowindex[a] in matrix.index[j] starting at b
while(b<(int)matrix.index[j].size()){
if(matrix.index[j][b]<factor.rowindex[a])
++b;
else if(matrix.index[j][b]==factor.rowindex[a])
break;
else{
missing+=factor.value[a];
break;
}
}
++a;
}
// adjust the diagonal j,j entry
if(a<factor.colstart[k+1] && factor.rowindex[a]==j){
factor.invdiag[j]-=multiplier*factor.value[a];
}
++a;
// and now eliminate from the nonzero entries below the diagonal in column j (or add to missing if we can't)
b=factor.colstart[j];
while(a<factor.colstart[k+1] && b<factor.colstart[j+1]){
if(factor.rowindex[b]<factor.rowindex[a])
++b;
else if(factor.rowindex[b]==factor.rowindex[a]){
factor.value[b]-=multiplier*factor.value[a];
++a;
++b;
}else{
missing+=factor.value[a];
++a;
}
}
// and if there's anything left to do, add it to missing
while(a<factor.colstart[k+1]){
missing+=factor.value[a];
++a;
}
// and do the final diagonal adjustment from the missing entries
factor.invdiag[j]-=modification_parameter*multiplier*missing;
}
}
}
//============================================================================
// Solution routines with lower triangular matrix.
// solve L*result=rhs
template<class T>
void solve_lower(const SparseColumnLowerFactor<T> &factor, const std::vector<T> &rhs, std::vector<T> &result)
{
assert(factor.n==rhs.size());
assert(factor.n==result.size());
result=rhs;
for(int i=0; i<factor.n; ++i){
result[i]*=factor.invdiag[i];
for(int j=factor.colstart[i]; j<factor.colstart[i+1]; ++j){
result[factor.rowindex[j]]-=factor.value[j]*result[i];
}
}
}
// solve L^T*result=rhs
template<class T>
void solve_lower_transpose_in_place(const SparseColumnLowerFactor<T> &factor, std::vector<T> &x)
{
assert(factor.n==(int)x.size());
assert(factor.n>0);
int i=factor.n;
do{
--i;
for(int j=factor.colstart[i]; j<factor.colstart[i+1]; ++j){
x[i]-=factor.value[j]*x[factor.rowindex[j]];
}
x[i]*=factor.invdiag[i];
}while(i!=0);
}
//============================================================================
// Encapsulates the Conjugate Gradient algorithm with incomplete Cholesky
// factorization preconditioner.
template <class T>
struct SparsePCGSolver
{
SparsePCGSolver(void)
{
set_solver_parameters(1e-5, 100, 0.97, 0.25);
}
void set_solver_parameters(T tolerance_factor_, int max_iterations_, T modified_incomplete_cholesky_parameter_=0.97, T min_diagonal_ratio_=0.25)
{
tolerance_factor=tolerance_factor_;
if(tolerance_factor<1e-30) tolerance_factor=1e-30;
max_iterations=max_iterations_;
modified_incomplete_cholesky_parameter=modified_incomplete_cholesky_parameter_;
min_diagonal_ratio=min_diagonal_ratio_;
}
bool solve(const SparseMatrix<T> &matrix, const std::vector<T> &rhs, std::vector<T> &result, T &relative_residual_out, int &iterations_out, int precondition=2)
{
int n=matrix.n;
if((int)m.size()!=n){ m.resize(n); s.resize(n); z.resize(n); r.resize(n); }
zero(result);
r=rhs;
double residual_out=InstantBLAS<int,T>::abs_max(r);
if(residual_out==0) {
iterations_out=0;
return true;
}
//double tol=tolerance_factor*residual_out; // relative residual
double tol=tolerance_factor;
double residual_0 = residual_out;
form_preconditioner(matrix, precondition);
apply_preconditioner( r, z, precondition);
double rho=InstantBLAS<int,T>::dot(z, r);
if(rho==0 || rho!=rho) {
iterations_out=0;
return false;
}
s=z;
fixed_matrix.construct_from_matrix(matrix);
int iteration;
for(iteration=0; iteration<max_iterations; ++iteration){
multiply(fixed_matrix, s, z);
double alpha=rho/InstantBLAS<int,T>::dot(s, z);
InstantBLAS<int,T>::add_scaled(alpha, s, result);
InstantBLAS<int,T>::add_scaled(-alpha, z, r);
residual_out=InstantBLAS<int,T>::abs_max(r);
relative_residual_out = residual_out / residual_0;
if(residual_out<=tol) {
iterations_out=iteration+1;
return true;
}
apply_preconditioner(r, z, precondition);
double rho_new=InstantBLAS<int,T>::dot(z, r);
double beta=rho_new/rho;
InstantBLAS<int,T>::add_scaled(beta, s, z); s.swap(z); // s=beta*s+z
rho=rho_new;
}
iterations_out=iteration;
relative_residual_out = residual_out / residual_0;
return false;
}
protected:
// internal structures
SparseColumnLowerFactor<T> ic_factor; // modified incomplete cholesky factor
std::vector<T> m, z, s, r; // temporary vectors for PCG
FixedSparseMatrix<T> fixed_matrix; // used within loop
// parameters
T tolerance_factor;
int max_iterations;
T modified_incomplete_cholesky_parameter;
T min_diagonal_ratio;
void form_preconditioner(const SparseMatrix<T>& matrix, int precondition=2)
{
if(precondition==2) {
// incomplete cholesky
factor_modified_incomplete_cholesky0(matrix, ic_factor, modified_incomplete_cholesky_parameter, min_diagonal_ratio);
} else if(precondition==1) {
// diagonal
ic_factor.resize(matrix.n);
zero(ic_factor.invdiag);
for(int i=0; i<matrix.n; ++i) {
for(int j=0; j<(int)matrix.index[i].size(); ++j){
if(matrix.index[i][j]==i){
ic_factor.invdiag[i] = 1./matrix.value[i][j];
}
}
}
}
}
void apply_preconditioner(const std::vector<T> &x, std::vector<T> &result, int precondition=2)
{
if (precondition==2) {
// incomplete cholesky
solve_lower(ic_factor, x, result);
solve_lower_transpose_in_place(ic_factor,result);
} else if(precondition==1) {
// diagonal
for(int_index i=0; i<(int_index)result.size(); ++i) {
result[i] = x[i] * ic_factor.invdiag[i];
}
} else {
// off
result = x;
}
}
};
#undef parallel_for
#undef parallel_end
#undef int_index
#undef parallel_block
#undef do_parallel
#undef do_end
#undef block_end
#endif