//
// Preconditioned conjugate gradient solver
//
// Created by Robert Bridson, Ryoichi Ando and Nils Thuerey
//
#ifndef RCMATRIX3_H
#define RCMATRIX3_H

#include <iostream>
#include <fstream>
#include <cassert>
#include <cmath>
#include <algorithm>
#include <vector>

// index type
#define int_index long long

// parallelization disabled
#define parallel_for(size) { int thread_number = 0; int_index parallel_index=0; for( int_index parallel_index=0; parallel_index<(int_index)size; parallel_index++ ) {
#define parallel_end } thread_number=parallel_index=0; }
#define parallel_block
#define do_parallel
#define do_end
#define block_end

// NOTE(review): guarded so this header can also be compiled standalone (unit
// tests); in the regular build, where vectorbase.h is on the include path,
// behavior is unchanged.
#if defined(__has_include)
#	if __has_include("vectorbase.h")
#		include "vectorbase.h"
#	endif
#else
#	include "vectorbase.h"
#endif

// Minimal self-contained replacements for the few BLAS level-1 routines the
// PCG solver needs (dot product, axpy, inf-norm).
// note - "Int" instead of "N" here, the latter is size!
template<class Int, class T>
struct InstantBLAS {
	// Starting offset for strided access; mirrors the cblas convention,
	// which supports negative strides (walk the vector backwards).
	static inline Int offset(Int N, Int incX) { return ((incX) > 0 ?  0 : ((N) - 1) * (-(incX))); }

	// Dot product of two strided raw vectors.
	static T cblas_ddot( const Int N, const T *X, const Int incX, const T *Y, const Int incY) {
		double r = 0.0; // always use double precision internally here...
		Int i;
		Int ix = offset(N,incX);
		Int iy = offset(N,incY);
		for (i = 0; i < N; i++) {
			r  += X[ix] * Y[iy];
			ix += incX;
			iy += incY;
		}
		return (T)r;
	}

	// Y += alpha*X, unrolled by four for the common unit-stride case.
	static void cblas_daxpy( const Int N, const T alpha, const T *X, const Int incX, T *Y, const Int incY) {
		Int i;
		if (N     <= 0  ) return;
		if (alpha == 0.0) return;
		if (incX == 1 && incY == 1) {
			const Int m = N % 4;
			for (i = 0; i < m; i++)
				Y[i] += alpha * X[i];
			for (i = m; i + 3 < N; i += 4) {
				Y[i    ] += alpha * X[i    ];
				Y[i + 1] += alpha * X[i + 1];
				Y[i + 2] += alpha * X[i + 2];
				Y[i + 3] += alpha * X[i + 3];
			}
		} else {
			Int ix = offset(N, incX);
			Int iy = offset(N, incY);
			for (i = 0; i < N; i++) {
				Y[iy] += alpha * X[ix];
				ix    += incX;
				iy    += incY;
			}
		}
	}

	// dot products ==============================================================
	static inline T dot(const std::vector<T> &x, const std::vector<T> &y) {
		return cblas_ddot((int)x.size(), &x[0], 1, &y[0], 1);
	}

	// inf-norm (maximum absolute value: index of max returned) ==================
	// precondition: x must be non-empty (callers below guarantee this)
	static inline Int index_abs_max(const std::vector<T> &x) {
		Int maxind = 0; // fixed: was 'int', inconsistent with the Int return type
		T maxvalue = 0;
		for(Int i = 0; i < (Int)x.size(); ++i) {
			if(std::abs(x[i]) > maxvalue) {
				maxvalue = std::abs(x[i]); // fixed: was fabs(), inconsistent with the test above
				maxind = i;
			}
		}
		return maxind;
	}

	// inf-norm (maximum absolute value) =========================================
	// technically not part of BLAS, but useful
	static inline T abs_max(const std::vector<T> &x) {
		return std::abs(x[index_abs_max(x)]);
	}

	// saxpy (y=alpha*x+y) =======================================================
	static inline void add_scaled(T alpha, const std::vector<T> &x, std::vector<T> &y) {
		cblas_daxpy((Int)x.size(), alpha, &x[0], 1, &y[0], 1);
	}
};

// Set every entry of v to zero without changing its size.
template<class T>
void zero(std::vector<T> &v)
{ for(int i=(int)v.size()-1; i>=0; --i) v[i]=0; }

// Insert element e at position index, shifting the tail up by one.
template<class T>
void insert(std::vector<T> &a, unsigned int index, T e)
{
	a.push_back(a.back());
	for(unsigned int i=(unsigned int)a.size()-1; i>index; --i)
		a[i]=a[i-1];
	a[index]=e;
}

// Remove the element at position index, shifting the tail down by one.
template<class T>
void erase(std::vector<T> &a, unsigned int index)
{
	for(unsigned int i=index; i<a.size()-1; ++i)
		a[i]=a[i+1];
	a.pop_back();
}

//============================================================================
// Dynamic compressed sparse row matrix: for each row, a sorted list of column
// indices and the corresponding values. Convenient for incremental assembly.
template<class T>
struct SparseMatrix {
	int n; // dimension
	std::vector<std::vector<int> > index; // for each row, a list of all column indices (sorted)
	std::vector<std::vector<T> > value;   // values corresponding to index

	explicit SparseMatrix(int n_=0, int expected_nonzeros_per_row=7)
		: n(n_), index(n_), value(n_)
	{
		for(int i=0; i<n; ++i){
			index[i].reserve(expected_nonzeros_per_row);
			value[i].reserve(expected_nonzeros_per_row);
		}
	}

	void clear(void)
	{ n=0; index.clear(); value.clear(); }

	// Drop all stored entries but keep the dimension (rows become empty).
	void zero(void)
	{
		for(int i=0; i<n; ++i){
			index[i].resize(0);
			value[i].resize(0);
		}
	}

	void resize(int n_)
	{ n=n_; index.resize(n); value.resize(n); }

	// Read access A(i,j); returns 0 for entries not stored explicitly.
	T operator()(int i, int j) const
	{
		for(int k=0; k<(int)index[i].size(); ++k){
			if(index[i][k]==j) return value[i][k];
			else if(index[i][k]>j) return 0; // columns are sorted: j not present
		}
		return 0;
	}

	// Set A(i,j)=new_value, inserting the entry (sorted) if absent.
	void set_element(int i, int j, T new_value)
	{
		int k=0;
		for(; k<(int)index[i].size(); ++k){
			if(index[i][k]==j){
				value[i][k]=new_value;
				return;
			}else if(index[i][k]>j){
				insert(index[i], k, j);
				insert(value[i], k, new_value);
				return;
			}
		}
		index[i].push_back(j);
		value[i].push_back(new_value);
	}

	// A(i,j)+=increment_value, inserting the entry (sorted) if absent.
	void add_to_element(int i, int j, T increment_value)
	{
		int k=0;
		for(; k<(int)index[i].size(); ++k){
			if(index[i][k]==j){
				value[i][k]+=increment_value;
				return;
			}else if(index[i][k]>j){
				insert(index[i], k, j);
				insert(value[i], k, increment_value);
				return;
			}
		}
		index[i].push_back(j);
		value[i].push_back(increment_value);
	}

	// Add a whole sparse row into row i; assumes indices is already sorted.
	void add_sparse_row(int i, const std::vector<int> &indices, const std::vector<T> &values)
	{
		int j=0, k=0;
		// merge the two sorted index lists
		while(j<(int)indices.size() && k<(int)index[i].size()){
			if(index[i][k]<indices[j]){
				++k;
			}else if(index[i][k]>indices[j]){
				insert(index[i], k, indices[j]);
				insert(value[i], k, values[j]);
				++j;
			}else{
				value[i][k]+=values[j];
				++j;
				++k;
			}
		}
		// append whatever is left over
		for(;j<(int)indices.size(); ++j){
			index[i].push_back(indices[j]);
			value[i].push_back(values[j]);
		}
	}
};

typedef SparseMatrix<float> SparseMatrixf;
typedef SparseMatrix<double> SparseMatrixd;

// perform result=matrix*x
template<class T>
void multiply(const SparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
{
	assert(matrix.n==(int)x.size());
	result.resize(matrix.n);
	//for(int i=0; i<matrix.n; ++i){
	parallel_for(matrix.n) {
		int_index i = parallel_index;
		T sum = 0;
		for(int j=0; j<(int)matrix.index[i].size(); ++j){
			sum += matrix.value[i][j]*x[matrix.index[i][j]];
		}
		result[i] = sum;
	} parallel_end
}

// perform result=result-matrix*x
template<class T>
void multiply_and_subtract(const SparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
{
	assert(matrix.n==(int)x.size());
	result.resize(matrix.n);
	for(int i=0; i<(int)matrix.n; ++i){
		for(int j=0; j<(int)matrix.index[i].size(); ++j){
			result[i]-=matrix.value[i][j]*x[matrix.index[i][j]];
		}
	}
}

//============================================================================
// Fixed version of SparseMatrix. This is not a good structure for dynamically
// modifying the matrix, but can be significantly faster for matrix-vector
// multiplies due to better data locality.
template<class T>
struct FixedSparseMatrix {
	int n;                     // dimension
	std::vector<T> value;      // nonzero values row by row
	std::vector<int> colindex; // corresponding column indices
	std::vector<int> rowstart; // where each row starts in value and colindex (and last entry is one past the end, the number of nonzeros)

	explicit FixedSparseMatrix(int n_=0)
		: n(n_), value(0), colindex(0), rowstart(n_+1) {}

	void clear(void)
	{ n=0; value.clear(); colindex.clear(); rowstart.clear(); }

	void resize(int n_)
	{ n=n_; rowstart.resize(n+1); }

	// Flatten a dynamic SparseMatrix into contiguous CSR storage.
	void construct_from_matrix(const SparseMatrix<T> &matrix)
	{
		resize(matrix.n);
		rowstart[0]=0;
		for(int i=0; i<n; ++i){
			rowstart[i+1]=rowstart[i]+(int)matrix.index[i].size();
		}
		value.resize(rowstart[n]);
		colindex.resize(rowstart[n]);
		int j=0;
		for(int i=0; i<n; ++i){
			for(int k=0; k<(int)matrix.index[i].size(); ++k){
				value[j]=matrix.value[i][k];
				colindex[j]=matrix.index[i][k];
				++j;
			}
		}
	}
};

// perform result=matrix*x
template<class T>
void multiply(const FixedSparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
{
	assert(matrix.n==(int)x.size());
	result.resize(matrix.n);
	//for(int i=0; i<matrix.n; ++i){
	parallel_for(matrix.n) {
		int_index i = parallel_index;
		T sum = 0;
		for(int j=matrix.rowstart[i]; j<matrix.rowstart[i+1]; ++j){
			sum += matrix.value[j]*x[matrix.colindex[j]];
		}
		result[i] = sum;
	} parallel_end
}

// perform result=result-matrix*x
template<class T>
void multiply_and_subtract(const FixedSparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
{
	assert(matrix.n==(int)x.size());
	result.resize(matrix.n);
	for(int i=0; i<(int)matrix.n; ++i){
		for(int j=matrix.rowstart[i]; j<matrix.rowstart[i+1]; ++j){
			result[i]-=matrix.value[j]*x[matrix.colindex[j]];
		}
	}
}

//============================================================================
// A lower-triangular matrix L stored column by column, plus the reciprocals of
// its diagonal; the product L*L^T approximates the factored matrix (modified
// incomplete Cholesky, see below).
template<class T>
struct SparseColumnLowerFactor {
	int n;
	std::vector<T> invdiag;    // reciprocals of diagonal elements
	std::vector<T> value;      // values below the diagonal, listed column by column
	std::vector<int> rowindex; // a list of all row indices, for each column in turn
	std::vector<int> colstart; // where each column begins in rowindex (plus an extra entry at the end, of #nonzeros)
	std::vector<T> adiag;      // just used in factorization: minimum "safe" diagonal entry allowed

	explicit SparseColumnLowerFactor(int n_=0)
		: n(n_), invdiag(n_), colstart(n_+1), adiag(n_) {}

	void clear(void)
	{ n=0; invdiag.clear(); value.clear(); rowindex.clear(); colstart.clear(); adiag.clear(); }

	void resize(int n_)
	{ n=n_; invdiag.resize(n); colstart.resize(n+1); adiag.resize(n); }

	// Dump the factor as a MATLAB sparse() triplet expression (debug aid).
	// NOTE(review): body restored from the Bridson reference implementation —
	// the original text was lost to extraction damage; diagnostic output only.
	void write_matlab(std::ostream &output, const char *variable_name)
	{
		output<<variable_name<<"=sparse([";
		for(int i=0; i<n; ++i){
			output<<" "<<i+1;
			for(int j=colstart[i]; j<colstart[i+1]; ++j){
				output<<" "<<rowindex[j]+1;
			}
		}
		output<<"],...\n  [";
		for(int i=0; i<n; ++i){
			output<<" "<<i+1;
			for(int j=colstart[i]; j<colstart[i+1]; ++j){
				output<<" "<<i+1;
			}
		}
		output<<"],...\n  [";
		for(int i=0; i<n; ++i){
			if(invdiag[i]!=0) output<<" "<<1/invdiag[i];
			else              output<<" 0";
			for(int j=colstart[i]; j<colstart[i+1]; ++j){
				output<<" "<<value[j];
			}
		}
		output<<"], "<<n<<", "<<n<<");"<<std::endl;
	}
};

//============================================================================
// Incomplete Cholesky factorization, level zero, with option for modified
// version. Set modification_parameter between zero (regular incomplete
// Cholesky) and one (fully modified version), with values close to one
// usually giving the best results. The min_diagonal_ratio parameter is used
// to detect and correct problems in factorization: if a pivot is this much
// less than the diagonal entry from the original matrix, the original matrix
// entry is used instead.
template<class T>
void factor_modified_incomplete_cholesky0(const SparseMatrix<T> &matrix, SparseColumnLowerFactor<T> &factor,
										  T modification_parameter=0.97, T min_diagonal_ratio=0.25)
{
	// first copy lower triangle of matrix into factor (Note: assuming A is symmetric of course!)
	factor.resize(matrix.n);
	zero(factor.invdiag); // important: eliminate old values from previous solves!
	factor.value.resize(0);
	factor.rowindex.resize(0);
	zero(factor.adiag);
	for(int i=0; i<matrix.n; ++i){
		factor.colstart[i]=(int)factor.rowindex.size();
		for(int j=0; j<(int)matrix.index[i].size(); ++j){
			if(matrix.index[i][j]>i){
				factor.rowindex.push_back(matrix.index[i][j]);
				factor.value.push_back(matrix.value[i][j]);
			}else if(matrix.index[i][j]==i){
				factor.invdiag[i]=factor.adiag[i]=matrix.value[i][j];
			}
		}
	}
	factor.colstart[matrix.n]=(int)factor.rowindex.size();

	// now do the incomplete factorization (figure out numerical values)
	// MATLAB code:
	// L=tril(A);
	// for k=1:size(L,2)
	//   L(k,k)=sqrt(L(k,k));
	//   L(k+1:end,k)=L(k+1:end,k)/L(k,k);
	//   for j=find(L(:,k))'
	//     if j>k
	//       fullupdate=L(:,k)*L(j,k);
	//       incompleteupdate=fullupdate.*(A(:,j)~=0);
	//       missing=sum(fullupdate-incompleteupdate);
	//       L(j:end,j)=L(j:end,j)-incompleteupdate(j:end);
	//       L(j,j)=L(j,j)-omega*missing;
	//     end
	//   end
	// end
	// NOTE(review): loop body restored from the Bridson reference
	// implementation; the original text was lost to extraction damage.
	for(int k=0; k<matrix.n; ++k){
		if(factor.adiag[k]==0) continue; // null row/column
		// figure out the final L(k,k) entry
		if(factor.invdiag[k]<min_diagonal_ratio*factor.adiag[k])
			factor.invdiag[k]=1/std::sqrt(factor.adiag[k]); // drop to Gauss-Seidel here if the pivot looks dangerously small
		else
			factor.invdiag[k]=1/std::sqrt(factor.invdiag[k]);
		// finalize the k'th column L(:,k)
		for(int p=factor.colstart[k]; p<factor.colstart[k+1]; ++p){
			factor.value[p]*=factor.invdiag[k];
		}
		// incompletely eliminate L(:,k) from future columns, modifying diagonals
		for(int p=factor.colstart[k]; p<factor.colstart[k+1]; ++p){
			int j=factor.rowindex[p]; // work on column j
			T multiplier=factor.value[p];
			T missing=0;
			int a=factor.colstart[k];
			// first look for contributions to missing from dropped entries above the diagonal in column j
			int b=0;
			while(a<factor.colstart[k+1] && factor.rowindex[a]<j){
				// look for factor.rowindex[a] in matrix.index[j] starting at b
				while(b<(int)matrix.index[j].size()){
					if(matrix.index[j][b]<factor.rowindex[a])
						++b;
					else if(matrix.index[j][b]==factor.rowindex[a])
						break;
					else{
						missing+=factor.value[a];
						break;
					}
				}
				++a;
			}
			// adjust the diagonal j,j entry
			if(a<factor.colstart[k+1] && factor.rowindex[a]==j){
				factor.invdiag[j]-=multiplier*factor.value[a];
			}
			++a;
			// and now eliminate from the nonzero entries below the diagonal in column j (or add to missing if we can't)
			b=factor.colstart[j];
			while(a<factor.colstart[k+1] && b<factor.colstart[j+1]){
				if(factor.rowindex[b]<factor.rowindex[a])
					++b;
				else if(factor.rowindex[b]==factor.rowindex[a]){
					factor.value[b]-=multiplier*factor.value[a];
					++a;
					++b;
				}else{
					missing+=factor.value[a];
					++a;
				}
			}
			// and if there's anything left to do, add it to missing
			while(a<factor.colstart[k+1]){
				missing+=factor.value[a];
				++a;
			}
			// and do the final diagonal adjustment from the missing entries
			factor.invdiag[j]-=modification_parameter*multiplier*missing;
		}
	}
}

//============================================================================
// Solve L*result=rhs (forward substitution with the column-stored factor).
template<class T>
void solve_lower(const SparseColumnLowerFactor<T> &factor, const std::vector<T> &rhs, std::vector<T> &result)
{
	assert(factor.n==(int)rhs.size());
	assert(factor.n==(int)result.size());
	result=rhs;
	for(int i=0; i<factor.n; ++i){
		result[i]*=factor.invdiag[i];
		for(int j=factor.colstart[i]; j<factor.colstart[i+1]; ++j){
			result[factor.rowindex[j]]-=factor.value[j]*result[i];
		}
	}
}

// Solve L^T*x=x in place (backward substitution).
template<class T>
void solve_lower_transpose_in_place(const SparseColumnLowerFactor<T> &factor, std::vector<T> &x)
{
	assert(factor.n==(int)x.size());
	assert(factor.n>0);
	int i=factor.n;
	do{
		--i;
		for(int j=factor.colstart[i]; j<factor.colstart[i+1]; ++j){
			x[i]-=factor.value[j]*x[factor.rowindex[j]];
		}
		x[i]*=factor.invdiag[i];
	}while(i!=0);
}

//============================================================================
// Encapsulates the preconditioned conjugate gradient algorithm with modified
// incomplete Cholesky (precondition=2), diagonal (precondition=1), or no
// (precondition=0) preconditioning.
template<class T>
struct SparsePCGSolver {

	SparsePCGSolver(void) { set_solver_parameters(1e-5, 100, 0.97, 0.25); }

	// tolerance_factor: absolute inf-norm residual target (clamped to >=1e-30)
	void set_solver_parameters(T tolerance_factor_, int max_iterations_,
							   T modified_incomplete_cholesky_parameter_=0.97,
							   T min_diagonal_ratio_=0.25)
	{
		tolerance_factor=tolerance_factor_;
		if(tolerance_factor<1e-30) tolerance_factor=1e-30;
		max_iterations=max_iterations_;
		modified_incomplete_cholesky_parameter=modified_incomplete_cholesky_parameter_;
		min_diagonal_ratio=min_diagonal_ratio_;
	}

	// Solve matrix*result=rhs. Returns true on convergence; also reports the
	// final residual relative to the initial one and the iteration count.
	bool solve(const SparseMatrix<T> &matrix, const std::vector<T> &rhs, std::vector<T> &result, T &relative_residual_out, int &iterations_out, int precondition=2)
	{
		int n=matrix.n;
		if((int)m.size()!=n){ m.resize(n); s.resize(n); z.resize(n); r.resize(n); }
		result.resize(n); // fixed: callers may pass an unsized vector; zero()+add_scaled below write n entries
		zero(result);
		r=rhs;
		double residual_out=InstantBLAS<int,T>::abs_max(r);
		if(residual_out==0) {
			iterations_out=0;
			return true;
		}
		//double tol=tolerance_factor*residual_out; // relative residual
		double tol=tolerance_factor;
		double residual_0 = residual_out;

		form_preconditioner(matrix, precondition);
		apply_preconditioner( r, z, precondition);
		double rho=InstantBLAS<int,T>::dot(z, r);
		if(rho==0 || rho!=rho) { // rho!=rho catches NaN
			iterations_out=0;
			return false;
		}

		s=z;
		fixed_matrix.construct_from_matrix(matrix);
		int iteration;
		for(iteration=0; iteration<max_iterations; ++iteration){
			multiply(fixed_matrix, s, z);
			double alpha=rho/InstantBLAS<int,T>::dot(s, z);
			InstantBLAS<int,T>::add_scaled(alpha, s, result);
			InstantBLAS<int,T>::add_scaled(-alpha, z, r);
			residual_out=InstantBLAS<int,T>::abs_max(r);
			relative_residual_out = residual_out / residual_0;
			if(residual_out<=tol) {
				iterations_out=iteration+1;
				return true;
			}
			apply_preconditioner(r, z, precondition);
			double rho_new=InstantBLAS<int,T>::dot(z, r);
			double beta=rho_new/rho;
			InstantBLAS<int,T>::add_scaled(beta, s, z); s.swap(z); // s=beta*s+z
			rho=rho_new;
		}
		iterations_out=iteration;
		relative_residual_out = residual_out / residual_0;
		return false;
	}

protected:

	// internal structures
	SparseColumnLowerFactor<T> ic_factor; // modified incomplete cholesky factor
	std::vector<T> m, z, s, r;            // temporary vectors for PCG (m only used for the size check above)
	FixedSparseMatrix<T> fixed_matrix;    // used within loop

	// parameters
	T tolerance_factor;
	int max_iterations;
	T modified_incomplete_cholesky_parameter;
	T min_diagonal_ratio;

	// Build the selected preconditioner from the system matrix.
	void form_preconditioner(const SparseMatrix<T>& matrix, int precondition=2) {
		if(precondition==2) {
			// incomplete cholesky
			factor_modified_incomplete_cholesky0(matrix, ic_factor, modified_incomplete_cholesky_parameter, min_diagonal_ratio);
		} else if(precondition==1) {
			// diagonal
			ic_factor.resize(matrix.n);
			zero(ic_factor.invdiag);
			for(int i=0; i<matrix.n; ++i) {
				for(int k=0; k<(int)matrix.index[i].size(); ++k) {
					if(matrix.index[i][k]==i) {
						ic_factor.invdiag[i] = 1./matrix.value[i][k];
					}
				}
			}
		}
	}

	// result = M^{-1} x for the selected preconditioner.
	void apply_preconditioner(const std::vector<T> &x, std::vector<T> &result, int precondition=2) {
		if (precondition==2) {
			// incomplete cholesky
			solve_lower(ic_factor, x, result);
			solve_lower_transpose_in_place(ic_factor,result);
		} else if(precondition==1) {
			// diagonal
			for(int_index i=0; i<(int_index)result.size(); ++i) {
				result[i] = x[i] * ic_factor.invdiag[i];
			}
		} else {
			// off
			result = x;
		}
	}
};

#undef parallel_for
#undef parallel_end
#undef int_index
#undef parallel_block
#undef do_parallel
#undef do_end
#undef block_end

#endif