MueLu  Version of the Day
MueLu_CoalesceDropFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
48 
49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
50 #include <Kokkos_Core.hpp>
51 #include <KokkosSparse_CrsMatrix.hpp>
52 
53 #include "Xpetra_Matrix.hpp"
54 
56 
57 #include "MueLu_AmalgamationInfo_kokkos.hpp"
58 #include "MueLu_Exceptions.hpp"
59 #include "MueLu_Level.hpp"
60 #include "MueLu_LWGraph_kokkos.hpp"
61 #include "MueLu_MasterList.hpp"
62 #include "MueLu_Monitor.hpp"
63 #include "MueLu_Utilities_kokkos.hpp"
64 
65 namespace MueLu {
66 
67 
68  namespace CoalesceDrop_Kokkos_Details { // anonymous
69 
70  template<class LO, class RowType>
71  class ScanFunctor {
72  public:
73  ScanFunctor(RowType rows_) : rows(rows_) { }
74 
75  KOKKOS_INLINE_FUNCTION
76  void operator()(const LO i, LO& upd, const bool& final) const {
77  upd += rows(i);
78  if (final)
79  rows(i) = upd;
80  }
81 
82  private:
83  RowType rows;
84  };
85 
86  template<class LO, class GhostedViewType>
87  class ClassicalDropFunctor {
88  private:
89  typedef typename GhostedViewType::value_type SC;
90  typedef Kokkos::ArithTraits<SC> ATS;
91  typedef typename ATS::magnitudeType magnitudeType;
92 
93  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
94  magnitudeType eps;
95 
96  public:
97  ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold) :
98  diag(ghostedDiag),
99  eps(threshold)
100  { }
101 
102  // Return true if we drop, false if not
103  KOKKOS_FORCEINLINE_FUNCTION
104  bool operator()(LO row, LO col, SC val) const {
105  // We avoid square root by using squared values
106  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
107  auto aij2 = ATS::magnitude(val) * ATS::magnitude(val); // |a_ij|^2
108 
109  return (aij2 <= eps*eps * aiiajj);
110  }
111  };
112 
113  template<class LO, class CoordsType>
114  class DistanceFunctor {
115  private:
116  typedef typename CoordsType::value_type SC;
117  typedef Kokkos::ArithTraits<SC> ATS;
118  typedef typename ATS::magnitudeType magnitudeType;
119 
120  public:
121  typedef SC value_type;
122 
123  public:
124  DistanceFunctor(CoordsType coords_) : coords(coords_) { }
125 
126  KOKKOS_INLINE_FUNCTION
127  magnitudeType distance2(LO row, LO col) const {
128  SC d = ATS::zero(), s;
129  for (size_t j = 0; j < coords.extent(1); j++) {
130  s = coords(row,j) - coords(col,j);
131  d += s*s;
132  }
133  return ATS::magnitude(d);
134  }
135  private:
136  CoordsType coords;
137  };
138 
139  template<class LO, class GhostedViewType, class DistanceFunctor>
140  class DistanceLaplacianDropFunctor {
141  private:
142  typedef typename GhostedViewType::value_type SC;
143  typedef Kokkos::ArithTraits<SC> ATS;
144  typedef typename ATS::magnitudeType magnitudeType;
145 
146  public:
147  DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold) :
148  diag(ghostedLaplDiag),
149  distFunctor(distFunctor_),
150  eps(threshold)
151  { }
152 
153  // Return true if we drop, false if not
154  KOKKOS_INLINE_FUNCTION
155  bool operator()(LO row, LO col, SC /* val */) const {
156  // We avoid square root by using squared values
157 
158  // We ignore incoming value of val as we operate on an auxiliary
159  // distance Laplacian matrix
160  typedef typename DistanceFunctor::value_type dSC;
161  typedef Kokkos::ArithTraits<dSC> dATS;
162  auto fval = dATS::one() / distFunctor.distance2(row, col);
163 
164  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
165  auto aij2 = ATS::magnitude(fval) * ATS::magnitude(fval); // |a_ij|^2
166 
167  return (aij2 <= eps*eps * aiiajj);
168  }
169 
170  private:
171  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
172  DistanceFunctor distFunctor;
173  magnitudeType eps;
174  };
175 
176  template<class SC, class LO, class MatrixType, class BndViewType, class DropFunctorType>
177  class ScalarFunctor {
178  private:
179  typedef typename MatrixType::StaticCrsGraphType graph_type;
180  typedef typename graph_type::row_map_type rows_type;
181  typedef typename graph_type::entries_type cols_type;
182  typedef typename MatrixType::values_type vals_type;
183  typedef Kokkos::ArithTraits<SC> ATS;
184  typedef typename ATS::magnitudeType magnitudeType;
185 
186  public:
187  ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_,
188  typename rows_type::non_const_type rows_,
189  typename cols_type::non_const_type colsAux_,
190  typename vals_type::non_const_type valsAux_,
191  bool reuseGraph_, bool lumping_, SC /* threshold_ */) :
192  A(A_),
193  bndNodes(bndNodes_),
194  dropFunctor(dropFunctor_),
195  rows(rows_),
196  colsAux(colsAux_),
197  valsAux(valsAux_),
198  reuseGraph(reuseGraph_),
199  lumping(lumping_)
200  {
201  rowsA = A.graph.row_map;
202  zero = ATS::zero();
203  }
204 
205  KOKKOS_INLINE_FUNCTION
206  void operator()(const LO row, LO& nnz) const {
207  auto rowView = A.rowConst(row);
208  auto length = rowView.length;
209  auto offset = rowsA(row);
210 
211  SC diag = zero;
212  LO rownnz = 0;
213  LO diagID = -1;
214  for (decltype(length) colID = 0; colID < length; colID++) {
215  LO col = rowView.colidx(colID);
216  SC val = rowView.value (colID);
217 
218  if (!dropFunctor(row, col, rowView.value(colID)) || row == col) {
219  colsAux(offset+rownnz) = col;
220 
221  LO valID = (reuseGraph ? colID : rownnz);
222  valsAux(offset+valID) = val;
223  if (row == col)
224  diagID = valID;
225 
226  rownnz++;
227 
228  } else {
229  // Rewrite with zeros (needed for reuseGraph)
230  valsAux(offset+colID) = zero;
231  diag += val;
232  }
233  }
234  // How to assert on the device?
235  // assert(diagIndex != -1);
236  rows(row+1) = rownnz;
237  // if (lumping && diagID != -1) {
238  if (lumping) {
239  // Add diag to the diagonal
240 
241  // NOTE_KOKKOS: valsAux was allocated with
242  // ViewAllocateWithoutInitializing. This is not a problem here
243  // because we explicitly set this value above.
244  valsAux(offset+diagID) += diag;
245  }
246 
247  // If the only element remaining after filtering is diagonal, mark node as boundary
248  // FIXME: this should really be replaced by the following
249  // if (indices.size() == 1 && indices[0] == row)
250  // boundaryNodes[row] = true;
251  // We do not do it this way now because there is no framework for distinguishing isolated
252  // and boundary nodes in the aggregation algorithms
253  bndNodes(row) = (rownnz == 1);
254 
255  nnz += rownnz;
256  }
257 
258  private:
259  MatrixType A;
260  BndViewType bndNodes;
261  DropFunctorType dropFunctor;
262 
263  rows_type rowsA;
264 
265  typename rows_type::non_const_type rows;
266  typename cols_type::non_const_type colsAux;
267  typename vals_type::non_const_type valsAux;
268 
269  bool reuseGraph;
270  bool lumping;
271  SC zero;
272  };
273 
274  // collect number nonzeros of blkSize rows in nnz_(row+1)
275  template<class MatrixType, class NnzType, class blkSizeType>
276  class Stage1aVectorFunctor {
277  private:
278  typedef typename MatrixType::ordinal_type LO;
279 
280  public:
281  Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_) :
282  kokkosMatrix(kokkosMatrix_),
283  nnz(nnz_),
284  blkSize(blkSize_) { }
285 
286  KOKKOS_INLINE_FUNCTION
287  void operator()(const LO row, LO& totalnnz) const {
288 
289  // the following code is more or less what MergeRows is doing
290  // count nonzero entries in all dof rows associated with node row
291  LO nodeRowMaxNonZeros = 0;
292  for (LO j = 0; j < blkSize; j++) {
293  auto rowView = kokkosMatrix.row(row * blkSize + j);
294  nodeRowMaxNonZeros += rowView.length;
295  }
296  nnz(row + 1) = nodeRowMaxNonZeros;
297  totalnnz += nodeRowMaxNonZeros;
298  }
299 
300 
301  private:
302  MatrixType kokkosMatrix; //< local matrix part
303  NnzType nnz; //< View containing number of nonzeros for current row
304  blkSizeType blkSize; //< block size (or partial block size in strided maps)
305  };
306 
307 
308  // build the dof-based column map containing the local dof ids belonging to blkSize rows in matrix
309  // the DofIds may not be sorted.
310  template<class MatrixType, class NnzType, class blkSizeType, class ColDofType>
311  class Stage1bVectorFunctor {
312  private:
313  typedef typename MatrixType::ordinal_type LO;
314  //typedef typename MatrixType::value_type SC;
315  //typedef typename MatrixType::device_type NO;
316 
317  private:
318  MatrixType kokkosMatrix; //< local matrix part
319  NnzType nnz; //< View containing dof offsets for dof columns
320  blkSizeType blkSize; //< block size (or partial block size in strided maps)
321  ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
322 
323  public:
324  Stage1bVectorFunctor(MatrixType kokkosMatrix_,
325  NnzType nnz_,
326  blkSizeType blkSize_,
327  ColDofType coldofs_) :
328  kokkosMatrix(kokkosMatrix_),
329  nnz(nnz_),
330  blkSize(blkSize_),
331  coldofs(coldofs_) {
332  }
333 
334  KOKKOS_INLINE_FUNCTION
335  void operator()(const LO rowNode) const {
336 
337  LO pos = nnz(rowNode);
338  for (LO j = 0; j < blkSize; j++) {
339  auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
340  auto numIndices = rowView.length;
341 
342  for (decltype(numIndices) k = 0; k < numIndices; k++) {
343  auto dofID = rowView.colidx(k);
344  coldofs(pos) = dofID;
345  pos ++;
346  }
347  }
348  }
349 
350  };
351 
352  // sort column ids
353  // translate them into (unique) node ids
354  // count the node column ids per node row
355  template<class MatrixType, class ColDofNnzType, class ColDofType, class Dof2NodeTranslationType, class BdryNodeType>
356  class Stage1cVectorFunctor {
357  private:
358  typedef typename MatrixType::ordinal_type LO;
359 
360  private:
361  ColDofNnzType coldofnnz; //< view containing start and stop indices for subviews
362  ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
363  Dof2NodeTranslationType dof2node; //< view containing the local node id associated with the local dof id
364  ColDofNnzType colnodennz; //< view containing number of column nodes for each node row
365  BdryNodeType bdrynode; //< view containing with numNodes booleans. True if node is (full) dirichlet boundardy node.
366 
367  public:
368  Stage1cVectorFunctor(ColDofNnzType coldofnnz_, ColDofType coldofs_, Dof2NodeTranslationType dof2node_, ColDofNnzType colnodennz_, BdryNodeType bdrynode_) :
369  coldofnnz(coldofnnz_),
370  coldofs(coldofs_),
371  dof2node(dof2node_),
372  colnodennz(colnodennz_),
373  bdrynode(bdrynode_) {
374  }
375 
376  KOKKOS_INLINE_FUNCTION
377  void operator()(const LO rowNode, LO& nnz) const {
378  LO begin = coldofnnz(rowNode);
379  LO end = coldofnnz(rowNode+1);
380  LO n = end - begin;
381  for (LO i = 0; i < (n-1); i++) {
382  for (LO j = 0; j < (n-i-1); j++) {
383  if (coldofs(j+begin) > coldofs(j+begin+1)) {
384  LO temp = coldofs(j+begin);
385  coldofs(j+begin) = coldofs(j+begin+1);
386  coldofs(j+begin+1) = temp;
387  }
388  }
389  }
390  size_t cnt = 0;
391  LO lastNodeID = -1;
392  for (LO i = 0; i < n; i++) {
393  LO dofID = coldofs(begin + i);
394  LO nodeID = dof2node(dofID);
395  if(nodeID != lastNodeID) {
396  lastNodeID = nodeID;
397  coldofs(begin+cnt) = nodeID;
398  cnt++;
399  }
400  }
401  if(cnt == 1)
402  bdrynode(rowNode) = true;
403  else
404  bdrynode(rowNode) = false;
405  colnodennz(rowNode+1) = cnt;
406  nnz += cnt;
407  }
408 
409  };
410 
411  // fill column node id view
412  template<class MatrixType, class ColDofNnzType, class ColDofType, class ColNodeNnzType, class ColNodeType>
413  class Stage1dVectorFunctor {
414  private:
415  typedef typename MatrixType::ordinal_type LO;
416  typedef typename MatrixType::value_type SC;
417 
418  private:
419  ColDofType coldofs; //< view containing mixed node and dof indices (only input)
420  ColDofNnzType coldofnnz; //< view containing the start and stop indices for subviews (dofs)
421  ColNodeType colnodes; //< view containing the local node ids associated with columns
422  ColNodeNnzType colnodennz; //< view containing start and stop indices for subviews
423 
424  public:
425  Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_) :
426  coldofs(coldofs_),
427  coldofnnz(coldofnnz_),
428  colnodes(colnodes_),
429  colnodennz(colnodennz_) {
430  }
431 
432  KOKKOS_INLINE_FUNCTION
433  void operator()(const LO rowNode) const {
434  auto dofbegin = coldofnnz(rowNode);
435  auto nodebegin = colnodennz(rowNode);
436  auto nodeend = colnodennz(rowNode+1);
437  auto n = nodeend - nodebegin;
438 
439  for (decltype(nodebegin) i = 0; i < n; i++) {
440  colnodes(nodebegin + i) = coldofs(dofbegin + i);
441  }
442  }
443  };
444 
445 
446  } // namespace
447 
448  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
449  RCP<const ParameterList> CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList() const {
450  RCP<ParameterList> validParamList = rcp(new ParameterList());
451 
452 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
453  SET_VALID_ENTRY("aggregation: drop tol");
454  SET_VALID_ENTRY("aggregation: Dirichlet threshold");
455  SET_VALID_ENTRY("aggregation: drop scheme");
456  SET_VALID_ENTRY("filtered matrix: use lumping");
457  SET_VALID_ENTRY("filtered matrix: reuse graph");
458  SET_VALID_ENTRY("filtered matrix: reuse eigenvalue");
459  {
460  typedef Teuchos::StringToIntegralParameterEntryValidator<int> validatorType;
461  validParamList->getEntry("aggregation: drop scheme").setValidator(
462  rcp(new validatorType(Teuchos::tuple<std::string>("classical", "distance laplacian"), "aggregation: drop scheme")));
463  }
464 #undef SET_VALID_ENTRY
465  validParamList->set< bool > ("lightweight wrap", true, "Experimental option for lightweight graph access");
466 
467  validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
468  validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo");
469  validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory for Coordinates");
470 
471  return validParamList;
472  }
473 
474  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
475  void CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level &currentLevel) const {
476  Input(currentLevel, "A");
477  Input(currentLevel, "UnAmalgamationInfo");
478 
479  const ParameterList& pL = GetParameterList();
480  if (pL.get<bool>("lightweight wrap") == true) {
481  if (pL.get<std::string>("aggregation: drop scheme") == "distance laplacian")
482  Input(currentLevel, "Coordinates");
483  }
484  }
485 
486  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
487  void CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
488  Build(Level& currentLevel) const {
489  FactoryMonitor m(*this, "Build", currentLevel);
490 
491  typedef Teuchos::ScalarTraits<SC> STS;
492  const SC zero = STS::zero();
493 
494  auto A = Get< RCP<Matrix> >(currentLevel, "A");
495  LO blkSize = A->GetFixedBlockSize();
496 
497  auto amalInfo = Get< RCP<AmalgamationInfo_kokkos> >(currentLevel, "UnAmalgamationInfo");
498 
499  const ParameterList& pL = GetParameterList();
500 
501  bool doLightWeightWrap = pL.get<bool>("lightweight wrap");
502  GetOStream(Warnings0) << "lightweight wrap is deprecated" << std::endl;
503  TEUCHOS_TEST_FOR_EXCEPTION(!doLightWeightWrap, Exceptions::RuntimeError,
504  "MueLu KokkosRefactor only supports \"lightweight wrap\"=\"true\"");
505 
506  std::string algo = pL.get<std::string>("aggregation: drop scheme");
507 
508  double threshold = pL.get<double>("aggregation: drop tol");
509  GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold
510  << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
511 
512  const typename STS::magnitudeType dirichletThreshold =
513  STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
514 
515  GO numDropped = 0, numTotal = 0;
516 
517  RCP<LWGraph_kokkos> graph;
518  LO dofsPerNode = -1;
519 
520  typedef typename LWGraph_kokkos::boundary_nodes_type boundary_nodes_type;
521  boundary_nodes_type boundaryNodes;
522 
523  RCP<Matrix> filteredA;
524  if (blkSize == 1 && threshold == zero) {
525  // Scalar problem without dropping
526 
527  // Detect and record rows that correspond to Dirichlet boundary conditions
528  boundaryNodes = Utilities_kokkos::DetectDirichletRows(*A, dirichletThreshold);
529 
530  // Trivial LWGraph construction
531  graph = rcp(new LWGraph_kokkos(A->getLocalMatrixDevice().graph, A->getRowMap(), A->getColMap(), "graph of A"));
532  graph->SetBoundaryNodeMap(boundaryNodes);
533 
534  numTotal = A->getNodeNumEntries();
535  dofsPerNode = 1;
536 
537  filteredA = A;
538 
539  } else if (blkSize == 1 && threshold != zero) {
540  // Scalar problem with dropping
541 
542  typedef typename Matrix::local_matrix_type local_matrix_type;
543  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
544  typedef typename kokkos_graph_type::row_map_type::non_const_type rows_type;
545  typedef typename kokkos_graph_type::entries_type::non_const_type cols_type;
546  typedef typename local_matrix_type::values_type::non_const_type vals_type;
547 
548  LO numRows = A->getNodeNumRows();
549  local_matrix_type kokkosMatrix = A->getLocalMatrixDevice();
550  auto nnzA = kokkosMatrix.nnz();
551  auto rowsA = kokkosMatrix.graph.row_map;
552 
553 
554  typedef Kokkos::ArithTraits<SC> ATS;
555 
556  bool reuseGraph = pL.get<bool>("filtered matrix: reuse graph");
557  bool lumping = pL.get<bool>("filtered matrix: use lumping");
558  if (lumping)
559  GetOStream(Runtime0) << "Lumping dropped entries" << std::endl;
560 
561  // FIXME_KOKKOS: replace by ViewAllocateWithoutInitializing + setting a single value
562  rows_type rows ("FA_rows", numRows+1);
563  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("FA_aux_cols"), nnzA);
564  vals_type valsAux;
565  if (reuseGraph) {
566  SubFactoryMonitor m2(*this, "CopyMatrix", currentLevel);
567 
568  // Share graph with the original matrix
569  filteredA = MatrixFactory::Build(A->getCrsGraph());
570 
571  // Do a no-op fill-complete
572  RCP<ParameterList> fillCompleteParams(new ParameterList);
573  fillCompleteParams->set("No Nonlocal Changes", true);
574  filteredA->fillComplete(fillCompleteParams);
575 
576  // No need to reuseFill, just modify in place
577  valsAux = filteredA->getLocalMatrixDevice().values;
578 
579  } else {
580  // Need an extra array to compress
581  valsAux = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_aux_vals"), nnzA);
582  }
583 
584  typename boundary_nodes_type::non_const_type bndNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numRows);
585 
586  LO nnzFA = 0;
587  {
588  if (algo == "classical") {
589  // Construct overlapped matrix diagonal
590  RCP<Vector> ghostedDiag;
591  {
592  kokkosMatrix = local_matrix_type();
593  SubFactoryMonitor m2(*this, "Ghosted diag construction", currentLevel);
594  ghostedDiag = Utilities_kokkos::GetMatrixOverlappedDiagonal(*A);
595  kokkosMatrix=A->getLocalMatrixDevice();
596  }
597 
598  // Filter out entries
599  {
600  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
601 
602  auto ghostedDiagView = ghostedDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
603 
604  CoalesceDrop_Kokkos_Details::ClassicalDropFunctor<LO, decltype(ghostedDiagView)> dropFunctor(ghostedDiagView, threshold);
605  CoalesceDrop_Kokkos_Details::ScalarFunctor<typename ATS::val_type, LO, local_matrix_type, decltype(bndNodes), decltype(dropFunctor)>
606  scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold);
607 
608  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
609  scalarFunctor, nnzFA);
610  }
611 
612  } else if (algo == "distance laplacian") {
613  typedef Xpetra::MultiVector<typename Teuchos::ScalarTraits<Scalar>::magnitudeType,LO,GO,NO> doubleMultiVector;
614  auto coords = Get<RCP<doubleMultiVector> >(currentLevel, "Coordinates");
615 
616  auto uniqueMap = A->getRowMap();
617  auto nonUniqueMap = A->getColMap();
618 
619  // Construct ghosted coordinates
620  RCP<const Import> importer;
621  {
622  SubFactoryMonitor m2(*this, "Coords Import construction", currentLevel);
623  importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
624  }
625  RCP<doubleMultiVector> ghostedCoords;
626  {
627  SubFactoryMonitor m2(*this, "Ghosted coords construction", currentLevel);
628  ghostedCoords = Xpetra::MultiVectorFactory<typename Teuchos::ScalarTraits<Scalar>::magnitudeType,LO,GO,NO>::Build(nonUniqueMap, coords->getNumVectors());
629  ghostedCoords->doImport(*coords, *importer, Xpetra::INSERT);
630  }
631 
632  auto ghostedCoordsView = ghostedCoords->getDeviceLocalView(Xpetra::Access::ReadWrite);
633  CoalesceDrop_Kokkos_Details::DistanceFunctor<LO, decltype(ghostedCoordsView)> distFunctor(ghostedCoordsView);
634 
635  // Construct Laplacian diagonal
636  RCP<Vector> localLaplDiag;
637  {
638  SubFactoryMonitor m2(*this, "Local Laplacian diag construction", currentLevel);
639 
640  localLaplDiag = VectorFactory::Build(uniqueMap);
641 
642  auto localLaplDiagView = localLaplDiag->getDeviceLocalView(Xpetra::Access::OverwriteAll);
643  auto kokkosGraph = kokkosMatrix.graph;
644 
645  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:laplacian_diag", range_type(0,numRows),
646  KOKKOS_LAMBDA(const LO row) {
647  auto rowView = kokkosGraph.rowConst(row);
648  auto length = rowView.length;
649 
650  SC d = ATS::zero();
651  for (decltype(length) colID = 0; colID < length; colID++) {
652  auto col = rowView(colID);
653  if (row != col)
654  d += ATS::one()/distFunctor.distance2(row, col);
655  }
656  localLaplDiagView(row,0) = d;
657  });
658  }
659 
660  // Construct ghosted Laplacian diagonal
661  RCP<Vector> ghostedLaplDiag;
662  {
663  SubFactoryMonitor m2(*this, "Ghosted Laplacian diag construction", currentLevel);
664  ghostedLaplDiag = VectorFactory::Build(nonUniqueMap);
665  ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT);
666  }
667 
668  // Filter out entries
669  {
670  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
671 
672  auto ghostedLaplDiagView = ghostedLaplDiag->getDeviceLocalView(Xpetra::Access::ReadWrite);
673 
674  CoalesceDrop_Kokkos_Details::DistanceLaplacianDropFunctor<LO, decltype(ghostedLaplDiagView), decltype(distFunctor)>
675  dropFunctor(ghostedLaplDiagView, distFunctor, threshold);
676  CoalesceDrop_Kokkos_Details::ScalarFunctor<SC, LO, local_matrix_type, decltype(bndNodes), decltype(dropFunctor)>
677  scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold);
678 
679  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
680  scalarFunctor, nnzFA);
681  }
682  }
683 
684  }
685  numDropped = nnzA - nnzFA;
686 
687  boundaryNodes = bndNodes;
688 
689  {
690  SubFactoryMonitor m2(*this, "CompressRows", currentLevel);
691 
692  // parallel_scan (exclusive)
693  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:compress_rows", range_type(0,numRows+1),
694  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
695  update += rows(i);
696  if (final_pass)
697  rows(i) = update;
698  });
699  }
700 
701  // Compress cols (and optionally vals)
702  // We use a trick here: we moved all remaining elements to the beginning
703  // of the original row in the main loop, so we don't need to check for
704  // INVALID here, and just stop when achieving the new number of elements
705  // per row.
706  cols_type cols(Kokkos::ViewAllocateWithoutInitializing("FA_cols"), nnzFA);
707  vals_type vals;
708  if (reuseGraph) {
709  GetOStream(Runtime1) << "reuse matrix graph for filtering (compress matrix columns only)" << std::endl;
710  // Only compress cols
711  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
712 
713  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
714  KOKKOS_LAMBDA(const LO i) {
715  // Is there Kokkos memcpy?
716  LO rowStart = rows(i);
717  LO rowAStart = rowsA(i);
718  size_t rownnz = rows(i+1) - rows(i);
719  for (size_t j = 0; j < rownnz; j++)
720  cols(rowStart+j) = colsAux(rowAStart+j);
721  });
722  } else {
723  // Compress cols and vals
724  GetOStream(Runtime1) << "new matrix graph for filtering (compress matrix columns and values)" << std::endl;
725  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
726 
727  vals = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_vals"), nnzFA);
728 
729  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
730  KOKKOS_LAMBDA(const LO i) {
731  LO rowStart = rows(i);
732  LO rowAStart = rowsA(i);
733  size_t rownnz = rows(i+1) - rows(i);
734  for (size_t j = 0; j < rownnz; j++) {
735  cols(rowStart+j) = colsAux(rowAStart+j);
736  vals(rowStart+j) = valsAux(rowAStart+j);
737  }
738  });
739  }
740 
741  kokkos_graph_type kokkosGraph(cols, rows);
742 
743  {
744  SubFactoryMonitor m2(*this, "LWGraph construction", currentLevel);
745 
746  graph = rcp(new LWGraph_kokkos(kokkosGraph, A->getRowMap(), A->getColMap(), "filtered graph of A"));
747  graph->SetBoundaryNodeMap(boundaryNodes);
748  }
749 
750  numTotal = A->getNodeNumEntries();
751 
752  dofsPerNode = 1;
753 
754  if (!reuseGraph) {
755  SubFactoryMonitor m2(*this, "LocalMatrix+FillComplete", currentLevel);
756 
757  local_matrix_type localFA = local_matrix_type("A", numRows, A->getLocalMatrixDevice().numCols(), nnzFA, vals, rows, cols);
758  auto filteredACrs = CrsMatrixFactory::Build(localFA, A->getRowMap(), A->getColMap(), A->getDomainMap(), A->getRangeMap(),
759  A->getCrsGraph()->getImporter(), A->getCrsGraph()->getExporter());
760  filteredA = rcp(new CrsMatrixWrap(filteredACrs));
761  }
762 
763  filteredA->SetFixedBlockSize(A->GetFixedBlockSize());
764 
765  if (pL.get<bool>("filtered matrix: reuse eigenvalue")) {
766  // Reuse max eigenvalue from A
767  // It is unclear what eigenvalue is the best for the smoothing, but we already may have
768  // the D^{-1}A estimate in A, may as well use it.
769  // NOTE: ML does that too
770  filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());
771  } else {
772  filteredA->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
773  }
774 
775  } else if (blkSize > 1 && threshold == zero) {
776  // Case 3: block problem without filtering
777  //
778  // FIXME_KOKKOS: this code is completely unoptimized. It really should do
779  // a very simple thing: merge rows and produce nodal graph. But the code
780  // seems very complicated. Can we do better?
781 
782  TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getNodeNumElements() % blkSize != 0, MueLu::Exceptions::RuntimeError, "MueLu::CoalesceDropFactory: Number of local elements is " << A->getRowMap()->getNodeNumElements() << " but should be a multiply of " << blkSize);
783 
784  const RCP<const Map> rowMap = A->getRowMap();
785  const RCP<const Map> colMap = A->getColMap();
786 
787  // build a node row map (uniqueMap = non-overlapping) and a node column map
788  // (nonUniqueMap = overlapping). The arrays rowTranslation and colTranslation
789  // stored in the AmalgamationInfo class container contain the local node id
790  // given a local dof id. The data is calculated in the AmalgamationFactory and
791  // stored in the variable "UnAmalgamationInfo" (which is of type AmalagamationInfo)
792  const RCP<const Map> uniqueMap = amalInfo->getNodeRowMap();
793  const RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
794  Array<LO> rowTranslationArray = *(amalInfo->getRowTranslation()); // TAW should be transform that into a View?
795  Array<LO> colTranslationArray = *(amalInfo->getColTranslation());
796 
798  rowTranslationView(rowTranslationArray.getRawPtr(),rowTranslationArray.size() );
800  colTranslationView(colTranslationArray.getRawPtr(),colTranslationArray.size() );
801 
802  // get number of local nodes
803  LO numNodes = Teuchos::as<LocalOrdinal>(uniqueMap->getNodeNumElements());
804  typedef typename Kokkos::View<LocalOrdinal*, DeviceType> id_translation_type;
805  id_translation_type rowTranslation("dofId2nodeId",rowTranslationArray.size());
806  id_translation_type colTranslation("ov_dofId2nodeId",colTranslationArray.size());
807  Kokkos::deep_copy(rowTranslation, rowTranslationView);
808  Kokkos::deep_copy(colTranslation, colTranslationView);
809 
810  // extract striding information
811  blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map)
812  LocalOrdinal blkId = -1; //< the block id within a strided map or -1 if it is a full block map
813  LocalOrdinal blkPartSize = A->GetFixedBlockSize(); //< stores block size of part blkId (or the full block size)
814  if(A->IsView("stridedMaps") == true) {
815  const RCP<const Map> myMap = A->getRowMap("stridedMaps");
816  const RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
817  TEUCHOS_TEST_FOR_EXCEPTION(strMap.is_null() == true, Exceptions::RuntimeError, "Map is not of type stridedMap");
818  blkSize = Teuchos::as<const LocalOrdinal>(strMap->getFixedBlockSize());
819  blkId = strMap->getStridedBlockId();
820  if (blkId > -1)
821  blkPartSize = Teuchos::as<LocalOrdinal>(strMap->getStridingData()[blkId]);
822  }
823  auto kokkosMatrix = A->getLocalMatrixDevice(); // access underlying kokkos data
824 
825  //
826  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
827  typedef typename kokkos_graph_type::row_map_type row_map_type;
828  //typedef typename row_map_type::HostMirror row_map_type_h;
829  typedef typename kokkos_graph_type::entries_type entries_type;
830 
831  // Stage 1c: get number of dof-nonzeros per blkSize node rows
832  typename row_map_type::non_const_type dofNnz("nnz_map", numNodes + 1);
833  LO numDofCols = 0;
834  CoalesceDrop_Kokkos_Details::Stage1aVectorFunctor<decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize)> stage1aFunctor(kokkosMatrix, dofNnz, blkPartSize);
835  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1a", range_type(0,numNodes), stage1aFunctor, numDofCols);
836  // parallel_scan (exclusive)
837  CoalesceDrop_Kokkos_Details::ScanFunctor<LO,decltype(dofNnz)> scanFunctor(dofNnz);
838  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanFunctor);
839 
840  typename entries_type::non_const_type dofcols("dofcols", numDofCols/*dofNnz(numNodes)*/); // why does dofNnz(numNodes) work? should be a parallel reduce, i guess
841  CoalesceDrop_Kokkos_Details::Stage1bVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize), decltype(dofcols)> stage1bFunctor(kokkosMatrix, dofNnz, blkPartSize, dofcols);
842  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1b", range_type(0,numNodes), stage1bFunctor);
843 
844  // we have dofcols and dofids from Stage1dVectorFunctor
845  LO numNodeCols = 0;
846  typename row_map_type::non_const_type rows("nnz_nodemap", numNodes + 1);
847  typename boundary_nodes_type::non_const_type bndNodes("boundaryNodes", numNodes);
848  CoalesceDrop_Kokkos_Details::Stage1cVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(dofcols), decltype(colTranslation), decltype(bndNodes)> stage1cFunctor(dofNnz, dofcols, colTranslation,rows,bndNodes);
849  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1cFunctor,numNodeCols);
850 
851  // parallel_scan (exclusive)
852  CoalesceDrop_Kokkos_Details::ScanFunctor<LO,decltype(rows)> scanNodeFunctor(rows);
853  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanNodeFunctor);
854 
855  // create column node view
856  typename entries_type::non_const_type cols("nodecols", numNodeCols);
857 
858 
859  CoalesceDrop_Kokkos_Details::Stage1dVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(dofcols), decltype(rows), decltype(cols)> stage1dFunctor(dofcols, dofNnz, cols, rows);
860  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1dFunctor);
861  kokkos_graph_type kokkosGraph(cols, rows);
862 
863  // create LW graph
864  graph = rcp(new LWGraph_kokkos(kokkosGraph, uniqueMap, nonUniqueMap, "amalgamated graph of A"));
865 
866  boundaryNodes = bndNodes;
867  graph->SetBoundaryNodeMap(boundaryNodes);
868  numTotal = A->getNodeNumEntries();
869 
870  dofsPerNode = blkSize;
871 
872  filteredA = A;
873 
874  } else {
875  TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "MueLu: CoalesceDropFactory_kokkos: Block filtering is not implemented");
876  }
877 
878  if (GetVerbLevel() & Statistics1) {
879  GO numLocalBoundaryNodes = 0;
880  GO numGlobalBoundaryNodes = 0;
881 
882  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:bnd", range_type(0, boundaryNodes.extent(0)),
883  KOKKOS_LAMBDA(const LO i, GO& n) {
884  if (boundaryNodes(i))
885  n++;
886  }, numLocalBoundaryNodes);
887 
888  auto comm = A->getRowMap()->getComm();
889  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
890  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
891  }
892 
893  if ((GetVerbLevel() & Statistics1) && threshold != zero) {
894  auto comm = A->getRowMap()->getComm();
895 
896  GO numGlobalTotal, numGlobalDropped;
897  MueLu_sumAll(comm, numTotal, numGlobalTotal);
898  MueLu_sumAll(comm, numDropped, numGlobalDropped);
899 
900  if (numGlobalTotal != 0) {
901  GetOStream(Statistics1) << "Number of dropped entries: "
902  << numGlobalDropped << "/" << numGlobalTotal
903  << " (" << 100*Teuchos::as<double>(numGlobalDropped)/Teuchos::as<double>(numGlobalTotal) << "%)" << std::endl;
904  }
905  }
906 
907  Set(currentLevel, "DofsPerNode", dofsPerNode);
908  Set(currentLevel, "Graph", graph);
909  Set(currentLevel, "A", filteredA);
910  }
911 }
912 #endif // HAVE_MUELU_KOKKOS_REFACTOR
913 #endif // MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
#define MueLu_sumAll(rcpComm, in, out)
MueLu::DefaultLocalOrdinal LocalOrdinal
Print more statistics.
One-liner description of what is happening.
Namespace for MueLu classes and methods.
Kokkos::View< bool *, typename NO::device_type > DetectDirichletRows(const Xpetra::Matrix< SC, LO, GO, NO > &A, const typename Teuchos::ScalarTraits< SC >::magnitudeType &tol, const bool count_twos_as_dirichlet)
Exception throws to report errors in the internal logical of the program.
Description of what is happening (more verbose)
#define SET_VALID_ENTRY(name)