46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR 55 #include "MueLu_Aggregates_kokkos.hpp" 56 #include "MueLu_AmalgamationFactory_kokkos.hpp" 57 #include "MueLu_AmalgamationInfo_kokkos.hpp" 58 #include "MueLu_CoarseMapFactory_kokkos.hpp" 60 #include "MueLu_NullspaceFactory_kokkos.hpp" 61 #include "MueLu_PerfUtils.hpp" 63 #include "MueLu_Utilities_kokkos.hpp" 69 template<
class LocalOrdinal,
class View>
70 class ReduceMaxFunctor{
72 ReduceMaxFunctor(View view) : view_(view) { }
74 KOKKOS_INLINE_FUNCTION
80 KOKKOS_INLINE_FUNCTION
87 KOKKOS_INLINE_FUNCTION
96 template<
class LOType,
class GOType,
class SCType,
class DeviceType,
class NspType,
class aggRowsType,
class maxAggDofSizeType,
class agg2RowMapLOType,
class statusType,
class rowsType,
class rowsAuxType,
class colsAuxType,
class valsAuxType>
97 class LocalQRDecompFunctor {
103 typedef typename DeviceType::execution_space execution_space;
104 typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
105 typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
106 typedef typename impl_ATS::magnitudeType Magnitude;
116 maxAggDofSizeType maxAggDofSize;
117 agg2RowMapLOType agg2RowMapLO;
118 statusType statusAtomic;
125 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_,
bool doQRStep_) :
129 maxAggDofSize(maxAggDofSize_),
130 agg2RowMapLO(agg2RowMapLO_),
131 statusAtomic(statusAtomic_),
139 KOKKOS_INLINE_FUNCTION
141 auto agg = thread.league_rank();
144 auto aggSize = aggRows(agg+1) - aggRows(agg);
146 const impl_SC one = impl_ATS::one();
147 const impl_SC two = one + one;
148 const impl_SC zero = impl_ATS::zero();
149 const auto zeroM = impl_ATS::magnitude(zero);
152 int n = fineNS.extent(1);
155 Xpetra::global_size_t offset = agg * n;
160 shared_matrix r(thread.team_shmem(), m, n);
161 for (
int j = 0; j < n; j++)
162 for (
int k = 0; k < m; k++)
163 r(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
166 for (
int i = 0; i < m; i++) {
167 for (
int j = 0; j < n; j++)
168 printf(
" %5.3lf ", r(i,j));
174 shared_matrix q(thread.team_shmem(), m, m);
176 bool isSingular =
false;
180 for (
int i = 0; i < m; i++) {
181 for (
int j = 0; j < m; j++)
186 for (
int k = 0; k < n; k++) {
188 Magnitude s = zeroM, norm, norm_x;
189 for (
int i = k+1; i < m; i++)
190 s += pow(impl_ATS::magnitude(r(i,k)), 2);
191 norm = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
200 norm_x = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
201 if (norm_x == zeroM) {
209 for (
int i = k; i < m; i++)
213 for (
int j = k+1; j < n; j++) {
216 for (
int i = k; i < m; i++)
217 si += r(i,k) * r(i,j);
218 for (
int i = k; i < m; i++)
219 r(i,j) -= two*si * r(i,k);
223 for (
int j = k; j < m; j++) {
226 for (
int i = k; i < m; i++)
227 si += r(i,k) * qt(i,j);
228 for (
int i = k; i < m; i++)
229 qt(i,j) -= two*si * r(i,k);
234 for (
int i = k+1; i < m; i++)
240 for (
int i = 0; i < m; i++)
241 for (
int j = 0; j < i; j++) {
242 impl_SC tmp = qt(i,j);
249 for (
int j = 0; j < n; j++)
250 for (
int k = 0; k <= j; k++)
251 coarseNS(offset+k,j) = r(k,j);
254 statusAtomic(1) =
true;
289 for (
int j = 0; j < n; j++)
290 for (
int k = 0; k < n; k++)
292 coarseNS(offset+k,j) = r(k,j);
294 coarseNS(offset+k,j) = (k == j ? one : zero);
297 for (
int i = 0; i < m; i++)
298 for (
int j = 0; j < n; j++)
299 q(i,j) = (j == i ? one : zero);
303 for (
int j = 0; j < m; j++) {
304 LO localRow = agg2RowMapLO(aggRows(agg)+j);
305 size_t rowStart = rowsAux(localRow);
307 for (
int k = 0; k < n; k++) {
309 if (q(j,k) != zero) {
310 colsAux(rowStart+lnnz) = offset + k;
311 valsAux(rowStart+lnnz) = q(j,k);
315 rows(localRow+1) = lnnz;
321 for (
int i = 0; i < m; i++) {
322 for (
int j = 0; j < n; j++)
323 printf(
" %5.3lf ", coarseNS(i,j));
328 for (
int i = 0; i < aggSize; i++) {
329 for (
int j = 0; j < aggSize; j++)
330 printf(
" %5.3lf ", q(i,j));
344 for (
int j = 0; j < m; j++) {
345 LO localRow = agg2RowMapLO(aggRows(agg)+j);
346 size_t rowStart = rowsAux(localRow);
348 for (
int k = 0; k < n; k++) {
349 const impl_SC qr_jk = fineNS(localRow,k);
352 colsAux(rowStart+lnnz) = offset + k;
353 valsAux(rowStart+lnnz) = qr_jk;
357 rows(localRow+1) = lnnz;
361 for (
int j = 0; j < n; j++)
362 coarseNS(offset+j,j) = one;
369 size_t team_shmem_size(
int )
const {
371 int m = maxAggDofSize;
372 int n = fineNS.extent(1);
373 return shared_matrix::shmem_size(m, n) +
374 shared_matrix::shmem_size(m, m);
382 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
383 RCP<const ParameterList> TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList()
const {
384 RCP<ParameterList> validParamList = rcp(
new ParameterList());
386 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name)) 389 #undef SET_VALID_ENTRY 391 validParamList->set< RCP<const FactoryBase> >(
"A", Teuchos::null,
"Generating factory of the matrix A");
392 validParamList->set< RCP<const FactoryBase> >(
"Aggregates", Teuchos::null,
"Generating factory of the aggregates");
393 validParamList->set< RCP<const FactoryBase> >(
"Nullspace", Teuchos::null,
"Generating factory of the nullspace");
394 validParamList->set< RCP<const FactoryBase> >(
"Scaled Nullspace", Teuchos::null,
"Generating factory of the scaled nullspace");
395 validParamList->set< RCP<const FactoryBase> >(
"UnAmalgamationInfo", Teuchos::null,
"Generating factory of UnAmalgamationInfo");
396 validParamList->set< RCP<const FactoryBase> >(
"CoarseMap", Teuchos::null,
"Generating factory of the coarse map");
397 validParamList->set< RCP<const FactoryBase> >(
"Coordinates", Teuchos::null,
"Generating factory of the coordinates");
400 ParameterList norecurse;
401 norecurse.disableRecursiveValidation();
402 validParamList->set<ParameterList> (
"matrixmatrix: kernel params", norecurse,
"MatrixMatrix kernel parameters");
404 return validParamList;
407 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
408 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level& fineLevel, Level& )
const {
410 const ParameterList& pL = GetParameterList();
412 std::string nspName =
"Nullspace";
413 if(pL.isParameter(
"Nullspace name")) nspName = pL.get<std::string>(
"Nullspace name");
415 Input(fineLevel,
"A");
416 Input(fineLevel,
"Aggregates");
417 Input(fineLevel, nspName);
418 Input(fineLevel,
"UnAmalgamationInfo");
419 Input(fineLevel,
"CoarseMap");
420 if( fineLevel.GetLevelID() == 0 &&
422 pL.get<
bool>(
"tentative: build coarse coordinates") ) {
423 bTransferCoordinates_ =
true;
424 Input(fineLevel,
"Coordinates");
425 }
else if (bTransferCoordinates_) {
426 Input(fineLevel,
"Coordinates");
430 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
431 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::Build(Level& fineLevel, Level& coarseLevel)
const {
432 return BuildP(fineLevel, coarseLevel);
435 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
436 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel)
const {
437 FactoryMonitor m(*
this,
"Build", coarseLevel);
439 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
440 typedef Xpetra::MultiVectorFactory<coordinate_type,LO,GO,NO> RealValuedMultiVectorFactory;
441 const ParameterList& pL = GetParameterList();
442 std::string nspName =
"Nullspace";
443 if(pL.isParameter(
"Nullspace name")) nspName = pL.get<std::string>(
"Nullspace name");
445 auto A = Get< RCP<Matrix> > (fineLevel,
"A");
446 auto aggregates = Get< RCP<Aggregates_kokkos> > (fineLevel,
"Aggregates");
447 auto amalgInfo = Get< RCP<AmalgamationInfo_kokkos> > (fineLevel,
"UnAmalgamationInfo");
448 auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, nspName);
449 auto coarseMap = Get< RCP<const Map> > (fineLevel,
"CoarseMap");
450 RCP<RealValuedMultiVector> fineCoords;
451 if(bTransferCoordinates_) {
452 fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel,
"Coordinates");
455 RCP<Matrix> Ptentative;
456 RCP<MultiVector> coarseNullspace;
457 RCP<RealValuedMultiVector> coarseCoords;
459 if(bTransferCoordinates_) {
460 ArrayView<const GO> elementAList = coarseMap->getNodeElementList();
461 GO indexBase = coarseMap->getIndexBase();
464 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
465 blkSize = rcp_dynamic_cast<
const StridedMap>(coarseMap)->getFixedBlockSize();
467 Array<GO> elementList;
468 ArrayView<const GO> elementListView;
472 elementListView = elementAList;
475 auto numElements = elementAList.size() / blkSize;
477 elementList.resize(numElements);
480 for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
481 elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
483 elementListView = elementList;
486 auto uniqueMap = fineCoords->getMap();
487 auto coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
488 elementListView, indexBase, coarseMap->getComm());
489 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
492 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
493 if (aggregates->AggregatesCrossProcessors()) {
494 auto nonUniqueMap = aggregates->GetMap();
495 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
497 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
498 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
503 auto aggGraph = aggregates->GetGraph();
504 auto numAggs = aggGraph.numRows();
506 auto fineCoordsView = fineCoords ->getDeviceLocalView(Xpetra::Access::ReadOnly);
507 auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
511 SubFactoryMonitor m2(*
this,
"AverageCoords", coarseLevel);
513 const auto dim = fineCoords->getNumVectors();
515 typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
516 for (
size_t j = 0; j < dim; j++) {
518 KOKKOS_LAMBDA(
const LO i) {
522 auto aggregate = aggGraph.rowConst(i);
524 coordinate_type sum = 0.0;
525 for (
size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
526 sum += fineCoordsRandomView(aggregate(colID),j);
528 coarseCoordsView(i,j) = sum / aggregate.length;
534 if (!aggregates->AggregatesCrossProcessors())
535 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
537 BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
547 if (A->IsView(
"stridedMaps") ==
true)
548 Ptentative->CreateView(
"stridedMaps", A->getRowMap(
"stridedMaps"), coarseMap);
550 Ptentative->CreateView(
"stridedMaps", Ptentative->getRangeMap(), coarseMap);
552 if(bTransferCoordinates_) {
553 Set(coarseLevel,
"Coordinates", coarseCoords);
555 Set(coarseLevel,
"Nullspace", coarseNullspace);
556 Set(coarseLevel,
"P", Ptentative);
559 RCP<ParameterList> params = rcp(
new ParameterList());
560 params->set(
"printLoadBalancingInfo",
true);
565 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
566 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
567 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
568 RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
569 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
570 RCP<MultiVector>& coarseNullspace,
const int levelID)
const {
571 auto rowMap = A->getRowMap();
572 auto colMap = A->getColMap();
574 const size_t numRows = rowMap->getNodeNumElements();
575 const size_t NSDim = fineNullspace->getNumVectors();
577 typedef Kokkos::ArithTraits<SC> ATS;
578 using impl_SC =
typename ATS::val_type;
579 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
580 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
582 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
584 typename Aggregates_kokkos::local_graph_type aggGraph;
586 SubFactoryMonitor m2(*
this,
"Get Aggregates graph", coarseLevel);
587 aggGraph = aggregates->GetGraph();
589 auto aggRows = aggGraph.row_map;
590 auto aggCols = aggGraph.entries;
597 SubFactoryMonitor m2(*
this,
"Check good map", coarseLevel);
598 goodMap = isGoodMap(*rowMap, *colMap);
601 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
602 "MueLu: TentativePFactory_kokkos: for now works only with good maps " 603 "(i.e. \"matching\" row and column maps)");
612 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
614 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
615 GO globalOffset = amalgInfo->GlobalOffset();
618 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
619 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
620 const size_t numAggregates = aggregates->GetNumAggregates();
622 int myPID = aggregates->GetMap()->getComm()->getRank();
627 typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
628 AggSizeType aggDofSizes;
630 if (stridedBlockSize == 1) {
631 SubFactoryMonitor m2(*
this,
"Calc AggSizes", coarseLevel);
634 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates+1);
636 auto sizesConst = aggregates->ComputeAggregateSizes();
637 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), sizesConst);
640 SubFactoryMonitor m2(*
this,
"Calc AggSizes", coarseLevel);
643 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates + 1);
645 auto nodeMap = aggregates->GetMap()->getLocalMap();
646 auto dofMap = colMap->getLocalMap();
648 Kokkos::parallel_for(
"MueLu:TentativePF:Build:compute_agg_sizes", range_type(0,numAggregates),
649 KOKKOS_LAMBDA(
const LO agg) {
650 auto aggRowView = aggGraph.rowConst(agg);
653 for (LO colID = 0; colID < aggRowView.length; colID++) {
654 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
656 for (LO k = 0; k < stridedBlockSize; k++) {
657 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
659 if (dofMap.getLocalElement(dofGID) != INVALID)
663 aggDofSizes(agg+1) = size;
670 ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
671 Kokkos::parallel_reduce(
"MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
675 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
676 KOKKOS_LAMBDA(
const LO i, LO& update,
const bool& final_pass) {
677 update += aggDofSizes(i);
679 aggDofSizes(i) = update;
686 SubFactoryMonitor m2(*
this,
"Create Agg2RowMap", coarseLevel);
688 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing(
"aggOffsets"), numAggregates);
689 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
691 Kokkos::parallel_for(
"MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
692 KOKKOS_LAMBDA(
const LO lnode) {
693 if (procWinner(lnode, 0) == myPID) {
695 auto aggID = vertex2AggId(lnode,0);
697 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
701 for (LO k = 0; k < stridedBlockSize; k++)
702 agg2RowMapLO(offset + k) = lnode*stridedBlockSize + k;
709 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
712 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
713 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
717 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
718 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
719 typedef typename local_matrix_type::index_type::non_const_type cols_type;
720 typedef typename local_matrix_type::values_type::non_const_type vals_type;
725 status_type status(
"status");
727 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
728 typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
730 const ParameterList& pL = GetParameterList();
731 const bool& doQRStep = pL.get<
bool>(
"tentative: calculate qr");
733 GetOStream(
Runtime1) <<
"TentativePFactory : bypassing local QR phase" << std::endl;
735 GetOStream(
Warnings0) <<
"TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
738 size_t nnzEstimate = numRows * NSDim;
739 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_rows"), numRows+1);
740 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_cols"), nnzEstimate);
741 vals_type valsAux(
"Ptent_aux_vals", nnzEstimate);
742 rows_type rows(
"Ptent_rows", numRows+1);
745 SubFactoryMonitor m2(*
this,
"Stage 0 (InitViews)", coarseLevel);
749 Kokkos::parallel_for(
"MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows+1),
750 KOKKOS_LAMBDA(
const LO row) {
751 rowsAux(row) = row*NSDim;
753 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
754 KOKKOS_LAMBDA(
const LO j) {
755 colsAux(j) = INVALID;
765 SubFactoryMonitor m2(*
this,
"Stage 1 (LocalQR)", coarseLevel);
774 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:main_loop", policy,
776 auto agg = thread.league_rank();
779 LO aggSize = aggRows(agg+1) - aggRows(agg);
784 auto norm = impl_ATS::magnitude(zero);
789 for (decltype(aggSize) k = 0; k < aggSize; k++) {
790 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
797 statusAtomic(1) =
true;
802 coarseNS(agg, 0) = norm;
805 for (decltype(aggSize) k = 0; k < aggSize; k++) {
806 LO localRow = agg2RowMapLO(aggRows(agg)+k);
807 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
809 rows(localRow+1) = 1;
810 colsAux(localRow) = agg;
811 valsAux(localRow) = localVal;
816 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
817 Kokkos::deep_copy(statusHost, status);
818 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
820 std::ostringstream oss;
821 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
823 case 0: oss <<
"!goodMap is not implemented";
break;
824 case 1: oss <<
"fine level NS part has a zero column";
break;
826 throw Exceptions::RuntimeError(oss.str());
830 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
832 auto agg = thread.league_rank();
835 LO aggSize = aggRows(agg+1) - aggRows(agg);
838 coarseNS(agg, 0) = one;
841 for (decltype(aggSize) k = 0; k < aggSize; k++) {
842 LO localRow = agg2RowMapLO(aggRows(agg)+k);
843 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0);
845 rows(localRow+1) = 1;
846 colsAux(localRow) = agg;
847 valsAux(localRow) = localVal;
853 Kokkos::parallel_reduce(
"MueLu:TentativeP:CountNNZ", range_type(0, numRows+1),
854 KOKKOS_LAMBDA(
const LO i,
size_t &nnz_count) {
855 nnz_count += rows(i);
868 SubFactoryMonitor m2 = SubFactoryMonitor(*
this, doQRStep ?
"Stage 1 (LocalQR)" :
"Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
874 decltype(aggDofSizes ), decltype(maxAggSize), decltype(agg2RowMapLO),
875 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
877 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
878 rows, rowsAux, colsAux, valsAux, doQRStep);
879 Kokkos::parallel_reduce(
"MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
882 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
883 Kokkos::deep_copy(statusHost, status);
884 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
886 std::ostringstream oss;
887 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
889 case 0: oss <<
"!goodMap is not implemented";
break;
890 case 1: oss <<
"fine level NS part has a zero column";
break;
892 throw Exceptions::RuntimeError(oss.str());
903 if (nnz != nnzEstimate) {
906 SubFactoryMonitor m2(*
this,
"Stage 2 (CompressRows)", coarseLevel);
908 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1),
909 KOKKOS_LAMBDA(
const LO i, LO& upd,
const bool&
final) {
917 SubFactoryMonitor m2(*
this,
"Stage 2 (CompressCols)", coarseLevel);
919 cols = cols_type(
"Ptent_cols", nnz);
920 vals = vals_type(
"Ptent_vals", nnz);
925 Kokkos::parallel_for(
"MueLu:TentativePF:Build:compress_cols_vals", range_type(0,numRows),
926 KOKKOS_LAMBDA(
const LO i) {
927 LO rowStart = rows(i);
930 for (
auto j = rowsAux(i); j < rowsAux(i+1); j++)
931 if (colsAux(j) != INVALID) {
932 cols(rowStart+lnnz) = colsAux(j);
933 vals(rowStart+lnnz) = valsAux(j);
945 GetOStream(
Runtime1) <<
"TentativePFactory : aggregates do not cross process boundaries" << std::endl;
949 SubFactoryMonitor m2(*
this,
"Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
951 local_matrix_type lclMatrix = local_matrix_type(
"A", numRows, coarseMap->getNodeNumElements(), nnz, vals, rows, cols);
954 RCP<ParameterList> FCparams;
955 if (pL.isSublist(
"matrixmatrix: kernel params"))
956 FCparams = rcp(
new ParameterList(pL.sublist(
"matrixmatrix: kernel params")));
958 FCparams = rcp(
new ParameterList);
961 FCparams->set(
"compute global constants", FCparams->get(
"compute global constants",
false));
962 FCparams->set(
"Timer Label", std::string(
"MueLu::TentativeP-") +
toString(levelID));
964 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
965 Ptentative = rcp(
new CrsMatrixWrap(PtentCrs));
969 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
970 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
971 BuildPcoupled(RCP<Matrix> , RCP<Aggregates_kokkos> ,
972 RCP<AmalgamationInfo_kokkos> , RCP<MultiVector> ,
973 RCP<const Map> , RCP<Matrix>& ,
974 RCP<MultiVector>& )
const {
975 throw Exceptions::RuntimeError(
"MueLu: Construction of coupled tentative P is not implemented");
978 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
979 bool TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
980 isGoodMap(
const Map& rowMap,
const Map& colMap)
const {
981 auto rowLocalMap = rowMap.getLocalMap();
982 auto colLocalMap = colMap.getLocalMap();
984 const size_t numRows = rowLocalMap.getNodeNumElements();
985 const size_t numCols = colLocalMap.getNodeNumElements();
987 if (numCols < numRows)
991 Kokkos::parallel_reduce(
"MueLu:TentativePF:isGoodMap", range_type(0, numRows),
992 KOKKOS_LAMBDA(
const LO i,
size_t &diff) {
993 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
996 return (numDiff == 0);
1001 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT 1002 #endif // HAVE_MUELU_KOKKOS_REFACTOR 1003 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP Important warning messages (one line)
MueLu::DefaultLocalOrdinal LocalOrdinal
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
Namespace for MueLu classes and methods.
static const NoFactory * get()
Print even more statistics.
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Description of what is happening (more verbose)
#define SET_VALID_ENTRY(name)