Tpetra_MultiVector_def.hpp
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// ************************************************************************
38// @HEADER
39
40#ifndef TPETRA_MULTIVECTOR_DEF_HPP
41#define TPETRA_MULTIVECTOR_DEF_HPP
42
50
51#include "Tpetra_Core.hpp"
52#include "Tpetra_Util.hpp"
53#include "Tpetra_Vector.hpp"
65#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
66# include "Teuchos_SerialDenseMatrix.hpp"
67#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
68#include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
69#include "KokkosCompat_View.hpp"
70#include "KokkosBlas.hpp"
71#include "KokkosKernels_Utils.hpp"
72#include "Kokkos_Random.hpp"
73#include "Kokkos_ArithTraits.hpp"
74#include <memory>
75#include <sstream>
76
77#ifdef HAVE_TPETRA_INST_FLOAT128
78namespace Kokkos {
79 // FIXME (mfh 04 Sep 2015) Just a stub for now!
80 template<class Generator>
81 struct rand<Generator, __float128> {
82 static KOKKOS_INLINE_FUNCTION __float128 max ()
83 {
84 return static_cast<__float128> (1.0);
85 }
86 static KOKKOS_INLINE_FUNCTION __float128
87 draw (Generator& gen)
88 {
89 // Half the smallest normalized double is the scaling factor of
90 // the lower-order term in the double-double representation.
91 const __float128 scalingFactor =
92 static_cast<__float128> (std::numeric_limits<double>::min ()) /
93 static_cast<__float128> (2.0);
94 const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
95 const __float128 lowerOrderTerm =
96 static_cast<__float128> (gen.drand ()) * scalingFactor;
97 return higherOrderTerm + lowerOrderTerm;
98 }
99 static KOKKOS_INLINE_FUNCTION __float128
100 draw (Generator& gen, const __float128& range)
101 {
102 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
103 const __float128 scalingFactor =
104 static_cast<__float128> (std::numeric_limits<double>::min ()) /
105 static_cast<__float128> (2.0);
106 const __float128 higherOrderTerm =
107 static_cast<__float128> (gen.drand (range));
108 const __float128 lowerOrderTerm =
109 static_cast<__float128> (gen.drand (range)) * scalingFactor;
110 return higherOrderTerm + lowerOrderTerm;
111 }
112 static KOKKOS_INLINE_FUNCTION __float128
113 draw (Generator& gen, const __float128& start, const __float128& end)
114 {
115 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
116 const __float128 scalingFactor =
117 static_cast<__float128> (std::numeric_limits<double>::min ()) /
118 static_cast<__float128> (2.0);
119 const __float128 higherOrderTerm =
120 static_cast<__float128> (gen.drand (start, end));
121 const __float128 lowerOrderTerm =
122 static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
123 return higherOrderTerm + lowerOrderTerm;
124 }
125 };
126} // namespace Kokkos
127#endif // HAVE_TPETRA_INST_FLOAT128
128
129namespace { // (anonymous)
130
145 template<class ST, class LO, class GO, class NT>
146 typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type
147 allocDualView (const size_t lclNumRows,
148 const size_t numCols,
149 const bool zeroOut = true,
150 const bool allowPadding = false)
151 {
152 using ::Tpetra::Details::Behavior;
153 using Kokkos::AllowPadding;
154 using Kokkos::view_alloc;
155 using Kokkos::WithoutInitializing;
156 typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
157 typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type wrapped_dual_view_type;
158 typedef typename dual_view_type::t_dev dev_view_type;
159
160 // This needs to be a string and not a char*, if given as an
161 // argument to Kokkos::view_alloc. This is because view_alloc
162 // also allows a raw pointer as its first argument. See
163 // https://github.com/kokkos/kokkos/issues/434.
164 const std::string label ("MV::DualView");
165 const bool debug = Behavior::debug ();
166
167 // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
168 // creation of the DualView's Views works around
169 // Kokkos::DualView's current inability to accept an
170 // AllocationProperties initial argument (as Kokkos::View does).
171 // However, the work-around is harmless, since it does what the
172 // (currently nonexistent) equivalent DualView constructor would
173 // have done anyway.
174
175 dev_view_type d_view;
176 if (zeroOut) {
177 if (allowPadding) {
178 d_view = dev_view_type (view_alloc (label, AllowPadding),
179 lclNumRows, numCols);
180 }
181 else {
182 d_view = dev_view_type (view_alloc (label),
183 lclNumRows, numCols);
184 }
185 }
186 else {
187 if (allowPadding) {
188 d_view = dev_view_type (view_alloc (label,
189 WithoutInitializing,
190 AllowPadding),
191 lclNumRows, numCols);
192 }
193 else {
194 d_view = dev_view_type (view_alloc (label, WithoutInitializing),
195 lclNumRows, numCols);
196 }
197 if (debug) {
198 // Filling with NaN is a cheap and effective way to tell if
199 // downstream code is trying to use a MultiVector's data
200 // without first initializing it. ArithTraits lets us
201 // call nan() even if the scalar type doesn't define it; it
202 // just returns some undefined value in the latter case. This
203 // won't hurt anything because by setting zeroOut=false, users
204 // already agreed that they don't care about the contents of
205 // the MultiVector.
206 const ST nan = Kokkos::Details::ArithTraits<ST>::nan ();
207 KokkosBlas::fill (d_view, nan);
208 }
209 }
210 if (debug) {
211 TEUCHOS_TEST_FOR_EXCEPTION
212 (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
213 static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
214 "allocDualView: d_view's dimensions actual dimensions do not match "
215 "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
216 d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
217 << ". Please report this bug to the Tpetra developers.");
218 }
219
220 return wrapped_dual_view_type(d_view);
221 }
222
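  // A minimal usage sketch for the helper above (SC, LO, GO, NT stand for
  // the usual MultiVector template parameters; lclNumRows and numCols are
  // the desired local dimensions):
  //
  //   using MV = Tpetra::MultiVector<SC, LO, GO, NT>;
  //   typename MV::wrapped_dual_view_type w =
  //     allocDualView<SC, LO, GO, NT> (lclNumRows, numCols,
  //                                    true,    /* zeroOut */
  //                                    false);  /* allowPadding */
  //   // w wraps a DualView labeled "MV::DualView" whose device View has
  //   // extents lclNumRows x numCols.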
223 // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
224 //
225 // T: The type of the entries of the View.
226 // ExecSpace: The Kokkos execution space.
227 template<class T, class ExecSpace>
228 struct MakeUnmanagedView {
229 // The 'false' part of the branch carefully ensures that this
230 // won't attempt to use a host execution space that hasn't been
231 // initialized. For example, if Kokkos::OpenMP is disabled and
232 // Kokkos::Threads is enabled, the latter is always the default
233 // execution space of Kokkos::HostSpace, even when ExecSpace is
234 // Kokkos::Serial. That's why we go through the trouble of asking
235 // Kokkos::DualView what _its_ space is. That seems to work
236 // around this default execution space issue.
237 //
238 typedef typename std::conditional<
239 Kokkos::SpaceAccessibility<
240 typename ExecSpace::memory_space,
241 Kokkos::HostSpace>::accessible,
242 typename ExecSpace::device_type,
243 typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
244 typedef Kokkos::LayoutLeft array_layout;
245 typedef Kokkos::View<T*, array_layout, host_exec_space,
246 Kokkos::MemoryUnmanaged> view_type;
247
248 static view_type getView (const Teuchos::ArrayView<T>& x_in)
249 {
250 const size_t numEnt = static_cast<size_t> (x_in.size ());
251 if (numEnt == 0) {
252 return view_type ();
253 } else {
254 return view_type (x_in.getRawPtr (), numEnt);
255 }
256 }
257 };
258
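  // A short sketch of the intended use (x_av is assumed to be a
  // Teuchos::ArrayView<double> whose memory outlives the resulting View,
  // and ExecSpace is the MultiVector's execution space):
  //
  //   using maker = MakeUnmanagedView<double, ExecSpace>;
  //   maker::view_type x_host = maker::getView (x_av);
  //   // x_host aliases x_av's memory (MemoryUnmanaged): no allocation or
  //   // deep copy takes place, and an empty ArrayView yields an empty View.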
259
260 template<class WrappedDualViewType>
261 WrappedDualViewType
262 takeSubview (const WrappedDualViewType& X,
263 const std::pair<size_t, size_t>& rowRng,
264 const Kokkos::Impl::ALL_t& colRng)
265
266 {
267 // The bug we saw below should be harder to trigger here.
268 return WrappedDualViewType(X,rowRng,colRng);
269 }
270
271 template<class WrappedDualViewType>
272 WrappedDualViewType
273 takeSubview (const WrappedDualViewType& X,
274 const Kokkos::Impl::ALL_t& rowRng,
275 const std::pair<size_t, size_t>& colRng)
276 {
277 using DualViewType = typename WrappedDualViewType::DVT;
278 // Look carefully at the comment in the version of this function below.
279 // The comment applies here as well.
280 if (X.extent (0) == 0 && X.extent (1) != 0) {
281 return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
282 }
283 else {
284 return WrappedDualViewType(X,rowRng,colRng);
285 }
286 }
287
288 template<class WrappedDualViewType>
289 WrappedDualViewType
290 takeSubview (const WrappedDualViewType& X,
291 const std::pair<size_t, size_t>& rowRng,
292 const std::pair<size_t, size_t>& colRng)
293 {
294 using DualViewType = typename WrappedDualViewType::DVT;
295 // If you take a subview of a view with zero rows, Kokkos::subview()
296 // always returns a DualView with the same data pointers. This breaks
297 // pointer-equality testing between two subviews of the same 2-D View
298 // when it has zero row extent. While the one (known) case where this
299 // actually mattered has been fixed, that sort of check could easily be
300 // reintroduced in the future, hence this check here.
301 //
302 // This is not a bug in Kokkos::subview(), just some very subtle behavior which
303 // future developers should be wary of.
304 if (X.extent (0) == 0 && X.extent (1) != 0) {
305 return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
306 }
307 else {
308 return WrappedDualViewType(X,rowRng,colRng);
309 }
310 }
311
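  // Illustration of the zero-row special case handled above (a sketch; X is
  // assumed to be a wrapped DualView with extents 0 x 3):
  //
  //   auto Y = takeSubview (X, std::make_pair (size_t (0), size_t (0)),
  //                         std::make_pair (size_t (1), size_t (3)));
  //   // Y receives a freshly allocated 0 x 2 DualView, so Y's data pointer
  //   // differs from X's and pointer-equality tests between subviews of X
  //   // remain meaningful.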
312 template<class WrappedOrNotDualViewType>
313 size_t
314 getDualViewStride (const WrappedOrNotDualViewType& dv)
315 {
316 // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
317 // method yet, but its Views do.
318 // NOTE: dv.stride() returns a vector of length one
319 // more than its rank
320 size_t strides[WrappedOrNotDualViewType::t_dev::Rank+1];
321 dv.stride(strides);
322 const size_t LDA = strides[1];
323 const size_t numRows = dv.extent (0);
324
325 if (LDA == 0) {
326 return (numRows == 0) ? size_t (1) : numRows;
327 }
328 else {
329 return LDA;
330 }
331 }
332
333 template<class ViewType>
334 size_t
335 getViewStride (const ViewType& view)
336 {
337 const size_t LDA = view.stride (1);
338 const size_t numRows = view.extent (0);
339
340 if (LDA == 0) {
341 return (numRows == 0) ? size_t (1) : numRows;
342 }
343 else {
344 return LDA;
345 }
346 }
347
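  // Example of what these helpers return (a sketch): for a LayoutLeft local
  // View with 5 rows and 3 columns that Kokkos padded to 8 rows per column,
  //
  //   view.extent (0) == 5, view.stride (1) == 8,
  //   getViewStride (view) == 8   // the BLAS "LDA" of the allocation
  //
  // and for a 0 x 0 View, both helpers fall back to a stride of 1 so that
  // BLAS calls never see a zero leading dimension.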
348 template <class impl_scalar_type, class buffer_device_type>
349 bool
350 runKernelOnHost (
351 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports
352 )
353 {
354 if (! imports.need_sync_device ()) {
355 return false; // most up-to-date on device
356 }
357 else { // most up-to-date on host,
358 // but if large enough, worth running on device anyway
359 size_t localLengthThreshold =
361 return imports.extent(0) <= localLengthThreshold;
362 }
363 }
364
365
366 template <class SC, class LO, class GO, class NT>
367 bool
368 runKernelOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
369 {
370 if (! X.need_sync_device ()) {
371 return false; // most up-to-date on device
372 }
373 else { // most up-to-date on host
374 // but if large enough, worth running on device anyway
375 size_t localLengthThreshold =
377 return X.getLocalLength () <= localLengthThreshold;
378 }
379 }
380
381 template <class SC, class LO, class GO, class NT>
382 void
383 multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
385 const ::Tpetra::Details::EWhichNorm whichNorm)
386 {
387 using namespace Tpetra;
388 using ::Tpetra::Details::normImpl;
390 using val_type = typename MV::impl_scalar_type;
391 using mag_type = typename MV::mag_type;
392 using dual_view_type = typename MV::dual_view_type;
393
394 auto map = X.getMap ();
395 auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
396 auto whichVecs = getMultiVectorWhichVectors (X);
397 const bool isConstantStride = X.isConstantStride ();
398 const bool isDistributed = X.isDistributed ();
399
400 const bool runOnHost = runKernelOnHost (X);
401 if (runOnHost) {
402 using view_type = typename dual_view_type::t_host;
403 using array_layout = typename view_type::array_layout;
404 using device_type = typename view_type::device_type;
405
406 auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
407 normImpl<val_type, array_layout, device_type,
408 mag_type> (norms, X_lcl, whichNorm, whichVecs,
409 isConstantStride, isDistributed, comm);
410 }
411 else {
412 using view_type = typename dual_view_type::t_dev;
413 using array_layout = typename view_type::array_layout;
414 using device_type = typename view_type::device_type;
415
416 auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
417 normImpl<val_type, array_layout, device_type,
418 mag_type> (norms, X_lcl, whichNorm, whichVecs,
419 isConstantStride, isDistributed, comm);
420 }
421 }
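  // Sketch of how the norm front-ends (norm1, norm2, normInf) are expected
  // to call this helper; whichNorm stands for one of the
  // ::Tpetra::Details::EWhichNorm enumerators:
  //
  //   using MV = ::Tpetra::MultiVector<SC, LO, GO, NT>;
  //   std::vector<typename MV::mag_type> norms (X.getNumVectors ());
  //   multiVectorNormImpl<SC, LO, GO, NT> (norms.data (), X, whichNorm);
  //   // norms[j] now holds the requested norm of column j, reduced over
  //   // the Map's communicator if X is distributed.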
422} // namespace (anonymous)
423
424
425namespace Tpetra {
426
427 namespace Details {
428 template <typename DstView, typename SrcView>
429 struct AddAssignFunctor {
430 // This functor would be better as a lambda, but CUDA cannot compile
431 // lambdas in protected functions. It compiles fine with the functor.
432 AddAssignFunctor(DstView &tgt_, SrcView &src_) : tgt(tgt_), src(src_) {}
433
435 operator () (const size_t k) const { tgt(k) += src(k); }
436
437 DstView tgt;
438 SrcView src;
439 };
440 }
441
442 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
443 bool
445 vectorIndexOutOfRange (const size_t VectorIndex) const {
446 return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
447 }
448
449 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
454
455 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
457 MultiVector (const Teuchos::RCP<const map_type>& map,
458 const size_t numVecs,
459 const bool zeroOut) : /* default is true */
460 base_type (map)
461 {
462 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
463
464 const size_t lclNumRows = this->getLocalLength ();
465 view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
466 }
467
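  // Typical construction (a sketch; "map" stands for a
  // Teuchos::RCP<const map_type> describing the row distribution):
  //
  //   Tpetra::MultiVector<> X (map, 3);         // 3 columns, zero-filled
  //   Tpetra::MultiVector<> Y (map, 3, false);  // contents left
  //                                             // uninitialized (NaN-filled
  //                                             // in debug mode)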
468 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
471 const Teuchos::DataAccess copyOrView) :
473 view_ (source.view_),
474 whichVectors_ (source.whichVectors_)
475 {
477 const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
478 "const Teuchos::DataAccess): ";
480 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
481
482 if (copyOrView == Teuchos::Copy) {
483 // Reuse the conveniently already existing function that creates
484 // a deep copy.
485 MV cpy = createCopy (source);
486 this->view_ = cpy.view_;
487 this->whichVectors_ = cpy.whichVectors_;
488 }
489 else if (copyOrView == Teuchos::View) {
490 }
491 else {
493 true, std::invalid_argument, "Second argument 'copyOrView' has an "
494 "invalid value " << copyOrView << ". Valid values include "
495 "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
496 << Teuchos::View << ".");
497 }
498 }
499
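  // Sketch of the two data-access modes (X is an existing MultiVector,
  // MV abbreviates its type):
  //
  //   MV X_view (X, Teuchos::View);  // shares X's data; writes through
  //                                  // X_view are visible in X
  //   MV X_copy (X, Teuchos::Copy);  // deep copy; independent storage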
500 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
502 MultiVector (const Teuchos::RCP<const map_type>& map,
503 const dual_view_type& view) :
504 base_type (map),
505 view_ (wrapped_dual_view_type(view))
506 {
507 const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
508 const size_t lclNumRows_map = map.is_null () ? size_t (0) :
509 map->getLocalNumElements ();
510 const size_t lclNumRows_view = view.extent (0);
511 const size_t LDA = getDualViewStride (view_);
512
515 std::invalid_argument, "Kokkos::DualView does not match Map. "
516 "map->getLocalNumElements() = " << lclNumRows_map
517 << ", view.extent(0) = " << lclNumRows_view
518 << ", and getStride() = " << LDA << ".");
520 using ::Tpetra::Details::Behavior;
521 const bool debug = Behavior::debug ();
522 if (debug) {
523 using ::Tpetra::Details::checkGlobalDualViewValidity;
524 std::ostringstream gblErrStrm;
525 const bool verbose = Behavior::verbose ();
526 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
527 const bool gblValid =
528 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
529 comm.getRawPtr ());
531 (! gblValid, std::runtime_error, gblErrStrm.str ());
532 }
533 }
534
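  // Sketch of wrapping caller-owned storage (MV abbreviates this
  // MultiVector type; the DualView's row extent is assumed to match
  // map->getLocalNumElements ()):
  //
  //   typename MV::dual_view_type dv ("user storage",
  //                                   map->getLocalNumElements (), 2);
  //   MV X (map, dv);  // X views dv: no deep copy is made, so changes to
  //                    // dv are visible through X and vice versa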
535
536 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
538 MultiVector (const Teuchos::RCP<const map_type>& map,
539 const wrapped_dual_view_type& view) :
540 base_type (map),
541 view_ (view)
542 {
543 const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
544 const size_t lclNumRows_map = map.is_null () ? size_t (0) :
545 map->getLocalNumElements ();
546 const size_t lclNumRows_view = view.extent (0);
547 const size_t LDA = getDualViewStride (view);
548
551 std::invalid_argument, "Kokkos::DualView does not match Map. "
552 "map->getLocalNumElements() = " << lclNumRows_map
553 << ", view.extent(0) = " << lclNumRows_view
554 << ", and getStride() = " << LDA << ".");
555
556 using ::Tpetra::Details::Behavior;
557 const bool debug = Behavior::debug ();
558 if (debug) {
559 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
560 std::ostringstream gblErrStrm;
561 const bool verbose = Behavior::verbose ();
562 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
563 const bool gblValid =
564 checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
565 comm.getRawPtr ());
567 (! gblValid, std::runtime_error, gblErrStrm.str ());
568 }
569 }
570
571
572
573 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
575 MultiVector (const Teuchos::RCP<const map_type>& map,
576 const typename dual_view_type::t_dev& d_view) :
577 base_type (map)
578 {
579 using Teuchos::ArrayRCP;
580 using Teuchos::RCP;
581 const char tfecfFuncName[] = "MultiVector(map,d_view): ";
582
583 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
584
585 const size_t LDA = getViewStride (d_view);
586 const size_t lclNumRows = map->getLocalNumElements ();
588 (LDA < lclNumRows, std::invalid_argument, "Map does not match "
589 "Kokkos::View. map->getLocalNumElements() = " << lclNumRows
590 << ", View's column stride = " << LDA
591 << ", and View's extent(0) = " << d_view.extent (0) << ".");
592
593 auto h_view = Kokkos::create_mirror_view (d_view);
595 view_ = wrapped_dual_view_type(dual_view);
596
597 using ::Tpetra::Details::Behavior;
598 const bool debug = Behavior::debug ();
599 if (debug) {
600 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
601 std::ostringstream gblErrStrm;
602 const bool verbose = Behavior::verbose ();
603 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
604 const bool gblValid =
605 checkGlobalWrappedDualViewValidity (&gblErrStrm, view_, verbose,
606 comm.getRawPtr ());
608 (! gblValid, std::runtime_error, gblErrStrm.str ());
609 }
610 // The user gave us a device view. In order to respect its
611 // initial contents, we mark the DualView as "modified on device."
612 // That way, the next sync will synchronize it with the host view.
613 dual_view.modify_device ();
614 }
615
616 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
618 MultiVector (const Teuchos::RCP<const map_type>& map,
619 const dual_view_type& view,
620 const dual_view_type& origView) :
621 base_type (map),
622 view_ (wrapped_dual_view_type(view,origView))
623 {
624 const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
625
626 const size_t LDA = getDualViewStride (origView);
627 const size_t lclNumRows = this->getLocalLength (); // comes from the Map
629 LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
630 "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
631 << ". This may also mean that the input origView's first dimension (number "
632 "of rows = " << origView.extent (0) << ") does not not match the number "
633 "of entries in the Map on the calling process.");
634
635 using ::Tpetra::Details::Behavior;
636 const bool debug = Behavior::debug ();
637 if (debug) {
638 using ::Tpetra::Details::checkGlobalDualViewValidity;
639 std::ostringstream gblErrStrm;
640 const bool verbose = Behavior::verbose ();
641 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
642 const bool gblValid_0 =
643 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
644 comm.getRawPtr ());
645 const bool gblValid_1 =
646 checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
647 comm.getRawPtr ());
648 const bool gblValid = gblValid_0 && gblValid_1;
650 (! gblValid, std::runtime_error, gblErrStrm.str ());
651 }
652 }
653
654
655 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
657 MultiVector (const Teuchos::RCP<const map_type>& map,
658 const dual_view_type& view,
659 const Teuchos::ArrayView<const size_t>& whichVectors) :
660 base_type (map),
661 view_ (view),
662 whichVectors_ (whichVectors.begin (), whichVectors.end ())
664 using Kokkos::ALL;
665 using Kokkos::subview;
666 const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
667
668 using ::Tpetra::Details::Behavior;
669 const bool debug = Behavior::debug ();
670 if (debug) {
671 using ::Tpetra::Details::checkGlobalDualViewValidity;
672 std::ostringstream gblErrStrm;
673 const bool verbose = Behavior::verbose ();
674 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
675 const bool gblValid =
676 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
677 comm.getRawPtr ());
679 (! gblValid, std::runtime_error, gblErrStrm.str ());
680 }
681
682 const size_t lclNumRows = map.is_null () ? size_t (0) :
683 map->getLocalNumElements ();
684 // Check dimensions of the input DualView. We accept that Kokkos
685 // might not allow construction of a 0 x m (Dual)View with m > 0,
686 // so we only require the number of rows to match if the
687 // (Dual)View has more than zero columns. Likewise, we only
688 // require the number of columns to match if the (Dual)View has
689 // more than zero rows.
691 view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
692 std::invalid_argument, "view.extent(0) = " << view.extent (0)
693 << " < map->getLocalNumElements() = " << lclNumRows << ".");
694 if (whichVectors.size () != 0) {
696 view.extent (1) != 0 && view.extent (1) == 0,
697 std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
698 " = " << whichVectors.size () << " > 0.");
699 size_t maxColInd = 0;
700 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
701 for (size_type k = 0; k < whichVectors.size (); ++k) {
703 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
704 std::invalid_argument, "whichVectors[" << k << "] = "
705 "Teuchos::OrdinalTraits<size_t>::invalid().");
706 maxColInd = std::max (maxColInd, whichVectors[k]);
707 }
709 view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
710 std::invalid_argument, "view.extent(1) = " << view.extent (1)
711 << " <= max(whichVectors) = " << maxColInd << ".");
712 }
713
714 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
715 // zero strides, so modify in that case.
716 const size_t LDA = getDualViewStride (view);
718 (LDA < lclNumRows, std::invalid_argument,
719 "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
720
721 if (whichVectors.size () == 1) {
722 // If whichVectors has only one entry, we don't need to bother
723 // with nonconstant stride. Just shift the view over so it
724 // points to the desired column.
725 //
726 // NOTE (mfh 10 May 2014) This is a special case where we set
727 // origView_ just to view that one column, not all of the
728 // original columns. This ensures that the use of origView_ in
729 // offsetView works correctly.
730 //
731 const std::pair<size_t, size_t> colRng (whichVectors[0],
732 whichVectors[0] + 1);
733 view_ = takeSubview (view_, ALL (), colRng);
734 // whichVectors_.size() == 0 means "constant stride."
735 whichVectors_.clear ();
736 }
737 }
738
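  // Sketch: viewing columns 0 and 2 of a four-column DualView dv as a
  // nonconstant-stride MultiVector:
  //
  //   const size_t cols[] = {0, 2};
  //   MV X (map, dv, Teuchos::ArrayView<const size_t> (cols, 2));
  //   // X.getNumVectors () == 2 and X.isConstantStride () == false;
  //   // a single-entry whichVectors list would instead collapse to a
  //   // constant-stride view of that one column, as the code above does.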
739 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
741 MultiVector (const Teuchos::RCP<const map_type>& map,
742 const wrapped_dual_view_type& view,
743 const Teuchos::ArrayView<const size_t>& whichVectors) :
744 base_type (map),
745 view_ (view),
746 whichVectors_ (whichVectors.begin (), whichVectors.end ())
747 {
748 using Kokkos::ALL;
749 using Kokkos::subview;
750 const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
751
752 using ::Tpetra::Details::Behavior;
753 const bool debug = Behavior::debug ();
754 if (debug) {
755 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
756 std::ostringstream gblErrStrm;
757 const bool verbose = Behavior::verbose ();
758 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
759 const bool gblValid =
760 checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
761 comm.getRawPtr ());
763 (! gblValid, std::runtime_error, gblErrStrm.str ());
764 }
765
766 const size_t lclNumRows = map.is_null () ? size_t (0) :
767 map->getLocalNumElements ();
768 // Check dimensions of the input DualView. We accept that Kokkos
769 // might not allow construction of a 0 x m (Dual)View with m > 0,
770 // so we only require the number of rows to match if the
771 // (Dual)View has more than zero columns. Likewise, we only
772 // require the number of columns to match if the (Dual)View has
773 // more than zero rows.
775 view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
776 std::invalid_argument, "view.extent(0) = " << view.extent (0)
777 << " < map->getLocalNumElements() = " << lclNumRows << ".");
778 if (whichVectors.size () != 0) {
780 view.extent (1) != 0 && view.extent (1) == 0,
781 std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
782 " = " << whichVectors.size () << " > 0.");
783 size_t maxColInd = 0;
784 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
785 for (size_type k = 0; k < whichVectors.size (); ++k) {
787 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
788 std::invalid_argument, "whichVectors[" << k << "] = "
789 "Teuchos::OrdinalTraits<size_t>::invalid().");
790 maxColInd = std::max (maxColInd, whichVectors[k]);
791 }
793 view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
794 std::invalid_argument, "view.extent(1) = " << view.extent (1)
795 << " <= max(whichVectors) = " << maxColInd << ".");
796 }
797
798 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
799 // zero strides, so modify in that case.
800 const size_t LDA = getDualViewStride (view);
802 (LDA < lclNumRows, std::invalid_argument,
803 "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
804
805 if (whichVectors.size () == 1) {
806 // If whichVectors has only one entry, we don't need to bother
807 // with nonconstant stride. Just shift the view over so it
808 // points to the desired column.
809 //
810 // NOTE (mfh 10 May 2014) This is a special case where we set
811 // origView_ just to view that one column, not all of the
812 // original columns. This ensures that the use of origView_ in
813 // offsetView works correctly.
814 //
815 const std::pair<size_t, size_t> colRng (whichVectors[0],
816 whichVectors[0] + 1);
818 // whichVectors_.size() == 0 means "constant stride."
819 whichVectors_.clear ();
820 }
821 }
822
823
824 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
826 MultiVector (const Teuchos::RCP<const map_type>& map,
827 const dual_view_type& view,
829 const Teuchos::ArrayView<const size_t>& whichVectors) :
830 base_type (map),
831 view_ (wrapped_dual_view_type(view,origView)),
832 whichVectors_ (whichVectors.begin (), whichVectors.end ())
833 {
834 using Kokkos::ALL;
835 using Kokkos::subview;
836 const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
837
838 using ::Tpetra::Details::Behavior;
839 const bool debug = Behavior::debug ();
840 if (debug) {
841 using ::Tpetra::Details::checkGlobalDualViewValidity;
842 std::ostringstream gblErrStrm;
843 const bool verbose = Behavior::verbose ();
844 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
845 const bool gblValid_0 =
846 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
847 comm.getRawPtr ());
848 const bool gblValid_1 =
849 checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
850 comm.getRawPtr ());
851 const bool gblValid = gblValid_0 && gblValid_1;
853 (! gblValid, std::runtime_error, gblErrStrm.str ());
854 }
855
856 const size_t lclNumRows = this->getLocalLength ();
857 // Check dimensions of the input DualView. We accept that Kokkos
858 // might not allow construction of a 0 x m (Dual)View with m > 0,
859 // so we only require the number of rows to match if the
860 // (Dual)View has more than zero columns. Likewise, we only
861 // require the number of columns to match if the (Dual)View has
862 // more than zero rows.
864 view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
865 std::invalid_argument, "view.extent(0) = " << view.extent (0)
866 << " < map->getLocalNumElements() = " << lclNumRows << ".");
867 if (whichVectors.size () != 0) {
869 view.extent (1) != 0 && view.extent (1) == 0,
870 std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
871 " = " << whichVectors.size () << " > 0.");
872 size_t maxColInd = 0;
873 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
874 for (size_type k = 0; k < whichVectors.size (); ++k) {
876 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
877 std::invalid_argument, "whichVectors[" << k << "] = "
878 "Teuchos::OrdinalTraits<size_t>::invalid().");
879 maxColInd = std::max (maxColInd, whichVectors[k]);
880 }
882 view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
883 std::invalid_argument, "view.extent(1) = " << view.extent (1)
884 << " <= max(whichVectors) = " << maxColInd << ".");
885 }
886
887 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
888 // zero strides, so modify in that case.
889 const size_t LDA = getDualViewStride (origView);
891 (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
892 "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
893 lclNumRows << ". origView.extent(0) = " << origView.extent(0)
894 << ", origView.stride(1) = " << origView.d_view.stride(1) << ".");
895
896 if (whichVectors.size () == 1) {
897 // If whichVectors has only one entry, we don't need to bother
898 // with nonconstant stride. Just shift the view over so it
899 // points to the desired column.
900 //
901 // NOTE (mfh 10 May 2014) This is a special case where we set
902 // origView_ just to view that one column, not all of the
903 // original columns. This ensures that the use of origView_ in
904 // offsetView works correctly.
905 const std::pair<size_t, size_t> colRng (whichVectors[0],
906 whichVectors[0] + 1);
907 view_ = takeSubview (view_, ALL (), colRng);
908 // origView_ = takeSubview (origView_, ALL (), colRng);
909 // whichVectors_.size() == 0 means "constant stride."
910 whichVectors_.clear ();
911 }
912 }
913
914 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
916 MultiVector (const Teuchos::RCP<const map_type>& map,
917 const Teuchos::ArrayView<const Scalar>& data,
918 const size_t LDA,
919 const size_t numVecs) :
920 base_type (map)
921 {
922 typedef LocalOrdinal LO;
923 typedef GlobalOrdinal GO;
924 const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
925 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
926
927 // Deep copy constructor, constant stride (NO whichVectors_).
928 // There is no need for a deep copy constructor with nonconstant stride.
929
930 const size_t lclNumRows =
931 map.is_null () ? size_t (0) : map->getLocalNumElements ();
933 (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
934 "map->getLocalNumElements() = " << lclNumRows << ".");
935 if (numVecs != 0) {
936 const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
938 (static_cast<size_t> (data.size ()) < minNumEntries,
939 std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
940 "entries, given the input Map and number of vectors in the MultiVector."
941 " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
942 "map->getLocalNumElements () = " << minNumEntries << ".");
943 }
944
946 //Note: X_out will be completely overwritten
947 auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
948
949 // Make an unmanaged host Kokkos::View of the input data. First
950 // create a View (X_in_orig) with the original stride. Then,
951 // create a subview (X_in) with the right number of columns.
952 const impl_scalar_type* const X_in_raw =
953 reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
954 Kokkos::View<const impl_scalar_type**,
955 Kokkos::LayoutLeft,
956 Kokkos::HostSpace,
957 Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
958 const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
959 auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
960
961 // If LDA != X_out's column stride, then we need to copy one
962 // column at a time; Kokkos::deep_copy refuses to work in that
963 // case.
964 const size_t outStride =
965 X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
966 if (LDA == outStride) { // strides are the same; deep_copy once
967 // This only works because MultiVector uses LayoutLeft.
968 // We would need a custom copy functor otherwise.
969 // DEEP_COPY REVIEW - HOST-TO-DEVICE
970 Kokkos::deep_copy (execution_space(), X_out, X_in);
971 }
972 else { // strides differ; copy one column at a time
973 typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
975 typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
977 for (size_t j = 0; j < numVecs; ++j) {
978 out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
979 in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
980 // DEEP_COPY REVIEW - HOST-TO-DEVICE
981 Kokkos::deep_copy (execution_space(), X_out_j, X_in_j);
982 }
983 }
984 }
985
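  // Sketch of this deep-copy constructor: data is read as a column-major
  // array with leading dimension LDA, and must hold at least
  // LDA*(numVecs-1) + lclNumRows entries.
  //
  //   Teuchos::Array<Scalar> vals (LDA * numVecs);
  //   // ... fill vals column by column ...
  //   MV X (map, vals (), LDA, numVecs);  // copies vals into X's own storage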
986 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
988 MultiVector (const Teuchos::RCP<const map_type>& map,
989 const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
990 const size_t numVecs) :
991 base_type (map)
992 {
993 typedef impl_scalar_type IST;
994 typedef LocalOrdinal LO;
995 typedef GlobalOrdinal GO;
996 const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
997 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
998
999 const size_t lclNumRows =
1000 map.is_null () ? size_t (0) : map->getLocalNumElements ();
1002 (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
1003 std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
1004 "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
1005 for (size_t j = 0; j < numVecs; ++j) {
1006 Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
1008 static_cast<size_t> (X_j_av.size ()) < lclNumRows,
1009 std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
1010 << X_j_av.size () << " < map->getLocalNumElements() = " << lclNumRows
1011 << ".");
1012 }
1013
1015 auto X_out = this->getLocalViewDevice(Access::ReadWrite);
1016
1017 // Make sure that the type of a single input column has the same
1018 // array layout as each output column does, so we can deep_copy.
1019 using array_layout = typename decltype (X_out)::array_layout;
1020 using input_col_view_type = typename Kokkos::View<const IST*,
1021 array_layout,
1022 Kokkos::HostSpace,
1023 Kokkos::MemoryUnmanaged>;
1025 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
1026 for (size_t j = 0; j < numVecs; ++j) {
1027 Teuchos::ArrayView<const IST> X_j_av =
1028 Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
1030 auto X_j_out = Kokkos::subview (X_out, rowRng, j);
1031 // DEEP_COPY REVIEW - HOST-TO-DEVICE
1032 Kokkos::deep_copy (execution_space(), X_j_out, X_j_in);
1033 }
1034 }
1035
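  // Sketch: building a two-column MultiVector by deep-copying two host
  // arrays (col0_av and col1_av stand for Teuchos::ArrayView<const Scalar>
  // objects, each with at least map->getLocalNumElements () entries):
  //
  //   Teuchos::Array<Teuchos::ArrayView<const Scalar> > cols (2);
  //   cols[0] = col0_av;
  //   cols[1] = col1_av;
  //   MV X (map, cols (), 2);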
1036 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1038 isConstantStride () const {
1039 return whichVectors_.empty ();
1040 }
1041
1042 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1043 size_t
1045 getLocalLength () const
1046 {
1047 if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1048 return static_cast<size_t> (0);
1049 } else {
1050 return this->getMap ()->getLocalNumElements ();
1051 }
1052 }
1053
1054 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1057 getGlobalLength () const
1058 {
1059 if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1060 return static_cast<size_t> (0);
1061 } else {
1062 return this->getMap ()->getGlobalNumElements ();
1063 }
1064 }
1065
1066 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1067 size_t
1069 getStride () const
1070 {
1071 return isConstantStride () ? getDualViewStride (view_) : size_t (0);
1072 }
1073
1074 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1075 bool
1078 {
1079 //Don't actually get a view, just get pointers.
1080 auto thisData = view_.getDualView().h_view.data();
1081 auto otherData = other.view_.getDualView().h_view.data();
1082 size_t thisSpan = view_.getDualView().h_view.span();
1083 size_t otherSpan = other.view_.getDualView().h_view.span();
1084 return (otherData <= thisData && thisData < otherData + otherSpan)
1085 || (thisData <= otherData && otherData < thisData + thisSpan);
1086 }
1087
1088 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1089 bool
1092 {
1093 // Check whether the source object is a MultiVector. If not, then
1094 // we can't even compare sizes, so it's definitely not OK to
1095 // Import or Export from it.
1097 const MV* src = dynamic_cast<const MV*> (&sourceObj);
1098 if (src == nullptr) {
1099 return false;
1100 }
1101 else {
1102 // The target of the Import or Export calls checkSizes() in
1103 // DistObject::doTransfer(). By that point, we've already
1104 // constructed an Import or Export object using the two
1105 // multivectors' Maps, which means that (hopefully) we've
1106 // already checked other attributes of the multivectors. Thus,
1107 // all we need to do here is check the number of columns.
1108 return src->getNumVectors () == this->getNumVectors ();
1109 }
1110 }
1111
1112 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1113 size_t
1115 constantNumberOfPackets () const {
1116 return this->getNumVectors ();
1117 }
1118
1119 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1120 void
1123 (const SrcDistObject& sourceObj,
1124 const size_t numSameIDs,
1125 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1126 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1127 const CombineMode CM)
1128 {
1129 using ::Tpetra::Details::Behavior;
1130 using ::Tpetra::Details::getDualViewCopyFromArrayView;
1131 using ::Tpetra::Details::ProfilingRegion;
1132 using std::endl;
1133 using KokkosRefactor::Details::permute_array_multi_column;
1134 using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1135 using Kokkos::Compat::create_const_view;
1137 const char tfecfFuncName[] = "copyAndPermute: ";
1138 ProfilingRegion regionCAP ("Tpetra::MultiVector::copyAndPermute");
1139
1140 const bool verbose = Behavior::verbose ();
1141 std::unique_ptr<std::string> prefix;
1142 if (verbose) {
1143 auto map = this->getMap ();
1144 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1145 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1146 std::ostringstream os;
1147 os << "Proc " << myRank << ": MV::copyAndPermute: ";
1148 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1149 os << "Start" << endl;
1150 std::cerr << os.str ();
1151 }
1152
1154 (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
1155 std::logic_error, "permuteToLIDs.extent(0) = "
1156 << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
1157 << permuteFromLIDs.extent (0) << ".");
1158
1159 // We've already called checkSizes(), so this cast must succeed.
1160 MV& sourceMV = const_cast<MV &>(dynamic_cast<const MV&> (sourceObj));
1161 const size_t numCols = this->getNumVectors ();
1162
1163 // sourceMV doesn't belong to us, so we can't sync it. Do the
1164 // copying where it's currently sync'd.
1166 (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1167 std::logic_error, "Input MultiVector needs sync to both host "
1168 "and device.");
1169 const bool copyOnHost = runKernelOnHost(sourceMV);
1170 if (verbose) {
1171 std::ostringstream os;
1172 os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1173 std::cerr << os.str ();
1174 }
1175
1176 if (verbose) {
1177 std::ostringstream os;
1178 os << *prefix << "Copy" << endl;
1179 std::cerr << os.str ();
1180 }
1182 // TODO (mfh 15 Sep 2013) When we replace
1183 // KokkosClassic::MultiVector with a Kokkos::View, there are two
1184 // ways to copy the data:
1185 //
1186 // 1. Get a (sub)view of each column and call deep_copy on that.
1187 // 2. Write a custom kernel to copy the data.
1188 //
1189 // The first is easier, but the second might be more performant in
1190 // case we decide to use layouts other than LayoutLeft. It might
1191 // even make sense to hide whichVectors_ in an entirely new layout
1192 // for Kokkos Views.
1193
1194 // Copy rows [0, numSameIDs-1] of the local multivectors.
1195 //
1196 // Note (ETP 2 Jul 2014) We need to always copy one column at a
1197 // time, even when both multivectors are constant-stride, since
1198 // deep_copy between strided subviews with more than one column
1199 // doesn't currently work.
1200
1201 // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1202 // both source and target are constant stride and have multiple
1203 // columns.
1204 if (numSameIDs > 0) {
1205 const std::pair<size_t, size_t> rows (0, numSameIDs);
1206 if (copyOnHost) {
1207 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1208 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1209
1210 for (size_t j = 0; j < numCols; ++j) {
1211 const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1212 const size_t srcCol =
1213 sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1214
1215 auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
1216 auto src_j = Kokkos::subview (src_h, rows, srcCol);
1217 if (CM == ADD_ASSIGN) {
1218 // Sum src_j into tgt_j
1219 using range_t =
1220 Kokkos::RangePolicy<typename Node::execution_space, size_t>;
1221 range_t rp(0,numSameIDs);
1222 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1223 aaf(tgt_j, src_j);
1224 Kokkos::parallel_for(rp, aaf);
1225 }
1226 else {
1227 // Copy src_j into tgt_j
1228 // DEEP_COPY REVIEW - HOSTMIRROR-TO-HOSTMIRROR
1229 Kokkos::deep_copy (tgt_j, src_j);
1230 }
1231 }
1232 }
1233 else { // copy on device
1234 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1235 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1236
1237 for (size_t j = 0; j < numCols; ++j) {
1238 const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1239 const size_t srcCol =
1240 sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1241
1242 auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
1243 auto src_j = Kokkos::subview (src_d, rows, srcCol);
1244 if (CM == ADD_ASSIGN) {
1245 // Sum src_j into tgt_j
1246 using range_t =
1247 Kokkos::RangePolicy<typename Node::execution_space, size_t>;
1248 range_t rp(0,numSameIDs);
1249 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1250 aaf(tgt_j, src_j);
1251 Kokkos::parallel_for(rp, aaf);
1252 }
1253 else {
1254 // Copy src_j into tgt_j
1255 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
1256 Kokkos::deep_copy (tgt_j, src_j);
1257 }
1258 }
1259 }
1260 }
1261
1262
1263 // For the remaining GIDs, execute the permutations. This may
1264 // involve noncontiguous access of both source and destination
1265 // vectors, depending on the LID lists.
1266 //
1267 // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1268 // the same process, this merges their values by replacement of
1269 // the last encountered GID, not by the specified merge rule
1270 // (such as ADD).
1271
1272 // If there are no permutations, we are done
1273 if (permuteFromLIDs.extent (0) == 0 ||
1274 permuteToLIDs.extent (0) == 0) {
1275 if (verbose) {
1276 std::ostringstream os;
1277 os << *prefix << "No permutations. Done!" << endl;
1278 std::cerr << os.str ();
1279 }
1280 return;
1281 }
1282
1283 if (verbose) {
1284 std::ostringstream os;
1285 os << *prefix << "Permute" << endl;
1286 std::cerr << os.str ();
1287 }
1288
1289 // We could in theory optimize for the case where exactly one of
1290 // them is constant stride, but we don't currently do that.
1291 const bool nonConstStride =
1292 ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1294 if (verbose) {
1295 std::ostringstream os;
1296 os << *prefix << "nonConstStride="
1297 << (nonConstStride ? "true" : "false") << endl;
1298 std::cerr << os.str ();
1299 }
1300
1301 // We only need the "which vectors" arrays if either the source or
1302 // target MV is not constant stride. Since we only have one
1303 // kernel that must do double-duty, we have to create a "which
1304 // vectors" array for the MV that _is_ constant stride.
1305 Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1306 Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1307 if (nonConstStride) {
1308 if (this->whichVectors_.size () == 0) {
1309 Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1310 tmpTgt.modify_host ();
1311 for (size_t j = 0; j < numCols; ++j) {
1312 tmpTgt.h_view(j) = j;
1313 }
1314 if (! copyOnHost) {
1315 tmpTgt.sync_device ();
1316 }
1318 }
1319 else {
1320 Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1321 tgtWhichVecs =
1323 "tgtWhichVecs",
1324 copyOnHost);
1325 }
1327 (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1328 this->getNumVectors (),
1329 std::logic_error, "tgtWhichVecs.extent(0) = " <<
1330 tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1331 this->getNumVectors () << ".");
1332
1333 if (sourceMV.whichVectors_.size () == 0) {
1334 Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1335 tmpSrc.modify_host ();
1336 for (size_t j = 0; j < numCols; ++j) {
1337 tmpSrc.h_view(j) = j;
1338 }
1339 if (! copyOnHost) {
1340 tmpSrc.sync_device ();
1341 }
1343 }
1344 else {
1345 Teuchos::ArrayView<const size_t> srcWhichVecsT =
1346 sourceMV.whichVectors_ ();
1347 srcWhichVecs =
1349 "srcWhichVecs",
1350 copyOnHost);
1351 }
1353 (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1354 sourceMV.getNumVectors (), std::logic_error,
1355 "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1356 << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1357 << ".");
1358 }
1359
1360 if (copyOnHost) { // permute on host too
1361 if (verbose) {
1362 std::ostringstream os;
1363 os << *prefix << "Get permute LIDs on host" << std::endl;
1364 std::cerr << os.str ();
1365 }
1366 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1367 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1368
1369 TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1370 auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1371 TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1372 auto permuteFromLIDs_h =
1373 create_const_view (permuteFromLIDs.view_host ());
1375 if (verbose) {
1376 std::ostringstream os;
1377 os << *prefix << "Permute on host" << endl;
1378 std::cerr << os.str ();
1379 }
1380 if (nonConstStride) {
1381 // No need to sync first, because copyOnHost argument to
1382 // getDualViewCopyFromArrayView puts them in the right place.
1383 auto tgtWhichVecs_h =
1384 create_const_view (tgtWhichVecs.view_host ());
1385 auto srcWhichVecs_h =
1386 create_const_view (srcWhichVecs.view_host ());
1387 if (CM == ADD_ASSIGN) {
1388 using op_type = KokkosRefactor::Details::AddOp;
1389 permute_array_multi_column_variable_stride (tgt_h, src_h,
1393 srcWhichVecs_h, numCols,
1394 op_type());
1396 else {
1397 using op_type = KokkosRefactor::Details::InsertOp;
1398 permute_array_multi_column_variable_stride (tgt_h, src_h,
1402 srcWhichVecs_h, numCols,
1403 op_type());
1405 }
1406 else {
1407 if (CM == ADD_ASSIGN) {
1408 using op_type = KokkosRefactor::Details::AddOp;
1409 permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1410 permuteFromLIDs_h, numCols, op_type());
1411 }
1412 else {
1413 using op_type = KokkosRefactor::Details::InsertOp;
1414 permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1415 permuteFromLIDs_h, numCols, op_type());
1416 }
1417 }
1418 }
1419 else { // permute on device
1420 if (verbose) {
1421 std::ostringstream os;
1422 os << *prefix << "Get permute LIDs on device" << endl;
1423 std::cerr << os.str ();
1424 }
1425 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1426 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1427
1428 TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1429 auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1430 TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1431 auto permuteFromLIDs_d =
1432 create_const_view (permuteFromLIDs.view_device ());
1433
1434 if (verbose) {
1435 std::ostringstream os;
1436 os << *prefix << "Permute on device" << endl;
1437 std::cerr << os.str ();
1438 }
1439 if (nonConstStride) {
1440 // No need to sync first, because copyOnHost argument to
1441 // getDualViewCopyFromArrayView puts them in the right place.
1442 auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1443 auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1444 if (CM == ADD_ASSIGN) {
1445 using op_type = KokkosRefactor::Details::AddOp;
1446 permute_array_multi_column_variable_stride (tgt_d, src_d,
1451 op_type());
1452 }
1453 else {
1454 using op_type = KokkosRefactor::Details::InsertOp;
1455 permute_array_multi_column_variable_stride (tgt_d, src_d,
1459 srcWhichVecs_d, numCols,
1460 op_type());
1461 }
1463 else {
1464 if (CM == ADD_ASSIGN) {
1465 using op_type = KokkosRefactor::Details::AddOp;
1466 permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1467 permuteFromLIDs_d, numCols, op_type());
1468 }
1469 else {
1470 using op_type = KokkosRefactor::Details::InsertOp;
1471 permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1472 permuteFromLIDs_d, numCols, op_type());
1473 }
1475 }
1476
1477 if (verbose) {
1478 std::ostringstream os;
1479 os << *prefix << "Done!" << endl;
1480 std::cerr << os.str ();
1481 }
1482 }
1484 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1485 void
1488 (const SrcDistObject& sourceObj,
1489 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1490 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1491 Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1492 size_t& constantNumPackets)
1493 {
1494 using ::Tpetra::Details::Behavior;
1495 using ::Tpetra::Details::ProfilingRegion;
1496 using ::Tpetra::Details::reallocDualViewIfNeeded;
1497 using Kokkos::Compat::create_const_view;
1498 using Kokkos::Compat::getKokkosViewDeepCopy;
1499 using std::endl;
1501 const char tfecfFuncName[] = "packAndPrepare: ";
1502 ProfilingRegion regionPAP ("Tpetra::MultiVector::packAndPrepare");
1503
1504 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1505 // have the option to check indices. We do so when Tpetra is in
1506 // debug mode. It is in debug mode by default in a debug build,
1507 // but you may control this at run time, before launching the
1508 // executable, by setting the TPETRA_DEBUG environment variable to
1509 // "1" (or "TRUE").
1510 const bool debugCheckIndices = Behavior::debug ();
1511 // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1512 // environment variable to "1" (or "TRUE") for copious debug
1513 // output to std::cerr on every MPI process. This is unwise for
1514 // runs with large numbers of MPI processes.
1515 const bool printDebugOutput = Behavior::verbose ();
1516
1517 std::unique_ptr<std::string> prefix;
1518 if (printDebugOutput) {
1519 auto map = this->getMap ();
1520 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1521 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1522 std::ostringstream os;
1523 os << "Proc " << myRank << ": MV::packAndPrepare: ";
1524 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1525 os << "Start" << endl;
1526 std::cerr << os.str ();
1527 }
1528
1529 // We've already called checkSizes(), so this cast must succeed.
1530 MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&> (sourceObj));
1531
1532 const size_t numCols = sourceMV.getNumVectors ();
1533
1534 // This spares us from needing to fill numExportPacketsPerLID.
1535 // Setting constantNumPackets to a nonzero value signals that
1536 // all packets have the same number of entries.
1537 constantNumPackets = numCols;
1538
1539 // If we have no exports, there is nothing to do. Make sure this
1540 // goes _after_ setting constantNumPackets correctly.
1541 if (exportLIDs.extent (0) == 0) {
1542 if (printDebugOutput) {
1543 std::ostringstream os;
1544 os << *prefix << "No exports on this proc, DONE" << std::endl;
1545 std::cerr << os.str ();
1546 }
1547 return;
1548 }
1549
1550 /* The layout in the export for MultiVectors is as follows:
1551 exports = { all of the data from row exportLIDs.front() ;
1552 ....
1553 all of the data from row exportLIDs.back() }
1554 This doesn't have the best locality, but is necessary because
1555 the data for a Packet (all data associated with an LID) is
1556 required to be contiguous. */
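    // For example (a sketch): with numCols = 2 and exportLIDs = {5, 9},
    // the exports buffer is laid out as
    //   { X(5,0), X(5,1), X(9,0), X(9,1) },
    // i.e., all entries associated with one LID are contiguous.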
1557
1558 // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1559 // packing scheme in the above comment? The data going to a
1560 // particular process must be contiguous, of course, but those
1561 // data could include entries from multiple LIDs. DistObject just
1562 // needs to know how to index into that data. Kokkos is good at
1563 // decoupling storage intent from data layout choice.
1564
1565 const size_t numExportLIDs = exportLIDs.extent (0);
1566 const size_t newExportsSize = numCols * numExportLIDs;
1567 if (printDebugOutput) {
1568 std::ostringstream os;
1569 os << *prefix << "realloc: "
1570 << "numExportLIDs: " << numExportLIDs
1571 << ", exports.extent(0): " << exports.extent (0)
1572 << ", newExportsSize: " << newExportsSize << std::endl;
1573 std::cerr << os.str ();
1574 }
1575 reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1576
1577 // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1578 // sync it. Pack it where it's currently sync'd.
1579 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1580 (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1581 std::logic_error, "Input MultiVector needs sync to both host "
1582 "and device.");
1583 const bool packOnHost = runKernelOnHost(sourceMV);
1584 if (printDebugOutput) {
1585 std::ostringstream os;
1586 os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1587 std::cerr << os.str ();
1588 }
1589
1590 // Mark 'exports' here, since we might have resized it above.
1591 // Resizing currently requires calling the constructor, which
1592 // clears out the 'modified' flags.
1593 if (packOnHost) {
1594 // nde 06 Feb 2020: If 'exports' does not require resize
1595 // when reallocDualViewIfNeeded is called, the modified flags
1596 // are not cleared out. This can result in host and device views
1597 // being out-of-sync, resulting in an error in exports.modify_* calls.
1598 // Clearing the sync flags prevents this possible case.
1599 exports.clear_sync_state ();
1600 exports.modify_host ();
1601 }
1602 else {
1603 // nde 06 Feb 2020: If 'exports' does not require resize
1604 // when reallocDualViewIfNeeded is called, the modified flags
1605 // are not cleared out. This can result in host and device views
1606 // being out-of-sync, resulting in an error in exports.modify_* calls.
1607 // Clearing the sync flags prevents this possible case.
1608 exports.clear_sync_state ();
1609 exports.modify_device ();
1610 }
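 // The clear_sync_state()/modify_*() calls above follow the usual
 // Kokkos::DualView protocol.  A minimal standalone sketch (hypothetical
 // DualView 'dv' of length n, not tied to this class):
 //
 //   Kokkos::DualView<double*> dv ("dv", n);
 //   dv.clear_sync_state ();   // forget any previous modified flags
 //   dv.modify_host ();        // mark the host copy as most recently changed
 //   /* ... fill dv.view_host() ... */
 //   dv.sync_device ();        // deep-copies host -> device exactly once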
1611
1612 if (numCols == 1) { // special case for one column only
1613 // MultiVector always represents a single column with constant
1614 // stride, but it doesn't hurt to implement both cases anyway.
1615 //
1616 // ETP: I'm not sure I agree with the above statement. Can't a single-
1617 // column multivector be a subview of another multi-vector, in which case
1618 // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1619 // separately.
1620 //
1621 // mfh 18 Jan 2016: In answer to ETP's comment above:
1622 // MultiVector treats single-column MultiVectors created using a
1623 // "nonconstant stride constructor" as a special case, and makes
1624 // them constant stride (by making whichVectors_ have length 0).
1625 if (sourceMV.isConstantStride ()) {
1626 using KokkosRefactor::Details::pack_array_single_column;
1627 if (printDebugOutput) {
1628 std::ostringstream os;
1629 os << *prefix << "Pack numCols=1 const stride" << endl;
1630 std::cerr << os.str ();
1631 }
1632 if (packOnHost) {
1633 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1634 pack_array_single_column (exports.view_host (),
1635 src_host,
1636 exportLIDs.view_host (),
1637 0,
1638 debugCheckIndices);
1639 }
1640 else { // pack on device
1641 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1642 pack_array_single_column (exports.view_device (),
1643 src_dev,
1644 exportLIDs.view_device (),
1645 0,
1646 debugCheckIndices);
1647 }
1648 }
1649 else {
1650 using KokkosRefactor::Details::pack_array_single_column;
1651 if (printDebugOutput) {
1652 std::ostringstream os;
1653 os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1654 std::cerr << os.str ();
1655 }
1656 if (packOnHost) {
1657 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1658 pack_array_single_column (exports.view_host (),
1659 src_host,
1660 exportLIDs.view_host (),
1661 sourceMV.whichVectors_[0],
1662 debugCheckIndices);
1663 }
1664 else { // pack on device
1665 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1666 pack_array_single_column (exports.view_device (),
1667 src_dev,
1668 exportLIDs.view_device (),
1669 sourceMV.whichVectors_[0],
1670 debugCheckIndices);
1671 }
1672 }
1673 }
1674 else { // the source MultiVector has multiple columns
1675 if (sourceMV.isConstantStride ()) {
1676 using KokkosRefactor::Details::pack_array_multi_column;
1677 if (printDebugOutput) {
1678 std::ostringstream os;
1679 os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1680 std::cerr << os.str ();
1681 }
1682 if (packOnHost) {
1683 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1684 pack_array_multi_column (exports.view_host (),
1685 src_host,
1686 exportLIDs.view_host (),
1687 numCols,
1688 debugCheckIndices);
1689 }
1690 else { // pack on device
1691 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1692 pack_array_multi_column (exports.view_device (),
1693 src_dev,
1694 exportLIDs.view_device (),
1695 numCols,
1696 debugCheckIndices);
1697 }
1698 }
1699 else {
1700 using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1701 if (printDebugOutput) {
1702 std::ostringstream os;
1703 os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1704 << endl;
1705 std::cerr << os.str ();
1706 }
1707 // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1708 // whichVectors_ can be expensive, but pack and unpack for
1709 // nonconstant-stride MultiVectors is slower anyway.
1710 using IST = impl_scalar_type;
1711 using DV = Kokkos::DualView<IST*, device_type>;
1712 using HES = typename DV::t_host::execution_space;
1713 using DES = typename DV::t_dev::execution_space;
1714 Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1715 if (packOnHost) {
1716 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1717 pack_array_multi_column_variable_stride
1718 (exports.view_host (),
1719 src_host,
1720 exportLIDs.view_host (),
1721 getKokkosViewDeepCopy<HES> (whichVecs),
1722 numCols,
1723 debugCheckIndices);
1724 }
1725 else { // pack on device
1726 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1727 pack_array_multi_column_variable_stride
1728 (exports.view_device (),
1729 src_dev,
1730 exportLIDs.view_device (),
1731 getKokkosViewDeepCopy<DES> (whichVecs),
1732 numCols,
1733 debugCheckIndices);
1734 }
1735 }
1736 }
1737
1738 if (printDebugOutput) {
1739 std::ostringstream os;
1740 os << *prefix << "Done!" << endl;
1741 std::cerr << os.str ();
1742 }
1743
1744 }
1745
1746
1747 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1748 template <class NO>
1749 typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1750 typename NO::device_type::memory_space>::value,
1751 bool>::type
1752 MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1753 reallocImportsIfNeededImpl (const size_t newSize,
1754 const bool verbose,
1755 const std::string* prefix,
1756 const bool areRemoteLIDsContiguous,
1757 const CombineMode CM)
1758 {
1759 // This implementation of reallocImportsIfNeeded is an
1760 // optimization that is specific to MultiVector. We check if the
1761 // imports_ view can be aliased to the end of the data view_. If
1762 // that is the case, we can skip the unpackAndCombine call.
1763
1764 if (verbose) {
1765 std::ostringstream os;
1766 os << *prefix << "Realloc (if needed) imports_ from "
1767 << this->imports_.extent (0) << " to " << newSize << std::endl;
1768 std::cerr << os.str ();
1769 }
1770
1771 bool reallocated = false;
1772
1773 using IST = impl_scalar_type;
1774 using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1775
1776 // Conditions for aliasing memory:
1777 // - When both sides of the dual view are in the same memory
1778 // space, we do not need to worry about syncing things.
1779 // - When both memory spaces are different, we only alias if this
1780 // does not incur additional sync'ing.
1781 // - The remote LIDs need to be contiguous, so that we do not need
1782 // to reorder received information.
1783 // - CombineMode needs to be INSERT.
1784 // - The number of vectors needs to be 1, otherwise we need to
1785 // reorder the received data.
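 // Sketch of the aliased layout that the branch below sets up (one column,
 // remote LIDs contiguous at the end of the local index space):
 //
 //   view_ rows:  [ 0 ... offset-1 | offset ... getLocalLength()-1 ]
 //                  not received        received via imports_
 //
 // with offset = getLocalLength() - newSize.  imports_ then points directly
 // into the tail of view_, so unpackAndCombine() can skip the copy.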
1786 if ((dual_view_type::impl_dualview_is_single_device::value ||
1787 (Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_device()) ||
1788 (!Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_host())) &&
1789 areRemoteLIDsContiguous &&
1790 (CM == INSERT || CM == REPLACE) &&
1791 (getNumVectors() == 1) &&
1792 (newSize > 0)) {
1793
1794 size_t offset = getLocalLength () - newSize;
1795 reallocated = this->imports_.d_view.data() != view_.getDualView().d_view.data() + offset;
1796 if (reallocated) {
1797 typedef std::pair<size_t, size_t> range_type;
1798 this->imports_ = DV(view_.getDualView(),
1799 range_type (offset, getLocalLength () ),
1800 0);
1801
1802 if (verbose) {
1803 std::ostringstream os;
1804 os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1805 std::cerr << os.str ();
1806 }
1807 }
1808 return reallocated;
1809 }
1810 {
1811 using ::Tpetra::Details::reallocDualViewIfNeeded;
1812 reallocated =
1813 reallocDualViewIfNeeded (this->unaliased_imports_, newSize, "imports");
1814 if (verbose) {
1815 std::ostringstream os;
1816 os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1817 std::cerr << os.str ();
1818 }
1819 this->imports_ = this->unaliased_imports_;
1820 }
1821 return reallocated;
1822 }
1823
1824 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1825 template <class NO>
1826 typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1827 typename NO::device_type::memory_space>::value,
1828 bool>::type
1829 MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1830 reallocImportsIfNeededImpl (const size_t newSize,
1831 const bool verbose,
1832 const std::string* prefix,
1833 const bool areRemoteLIDsContiguous,
1834 const CombineMode CM)
1835 {
1837 }
1838
1839 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1840 bool
1841 MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1842 reallocImportsIfNeeded(const size_t newSize,
1843 const bool verbose,
1844 const std::string* prefix,
1845 const bool areRemoteLIDsContiguous,
1846 const CombineMode CM) {
1848 return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1849 }
1850
1851 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1852 bool
1853 MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1854 importsAreAliased () {
1855 return (this->imports_.d_view.data() + this->imports_.d_view.extent(0) ==
1856 view_.getDualView().d_view.data() + view_.getDualView().d_view.extent(0));
1857 }
1858
1859
1860 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1861 void
1862 MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1863 unpackAndCombine
1864 (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1865 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1866 Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1867 const size_t constantNumPackets,
1868 const CombineMode CM)
1869 {
1870 using ::Tpetra::Details::Behavior;
1871 using ::Tpetra::Details::ProfilingRegion;
1872 using KokkosRefactor::Details::unpack_array_multi_column;
1873 using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1874 using Kokkos::Compat::getKokkosViewDeepCopy;
1875 using std::endl;
1876 const char longFuncName[] = "Tpetra::MultiVector::unpackAndCombine";
1877 const char tfecfFuncName[] = "unpackAndCombine: ";
1878 ProfilingRegion regionUAC (longFuncName);
1879
1880 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1881 // have the option to check indices. We do so when Tpetra is in
1882 // debug mode. It is in debug mode by default in a debug build,
1883 // but you may control this at run time, before launching the
1884 // executable, by setting the TPETRA_DEBUG environment variable to
1885 // "1" (or "TRUE").
1886 const bool debugCheckIndices = Behavior::debug ();
1887
1888 const bool printDebugOutput = Behavior::verbose ();
1889 std::unique_ptr<std::string> prefix;
1890 if (printDebugOutput) {
1891 auto map = this->getMap ();
1892 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1893 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1894 std::ostringstream os;
1895 os << "Proc " << myRank << ": " << longFuncName << ": ";
1896 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1897 os << "Start" << endl;
1898 std::cerr << os.str ();
1899 }
1900
1901 // If we have no imports, there is nothing to do
1902 if (importLIDs.extent (0) == 0) {
1903 if (printDebugOutput) {
1904 std::ostringstream os;
1905 os << *prefix << "No imports. Done!" << endl;
1906 std::cerr << os.str ();
1907 }
1908 return;
1909 }
1910
1911 // Check whether imports_ is a subview of the MV view.
1912 if (importsAreAliased()) {
1913 if (printDebugOutput) {
1914 std::ostringstream os;
1915 os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1916 std::cerr << os.str ();
1917 }
1918 return;
1919 }
1920
1921
1922 const size_t numVecs = getNumVectors ();
1923 if (debugCheckIndices) {
1924 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1925 (static_cast<size_t> (imports.extent (0)) !=
1926 numVecs * importLIDs.extent (0),
1927 std::runtime_error,
1928 "imports.extent(0) = " << imports.extent (0)
1929 << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1930 << " * " << importLIDs.extent (0) << " = "
1931 << numVecs * importLIDs.extent (0) << ".");
1932
1933 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1934 (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1935 "constantNumPackets input argument must be nonzero.");
1936
1937 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1938 (static_cast<size_t> (numVecs) !=
1939 static_cast<size_t> (constantNumPackets),
1940 std::runtime_error, "constantNumPackets must equal numVecs.");
1941 }
1942
1943 // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1944 // the memory space in which the imports buffer was last modified and
1945 // the size of the imports buffer.
1946 // DistObject::doTransferNew decides where it was last modified (based on
1947 // whether communication buffers used were on host or device).
1948 const bool unpackOnHost = runKernelOnHost(imports);
1949 if (unpackOnHost) {
1950 if (this->imports_.need_sync_host()) this->imports_.sync_host();
1951 }
1952 else {
1953 if (this->imports_.need_sync_device()) this->imports_.sync_device();
1954 }
1955
1956 if (printDebugOutput) {
1957 std::ostringstream os;
1958 os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1959 << endl;
1960 std::cerr << os.str ();
1961 }
1962
1963 // We have to sync before modifying, because this method may read
1964 // as well as write (depending on the CombineMode).
1965 auto imports_d = imports.view_device ();
1966 auto imports_h = imports.view_host ();
1967 auto importLIDs_d = importLIDs.view_device ();
1968 auto importLIDs_h = importLIDs.view_host ();
1969
1970 Kokkos::DualView<size_t*, device_type> whichVecs;
1971 if (! isConstantStride ()) {
1972 Kokkos::View<const size_t*, Kokkos::HostSpace,
1973 Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1974 numVecs);
1975 whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1976 if (unpackOnHost) {
1977 whichVecs.modify_host ();
1978 // DEEP_COPY REVIEW - NOT TESTED FOR CUDA BUILD
1979 Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
1980 }
1981 else {
1982 whichVecs.modify_device ();
1983 // DEEP_COPY REVIEW - HOST-TO-DEVICE
1984 Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
1985 }
1986 }
1987 auto whichVecs_d = whichVecs.view_device ();
1988 auto whichVecs_h = whichVecs.view_host ();
1989
1990 /* The layout in the export for MultiVectors is as follows:
1991 imports = { all of the data from row exportLIDs.front() ;
1992 ....
1993 all of the data from row exportLIDs.back() }
1994 This doesn't have the best locality, but is necessary because
1995 the data for a Packet (all data associated with an LID) is
1996 required to be contiguous. */
1997
1998 if (numVecs > 0 && importLIDs.extent (0) > 0) {
1999 using dev_exec_space = typename dual_view_type::t_dev::execution_space;
2000 using host_exec_space = typename dual_view_type::t_host::execution_space;
2001
2002 // This fixes GitHub Issue #4418.
2003 const bool use_atomic_updates = unpackOnHost ?
2004 host_exec_space().concurrency () != 1 :
2005 dev_exec_space().concurrency () != 1;
2006
2007 if (printDebugOutput) {
2008 std::ostringstream os;
2009 os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
2010 std::cerr << os.str ();
2011 }
2013 // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
2014 // custom combine modes, start editing here.
2015
2016 if (CM == INSERT || CM == REPLACE) {
2017 using op_type = KokkosRefactor::Details::InsertOp;
2018 if (isConstantStride ()) {
2019 if (unpackOnHost) {
2020 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2021 unpack_array_multi_column (host_exec_space (),
2023 op_type (), numVecs,
2026
2027 }
2028 else { // unpack on device
2029 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2030 unpack_array_multi_column (dev_exec_space (),
2032 op_type (), numVecs,
2035 }
2036 }
2037 else { // not constant stride
2038 if (unpackOnHost) {
2039 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2040 unpack_array_multi_column_variable_stride (host_exec_space (),
2041 X_h, imports_h,
2044 op_type (),
2045 numVecs,
2048 }
2049 else { // unpack on device
2050 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2051 unpack_array_multi_column_variable_stride (dev_exec_space (),
2052 X_d, imports_d,
2055 op_type (),
2056 numVecs,
2059 }
2060 }
2061 }
2062 else if (CM == ADD || CM == ADD_ASSIGN) {
2063 using op_type = KokkosRefactor::Details::AddOp;
2064 if (isConstantStride ()) {
2065 if (unpackOnHost) {
2066 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2067 unpack_array_multi_column (host_exec_space (),
2069 op_type (), numVecs,
2072 }
2073 else { // unpack on device
2074 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2075 unpack_array_multi_column (dev_exec_space (),
2077 op_type (), numVecs,
2080 }
2081 }
2082 else { // not constant stride
2083 if (unpackOnHost) {
2084 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2085 unpack_array_multi_column_variable_stride (host_exec_space (),
2086 X_h, imports_h,
2089 op_type (),
2090 numVecs,
2093 }
2094 else { // unpack on device
2095 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2096 unpack_array_multi_column_variable_stride (dev_exec_space (),
2097 X_d, imports_d,
2100 op_type (),
2101 numVecs,
2104 }
2105 }
2106 }
2107 else if (CM == ABSMAX) {
2108 using op_type = KokkosRefactor::Details::AbsMaxOp;
2109 if (isConstantStride ()) {
2110 if (unpackOnHost) {
2111 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2112 unpack_array_multi_column (host_exec_space (),
2114 op_type (), numVecs,
2117 }
2118 else { // unpack on device
2119 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2120 unpack_array_multi_column (dev_exec_space (),
2122 op_type (), numVecs,
2125 }
2126 }
2127 else {
2128 if (unpackOnHost) {
2129 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2130 unpack_array_multi_column_variable_stride (host_exec_space (),
2131 X_h, imports_h,
2134 op_type (),
2135 numVecs,
2138 }
2139 else { // unpack on device
2140 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2141 unpack_array_multi_column_variable_stride (dev_exec_space (),
2142 X_d, imports_d,
2145 op_type (),
2146 numVecs,
2149 }
2150 }
2151 }
2152 else {
2153 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2154 (true, std::logic_error, "Invalid CombineMode");
2155 }
2156 }
2157 else {
2158 if (printDebugOutput) {
2159 std::ostringstream os;
2160 os << *prefix << "Nothing to unpack" << endl;
2161 std::cerr << os.str ();
2162 }
2163 }
2164
2165 if (printDebugOutput) {
2166 std::ostringstream os;
2167 os << *prefix << "Done!" << endl;
2168 std::cerr << os.str ();
2169 }
2170 }
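 // Usage sketch: packAndPrepare() and unpackAndCombine() are not called
 // directly by users; DistObject invokes them inside doImport()/doExport().
 // Hypothetical example (the Maps 'srcMap' and 'tgtMap' are assumed, not
 // defined here):
 //
 //   Tpetra::MultiVector<double> X (srcMap, 2);  // source
 //   Tpetra::MultiVector<double> Y (tgtMap, 2);  // target
 //   Tpetra::Import<> importer (srcMap, tgtMap);
 //   Y.doImport (X, importer, Tpetra::INSERT);   // packs X, unpacks into Y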
2171
2172 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2173 size_t
2175 getNumVectors () const
2176 {
2177 if (isConstantStride ()) {
2178 return static_cast<size_t> (view_.extent (1));
2179 } else {
2180 return static_cast<size_t> (whichVectors_.size ());
2181 }
2182 }
2183
2184 namespace { // (anonymous)
2185
2186 template<class RV>
2187 void
2188 gblDotImpl (const RV& dotsOut,
2189 const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2190 const bool distributed)
2191 {
2192 using Teuchos::REDUCE_MAX;
2193 using Teuchos::REDUCE_SUM;
2194 using Teuchos::reduceAll;
2195 typedef typename RV::non_const_value_type dot_type;
2196
2197 const size_t numVecs = dotsOut.extent (0);
2198
2199 // If the MultiVector is distributed over multiple processes, do
2200 // the distributed (interprocess) part of the dot product. We
2201 // assume that the MPI implementation can read from and write to
2202 // device memory.
2203 //
2204 // replaceMap() may have removed some processes. Those
2205 // processes have a null Map. They must not participate in any
2206 // collective operations. We ask first whether the Map is null,
2207 // because isDistributed() defers that question to the Map. We
2208 // still compute and return local dots for processes not
2209 // participating in collective operations; those probably don't
2210 // make any sense, but it doesn't hurt to do them, since it's
2211 // illegal to call dot() on those processes anyway.
2212 if (distributed && ! comm.is_null ()) {
2213 // The calling process only participates in the collective if
2214 // both the Map and its Comm on that process are nonnull.
2215 const int nv = static_cast<int> (numVecs);
2218 if (commIsInterComm) {
2219 // If comm is an intercomm, then we may not alias input and
2220 // output buffers, so we have to make a copy of the local
2221 // sum.
2222 typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
2223 // DEEP_COPY REVIEW - NOT TESTED
2224 Kokkos::deep_copy (lclDots, dotsOut);
2225 const dot_type* const lclSum = lclDots.data ();
2226 dot_type* const gblSum = dotsOut.data ();
2228 }
2229 else {
2230 dot_type* const inout = dotsOut.data ();
2232 }
2233 }
2234 }
2235 } // namespace (anonymous)
2236
2237 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2238 void
2241 const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
2242 {
2243 using ::Tpetra::Details::Behavior;
2244 using Kokkos::subview;
2245 using Teuchos::Comm;
2246 using Teuchos::null;
2247 using Teuchos::RCP;
2248 // View of all the dot product results.
2249 typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2250 typedef typename dual_view_type::t_dev_const XMV;
2251 const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2252
2253 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
2254
2255 const size_t numVecs = this->getNumVectors ();
2256 if (numVecs == 0) {
2257 return; // nothing to do
2258 }
2259 const size_t lclNumRows = this->getLocalLength ();
2260 const size_t numDots = static_cast<size_t> (dots.extent (0));
2261 const bool debug = Behavior::debug ();
2262
2263 if (debug) {
2264 const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
2266 (! compat, std::invalid_argument, "'*this' MultiVector is not "
2267 "compatible with the input MultiVector A. We only test for this "
2268 "in debug mode.");
2269 }
2270
2271 // FIXME (mfh 11 Jul 2014) These exception tests may not
2272 // necessarily be thrown on all processes consistently. We should
2273 // instead pass along error state with the inner product. We
2274 // could do this by setting an extra slot to
2275 // Kokkos::Details::ArithTraits<dot_type>::one() on error. The
2276 // final sum should be
2277 // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
2279 lclNumRows != A.getLocalLength (), std::runtime_error,
2280 "MultiVectors do not have the same local length. "
2281 "this->getLocalLength() = " << lclNumRows << " != "
2282 "A.getLocalLength() = " << A.getLocalLength () << ".");
2284 numVecs != A.getNumVectors (), std::runtime_error,
2285 "MultiVectors must have the same number of columns (vectors). "
2286 "this->getNumVectors() = " << numVecs << " != "
2287 "A.getNumVectors() = " << A.getNumVectors () << ".");
2289 numDots != numVecs, std::runtime_error,
2290 "The output array 'dots' must have the same number of entries as the "
2291 "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2292 numDots << " != this->getNumVectors() = " << numVecs << ".");
2293
2294 const std::pair<size_t, size_t> colRng (0, numVecs);
2295 RV dotsOut = subview (dots, colRng);
2296 RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
2297 this->getMap ()->getComm ();
2298
2299 auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2300 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2301
2302 ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
2303 this->whichVectors_.getRawPtr (),
2304 A.whichVectors_.getRawPtr (),
2305 this->isConstantStride (), A.isConstantStride ());
2306 gblDotImpl (dotsOut, comm, this->isDistributed ());
2307 }
2308
2309 namespace { // (anonymous)
2310 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2314 {
2315 using ::Tpetra::Details::ProfilingRegion;
2316 using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
2317 using dot_type = typename MV::dot_type;
2318 ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
2319
2320 auto map = x.getMap ();
2321 Teuchos::RCP<const Teuchos::Comm<int> > comm =
2322 map.is_null () ? Teuchos::null : map->getComm ();
2323 if (comm.is_null ()) {
2324 return Kokkos::ArithTraits<dot_type>::zero ();
2325 }
2326 else {
2327 using LO = LocalOrdinal;
2328 // The min just ensures that we don't overwrite memory that
2329 // doesn't belong to us, in the erroneous input case where x
2330 // and y have different numbers of rows.
2331 const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
2332 y.getLocalLength ()));
2333 const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
2334 dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
2335 dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
2337 // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2338 //const_cast<MV&>(x).sync_device ();
2339 //const_cast<MV&>(y).sync_device ();
2340
2341 auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2342 auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
2343 auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2344 auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
2345 lclDot = KokkosBlas::dot (x_1d, y_1d);
2346
2347 if (x.isDistributed ()) {
2348 using Teuchos::outArg;
2349 using Teuchos::REDUCE_SUM;
2350 using Teuchos::reduceAll;
2352 }
2353 else {
2354 gblDot = lclDot;
2355 }
2356 return gblDot;
2357 }
2358 }
2359 } // namespace (anonymous)
2360
2361
2362
2363 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2364 void
2367 const Teuchos::ArrayView<dot_type>& dots) const
2368 {
2369 const char tfecfFuncName[] = "dot: ";
2370 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
2371
2372 const size_t numVecs = this->getNumVectors ();
2373 const size_t lclNumRows = this->getLocalLength ();
2374 const size_t numDots = static_cast<size_t> (dots.size ());
2375
2376 // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2377 // not necessarily be thrown on all processes consistently. We
2378 // keep them for now, because MultiVector's unit tests insist on
2379 // them. In the future, we should instead pass along error state
2380 // with the inner product. We could do this by setting an extra
2381 // slot to Kokkos::Details::ArithTraits<dot_type>::one() on error.
2382 // The final sum should be
2383 // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
2385 (lclNumRows != A.getLocalLength (), std::runtime_error,
2386 "MultiVectors do not have the same local length. "
2387 "this->getLocalLength() = " << lclNumRows << " != "
2388 "A.getLocalLength() = " << A.getLocalLength () << ".");
2390 (numVecs != A.getNumVectors (), std::runtime_error,
2391 "MultiVectors must have the same number of columns (vectors). "
2392 "this->getNumVectors() = " << numVecs << " != "
2393 "A.getNumVectors() = " << A.getNumVectors () << ".");
2395 (numDots != numVecs, std::runtime_error,
2396 "The output array 'dots' must have the same number of entries as the "
2397 "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2398 numDots << " != this->getNumVectors() = " << numVecs << ".");
2399
2400 if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
2402 dots[0] = gblDot;
2403 }
2404 else {
2405 this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
2406 }
2407 }
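 // Usage sketch for dot() (hypothetical X, Y built on the same hypothetical
 // Map 'map'; assumes Scalar = double, so dot_type is also double):
 //
 //   Tpetra::MultiVector<double> X (map, 3), Y (map, 3);
 //   X.putScalar (1.0);
 //   Y.putScalar (2.0);
 //   Teuchos::Array<double> dots (3);
 //   X.dot (Y, dots ());  // dots[j] = global dot product of column j of X and Y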
2408
2409 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2410 void
2412 norm2 (const Teuchos::ArrayView<mag_type>& norms) const
2413 {
2415 using ::Tpetra::Details::NORM_TWO;
2416 using ::Tpetra::Details::ProfilingRegion;
2417 ProfilingRegion region ("Tpetra::MV::norm2 (host output)");
2418
2419 // The function needs to be able to sync X.
2420 MV& X = const_cast<MV&> (*this);
2421 multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
2422 }
2423
2424 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2425 void
2427 norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2428 {
2429 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2430 this->norm2 (norms_av);
2431 }
2432
2433
2434 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2435 void
2437 norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2438 {
2440 using ::Tpetra::Details::NORM_ONE;
2441 using ::Tpetra::Details::ProfilingRegion;
2442 ProfilingRegion region ("Tpetra::MV::norm1 (host output)");
2443
2444 // The function needs to be able to sync X.
2445 MV& X = const_cast<MV&> (*this);
2446 multiVectorNormImpl (norms.data (), X, NORM_ONE);
2447 }
2448
2449 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2450 void
2452 norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2453 {
2454 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2455 this->norm1 (norms_av);
2456 }
2457
2458 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2459 void
2461 normInf (const Teuchos::ArrayView<mag_type>& norms) const
2462 {
2464 using ::Tpetra::Details::NORM_INF;
2465 using ::Tpetra::Details::ProfilingRegion;
2466 ProfilingRegion region ("Tpetra::MV::normInf (host output)");
2467
2468 // The function needs to be able to sync X.
2469 MV& X = const_cast<MV&> (*this);
2470 multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
2471 }
2472
2473 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2474 void
2476 normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2477 {
2478 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2479 this->normInf (norms_av);
2480 }
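 // Usage sketch for the norm methods above (hypothetical X; mag_type is
 // double when Scalar = double):
 //
 //   Teuchos::Array<double> nrms (X.getNumVectors ());
 //   X.norm2 (nrms ());    // nrms[j] = sqrt(sum_i |X(i,j)|^2), over all processes
 //   X.norm1 (nrms ());    // nrms[j] = sum_i |X(i,j)|
 //   X.normInf (nrms ());  // nrms[j] = max_i |X(i,j)|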
2481
2482 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2483 void
2485 meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2486 {
2487 // KR FIXME Overload this method to take a View.
2488 using Kokkos::ALL;
2489 using Kokkos::subview;
2490 using Teuchos::Comm;
2491 using Teuchos::RCP;
2492 using Teuchos::reduceAll;
2493 using Teuchos::REDUCE_SUM;
2494 typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
2495
2496 const size_t lclNumRows = this->getLocalLength ();
2497 const size_t numVecs = this->getNumVectors ();
2498 const size_t numMeans = static_cast<size_t> (means.size ());
2499
2501 numMeans != numVecs, std::runtime_error,
2502 "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2503 << " != this->getNumVectors() = " << numVecs << ".");
2504
2505 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2506 const std::pair<size_t, size_t> colRng (0, numVecs);
2507
2508 // Make sure that the final output view has the same layout as the
2509 // temporary view's HostMirror. Left or Right doesn't matter for
2510 // a 1-D array anyway; this is just to placate the compiler.
2511 typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2512 typedef Kokkos::View<impl_scalar_type*,
2513 typename local_view_type::HostMirror::array_layout,
2514 Kokkos::HostSpace,
2515 Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2516 host_local_view_type meansOut (means.getRawPtr (), numMeans);
2517
2518 RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2519 this->getMap ()->getComm ();
2520
2521 // If we need sync to device, then host has the most recent version.
2522 const bool useHostVersion = this->need_sync_device ();
2523 if (useHostVersion) {
2524 // DualView was last modified on host, so run the local kernel there.
2525 auto X_lcl = subview (getLocalViewHost(Access::ReadOnly),
2526 rowRng, Kokkos::ALL ());
2527 // Compute the local sum of each column.
2528 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2529 if (isConstantStride ()) {
2530 KokkosBlas::sum (lclSums, X_lcl);
2531 }
2532 else {
2533 for (size_t j = 0; j < numVecs; ++j) {
2534 const size_t col = whichVectors_[j];
2535 KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2536 }
2537 }
2538
2539 // If there are multiple MPI processes, the all-reduce reads
2540 // from lclSums, and writes to meansOut. Otherwise, we just
2541 // copy lclSums into meansOut.
2542 if (! comm.is_null () && this->isDistributed ()) {
2543 reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2544 lclSums.data (), meansOut.data ());
2545 }
2546 else {
2547 // DEEP_COPY REVIEW - NOT TESTED
2548 Kokkos::deep_copy (meansOut, lclSums);
2549 }
2550 }
2551 else {
2552 // DualView was last modified on device, so run the local kernel there.
2553 auto X_lcl = subview (this->getLocalViewDevice(Access::ReadOnly),
2554 rowRng, Kokkos::ALL ());
2555
2556 // Compute the local sum of each column.
2557 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2558 if (isConstantStride ()) {
2559 KokkosBlas::sum (lclSums, X_lcl);
2560 }
2561 else {
2562 for (size_t j = 0; j < numVecs; ++j) {
2563 const size_t col = whichVectors_[j];
2564 KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2565 }
2566 }
2567
2568 // If there are multiple MPI processes, the all-reduce reads
2569 // from lclSums, and writes to meansOut. (We assume that MPI
2570 // can read device memory.) Otherwise, we just copy lclSums
2571 // into meansOut.
2572 if (! comm.is_null () && this->isDistributed ()) {
2573 reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2574 lclSums.data (), meansOut.data ());
2575 }
2576 else {
2577 // DEEP_COPY REVIEW - HOST-TO-HOST - NOT TESTED FOR MPI BUILD
2578 Kokkos::deep_copy (meansOut, lclSums);
2579 }
2580 }
2581
2582 // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2583 // to the magnitude type, since operator/ (std::complex<T>, int)
2584 // isn't necessarily defined.
2586 ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2587 for (size_t k = 0; k < numMeans; ++k) {
2589 }
2590 }
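 // Usage sketch for meanValue() (hypothetical X; assumes Scalar = double, so
 // impl_scalar_type is also double):
 //
 //   Teuchos::Array<double> means (X.getNumVectors ());
 //   X.meanValue (means ());  // means[j] = (global sum of column j) / getGlobalLength()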
2591
2592
2593 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2594 void
2596 randomize ()
2597 {
2598 typedef impl_scalar_type IST;
2599 typedef Kokkos::Details::ArithTraits<IST> ATS;
2600 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2601 typedef typename pool_type::generator_type generator_type;
2602
2603 const IST max = Kokkos::rand<generator_type, IST>::max ();
2604 const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2605
2606 this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2607 }
2608
2609
2610 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2611 void
2613 randomize (const Scalar& minVal, const Scalar& maxVal)
2614 {
2615 typedef impl_scalar_type IST;
2616 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2617
2618 // Seed the pseudorandom number generator using the calling
2619 // process's rank. This helps decorrelate different processes'
2620 // pseudorandom streams. It's not perfect but it's effective and
2621 // doesn't require MPI communication. The seed also includes bits
2622 // from the standard library's rand().
2623 //
2624 // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
2625 // The code below just makes a new seed each time.
2626
2627 const uint64_t myRank =
2628 static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
2629 uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
2630 unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
2631
2632 pool_type rand_pool (seed);
2633 const IST max = static_cast<IST> (maxVal);
2634 const IST min = static_cast<IST> (minVal);
2635
2636 auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2637
2638 if (isConstantStride ()) {
2639 Kokkos::fill_random (thisView, rand_pool, min, max);
2640 }
2641 else {
2642 const size_t numVecs = getNumVectors ();
2643 for (size_t k = 0; k < numVecs; ++k) {
2644 const size_t col = whichVectors_[k];
2645 auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2646 Kokkos::fill_random (X_k, rand_pool, min, max);
2647 }
2648 }
2649 }
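 // Usage sketch for randomize() (hypothetical X):
 //
 //   X.randomize ();            // uniform values between -max and +max for signed Scalar
 //   X.randomize (-1.0, 1.0);   // uniform values in the caller-supplied range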
2650
2651 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2652 void
2654 putScalar (const Scalar& alpha)
2655 {
2656 using ::Tpetra::Details::ProfilingRegion;
2657 using ::Tpetra::Details::Blas::fill;
2658 using DES = typename dual_view_type::t_dev::execution_space;
2659 using HES = typename dual_view_type::t_host::execution_space;
2660 using LO = LocalOrdinal;
2661 ProfilingRegion region ("Tpetra::MultiVector::putScalar");
2662
2663 // We need this cast for cases like Scalar = std::complex<T> but
2664 // impl_scalar_type = Kokkos::complex<T>.
2665 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2666 const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
2667 const LO numVecs = static_cast<LO> (this->getNumVectors ());
2668
2669 // Modify the most recently updated version of the data. This
2670 // avoids sync'ing, which could violate users' expectations.
2671 //
2672 // If we need sync to device, then host has the most recent version.
2673 const bool runOnHost = runKernelOnHost(*this);
2674 if (! runOnHost) {
2675 auto X = this->getLocalViewDevice(Access::OverwriteAll);
2676 if (this->isConstantStride ()) {
2677 fill (DES (), X, theAlpha, lclNumRows, numVecs);
2678 }
2679 else {
2680 fill (DES (), X, theAlpha, lclNumRows, numVecs,
2681 this->whichVectors_.getRawPtr ());
2682 }
2683 }
2684 else { // last modified in host memory, so modify data there.
2685 auto X = this->getLocalViewHost(Access::OverwriteAll);
2686 if (this->isConstantStride ()) {
2687 fill (HES (), X, theAlpha, lclNumRows, numVecs);
2688 }
2689 else {
2690 fill (HES (), X, theAlpha, lclNumRows, numVecs,
2691 this->whichVectors_.getRawPtr ());
2692 }
2693 }
2694 }
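 // Usage sketch contrasting putScalar() with scale() (hypothetical X):
 //
 //   X.putScalar (0.0);  // overwrites every entry, including any NaNs
 //   X.scale (0.0);      // multiplies by zero; NaN entries remain NaN (see scale() below)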
2695
2696
2697 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2698 void
2700 replaceMap (const Teuchos::RCP<const map_type>& newMap)
2701 {
2702 using Teuchos::ArrayRCP;
2703 using Teuchos::Comm;
2704 using Teuchos::RCP;
2705 using ST = Scalar;
2706 using LO = LocalOrdinal;
2707 using GO = GlobalOrdinal;
2708
2709 // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2710 // it might work if the MV is a column view of another MV.
2711 // However, things might go wrong when restoring the original
2712 // Map, so we don't allow this case for now.
2714 ! this->isConstantStride (), std::logic_error,
2715 "Tpetra::MultiVector::replaceMap: This method does not currently work "
2716 "if the MultiVector is a column view of another MultiVector (that is, if "
2717 "isConstantStride() == false).");
2718
2719 // Case 1: current Map and new Map are both nonnull on this process.
2720 // Case 2: current Map is nonnull, new Map is null.
2721 // Case 3: current Map is null, new Map is nonnull.
2722 // Case 4: both Maps are null: forbidden.
2723 //
2724 // Case 1 means that we don't have to do anything on this process,
2725 // other than assign the new Map. (We always have to do that.)
2726 // It's an error for the user to supply a Map that requires
2727 // resizing in this case.
2728 //
2729 // Case 2 means that the calling process is in the current Map's
2730 // communicator, but will be excluded from the new Map's
2731 // communicator. We don't have to do anything on the calling
2732 // process; just leave whatever data it may have alone.
2733 //
2734 // Case 3 means that the calling process is excluded from the
2735 // current Map's communicator, but will be included in the new
2736 // Map's communicator. This means we need to (re)allocate the
2737 // local DualView if it does not have the right number of rows.
2738 // If the new number of rows is nonzero, we'll fill the newly
2739 // allocated local data with zeros, as befits a projection
2740 // operation.
2741 //
2742 // The typical use case for Case 3 is that the MultiVector was
2743 // first created with the Map with more processes, then that Map
2744 // was replaced with a Map with fewer processes, and finally the
2745 // original Map was restored on this call to replaceMap.
2746
2747 if (this->getMap ().is_null ()) { // current Map is null
2748 // If this->getMap() is null, that means that this MultiVector
2749 // has already had replaceMap happen to it. In that case, just
2750 // reallocate the DualView with the right size.
2751
2753 newMap.is_null (), std::invalid_argument,
2754 "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2755 "This probably means that the input Map is incorrect.");
2756
2757 // Case 3: current Map is null, new Map is nonnull.
2758 // Reallocate the DualView with the right dimensions.
2759 const size_t newNumRows = newMap->getLocalNumElements ();
2760 const size_t origNumRows = view_.extent (0);
2761 const size_t numCols = this->getNumVectors ();
2762
2763 if (origNumRows != newNumRows || view_.extent (1) != numCols) {
2765 }
2766 }
2767 else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2768 // I am an excluded process. Reinitialize my data so that I
2769 // have 0 rows. Keep the number of columns as before.
2770 const size_t newNumRows = static_cast<size_t> (0);
2771 const size_t numCols = this->getNumVectors ();
2773 }
2774
2775 this->map_ = newMap;
2776 }
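 // Typical replaceMap() use, per the Case 3 discussion above (sketch;
 // 'origMap' and the reduced-communicator map 'newMap', e.g. one returned by
 // Map::removeEmptyProcesses(), are hypothetical):
 //
 //   X.replaceMap (newMap);   // processes excluded from newMap get a null Map
 //   /* ... work only on the processes that remain in newMap ... */
 //   X.replaceMap (origMap);  // restore; excluded processes reallocate zero-filled rows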
2777
2778 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2779 void
2781 scale (const Scalar& alpha)
2782 {
2783 using Kokkos::ALL;
2784 using IST = impl_scalar_type;
2785
2786 const IST theAlpha = static_cast<IST> (alpha);
2787 if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2788 return; // do nothing
2789 }
2790 const size_t lclNumRows = getLocalLength ();
2791 const size_t numVecs = getNumVectors ();
2792 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2793 const std::pair<size_t, size_t> colRng (0, numVecs);
2794
2795 // We can't substitute putScalar(0.0) for scale(0.0), because the
2796 // former will overwrite NaNs present in the MultiVector. The
2797 // semantics of this call require multiplying them by 0, which
2798 // IEEE 754 requires to be NaN.
2799
2800 // If we need sync to device, then host has the most recent version.
2801 const bool useHostVersion = need_sync_device ();
2802 if (useHostVersion) {
2803 auto Y_lcl = Kokkos::subview (getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2804 if (isConstantStride ()) {
2805 KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2806 }
2807 else {
2808 for (size_t k = 0; k < numVecs; ++k) {
2809 const size_t Y_col = whichVectors_[k];
2810 auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2811 KokkosBlas::scal (Y_k, theAlpha, Y_k);
2812 }
2813 }
2814 }
2815 else { // work on device
2816 auto Y_lcl = Kokkos::subview (getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2817 if (isConstantStride ()) {
2818 KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2819 }
2820 else {
2821 for (size_t k = 0; k < numVecs; ++k) {
2822 const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2823 auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2824 KokkosBlas::scal (Y_k, theAlpha, Y_k);
2825 }
2826 }
2827 }
2828 }
2829
2830
2831 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2832 void
2834 scale (const Teuchos::ArrayView<const Scalar>& alphas)
2835 {
2836 const size_t numVecs = this->getNumVectors ();
2837 const size_t numAlphas = static_cast<size_t> (alphas.size ());
2839 numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2840 "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2841 << numVecs << ".");
2842
2843 // Use a DualView to copy the scaling constants onto the device.
2844 using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2845 k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2846 k_alphas.modify_host ();
2847 for (size_t i = 0; i < numAlphas; ++i) {
2848 k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2849 }
2850 k_alphas.sync_device ();
2851 // Invoke the scale() overload that takes a device View of coefficients.
2852 this->scale (k_alphas.view_device ());
2853 }
2854
2855 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2856 void
2858 scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2859 {
2860 using Kokkos::ALL;
2861 using Kokkos::subview;
2862
2863 const size_t lclNumRows = this->getLocalLength ();
2864 const size_t numVecs = this->getNumVectors ();
2866 static_cast<size_t> (alphas.extent (0)) != numVecs,
2867 std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2868 "alphas.extent(0) = " << alphas.extent (0)
2869 << " != this->getNumVectors () = " << numVecs << ".");
2870 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2871 const std::pair<size_t, size_t> colRng (0, numVecs);
2872
2873 // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2874 // type of the return value of subview. This is because if we
2875 // switch the array layout from LayoutLeft to LayoutRight
2876 // (preferred for performance of block operations), the types
2877 // below won't be valid. (A view of a column of a LayoutRight
2878 // multivector has LayoutStride, not LayoutLeft.)
2879
2880 // If we need sync to device, then host has the most recent version.
2881 const bool useHostVersion = this->need_sync_device ();
2882 if (useHostVersion) {
2883 // Work in host memory. This means we need to create a host
2884 // mirror of the input View of coefficients.
2885 auto alphas_h = Kokkos::create_mirror_view (alphas);
2886 // DEEP_COPY REVIEW - NOT TESTED
2887 Kokkos::deep_copy (alphas_h, alphas);
2888
2889 auto Y_lcl = subview (this->getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2890 if (isConstantStride ()) {
2891 KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2892 }
2893 else {
2894 for (size_t k = 0; k < numVecs; ++k) {
2895 const size_t Y_col = this->isConstantStride () ? k :
2896 this->whichVectors_[k];
2897 auto Y_k = subview (Y_lcl, ALL (), Y_col);
2898 // We don't have to use the entire 1-D View here; we can use
2899 // the version that takes a scalar coefficient.
2900 KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2901 }
2902 }
2903 }
2904 else { // Work in device memory, using the input View 'alphas' directly.
2905 auto Y_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2906 if (isConstantStride ()) {
2907 KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2908 }
2909 else {
2910 // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2911 // as values on host, so copy them to host. Another approach
2912 // would be to fix scal() so that it takes a 0-D View as the
2913 // second argument.
2914 auto alphas_h = Kokkos::create_mirror_view (alphas);
2915 // DEEP_COPY REVIEW - NOT TESTED
2916 Kokkos::deep_copy (alphas_h, alphas);
2917
2918 for (size_t k = 0; k < numVecs; ++k) {
2919 const size_t Y_col = this->isConstantStride () ? k :
2920 this->whichVectors_[k];
2921 auto Y_k = subview (Y_lcl, ALL (), Y_col);
2922 KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2923 }
2924 }
2925 }
2926 }
2927
2928 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2929 void
2931 scale (const Scalar& alpha,
2933 {
2934 using Kokkos::ALL;
2935 using Kokkos::subview;
2936 const char tfecfFuncName[] = "scale: ";
2937
2938 const size_t lclNumRows = getLocalLength ();
2939 const size_t numVecs = getNumVectors ();
2940
2942 lclNumRows != A.getLocalLength (), std::invalid_argument,
2943 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2944 << A.getLocalLength () << ".");
2946 numVecs != A.getNumVectors (), std::invalid_argument,
2947 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2948 << A.getNumVectors () << ".");
2949
2950 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2951 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2952 const std::pair<size_t, size_t> colRng (0, numVecs);
2953
2954 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2955 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2956 auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
2957 auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
2958
2959 if (isConstantStride () && A.isConstantStride ()) {
2960 KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2961 }
2962 else {
2963 // Make sure that Kokkos only uses the local length for add.
2964 for (size_t k = 0; k < numVecs; ++k) {
2965 const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2966 const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2967 auto Y_k = subview (Y_lcl, ALL (), Y_col);
2968 auto X_k = subview (X_lcl, ALL (), X_col);
2969
2970 KokkosBlas::scal (Y_k, theAlpha, X_k);
2971 }
2972 }
2973 }
2974
2975
2976
2977 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2978 void
2981 {
2982 const char tfecfFuncName[] = "reciprocal: ";
2983
2985 getLocalLength () != A.getLocalLength (), std::runtime_error,
2986 "MultiVectors do not have the same local length. "
2987 "this->getLocalLength() = " << getLocalLength ()
2988 << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2990 A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2991 ": MultiVectors do not have the same number of columns (vectors). "
2992 "this->getNumVectors() = " << getNumVectors ()
2993 << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2994
2995 const size_t numVecs = getNumVectors ();
2996
2997 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2998 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2999
3000 if (isConstantStride () && A.isConstantStride ()) {
3001 KokkosBlas::reciprocal (this_view_dev, A_view_dev);
3002 }
3003 else {
3004 using Kokkos::ALL;
3005 using Kokkos::subview;
3006 for (size_t k = 0; k < numVecs; ++k) {
3007 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3008 auto vector_k = subview (this_view_dev, ALL (), this_col);
3009 const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3010 auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3011 KokkosBlas::reciprocal (vector_k, vector_Ak);
3012 }
3013 }
3014 }
3015
3016 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3017 void
3020 {
3021 const char tfecfFuncName[] = "abs";
3022
3024 getLocalLength () != A.getLocalLength (), std::runtime_error,
3025 ": MultiVectors do not have the same local length. "
3026 "this->getLocalLength() = " << getLocalLength ()
3027 << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3029 A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3030 ": MultiVectors do not have the same number of columns (vectors). "
3031 "this->getNumVectors() = " << getNumVectors ()
3032 << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3033 const size_t numVecs = getNumVectors ();
3034
3035 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
3036 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
3037
3038 if (isConstantStride () && A.isConstantStride ()) {
3039 KokkosBlas::abs (this_view_dev, A_view_dev);
3040 }
3041 else {
3042 using Kokkos::ALL;
3043 using Kokkos::subview;
3044
3045 for (size_t k=0; k < numVecs; ++k) {
3046 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3047 auto vector_k = subview (this_view_dev, ALL (), this_col);
3048 const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3049 auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3050 KokkosBlas::abs (vector_k, vector_Ak);
3051 }
3052 }
3053 }
3054
3055 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3056 void
3058 update (const Scalar& alpha,
3060 const Scalar& beta)
3061 {
3062 const char tfecfFuncName[] = "update: ";
3063 using Kokkos::subview;
3064 using Kokkos::ALL;
3065
3066 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
3067
3068 const size_t lclNumRows = getLocalLength ();
3069 const size_t numVecs = getNumVectors ();
3070
3073 lclNumRows != A.getLocalLength (), std::invalid_argument,
3074 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
3075 << A.getLocalLength () << ".");
3077 numVecs != A.getNumVectors (), std::invalid_argument,
3078 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
3079 << A.getNumVectors () << ".");
3080 }
3081
3082 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3083 const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3084 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3085 const std::pair<size_t, size_t> colRng (0, numVecs);
3086
3087 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
3088 auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
3089 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
3090 auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
3091
3092 // The device memory of *this is about to be modified
3093 if (isConstantStride () && A.isConstantStride ()) {
3094 KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
3095 }
3096 else {
3097 // Make sure that Kokkos only uses the local length for add.
3098 for (size_t k = 0; k < numVecs; ++k) {
3099 const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3100 const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3101 auto Y_k = subview (Y_lcl, ALL (), Y_col);
3102 auto X_k = subview (X_lcl, ALL (), X_col);
3103
3104 KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
3105 }
3106 }
3107 }
3108
3109 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3110 void
3112 update (const Scalar& alpha,
3114 const Scalar& beta,
3116 const Scalar& gamma)
3117 {
3118 using Kokkos::ALL;
3119 using Kokkos::subview;
3120
3121 const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3122
3123 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
3124
3125 const size_t lclNumRows = this->getLocalLength ();
3126 const size_t numVecs = getNumVectors ();
3127
3130 lclNumRows != A.getLocalLength (), std::invalid_argument,
3131 "The input MultiVector A has " << A.getLocalLength () << " local "
3132 "row(s), but this MultiVector has " << lclNumRows << " local "
3133 "row(s).");
3135 lclNumRows != B.getLocalLength (), std::invalid_argument,
3136 "The input MultiVector B has " << B.getLocalLength () << " local "
3137 "row(s), but this MultiVector has " << lclNumRows << " local "
3138 "row(s).");
3140 A.getNumVectors () != numVecs, std::invalid_argument,
3141 "The input MultiVector A has " << A.getNumVectors () << " column(s), "
3142 "but this MultiVector has " << numVecs << " column(s).");
3144 B.getNumVectors () != numVecs, std::invalid_argument,
3145 "The input MultiVector B has " << B.getNumVectors () << " column(s), "
3146 "but this MultiVector has " << numVecs << " column(s).");
3147 }
3148
3149 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3150 const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3151 const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
3152
3153 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3154 const std::pair<size_t, size_t> colRng (0, numVecs);
3155
3156 // Prefer 'auto' over specifying the type explicitly. This avoids
3157 // issues with a subview possibly having a different type than the
3158 // original view.
3159 auto C_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
3160 auto A_lcl = subview (A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3161 auto B_lcl = subview (B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3162
3163 if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
3164 KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3165 }
3166 else {
3167 // Some input (or *this) is not constant stride,
3168 // so perform the update one column at a time.
3169 for (size_t k = 0; k < numVecs; ++k) {
3170 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3171 const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
3172 const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
3173 KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
3174 theBeta, subview (B_lcl, rowRng, B_col),
3175 theGamma, subview (C_lcl, rowRng, this_col));
3176 }
3177 }
3178 }
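 // Usage sketch for the two update() overloads above (hypothetical X, Y, Z
 // with matching Maps and numbers of columns):
 //
 //   Y.update (2.0, X, 1.0);          // Y := 2*X + 1*Y
 //   Z.update (0.5, X, 0.5, Y, 0.0);  // Z := 0.5*X + 0.5*Y + 0*Z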
3179
3180 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3181 Teuchos::ArrayRCP<const Scalar>
3183 getData (size_t j) const
3184 {
3185 using Kokkos::ALL;
3186 using IST = impl_scalar_type;
3187 const char tfecfFuncName[] = "getData: ";
3188
3189 // Any MultiVector method that called the (classic) Kokkos Node's
3190 // viewBuffer or viewBufferNonConst methods always implied a
3191 // device->host synchronization. Thus, we synchronize here as
3192 // well.
3193
3194 auto hostView = getLocalViewHost(Access::ReadOnly);
3195 const size_t col = isConstantStride () ? j : whichVectors_[j];
3196 auto hostView_j = Kokkos::subview (hostView, ALL (), col);
3197 Teuchos::ArrayRCP<const IST> dataAsArcp =
3198 Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3199
3201 (static_cast<size_t> (hostView_j.extent (0)) <
3202 static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3203 "hostView_j.extent(0) = " << hostView_j.extent (0)
3204 << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3205 "Please report this bug to the Tpetra developers.");
3206
3207 return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3208 }
3209
3210 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3211 Teuchos::ArrayRCP<Scalar>
3213 getDataNonConst (size_t j)
3214 {
3215 using Kokkos::ALL;
3216 using Kokkos::subview;
3217 using IST = impl_scalar_type;
3218 const char tfecfFuncName[] = "getDataNonConst: ";
3219
3220 auto hostView = getLocalViewHost(Access::ReadWrite);
3221 const size_t col = isConstantStride () ? j : whichVectors_[j];
3222 auto hostView_j = subview (hostView, ALL (), col);
3223 Teuchos::ArrayRCP<IST> dataAsArcp =
3224 Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3225
3227 (static_cast<size_t> (hostView_j.extent (0)) <
3228 static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3229 "hostView_j.extent(0) = " << hostView_j.extent (0)
3230 << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3231 "Please report this bug to the Tpetra developers.");
3232
3233 return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3234 }
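  /* Usage sketch (editor's example, not part of the original source file).
     getData(j) gives read-only host access to column j as a Teuchos::ArrayRCP;
     getDataNonConst(j) gives read-write access and marks the host data as
     modified.  X is an assumed, already-constructed Tpetra::MultiVector<>
     (default Scalar double) and j a valid column index.

     \code
     double localSum = 0.0;
     {
       Teuchos::ArrayRCP<const double> xj = X.getData (j);   // read-only column j
       for (size_t i = 0; i < X.getLocalLength (); ++i) {
         localSum += xj[i];
       }
     }
     {
       Teuchos::ArrayRCP<double> xj = X.getDataNonConst (j);  // read-write column j
       xj[0] = localSum;
     }
     \endcode
  */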
3235
3236 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3237 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3239 subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3240 {
3241 using Teuchos::RCP;
3243
3244 // Check whether the index set in cols is contiguous. If it is,
3245 // use the more efficient Range1D version of subCopy.
3246 bool contiguous = true;
3247 const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3248 for (size_t j = 1; j < numCopyVecs; ++j) {
3249 if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3250 contiguous = false;
3251 break;
3252 }
3253 }
3254 if (contiguous && numCopyVecs > 0) {
3255 return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3256 }
3257 else {
3258 RCP<const MV> X_sub = this->subView (cols);
3259 RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3260 Y->assign (*X_sub);
3261 return Y;
3262 }
3263 }
3264
3265 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3266 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3268 subCopy (const Teuchos::Range1D &colRng) const
3269 {
3270 using Teuchos::RCP;
3272
3273 RCP<const MV> X_sub = this->subView (colRng);
3274 RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3275 Y->assign (*X_sub);
3276 return Y;
3277 }
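  /* Usage sketch (editor's example, not part of the original source file),
     contrasting subCopy (deep copy) with subView / subViewNonConst (shallow
     view).  X is an assumed, non-const Tpetra::MultiVector<> with at least
     three columns.

     \code
     Teuchos::Range1D cols (1, 2);              // columns 1 and 2, inclusive
     auto Xcopy = X.subCopy (cols);             // deep copy: independent storage
     auto Xview = X.subViewNonConst (cols);     // shallow view: shares X's storage
     Xview->putScalar (0.0);                    // also zeros columns 1 and 2 of X
     // *Xcopy still holds the original values, because subCopy allocated new memory.
     \endcode
  */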
3278
3279 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3280 size_t
3282 getOrigNumLocalRows () const {
3283 return view_.origExtent(0);
3284 }
3285
3286 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3287 size_t
3289 getOrigNumLocalCols () const {
3290 return view_.origExtent(1);
3291 }
3292
3293 template <class Scalar, class LO, class GO, class Node>
3294  MultiVector<Scalar, LO, GO, Node>::
3295  MultiVector (const MultiVector<Scalar, LO, GO, Node>& X,
3296               const Teuchos::RCP<const map_type>& subMap,
3297               const local_ordinal_type rowOffset) :
3298 base_type (subMap)
3299 {
3300 using Kokkos::ALL;
3301 using Kokkos::subview;
3302 using Teuchos::outArg;
3303 using Teuchos::RCP;
3304 using Teuchos::rcp;
3305 using Teuchos::reduceAll;
3306 using Teuchos::REDUCE_MIN;
3307 using std::endl;
3309 const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3310 const char suffix[] = "Please report this bug to the Tpetra developers.";
3311 int lclGood = 1;
3312 int gblGood = 1;
3313 std::unique_ptr<std::ostringstream> errStrm;
3314 const bool debug = ::Tpetra::Details::Behavior::debug ();
3315 const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3316
3317 // Be careful to use the input Map's communicator, not X's. The
3318 // idea is that, on return, *this is a subview of X, using the
3319 // input Map.
3320 const auto comm = subMap->getComm ();
3321 TEUCHOS_ASSERT( ! comm.is_null () );
3322 const int myRank = comm->getRank ();
3323
3324 const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3325 const LO numCols = static_cast<LO> (X.getNumVectors ());
3326 const LO newNumRows = static_cast<LO> (subMap->getLocalNumElements ());
3327 if (verbose) {
3328 std::ostringstream os;
3329 os << "Proc " << myRank << ": " << prefix
3330 << "X: {lclNumRows: " << lclNumRowsBefore
3331 << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3332 << ", numCols: " << numCols << "}, "
3333 << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3334 std::cerr << os.str ();
3335 }
3336 // We ask for the _original_ number of rows in X, because X could
3337 // be a shorter (fewer rows) view of a longer MV. (For example, X
3338 // could be a domain Map view of a column Map MV.)
3339 const bool tooManyElts =
3340 newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3341 if (tooManyElts) {
3342 errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3343 *errStrm << " Proc " << myRank << ": subMap->getLocalNumElements() (="
3344 << newNumRows << ") + rowOffset (=" << rowOffset
3345 << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3346 << ")." << endl;
3347 lclGood = 0;
3349 (! debug && tooManyElts, std::invalid_argument,
3350 prefix << errStrm->str () << suffix);
3351 }
3352
3353 if (debug) {
3355 if (gblGood != 1) {
3356 std::ostringstream gblErrStrm;
3357 const std::string myErrStr =
3358 errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3361 (true, std::invalid_argument, gblErrStrm.str ());
3362 }
3363 }
3364
3365 using range_type = std::pair<LO, LO>;
3366 const range_type origRowRng
3367 (rowOffset, static_cast<LO> (X.view_.origExtent (0)));
3368    const range_type rowRng
3369      (rowOffset, rowOffset + newNumRows);
3370
3371 wrapped_dual_view_type newView = takeSubview (X.view_, rowRng, ALL ());
3372
3373 // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3374 // handling subviews of degenerate Views quite so well. For some
3375 // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3376 // 0. We work around by creating a new empty DualView of the
3377 // desired (degenerate) dimensions.
3378 if (newView.extent (0) == 0 &&
3379 newView.extent (1) != X.view_.extent (1)) {
3380 newView =
3381 allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3382 }
3383
3384 MV subViewMV = X.isConstantStride () ?
3385 MV (subMap, newView) :
3386 MV (subMap, newView, X.whichVectors_ ());
3387
3388 if (debug) {
3389 const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3390 const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3391 if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3392 lclGood = 0;
3393 if (errStrm.get () == nullptr) {
3394 errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3395 }
3396 *errStrm << " Proc " << myRank <<
3397 ": subMap.getLocalNumElements(): " << newNumRows <<
3398 ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3399 ", X.getNumVectors(): " << numCols <<
3400 ", subViewMV.getNumVectors(): " << numColsRet << endl;
3401 }
3403 if (gblGood != 1) {
3404 std::ostringstream gblErrStrm;
3405 if (myRank == 0) {
3406 gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3407 "dimensions on one or more processes:" << endl;
3408 }
3409 const std::string myErrStr =
3410 errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3412 gblErrStrm << suffix << endl;
3414 (true, std::invalid_argument, gblErrStrm.str ());
3415 }
3416 }
3417
3418 if (verbose) {
3419 std::ostringstream os;
3420 os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3421 std::cerr << os.str ();
3422 }
3423
3424 *this = subViewMV; // shallow copy
3425
3426 if (verbose) {
3427 std::ostringstream os;
3428 os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3429 std::cerr << os.str ();
3430 }
3431 }
3432
3433 template <class Scalar, class LO, class GO, class Node>
3434  MultiVector<Scalar, LO, GO, Node>::
3435  MultiVector (const MultiVector<Scalar, LO, GO, Node>& X,
3436               const map_type& subMap,
3437               const size_t rowOffset) :
3438    MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3439                 static_cast<local_ordinal_type> (rowOffset))
3440  {}
3441
3442 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3443 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3445 offsetView (const Teuchos::RCP<const map_type>& subMap,
3446 const size_t offset) const
3447 {
3449 return Teuchos::rcp (new MV (*this, *subMap, offset));
3450 }
3451
3452 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3453 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3455 offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3456 const size_t offset)
3457 {
3459 return Teuchos::rcp (new MV (*this, *subMap, offset));
3460 }
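  /* Usage sketch (editor's example, not part of the original source file).
     offsetView / offsetViewNonConst view a contiguous block of local rows of
     an existing MultiVector through a (sub)Map.  topMap and botMap are
     hypothetical Maps whose local sizes are lclTop and
     X.getLocalLength() - lclTop on each process.

     \code
     auto Xtop = X.offsetViewNonConst (topMap, 0);       // local rows [0, lclTop)
     auto Xbot = X.offsetViewNonConst (botMap, lclTop);  // local rows [lclTop, end)
     Xtop->putScalar (1.0);  // writes through to the top block of X
     \endcode
  */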
3461
3462 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3463 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3465 subView (const Teuchos::ArrayView<const size_t>& cols) const
3466 {
3467 using Teuchos::Array;
3468 using Teuchos::rcp;
3470
3471 const size_t numViewCols = static_cast<size_t> (cols.size ());
3473 numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3474 "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3475 "contain at least one entry, but cols.size() = " << cols.size ()
3476 << " == 0.");
3477
3478 // Check whether the index set in cols is contiguous. If it is,
3479 // use the more efficient Range1D version of subView.
3480 bool contiguous = true;
3481 for (size_t j = 1; j < numViewCols; ++j) {
3482 if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3483 contiguous = false;
3484 break;
3485 }
3486 }
3487 if (contiguous) {
3488 if (numViewCols == 0) {
3489 // The output MV has no columns, so there is nothing to view.
3490 return rcp (new MV (this->getMap (), numViewCols));
3491 } else {
3492 // Use the more efficient contiguous-index-range version.
3493 return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3494 }
3495 }
3496
3497 if (isConstantStride ()) {
3498 return rcp (new MV (this->getMap (), view_, cols));
3499 }
3500 else {
3501 Array<size_t> newcols (cols.size ());
3502 for (size_t j = 0; j < numViewCols; ++j) {
3503 newcols[j] = whichVectors_[cols[j]];
3504 }
3505 return rcp (new MV (this->getMap (), view_, newcols ()));
3506 }
3507 }
3508
3509
3510 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3511 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3513 subView (const Teuchos::Range1D& colRng) const
3514 {
3515 using ::Tpetra::Details::Behavior;
3516 using Kokkos::ALL;
3517 using Kokkos::subview;
3518 using Teuchos::Array;
3519 using Teuchos::RCP;
3520 using Teuchos::rcp;
3522 const char tfecfFuncName[] = "subView(Range1D): ";
3523
3524 const size_t lclNumRows = this->getLocalLength ();
3525 const size_t numVecs = this->getNumVectors ();
3526 // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3527 // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3528 // "at least one vector.");
3530 static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3531 "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3532 << numVecs << ".");
3534 numVecs != 0 && colRng.size () != 0 &&
3535 (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3536 static_cast<size_t> (colRng.ubound ()) >= numVecs),
3537 std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3538 "," << colRng.ubound () << "] exceeds the valid range of column indices "
3539 "[0, " << numVecs << "].");
3540
3541 RCP<const MV> X_ret; // the MultiVector subview to return
3542
3543 // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3544 // broken for the case of views with zero rows. I will brutally
3545 // enforce that the subview has the correct dimensions. In
3546 // particular, in the case of zero rows, I will, if necessary,
3547 // create a new dual_view_type with zero rows and the correct
3548 // number of columns. In a debug build, I will use an all-reduce
3549 // to ensure that it has the correct dimensions on all processes.
3550
3551 const std::pair<size_t, size_t> rows (0, lclNumRows);
3552 if (colRng.size () == 0) {
3553 const std::pair<size_t, size_t> cols (0, 0); // empty range
3554 wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3555 X_ret = rcp (new MV (this->getMap (), X_sub));
3556 }
3557 else {
3558 // Returned MultiVector is constant stride only if *this is.
3559 if (isConstantStride ()) {
3560 const std::pair<size_t, size_t> cols (colRng.lbound (),
3561 colRng.ubound () + 1);
3562 wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3563 X_ret = rcp (new MV (this->getMap (), X_sub));
3564 }
3565 else {
3566 if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3567 // We're only asking for one column, so the result does have
3568 // constant stride, even though this MultiVector does not.
3569 const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3570 whichVectors_[0] + colRng.ubound () + 1);
3571 wrapped_dual_view_type X_sub = takeSubview (view_, ALL (), col);
3572 X_ret = rcp (new MV (this->getMap (), X_sub));
3573 }
3574 else {
3575 Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3576 whichVectors_.begin () + colRng.ubound () + 1);
3577 X_ret = rcp (new MV (this->getMap (), view_, which));
3578 }
3579 }
3580 }
3581
3582 const bool debug = Behavior::debug ();
3583 if (debug) {
3584 using Teuchos::Comm;
3585 using Teuchos::outArg;
3586 using Teuchos::REDUCE_MIN;
3587 using Teuchos::reduceAll;
3588
3589 RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3590 Teuchos::null : this->getMap ()->getComm ();
3591 if (! comm.is_null ()) {
3592 int lclSuccess = 1;
3593 int gblSuccess = 1;
3594
3595 if (X_ret.is_null ()) {
3596 lclSuccess = 0;
3597 }
3600 (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3601 "MultiVector; the return value of this method) is null on some MPI "
3602 "process in this MultiVector's communicator. This should never "
3603 "happen. Please report this bug to the Tpetra developers.");
3604 if (! X_ret.is_null () &&
3605 X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3606 lclSuccess = 0;
3607 }
3609 outArg (gblSuccess));
3611 (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3612 "colRng.size(), on at least one MPI process in this MultiVector's "
3613 "communicator. This should never happen. "
3614 "Please report this bug to the Tpetra developers.");
3615 }
3616 }
3617 return X_ret;
3618 }
3619
3620
3621 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3622 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3624 subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3625 {
3627 return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3628 }
3629
3630
3631 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3632 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3634 subViewNonConst (const Teuchos::Range1D &colRng)
3635 {
3637 return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3638 }
3639
3640
3641 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3642  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3643  MultiVector (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X,
3644               const size_t j)
3645 : base_type (X.getMap ())
3646 {
3647 using Kokkos::subview;
3648 typedef std::pair<size_t, size_t> range_type;
3649 const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3650
3651 const size_t numCols = X.getNumVectors ();
3653 (j >= numCols, std::invalid_argument, "Input index j (== " << j
3654 << ") exceeds valid column index range [0, " << numCols << " - 1].");
3655 const size_t jj = X.isConstantStride () ?
3656 static_cast<size_t> (j) :
3657 static_cast<size_t> (X.whichVectors_[j]);
3658 this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3659
3660 // mfh 31 Jul 2017: It would be unwise to execute concurrent
3661 // Export or Import operations with different subviews of a
3662 // MultiVector. Thus, it is safe to reuse communication buffers.
3663 // See #1560 discussion.
3664 //
3665 // We only need one column's worth of buffer for imports_ and
3666 // exports_. Taking subviews now ensures that their lengths will
3667 // be exactly what we need, so we won't have to resize them later.
3668 {
3669 const size_t newSize = X.imports_.extent (0) / numCols;
3670 const size_t offset = jj*newSize;
3671 auto newImports = X.imports_;
3672 newImports.d_view = subview (X.imports_.d_view,
3673 range_type (offset, offset+newSize));
3674 newImports.h_view = subview (X.imports_.h_view,
3675 range_type (offset, offset+newSize));
3676 this->imports_ = newImports;
3677 }
3678 {
3679 const size_t newSize = X.exports_.extent (0) / numCols;
3680 const size_t offset = jj*newSize;
3681 auto newExports = X.exports_;
3682 newExports.d_view = subview (X.exports_.d_view,
3683 range_type (offset, offset+newSize));
3684 newExports.h_view = subview (X.exports_.h_view,
3685 range_type (offset, offset+newSize));
3686 this->exports_ = newExports;
3687 }
3688 // These two DualViews already either have the right number of
3689 // entries, or zero entries. This means that we don't need to
3690 // resize them.
3691 this->numImportPacketsPerLID_ = X.numImportPacketsPerLID_;
3692 this->numExportPacketsPerLID_ = X.numExportPacketsPerLID_;
3693 }
3694
3695
3696 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3697 Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3699 getVector (const size_t j) const
3700 {
3702 return Teuchos::rcp (new V (*this, j));
3703 }
3704
3705
3706 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3707 Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3709 getVectorNonConst (const size_t j)
3710 {
3712 return Teuchos::rcp (new V (*this, j));
3713 }
3714
3715
3716 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3717 void
3719 get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3720 {
3721 using IST = impl_scalar_type;
3722 using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3723 Kokkos::HostSpace,
3724 Kokkos::MemoryUnmanaged>;
3725 const char tfecfFuncName[] = "get1dCopy: ";
3726
3727 const size_t numRows = this->getLocalLength ();
3728 const size_t numCols = this->getNumVectors ();
3729
3731 (LDA < numRows, std::runtime_error,
3732 "LDA = " << LDA << " < numRows = " << numRows << ".");
3734 (numRows > size_t (0) && numCols > size_t (0) &&
3735 size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3736 std::runtime_error,
3737 "A.size() = " << A.size () << ", but its size must be at least "
3738 << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3739
3740 const std::pair<size_t, size_t> rowRange (0, numRows);
3741 const std::pair<size_t, size_t> colRange (0, numCols);
3742
3743 input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3744 LDA, numCols);
3745 auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3746
3758 if (this->need_sync_host() && this->need_sync_device()) {
3761 throw std::runtime_error("Tpetra::MultiVector: A non-const view is alive outside and we cannot give a copy where host or device view can be modified outside");
3762 }
3763 else {
3764 const bool useHostView = view_.host_view_use_count() >= view_.device_view_use_count();
3765 if (this->isConstantStride ()) {
3766 if (useHostView) {
3767 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3768 // DEEP_COPY REVIEW - NOT TESTED
3769 Kokkos::deep_copy (A_view, srcView_host);
3770 } else {
3771 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3772 // DEEP_COPY REVIEW - NOT TESTED
3773 Kokkos::deep_copy (A_view, srcView_device);
3774 }
3775 }
3776 else {
3777 for (size_t j = 0; j < numCols; ++j) {
3778 const size_t srcCol = this->whichVectors_[j];
3779 auto dstColView = Kokkos::subview (A_view, rowRange, j);
3780
3781 if (useHostView) {
3782 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3783 auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3784 // DEEP_COPY REVIEW - NOT TESTED
3785 Kokkos::deep_copy (dstColView, srcColView_host);
3786 } else {
3787 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3788 auto srcColView_device = Kokkos::subview (srcView_device, rowRange, srcCol);
3789 // DEEP_COPY REVIEW - NOT TESTED
3790 Kokkos::deep_copy (dstColView, srcColView_device);
3791 }
3792 }
3793 }
3794 }
3795 }
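  /* Usage sketch (editor's example, not part of the original source file).
     get1dCopy packs the local rows of all columns into a caller-provided,
     column-major host buffer with column stride LDA >= getLocalLength().
     X is an assumed Tpetra::MultiVector<> with Scalar = double.

     \code
     const size_t numRows = X.getLocalLength ();
     const size_t numCols = X.getNumVectors ();
     const size_t LDA = numRows;                 // column stride of the buffer
     Teuchos::Array<double> buf (LDA * numCols);
     X.get1dCopy (buf (), LDA);                  // afterwards buf[i + j*LDA] == X(i,j) locally
     \endcode
  */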
3796
3797
3798 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3799 void
3801 get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3802 {
3804 const char tfecfFuncName[] = "get2dCopy: ";
3805 const size_t numRows = this->getLocalLength ();
3806 const size_t numCols = this->getNumVectors ();
3807
3809 static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3810 std::runtime_error, "Input array of pointers must contain as many "
3811 "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3812 << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3813
3814 if (numRows != 0 && numCols != 0) {
3815 // No side effects until we've validated the input.
3816 for (size_t j = 0; j < numCols; ++j) {
3817 const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3819 dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3820 "the input array of arrays is not long enough to fit all entries in "
3821 "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3822 << " < getLocalLength() = " << numRows << ".");
3823 }
3824
3825 // We've validated the input, so it's safe to start copying.
3826 for (size_t j = 0; j < numCols; ++j) {
3827 Teuchos::RCP<const V> X_j = this->getVector (j);
3828 const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3829 X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3830 }
3831 }
3832 }
3833
3834
3835 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3836 Teuchos::ArrayRCP<const Scalar>
3838 get1dView () const
3839 {
3840 if (getLocalLength () == 0 || getNumVectors () == 0) {
3841 return Teuchos::null;
3842 } else {
3844 ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3845 "get1dView: This MultiVector does not have constant stride, so it is "
3846 "not possible to view its data as a single array. You may check "
3847 "whether a MultiVector has constant stride by calling "
3848 "isConstantStride().");
3849 // Since get1dView() is and was always marked const, I have to
3850 // cast away const here in order not to break backwards
3851 // compatibility.
3852 auto X_lcl = getLocalViewHost(Access::ReadOnly);
3853 Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3854 Kokkos::Compat::persistingView (X_lcl);
3855 return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3856 }
3857 }
3858
3859 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3860 Teuchos::ArrayRCP<Scalar>
3863 {
3864 if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3865 return Teuchos::null;
3866 }
3867 else {
3869 (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3870 "get1dViewNonConst: This MultiVector does not have constant stride, "
3871 "so it is not possible to view its data as a single array. You may "
3872 "check whether a MultiVector has constant stride by calling "
3873 "isConstantStride().");
3874 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3875 Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3876 Kokkos::Compat::persistingView (X_lcl);
3877 return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3878 }
3879 }
3880
3881 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3882 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3885 {
3886 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3887
3888 // Don't use the row range here on the outside, in order to avoid
3889 // a strided return type (in case Kokkos::subview is conservative
3890 // about that). Instead, use the row range for the column views
3891 // in the loop.
3892 const size_t myNumRows = this->getLocalLength ();
3893 const size_t numCols = this->getNumVectors ();
3894 const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3895
3896 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3897 for (size_t j = 0; j < numCols; ++j) {
3898 const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3899 auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3900 Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3901 Kokkos::Compat::persistingView (X_lcl_j);
3902 views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3903 }
3904 return views;
3905 }
3906
3907 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3910 getLocalViewHost(Access::ReadOnlyStruct s) const
3911 {
3912 return view_.getHostView(s);
3913 }
3914
3915 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3918 getLocalViewHost(Access::ReadWriteStruct s)
3919 {
3920 return view_.getHostView(s);
3921 }
3922
3923 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3926 getLocalViewHost(Access::OverwriteAllStruct s)
3927 {
3928 return view_.getHostView(s);
3929 }
3930
3931 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3934 getLocalViewDevice(Access::ReadOnlyStruct s) const
3935 {
3936 return view_.getDeviceView(s);
3937 }
3938
3939 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3942 getLocalViewDevice(Access::ReadWriteStruct s)
3943 {
3944 return view_.getDeviceView(s);
3945 }
3946
3947 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3950 getLocalViewDevice(Access::OverwriteAllStruct s)
3951 {
3952 return view_.getDeviceView(s);
3953 }
3954
3955 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3956 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3958 get2dView () const
3959 {
3960 // Since get2dView() is and was always marked const, I have to
3961 // cast away const here in order not to break backwards
3962 // compatibility.
3963 auto X_lcl = getLocalViewHost(Access::ReadOnly);
3964
3965 // Don't use the row range here on the outside, in order to avoid
3966 // a strided return type (in case Kokkos::subview is conservative
3967 // about that). Instead, use the row range for the column views
3968 // in the loop.
3969 const size_t myNumRows = this->getLocalLength ();
3970 const size_t numCols = this->getNumVectors ();
3971 const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3972
3973 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
3974 for (size_t j = 0; j < numCols; ++j) {
3975 const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3976 auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3977 Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3978 Kokkos::Compat::persistingView (X_lcl_j);
3979 views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
3980 }
3981 return views;
3982 }
3983
3984 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3985  void
3986  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3987  multiply (Teuchos::ETransp transA,
3988            Teuchos::ETransp transB,
3989            const Scalar& alpha,
3990            const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3991            const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
3992            const Scalar& beta)
3993 {
3994 using ::Tpetra::Details::ProfilingRegion;
3995 using Teuchos::CONJ_TRANS;
3996 using Teuchos::NO_TRANS;
3997 using Teuchos::TRANS;
3998 using Teuchos::RCP;
3999 using Teuchos::rcp;
4000 using Teuchos::rcpFromRef;
4001 using std::endl;
4002 using ATS = Kokkos::ArithTraits<impl_scalar_type>;
4003 using LO = local_ordinal_type;
4004 using STS = Teuchos::ScalarTraits<Scalar>;
4005    using MV = MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
4006    const char tfecfFuncName[] = "multiply: ";
4007 ProfilingRegion region ("Tpetra::MV::multiply");
4008
4009 // This routine performs a variety of matrix-matrix multiply
4010    // operations, interpreting the MultiVectors (this, a.k.a. C, along with A and B)
4011 // as 2D matrices. Variations are due to the fact that A, B and C
4012 // can be locally replicated or globally distributed MultiVectors
4013 // and that we may or may not operate with the transpose of A and
4014 // B. Possible cases are:
4015 //
4016 // Operations # Cases Notes
4017 // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
4018 // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
4019 // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
4020 //
4021 // The following operations are not meaningful for 1-D
4022 // distributions:
4023 //
4024 // u1) C(local) = A^T(distr) * B^T(distr) 1
4025 // u2) C(local) = A (distr) * B^X(distr) 2
4026 // u3) C(distr) = A^X(local) * B^X(local) 4
4027 // u4) C(distr) = A^X(local) * B^X(distr) 4
4028 // u5) C(distr) = A^T(distr) * B^X(local) 2
4029 // u6) C(local) = A^X(distr) * B^X(local) 4
4030 // u7) C(distr) = A^X(distr) * B^X(local) 4
4031 // u8) C(local) = A^X(local) * B^X(distr) 4
4032 //
4033 // Total number of cases: 32 (= 2^5).
4034
4035 impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
4036
4037 const bool A_is_local = ! A.isDistributed ();
4038 const bool B_is_local = ! B.isDistributed ();
4039 const bool C_is_local = ! this->isDistributed ();
4040
4041 // In debug mode, check compatibility of local dimensions. We
4042 // only do this in debug mode, since it requires an all-reduce
4043    // to ensure correctness on all processes. It's entirely
4044 // possible that only some processes may have incompatible local
4045 // dimensions. Throwing an exception only on those processes
4046 // could cause this method to hang.
4047 const bool debug = ::Tpetra::Details::Behavior::debug ();
4048 if (debug) {
4049 auto myMap = this->getMap ();
4050 if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
4051 using Teuchos::REDUCE_MIN;
4052 using Teuchos::reduceAll;
4053 using Teuchos::outArg;
4054
4055 auto comm = myMap->getComm ();
4056 const size_t A_nrows =
4057 (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
4058 const size_t A_ncols =
4059 (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
4060 const size_t B_nrows =
4061 (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
4062 const size_t B_ncols =
4063 (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
4064
4065 const bool lclBad = this->getLocalLength () != A_nrows ||
4066 this->getNumVectors () != B_ncols || A_ncols != B_nrows;
4067
4068 const int myRank = comm->getRank ();
4069 std::ostringstream errStrm;
4070 if (this->getLocalLength () != A_nrows) {
4071 errStrm << "Proc " << myRank << ": this->getLocalLength()="
4072 << this->getLocalLength () << " != A_nrows=" << A_nrows
4073 << "." << std::endl;
4074 }
4075 if (this->getNumVectors () != B_ncols) {
4076 errStrm << "Proc " << myRank << ": this->getNumVectors()="
4077 << this->getNumVectors () << " != B_ncols=" << B_ncols
4078 << "." << std::endl;
4079 }
4080 if (A_ncols != B_nrows) {
4081 errStrm << "Proc " << myRank << ": A_ncols="
4082 << A_ncols << " != B_nrows=" << B_nrows
4083 << "." << std::endl;
4084 }
4085
4086 const int lclGood = lclBad ? 0 : 1;
4087 int gblGood = 0;
4089 if (gblGood != 1) {
4090 std::ostringstream os;
4092
4094 (true, std::runtime_error, "Inconsistent local dimensions on at "
4095 "least one process in this object's communicator." << std::endl
4096 << "Operation: "
4097 << "C(" << (C_is_local ? "local" : "distr") << ") = "
4098 << alpha << "*A"
4099 << (transA == Teuchos::TRANS ? "^T" :
4100 (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4101 << "(" << (A_is_local ? "local" : "distr") << ") * "
4102 << beta << "*B"
4103 << (transA == Teuchos::TRANS ? "^T" :
4104 (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4105 << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
4106 << "Global dimensions: C(" << this->getGlobalLength () << ", "
4107 << this->getNumVectors () << "), A(" << A.getGlobalLength ()
4108 << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
4109 << ", " << B.getNumVectors () << ")." << std::endl
4110 << os.str ());
4111 }
4112 }
4113 }
4114
4115 // Case 1: C(local) = A^X(local) * B^X(local)
4116 const bool Case1 = C_is_local && A_is_local && B_is_local;
4117 // Case 2: C(local) = A^T(distr) * B (distr)
4118 const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
4119 transA != NO_TRANS &&
4120 transB == NO_TRANS;
4121 // Case 3: C(distr) = A (distr) * B^X(local)
4122 const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
4123 transA == NO_TRANS;
4124
4125 // Test that we are considering a meaningful case
4127 (! Case1 && ! Case2 && ! Case3, std::runtime_error,
4128 "Multiplication of op(A) and op(B) into *this is not a "
4129 "supported use case.");
4130
4131 if (beta != STS::zero () && Case2) {
4132 // If Case2, then C is local and contributions must be summed
4133 // across all processes. However, if beta != 0, then accumulate
4134 // beta*C into the sum. When summing across all processes, we
4135 // only want to accumulate this once, so set beta == 0 on all
4136 // processes except Process 0.
4137 const int myRank = this->getMap ()->getComm ()->getRank ();
4138 if (myRank != 0) {
4139 beta_local = ATS::zero ();
4140 }
4141 }
4142
4143 // We only know how to do matrix-matrix multiplies if all the
4144 // MultiVectors have constant stride. If not, we have to make
4145 // temporary copies of those MultiVectors (including possibly
4146 // *this) that don't have constant stride.
4147 RCP<MV> C_tmp;
4148 if (! isConstantStride ()) {
4149 C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
4150 } else {
4151 C_tmp = rcp (this, false);
4152 }
4153
4154    RCP<const MV> A_tmp;
4155 if (! A.isConstantStride ()) {
4156 A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
4157 } else {
4158 A_tmp = rcpFromRef (A);
4159 }
4160
4161    RCP<const MV> B_tmp;
4162 if (! B.isConstantStride ()) {
4163 B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
4164 } else {
4165 B_tmp = rcpFromRef (B);
4166 }
4167
4169 (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
4170 ! A_tmp->isConstantStride (), std::logic_error,
4171 "Failed to make temporary constant-stride copies of MultiVectors.");
4172
4173 {
4174 const LO A_lclNumRows = A_tmp->getLocalLength ();
4175 const LO A_numVecs = A_tmp->getNumVectors ();
4176 auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
4177 auto A_sub = Kokkos::subview (A_lcl,
4178 std::make_pair (LO (0), A_lclNumRows),
4179 std::make_pair (LO (0), A_numVecs));
4180
4181
4182 const LO B_lclNumRows = B_tmp->getLocalLength ();
4183 const LO B_numVecs = B_tmp->getNumVectors ();
4184 auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
4185 auto B_sub = Kokkos::subview (B_lcl,
4186 std::make_pair (LO (0), B_lclNumRows),
4187 std::make_pair (LO (0), B_numVecs));
4188
4189 const LO C_lclNumRows = C_tmp->getLocalLength ();
4190 const LO C_numVecs = C_tmp->getNumVectors ();
4191
4192 auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
4193 auto C_sub = Kokkos::subview (C_lcl,
4194 std::make_pair (LO (0), C_lclNumRows),
4195 std::make_pair (LO (0), C_numVecs));
4196 const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
4197 (transA == Teuchos::TRANS ? 'T' : 'C'));
4198 const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
4199 (transB == Teuchos::TRANS ? 'T' : 'C'));
4200      const impl_scalar_type alpha_IST = static_cast<impl_scalar_type> (alpha);
4201
4202 ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
4203
4204 KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4205 beta_local, C_sub);
4206 }
4207
4208 if (! isConstantStride ()) {
4209 ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
4210 }
4211
4212 // Dispose of (possibly) extra copies of A and B.
4213 A_tmp = Teuchos::null;
4214 B_tmp = Teuchos::null;
4215
4216 // If Case 2 then sum up *this and distribute it to all processes.
4217 if (Case2) {
4218 this->reduce ();
4219 }
4220 }
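  /* Usage sketch (editor's example, not part of the original source file).
     It illustrates "Case 2" from the comment above: C(local) = A^H(distr) * B(distr),
     i.e., all pairwise (conjugated) dot products of the columns of A and B.
     A and B are assumed, already-constructed distributed MultiVectors that
     share a row Map; C must live on a locally replicated Map with
     A.getNumVectors() rows.

     \code
     using map_type = Tpetra::Map<>;
     using MV = Tpetra::MultiVector<>;
     const size_t k = A.getNumVectors ();
     auto replMap = Teuchos::rcp (new map_type (k, 0, A.getMap ()->getComm (),
                                                Tpetra::LocallyReplicated));
     MV C (replMap, B.getNumVectors ());
     C.multiply (Teuchos::CONJ_TRANS, Teuchos::NO_TRANS, 1.0, A, B, 0.0);
     // multiply() does the local GEMM, then calls reduce() to sum C over all processes.
     \endcode
  */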
4221
4222 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4223  void
4224  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4225  elementWiseMultiply (Scalar scalarAB,
4226                       const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
4227                       const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
4228                       Scalar scalarThis)
4229  {
4230 using Kokkos::ALL;
4231 using Kokkos::subview;
4232 const char tfecfFuncName[] = "elementWiseMultiply: ";
4233
4234 const size_t lclNumRows = this->getLocalLength ();
4235 const size_t numVecs = this->getNumVectors ();
4236
4238 (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4239 std::runtime_error, "MultiVectors do not have the same local length.");
4241 numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4242 "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4243 << ".");
4244
4245 auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4246 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4247 auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4248
4249 if (isConstantStride () && B.isConstantStride ()) {
4250 // A is just a Vector; it only has one column, so it always has
4251 // constant stride.
4252 //
4253 // If both *this and B have constant stride, we can do an
4254 // element-wise multiply on all columns at once.
4255 KokkosBlas::mult (scalarThis,
4256 this_view,
4257 scalarAB,
4258 subview (A_view, ALL (), 0),
4259 B_view);
4260 }
4261 else {
4262 for (size_t j = 0; j < numVecs; ++j) {
4263 const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4264 const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4265 KokkosBlas::mult (scalarThis,
4266 subview (this_view, ALL (), C_col),
4267 scalarAB,
4268 subview (A_view, ALL (), 0),
4269 subview (B_view, ALL (), B_col));
4270 }
4271 }
4272 }
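  /* Usage sketch (editor's example, not part of the original source file).
     elementWiseMultiply computes C(i,j) = scalarThis*C(i,j) + scalarAB*A(i)*B(i,j),
     where the single-column Vector A scales every column of B entrywise.
     map is an assumed, previously built row Map shared by A, B, and C.

     \code
     using MV = Tpetra::MultiVector<>;
     using V = Tpetra::Vector<>;
     V A (map);
     MV B (map, 2), C (map, 2);
     A.putScalar (3.0);
     B.putScalar (2.0);
     C.elementWiseMultiply (1.0, A, B, 0.0);  // every entry of C becomes 3*2 = 6
     \endcode
  */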
4273
4274 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4275 void
4277 reduce ()
4278 {
4279 using ::Tpetra::Details::allReduceView;
4280 using ::Tpetra::Details::ProfilingRegion;
4281 ProfilingRegion region ("Tpetra::MV::reduce");
4282
4283 const auto map = this->getMap ();
4284 if (map.get () == nullptr) {
4285 return;
4286 }
4287 const auto comm = map->getComm ();
4288 if (comm.get () == nullptr) {
4289 return;
4290 }
4291
4292 // Avoid giving device buffers to MPI. Even if MPI handles them
4293 // correctly, doing so may not perform well.
4294 const bool changed_on_device = this->need_sync_host ();
4295 if (changed_on_device) {
4296 // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4297 // and host will be separate allocations. In that case, it may
4298 // pay to do the all-reduce from device to host.
4299 Kokkos::fence(); // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4300 auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4301 allReduceView (X_lcl, X_lcl, *comm);
4302 }
4303 else {
4304 auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4305 allReduceView (X_lcl, X_lcl, *comm);
4306 }
4307 }
4308
4309 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4310  void
4311  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4312  replaceLocalValue (const LocalOrdinal lclRow,
4313                     const size_t col,
4314                     const impl_scalar_type& ScalarValue)
4315  {
4317 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4318 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4321 std::runtime_error,
4322 "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4323 << " is invalid. The range of valid row indices on this process "
4324 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4325 << ", " << maxLocalIndex << "].");
4327 vectorIndexOutOfRange(col),
4328 std::runtime_error,
4329 "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4330 << " of the multivector is invalid.");
4331 }
4332
4333 auto view = this->getLocalViewHost(Access::ReadWrite);
4334 const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4335 view (lclRow, colInd) = ScalarValue;
4336 }
4337
4338
4339 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4340  void
4341  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4342  sumIntoLocalValue (const LocalOrdinal lclRow,
4343                     const size_t col,
4344 const impl_scalar_type& value,
4345 const bool atomic)
4346 {
4348 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4349 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4352 std::runtime_error,
4353 "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4354 << " is invalid. The range of valid row indices on this process "
4355 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4356 << ", " << maxLocalIndex << "].");
4358 vectorIndexOutOfRange(col),
4359 std::runtime_error,
4360 "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4361 << " of the multivector is invalid.");
4362 }
4363
4364 const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4365
4366 auto view = this->getLocalViewHost(Access::ReadWrite);
4367 if (atomic) {
4368 Kokkos::atomic_add (& (view(lclRow, colInd)), value);
4369 }
4370 else {
4371 view(lclRow, colInd) += value;
4372 }
4373 }
4374
4375
4376 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4377  void
4378  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4379  replaceGlobalValue (const GlobalOrdinal gblRow,
4380                      const size_t col,
4381                      const impl_scalar_type& ScalarValue)
4382  {
4383 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4384 // touches the RCP's reference count, which isn't thread safe.
4385 const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4386
4388 const char tfecfFuncName[] = "replaceGlobalValue: ";
4390 (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4391 std::runtime_error,
4392 "Global row index " << gblRow << " is not present on this process "
4393 << this->getMap ()->getComm ()->getRank () << ".");
4395 (this->vectorIndexOutOfRange (col), std::runtime_error,
4396 "Vector index " << col << " of the MultiVector is invalid.");
4397 }
4398
4399 this->replaceLocalValue (lclRow, col, ScalarValue);
4400 }
4401
4402 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4403  void
4404  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4405  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4406                      const size_t col,
4407 const impl_scalar_type& value,
4408 const bool atomic)
4409 {
4410 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4411 // touches the RCP's reference count, which isn't thread safe.
4412 const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4413
4416 lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4417 std::runtime_error,
4418 "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4419 << " is not present on this process "
4420 << this->getMap ()->getComm ()->getRank () << ".");
4422 vectorIndexOutOfRange(col),
4423 std::runtime_error,
4424 "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4425 << " of the multivector is invalid.");
4426 }
4427
4428 this->sumIntoLocalValue (lclRow, col, value, atomic);
4429 }
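  /* Usage sketch (editor's example, not part of the original source file).
     replaceGlobalValue and sumIntoGlobalValue (and their local-index variants)
     are strictly local: the row must be owned by the calling process, and no
     communication happens.  X is an assumed, already-constructed MultiVector
     (default Scalar double) with at least one local row and two columns.

     \code
     using GO = Tpetra::MultiVector<>::global_ordinal_type;
     const GO gblRow = X.getMap ()->getGlobalElement (0);  // a locally owned row
     X.replaceGlobalValue (gblRow, 0, 5.0);                // X(gblRow, 0)  = 5
     X.sumIntoGlobalValue (gblRow, 0, 1.5);                // X(gblRow, 0) += 1.5
     X.replaceLocalValue (0, 1, 7.0);                      // same idea, by local row index
     \endcode
  */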
4430
4431 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4432 template <class T>
4433 Teuchos::ArrayRCP<T>
4435 getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4436 size_t j) const
4437 {
4438 typedef Kokkos::DualView<impl_scalar_type*,
4439 typename dual_view_type::array_layout,
4441 const size_t col = isConstantStride () ? j : whichVectors_[j];
4443 Kokkos::subview (view_, Kokkos::ALL (), col);
4444 return Kokkos::Compat::persistingView (X_col.d_view);
4445 }
4446
4447 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4448 bool
4450 need_sync_host () const {
4451 return view_.need_sync_host ();
4452 }
4453
4454 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4455 bool
4457 need_sync_device () const {
4458 return view_.need_sync_device ();
4459 }
4460
4461 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4462 std::string
4464 descriptionImpl (const std::string& className) const
4465 {
4466 using Teuchos::TypeNameTraits;
4467
4468 std::ostringstream out;
4469 out << "\"" << className << "\": {";
4470 out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4471 << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4472 << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4473 << ", Node" << Node::name ()
4474 << "}, ";
4475 if (this->getObjectLabel () != "") {
4476 out << "Label: \"" << this->getObjectLabel () << "\", ";
4477 }
4478 out << ", numRows: " << this->getGlobalLength ();
4479 if (className != "Tpetra::Vector") {
4480 out << ", numCols: " << this->getNumVectors ()
4481 << ", isConstantStride: " << this->isConstantStride ();
4482 }
4483 if (this->isConstantStride ()) {
4484 out << ", columnStride: " << this->getStride ();
4485 }
4486 out << "}";
4487
4488 return out.str ();
4489 }
4490
4491 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4492 std::string
4494 description () const
4495 {
4496 return this->descriptionImpl ("Tpetra::MultiVector");
4497 }
4498
4499 namespace Details
4500 {
4501 template<typename ViewType>
4502 void print_vector(Teuchos::FancyOStream & out, const char* prefix, const ViewType& v)
4503 {
4504 using std::endl;
4505 static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4506 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4507 static_assert(ViewType::rank == 2,
4508 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4509 // The square braces [] and their contents are in Matlab
4510 // format, so users may copy and paste directly into Matlab.
4511 out << "Values("<<prefix<<"): " << std::endl
4512 << "[";
4513 const size_t numRows = v.extent(0);
4514 const size_t numCols = v.extent(1);
4515 if (numCols == 1) {
4516 for (size_t i = 0; i < numRows; ++i) {
4517 out << v(i,0);
4518 if (i + 1 < numRows) {
4519 out << "; ";
4520 }
4521 }
4522 }
4523 else {
4524 for (size_t i = 0; i < numRows; ++i) {
4525 for (size_t j = 0; j < numCols; ++j) {
4526 out << v(i,j);
4527 if (j + 1 < numCols) {
4528 out << ", ";
4529 }
4530 }
4531 if (i + 1 < numRows) {
4532 out << "; ";
4533 }
4534 }
4535 }
4536 out << "]" << endl;
4537 }
4538 }
4539
4540 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4541 std::string
4543 localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4544 {
4545 typedef LocalOrdinal LO;
4546 using std::endl;
4547
4548 if (vl <= Teuchos::VERB_LOW) {
4549 return std::string ();
4550 }
4551 auto map = this->getMap ();
4552 if (map.is_null ()) {
4553 return std::string ();
4554 }
4555 auto outStringP = Teuchos::rcp (new std::ostringstream ());
4556 auto outp = Teuchos::getFancyOStream (outStringP);
4557 Teuchos::FancyOStream& out = *outp;
4558 auto comm = map->getComm ();
4559 const int myRank = comm->getRank ();
4560 const int numProcs = comm->getSize ();
4561
4562 out << "Process " << myRank << " of " << numProcs << ":" << endl;
4563 Teuchos::OSTab tab1 (out);
4564
4565 // VERB_MEDIUM and higher prints getLocalLength()
4566 const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4567 out << "Local number of rows: " << lclNumRows << endl;
4568
4569 if (vl > Teuchos::VERB_MEDIUM) {
4570 // VERB_HIGH and higher prints isConstantStride() and getStride().
4571 // The first is only relevant if the Vector has multiple columns.
4572 if (this->getNumVectors () != static_cast<size_t> (1)) {
4573 out << "isConstantStride: " << this->isConstantStride () << endl;
4574 }
4575 if (this->isConstantStride ()) {
4576 out << "Column stride: " << this->getStride () << endl;
4577 }
4578
4579 if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4580        // VERB_EXTREME prints values. If we're not using unified memory, the
4582        // host and device data may differ, so we can't use our regular accessor functions.
4583
4584 // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4585 // to trigger (since this function is conceptually const). Thus, we
4586 // get *copies* of the view's data instead.
4587 auto X_dev = view_.getDeviceCopy();
4588 auto X_host = view_.getHostCopy();
4589
4590 if(X_dev.data() == X_host.data()) {
4591 // One single allocation
4592 Details::print_vector(out,"unified",X_host);
4593 }
4594 else {
4595 Details::print_vector(out,"host",X_host);
4596 Details::print_vector(out,"dev",X_dev);
4597 }
4598 }
4599 }
4600 out.flush (); // make sure the ostringstream got everything
4601 return outStringP->str ();
4602 }
4603
4604 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4605 void
4607 describeImpl (Teuchos::FancyOStream& out,
4608 const std::string& className,
4609 const Teuchos::EVerbosityLevel verbLevel) const
4610 {
4611 using Teuchos::TypeNameTraits;
4612 using Teuchos::VERB_DEFAULT;
4613 using Teuchos::VERB_NONE;
4614 using Teuchos::VERB_LOW;
4615 using std::endl;
4616 const Teuchos::EVerbosityLevel vl =
4618
4619 if (vl == VERB_NONE) {
4620 return; // don't print anything
4621 }
4622 // If this Vector's Comm is null, then the Vector does not
4623 // participate in collective operations with the other processes.
4624 // In that case, it is not even legal to call this method. The
4625 // reasonable thing to do in that case is nothing.
4626 auto map = this->getMap ();
4627 if (map.is_null ()) {
4628 return;
4629 }
4630 auto comm = map->getComm ();
4631 if (comm.is_null ()) {
4632 return;
4633 }
4634
4635 const int myRank = comm->getRank ();
4636
4637 // Only Process 0 should touch the output stream, but this method
4638 // in general may need to do communication. Thus, we may need to
4639 // preserve the current tab level across multiple "if (myRank ==
4640 // 0) { ... }" inner scopes. This is why we sometimes create
4641 // OSTab instances by pointer, instead of by value. We only need
4642 // to create them by pointer if the tab level must persist through
4643 // multiple inner scopes.
4644 Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4645
4646 // VERB_LOW and higher prints the equivalent of description().
4647 if (myRank == 0) {
4648 tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4649 out << "\"" << className << "\":" << endl;
4650 tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4651 {
4652 out << "Template parameters:" << endl;
4653 Teuchos::OSTab tab2 (out);
4654 out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4655 << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4656 << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4657 << "Node: " << Node::name () << endl;
4658 }
4659 if (this->getObjectLabel () != "") {
4660 out << "Label: \"" << this->getObjectLabel () << "\", ";
4661 }
4662 out << "Global number of rows: " << this->getGlobalLength () << endl;
4663 if (className != "Tpetra::Vector") {
4664 out << "Number of columns: " << this->getNumVectors () << endl;
4665 }
4666 // getStride() may differ on different processes, so it (and
4667 // isConstantStride()) properly belong to per-process data.
4668 }
4669
4670 // This is collective over the Map's communicator.
4671 if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4672 const std::string lclStr = this->localDescribeToString (vl);
4674 }
4675 }
4676
4677 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4678 void
4680 describe (Teuchos::FancyOStream &out,
4681 const Teuchos::EVerbosityLevel verbLevel) const
4682 {
4683 this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4684 }
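  /* Usage sketch (editor's example, not part of the original source file).
     describe() is a collective that prints a summary of X; VERB_EXTREME also
     prints the values in Matlab-style brackets (see print_vector above).
     X is an assumed, already-constructed MultiVector.

     \code
     #include <iostream>
     #include "Teuchos_FancyOStream.hpp"

     auto out = Teuchos::getFancyOStream (Teuchos::rcpFromRef (std::cout));
     X.describe (*out, Teuchos::VERB_EXTREME);     // collective; call on all processes
     std::cout << X.description () << std::endl;   // one-line summary
     \endcode
  */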
4685
4686 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4687 void
4689 removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4690 {
4691 replaceMap (newMap);
4692 }
4693
4694 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4695  void
4696  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4697  assign (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& src)
4698  {
4699 using ::Tpetra::Details::localDeepCopy;
4700 const char prefix[] = "Tpetra::MultiVector::assign: ";
4701
4703 (this->getGlobalLength () != src.getGlobalLength () ||
4704 this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4705 prefix << "Global dimensions of the two Tpetra::MultiVector "
4706 "objects do not match. src has dimensions [" << src.getGlobalLength ()
4707 << "," << src.getNumVectors () << "], and *this has dimensions ["
4708 << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4709 // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4711 (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4712 prefix << "The local row counts of the two Tpetra::MultiVector "
4713 "objects do not match. src has " << src.getLocalLength () << " row(s) "
4714 << " and *this has " << this->getLocalLength () << " row(s).");
4715
4716
4717 // See #1510. We're writing to *this, so we don't care about its
4718 // contents in either memory space, and we don't want
4719 // DualView::modify to complain about "concurrent modification" of
4720 // host and device Views.
4721
4725
4726 // If need sync to device, then host has most recent version.
4727 const bool src_last_updated_on_host = src.need_sync_device ();
4728
4729    if (src_last_updated_on_host) {
4730 localDeepCopy (this->getLocalViewHost(Access::ReadWrite),
4731 src.getLocalViewHost(Access::ReadOnly),
4732 this->isConstantStride (),
4733 src.isConstantStride (),
4734 this->whichVectors_ (),
4735 src.whichVectors_ ());
4736 }
4737 else {
4738 localDeepCopy (this->getLocalViewDevice(Access::ReadWrite),
4739 src.getLocalViewDevice(Access::ReadOnly),
4740 this->isConstantStride (),
4741 src.isConstantStride (),
4742 this->whichVectors_ (),
4743 src.whichVectors_ ());
4744 }
4745 }
4746
4747 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4748 template<class T>
4749 Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4751 convert () const
4752 {
4753 using Teuchos::RCP;
4754
4755 auto newMV = Teuchos::rcp(new MultiVector<T,LocalOrdinal,GlobalOrdinal,Node>(this->getMap(),
4756 this->getNumVectors(),
4757 /*zeroOut=*/false));
4758 Tpetra::deep_copy(*newMV, *this);
4759 return newMV;
4760 }
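  /* Usage sketch (editor's example, not part of the original source file).
     convert<T>() deep-copies this MultiVector into one with a different Scalar
     type, e.g. for mixed-precision algorithms.  The target Scalar must be one
     for which the conversion was instantiated at configure time (see
     TPETRA_MULTIVECTOR_CONVERT_INSTANT near the end of this file).
     X_double is an assumed MultiVector with Scalar = double.

     \code
     Teuchos::RCP<Tpetra::MultiVector<float> > X_float = X_double.convert<float> ();
     \endcode
  */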
4761
4762 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4763  bool
4764  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
4765  isSameSize (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& vec) const
4766  {
4767 using ::Tpetra::Details::PackTraits;
4768
4769 const size_t l1 = this->getLocalLength();
4770 const size_t l2 = vec.getLocalLength();
4771 if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4772 return false;
4773 }
4774
4775 return true;
4776 }
4777
4778  template <class ST, class LO, class GO, class NT>
4779  void MultiVector<ST, LO, GO, NT>::
4780  swap (MultiVector<ST, LO, GO, NT>& mv) {
4781    std::swap(mv.map_, this->map_);
4782 std::swap(mv.view_, this->view_);
4783 std::swap(mv.whichVectors_, this->whichVectors_);
4784 }
4785
4786#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4787 template <class ST, class LO, class GO, class NT>
4788  void
4789  deep_copy (MultiVector<ST, LO, GO, NT>& dst,
4790             const Teuchos::SerialDenseMatrix<int, ST>& src)
4791 {
4792 using ::Tpetra::Details::localDeepCopy;
4793 using MV = MultiVector<ST, LO, GO, NT>;
4794 using IST = typename MV::impl_scalar_type;
4795 using input_view_type =
4796 Kokkos::View<const IST**, Kokkos::LayoutLeft,
4797 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4798 using pair_type = std::pair<LO, LO>;
4799
4800 const LO numRows = static_cast<LO> (src.numRows ());
4801 const LO numCols = static_cast<LO> (src.numCols ());
4802
4804 (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4805 numCols != static_cast<LO> (dst.getNumVectors ()),
4806 std::invalid_argument, "Tpetra::deep_copy: On Process "
4807 << dst.getMap ()->getComm ()->getRank () << ", dst is "
4808 << dst.getLocalLength () << " x " << dst.getNumVectors ()
4809 << ", but src is " << numRows << " x " << numCols << ".");
4810
4811 const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4812 input_view_type src_orig (src_raw, src.stride (), numCols);
4813 input_view_type src_view =
4814 Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4815
4816 constexpr bool src_isConstantStride = true;
4817 Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4818 localDeepCopy (dst.getLocalViewHost(Access::ReadWrite),
4819 src_view,
4820 dst.isConstantStride (),
4821                   src_isConstantStride,
4822                   getMultiVectorWhichVectors (dst),
4823                   srcWhichVectors);
4824  }
4825
4826 template <class ST, class LO, class GO, class NT>
4827 void
4828 deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4829 const MultiVector<ST, LO, GO, NT>& src)
4830 {
4831 using ::Tpetra::Details::localDeepCopy;
4832 using MV = MultiVector<ST, LO, GO, NT>;
4833 using IST = typename MV::impl_scalar_type;
4834 using output_view_type =
4835 Kokkos::View<IST**, Kokkos::LayoutLeft,
4836 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4837 using pair_type = std::pair<LO, LO>;
4838
4839 const LO numRows = static_cast<LO> (dst.numRows ());
4840 const LO numCols = static_cast<LO> (dst.numCols ());
4841
4842 TEUCHOS_TEST_FOR_EXCEPTION
4843 (numRows != static_cast<LO> (src.getLocalLength ()) ||
4844 numCols != static_cast<LO> (src.getNumVectors ()),
4845 std::invalid_argument, "Tpetra::deep_copy: On Process "
4846 << src.getMap ()->getComm ()->getRank () << ", src is "
4847 << src.getLocalLength () << " x " << src.getNumVectors ()
4848 << ", but dst is " << numRows << " x " << numCols << ".");
4849
4850 IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4851 output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4852 auto dst_view =
4853 Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4854
4855 constexpr bool dst_isConstantStride = true;
4856 Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4857
4858 // Prefer the host version of src's data.
4859 localDeepCopy (dst_view,
4860 src.getLocalViewHost(Access::ReadOnly),
4861 dst_isConstantStride,
4862 src.isConstantStride (),
4863 dstWhichVectors,
4864 getMultiVectorWhichVectors (src));
4865 }
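 // Hedged usage sketch (not part of the original file): the reverse overload
 // copies a MultiVector's host data into a Teuchos::SerialDenseMatrix of the
 // same local dimensions. X and localMat are the hypothetical objects from
 // the sketch above.
 //
 //   Tpetra::deep_copy (localMat, X); // localMat now equals X's local data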
4866#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4867
4868 template <class Scalar, class LO, class GO, class NT>
4869 Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4870 createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4871 size_t numVectors)
4872 {
4873 typedef MultiVector<Scalar, LO, GO, NT> MV;
4874 return Teuchos::rcp (new MV (map, numVectors));
4875 }
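 // Hedged usage sketch (not part of the original file): createMultiVector is
 // a nonmember "constructor" that wraps the (Map, numVectors) constructor in
 // a Teuchos::RCP. The Scalar type is given explicitly; the ordinal and Node
 // types are deduced from the (hypothetical) Map.
 //
 //   auto X = Tpetra::createMultiVector<double> (map, 5);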
4876
4877 template <class ST, class LO, class GO, class NT>
4878 MultiVector<ST, LO, GO, NT>
4879 createCopy (const MultiVector<ST, LO, GO, NT>& src)
4880 {
4881 typedef MultiVector<ST, LO, GO, NT> MV;
4882 MV cpy (src.getMap (), src.getNumVectors (), false);
4883 cpy.assign (src);
4884 return cpy;
4885 }
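 // Hedged usage sketch (not part of the original file): createCopy returns a
 // deep copy of src by value, built with zeroOut=false and then filled via
 // assign(). X below is a hypothetical MultiVector.
 //
 //   Tpetra::MultiVector<> X_copy = Tpetra::createCopy (X);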
4886
4887} // namespace Tpetra
4888
4889//
4890// Explicit instantiation macro
4891//
4892// Must be expanded from within the Tpetra namespace!
4893//
4894
4895#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4896# define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4897 template class MultiVector< SCALAR , LO , GO , NODE >; \
4898 template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4899 template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4900 template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4901 template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4902
4903#else
4904# define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4905 template class MultiVector< SCALAR , LO , GO , NODE >; \
4906 template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4907
4908#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4909
4910
4911#define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
4912 \
4913 template Teuchos::RCP< MultiVector< SO , LO , GO , NODE > > \
4914 MultiVector< SI , LO , GO , NODE >::convert< SO > () const;
4915
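// Hedged usage sketch (not part of the original file): these macros are
// expanded inside namespace Tpetra in explicit-template-instantiation (ETI)
// translation units. The type combination and node name below are
// hypothetical; actual combinations come from the Trilinos ETI configuration.
//
//   namespace Tpetra {
//     TPETRA_MULTIVECTOR_INSTANT(double, int, long long, NodeType)
//     TPETRA_MULTIVECTOR_CONVERT_INSTANT(float, double, int, long long, NodeType)
//   }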
4916
4917#endif // TPETRA_MULTIVECTOR_DEF_HPP