xassign.hpp
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht          *
 * Copyright (c) QuantStack                                                 *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XTENSOR_ASSIGN_HPP
#define XTENSOR_ASSIGN_HPP

#include <algorithm>
#include <functional>
#include <numeric>  // std::accumulate, used in strided_assign_detail::get_loop_sizes
#include <type_traits>
#include <utility>

#include <xtl/xcomplex.hpp>
#include <xtl/xsequence.hpp>

#include "xexpression.hpp"
#include "xfunction.hpp"
#include "xiterator.hpp"
#include "xstrides.hpp"
#include "xtensor_config.hpp"
#include "xtensor_forward.hpp"
#include "xutils.hpp"

#if defined(XTENSOR_USE_TBB)
#include <tbb/tbb.h>
#endif

namespace xt
{

    /********************
     * Assign functions *
     ********************/

    template <class E1, class E2>
    void assign_data(xexpression<E1>& e1, const xexpression<E2>& e2, bool trivial);

    template <class E1, class E2>
    void assign_xexpression(xexpression<E1>& e1, const xexpression<E2>& e2);

    template <class E1, class E2>
    void computed_assign(xexpression<E1>& e1, const xexpression<E2>& e2);

    template <class E1, class E2, class F>
    void scalar_computed_assign(xexpression<E1>& e1, const E2& e2, F&& f);

    template <class E1, class E2>
    void assert_compatible_shape(const xexpression<E1>& e1, const xexpression<E2>& e2);

    template <class E1, class E2>
    void strided_assign(E1& e1, const E2& e2, std::false_type /*disable*/);

    template <class E1, class E2>
    void strided_assign(E1& e1, const E2& e2, std::true_type /*enable*/);

    /************************
     * xexpression_assigner *
     ************************/

    template <class Tag>
    class xexpression_assigner_base;

    template <>
    class xexpression_assigner_base<xtensor_expression_tag>
    {
    public:

        template <class E1, class E2>
        static void assign_data(xexpression<E1>& e1, const xexpression<E2>& e2, bool trivial);
    };

    template <class Tag>
    class xexpression_assigner : public xexpression_assigner_base<Tag>
    {
    public:

        using base_type = xexpression_assigner_base<Tag>;

        template <class E1, class E2>
        static void assign_xexpression(E1& e1, const E2& e2);

        template <class E1, class E2>
        static void computed_assign(xexpression<E1>& e1, const xexpression<E2>& e2);

        template <class E1, class E2, class F>
        static void scalar_computed_assign(xexpression<E1>& e1, const E2& e2, F&& f);

        template <class E1, class E2>
        static void assert_compatible_shape(const xexpression<E1>& e1, const xexpression<E2>& e2);

    private:

        template <class E1, class E2>
        static bool resize(E1& e1, const E2& e2);

        template <class E1, class F, class... CT>
        static bool resize(E1& e1, const xfunction<F, CT...>& e2);
    };

    /********************
     * stepper_assigner *
     ********************/

    template <class E1, class E2, layout_type L>
    class stepper_assigner
    {
    public:

        using lhs_iterator = typename E1::stepper;
        using rhs_iterator = typename E2::const_stepper;
        using shape_type = typename E1::shape_type;
        using index_type = xindex_type_t<shape_type>;
        using size_type = typename lhs_iterator::size_type;
        using difference_type = typename lhs_iterator::difference_type;

        stepper_assigner(E1& e1, const E2& e2);

        void run();

        void step(size_type i);
        void step(size_type i, size_type n);
        void reset(size_type i);

        void to_end(layout_type);

    private:

        E1& m_e1;

        lhs_iterator m_lhs;
        rhs_iterator m_rhs;

        index_type m_index;
    };

    /*******************
     * linear_assigner *
     *******************/

    template <bool simd_assign>
    class linear_assigner
    {
    public:

        template <class E1, class E2>
        static void run(E1& e1, const E2& e2);
    };

    template <>
    class linear_assigner<false>
    {
    public:

        template <class E1, class E2>
        static void run(E1& e1, const E2& e2);

    private:

        template <class E1, class E2>
        static void run_impl(E1& e1, const E2& e2, std::true_type);

        template <class E1, class E2>
        static void run_impl(E1& e1, const E2& e2, std::false_type);
    };

    /*************************
     * strided_loop_assigner *
     *************************/

    namespace strided_assign_detail
    {
        struct loop_sizes_t
        {
            bool can_do_strided_assign;
            bool is_row_major;
            std::size_t inner_loop_size;
            std::size_t outer_loop_size;
            std::size_t cut;
            std::size_t dimension;
        };
    }

    template <bool simd>
    class strided_loop_assigner
    {
    public:

        using loop_sizes_t = strided_assign_detail::loop_sizes_t;
        // can_do_strided_assign, is_row_major, inner_loop_size, outer_loop_size, cut, dimension
        template <class E1, class E2>
        static void run(E1& e1, const E2& e2, const loop_sizes_t& loop_sizes);
        template <class E1, class E2>
        static loop_sizes_t get_loop_sizes(E1& e1, const E2& e2);
        template <class E1, class E2>
        static void run(E1& e1, const E2& e2);
    };

    /***********************************
     * Assign functions implementation *
     ***********************************/

    template <class E1, class E2>
    inline void assign_data(xexpression<E1>& e1, const xexpression<E2>& e2, bool trivial)
    {
        using tag = xexpression_tag_t<E1, E2>;
        xexpression_assigner_base<tag>::assign_data(e1, e2, trivial);
    }

    template <class E1, class E2>
    inline void assign_xexpression(xexpression<E1>& e1, const xexpression<E2>& e2)
    {
        xtl::mpl::static_if<has_assign_to<E1, E2>::value>(
            [&](auto self)
            {
                self(e2).derived_cast().assign_to(e1);
            },
            /*else*/
            [&](auto /*self*/)
            {
                using tag = xexpression_tag_t<E1, E2>;
                xexpression_assigner<tag>::assign_xexpression(e1, e2);
            }
        );
    }

    template <class E1, class E2>
    inline void computed_assign(xexpression<E1>& e1, const xexpression<E2>& e2)
    {
        using tag = xexpression_tag_t<E1, E2>;
        xexpression_assigner<tag>::computed_assign(e1, e2);
    }

    template <class E1, class E2, class F>
    inline void scalar_computed_assign(xexpression<E1>& e1, const E2& e2, F&& f)
    {
        using tag = xexpression_tag_t<E1, E2>;
        xexpression_assigner<tag>::scalar_computed_assign(e1, e2, std::forward<F>(f));
    }

    template <class E1, class E2>
    inline void assert_compatible_shape(const xexpression<E1>& e1, const xexpression<E2>& e2)
    {
        using tag = xexpression_tag_t<E1, E2>;
        xexpression_assigner<tag>::assert_compatible_shape(e1, e2);
    }
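
    // Illustrative sketch (not part of this header): containers funnel their
    // assignment operators into the entry points above. Assuming the usual public
    // API from xarray.hpp, xbuilder.hpp and xnoalias.hpp:
    //
    //     xt::xarray<double> a = xt::zeros<double>({3, 4});
    //     xt::xarray<double> b = xt::ones<double>({3, 4});
    //     xt::noalias(a) = b + b;  // routes to assign_xexpression(a, b + b)
    //     a += b;                  // routes to computed_assign
    //     a += 2.0;                // routes to scalar_computed_assign with a plus functor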

    /***************************************
     * xexpression_assigner implementation *
     ***************************************/

    namespace detail
    {
        template <class E1, class E2>
        constexpr bool linear_static_layout()
        {
            // A row-major or column-major container with a dimension <= 1 is treated
            // as layout `any`, which enables some performance improvements, for
            // example when assigning a column-major vector to a row-major vector.
            return compute_layout(
                       select_layout<E1::static_layout, typename E1::shape_type>::value,
                       select_layout<E2::static_layout, typename E2::shape_type>::value
                   )
                   != layout_type::dynamic;
        }

        template <class E1, class E2>
        inline auto is_linear_assign(const E1& e1, const E2& e2)
            -> std::enable_if_t<has_strides<E1>::value, bool>
        {
            return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout<E1, E2>())
                   || (e1.is_contiguous() && e2.has_linear_assign(e1.strides()));
        }

        template <class E1, class E2>
        inline auto is_linear_assign(const E1&, const E2&) -> std::enable_if_t<!has_strides<E1>::value, bool>
        {
            return false;
        }

        template <class E1, class E2>
        inline bool linear_dynamic_layout(const E1& e1, const E2& e2)
        {
            return e1.is_contiguous() && e2.is_contiguous()
                   && compute_layout(e1.layout(), e2.layout()) != layout_type::dynamic;
        }
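
        // Worked illustration (comment only): linear_static_layout<E1, E2>() is true
        // when the two static layouts combine to anything but layout_type::dynamic.
        // For two row-major containers of rank > 1 the combination is row_major, so
        // the linear (1-D) assignment path is available at compile time; for a
        // row-major and a column-major container of rank > 1 it is dynamic, and the
        // runtime checks in is_linear_assign / linear_dynamic_layout decide instead.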

        template <class E, class = void>
        struct has_step_leading : std::false_type
        {
        };

        template <class E>
        struct has_step_leading<E, void_t<decltype(std::declval<E>().step_leading())>> : std::true_type
        {
        };

        template <class T>
        struct use_strided_loop
        {
            static constexpr bool stepper_deref()
            {
                return std::is_reference<typename T::stepper::reference>::value;
            }

            static constexpr bool value = has_strides<T>::value
                                          && has_step_leading<typename T::stepper>::value && stepper_deref();
        };

        template <class T>
        struct use_strided_loop<xscalar<T>>
        {
            static constexpr bool value = true;
        };

        template <class F, class... CT>
        struct use_strided_loop<xfunction<F, CT...>>
        {
            static constexpr bool value = xtl::conjunction<use_strided_loop<std::decay_t<CT>>...>::value;
        };

        template <class T1, class T2>
        struct conditional_promote_to_complex
        {
            static constexpr bool cond = xtl::is_gen_complex<T1>::value && !xtl::is_gen_complex<T2>::value;
            // Alternative: use std::complex<T2> or xcomplex<T2, T2, bool> depending on T1
            using type = std::conditional_t<cond, T1, T2>;
        };

        template <class T1, class T2>
        using conditional_promote_to_complex_t = typename conditional_promote_to_complex<T1, T2>::type;
    }
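
    // Illustration (comment only): conditional_promote_to_complex_t promotes the
    // requested type to complex only when the destination value type (T1) is
    // complex and the source value type (T2) is not:
    //
    //     static_assert(std::is_same<
    //         detail::conditional_promote_to_complex_t<std::complex<double>, double>,
    //         std::complex<double>>::value, "scalar source promoted");
    //     static_assert(std::is_same<
    //         detail::conditional_promote_to_complex_t<double, float>,
    //         float>::value, "source kept as-is");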

    template <class E1, class E2>
    class xassign_traits
    {
    private:

        using e1_value_type = typename E1::value_type;
        using e2_value_type = typename E2::value_type;

        template <class T>
        using is_bool = std::is_same<T, bool>;

        static constexpr bool is_bool_conversion()
        {
            return is_bool<e2_value_type>::value && !is_bool<e1_value_type>::value;
        }

        static constexpr bool contiguous_layout()
        {
            return E1::contiguous_layout && E2::contiguous_layout;
        }

        static constexpr bool convertible_types()
        {
            return std::is_convertible<e2_value_type, e1_value_type>::value && !is_bool_conversion();
        }

        static constexpr bool use_xsimd()
        {
            // True only when a SIMD backend (xsimd) is available, i.e. when batches
            // hold more than one element.
            return xt_simd::simd_traits<int8_t>::size > 1;
        }

        template <class T>
        static constexpr bool simd_size_impl()
        {
            return xt_simd::simd_traits<T>::size > 1 || (is_bool<T>::value && use_xsimd());
        }

        static constexpr bool simd_size()
        {
            return simd_size_impl<e1_value_type>() && simd_size_impl<e2_value_type>();
        }

        static constexpr bool simd_interface()
        {
            return has_simd_interface<E1, requested_value_type>::value
                   && has_simd_interface<E2, requested_value_type>::value;
        }

    public:

        // constexpr methods instead of constexpr data members avoid the need for
        // definitions at namespace scope of these data members (since they are odr-used).

        static constexpr bool simd_assign()
        {
            return convertible_types() && simd_size() && simd_interface();
        }

        static constexpr bool linear_assign(const E1& e1, const E2& e2, bool trivial)
        {
            return trivial && detail::is_linear_assign(e1, e2);
        }

        static constexpr bool strided_assign()
        {
            return detail::use_strided_loop<E1>::value && detail::use_strided_loop<E2>::value;
        }

        static constexpr bool simd_linear_assign()
        {
            return contiguous_layout() && simd_assign();
        }

        static constexpr bool simd_strided_assign()
        {
            return strided_assign() && simd_assign();
        }

        static constexpr bool simd_linear_assign(const E1& e1, const E2& e2)
        {
            return simd_assign() && detail::linear_dynamic_layout(e1, e2);
        }

        using e2_requested_value_type = std::
            conditional_t<is_bool<e2_value_type>::value, typename E2::bool_load_type, e2_value_type>;
        using requested_value_type = detail::conditional_promote_to_complex_t<e1_value_type, e2_requested_value_type>;
    };
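
    // Sketch (illustrative): the traits above can be queried directly to see which
    // path an assignment may take for a given pair of expression types, e.g.
    //
    //     using lhs_t = xt::xtensor<double, 1>;
    //     constexpr bool linear_simd = xt::xassign_traits<lhs_t, lhs_t>::simd_linear_assign();
    //
    // linear_simd is true only when both sides have a contiguous static layout and
    // expose a SIMD interface for the requested value type (i.e. xsimd is enabled).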

    template <class E1, class E2>
    inline void xexpression_assigner_base<xtensor_expression_tag>::assign_data(
        xexpression<E1>& e1,
        const xexpression<E2>& e2,
        bool trivial
    )
    {
        E1& de1 = e1.derived_cast();
        const E2& de2 = e2.derived_cast();
        using traits = xassign_traits<E1, E2>;

        bool linear_assign = traits::linear_assign(de1, de2, trivial);
        constexpr bool simd_assign = traits::simd_assign();
        constexpr bool simd_linear_assign = traits::simd_linear_assign();
        constexpr bool simd_strided_assign = traits::simd_strided_assign();
        if (linear_assign)
        {
            if (simd_linear_assign || traits::simd_linear_assign(de1, de2))
            {
                // Do not use linear_assigner<true> here since it would make the compiler
                // instantiate this branch even when the runtime condition is false,
                // resulting in a compilation error for expressions that do not provide
                // a SIMD interface. simd_assign is true whenever simd_linear_assign()
                // or simd_linear_assign(de1, de2) is true.
                linear_assigner<simd_assign>::run(de1, de2);
            }
            else
            {
                linear_assigner<false>::run(de1, de2);
            }
        }
        else if (simd_strided_assign)
        {
            strided_loop_assigner<simd_strided_assign>::run(de1, de2);
        }
        else
        {
            stepper_assigner<E1, E2, default_assignable_layout(E1::static_layout)>(de1, de2).run();
        }
    }
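
    // From the caller's perspective (illustrative sketch, assuming the usual
    // builders from xbuilder.hpp), the dispatch above selects:
    //
    //     xt::xarray<double> a = xt::zeros<double>({1000});
    //     xt::xarray<double> b = xt::ones<double>({1000});
    //     xt::noalias(a) = b + b;  // contiguous + SIMD-capable -> linear SIMD path
    //
    // while an assignment whose operands disagree on layout (e.g. a transposed
    // view) fails the linear test and falls back to the strided or stepper path.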

    template <class Tag>
    template <class E1, class E2>
    inline void xexpression_assigner<Tag>::assign_xexpression(E1& e1, const E2& e2)
    {
        bool trivial_broadcast = resize(e1.derived_cast(), e2.derived_cast());
        base_type::assign_data(e1, e2, trivial_broadcast);
    }

    template <class Tag>
    template <class E1, class E2>
    inline void xexpression_assigner<Tag>::computed_assign(xexpression<E1>& e1, const xexpression<E2>& e2)
    {
        using shape_type = typename E1::shape_type;
        using comparator_type = std::greater<typename shape_type::value_type>;

        using size_type = typename E1::size_type;

        E1& de1 = e1.derived_cast();
        const E2& de2 = e2.derived_cast();

        size_type dim2 = de2.dimension();
        shape_type shape = uninitialized_shape<shape_type>(dim2);

        bool trivial_broadcast = de2.broadcast_shape(shape, true);

        auto&& de1_shape = de1.shape();
        if (dim2 > de1.dimension()
            || std::lexicographical_compare(
                shape.begin(),
                shape.end(),
                de1_shape.begin(),
                de1_shape.end(),
                comparator_type()
            ))
        {
            typename E1::temporary_type tmp(shape);
            base_type::assign_data(tmp, e2, trivial_broadcast);
            de1.assign_temporary(std::move(tmp));
        }
        else
        {
            base_type::assign_data(e1, e2, trivial_broadcast);
        }
    }

    template <class Tag>
    template <class E1, class E2, class F>
    inline void xexpression_assigner<Tag>::scalar_computed_assign(xexpression<E1>& e1, const E2& e2, F&& f)
    {
        E1& d = e1.derived_cast();
        using size_type = typename E1::size_type;
        auto dst = d.storage().begin();
        for (size_type i = d.size(); i > 0; --i)
        {
            *dst = f(*dst, e2);
            ++dst;
        }
    }

    template <class Tag>
    template <class E1, class E2>
    inline void
    xexpression_assigner<Tag>::assert_compatible_shape(const xexpression<E1>& e1, const xexpression<E2>& e2)
    {
        const E1& de1 = e1.derived_cast();
        const E2& de2 = e2.derived_cast();
        if (!broadcastable(de2.shape(), de1.shape()))
        {
            throw_broadcast_error(de2.shape(), de1.shape());
        }
    }

    namespace detail
    {
        template <bool B, class... CT>
        struct static_trivial_broadcast;

        template <class... CT>
        struct static_trivial_broadcast<true, CT...>
        {
            static constexpr bool value = detail::promote_index<typename std::decay_t<CT>::shape_type...>::value;
        };

        template <class... CT>
        struct static_trivial_broadcast<false, CT...>
        {
            static constexpr bool value = false;
        };
    }

    template <class Tag>
    template <class E1, class E2>
    inline bool xexpression_assigner<Tag>::resize(E1& e1, const E2& e2)
    {
        // If our RHS is not an xfunction, we know that it is at least potentially
        // trivial, and its strides are checked in detail::is_trivial_broadcast to
        // see whether they match up. We can therefore skip a shape copy and a call
        // to broadcast_shape(...).
        e1.resize(e2.shape());
        return true;
    }

    template <class Tag>
    template <class E1, class F, class... CT>
    inline bool xexpression_assigner<Tag>::resize(E1& e1, const xfunction<F, CT...>& e2)
    {
        return xtl::mpl::static_if<detail::is_fixed<typename xfunction<F, CT...>::shape_type>::value>(
            [&](auto /*self*/)
            {
                /*
                 * If the shape of the xfunction is statically known, we can compute
                 * the broadcast triviality at compile time, and we can resize right away.
                 */
                // resize in case LHS is not a fixed size container. If it is, this is a NOP
                e1.resize(typename xfunction<F, CT...>::shape_type{});
                return detail::static_trivial_broadcast<
                    detail::is_fixed<typename xfunction<F, CT...>::shape_type>::value,
                    CT...>::value;
            },
            /* else */
            [&](auto /*self*/)
            {
                using index_type = xindex_type_t<typename E1::shape_type>;
                using size_type = typename E1::size_type;
                size_type size = e2.dimension();
                index_type shape = uninitialized_shape<index_type>(size);
                bool trivial_broadcast = e2.broadcast_shape(shape, true);
                e1.resize(std::move(shape));
                return trivial_broadcast;
            }
        );
    }
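
    // Illustration (comment only): when the right-hand side carries a fixed
    // (compile-time) shape, the first branch above resizes without allocating a
    // shape buffer, and the broadcast triviality is a compile-time constant:
    //
    //     xt::xtensor_fixed<double, xt::xshape<2, 3>> a, b, c;
    //     a = b + c;  // resize() is a no-op, triviality decided at compile time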

    /***********************************
     * stepper_assigner implementation *
     ***********************************/

    template <class FROM, class TO>
    struct is_narrowing_conversion
    {
        using argument_type = std::decay_t<FROM>;
        using result_type = std::decay_t<TO>;

        static const bool value = xtl::is_arithmetic<result_type>::value
                                  && (sizeof(result_type) < sizeof(argument_type)
                                      || (xtl::is_integral<result_type>::value
                                          && std::is_floating_point<argument_type>::value));
    };

    template <class FROM, class TO>
    struct has_sign_conversion
    {
        using argument_type = std::decay_t<FROM>;
        using result_type = std::decay_t<TO>;

        static const bool value = xtl::is_signed<argument_type>::value != xtl::is_signed<result_type>::value;
    };

    template <class FROM, class TO>
    struct has_assign_conversion
    {
        using argument_type = std::decay_t<FROM>;
        using result_type = std::decay_t<TO>;

        static const bool value = is_narrowing_conversion<argument_type, result_type>::value
                                  || has_sign_conversion<argument_type, result_type>::value;
    };

    template <class E1, class E2, layout_type L>
    inline stepper_assigner<E1, E2, L>::stepper_assigner(E1& e1, const E2& e2)
        : m_e1(e1)
        , m_lhs(e1.stepper_begin(e1.shape()))
        , m_rhs(e2.stepper_begin(e1.shape()))
        , m_index(xtl::make_sequence<index_type>(e1.shape().size(), size_type(0)))
    {
    }

    template <class E1, class E2, layout_type L>
    inline void stepper_assigner<E1, E2, L>::run()
    {
        using tmp_size_type = typename E1::size_type;
        using argument_type = std::decay_t<decltype(*m_rhs)>;
        using result_type = std::decay_t<decltype(*m_lhs)>;
        constexpr bool needs_cast = has_assign_conversion<argument_type, result_type>::value;

        tmp_size_type s = m_e1.size();
        for (tmp_size_type i = 0; i < s; ++i)
        {
            *m_lhs = conditional_cast<needs_cast, result_type>(*m_rhs);
            stepper_tools<L>::increment_stepper(*this, m_index, m_e1.shape());
        }
    }
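
    // Comment-only illustration: run() is a plain element-wise copy. For a
    // row-major 2 x 3 destination, the steppers visit the elements in index order
    // (0,0), (0,1), (0,2), (1,0), (1,1), (1,2); increment_stepper advances both
    // steppers and m_index in lockstep. This is the generic fallback used when
    // neither the linear nor the strided path applies.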

    template <class E1, class E2, layout_type L>
    inline void stepper_assigner<E1, E2, L>::step(size_type i)
    {
        m_lhs.step(i);
        m_rhs.step(i);
    }

    template <class E1, class E2, layout_type L>
    inline void stepper_assigner<E1, E2, L>::step(size_type i, size_type n)
    {
        m_lhs.step(i, n);
        m_rhs.step(i, n);
    }

    template <class E1, class E2, layout_type L>
    inline void stepper_assigner<E1, E2, L>::reset(size_type i)
    {
        m_lhs.reset(i);
        m_rhs.reset(i);
    }

    template <class E1, class E2, layout_type L>
    inline void stepper_assigner<E1, E2, L>::to_end(layout_type l)
    {
        m_lhs.to_end(l);
        m_rhs.to_end(l);
    }

    /**********************************
     * linear_assigner implementation *
     **********************************/

    template <bool simd_assign>
    template <class E1, class E2>
    inline void linear_assigner<simd_assign>::run(E1& e1, const E2& e2)
    {
        using lhs_align_mode = xt_simd::container_alignment_t<E1>;
        constexpr bool is_aligned = std::is_same<lhs_align_mode, aligned_mode>::value;
        using rhs_align_mode = std::conditional_t<is_aligned, inner_aligned_mode, unaligned_mode>;
        using e1_value_type = typename E1::value_type;
        using e2_value_type = typename E2::value_type;
        using value_type = typename xassign_traits<E1, E2>::requested_value_type;
        using simd_type = xt_simd::simd_type<value_type>;
        using size_type = typename E1::size_type;
        size_type size = e1.size();
        constexpr size_type simd_size = simd_type::size;
        constexpr bool needs_cast = has_assign_conversion<e1_value_type, e2_value_type>::value;

        size_type align_begin = is_aligned ? 0 : xt_simd::get_alignment_offset(e1.data(), size, simd_size);
        size_type align_end = align_begin + ((size - align_begin) & ~(simd_size - 1));

        for (size_type i = 0; i < align_begin; ++i)
        {
            e1.data_element(i) = conditional_cast<needs_cast, e1_value_type>(e2.data_element(i));
        }

#if defined(XTENSOR_USE_TBB)
        if (size >= XTENSOR_TBB_THRESHOLD)
        {
            tbb::static_partitioner ap;
            tbb::parallel_for(
                align_begin,
                align_end,
                simd_size,
                [&e1, &e2](size_t i)
                {
                    e1.template store_simd<lhs_align_mode>(
                        i,
                        e2.template load_simd<rhs_align_mode, value_type>(i)
                    );
                },
                ap
            );
        }
        else
        {
            for (size_type i = align_begin; i < align_end; i += simd_size)
            {
                e1.template store_simd<lhs_align_mode>(i, e2.template load_simd<rhs_align_mode, value_type>(i));
            }
        }
#elif defined(XTENSOR_USE_OPENMP)
        if (size >= size_type(XTENSOR_OPENMP_TRESHOLD))
        {
#pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2)
#ifndef _WIN32
            for (size_type i = align_begin; i < align_end; i += simd_size)
            {
                e1.template store_simd<lhs_align_mode>(i, e2.template load_simd<rhs_align_mode, value_type>(i));
            }
#else
            for (auto i = static_cast<std::ptrdiff_t>(align_begin); i < static_cast<std::ptrdiff_t>(align_end);
                 i += static_cast<std::ptrdiff_t>(simd_size))
            {
                size_type ui = static_cast<size_type>(i);
                e1.template store_simd<lhs_align_mode>(ui, e2.template load_simd<rhs_align_mode, value_type>(ui));
            }
#endif
        }
        else
        {
            for (size_type i = align_begin; i < align_end; i += simd_size)
            {
                e1.template store_simd<lhs_align_mode>(i, e2.template load_simd<rhs_align_mode, value_type>(i));
            }
        }
#else
        for (size_type i = align_begin; i < align_end; i += simd_size)
        {
            e1.template store_simd<lhs_align_mode>(i, e2.template load_simd<rhs_align_mode, value_type>(i));
        }
#endif
        for (size_type i = align_end; i < size; ++i)
        {
            e1.data_element(i) = conditional_cast<needs_cast, e1_value_type>(e2.data_element(i));
        }
    }
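
    // Worked illustration (comment only) of the alignment peeling above: with
    // size = 1000, simd_size = 4, and an unaligned destination whose first aligned
    // element sits at index 2, align_begin = 2 and
    // align_end = 2 + ((1000 - 2) & ~3) = 2 + 996 = 998. Elements [0, 2) and
    // [998, 1000) are copied scalar-wise; the middle is copied in SIMD batches.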

    template <class E1, class E2>
    inline void linear_assigner<false>::run(E1& e1, const E2& e2)
    {
        using is_convertible = std::
            is_convertible<typename std::decay_t<E2>::value_type, typename std::decay_t<E1>::value_type>;
        // If the types are not compatible, this function is still instantiated but never
        // called. To avoid compilation problems in this effectively unused code, run_impl
        // is empty in that case.
        run_impl(e1, e2, is_convertible());
    }

    template <class E1, class E2>
    inline void linear_assigner<false>::run_impl(E1& e1, const E2& e2, std::true_type /*is_convertible*/)
    {
        using value_type = typename E1::value_type;
        using size_type = typename E1::size_type;
        auto src = linear_begin(e2);
        auto dst = linear_begin(e1);
        size_type n = e1.size();
#if defined(XTENSOR_USE_TBB)
        tbb::static_partitioner sp;
        tbb::parallel_for(
            std::ptrdiff_t(0),
            static_cast<std::ptrdiff_t>(n),
            [&](std::ptrdiff_t i)
            {
                *(dst + i) = static_cast<value_type>(*(src + i));
            },
            sp
        );
#elif defined(XTENSOR_USE_OPENMP)
        if (n >= XTENSOR_OPENMP_TRESHOLD)
        {
#pragma omp parallel for default(none) shared(src, dst, n)
            for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast<std::ptrdiff_t>(n); i++)
            {
                *(dst + i) = static_cast<value_type>(*(src + i));
            }
        }
        else
        {
            for (; n > size_type(0); --n)
            {
                *dst = static_cast<value_type>(*src);
                ++src;
                ++dst;
            }
        }
#else
        for (; n > size_type(0); --n)
        {
            *dst = static_cast<value_type>(*src);
            ++src;
            ++dst;
        }
#endif
    }

    template <class E1, class E2>
    inline void linear_assigner<false>::run_impl(E1&, const E2&, std::false_type /*is_convertible*/)
    {
        XTENSOR_PRECONDITION(false, "Internal error: linear_assigner called with unrelated types.");
    }

    /****************************************
     * strided_loop_assigner implementation *
     ****************************************/

    namespace strided_assign_detail
    {
        template <layout_type layout>
        struct idx_tools;

        template <>
        struct idx_tools<layout_type::row_major>
        {
            template <class T>
            static void next_idx(T& outer_index, T& outer_shape)
            {
                auto i = outer_index.size();
                for (; i > 0; --i)
                {
                    if (outer_index[i - 1] + 1 >= outer_shape[i - 1])
                    {
                        outer_index[i - 1] = 0;
                    }
                    else
                    {
                        outer_index[i - 1]++;
                        break;
                    }
                }
            }

            template <class T>
            static void nth_idx(size_t n, T& outer_index, const T& outer_shape)
            {
                T stride_sizes;
                xt::resize_container(stride_sizes, outer_shape.size());
                // compute strides
                using size_type = typename T::size_type;
                for (size_type i = outer_shape.size(); i > 0; i--)
                {
                    stride_sizes[i - 1] = (i == outer_shape.size()) ? 1 : stride_sizes[i] * outer_shape[i];
                }

                // compute index
                for (size_type i = 0; i < outer_shape.size(); i++)
                {
                    auto d_idx = n / stride_sizes[i];
                    outer_index[i] = d_idx;
                    n -= d_idx * stride_sizes[i];
                }
            }
        };
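
        // Comment-only illustration: for outer_shape = {2, 3} in row-major order,
        // next_idx advances the last dimension first: {0,0} -> {0,1} -> {0,2} ->
        // {1,0}; nth_idx(4, ...) yields {1,1}, since the shape strides are {3, 1}
        // and 4 = 1 * 3 + 1 * 1.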

        template <>
        struct idx_tools<layout_type::column_major>
        {
            template <class T>
            static void next_idx(T& outer_index, T& outer_shape)
            {
                using size_type = typename T::size_type;
                size_type i = 0;
                auto sz = outer_index.size();
                for (; i < sz; ++i)
                {
                    if (outer_index[i] + 1 >= outer_shape[i])
                    {
                        outer_index[i] = 0;
                    }
                    else
                    {
                        outer_index[i]++;
                        break;
                    }
                }
            }

            template <class T>
            static void nth_idx(size_t n, T& outer_index, const T& outer_shape)
            {
                T stride_sizes;
                xt::resize_container(stride_sizes, outer_shape.size());

                using size_type = typename T::size_type;

                // compute required strides
                for (size_type i = 0; i < outer_shape.size(); i++)
                {
                    stride_sizes[i] = (i == 0) ? 1 : stride_sizes[i - 1] * outer_shape[i - 1];
                }

                // compute index
                for (size_type i = outer_shape.size(); i > 0;)
                {
                    i--;
                    auto d_idx = n / stride_sizes[i];
                    outer_index[i] = d_idx;
                    n -= d_idx * stride_sizes[i];
                }
            }
        };

        template <layout_type L, class S>
        struct check_strides_functor
        {
            using strides_type = S;

            check_strides_functor(const S& strides)
                : m_cut(L == layout_type::row_major ? 0 : strides.size())
                , m_strides(strides)
            {
            }

            template <class T, layout_type LE = L>
            std::enable_if_t<LE == layout_type::row_major, std::size_t> operator()(const T& el)
            {
                // All dimensions less than var have differing strides
                auto var = check_strides_overlap<layout_type::row_major>::get(m_strides, el.strides());
                if (var > m_cut)
                {
                    m_cut = var;
                }
                return m_cut;
            }

            template <class T, layout_type LE = L>
            std::enable_if_t<LE == layout_type::column_major, std::size_t> operator()(const T& el)
            {
                auto var = check_strides_overlap<layout_type::column_major>::get(m_strides, el.strides());
                // All dimensions >= var have differing strides
                if (var < m_cut)
                {
                    m_cut = var;
                }
                return m_cut;
            }

            template <class T>
            std::size_t operator()(const xt::xscalar<T>& /*el*/)
            {
                return m_cut;
            }

            template <class F, class... CT>
            std::size_t operator()(const xt::xfunction<F, CT...>& xf)
            {
                xt::for_each(*this, xf.arguments());
                return m_cut;
            }

        private:

            std::size_t m_cut;
            const strides_type& m_strides;
        };

        template <bool possible = false, class E1, class E2, std::enable_if_t<!has_strides<E1>::value || !possible, bool> = true>
        loop_sizes_t get_loop_sizes(const E1& e1, const E2&)
        {
            return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()};
        }

        template <bool possible = true, class E1, class E2, std::enable_if_t<has_strides<E1>::value && possible, bool> = true>
        loop_sizes_t get_loop_sizes(const E1& e1, const E2& e2)
        {
            using shape_value_type = typename E1::shape_type::value_type;

            // Try to find a row-major scheme first, where the outer loop runs over the
            // first N = `cut` dimensions and the inner loop over the remaining ones.
            bool is_row_major = true;
            auto is_zero = [](auto i)
            {
                return i == 0;
            };
            auto&& strides = e1.strides();
            auto it_bwd = std::find_if_not(strides.rbegin(), strides.rend(), is_zero);
            bool de1_row_contiguous = it_bwd != strides.rend() && *it_bwd == 1;
            auto it_fwd = std::find_if_not(strides.begin(), strides.end(), is_zero);
            bool de1_col_contiguous = it_fwd != strides.end() && *it_fwd == 1;
            if (de1_row_contiguous)
            {
                is_row_major = true;
            }
            else if (de1_col_contiguous)
            {
                is_row_major = false;
            }
            else
            {
                // No strided loop possible.
                return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()};
            }

            // Cut is the number of dimensions in the outer loop
            std::size_t cut = 0;

            if (is_row_major)
            {
                auto csf = check_strides_functor<layout_type::row_major, decltype(e1.strides())>(e1.strides());
                cut = csf(e2);
                if (cut < e1.strides().size() - 1)
                {
                    // Only let the inner loop run over a single dimension for now
                    cut = e1.strides().size() - 1;
                }
            }
            else if (!is_row_major)
            {
                auto csf = check_strides_functor<layout_type::column_major, decltype(e1.strides())>(
                    e1.strides()
                );
                cut = csf(e2);
                if (cut > 1)
                {
                    // Only let the inner loop run over a single dimension for now
                    cut = 1;
                }
            }  // no else: the fallback above has already returned

            std::size_t outer_loop_size = static_cast<std::size_t>(std::accumulate(
                e1.shape().begin(),
                e1.shape().begin() + static_cast<std::ptrdiff_t>(cut),
                shape_value_type(1),
                std::multiplies<shape_value_type>{}
            ));
            std::size_t inner_loop_size = static_cast<std::size_t>(std::accumulate(
                e1.shape().begin() + static_cast<std::ptrdiff_t>(cut),
                e1.shape().end(),
                shape_value_type(1),
                std::multiplies<shape_value_type>{}
            ));

            if (!is_row_major)
            {
                std::swap(outer_loop_size, inner_loop_size);
            }

            return {inner_loop_size > 1, is_row_major, inner_loop_size, outer_loop_size, cut, e1.dimension()};
        }
    }
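
    // Worked illustration (comment only): for a row-major destination of shape
    // {4, 3, 5} with cut = 2, the outer loop covers the first two dimensions
    // (outer_loop_size = 4 * 3 = 12) and the inner loop the last one
    // (inner_loop_size = 5), so the inner loop runs along the contiguous
    // dimension and can be vectorized.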

    template <bool simd>
    template <class E1, class E2>
    inline strided_assign_detail::loop_sizes_t strided_loop_assigner<simd>::get_loop_sizes(E1& e1, const E2& e2)
    {
        return strided_assign_detail::get_loop_sizes<simd>(e1, e2);
    }

#define strided_parallel_assign

    template <bool simd>
    template <class E1, class E2>
    inline void strided_loop_assigner<simd>::run(E1& e1, const E2& e2, const loop_sizes_t& loop_sizes)
    {
        bool is_row_major = loop_sizes.is_row_major;
        std::size_t inner_loop_size = loop_sizes.inner_loop_size;
        std::size_t outer_loop_size = loop_sizes.outer_loop_size;
        std::size_t cut = loop_sizes.cut;

        // TODO can we get rid of this and use `shape_type`?
        dynamic_shape<std::size_t> idx, max_shape;

        if (is_row_major)
        {
            xt::resize_container(idx, cut);
            max_shape.assign(e1.shape().begin(), e1.shape().begin() + static_cast<std::ptrdiff_t>(cut));
        }
        else
        {
            xt::resize_container(idx, e1.shape().size() - cut);
            max_shape.assign(e1.shape().begin() + static_cast<std::ptrdiff_t>(cut), e1.shape().end());
        }

        // add this when we have std::array index!
        // std::fill(idx.begin(), idx.end(), 0);
        using e1_value_type = typename E1::value_type;
        using e2_value_type = typename E2::value_type;
        constexpr bool needs_cast = has_assign_conversion<e1_value_type, e2_value_type>::value;
        using value_type = typename xassign_traits<E1, E2>::requested_value_type;
        using simd_type = std::conditional_t<
            std::is_same<e1_value_type, bool>::value,
            xt_simd::simd_bool_type<value_type>,
            xt_simd::simd_type<value_type>>;

        std::size_t simd_size = inner_loop_size / simd_type::size;
        std::size_t simd_rest = inner_loop_size % simd_type::size;

        auto fct_stepper = e2.stepper_begin(e1.shape());
        auto res_stepper = e1.stepper_begin(e1.shape());

        // TODO in the 1-D case this is ambiguous -- could be row-major or
        // column-major. Use the default layout to make the decision.
        std::size_t step_dim = 0;
        if (!is_row_major)  // column-major case
        {
            step_dim = cut;
        }
#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign)
        if (outer_loop_size >= XTENSOR_OPENMP_TRESHOLD / inner_loop_size)
        {
            std::size_t first_step = true;
#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx)
            for (std::size_t ox = 0; ox < outer_loop_size; ++ox)
            {
                if (first_step)
                {
                    is_row_major
                        ? strided_assign_detail::idx_tools<layout_type::row_major>::nth_idx(ox, idx, max_shape)
                        : strided_assign_detail::idx_tools<layout_type::column_major>::nth_idx(ox, idx, max_shape);

                    for (std::size_t i = 0; i < idx.size(); ++i)
                    {
                        fct_stepper.step(i + step_dim, idx[i]);
                        res_stepper.step(i + step_dim, idx[i]);
                    }
                    first_step = false;
                }

                for (std::size_t i = 0; i < simd_size; ++i)
                {
                    res_stepper.template store_simd(fct_stepper.template step_simd<value_type>());
                }
                for (std::size_t i = 0; i < simd_rest; ++i)
                {
                    *(res_stepper) = conditional_cast<needs_cast, e1_value_type>(*(fct_stepper));
                    res_stepper.step_leading();
                    fct_stepper.step_leading();
                }

                // next unaligned index
                is_row_major
                    ? strided_assign_detail::idx_tools<layout_type::row_major>::next_idx(idx, max_shape)
                    : strided_assign_detail::idx_tools<layout_type::column_major>::next_idx(idx, max_shape);

                fct_stepper.to_begin();

                // need to step E1 as well if the assignment is not contiguous (e.g. a view)
                if (!E1::contiguous_layout)
                {
                    res_stepper.to_begin();
                    for (std::size_t i = 0; i < idx.size(); ++i)
                    {
                        fct_stepper.step(i + step_dim, idx[i]);
                        res_stepper.step(i + step_dim, idx[i]);
                    }
                }
                else
                {
                    for (std::size_t i = 0; i < idx.size(); ++i)
                    {
                        fct_stepper.step(i + step_dim, idx[i]);
                    }
                }
            }
        }
        else
        {
#elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB)
        if (outer_loop_size > XTENSOR_TBB_THRESHOLD / inner_loop_size)
        {
            tbb::static_partitioner sp;
            tbb::parallel_for(
                tbb::blocked_range<size_t>(0ul, outer_loop_size),
                [&e1, &e2, is_row_major, step_dim, simd_size, simd_rest, &max_shape, &idx_ = idx](
                    const tbb::blocked_range<size_t>& r
                )
                {
                    auto idx = idx_;
                    auto fct_stepper = e2.stepper_begin(e1.shape());
                    auto res_stepper = e1.stepper_begin(e1.shape());
                    std::size_t first_step = true;
                    for (std::size_t ox = r.begin(); ox < r.end(); ++ox)
                    {
                        if (first_step)
                        {
                            is_row_major
                                ? strided_assign_detail::idx_tools<layout_type::row_major>::nth_idx(ox, idx, max_shape)
                                : strided_assign_detail::idx_tools<layout_type::column_major>::nth_idx(
                                    ox,
                                    idx,
                                    max_shape
                                );

                            for (std::size_t i = 0; i < idx.size(); ++i)
                            {
                                fct_stepper.step(i + step_dim, idx[i]);
                                res_stepper.step(i + step_dim, idx[i]);
                            }
                            first_step = false;
                        }

                        for (std::size_t i = 0; i < simd_size; ++i)
                        {
                            res_stepper.template store_simd(fct_stepper.template step_simd<value_type>());
                        }
                        for (std::size_t i = 0; i < simd_rest; ++i)
                        {
                            *(res_stepper) = conditional_cast<needs_cast, e1_value_type>(*(fct_stepper));
                            res_stepper.step_leading();
                            fct_stepper.step_leading();
                        }

                        // next unaligned index
                        is_row_major
                            ? strided_assign_detail::idx_tools<layout_type::row_major>::next_idx(idx, max_shape)
                            : strided_assign_detail::idx_tools<layout_type::column_major>::next_idx(idx, max_shape);

                        fct_stepper.to_begin();

                        // need to step E1 as well if the assignment is not contiguous (e.g. a view)
                        if (!E1::contiguous_layout)
                        {
                            res_stepper.to_begin();
                            for (std::size_t i = 0; i < idx.size(); ++i)
                            {
                                fct_stepper.step(i + step_dim, idx[i]);
                                res_stepper.step(i + step_dim, idx[i]);
                            }
                        }
                        else
                        {
                            for (std::size_t i = 0; i < idx.size(); ++i)
                            {
                                fct_stepper.step(i + step_dim, idx[i]);
                            }
                        }
                    }
                },
                sp
            );
        }
        else
        {

#endif
        for (std::size_t ox = 0; ox < outer_loop_size; ++ox)
        {
            for (std::size_t i = 0; i < simd_size; ++i)
            {
                res_stepper.store_simd(fct_stepper.template step_simd<value_type>());
            }
            for (std::size_t i = 0; i < simd_rest; ++i)
            {
                *(res_stepper) = conditional_cast<needs_cast, e1_value_type>(*(fct_stepper));
                res_stepper.step_leading();
                fct_stepper.step_leading();
            }

            is_row_major
                ? strided_assign_detail::idx_tools<layout_type::row_major>::next_idx(idx, max_shape)
                : strided_assign_detail::idx_tools<layout_type::column_major>::next_idx(idx, max_shape);

            fct_stepper.to_begin();

            // need to step E1 as well if the assignment is not contiguous (e.g. a view)
            if (!E1::contiguous_layout)
            {
                res_stepper.to_begin();
                for (std::size_t i = 0; i < idx.size(); ++i)
                {
                    fct_stepper.step(i + step_dim, idx[i]);
                    res_stepper.step(i + step_dim, idx[i]);
                }
            }
            else
            {
                for (std::size_t i = 0; i < idx.size(); ++i)
                {
                    fct_stepper.step(i + step_dim, idx[i]);
                }
            }
        }
#if (defined(XTENSOR_USE_OPENMP) || defined(XTENSOR_USE_TBB)) && defined(strided_parallel_assign)
        }
#endif
    }

    template <>
    template <class E1, class E2>
    inline void strided_loop_assigner<true>::run(E1& e1, const E2& e2)
    {
        strided_assign_detail::loop_sizes_t loop_sizes = strided_loop_assigner<true>::get_loop_sizes(e1, e2);
        if (loop_sizes.can_do_strided_assign)
        {
            run(e1, e2, loop_sizes);
        }
        else
        {
            // trigger the fallback assigner
            stepper_assigner<E1, E2, default_assignable_layout(E1::static_layout)>(e1, e2).run();
        }
    }

    template <>
    template <class E1, class E2>
    inline void strided_loop_assigner<false>::run(E1& /*e1*/, const E2& /*e2*/, const loop_sizes_t&)
    {
    }

    template <>
    template <class E1, class E2>
    inline void strided_loop_assigner<false>::run(E1& e1, const E2& e2)
    {
        // trigger the fallback assigner
        stepper_assigner<E1, E2, default_assignable_layout(E1::static_layout)>(e1, e2).run();
    }
}

#endif