xtensor
 
xassign.hpp
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht          *
 * Copyright (c) QuantStack                                                 *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XTENSOR_ASSIGN_HPP
#define XTENSOR_ASSIGN_HPP

#include <algorithm>
#include <functional>
#include <numeric>  // std::accumulate, used by the strided loop size computation below
#include <type_traits>
#include <utility>

#include <xtl/xcomplex.hpp>
#include <xtl/xsequence.hpp>

#include "../core/xexpression.hpp"
#include "../core/xfunction.hpp"
#include "../core/xiterator.hpp"
#include "../core/xstrides.hpp"
#include "../core/xtensor_config.hpp"
#include "../core/xtensor_forward.hpp"
#include "../utils/xutils.hpp"

#if defined(XTENSOR_USE_TBB)
#include <tbb/tbb.h>
#endif

namespace xt
{

    /********************
     * Assign functions *
     ********************/

    template <class E1, class E2>
    void assign_data(xexpression<E1>& e1, const xexpression<E2>& e2, bool trivial);

    template <class E1, class E2>
    void assign_xexpression(xexpression<E1>& e1, const xexpression<E2>& e2);

    template <class E1, class E2>
    void computed_assign(xexpression<E1>& e1, const xexpression<E2>& e2);

    template <class E1, class E2, class F>
    void scalar_computed_assign(xexpression<E1>& e1, const E2& e2, F&& f);

    template <class E1, class E2>
    void assert_compatible_shape(const xexpression<E1>& e1, const xexpression<E2>& e2);

    template <class E1, class E2>
    void strided_assign(E1& e1, const E2& e2, std::false_type /*disable*/);

    template <class E1, class E2>
    void strided_assign(E1& e1, const E2& e2, std::true_type /*enable*/);

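    // Illustrative usage sketch (not part of the original source): these free functions are the
    // entry points that xtensor's containers and views reach from their assignment operators.
    // Assuming the classic public headers, a typical flow looks like:
    //
    //     #include <xtensor/xarray.hpp>
    //     #include <xtensor/xnoalias.hpp>
    //
    //     xt::xarray<double> a = {{1., 2.}, {3., 4.}};
    //     xt::xarray<double> b;
    //     b = a + a;                // plain operator= goes through the semantic layer
    //                               // (typically via a temporary to guard against aliasing)
    //     xt::noalias(b) = a * 2.;  // noalias skips the temporary and reaches assign_xexpression
    //
    // Which concrete assigner below (linear, strided or stepper based) finally runs is decided
    // in assign_data from the traits defined further down in this file.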
    /************************
     * xexpression_assigner *
     ************************/

    template <class Tag>
    class xexpression_assigner_base;

    template <>
    class xexpression_assigner_base<xtensor_expression_tag>
    {
    public:

        template <class E1, class E2>
        static void assign_data(xexpression<E1>& e1, const xexpression<E2>& e2, bool trivial);
    };

    template <class Tag>
    class xexpression_assigner : public xexpression_assigner_base<Tag>
    {
    public:

        using base_type = xexpression_assigner_base<Tag>;

        template <class E1, class E2>
        static void assign_xexpression(E1& e1, const E2& e2);

        template <class E1, class E2>
        static void computed_assign(xexpression<E1>& e1, const xexpression<E2>& e2);

        template <class E1, class E2, class F>
        static void scalar_computed_assign(xexpression<E1>& e1, const E2& e2, F&& f);

        template <class E1, class E2>
        static void assert_compatible_shape(const xexpression<E1>& e1, const xexpression<E2>& e2);

    private:

        template <class E1, class E2>
        static bool resize(E1& e1, const E2& e2);

        template <class E1, class F, class... CT>
        static bool resize(E1& e1, const xfunction<F, CT...>& e2);
    };

    /********************
     * stepper_assigner *
     ********************/

    template <class E1, class E2, layout_type L>
    class stepper_assigner
    {
    public:

        using lhs_iterator = typename E1::stepper;
        using rhs_iterator = typename E2::const_stepper;
        using shape_type = typename E1::shape_type;
        using index_type = xindex_type_t<shape_type>;
        using size_type = typename lhs_iterator::size_type;
        using difference_type = typename lhs_iterator::difference_type;

        stepper_assigner(E1& e1, const E2& e2);

        void run();

        void step(size_type i);
        void step(size_type i, size_type n);
        void reset(size_type i);

        void to_end(layout_type);

    private:

        E1& m_e1;

        lhs_iterator m_lhs;
        rhs_iterator m_rhs;

        index_type m_index;
    };

    /*******************
     * linear_assigner *
     *******************/

    template <bool simd_assign>
    class linear_assigner
    {
    public:

        template <class E1, class E2>
        static void run(E1& e1, const E2& e2);
    };

    template <>
    class linear_assigner<false>
    {
    public:

        template <class E1, class E2>
        static void run(E1& e1, const E2& e2);

    private:

        template <class E1, class E2>
        static void run_impl(E1& e1, const E2& e2, std::true_type);

        template <class E1, class E2>
        static void run_impl(E1& e1, const E2& e2, std::false_type);
    };

    /*************************
     * strided_loop_assigner *
     *************************/

    namespace strided_assign_detail
    {
        struct loop_sizes_t
        {
            bool can_do_strided_assign;
            bool is_row_major;
            std::size_t inner_loop_size;
            std::size_t outer_loop_size;
            std::size_t cut;
            std::size_t dimension;
        };
    }

    template <bool simd>
    class strided_loop_assigner
    {
    public:

        using loop_sizes_t = strided_assign_detail::loop_sizes_t;
        // is_row_major, inner_loop_size, outer_loop_size, cut
        template <class E1, class E2>
        static void run(E1& e1, const E2& e2, const loop_sizes_t& loop_sizes);
        template <class E1, class E2>
        static loop_sizes_t get_loop_sizes(E1& e1, const E2& e2);
        template <class E1, class E2>
        static void run(E1& e1, const E2& e2);
    };

    /***********************************
     * Assign functions implementation *
     ***********************************/

    template <class E1, class E2>
    inline void assign_data(xexpression<E1>& e1, const xexpression<E2>& e2, bool trivial)
    {
        using tag = xexpression_tag_t<E1, E2>;
        xexpression_assigner_base<tag>::assign_data(e1, e2, trivial);
    }

    template <class E1, class E2>
    inline void assign_xexpression(xexpression<E1>& e1, const xexpression<E2>& e2)
    {
        if constexpr (has_assign_to<E1, E2>::value)
        {
            e2.derived_cast().assign_to(e1);
        }
        else
        {
            using tag = xexpression_tag_t<E1, E2>;
            xexpression_assigner<tag>::assign_xexpression(e1, e2);
        }
    }

    template <class E1, class E2>
    inline void computed_assign(xexpression<E1>& e1, const xexpression<E2>& e2)
    {
        using tag = xexpression_tag_t<E1, E2>;
        xexpression_assigner<tag>::computed_assign(e1, e2);
    }

    template <class E1, class E2, class F>
    inline void scalar_computed_assign(xexpression<E1>& e1, const E2& e2, F&& f)
    {
        using tag = xexpression_tag_t<E1, E2>;
        xexpression_assigner<tag>::scalar_computed_assign(e1, e2, std::forward<F>(f));
    }

    template <class E1, class E2>
    inline void assert_compatible_shape(const xexpression<E1>& e1, const xexpression<E2>& e2)
    {
        using tag = xexpression_tag_t<E1, E2>;
        xexpression_assigner<tag>::assert_compatible_shape(e1, e2);
    }

    /***************************************
     * xexpression_assigner implementation *
     ***************************************/

    namespace detail
    {
        template <class E1, class E2>
        constexpr bool linear_static_layout()
        {
            // A row_major or column_major container with a dimension <= 1 is computed as
            // layout any, leading to some performance improvements, for example when
            // assigning a col-major vector to a row-major vector, etc.
            return compute_layout(
                       select_layout<E1::static_layout, typename E1::shape_type>::value,
                       select_layout<E2::static_layout, typename E2::shape_type>::value
                   )
                   != layout_type::dynamic;
        }

        template <class E1, class E2>
        inline auto is_linear_assign(const E1& e1, const E2& e2)
            -> std::enable_if_t<has_strides<E1>::value, bool>
        {
            return (E1::contiguous_layout && E2::contiguous_layout && linear_static_layout<E1, E2>())
                   || (e1.is_contiguous() && e2.has_linear_assign(e1.strides()));
        }

        template <class E1, class E2>
        inline auto is_linear_assign(const E1&, const E2&) -> std::enable_if_t<!has_strides<E1>::value, bool>
        {
            return false;
        }

        template <class E1, class E2>
        inline bool linear_dynamic_layout(const E1& e1, const E2& e2)
        {
            return e1.is_contiguous() && e2.is_contiguous()
                   && compute_layout(e1.layout(), e2.layout()) != layout_type::dynamic;
        }
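
        // Illustrative sketch (not part of the original source): the helpers above decide whether
        // both operands can be traversed with a single flat loop. Two contiguous containers whose
        // static layouts are compatible qualify at compile time; otherwise layouts and strides are
        // checked at runtime. Assuming default row-major containers:
        //
        //     xt::xtensor<double, 2> a({3, 4});
        //     xt::xtensor<double, 2> b({3, 4});
        //     a = b;                                            // contiguous, same layout: linear
        //
        //     auto va = xt::view(a, xt::all(), xt::range(0, 2));
        //     va = xt::view(b, xt::all(), xt::range(2, 4));     // strided views: is_linear_assign
        //                                                       // fails, another assigner is used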

        template <class E, class = void>
        struct has_step_leading : std::false_type
        {
        };

        template <class E>
        struct has_step_leading<E, void_t<decltype(std::declval<E>().step_leading())>> : std::true_type
        {
        };

        template <class T>
        struct use_strided_loop
        {
            static constexpr bool stepper_deref()
            {
                return std::is_reference<typename T::stepper::reference>::value;
            }

            static constexpr bool value = has_strides<T>::value
                                          && has_step_leading<typename T::stepper>::value && stepper_deref();
        };

        template <class T>
        struct use_strided_loop<xscalar<T>>
        {
            static constexpr bool value = true;
        };

        template <class F, class... CT>
        struct use_strided_loop<xfunction<F, CT...>>
        {
            static constexpr bool value = std::conjunction<use_strided_loop<std::decay_t<CT>>...>::value;
        };

        template <class T1, class T2>
        struct conditional_promote_to_complex
        {
            static constexpr bool cond = xtl::is_gen_complex<T1>::value && !xtl::is_gen_complex<T2>::value;
            // Alternative: use std::complex<T2> or xcomplex<T2, T2, bool> depending on T1
            using type = std::conditional_t<cond, T1, T2>;
        };

        template <class T1, class T2>
        using conditional_promote_to_complex_t = typename conditional_promote_to_complex<T1, T2>::type;
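
        // Illustrative compile-time checks (not part of the original source): the alias above keeps
        // the destination's complex value type when a real-valued expression is assigned into a
        // complex container, and leaves unrelated real types untouched. Assuming <complex>:
        //
        //     static_assert(std::is_same<
        //         conditional_promote_to_complex_t<std::complex<double>, double>,
        //         std::complex<double>>::value, "real rhs is promoted to the complex lhs type");
        //     static_assert(std::is_same<
        //         conditional_promote_to_complex_t<double, float>,
        //         float>::value, "no promotion between real value types");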
    }

    template <class E1, class E2>
    class xassign_traits
    {
    private:

        using e1_value_type = typename E1::value_type;
        using e2_value_type = typename E2::value_type;

        template <class T>
        using is_bool = std::is_same<T, bool>;

        static constexpr bool is_bool_conversion()
        {
            return is_bool<e2_value_type>::value && !is_bool<e1_value_type>::value;
        }

        static constexpr bool contiguous_layout()
        {
            return E1::contiguous_layout && E2::contiguous_layout;
        }

        static constexpr bool convertible_types()
        {
            return std::is_convertible<e2_value_type, e1_value_type>::value && !is_bool_conversion();
        }

        static constexpr bool use_xsimd()
        {
            return xt_simd::simd_traits<int8_t>::size > 1;
        }

        template <class T>
        static constexpr bool simd_size_impl()
        {
            return xt_simd::simd_traits<T>::size > 1 || (is_bool<T>::value && use_xsimd());
        }

        static constexpr bool simd_size()
        {
            return simd_size_impl<e1_value_type>() && simd_size_impl<e2_value_type>();
        }

        static constexpr bool simd_interface()
        {
            return has_simd_interface<E1, requested_value_type>::value
                   && has_simd_interface<E2, requested_value_type>::value;
        }

    public:

        // constexpr methods instead of constexpr data members avoid the need of definitions at namespace
        // scope of these data members (since they are odr-used).

        static constexpr bool simd_assign()
        {
            return convertible_types() && simd_size() && simd_interface();
        }

        static constexpr bool linear_assign(const E1& e1, const E2& e2, bool trivial)
        {
            return trivial && detail::is_linear_assign(e1, e2);
        }

        static constexpr bool strided_assign()
        {
            return detail::use_strided_loop<E1>::value && detail::use_strided_loop<E2>::value;
        }

        static constexpr bool simd_linear_assign()
        {
            return contiguous_layout() && simd_assign();
        }

        static constexpr bool simd_strided_assign()
        {
            return strided_assign() && simd_assign();
        }

        static constexpr bool simd_linear_assign(const E1& e1, const E2& e2)
        {
            return simd_assign() && detail::linear_dynamic_layout(e1, e2);
        }

        using e2_requested_value_type = std::conditional_t<is_bool<e2_value_type>::value, typename E2::bool_load_type, e2_value_type>;
        using requested_value_type = detail::conditional_promote_to_complex_t<e1_value_type, e2_requested_value_type>;
    };
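
    // Illustrative sketch (not part of the original source): for two dense containers of the same
    // arithmetic value type, the predicates above are expected to line up so that the SIMD linear
    // path is taken, e.g. (assuming xsimd support is enabled):
    //
    //     using lhs_t = xt::xtensor<float, 2>;
    //     using rhs_t = xt::xtensor<float, 2>;
    //     using traits = xt::xassign_traits<lhs_t, rhs_t>;
    //
    //     static_assert(traits::simd_assign(), "convertible value types with a SIMD interface");
    //     static_assert(traits::simd_linear_assign(), "both statically contiguous as well");
    //
    // By contrast, assigning a bool expression into an arithmetic container trips
    // is_bool_conversion(), so convertible_types() and therefore simd_assign() are disabled.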

    template <class E1, class E2>
    inline void xexpression_assigner_base<xtensor_expression_tag>::assign_data(
        xexpression<E1>& e1,
        const xexpression<E2>& e2,
        bool trivial
    )
    {
        E1& de1 = e1.derived_cast();
        const E2& de2 = e2.derived_cast();
        using traits = xassign_traits<E1, E2>;

        bool linear_assign = traits::linear_assign(de1, de2, trivial);
        constexpr bool simd_assign = traits::simd_assign();
        constexpr bool simd_linear_assign = traits::simd_linear_assign();
        constexpr bool simd_strided_assign = traits::simd_strided_assign();
        if (linear_assign)
        {
            if (simd_linear_assign || traits::simd_linear_assign(de1, de2))
            {
                // Do not use linear_assigner<true> here since it will make the compiler
                // instantiate this branch even if the runtime condition is false, resulting
                // in compilation errors for expressions that do not provide a SIMD interface.
                // simd_assign is true if simd_linear_assign() or simd_linear_assign(de1, de2)
                // is true.
                linear_assigner<simd_assign>::run(de1, de2);
            }
            else
            {
                linear_assigner<false>::run(de1, de2);
            }
        }
        else if (simd_strided_assign)
        {
            strided_loop_assigner<simd_strided_assign>::run(de1, de2);
        }
        else
        {
            stepper_assigner<E1, E2, default_assignable_layout(E1::static_layout)>(de1, de2).run();
        }
    }
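
    // Illustrative summary of the dispatch above (not from the original source):
    //
    //     trivial broadcast + compatible layouts + SIMD-capable value types
    //         -> linear_assigner<true>        (flat SIMD loop over the underlying buffers)
    //     trivial broadcast, but no SIMD interface or layouts only known at runtime
    //         -> linear_assigner<false>       (flat scalar loop)
    //     non-trivial traversal, but both sides expose strides and a leading-dimension step
    //         -> strided_loop_assigner<true>  (outer index loop with a SIMD inner loop)
    //     anything else (e.g. views without a usable stride scheme)
    //         -> stepper_assigner             (generic n-dimensional stepper traversal)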

    template <class Tag>
    template <class E1, class E2>
    inline void xexpression_assigner<Tag>::assign_xexpression(E1& e1, const E2& e2)
    {
        bool trivial_broadcast = resize(e1.derived_cast(), e2.derived_cast());
        base_type::assign_data(e1, e2, trivial_broadcast);
    }

    template <class Tag>
    template <class E1, class E2>
    inline void xexpression_assigner<Tag>::computed_assign(xexpression<E1>& e1, const xexpression<E2>& e2)
    {
        using shape_type = typename E1::shape_type;
        using comperator_type = std::greater<typename shape_type::value_type>;

        using size_type = typename E1::size_type;

        E1& de1 = e1.derived_cast();
        const E2& de2 = e2.derived_cast();

        size_type dim2 = de2.dimension();
        shape_type shape = uninitialized_shape<shape_type>(dim2);

        bool trivial_broadcast = de2.broadcast_shape(shape, true);

        auto&& de1_shape = de1.shape();
        if (dim2 > de1.dimension()
            || std::lexicographical_compare(
                shape.begin(),
                shape.end(),
                de1_shape.begin(),
                de1_shape.end(),
                comperator_type()
            ))
        {
            typename E1::temporary_type tmp(shape);
            base_type::assign_data(tmp, e2, trivial_broadcast);
            de1.assign_temporary(std::move(tmp));
        }
        else
        {
            base_type::assign_data(e1, e2, trivial_broadcast);
        }
    }
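
    // Illustrative sketch (not part of the original source): computed_assign goes through a
    // temporary whenever the broadcast result exceeds the current destination, so that memory
    // still being read is not overwritten. For compound assignment on xarray, roughly:
    //
    //     xt::xarray<double> a = {1., 2., 3.};     // shape {3}
    //     a += xt::ones<double>({2, 3});           // rhs expression broadcasts to {2, 3}:
    //                                              // a temporary of shape {2, 3} is filled and
    //                                              // moved into a via assign_temporary
    //
    //     xt::xarray<double> b = xt::zeros<double>({2, 3});
    //     b += xt::ones<double>({1, 3});           // broadcast shape equals b's shape {2, 3}:
    //                                              // assigned in place, no temporary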

    template <class Tag>
    template <class E1, class E2, class F>
    inline void xexpression_assigner<Tag>::scalar_computed_assign(xexpression<E1>& e1, const E2& e2, F&& f)
    {
        E1& d = e1.derived_cast();
        using size_type = typename E1::size_type;
        auto dst = d.storage().begin();
        for (size_type i = d.size(); i > 0; --i)
        {
            *dst = f(*dst, e2);
            ++dst;
        }
    }
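
    // Illustrative sketch (not part of the original source): scalar compound assignments take
    // this path; the functor f carries the operation and the scalar is applied element by element
    // over the destination's storage, e.g.:
    //
    //     xt::xarray<double> a = {{1., 2.}, {3., 4.}};
    //     a *= 2.0;   // roughly scalar_computed_assign(a, 2.0, multiplies-functor)
    //     a += 1.0;   // same mechanism with a plus-functor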

    template <class Tag>
    template <class E1, class E2>
    inline void
    xexpression_assigner<Tag>::assert_compatible_shape(const xexpression<E1>& e1, const xexpression<E2>& e2)
    {
        const E1& de1 = e1.derived_cast();
        const E2& de2 = e2.derived_cast();
        if (!broadcastable(de2.shape(), de1.shape()))
        {
            throw_broadcast_error(de2.shape(), de1.shape());
        }
    }

    namespace detail
    {
        template <bool B, class... CT>
        struct static_trivial_broadcast;

        template <class... CT>
        struct static_trivial_broadcast<true, CT...>
        {
            static constexpr bool value = detail::promote_index<typename std::decay_t<CT>::shape_type...>::value;
        };

        template <class... CT>
        struct static_trivial_broadcast<false, CT...>
        {
            static constexpr bool value = false;
        };
    }

    template <class Tag>
    template <class E1, class E2>
    inline bool xexpression_assigner<Tag>::resize(E1& e1, const E2& e2)
    {
        // If our RHS is not an xfunction, we know that the RHS is at least potentially trivial.
        // We check the strides of the RHS in detail::is_trivial_broadcast to see if they match up,
        // so we can skip a shape copy and a call to broadcast_shape(...).
        e1.resize(e2.shape());
        return true;
    }

    template <class Tag>
    template <class E1, class F, class... CT>
    inline bool xexpression_assigner<Tag>::resize(E1& e1, const xfunction<F, CT...>& e2)
    {
        if constexpr (detail::is_fixed<typename xfunction<F, CT...>::shape_type>::value)
        {
            /*
             * If the shape of the xfunction is statically known, we can compute the broadcast triviality
             * at compile time, plus we can resize right away.
             */
            // resize in case LHS is not a fixed size container. If it is, this is a NOP
            e1.resize(typename xfunction<F, CT...>::shape_type{});
            return detail::static_trivial_broadcast<
                detail::is_fixed<typename xfunction<F, CT...>::shape_type>::value,
                CT...>::value;
        }
        else
        {
            using index_type = xindex_type_t<typename E1::shape_type>;
            using size_type = typename E1::size_type;
            size_type size = e2.dimension();
            index_type shape = uninitialized_shape<index_type>(size);
            bool trivial_broadcast = e2.broadcast_shape(shape, true);
            e1.resize(std::move(shape));
            return trivial_broadcast;
        }
    }

    /***********************************
     * stepper_assigner implementation *
     ***********************************/

    template <class FROM, class TO>
    struct is_narrowing_conversion
    {
        using argument_type = std::decay_t<FROM>;
        using result_type = std::decay_t<TO>;

        static const bool value = xtl::is_arithmetic<result_type>::value
                                  && (sizeof(result_type) < sizeof(argument_type)
                                      || (xtl::is_integral<result_type>::value
                                          && std::is_floating_point<argument_type>::value));
    };

    template <class FROM, class TO>
    struct has_sign_conversion
    {
        using argument_type = std::decay_t<FROM>;
        using result_type = std::decay_t<TO>;

        static const bool value = xtl::is_signed<argument_type>::value != xtl::is_signed<result_type>::value;
    };

    template <class FROM, class TO>
    struct has_assign_conversion
    {
        using argument_type = std::decay_t<FROM>;
        using result_type = std::decay_t<TO>;

        static const bool value = is_narrowing_conversion<argument_type, result_type>::value
                                  || has_sign_conversion<argument_type, result_type>::value;
    };
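
    // Illustrative compile-time checks (not part of the original source): these traits flag
    // element conversions that should go through an explicit cast during assignment. Given the
    // definitions above one would expect, for instance:
    //
    //     static_assert(is_narrowing_conversion<double, int>::value, "floating point to integral narrows");
    //     static_assert(has_sign_conversion<int, unsigned int>::value, "signedness changes");
    //     static_assert(!has_assign_conversion<float, double>::value, "widening needs no cast");
    //
    // has_assign_conversion drives the needs_cast flags used by the assigners below together with
    // conditional_cast.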

    template <class E1, class E2, layout_type L>
    inline stepper_assigner<E1, E2, L>::stepper_assigner(E1& e1, const E2& e2)
        : m_e1(e1)
        , m_lhs(e1.stepper_begin(e1.shape()))
        , m_rhs(e2.stepper_begin(e1.shape()))
        , m_index(xtl::make_sequence<index_type>(e1.shape().size(), size_type(0)))
    {
    }

    template <class E1, class E2, layout_type L>
    inline void stepper_assigner<E1, E2, L>::run()
    {
        using tmp_size_type = typename E1::size_type;
        using argument_type = std::decay_t<decltype(*m_rhs)>;
        using result_type = std::decay_t<decltype(*m_lhs)>;
        constexpr bool needs_cast = has_assign_conversion<argument_type, result_type>::value;

        tmp_size_type s = m_e1.size();
        for (tmp_size_type i = 0; i < s; ++i)
        {
            *m_lhs = conditional_cast<needs_cast, result_type>(*m_rhs);
            stepper_tools<L>::increment_stepper(*this, m_index, m_e1.shape());
        }
    }
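
    // Illustrative note (not part of the original source): the stepper-based loop above is the
    // fully generic fallback. It keeps an explicit multi-index and advances both steppers one
    // dimension at a time; for a shape {2, 3} destination traversed in row-major order the
    // visited indices are
    //
    //     (0,0) (0,1) (0,2) (1,0) (1,1) (1,2)
    //
    // with increment_stepper (part of the stepper/iterator machinery) performing the carry from
    // the innermost dimension outwards via the step/reset members defined below.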

    template <class E1, class E2, layout_type L>
    inline void stepper_assigner<E1, E2, L>::step(size_type i)
    {
        m_lhs.step(i);
        m_rhs.step(i);
    }

    template <class E1, class E2, layout_type L>
    inline void stepper_assigner<E1, E2, L>::step(size_type i, size_type n)
    {
        m_lhs.step(i, n);
        m_rhs.step(i, n);
    }

    template <class E1, class E2, layout_type L>
    inline void stepper_assigner<E1, E2, L>::reset(size_type i)
    {
        m_lhs.reset(i);
        m_rhs.reset(i);
    }

    template <class E1, class E2, layout_type L>
    inline void stepper_assigner<E1, E2, L>::to_end(layout_type l)
    {
        m_lhs.to_end(l);
        m_rhs.to_end(l);
    }

    /**********************************
     * linear_assigner implementation *
     **********************************/

    template <bool simd_assign>
    template <class E1, class E2>
    inline void linear_assigner<simd_assign>::run(E1& e1, const E2& e2)
    {
        using lhs_align_mode = xt_simd::container_alignment_t<E1>;
        constexpr bool is_aligned = std::is_same<lhs_align_mode, aligned_mode>::value;
        using rhs_align_mode = std::conditional_t<is_aligned, inner_aligned_mode, unaligned_mode>;
        using e1_value_type = typename E1::value_type;
        using e2_value_type = typename E2::value_type;
        using value_type = typename xassign_traits<E1, E2>::requested_value_type;
        using simd_type = xt_simd::simd_type<value_type>;
        using size_type = typename E1::size_type;
        size_type size = e1.size();
        constexpr size_type simd_size = simd_type::size;
        constexpr bool needs_cast = has_assign_conversion<e2_value_type, e1_value_type>::value;

        size_type align_begin = is_aligned ? 0 : xt_simd::get_alignment_offset(e1.data(), size, simd_size);
        size_type align_end = align_begin + ((size - align_begin) & ~(simd_size - 1));

        for (size_type i = 0; i < align_begin; ++i)
        {
            e1.data_element(i) = conditional_cast<needs_cast, e1_value_type>(e2.data_element(i));
        }

#if defined(XTENSOR_USE_TBB)
        if (size >= XTENSOR_TBB_THRESHOLD)
        {
            tbb::static_partitioner ap;
            tbb::parallel_for(
                align_begin,
                align_end,
                simd_size,
                [&e1, &e2](size_t i)
                {
                    e1.template store_simd<lhs_align_mode>(
                        i,
                        e2.template load_simd<rhs_align_mode, value_type>(i)
                    );
                },
                ap
            );
        }
        else
        {
            for (size_type i = align_begin; i < align_end; i += simd_size)
            {
                e1.template store_simd<lhs_align_mode>(i, e2.template load_simd<rhs_align_mode, value_type>(i));
            }
        }
#elif defined(XTENSOR_USE_OPENMP)
        if (size >= size_type(XTENSOR_OPENMP_TRESHOLD))
        {
#pragma omp parallel for default(none) shared(align_begin, align_end, e1, e2)
#ifndef _WIN32
            for (size_type i = align_begin; i < align_end; i += simd_size)
            {
                e1.template store_simd<lhs_align_mode>(i, e2.template load_simd<rhs_align_mode, value_type>(i));
            }
#else
            for (auto i = static_cast<std::ptrdiff_t>(align_begin); i < static_cast<std::ptrdiff_t>(align_end);
                 i += static_cast<std::ptrdiff_t>(simd_size))
            {
                size_type ui = static_cast<size_type>(i);
                e1.template store_simd<lhs_align_mode>(ui, e2.template load_simd<rhs_align_mode, value_type>(ui));
            }
#endif
        }
        else
        {
            for (size_type i = align_begin; i < align_end; i += simd_size)
            {
                e1.template store_simd<lhs_align_mode>(i, e2.template load_simd<rhs_align_mode, value_type>(i));
            }
        }
#else
        for (size_type i = align_begin; i < align_end; i += simd_size)
        {
            e1.template store_simd<lhs_align_mode>(i, e2.template load_simd<rhs_align_mode, value_type>(i));
        }
#endif
        for (size_type i = align_end; i < size; ++i)
        {
            e1.data_element(i) = conditional_cast<needs_cast, e1_value_type>(e2.data_element(i));
        }
    }
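
    // Illustrative arithmetic (not from the original source): run() splits the flat range into a
    // scalar prologue up to the first aligned element, an aligned SIMD main loop, and a scalar
    // epilogue. For example, with size = 103, simd_size = 8 and an already aligned destination
    // (align_begin = 0):
    //
    //     align_end = 0 + ((103 - 0) & ~(8 - 1)) = 96
    //
    // so elements [0, 96) are written eight at a time through store_simd/load_simd and the
    // remaining elements [96, 103) are copied by the trailing scalar loop.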

    template <class E1, class E2>
    inline void linear_assigner<false>::run(E1& e1, const E2& e2)
    {
        using is_convertible = std::
            is_convertible<typename std::decay_t<E2>::value_type, typename std::decay_t<E1>::value_type>;
        // If the types are not compatible, this function is still instantiated but never called.
        // To avoid compilation problems in effectively unused code, run_impl is empty in this case.
        run_impl(e1, e2, is_convertible());
    }

    template <class E1, class E2>
    inline void linear_assigner<false>::run_impl(E1& e1, const E2& e2, std::true_type /*is_convertible*/)
    {
        using value_type = typename E1::value_type;
        using size_type = typename E1::size_type;
        auto src = linear_begin(e2);
        auto dst = linear_begin(e1);
        size_type n = e1.size();
#if defined(XTENSOR_USE_TBB)
        tbb::static_partitioner sp;
        tbb::parallel_for(
            std::ptrdiff_t(0),
            static_cast<std::ptrdiff_t>(n),
            [&](std::ptrdiff_t i)
            {
                *(dst + i) = static_cast<value_type>(*(src + i));
            },
            sp
        );
#elif defined(XTENSOR_USE_OPENMP)
        if (n >= XTENSOR_OPENMP_TRESHOLD)
        {
#pragma omp parallel for default(none) shared(src, dst, n)
            for (std::ptrdiff_t i = std::ptrdiff_t(0); i < static_cast<std::ptrdiff_t>(n); i++)
            {
                *(dst + i) = static_cast<value_type>(*(src + i));
            }
        }
        else
        {
            for (; n > size_type(0); --n)
            {
                *dst = static_cast<value_type>(*src);
                ++src;
                ++dst;
            }
        }
#else
        for (; n > size_type(0); --n)
        {
            *dst = static_cast<value_type>(*src);
            ++src;
            ++dst;
        }
#endif
    }

    template <class E1, class E2>
    inline void linear_assigner<false>::run_impl(E1&, const E2&, std::false_type /*is_convertible*/)
    {
        XTENSOR_PRECONDITION(false, "Internal error: linear_assigner called with unrelated types.");
    }

    /****************************************
     * strided_loop_assigner implementation *
     ****************************************/

    namespace strided_assign_detail
    {
        template <layout_type layout>
        struct idx_tools;

        template <>
        struct idx_tools<layout_type::row_major>
        {
            template <class T>
            static void next_idx(T& outer_index, T& outer_shape)
            {
                auto i = outer_index.size();
                for (; i > 0; --i)
                {
                    if (outer_index[i - 1] + 1 >= outer_shape[i - 1])
                    {
                        outer_index[i - 1] = 0;
                    }
                    else
                    {
                        outer_index[i - 1]++;
                        break;
                    }
                }
            }

            template <class T>
            static void nth_idx(size_t n, T& outer_index, const T& outer_shape)
            {
                dynamic_shape<std::size_t> stride_sizes;
                xt::resize_container(stride_sizes, outer_shape.size());
                // compute strides
                using size_type = typename T::size_type;
                for (size_type i = outer_shape.size(); i > 0; i--)
                {
                    stride_sizes[i - 1] = (i == outer_shape.size()) ? 1 : stride_sizes[i] * outer_shape[i];
                }

                // compute index
                for (size_type i = 0; i < outer_shape.size(); i++)
                {
                    auto d_idx = n / stride_sizes[i];
                    outer_index[i] = d_idx;
                    n -= d_idx * stride_sizes[i];
                }
            }
        };

        template <>
        struct idx_tools<layout_type::column_major>
        {
            template <class T>
            static void next_idx(T& outer_index, T& outer_shape)
            {
                using size_type = typename T::size_type;
                size_type i = 0;
                auto sz = outer_index.size();
                for (; i < sz; ++i)
                {
                    if (outer_index[i] + 1 >= outer_shape[i])
                    {
                        outer_index[i] = 0;
                    }
                    else
                    {
                        outer_index[i]++;
                        break;
                    }
                }
            }

            template <class T>
            static void nth_idx(size_t n, T& outer_index, const T& outer_shape)
            {
                dynamic_shape<std::size_t> stride_sizes;
                xt::resize_container(stride_sizes, outer_shape.size());

                using size_type = typename T::size_type;

                // compute required strides
                for (size_type i = 0; i < outer_shape.size(); i++)
                {
                    stride_sizes[i] = (i == 0) ? 1 : stride_sizes[i - 1] * outer_shape[i - 1];
                }

                // compute index
                for (size_type i = outer_shape.size(); i > 0;)
                {
                    i--;
                    auto d_idx = n / stride_sizes[i];
                    outer_index[i] = d_idx;
                    n -= d_idx * stride_sizes[i];
                }
            }
        };

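        // Illustrative worked example (not part of the original source): next_idx advances an
        // outer-loop multi-index by one step in the given layout, and nth_idx jumps straight to
        // the n-th position (used to seed each parallel chunk). For an outer shape {2, 3}:
        //
        //     row-major:     nth_idx(4, idx, {2, 3})   ->  idx == {1, 1}
        //                    next_idx({0, 2}, {2, 3})  ->  {1, 0}
        //     column-major:  nth_idx(4, idx, {2, 3})   ->  idx == {0, 2}
        //                    next_idx({1, 0}, {2, 3})  ->  {0, 1}
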
        template <layout_type L, class S>
        struct check_strides_functor
        {
            using strides_type = S;

            check_strides_functor(const S& strides)
                : m_cut(L == layout_type::row_major ? 0 : strides.size())
                , m_strides(strides)
            {
            }

            template <class T, layout_type LE = L>
            std::enable_if_t<LE == layout_type::row_major, std::size_t> operator()(const T& el)
            {
                // All dimensions less than var have differing strides
                auto var = check_strides_overlap<layout_type::row_major>::get(m_strides, el.strides());
                if (var > m_cut)
                {
                    m_cut = var;
                }
                return m_cut;
            }

            template <class T, layout_type LE = L>
            std::enable_if_t<LE == layout_type::column_major, std::size_t> operator()(const T& el)
            {
                auto var = check_strides_overlap<layout_type::column_major>::get(m_strides, el.strides());
                // All dimensions >= var have differing strides
                if (var < m_cut)
                {
                    m_cut = var;
                }
                return m_cut;
            }

            template <class T>
            std::size_t operator()(const xt::xscalar<T>& /*el*/)
            {
                return m_cut;
            }

            template <class F, class... CT>
            std::size_t operator()(const xt::xfunction<F, CT...>& xf)
            {
                xt::for_each(*this, xf.arguments());
                return m_cut;
            }

        private:

            std::size_t m_cut;
            const strides_type& m_strides;
        };

        template <bool possible = true, class E1, class E2, std::enable_if_t<!has_strides<E1>::value || !possible, bool> = true>
        loop_sizes_t get_loop_sizes(const E1& e1, const E2&)
        {
            return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()};
        }

        template <bool possible = true, class E1, class E2, std::enable_if_t<has_strides<E1>::value && possible, bool> = true>
        loop_sizes_t get_loop_sizes(const E1& e1, const E2& e2)
        {
            using shape_value_type = typename E1::shape_type::value_type;
            bool is_row_major = true;

            // Try to find a row-major scheme first, where the outer loop runs over the first
            // N = `cut` dimensions and the inner loop runs over the remaining dimensions.
            is_row_major = true;
            auto is_zero = [](auto i)
            {
                return i == 0;
            };
            auto&& strides = e1.strides();
            auto it_bwd = std::find_if_not(strides.rbegin(), strides.rend(), is_zero);
            bool de1_row_contiguous = it_bwd != strides.rend() && *it_bwd == 1;
            auto it_fwd = std::find_if_not(strides.begin(), strides.end(), is_zero);
            bool de1_col_contiguous = it_fwd != strides.end() && *it_fwd == 1;
            if (de1_row_contiguous)
            {
                is_row_major = true;
            }
            else if (de1_col_contiguous)
            {
                is_row_major = false;
            }
            else
            {
                // No strided loop possible.
                return {false, true, 1, e1.size(), e1.dimension(), e1.dimension()};
            }

            // Cut is the number of dimensions in the outer loop
            std::size_t cut = 0;

            if (is_row_major)
            {
                auto csf = check_strides_functor<layout_type::row_major, decltype(e1.strides())>(e1.strides());
                cut = csf(e2);
                // This ensures that only one dimension is handled in the inner loop.
                if (cut < e1.strides().size() - 1)
                {
                    // Only make the inner loop go over one dimension by default for now
                    cut = e1.strides().size() - 1;
                }
            }
            else if (!is_row_major)
            {
                auto csf = check_strides_functor<layout_type::column_major, decltype(e1.strides())>(e1.strides());
                cut = csf(e2);
                if (cut > 1)
                {
                    // Only make the inner loop go over one dimension by default for now
                    cut = 1;
                }
            }  // can't reach here because this would have already triggered the fallback

            std::size_t outer_loop_size = static_cast<std::size_t>(std::accumulate(
                e1.shape().begin(),
                e1.shape().begin() + static_cast<std::ptrdiff_t>(cut),
                shape_value_type(1),
                std::multiplies<shape_value_type>{}
            ));
            std::size_t inner_loop_size = static_cast<std::size_t>(std::accumulate(
                e1.shape().begin() + static_cast<std::ptrdiff_t>(cut),
                e1.shape().end(),
                shape_value_type(1),
                std::multiplies<shape_value_type>{}
            ));

            if (!is_row_major)
            {
                std::swap(outer_loop_size, inner_loop_size);
            }

            return {inner_loop_size > 1, is_row_major, inner_loop_size, outer_loop_size, cut, e1.dimension()};
        }
    }
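
    // Illustrative worked example (not from the original source): for a dense row-major
    // destination of shape {4, 3} assigned from an expression that broadcasts a shape {3}
    // operand across the rows, get_loop_sizes is expected to keep the last dimension in the
    // inner loop:
    //
    //     cut                   = 1     // outer loop over the first dimension
    //     outer_loop_size       = 4     // product of shape[0 .. cut)
    //     inner_loop_size       = 3     // product of shape[cut .. 2)
    //     is_row_major          = true
    //     can_do_strided_assign = inner_loop_size > 1
    //
    // The column-major case mirrors this, with the cut counted from the front and the two loop
    // sizes swapped.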

    template <bool simd>
    template <class E1, class E2>
    inline strided_assign_detail::loop_sizes_t strided_loop_assigner<simd>::get_loop_sizes(E1& e1, const E2& e2)
    {
        return strided_assign_detail::get_loop_sizes<simd>(e1, e2);
    }

#define strided_parallel_assign

    template <bool simd>
    template <class E1, class E2>
    inline void strided_loop_assigner<simd>::run(E1& e1, const E2& e2, const loop_sizes_t& loop_sizes)
    {
        bool is_row_major = loop_sizes.is_row_major;
        std::size_t inner_loop_size = loop_sizes.inner_loop_size;
        std::size_t outer_loop_size = loop_sizes.outer_loop_size;
        std::size_t cut = loop_sizes.cut;

        // TODO can we get rid of this and use `shape_type`?
        dynamic_shape<std::size_t> idx, max_shape;

        if (is_row_major)
        {
            xt::resize_container(idx, cut);
            max_shape.assign(e1.shape().begin(), e1.shape().begin() + static_cast<std::ptrdiff_t>(cut));
        }
        else
        {
            xt::resize_container(idx, e1.shape().size() - cut);
            max_shape.assign(e1.shape().begin() + static_cast<std::ptrdiff_t>(cut), e1.shape().end());
        }

        // add this when we have std::array index!
        // std::fill(idx.begin(), idx.end(), 0);
        using e1_value_type = typename E1::value_type;
        using e2_value_type = typename E2::value_type;
        constexpr bool needs_cast = has_assign_conversion<e2_value_type, e1_value_type>::value;
        using value_type = typename xassign_traits<E1, E2>::requested_value_type;
        using simd_type = std::conditional_t<
            std::is_same<e1_value_type, bool>::value,
            xt_simd::simd_bool_type<value_type>,
            xt_simd::simd_type<value_type>>;

        std::size_t simd_size = inner_loop_size / simd_type::size;
        std::size_t simd_rest = inner_loop_size % simd_type::size;

        auto fct_stepper = e2.stepper_begin(e1.shape());
        auto res_stepper = e1.stepper_begin(e1.shape());

        // TODO in 1D case this is ambiguous -- could be RM or CM.
        // Use default layout to make decision
        std::size_t step_dim = 0;
        if (!is_row_major)  // column-major case
        {
            step_dim = cut;
        }
#if defined(XTENSOR_USE_OPENMP) && defined(strided_parallel_assign)
        if (outer_loop_size >= XTENSOR_OPENMP_TRESHOLD / inner_loop_size)
        {
            std::size_t first_step = true;
#pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper, res_stepper, idx)
            for (std::size_t ox = 0; ox < outer_loop_size; ++ox)
            {
                if (first_step)
                {
                    is_row_major
                        ? strided_assign_detail::idx_tools<layout_type::row_major>::nth_idx(ox, idx, max_shape)
                        : strided_assign_detail::idx_tools<layout_type::column_major>::nth_idx(ox, idx, max_shape);

                    for (std::size_t i = 0; i < idx.size(); ++i)
                    {
                        fct_stepper.step(i + step_dim, idx[i]);
                        res_stepper.step(i + step_dim, idx[i]);
                    }
                    first_step = false;
                }

                for (std::size_t i = 0; i < simd_size; ++i)
                {
                    res_stepper.store_simd(fct_stepper.template step_simd<value_type>());
                }
                for (std::size_t i = 0; i < simd_rest; ++i)
                {
                    *(res_stepper) = conditional_cast<needs_cast, e1_value_type>(*(fct_stepper));
                    res_stepper.step_leading();
                    fct_stepper.step_leading();
                }

                // next unaligned index
                is_row_major
                    ? strided_assign_detail::idx_tools<layout_type::row_major>::next_idx(idx, max_shape)
                    : strided_assign_detail::idx_tools<layout_type::column_major>::next_idx(idx, max_shape);

                fct_stepper.to_begin();

                // need to step E1 as well if not a contiguous assign (e.g. view)
                if (!E1::contiguous_layout)
                {
                    res_stepper.to_begin();
                    for (std::size_t i = 0; i < idx.size(); ++i)
                    {
                        fct_stepper.step(i + step_dim, idx[i]);
                        res_stepper.step(i + step_dim, idx[i]);
                    }
                }
                else
                {
                    for (std::size_t i = 0; i < idx.size(); ++i)
                    {
                        fct_stepper.step(i + step_dim, idx[i]);
                    }
                }
            }
        }
        else
        {
#elif defined(strided_parallel_assign) && defined(XTENSOR_USE_TBB)
        if (outer_loop_size > XTENSOR_TBB_THRESHOLD / inner_loop_size)
        {
            tbb::static_partitioner sp;
            tbb::parallel_for(
                tbb::blocked_range<size_t>(0ul, outer_loop_size),
                [&e1, &e2, is_row_major, step_dim, simd_size, simd_rest, &max_shape, &idx_ = idx](
                    const tbb::blocked_range<size_t>& r
                )
                {
                    auto idx = idx_;
                    auto fct_stepper = e2.stepper_begin(e1.shape());
                    auto res_stepper = e1.stepper_begin(e1.shape());
                    std::size_t first_step = true;
                    // #pragma omp parallel for schedule(static) firstprivate(first_step, fct_stepper,
                    // res_stepper, idx)
                    for (std::size_t ox = r.begin(); ox < r.end(); ++ox)
                    {
                        if (first_step)
                        {
                            is_row_major
                                ? strided_assign_detail::idx_tools<layout_type::row_major>::nth_idx(
                                      ox,
                                      idx,
                                      max_shape
                                  )
                                : strided_assign_detail::idx_tools<layout_type::column_major>::nth_idx(
                                      ox,
                                      idx,
                                      max_shape
                                  );

                            for (std::size_t i = 0; i < idx.size(); ++i)
                            {
                                fct_stepper.step(i + step_dim, idx[i]);
                                res_stepper.step(i + step_dim, idx[i]);
                            }
                            first_step = false;
                        }

                        for (std::size_t i = 0; i < simd_size; ++i)
                        {
                            res_stepper.store_simd(fct_stepper.template step_simd<value_type>());
                        }
                        for (std::size_t i = 0; i < simd_rest; ++i)
                        {
                            *(res_stepper) = conditional_cast<needs_cast, e1_value_type>(*(fct_stepper));
                            res_stepper.step_leading();
                            fct_stepper.step_leading();
                        }

                        // next unaligned index
                        is_row_major
                            ? strided_assign_detail::idx_tools<layout_type::row_major>::next_idx(idx, max_shape)
                            : strided_assign_detail::idx_tools<layout_type::column_major>::next_idx(idx, max_shape);

                        fct_stepper.to_begin();

                        // need to step E1 as well if not a contiguous assign (e.g. view)
                        if (!E1::contiguous_layout)
                        {
                            res_stepper.to_begin();
                            for (std::size_t i = 0; i < idx.size(); ++i)
                            {
                                fct_stepper.step(i + step_dim, idx[i]);
                                res_stepper.step(i + step_dim, idx[i]);
                            }
                        }
                        else
                        {
                            for (std::size_t i = 0; i < idx.size(); ++i)
                            {
                                fct_stepper.step(i + step_dim, idx[i]);
                            }
                        }
                    }
                },
                sp
            );
        }
        else
        {

#endif
        for (std::size_t ox = 0; ox < outer_loop_size; ++ox)
        {
            for (std::size_t i = 0; i < simd_size; ++i)
            {
                res_stepper.store_simd(fct_stepper.template step_simd<value_type>());
            }
            for (std::size_t i = 0; i < simd_rest; ++i)
            {
                *(res_stepper) = conditional_cast<needs_cast, e1_value_type>(*(fct_stepper));
                res_stepper.step_leading();
                fct_stepper.step_leading();
            }

            is_row_major
                ? strided_assign_detail::idx_tools<layout_type::row_major>::next_idx(idx, max_shape)
                : strided_assign_detail::idx_tools<layout_type::column_major>::next_idx(idx, max_shape);

            fct_stepper.to_begin();

            // need to step E1 as well if not a contiguous assign (e.g. view)
            if (!E1::contiguous_layout)
            {
                res_stepper.to_begin();
                for (std::size_t i = 0; i < idx.size(); ++i)
                {
                    fct_stepper.step(i + step_dim, idx[i]);
                    res_stepper.step(i + step_dim, idx[i]);
                }
            }
            else
            {
                for (std::size_t i = 0; i < idx.size(); ++i)
                {
                    fct_stepper.step(i + step_dim, idx[i]);
                }
            }
        }
#if (defined(XTENSOR_USE_OPENMP) || defined(XTENSOR_USE_TBB)) && defined(strided_parallel_assign)
        }
#endif
    }

    template <>
    template <class E1, class E2>
    inline void strided_loop_assigner<true>::run(E1& e1, const E2& e2)
    {
        strided_assign_detail::loop_sizes_t loop_sizes = strided_loop_assigner<true>::get_loop_sizes(e1, e2);
        if (loop_sizes.can_do_strided_assign)
        {
            run(e1, e2, loop_sizes);
        }
        else
        {
            // trigger the fallback assigner
            stepper_assigner<E1, E2, default_assignable_layout(E1::static_layout)>(e1, e2).run();
        }
    }

    template <>
    template <class E1, class E2>
    inline void strided_loop_assigner<false>::run(E1& /*e1*/, const E2& /*e2*/, const loop_sizes_t&)
    {
    }

    template <>
    template <class E1, class E2>
    inline void strided_loop_assigner<false>::run(E1& e1, const E2& e2)
    {
        // trigger the fallback assigner
        stepper_assigner<E1, E2, default_assignable_layout(E1::static_layout)>(e1, e2).run();
    }
}

#endif