xtensor
 
Loading...
Searching...
No Matches
xnpy.hpp
1/***************************************************************************
2 * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht *
3 * Copyright Leon Merten Lohse
4 * Copyright (c) QuantStack *
5 * *
6 * Distributed under the terms of the BSD 3-Clause License. *
7 * *
8 * The full license is in the file LICENSE, distributed with this software. *
9 ****************************************************************************/
10
11#ifndef XTENSOR_NPY_HPP
12#define XTENSOR_NPY_HPP
13
14// Derived from https://github.com/llohse/libnpy by Leon Merten Lohse,
15// relicensed from MIT License with permission
16
17#include <algorithm>
18#include <complex>
19#include <cstdint>
20#include <fstream>
21#include <memory>
22#include <regex>
23#include <sstream>
24#include <stdexcept>
25#include <string>
26#include <vector>
27
28#include <xtl/xplatform.hpp>
29#include <xtl/xsequence.hpp>
30
31#include "../containers/xadapt.hpp"
32#include "../containers/xarray.hpp"
33#include "../core/xeval.hpp"
34#include "../core/xstrides.hpp"
35#include "../core/xtensor_config.hpp"
36
37namespace xt
38{
39 using namespace std::string_literals;
40
41 namespace detail
42 {
43
44 const char magic_string[] = "\x93NUMPY";
45 const std::size_t magic_string_length = sizeof(magic_string) - 1;
46
47 template <class O>
48 inline void write_magic(O& ostream, unsigned char v_major = 1, unsigned char v_minor = 0)
49 {
50 ostream.write(magic_string, magic_string_length);
51 ostream.put(char(v_major));
52 ostream.put(char(v_minor));
53 }
54
55 inline void read_magic(std::istream& istream, unsigned char* v_major, unsigned char* v_minor)
56 {
57 std::unique_ptr<char[]> buf(new char[magic_string_length + 2]);
58 istream.read(buf.get(), magic_string_length + 2);
59
60 if (!istream)
61 {
62 XTENSOR_THROW(std::runtime_error, "io error: failed reading file");
63 }
64
65 for (std::size_t i = 0; i < magic_string_length; i++)
66 {
67 if (buf[i] != magic_string[i])
68 {
69 XTENSOR_THROW(std::runtime_error, "this file do not have a valid npy format.");
70 }
71 }
72
73 *v_major = static_cast<unsigned char>(buf[magic_string_length]);
74 *v_minor = static_cast<unsigned char>(buf[magic_string_length + 1]);
75 }
76
77 template <class T>
78 inline char map_type()
79 {
80 if (std::is_same<T, float>::value)
81 {
82 return 'f';
83 }
84 if (std::is_same<T, double>::value)
85 {
86 return 'f';
87 }
88 if (std::is_same<T, long double>::value)
89 {
90 return 'f';
91 }
92
93 if (std::is_same<T, char>::value)
94 {
95 return 'i';
96 }
97 if (std::is_same<T, signed char>::value)
98 {
99 return 'i';
100 }
101 if (std::is_same<T, short>::value)
102 {
103 return 'i';
104 }
105 if (std::is_same<T, int>::value)
106 {
107 return 'i';
108 }
109 if (std::is_same<T, long>::value)
110 {
111 return 'i';
112 }
113 if (std::is_same<T, long long>::value)
114 {
115 return 'i';
116 }
117
118 if (std::is_same<T, unsigned char>::value)
119 {
120 return 'u';
121 }
122 if (std::is_same<T, unsigned short>::value)
123 {
124 return 'u';
125 }
126 if (std::is_same<T, unsigned int>::value)
127 {
128 return 'u';
129 }
130 if (std::is_same<T, unsigned long>::value)
131 {
132 return 'u';
133 }
134 if (std::is_same<T, unsigned long long>::value)
135 {
136 return 'u';
137 }
138
139 if (std::is_same<T, bool>::value)
140 {
141 return 'b';
142 }
143
144 if (std::is_same<T, std::complex<float>>::value)
145 {
146 return 'c';
147 }
148 if (std::is_same<T, std::complex<double>>::value)
149 {
150 return 'c';
151 }
152 if (std::is_same<T, std::complex<long double>>::value)
153 {
154 return 'c';
155 }
156
157 XTENSOR_THROW(std::runtime_error, "Type not known.");
158 }
159
160 template <class T>
161 inline char get_endianess()
162 {
163 constexpr char little_endian_char = '<';
164 constexpr char big_endian_char = '>';
165 constexpr char no_endian_char = '|';
166
167 if (sizeof(T) <= sizeof(char))
168 {
169 return no_endian_char;
170 }
171
172 switch (xtl::endianness())
173 {
174 case xtl::endian::little_endian:
175 return little_endian_char;
176 case xtl::endian::big_endian:
177 return big_endian_char;
178 default:
179 return no_endian_char;
180 }
181 }
182
183 template <class T>
184 inline std::string build_typestring()
185 {
186 std::stringstream ss;
187 ss << get_endianess<T>() << map_type<T>() << sizeof(T);
188 return ss.str();
189 }
190
191 // Safety check function
192 inline void parse_typestring(std::string typestring)
193 {
194 std::regex re("'([<>|])([ifucb])(\\d+)'");
195 std::smatch sm;
196
197 std::regex_match(typestring, sm, re);
198 if (sm.size() != 4)
199 {
200 XTENSOR_THROW(std::runtime_error, "invalid typestring");
201 }
202 }
203
204 // Helpers for the improvised parser
205 inline std::string unwrap_s(std::string s, char delim_front, char delim_back)
206 {
207 if ((s.back() == delim_back) && (s.front() == delim_front))
208 {
209 return s.substr(1, s.length() - 2);
210 }
211 else
212 {
213 XTENSOR_THROW(std::runtime_error, "unable to unwrap");
214 }
215 }
216
217 inline std::string get_value_from_map(std::string mapstr)
218 {
219 std::size_t sep_pos = mapstr.find_first_of(":");
220 if (sep_pos == std::string::npos)
221 {
222 return "";
223 }
224
225 return mapstr.substr(sep_pos + 1);
226 }
227
228 inline void pop_char(std::string& s, char c)
229 {
230 if (s.back() == c)
231 {
232 s.pop_back();
233 }
234 }
235
236 inline void
237 parse_header(std::string header, std::string& descr, bool* fortran_order, std::vector<std::size_t>& shape)
238 {
239 // The first 6 bytes are a magic string: exactly "x93NUMPY".
240 //
241 // The next 1 byte is an unsigned byte: the major version number of the file
242 // format, e.g. x01.
243 //
244 // The next 1 byte is an unsigned byte: the minor version number of the file
245 // format, e.g. x00. Note: the version of the file format is not tied to the
246 // version of the NumPy package.
247 //
248 // The next 2 bytes form a little-endian unsigned short int: the length of the
249 // header data HEADER_LEN.
250 //
251 // The next HEADER_LEN bytes form the header data describing the array's
252 // format. It is an ASCII string which contains a Python literal expression of
253 // a dictionary. It is terminated by a newline ('n') and padded with spaces
254 // ('x20') to make the total length of the magic string + 4 + HEADER_LEN be
255 // evenly divisible by 16 for alignment purposes.
256 //
257 // The dictionary contains three keys:
258 //
259 // "descr" : dtype.descr
260 // An object that can be passed as an argument to the numpy.dtype()
261 // constructor to create the array's dtype.
262 // "fortran_order" : bool
263 // Whether the array data is Fortran-contiguous or not. Since
264 // Fortran-contiguous arrays are a common form of non-C-contiguity, we allow
265 // them to be written directly to disk for efficiency.
266 // "shape" : tuple of int
267 // The shape of the array.
268 // For repeatability and readability, this dictionary is formatted using
269 // pprint.pformat() so the keys are in alphabetic order.
270
271 // remove trailing newline
272 if (header.back() != '\n')
273 {
274 XTENSOR_THROW(std::runtime_error, "invalid header");
275 }
276 header.pop_back();
277
278 // remove all whitespaces
279 header.erase(std::remove(header.begin(), header.end(), ' '), header.end());
280
281 // unwrap dictionary
282 header = unwrap_s(header, '{', '}');
283
284 // find the positions of the 3 dictionary keys
285 std::size_t keypos_descr = header.find("'descr'");
286 std::size_t keypos_fortran = header.find("'fortran_order'");
287 std::size_t keypos_shape = header.find("'shape'");
288
289 // make sure all the keys are present
290 if (keypos_descr == std::string::npos)
291 {
292 XTENSOR_THROW(std::runtime_error, "missing 'descr' key");
293 }
294 if (keypos_fortran == std::string::npos)
295 {
296 XTENSOR_THROW(std::runtime_error, "missing 'fortran_order' key");
297 }
298 if (keypos_shape == std::string::npos)
299 {
300 XTENSOR_THROW(std::runtime_error, "missing 'shape' key");
301 }
302
303 // Make sure the keys are in order.
304 // Note that this violates the standard, which states that readers *must* not
305 // depend on the correct order here.
306 // TODO: fix
307 if (keypos_descr >= keypos_fortran || keypos_fortran >= keypos_shape)
308 {
309 XTENSOR_THROW(std::runtime_error, "header keys in wrong order");
310 }
311
312 // get the 3 key-value pairs
313 std::string keyvalue_descr;
314 keyvalue_descr = header.substr(keypos_descr, keypos_fortran - keypos_descr);
315 pop_char(keyvalue_descr, ',');
316
317 std::string keyvalue_fortran;
318 keyvalue_fortran = header.substr(keypos_fortran, keypos_shape - keypos_fortran);
319 pop_char(keyvalue_fortran, ',');
320
321 std::string keyvalue_shape;
322 keyvalue_shape = header.substr(keypos_shape, std::string::npos);
323 pop_char(keyvalue_shape, ',');
324
325 // get the values (right side of `:')
326 std::string descr_s = get_value_from_map(keyvalue_descr);
327 std::string fortran_s = get_value_from_map(keyvalue_fortran);
328 std::string shape_s = get_value_from_map(keyvalue_shape);
329
330 parse_typestring(descr_s);
331 descr = unwrap_s(descr_s, '\'', '\'');
332
333 // convert literal Python bool to C++ bool
334 if (fortran_s == "True")
335 {
336 *fortran_order = true;
337 }
338 else if (fortran_s == "False")
339 {
340 *fortran_order = false;
341 }
342 else
343 {
344 XTENSOR_THROW(std::runtime_error, "invalid fortran_order value");
345 }
346
347 // parse the shape Python tuple ( x, y, z,)
348
349 // first clear the vector
350 shape.clear();
351 shape_s = unwrap_s(shape_s, '(', ')');
352
353 // a tokenizer would be nice...
354 std::size_t pos = 0;
355 for (;;)
356 {
357 std::size_t pos_next = shape_s.find_first_of(',', pos);
358 std::string dim_s;
359
360 if (pos_next != std::string::npos)
361 {
362 dim_s = shape_s.substr(pos, pos_next - pos);
363 }
364 else
365 {
366 dim_s = shape_s.substr(pos);
367 }
368
369 if (dim_s.length() == 0)
370 {
371 if (pos_next != std::string::npos)
372 {
373 XTENSOR_THROW(std::runtime_error, "invalid shape");
374 }
375 }
376 else
377 {
378 std::stringstream ss;
379 ss << dim_s;
380 std::size_t tmp;
381 ss >> tmp;
382 shape.push_back(tmp);
383 }
384
385 if (pos_next != std::string::npos)
386 {
387 pos = ++pos_next;
388 }
389 else
390 {
391 break;
392 }
393 }
394 }
395
396 template <class O, class S>
397 inline void write_header(O& out, const std::string& descr, bool fortran_order, const S& shape)
398 {
399 std::ostringstream ss_header;
400 std::string s_fortran_order;
401 if (fortran_order)
402 {
403 s_fortran_order = "True";
404 }
405 else
406 {
407 s_fortran_order = "False";
408 }
409
410 std::string s_shape;
411 std::ostringstream ss_shape;
412 ss_shape << "(";
413 for (auto shape_it = std::begin(shape); shape_it != std::end(shape); ++shape_it)
414 {
415 ss_shape << *shape_it << ", ";
416 }
417 s_shape = ss_shape.str();
418 if (std::size(shape) > 1)
419 {
420 s_shape = s_shape.erase(s_shape.size() - 2);
421 }
422 else if (std::size(shape) == 1)
423 {
424 s_shape = s_shape.erase(s_shape.size() - 1);
425 }
426 s_shape += ")";
427
428 ss_header << "{'descr': '" << descr << "', 'fortran_order': " << s_fortran_order
429 << ", 'shape': " << s_shape << ", }";
430
431 std::size_t header_len_pre = ss_header.str().length() + 1;
432 std::size_t metadata_len = magic_string_length + 2 + 2 + header_len_pre;
433
434 unsigned char version[2] = {1, 0};
435 if (metadata_len >= 255 * 255)
436 {
437 metadata_len = magic_string_length + 2 + 4 + header_len_pre;
438 version[0] = 2;
439 version[1] = 0;
440 }
441 std::size_t padding_len = 64 - (metadata_len % 64);
442 std::string padding(padding_len, ' ');
443 ss_header << padding;
444 ss_header << std::endl;
445
446 std::string header = ss_header.str();
447
448 // write magic
449 write_magic(out, version[0], version[1]);
450
451 // write header length
452 if (version[0] == 1 && version[1] == 0)
453 {
454 char header_len_le16[2];
455 uint16_t header_len = uint16_t(header.length());
456
457 header_len_le16[0] = char((header_len >> 0) & 0xff);
458 header_len_le16[1] = char((header_len >> 8) & 0xff);
459 out.write(reinterpret_cast<char*>(header_len_le16), 2);
460 }
461 else
462 {
463 char header_len_le32[4];
464 uint32_t header_len = uint32_t(header.length());
465
466 header_len_le32[0] = char((header_len >> 0) & 0xff);
467 header_len_le32[1] = char((header_len >> 8) & 0xff);
468 header_len_le32[2] = char((header_len >> 16) & 0xff);
469 header_len_le32[3] = char((header_len >> 24) & 0xff);
470 out.write(reinterpret_cast<char*>(header_len_le32), 4);
471 }
472
473 out << header;
474 }
475
476 inline std::string read_header_1_0(std::istream& istream)
477 {
478 // read header length and convert from little endian
479 char header_len_le16[2];
480 istream.read(header_len_le16, 2);
481
482 uint16_t header_b0 = static_cast<uint16_t>(static_cast<uint8_t>(header_len_le16[0]));
483 uint16_t header_b1 = static_cast<uint16_t>(static_cast<uint8_t>(header_len_le16[1])) << 8;
484 uint16_t header_length = header_b0 | header_b1;
485
486 if ((magic_string_length + 2 + 2 + header_length) % 16 != 0)
487 {
488 // TODO: display warning
489 }
490
491 std::unique_ptr<char[]> buf(new char[header_length]);
492 istream.read(buf.get(), header_length);
493 std::string header(buf.get(), header_length);
494
495 return header;
496 }
497
498 inline std::string read_header_2_0(std::istream& istream)
499 {
500 // read header length and convert from little endian
501 char header_len_le32[4];
502 istream.read(header_len_le32, 4);
503
504 uint32_t header_b0 = static_cast<uint32_t>(static_cast<uint8_t>(header_len_le32[0]));
505 uint32_t header_b1 = static_cast<uint32_t>(static_cast<uint8_t>(header_len_le32[1])) << 8;
506 uint32_t header_b2 = static_cast<uint32_t>(static_cast<uint8_t>(header_len_le32[2])) << 16;
507 uint32_t header_b3 = static_cast<uint32_t>(static_cast<uint8_t>(header_len_le32[3])) << 24;
508 uint32_t header_length = header_b0 | header_b1 | header_b2 | header_b3;
509
510 if ((magic_string_length + 2 + 4 + header_length) % 16 != 0)
511 {
512 // TODO: display warning
513 }
514
515 std::unique_ptr<char[]> buf(new char[header_length]);
516 istream.read(buf.get(), header_length);
517 std::string header(buf.get(), header_length);
518
519 return header;
520 }
521
522 struct npy_file
523 {
524 npy_file() = default;
525
526 npy_file(std::vector<std::size_t>& shape, bool fortran_order, std::string typestring)
527 : m_shape(shape)
528 , m_fortran_order(fortran_order)
529 , m_typestring(typestring)
530 {
531 // Allocate memory
532 m_word_size = std::size_t(atoi(&typestring[2]));
533 m_n_bytes = compute_size(shape) * m_word_size;
534 m_buffer = std::allocator<char>{}.allocate(m_n_bytes);
535 }
536
537 ~npy_file()
538 {
539 if (m_buffer != nullptr)
540 {
541 std::allocator<char>{}.deallocate(m_buffer, m_n_bytes);
542 }
543 }
544
545 // delete copy constructor
546 npy_file(const npy_file&) = delete;
547 npy_file& operator=(const npy_file&) = delete;
548
549 // implement move constructor and assignment
550 npy_file(npy_file&& rhs)
551 : m_shape(std::move(rhs.m_shape))
552 , m_fortran_order(std::move(rhs.m_fortran_order))
553 , m_word_size(std::move(rhs.m_word_size))
554 , m_n_bytes(std::move(rhs.m_n_bytes))
555 , m_typestring(std::move(rhs.m_typestring))
556 , m_buffer(rhs.m_buffer)
557 {
558 rhs.m_buffer = nullptr;
559 }
560
561 npy_file& operator=(npy_file&& rhs)
562 {
563 if (this != &rhs)
564 {
565 m_shape = std::move(rhs.m_shape);
566 m_fortran_order = std::move(rhs.m_fortran_order);
567 m_word_size = std::move(rhs.m_word_size);
568 m_n_bytes = std::move(rhs.m_n_bytes);
569 m_typestring = std::move(rhs.m_typestring);
570 m_buffer = rhs.m_buffer;
571 rhs.m_buffer = nullptr;
572 }
573 return *this;
574 }
575
576 template <class T, layout_type L>
577 auto cast_impl(bool check_type)
578 {
579 if (m_buffer == nullptr)
580 {
581 XTENSOR_THROW(std::runtime_error, "This npy_file has already been cast.");
582 }
583 T* ptr = reinterpret_cast<T*>(&m_buffer[0]);
584 std::vector<std::size_t> strides(m_shape.size());
585 std::size_t sz = compute_size(m_shape);
586
587 // check if the typestring matches the given one
588 if (check_type && m_typestring != detail::build_typestring<T>())
589 {
590 XTENSOR_THROW(
591 std::runtime_error,
592 "Cast error: formats not matching "s + m_typestring + " vs "s
593 + detail::build_typestring<T>()
594 );
595 }
596
597 if ((L == layout_type::column_major && !m_fortran_order)
598 || (L == layout_type::row_major && m_fortran_order))
599 {
600 XTENSOR_THROW(
601 std::runtime_error,
602 "Cast error: layout mismatch between npy file and requested layout."
603 );
604 }
605
607 m_shape,
609 strides
610 );
611 std::vector<std::size_t> shape(m_shape);
612
613 return std::make_tuple(ptr, sz, std::move(shape), std::move(strides));
614 }
615
616 template <class T, layout_type L = layout_type::dynamic>
617 auto cast(bool check_type = true) &&
618 {
619 auto cast_elems = cast_impl<T, L>(check_type);
620 m_buffer = nullptr;
621 return adapt(
622 std::move(std::get<0>(cast_elems)),
623 std::get<1>(cast_elems),
624 acquire_ownership(),
625 std::get<2>(cast_elems),
626 std::get<3>(cast_elems)
627 );
628 }
629
630 template <class T, layout_type L = layout_type::dynamic>
631 auto cast(bool check_type = true) const&
632 {
633 auto cast_elems = cast_impl<T, L>(check_type);
634 return adapt(
635 std::get<0>(cast_elems),
636 std::get<1>(cast_elems),
637 no_ownership(),
638 std::get<2>(cast_elems),
639 std::get<3>(cast_elems)
640 );
641 }
642
643 template <class T, layout_type L = layout_type::dynamic>
644 auto cast(bool check_type = true) &
645 {
646 auto cast_elems = cast_impl<T, L>(check_type);
647 return adapt(
648 std::get<0>(cast_elems),
649 std::get<1>(cast_elems),
650 no_ownership(),
651 std::get<2>(cast_elems),
652 std::get<3>(cast_elems)
653 );
654 }
655
656 char* ptr()
657 {
658 return m_buffer;
659 }
660
661 std::size_t n_bytes()
662 {
663 return m_n_bytes;
664 }
665
666 std::vector<std::size_t> m_shape;
667 bool m_fortran_order;
668 std::size_t m_word_size;
669 std::size_t m_n_bytes;
670 std::string m_typestring;
671 char* m_buffer;
672 };
673
674 inline npy_file load_npy_file(std::istream& stream)
675 {
676 // check magic bytes an version number
677 unsigned char v_major, v_minor;
678 detail::read_magic(stream, &v_major, &v_minor);
679
680 std::string header;
681
682 if (v_major == 1 && v_minor == 0)
683 {
684 header = detail::read_header_1_0(stream);
685 }
686 else if (v_major == 2 && v_minor == 0)
687 {
688 header = detail::read_header_2_0(stream);
689 }
690 else
691 {
692 XTENSOR_THROW(std::runtime_error, "unsupported file format version");
693 }
694
695 // parse header
696 bool fortran_order;
697 std::string typestr;
698
699 std::vector<std::size_t> shape;
700 detail::parse_header(header, typestr, &fortran_order, shape);
701
702 npy_file result(shape, fortran_order, typestr);
703 // read the data
704 stream.read(result.ptr(), std::streamsize((result.n_bytes())));
705 return result;
706 }
707
708 template <class O, class E>
709 inline void dump_npy_stream(O& stream, const xexpression<E>& e)
710 {
711 using value_type = typename E::value_type;
712 const E& ex = e.derived_cast();
713 auto&& eval_ex = eval(ex);
714 bool fortran_order = false;
715 if (eval_ex.layout() == layout_type::column_major && eval_ex.dimension() > 1)
716 {
717 fortran_order = true;
718 }
719
720 std::string typestring = detail::build_typestring<value_type>();
721
722 auto shape = eval_ex.shape();
723 detail::write_header(stream, typestring, fortran_order, shape);
724
725 std::size_t size = compute_size(shape);
726 stream.write(
727 reinterpret_cast<const char*>(eval_ex.data()),
728 std::streamsize((sizeof(value_type) * size))
729 );
730 }
731 } // namespace detail
732
739 template <typename E>
740 inline void dump_npy(const std::string& filename, const xexpression<E>& e)
741 {
742 std::ofstream stream(filename, std::ofstream::binary);
743 if (!stream)
744 {
745 XTENSOR_THROW(std::runtime_error, "IO Error: failed to open file: "s + filename);
746 }
747
748 detail::dump_npy_stream(stream, e);
749 }
750
756 template <typename E>
757 inline std::string dump_npy(const xexpression<E>& e)
758 {
759 std::stringstream stream;
760 detail::dump_npy_stream(stream, e);
761 return stream.str();
762 }
763
774 template <typename T, layout_type L = layout_type::dynamic>
775 inline auto load_npy(std::istream& stream)
776 {
777 detail::npy_file file = detail::load_npy_file(stream);
778 return std::move(file).cast<T, L>();
779 }
780
791 template <typename T, layout_type L = layout_type::dynamic>
792 inline auto load_npy(const std::string& filename)
793 {
794 std::ifstream stream(filename, std::ifstream::binary);
795 if (!stream)
796 {
797 XTENSOR_THROW(std::runtime_error, "io error: failed to open a file.");
798 }
799 return load_npy<T, L>(stream);
800 }
801
802} // namespace xt
803
804#endif
Base class for xexpressions.
auto adapt(C &&container, const SC &shape, layout_type l=L)
Constructs:
auto eval(T &&t) -> std::enable_if_t< detail::is_container< std::decay_t< T > >::value, T && >
Force evaluation of xexpression.
Definition xeval.hpp:46
std::size_t compute_strides(const shape_type &shape, layout_type l, strides_type &strides)
Compute the strides given the shape and the layout of an array.
Definition xstrides.hpp:570
auto strides(const E &e, stride_type type=stride_type::normal) noexcept
Get strides of an object.
Definition xstrides.hpp:250
standard mathematical functions for xexpressions
auto load_npy(std::istream &stream)
Loads a npy file (the NumPy storage format)
Definition xnpy.hpp:775
void dump_npy(const std::string &filename, const xexpression< E > &e)
Save xexpression to NumPy npy format.
Definition xnpy.hpp:740