xtensor
Loading...
Searching...
No Matches
xnpy.hpp
1/***************************************************************************
2 * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht *
3 * Copyright Leon Merten Lohse
4 * Copyright (c) QuantStack *
5 * *
6 * Distributed under the terms of the BSD 3-Clause License. *
7 * *
8 * The full license is in the file LICENSE, distributed with this software. *
9 ****************************************************************************/
10
11#ifndef XTENSOR_NPY_HPP
12#define XTENSOR_NPY_HPP
13
14// Derived from https://github.com/llohse/libnpy by Leon Merten Lohse,
15// relicensed from MIT License with permission
16
17#include <algorithm>
18#include <complex>
19#include <cstdint>
20#include <cstring>
21#include <fstream>
22#include <iostream>
23#include <memory>
24#include <regex>
25#include <sstream>
26#include <stdexcept>
27#include <string>
28#include <typeinfo>
29#include <vector>
30
31#include <xtl/xplatform.hpp>
32#include <xtl/xsequence.hpp>
33
34#include "xtensor/xadapt.hpp"
35#include "xtensor/xarray.hpp"
36#include "xtensor/xeval.hpp"
37#include "xtensor/xstrides.hpp"
38
39#include "xtensor_config.hpp"
40
41namespace xt
42{
43 using namespace std::string_literals;
44
45 namespace detail
46 {
47
48 const char magic_string[] = "\x93NUMPY";
49 const std::size_t magic_string_length = sizeof(magic_string) - 1;
50
51 template <class O>
52 inline void write_magic(O& ostream, unsigned char v_major = 1, unsigned char v_minor = 0)
53 {
54 ostream.write(magic_string, magic_string_length);
55 ostream.put(char(v_major));
56 ostream.put(char(v_minor));
57 }
58
59 inline void read_magic(std::istream& istream, unsigned char* v_major, unsigned char* v_minor)
60 {
61 std::unique_ptr<char[]> buf(new char[magic_string_length + 2]);
62 istream.read(buf.get(), magic_string_length + 2);
63
64 if (!istream)
65 {
66 XTENSOR_THROW(std::runtime_error, "io error: failed reading file");
67 }
68
69 for (std::size_t i = 0; i < magic_string_length; i++)
70 {
71 if (buf[i] != magic_string[i])
72 {
73 XTENSOR_THROW(std::runtime_error, "this file do not have a valid npy format.");
74 }
75 }
76
77 *v_major = static_cast<unsigned char>(buf[magic_string_length]);
78 *v_minor = static_cast<unsigned char>(buf[magic_string_length + 1]);
79 }
80
81 template <class T>
82 inline char map_type()
83 {
84 if (std::is_same<T, float>::value)
85 {
86 return 'f';
87 }
88 if (std::is_same<T, double>::value)
89 {
90 return 'f';
91 }
92 if (std::is_same<T, long double>::value)
93 {
94 return 'f';
95 }
96
97 if (std::is_same<T, char>::value)
98 {
99 return 'i';
100 }
101 if (std::is_same<T, signed char>::value)
102 {
103 return 'i';
104 }
105 if (std::is_same<T, short>::value)
106 {
107 return 'i';
108 }
109 if (std::is_same<T, int>::value)
110 {
111 return 'i';
112 }
113 if (std::is_same<T, long>::value)
114 {
115 return 'i';
116 }
117 if (std::is_same<T, long long>::value)
118 {
119 return 'i';
120 }
121
122 if (std::is_same<T, unsigned char>::value)
123 {
124 return 'u';
125 }
126 if (std::is_same<T, unsigned short>::value)
127 {
128 return 'u';
129 }
130 if (std::is_same<T, unsigned int>::value)
131 {
132 return 'u';
133 }
134 if (std::is_same<T, unsigned long>::value)
135 {
136 return 'u';
137 }
138 if (std::is_same<T, unsigned long long>::value)
139 {
140 return 'u';
141 }
142
143 if (std::is_same<T, bool>::value)
144 {
145 return 'b';
146 }
147
148 if (std::is_same<T, std::complex<float>>::value)
149 {
150 return 'c';
151 }
152 if (std::is_same<T, std::complex<double>>::value)
153 {
154 return 'c';
155 }
156 if (std::is_same<T, std::complex<long double>>::value)
157 {
158 return 'c';
159 }
160
161 XTENSOR_THROW(std::runtime_error, "Type not known.");
162 }
163
164 template <class T>
165 inline char get_endianess()
166 {
167 constexpr char little_endian_char = '<';
168 constexpr char big_endian_char = '>';
169 constexpr char no_endian_char = '|';
170
171 if (sizeof(T) <= sizeof(char))
172 {
173 return no_endian_char;
174 }
175
176 switch (xtl::endianness())
177 {
178 case xtl::endian::little_endian:
179 return little_endian_char;
180 case xtl::endian::big_endian:
181 return big_endian_char;
182 default:
183 return no_endian_char;
184 }
185 }
186
187 template <class T>
188 inline std::string build_typestring()
189 {
190 std::stringstream ss;
191 ss << get_endianess<T>() << map_type<T>() << sizeof(T);
192 return ss.str();
193 }
194
195 // Safety check function
196 inline void parse_typestring(std::string typestring)
197 {
198 std::regex re("'([<>|])([ifucb])(\\d+)'");
199 std::smatch sm;
200
201 std::regex_match(typestring, sm, re);
202 if (sm.size() != 4)
203 {
204 XTENSOR_THROW(std::runtime_error, "invalid typestring");
205 }
206 }
207
208 // Helpers for the improvised parser
209 inline std::string unwrap_s(std::string s, char delim_front, char delim_back)
210 {
211 if ((s.back() == delim_back) && (s.front() == delim_front))
212 {
213 return s.substr(1, s.length() - 2);
214 }
215 else
216 {
217 XTENSOR_THROW(std::runtime_error, "unable to unwrap");
218 }
219 }
220
221 inline std::string get_value_from_map(std::string mapstr)
222 {
223 std::size_t sep_pos = mapstr.find_first_of(":");
224 if (sep_pos == std::string::npos)
225 {
226 return "";
227 }
228
229 return mapstr.substr(sep_pos + 1);
230 }
231
232 inline void pop_char(std::string& s, char c)
233 {
234 if (s.back() == c)
235 {
236 s.pop_back();
237 }
238 }
239
240 inline void
241 parse_header(std::string header, std::string& descr, bool* fortran_order, std::vector<std::size_t>& shape)
242 {
243 // The first 6 bytes are a magic string: exactly "x93NUMPY".
244 //
245 // The next 1 byte is an unsigned byte: the major version number of the file
246 // format, e.g. x01.
247 //
248 // The next 1 byte is an unsigned byte: the minor version number of the file
249 // format, e.g. x00. Note: the version of the file format is not tied to the
250 // version of the NumPy package.
251 //
252 // The next 2 bytes form a little-endian unsigned short int: the length of the
253 // header data HEADER_LEN.
254 //
255 // The next HEADER_LEN bytes form the header data describing the array's
256 // format. It is an ASCII string which contains a Python literal expression of
257 // a dictionary. It is terminated by a newline ('n') and padded with spaces
258 // ('x20') to make the total length of the magic string + 4 + HEADER_LEN be
259 // evenly divisible by 16 for alignment purposes.
260 //
261 // The dictionary contains three keys:
262 //
263 // "descr" : dtype.descr
264 // An object that can be passed as an argument to the numpy.dtype()
265 // constructor to create the array's dtype.
266 // "fortran_order" : bool
267 // Whether the array data is Fortran-contiguous or not. Since
268 // Fortran-contiguous arrays are a common form of non-C-contiguity, we allow
269 // them to be written directly to disk for efficiency.
270 // "shape" : tuple of int
271 // The shape of the array.
272 // For repeatability and readability, this dictionary is formatted using
273 // pprint.pformat() so the keys are in alphabetic order.
274
275 // remove trailing newline
276 if (header.back() != '\n')
277 {
278 XTENSOR_THROW(std::runtime_error, "invalid header");
279 }
280 header.pop_back();
281
282 // remove all whitespaces
283 header.erase(std::remove(header.begin(), header.end(), ' '), header.end());
284
285 // unwrap dictionary
286 header = unwrap_s(header, '{', '}');
287
288 // find the positions of the 3 dictionary keys
289 std::size_t keypos_descr = header.find("'descr'");
290 std::size_t keypos_fortran = header.find("'fortran_order'");
291 std::size_t keypos_shape = header.find("'shape'");
292
293 // make sure all the keys are present
294 if (keypos_descr == std::string::npos)
295 {
296 XTENSOR_THROW(std::runtime_error, "missing 'descr' key");
297 }
298 if (keypos_fortran == std::string::npos)
299 {
300 XTENSOR_THROW(std::runtime_error, "missing 'fortran_order' key");
301 }
302 if (keypos_shape == std::string::npos)
303 {
304 XTENSOR_THROW(std::runtime_error, "missing 'shape' key");
305 }
306
307 // Make sure the keys are in order.
308 // Note that this violates the standard, which states that readers *must* not
309 // depend on the correct order here.
310 // TODO: fix
311 if (keypos_descr >= keypos_fortran || keypos_fortran >= keypos_shape)
312 {
313 XTENSOR_THROW(std::runtime_error, "header keys in wrong order");
314 }
315
316 // get the 3 key-value pairs
317 std::string keyvalue_descr;
318 keyvalue_descr = header.substr(keypos_descr, keypos_fortran - keypos_descr);
319 pop_char(keyvalue_descr, ',');
320
321 std::string keyvalue_fortran;
322 keyvalue_fortran = header.substr(keypos_fortran, keypos_shape - keypos_fortran);
323 pop_char(keyvalue_fortran, ',');
324
325 std::string keyvalue_shape;
326 keyvalue_shape = header.substr(keypos_shape, std::string::npos);
327 pop_char(keyvalue_shape, ',');
328
329 // get the values (right side of `:')
330 std::string descr_s = get_value_from_map(keyvalue_descr);
331 std::string fortran_s = get_value_from_map(keyvalue_fortran);
332 std::string shape_s = get_value_from_map(keyvalue_shape);
333
334 parse_typestring(descr_s);
335 descr = unwrap_s(descr_s, '\'', '\'');
336
337 // convert literal Python bool to C++ bool
338 if (fortran_s == "True")
339 {
340 *fortran_order = true;
341 }
342 else if (fortran_s == "False")
343 {
344 *fortran_order = false;
345 }
346 else
347 {
348 XTENSOR_THROW(std::runtime_error, "invalid fortran_order value");
349 }
350
351 // parse the shape Python tuple ( x, y, z,)
352
353 // first clear the vector
354 shape.clear();
355 shape_s = unwrap_s(shape_s, '(', ')');
356
357 // a tokenizer would be nice...
358 std::size_t pos = 0;
359 for (;;)
360 {
361 std::size_t pos_next = shape_s.find_first_of(',', pos);
362 std::string dim_s;
363
364 if (pos_next != std::string::npos)
365 {
366 dim_s = shape_s.substr(pos, pos_next - pos);
367 }
368 else
369 {
370 dim_s = shape_s.substr(pos);
371 }
372
373 if (dim_s.length() == 0)
374 {
375 if (pos_next != std::string::npos)
376 {
377 XTENSOR_THROW(std::runtime_error, "invalid shape");
378 }
379 }
380 else
381 {
382 std::stringstream ss;
383 ss << dim_s;
384 std::size_t tmp;
385 ss >> tmp;
386 shape.push_back(tmp);
387 }
388
389 if (pos_next != std::string::npos)
390 {
391 pos = ++pos_next;
392 }
393 else
394 {
395 break;
396 }
397 }
398 }
399
400 template <class O, class S>
401 inline void write_header(O& out, const std::string& descr, bool fortran_order, const S& shape)
402 {
403 std::ostringstream ss_header;
404 std::string s_fortran_order;
405 if (fortran_order)
406 {
407 s_fortran_order = "True";
408 }
409 else
410 {
411 s_fortran_order = "False";
412 }
413
414 std::string s_shape;
415 std::ostringstream ss_shape;
416 ss_shape << "(";
417 for (auto shape_it = std::begin(shape); shape_it != std::end(shape); ++shape_it)
418 {
419 ss_shape << *shape_it << ", ";
420 }
421 s_shape = ss_shape.str();
422 if (xtl::sequence_size(shape) > 1)
423 {
424 s_shape = s_shape.erase(s_shape.size() - 2);
425 }
426 else if (xtl::sequence_size(shape) == 1)
427 {
428 s_shape = s_shape.erase(s_shape.size() - 1);
429 }
430 s_shape += ")";
431
432 ss_header << "{'descr': '" << descr << "', 'fortran_order': " << s_fortran_order
433 << ", 'shape': " << s_shape << ", }";
434
435 std::size_t header_len_pre = ss_header.str().length() + 1;
436 std::size_t metadata_len = magic_string_length + 2 + 2 + header_len_pre;
437
438 unsigned char version[2] = {1, 0};
439 if (metadata_len >= 255 * 255)
440 {
441 metadata_len = magic_string_length + 2 + 4 + header_len_pre;
442 version[0] = 2;
443 version[1] = 0;
444 }
445 std::size_t padding_len = 64 - (metadata_len % 64);
446 std::string padding(padding_len, ' ');
447 ss_header << padding;
448 ss_header << std::endl;
449
450 std::string header = ss_header.str();
451
452 // write magic
453 write_magic(out, version[0], version[1]);
454
455 // write header length
456 if (version[0] == 1 && version[1] == 0)
457 {
458 char header_len_le16[2];
459 uint16_t header_len = uint16_t(header.length());
460
461 header_len_le16[0] = char((header_len >> 0) & 0xff);
462 header_len_le16[1] = char((header_len >> 8) & 0xff);
463 out.write(reinterpret_cast<char*>(header_len_le16), 2);
464 }
465 else
466 {
467 char header_len_le32[4];
468 uint32_t header_len = uint32_t(header.length());
469
470 header_len_le32[0] = char((header_len >> 0) & 0xff);
471 header_len_le32[1] = char((header_len >> 8) & 0xff);
472 header_len_le32[2] = char((header_len >> 16) & 0xff);
473 header_len_le32[3] = char((header_len >> 24) & 0xff);
474 out.write(reinterpret_cast<char*>(header_len_le32), 4);
475 }
476
477 out << header;
478 }
479
480 inline std::string read_header_1_0(std::istream& istream)
481 {
482 // read header length and convert from little endian
483 char header_len_le16[2];
484 istream.read(header_len_le16, 2);
485
486 uint16_t header_length = uint16_t(header_len_le16[0] << 0) | uint16_t(header_len_le16[1] << 8);
487
488 if ((magic_string_length + 2 + 2 + header_length) % 16 != 0)
489 {
490 // TODO: display warning
491 }
492
493 std::unique_ptr<char[]> buf(new char[header_length]);
494 istream.read(buf.get(), header_length);
495 std::string header(buf.get(), header_length);
496
497 return header;
498 }
499
500 inline std::string read_header_2_0(std::istream& istream)
501 {
502 // read header length and convert from little endian
503 char header_len_le32[4];
504 istream.read(header_len_le32, 4);
505
506 uint32_t header_length = uint32_t(header_len_le32[0] << 0) | uint32_t(header_len_le32[1] << 8)
507 | uint32_t(header_len_le32[2] << 16) | uint32_t(header_len_le32[3] << 24);
508
509 if ((magic_string_length + 2 + 4 + header_length) % 16 != 0)
510 {
511 // TODO: display warning
512 }
513
514 std::unique_ptr<char[]> buf(new char[header_length]);
515 istream.read(buf.get(), header_length);
516 std::string header(buf.get(), header_length);
517
518 return header;
519 }
520
521 struct npy_file
522 {
523 npy_file() = default;
524
525 npy_file(std::vector<std::size_t>& shape, bool fortran_order, std::string typestring)
526 : m_shape(shape)
527 , m_fortran_order(fortran_order)
528 , m_typestring(typestring)
529 {
530 // Allocate memory
531 m_word_size = std::size_t(atoi(&typestring[2]));
532 m_n_bytes = compute_size(shape) * m_word_size;
533 m_buffer = std::allocator<char>{}.allocate(m_n_bytes);
534 }
535
536 ~npy_file()
537 {
538 if (m_buffer != nullptr)
539 {
540 std::allocator<char>{}.deallocate(m_buffer, m_n_bytes);
541 }
542 }
543
544 // delete copy constructor
545 npy_file(const npy_file&) = delete;
546 npy_file& operator=(const npy_file&) = delete;
547
548 // implement move constructor and assignment
549 npy_file(npy_file&& rhs)
550 : m_shape(std::move(rhs.m_shape))
551 , m_fortran_order(std::move(rhs.m_fortran_order))
552 , m_word_size(std::move(rhs.m_word_size))
553 , m_n_bytes(std::move(rhs.m_n_bytes))
554 , m_typestring(std::move(rhs.m_typestring))
555 , m_buffer(rhs.m_buffer)
556 {
557 rhs.m_buffer = nullptr;
558 }
559
560 npy_file& operator=(npy_file&& rhs)
561 {
562 if (this != &rhs)
563 {
564 m_shape = std::move(rhs.m_shape);
565 m_fortran_order = std::move(rhs.m_fortran_order);
566 m_word_size = std::move(rhs.m_word_size);
567 m_n_bytes = std::move(rhs.m_n_bytes);
568 m_typestring = std::move(rhs.m_typestring);
569 m_buffer = rhs.m_buffer;
570 rhs.m_buffer = nullptr;
571 }
572 return *this;
573 }
574
575 template <class T, layout_type L>
576 auto cast_impl(bool check_type)
577 {
578 if (m_buffer == nullptr)
579 {
580 XTENSOR_THROW(std::runtime_error, "This npy_file has already been cast.");
581 }
582 T* ptr = reinterpret_cast<T*>(&m_buffer[0]);
583 std::vector<std::size_t> strides(m_shape.size());
584 std::size_t sz = compute_size(m_shape);
585
586 // check if the typestring matches the given one
587 if (check_type && m_typestring != detail::build_typestring<T>())
588 {
589 XTENSOR_THROW(
590 std::runtime_error,
591 "Cast error: formats not matching "s + m_typestring + " vs "s
592 + detail::build_typestring<T>()
593 );
594 }
595
596 if ((L == layout_type::column_major && !m_fortran_order)
597 || (L == layout_type::row_major && m_fortran_order))
598 {
599 XTENSOR_THROW(
600 std::runtime_error,
601 "Cast error: layout mismatch between npy file and requested layout."
602 );
603 }
604
606 m_shape,
608 strides
609 );
610 std::vector<std::size_t> shape(m_shape);
611
612 return std::make_tuple(ptr, sz, std::move(shape), std::move(strides));
613 }
614
615 template <class T, layout_type L = layout_type::dynamic>
616 auto cast(bool check_type = true) &&
617 {
618 auto cast_elems = cast_impl<T, L>(check_type);
619 m_buffer = nullptr;
620 return adapt(
621 std::move(std::get<0>(cast_elems)),
622 std::get<1>(cast_elems),
623 acquire_ownership(),
624 std::get<2>(cast_elems),
625 std::get<3>(cast_elems)
626 );
627 }
628
629 template <class T, layout_type L = layout_type::dynamic>
630 auto cast(bool check_type = true) const&
631 {
632 auto cast_elems = cast_impl<T, L>(check_type);
633 return adapt(
634 std::get<0>(cast_elems),
635 std::get<1>(cast_elems),
636 no_ownership(),
637 std::get<2>(cast_elems),
638 std::get<3>(cast_elems)
639 );
640 }
641
642 template <class T, layout_type L = layout_type::dynamic>
643 auto cast(bool check_type = true) &
644 {
645 auto cast_elems = cast_impl<T, L>(check_type);
646 return adapt(
647 std::get<0>(cast_elems),
648 std::get<1>(cast_elems),
649 no_ownership(),
650 std::get<2>(cast_elems),
651 std::get<3>(cast_elems)
652 );
653 }
654
655 char* ptr()
656 {
657 return m_buffer;
658 }
659
660 std::size_t n_bytes()
661 {
662 return m_n_bytes;
663 }
664
665 std::vector<std::size_t> m_shape;
666 bool m_fortran_order;
667 std::size_t m_word_size;
668 std::size_t m_n_bytes;
669 std::string m_typestring;
670 char* m_buffer;
671 };
672
673 inline npy_file load_npy_file(std::istream& stream)
674 {
675 // check magic bytes an version number
676 unsigned char v_major, v_minor;
677 detail::read_magic(stream, &v_major, &v_minor);
678
679 std::string header;
680
681 if (v_major == 1 && v_minor == 0)
682 {
683 header = detail::read_header_1_0(stream);
684 }
685 else if (v_major == 2 && v_minor == 0)
686 {
687 header = detail::read_header_2_0(stream);
688 }
689 else
690 {
691 XTENSOR_THROW(std::runtime_error, "unsupported file format version");
692 }
693
694 // parse header
695 bool fortran_order;
696 std::string typestr;
697
698 std::vector<std::size_t> shape;
699 detail::parse_header(header, typestr, &fortran_order, shape);
700
701 npy_file result(shape, fortran_order, typestr);
702 // read the data
703 stream.read(result.ptr(), std::streamsize((result.n_bytes())));
704 return result;
705 }
706
707 template <class O, class E>
708 inline void dump_npy_stream(O& stream, const xexpression<E>& e)
709 {
710 using value_type = typename E::value_type;
711 const E& ex = e.derived_cast();
712 auto&& eval_ex = eval(ex);
713 bool fortran_order = false;
714 if (eval_ex.layout() == layout_type::column_major && eval_ex.dimension() > 1)
715 {
716 fortran_order = true;
717 }
718
719 std::string typestring = detail::build_typestring<value_type>();
720
721 auto shape = eval_ex.shape();
722 detail::write_header(stream, typestring, fortran_order, shape);
723
724 std::size_t size = compute_size(shape);
725 stream.write(
726 reinterpret_cast<const char*>(eval_ex.data()),
727 std::streamsize((sizeof(value_type) * size))
728 );
729 }
730 } // namespace detail
731
738 template <typename E>
739 inline void dump_npy(const std::string& filename, const xexpression<E>& e)
740 {
741 std::ofstream stream(filename, std::ofstream::binary);
742 if (!stream)
743 {
744 XTENSOR_THROW(std::runtime_error, "IO Error: failed to open file: "s + filename);
745 }
746
747 detail::dump_npy_stream(stream, e);
748 }
749
755 template <typename E>
756 inline std::string dump_npy(const xexpression<E>& e)
757 {
758 std::stringstream stream;
759 detail::dump_npy_stream(stream, e);
760 return stream.str();
761 }
762
773 template <typename T, layout_type L = layout_type::dynamic>
774 inline auto load_npy(std::istream& stream)
775 {
776 detail::npy_file file = detail::load_npy_file(stream);
777 return std::move(file).cast<T, L>();
778 }
779
790 template <typename T, layout_type L = layout_type::dynamic>
791 inline auto load_npy(const std::string& filename)
792 {
793 std::ifstream stream(filename, std::ifstream::binary);
794 if (!stream)
795 {
796 XTENSOR_THROW(std::runtime_error, "io error: failed to open a file.");
797 }
798 return load_npy<T, L>(stream);
799 }
800
801} // namespace xt
802
803#endif
auto cast(E &&e) noexcept -> detail::xfunction_type_t< typename detail::cast< R >::functor, E >
Element-wise static_cast.
auto adapt(C &&container, const SC &shape, layout_type l=L)
Constructs:
auto eval(T &&t) -> std::enable_if_t< detail::is_container< std::decay_t< T > >::value, T && >
Force evaluation of xexpression.
Definition xeval.hpp:46
std::size_t compute_strides(const shape_type &shape, layout_type l, strides_type &strides)
Compute the strides given the shape and the layout of an array.
Definition xstrides.hpp:566
auto strides(const E &e, stride_type type=stride_type::normal) noexcept
Get strides of an object.
Definition xstrides.hpp:248
standard mathematical functions for xexpressions
auto load_npy(std::istream &stream)
Loads a npy file (the NumPy storage format)
Definition xnpy.hpp:774
layout_type
Definition xlayout.hpp:24
void dump_npy(const std::string &filename, const xexpression< E > &e)
Save xexpression to NumPy npy format.
Definition xnpy.hpp:739