xtensor
 
Loading...
Searching...
No Matches
xnpy.hpp
1/***************************************************************************
2 * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht *
3 * Copyright Leon Merten Lohse
4 * Copyright (c) QuantStack *
5 * *
6 * Distributed under the terms of the BSD 3-Clause License. *
7 * *
8 * The full license is in the file LICENSE, distributed with this software. *
9 ****************************************************************************/
10
11#ifndef XTENSOR_NPY_HPP
12#define XTENSOR_NPY_HPP
13
14// Derived from https://github.com/llohse/libnpy by Leon Merten Lohse,
15// relicensed from MIT License with permission
16
17#include <algorithm>
18#include <complex>
19#include <cstdint>
20#include <cstring>
21#include <fstream>
22#include <iostream>
23#include <memory>
24#include <regex>
25#include <sstream>
26#include <stdexcept>
27#include <string>
28#include <typeinfo>
29#include <vector>
30
31#include <xtl/xplatform.hpp>
32#include <xtl/xsequence.hpp>
33
34#include "../containers/xadapt.hpp"
35#include "../containers/xarray.hpp"
36#include "../core/xeval.hpp"
37#include "../core/xstrides.hpp"
38#include "../core/xtensor_config.hpp"
39
40namespace xt
41{
42 using namespace std::string_literals;
43
44 namespace detail
45 {
46
47 const char magic_string[] = "\x93NUMPY";
48 const std::size_t magic_string_length = sizeof(magic_string) - 1;
49
50 template <class O>
51 inline void write_magic(O& ostream, unsigned char v_major = 1, unsigned char v_minor = 0)
52 {
53 ostream.write(magic_string, magic_string_length);
54 ostream.put(char(v_major));
55 ostream.put(char(v_minor));
56 }
57
58 inline void read_magic(std::istream& istream, unsigned char* v_major, unsigned char* v_minor)
59 {
60 std::unique_ptr<char[]> buf(new char[magic_string_length + 2]);
61 istream.read(buf.get(), magic_string_length + 2);
62
63 if (!istream)
64 {
65 XTENSOR_THROW(std::runtime_error, "io error: failed reading file");
66 }
67
68 for (std::size_t i = 0; i < magic_string_length; i++)
69 {
70 if (buf[i] != magic_string[i])
71 {
72 XTENSOR_THROW(std::runtime_error, "this file do not have a valid npy format.");
73 }
74 }
75
76 *v_major = static_cast<unsigned char>(buf[magic_string_length]);
77 *v_minor = static_cast<unsigned char>(buf[magic_string_length + 1]);
78 }
79
80 template <class T>
81 inline char map_type()
82 {
83 if (std::is_same<T, float>::value)
84 {
85 return 'f';
86 }
87 if (std::is_same<T, double>::value)
88 {
89 return 'f';
90 }
91 if (std::is_same<T, long double>::value)
92 {
93 return 'f';
94 }
95
96 if (std::is_same<T, char>::value)
97 {
98 return 'i';
99 }
100 if (std::is_same<T, signed char>::value)
101 {
102 return 'i';
103 }
104 if (std::is_same<T, short>::value)
105 {
106 return 'i';
107 }
108 if (std::is_same<T, int>::value)
109 {
110 return 'i';
111 }
112 if (std::is_same<T, long>::value)
113 {
114 return 'i';
115 }
116 if (std::is_same<T, long long>::value)
117 {
118 return 'i';
119 }
120
121 if (std::is_same<T, unsigned char>::value)
122 {
123 return 'u';
124 }
125 if (std::is_same<T, unsigned short>::value)
126 {
127 return 'u';
128 }
129 if (std::is_same<T, unsigned int>::value)
130 {
131 return 'u';
132 }
133 if (std::is_same<T, unsigned long>::value)
134 {
135 return 'u';
136 }
137 if (std::is_same<T, unsigned long long>::value)
138 {
139 return 'u';
140 }
141
142 if (std::is_same<T, bool>::value)
143 {
144 return 'b';
145 }
146
147 if (std::is_same<T, std::complex<float>>::value)
148 {
149 return 'c';
150 }
151 if (std::is_same<T, std::complex<double>>::value)
152 {
153 return 'c';
154 }
155 if (std::is_same<T, std::complex<long double>>::value)
156 {
157 return 'c';
158 }
159
160 XTENSOR_THROW(std::runtime_error, "Type not known.");
161 }
162
163 template <class T>
164 inline char get_endianess()
165 {
166 constexpr char little_endian_char = '<';
167 constexpr char big_endian_char = '>';
168 constexpr char no_endian_char = '|';
169
170 if (sizeof(T) <= sizeof(char))
171 {
172 return no_endian_char;
173 }
174
175 switch (xtl::endianness())
176 {
177 case xtl::endian::little_endian:
178 return little_endian_char;
179 case xtl::endian::big_endian:
180 return big_endian_char;
181 default:
182 return no_endian_char;
183 }
184 }
185
186 template <class T>
187 inline std::string build_typestring()
188 {
189 std::stringstream ss;
190 ss << get_endianess<T>() << map_type<T>() << sizeof(T);
191 return ss.str();
192 }
193
194 // Safety check function
195 inline void parse_typestring(std::string typestring)
196 {
197 std::regex re("'([<>|])([ifucb])(\\d+)'");
198 std::smatch sm;
199
200 std::regex_match(typestring, sm, re);
201 if (sm.size() != 4)
202 {
203 XTENSOR_THROW(std::runtime_error, "invalid typestring");
204 }
205 }
206
207 // Helpers for the improvised parser
208 inline std::string unwrap_s(std::string s, char delim_front, char delim_back)
209 {
210 if ((s.back() == delim_back) && (s.front() == delim_front))
211 {
212 return s.substr(1, s.length() - 2);
213 }
214 else
215 {
216 XTENSOR_THROW(std::runtime_error, "unable to unwrap");
217 }
218 }
219
220 inline std::string get_value_from_map(std::string mapstr)
221 {
222 std::size_t sep_pos = mapstr.find_first_of(":");
223 if (sep_pos == std::string::npos)
224 {
225 return "";
226 }
227
228 return mapstr.substr(sep_pos + 1);
229 }
230
231 inline void pop_char(std::string& s, char c)
232 {
233 if (s.back() == c)
234 {
235 s.pop_back();
236 }
237 }
238
239 inline void
240 parse_header(std::string header, std::string& descr, bool* fortran_order, std::vector<std::size_t>& shape)
241 {
242 // The first 6 bytes are a magic string: exactly "x93NUMPY".
243 //
244 // The next 1 byte is an unsigned byte: the major version number of the file
245 // format, e.g. x01.
246 //
247 // The next 1 byte is an unsigned byte: the minor version number of the file
248 // format, e.g. x00. Note: the version of the file format is not tied to the
249 // version of the NumPy package.
250 //
251 // The next 2 bytes form a little-endian unsigned short int: the length of the
252 // header data HEADER_LEN.
253 //
254 // The next HEADER_LEN bytes form the header data describing the array's
255 // format. It is an ASCII string which contains a Python literal expression of
256 // a dictionary. It is terminated by a newline ('n') and padded with spaces
257 // ('x20') to make the total length of the magic string + 4 + HEADER_LEN be
258 // evenly divisible by 16 for alignment purposes.
259 //
260 // The dictionary contains three keys:
261 //
262 // "descr" : dtype.descr
263 // An object that can be passed as an argument to the numpy.dtype()
264 // constructor to create the array's dtype.
265 // "fortran_order" : bool
266 // Whether the array data is Fortran-contiguous or not. Since
267 // Fortran-contiguous arrays are a common form of non-C-contiguity, we allow
268 // them to be written directly to disk for efficiency.
269 // "shape" : tuple of int
270 // The shape of the array.
271 // For repeatability and readability, this dictionary is formatted using
272 // pprint.pformat() so the keys are in alphabetic order.
273
274 // remove trailing newline
275 if (header.back() != '\n')
276 {
277 XTENSOR_THROW(std::runtime_error, "invalid header");
278 }
279 header.pop_back();
280
281 // remove all whitespaces
282 header.erase(std::remove(header.begin(), header.end(), ' '), header.end());
283
284 // unwrap dictionary
285 header = unwrap_s(header, '{', '}');
286
287 // find the positions of the 3 dictionary keys
288 std::size_t keypos_descr = header.find("'descr'");
289 std::size_t keypos_fortran = header.find("'fortran_order'");
290 std::size_t keypos_shape = header.find("'shape'");
291
292 // make sure all the keys are present
293 if (keypos_descr == std::string::npos)
294 {
295 XTENSOR_THROW(std::runtime_error, "missing 'descr' key");
296 }
297 if (keypos_fortran == std::string::npos)
298 {
299 XTENSOR_THROW(std::runtime_error, "missing 'fortran_order' key");
300 }
301 if (keypos_shape == std::string::npos)
302 {
303 XTENSOR_THROW(std::runtime_error, "missing 'shape' key");
304 }
305
306 // Make sure the keys are in order.
307 // Note that this violates the standard, which states that readers *must* not
308 // depend on the correct order here.
309 // TODO: fix
310 if (keypos_descr >= keypos_fortran || keypos_fortran >= keypos_shape)
311 {
312 XTENSOR_THROW(std::runtime_error, "header keys in wrong order");
313 }
314
315 // get the 3 key-value pairs
316 std::string keyvalue_descr;
317 keyvalue_descr = header.substr(keypos_descr, keypos_fortran - keypos_descr);
318 pop_char(keyvalue_descr, ',');
319
320 std::string keyvalue_fortran;
321 keyvalue_fortran = header.substr(keypos_fortran, keypos_shape - keypos_fortran);
322 pop_char(keyvalue_fortran, ',');
323
324 std::string keyvalue_shape;
325 keyvalue_shape = header.substr(keypos_shape, std::string::npos);
326 pop_char(keyvalue_shape, ',');
327
328 // get the values (right side of `:')
329 std::string descr_s = get_value_from_map(keyvalue_descr);
330 std::string fortran_s = get_value_from_map(keyvalue_fortran);
331 std::string shape_s = get_value_from_map(keyvalue_shape);
332
333 parse_typestring(descr_s);
334 descr = unwrap_s(descr_s, '\'', '\'');
335
336 // convert literal Python bool to C++ bool
337 if (fortran_s == "True")
338 {
339 *fortran_order = true;
340 }
341 else if (fortran_s == "False")
342 {
343 *fortran_order = false;
344 }
345 else
346 {
347 XTENSOR_THROW(std::runtime_error, "invalid fortran_order value");
348 }
349
350 // parse the shape Python tuple ( x, y, z,)
351
352 // first clear the vector
353 shape.clear();
354 shape_s = unwrap_s(shape_s, '(', ')');
355
356 // a tokenizer would be nice...
357 std::size_t pos = 0;
358 for (;;)
359 {
360 std::size_t pos_next = shape_s.find_first_of(',', pos);
361 std::string dim_s;
362
363 if (pos_next != std::string::npos)
364 {
365 dim_s = shape_s.substr(pos, pos_next - pos);
366 }
367 else
368 {
369 dim_s = shape_s.substr(pos);
370 }
371
372 if (dim_s.length() == 0)
373 {
374 if (pos_next != std::string::npos)
375 {
376 XTENSOR_THROW(std::runtime_error, "invalid shape");
377 }
378 }
379 else
380 {
381 std::stringstream ss;
382 ss << dim_s;
383 std::size_t tmp;
384 ss >> tmp;
385 shape.push_back(tmp);
386 }
387
388 if (pos_next != std::string::npos)
389 {
390 pos = ++pos_next;
391 }
392 else
393 {
394 break;
395 }
396 }
397 }
398
399 template <class O, class S>
400 inline void write_header(O& out, const std::string& descr, bool fortran_order, const S& shape)
401 {
402 std::ostringstream ss_header;
403 std::string s_fortran_order;
404 if (fortran_order)
405 {
406 s_fortran_order = "True";
407 }
408 else
409 {
410 s_fortran_order = "False";
411 }
412
413 std::string s_shape;
414 std::ostringstream ss_shape;
415 ss_shape << "(";
416 for (auto shape_it = std::begin(shape); shape_it != std::end(shape); ++shape_it)
417 {
418 ss_shape << *shape_it << ", ";
419 }
420 s_shape = ss_shape.str();
421 if (std::size(shape) > 1)
422 {
423 s_shape = s_shape.erase(s_shape.size() - 2);
424 }
425 else if (std::size(shape) == 1)
426 {
427 s_shape = s_shape.erase(s_shape.size() - 1);
428 }
429 s_shape += ")";
430
431 ss_header << "{'descr': '" << descr << "', 'fortran_order': " << s_fortran_order
432 << ", 'shape': " << s_shape << ", }";
433
434 std::size_t header_len_pre = ss_header.str().length() + 1;
435 std::size_t metadata_len = magic_string_length + 2 + 2 + header_len_pre;
436
437 unsigned char version[2] = {1, 0};
438 if (metadata_len >= 255 * 255)
439 {
440 metadata_len = magic_string_length + 2 + 4 + header_len_pre;
441 version[0] = 2;
442 version[1] = 0;
443 }
444 std::size_t padding_len = 64 - (metadata_len % 64);
445 std::string padding(padding_len, ' ');
446 ss_header << padding;
447 ss_header << std::endl;
448
449 std::string header = ss_header.str();
450
451 // write magic
452 write_magic(out, version[0], version[1]);
453
454 // write header length
455 if (version[0] == 1 && version[1] == 0)
456 {
457 char header_len_le16[2];
458 uint16_t header_len = uint16_t(header.length());
459
460 header_len_le16[0] = char((header_len >> 0) & 0xff);
461 header_len_le16[1] = char((header_len >> 8) & 0xff);
462 out.write(reinterpret_cast<char*>(header_len_le16), 2);
463 }
464 else
465 {
466 char header_len_le32[4];
467 uint32_t header_len = uint32_t(header.length());
468
469 header_len_le32[0] = char((header_len >> 0) & 0xff);
470 header_len_le32[1] = char((header_len >> 8) & 0xff);
471 header_len_le32[2] = char((header_len >> 16) & 0xff);
472 header_len_le32[3] = char((header_len >> 24) & 0xff);
473 out.write(reinterpret_cast<char*>(header_len_le32), 4);
474 }
475
476 out << header;
477 }
478
479 inline std::string read_header_1_0(std::istream& istream)
480 {
481 // read header length and convert from little endian
482 char header_len_le16[2];
483 istream.read(header_len_le16, 2);
484
485 uint16_t header_length = uint16_t(header_len_le16[0] << 0) | uint16_t(header_len_le16[1] << 8);
486
487 if ((magic_string_length + 2 + 2 + header_length) % 16 != 0)
488 {
489 // TODO: display warning
490 }
491
492 std::unique_ptr<char[]> buf(new char[header_length]);
493 istream.read(buf.get(), header_length);
494 std::string header(buf.get(), header_length);
495
496 return header;
497 }
498
499 inline std::string read_header_2_0(std::istream& istream)
500 {
501 // read header length and convert from little endian
502 char header_len_le32[4];
503 istream.read(header_len_le32, 4);
504
505 uint32_t header_length = uint32_t(header_len_le32[0] << 0) | uint32_t(header_len_le32[1] << 8)
506 | uint32_t(header_len_le32[2] << 16) | uint32_t(header_len_le32[3] << 24);
507
508 if ((magic_string_length + 2 + 4 + header_length) % 16 != 0)
509 {
510 // TODO: display warning
511 }
512
513 std::unique_ptr<char[]> buf(new char[header_length]);
514 istream.read(buf.get(), header_length);
515 std::string header(buf.get(), header_length);
516
517 return header;
518 }
519
520 struct npy_file
521 {
522 npy_file() = default;
523
524 npy_file(std::vector<std::size_t>& shape, bool fortran_order, std::string typestring)
525 : m_shape(shape)
526 , m_fortran_order(fortran_order)
527 , m_typestring(typestring)
528 {
529 // Allocate memory
530 m_word_size = std::size_t(atoi(&typestring[2]));
531 m_n_bytes = compute_size(shape) * m_word_size;
532 m_buffer = std::allocator<char>{}.allocate(m_n_bytes);
533 }
534
535 ~npy_file()
536 {
537 if (m_buffer != nullptr)
538 {
539 std::allocator<char>{}.deallocate(m_buffer, m_n_bytes);
540 }
541 }
542
543 // delete copy constructor
544 npy_file(const npy_file&) = delete;
545 npy_file& operator=(const npy_file&) = delete;
546
547 // implement move constructor and assignment
548 npy_file(npy_file&& rhs)
549 : m_shape(std::move(rhs.m_shape))
550 , m_fortran_order(std::move(rhs.m_fortran_order))
551 , m_word_size(std::move(rhs.m_word_size))
552 , m_n_bytes(std::move(rhs.m_n_bytes))
553 , m_typestring(std::move(rhs.m_typestring))
554 , m_buffer(rhs.m_buffer)
555 {
556 rhs.m_buffer = nullptr;
557 }
558
559 npy_file& operator=(npy_file&& rhs)
560 {
561 if (this != &rhs)
562 {
563 m_shape = std::move(rhs.m_shape);
564 m_fortran_order = std::move(rhs.m_fortran_order);
565 m_word_size = std::move(rhs.m_word_size);
566 m_n_bytes = std::move(rhs.m_n_bytes);
567 m_typestring = std::move(rhs.m_typestring);
568 m_buffer = rhs.m_buffer;
569 rhs.m_buffer = nullptr;
570 }
571 return *this;
572 }
573
574 template <class T, layout_type L>
575 auto cast_impl(bool check_type)
576 {
577 if (m_buffer == nullptr)
578 {
579 XTENSOR_THROW(std::runtime_error, "This npy_file has already been cast.");
580 }
581 T* ptr = reinterpret_cast<T*>(&m_buffer[0]);
582 std::vector<std::size_t> strides(m_shape.size());
583 std::size_t sz = compute_size(m_shape);
584
585 // check if the typestring matches the given one
586 if (check_type && m_typestring != detail::build_typestring<T>())
587 {
588 XTENSOR_THROW(
589 std::runtime_error,
590 "Cast error: formats not matching "s + m_typestring + " vs "s
591 + detail::build_typestring<T>()
592 );
593 }
594
595 if ((L == layout_type::column_major && !m_fortran_order)
596 || (L == layout_type::row_major && m_fortran_order))
597 {
598 XTENSOR_THROW(
599 std::runtime_error,
600 "Cast error: layout mismatch between npy file and requested layout."
601 );
602 }
603
605 m_shape,
607 strides
608 );
609 std::vector<std::size_t> shape(m_shape);
610
611 return std::make_tuple(ptr, sz, std::move(shape), std::move(strides));
612 }
613
614 template <class T, layout_type L = layout_type::dynamic>
615 auto cast(bool check_type = true) &&
616 {
617 auto cast_elems = cast_impl<T, L>(check_type);
618 m_buffer = nullptr;
619 return adapt(
620 std::move(std::get<0>(cast_elems)),
621 std::get<1>(cast_elems),
622 acquire_ownership(),
623 std::get<2>(cast_elems),
624 std::get<3>(cast_elems)
625 );
626 }
627
628 template <class T, layout_type L = layout_type::dynamic>
629 auto cast(bool check_type = true) const&
630 {
631 auto cast_elems = cast_impl<T, L>(check_type);
632 return adapt(
633 std::get<0>(cast_elems),
634 std::get<1>(cast_elems),
635 no_ownership(),
636 std::get<2>(cast_elems),
637 std::get<3>(cast_elems)
638 );
639 }
640
641 template <class T, layout_type L = layout_type::dynamic>
642 auto cast(bool check_type = true) &
643 {
644 auto cast_elems = cast_impl<T, L>(check_type);
645 return adapt(
646 std::get<0>(cast_elems),
647 std::get<1>(cast_elems),
648 no_ownership(),
649 std::get<2>(cast_elems),
650 std::get<3>(cast_elems)
651 );
652 }
653
654 char* ptr()
655 {
656 return m_buffer;
657 }
658
659 std::size_t n_bytes()
660 {
661 return m_n_bytes;
662 }
663
664 std::vector<std::size_t> m_shape;
665 bool m_fortran_order;
666 std::size_t m_word_size;
667 std::size_t m_n_bytes;
668 std::string m_typestring;
669 char* m_buffer;
670 };
671
672 inline npy_file load_npy_file(std::istream& stream)
673 {
674 // check magic bytes an version number
675 unsigned char v_major, v_minor;
676 detail::read_magic(stream, &v_major, &v_minor);
677
678 std::string header;
679
680 if (v_major == 1 && v_minor == 0)
681 {
682 header = detail::read_header_1_0(stream);
683 }
684 else if (v_major == 2 && v_minor == 0)
685 {
686 header = detail::read_header_2_0(stream);
687 }
688 else
689 {
690 XTENSOR_THROW(std::runtime_error, "unsupported file format version");
691 }
692
693 // parse header
694 bool fortran_order;
695 std::string typestr;
696
697 std::vector<std::size_t> shape;
698 detail::parse_header(header, typestr, &fortran_order, shape);
699
700 npy_file result(shape, fortran_order, typestr);
701 // read the data
702 stream.read(result.ptr(), std::streamsize((result.n_bytes())));
703 return result;
704 }
705
706 template <class O, class E>
707 inline void dump_npy_stream(O& stream, const xexpression<E>& e)
708 {
709 using value_type = typename E::value_type;
710 const E& ex = e.derived_cast();
711 auto&& eval_ex = eval(ex);
712 bool fortran_order = false;
713 if (eval_ex.layout() == layout_type::column_major && eval_ex.dimension() > 1)
714 {
715 fortran_order = true;
716 }
717
718 std::string typestring = detail::build_typestring<value_type>();
719
720 auto shape = eval_ex.shape();
721 detail::write_header(stream, typestring, fortran_order, shape);
722
723 std::size_t size = compute_size(shape);
724 stream.write(
725 reinterpret_cast<const char*>(eval_ex.data()),
726 std::streamsize((sizeof(value_type) * size))
727 );
728 }
729 } // namespace detail
730
737 template <typename E>
738 inline void dump_npy(const std::string& filename, const xexpression<E>& e)
739 {
740 std::ofstream stream(filename, std::ofstream::binary);
741 if (!stream)
742 {
743 XTENSOR_THROW(std::runtime_error, "IO Error: failed to open file: "s + filename);
744 }
745
746 detail::dump_npy_stream(stream, e);
747 }
748
754 template <typename E>
755 inline std::string dump_npy(const xexpression<E>& e)
756 {
757 std::stringstream stream;
758 detail::dump_npy_stream(stream, e);
759 return stream.str();
760 }
761
772 template <typename T, layout_type L = layout_type::dynamic>
773 inline auto load_npy(std::istream& stream)
774 {
775 detail::npy_file file = detail::load_npy_file(stream);
776 return std::move(file).cast<T, L>();
777 }
778
789 template <typename T, layout_type L = layout_type::dynamic>
790 inline auto load_npy(const std::string& filename)
791 {
792 std::ifstream stream(filename, std::ifstream::binary);
793 if (!stream)
794 {
795 XTENSOR_THROW(std::runtime_error, "io error: failed to open a file.");
796 }
797 return load_npy<T, L>(stream);
798 }
799
800} // namespace xt
801
802#endif
Base class for xexpressions.
auto adapt(C &&container, const SC &shape, layout_type l=L)
Constructs:
auto eval(T &&t) -> std::enable_if_t< detail::is_container< std::decay_t< T > >::value, T && >
Force evaluation of xexpression.
Definition xeval.hpp:46
std::size_t compute_strides(const shape_type &shape, layout_type l, strides_type &strides)
Compute the strides given the shape and the layout of an array.
Definition xstrides.hpp:566
auto strides(const E &e, stride_type type=stride_type::normal) noexcept
Get strides of an object.
Definition xstrides.hpp:248
standard mathematical functions for xexpressions
auto load_npy(std::istream &stream)
Loads a npy file (the NumPy storage format)
Definition xnpy.hpp:773
void dump_npy(const std::string &filename, const xexpression< E > &e)
Save xexpression to NumPy npy format.
Definition xnpy.hpp:738