Coverage for src / thunderlab / dataloader.py: 84%
1114 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-10 21:21 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-10 21:21 +0000
1"""Load time-series data from files.
3```
4data, rate, unit, amax = load_data('data/file.wav')
5```
7The function `data_loader()` loads the whole time-series from the file
8as a numpy array of floats. First dimension is frames, second is
9channels. In contrast to the `audioio.load_audio()` function, the
10values of the data array are not restricted between -1 and 1. They can
11assume any value wihin the range `-amax` to `+amax` with the returned
12`unit`.
14```
15data = DataLoader('data/file.wav', 60.0)
16```
17or
18```
19with DataLoader('data/file.wav', 60.0) as data:
20```
21Create an `DataLoader` object that loads chuncks of 60 seconds long data
22on demand. `data` can be used like a read-only numpy array of floats.
25## Supported file formats
27- python pickle files
28- numpy .npz files
29- matlab .mat files
30- audio files via [`audioio`](https://github.com/bendalab/audioio) package
31- LabView .scandat files
32- raw files
33- relacs files (https://www.relacs.net)
34- fishgrid files (https://github.com/bendalab/fishgrid)
37## Metadata
39Many file formats allow to store metadata that further describe the
40stored time series data. We handle them as nested dictionary of key-value
41pairs. Load them with the `metadata()` function:
42```
43metadata = metadata('data/file.mat')
44```
46## Markers
48Some file formats also allow to store markers that mark specific
49positions in the time series data. Load marker positions and spans (in
50the 2-D array `locs`) and label and text strings (in the 2-D array
51`labels`) with the `markers()` function:
52```
53locs, labels = markers('data.wav')
54```
56## Aditional, format specific functions
58- `extract_container_metadata()`: extract metadata from dictionary loaded from a container file.
59- `relacs_samplerate_unit()`: retrieve sampling rate and unit from a relacs stimuli.dat file.
60- `relacs_header()`: read key-value pairs from relacs *.dat file headers.
61- `fishgrid_grids()`: retrieve grid sizes from a fishgrid.cfg file.
62- `fishgrid_spacings()`: spacing between grid electrodes.
64"""
66import gc
67import os
68import sys
69import gzip
70import pickle
71import numpy as np
72try:
73 import matplotlib.pyplot as plt
74except ImportError:
75 pass
77from pathlib import Path
78from datetime import timedelta
80from audioio import load_audio, AudioLoader, unflatten_metadata
81from audioio import get_number_unit, get_number, get_int, get_bool, get_gain
82from audioio import default_starttime_keys, default_gain_keys
83from audioio import get_datetime, flatten_metadata, add_metadata, set_starttime
84from audioio import metadata as metadata_audioio
85from audioio import markers as markers_audioio
88def relacs_samplerate_unit(filepath, channel=0):
89 """Retrieve sampling rate and unit from a relacs stimuli.dat file.
91 Parameters
92 ----------
93 filepath: str or Path
94 Path to a relacs data directory, or a file in a relacs data directory.
95 channel: int
96 Channel (trace) number, if `filepath` does not specify a
97 trace-*.raw file.
99 Returns
100 -------
101 samplerate: float
102 Sampling rate in Hertz
103 unit: str
104 Unit of the trace, can be empty if not found
106 Raises
107 ------
108 IOError/FileNotFoundError:
109 If the stimuli.dat file does not exist.
110 ValueError:
111 stimuli.dat file does not contain sampling rate.
112 """
113 trace = channel + 1
114 relacs_dir = Path(filepath)
115 # check for relacs data directory:
116 if not relacs_dir.is_dir():
117 bn = relacs_dir.stem.lower()
118 ext = relacs_dir.suffix.lower()
119 relacs_dir = relacs_dir.parent
120 if len(bn) > 6 and bn[:6] == 'trace-':
121 trace = int(bn[6:])
123 # retreive sampling rate and unit from stimuli.dat file:
124 samplerate = None
125 sampleinterval = None
126 unit = ""
128 # load stimuli.dat file:
129 lines = []
130 stimuli_file = relacs_dir / 'stimuli.dat.gz'
131 if stimuli_file.is_file():
132 with gzip.open(stimuli_file, 'r', encoding='latin-1') as sf:
133 for line in sf:
134 line = line.strip()
135 if len(line) == 0 or line[0] != '#':
136 break
137 lines.append(line)
138 else:
139 stimuli_file = relacs_dir / 'stimuli.dat'
140 with open(stimuli_file, 'r', encoding='latin-1') as sf:
141 for line in sf:
142 line = line.strip()
143 if len(line) == 0 or line[0] != '#':
144 break
145 lines.append(line)
146 # extract unit and sampling rate:
147 for line in lines:
148 if f'unit{trace}' in line:
149 unit = line.split(':')[1].strip()
150 if f'sampling rate{trace}' in line:
151 value = line.split(':')[1].strip()
152 samplerate = float(value.replace('Hz',''))
153 elif f'sample interval{trace}' in line:
154 value = line.split(':')[1].strip()
155 sampleinterval = float(value.replace('ms',''))
157 if samplerate is not None:
158 return samplerate, unit
159 if sampleinterval is not None:
160 return 1000/sampleinterval, unit
161 raise ValueError(f'could not retrieve sampling rate from {stimuli_file}')
164def relacs_header(filepath, store_empty=False, first_only=False,
165 lower_keys=False, flat=False,
166 add_sections=False):
167 """Read key-value pairs from a relacs *.dat file header.
169 Parameters
170 ----------
171 filepath: str or Path
172 A relacs *.dat file, can be also a zipped .gz file.
173 store_empty: bool
174 If `False` do not add meta data with empty values.
175 first_only: bool
176 If `False` only store the first element of a list.
177 lower_keys: bool
178 Make all keys lower case.
179 flat: bool
180 Do not make a nested dictionary.
181 Use this option also to read in very old relacs metadata with
182 ragged left alignment.
183 add_sections: bool
184 If `True`, prepend keys with sections names separated by
185 '.' to make them unique.
187 Returns
188 -------
189 data: dict
190 Nested dictionary with key-value pairs of the file header.
192 Raises
193 ------
194 IOError/FileNotFoundError:
195 If `filepath` cannot be opened.
196 """
197 filepath = Path(filepath)
198 # read in header from file:
199 lines = []
200 gzfilepath = filepath.with_suffix(filepath.suffix + '.gz')
201 if gzfilepath.is_file():
202 with gzip.open(gzfilepath, 'r', encoding='latin-1') as sf:
203 for line in sf:
204 line = line.strip()
205 if len(line) == 0 or line[0] != '#':
206 break
207 lines.append(line)
208 else:
209 with open(filepath, 'r', encoding='latin-1') as sf:
210 for line in sf:
211 line = line.strip()
212 if len(line) == 0 or line[0] != '#':
213 break
214 lines.append(line)
215 # parse:
216 data = {}
217 cdatas = [data]
218 sections = ['']
219 ident_offs = None
220 ident = None
221 for line in lines:
222 words = line.split(':')
223 value = ':'.join(words[1:]).strip() if len(words) > 1 else ''
224 if len(words) >= 1:
225 key = words[0].strip('#')
226 # get section level:
227 level = 0
228 if not flat or len(value) == 0:
229 nident = len(key) - len(key.lstrip())
230 if ident_offs is None:
231 ident_offs = nident
232 elif ident is None:
233 if nident > ident_offs:
234 ident = nident - ident_offs
235 level = 1
236 else:
237 level = (nident - ident_offs)//ident
238 # close sections:
239 if not flat:
240 while len(cdatas) > level + 1:
241 cdatas[-1][sections.pop()] = cdatas.pop()
242 else:
243 while len(sections) > level + 1:
244 sections.pop()
245 # key:
246 key = key.strip().strip('"')
247 if lower_keys:
248 key = key.lower()
249 skey = key
250 if add_sections:
251 key = '.'.join(sections[1:] + [key])
252 if len(value) == 0:
253 # new sub-section:
254 if flat:
255 if store_empty:
256 cdatas[-1][key] = None
257 else:
258 cdatas.append({})
259 sections.append(skey)
260 else:
261 # key-value pair:
262 value = value.strip('"')
263 if len(value) > 0 or value != '-' or store_empty:
264 if len(value) > 0 and value[0] == '[' and value[-1] == ']':
265 value = [v.strip() for v in value.lstrip('[').rstrip(']').split(',')]
266 if first_only:
267 value = value[0]
268 cdatas[-1][key] = value
269 while len(cdatas) > 1:
270 cdatas[-1][sections.pop()] = cdatas.pop()
271 return data
274def check_relacs(filepath):
275 """Check for valid relacs file.
277 Parameters
278 ----------
279 filepath: str or Path
280 Path to a relacs data directory, or a file in a relacs data directory.
282 Returns
283 -------
284 is_relacs: boolean
285 `True` if `filepath` is a valid relacs directory or is a file therein.
286 """
287 # relacs data directory:
288 relacs_dir = Path(filepath)
289 if not relacs_dir.is_dir():
290 relacs_dir = relacs_dir.parent
291 # check for a valid relacs data directory:
292 has_stimuli = False
293 has_trace = False
294 for fname in ['stimuli.dat', 'stimuli.dat.gz']:
295 if (relacs_dir / fname).is_file():
296 has_stimuli = True
297 break
298 for fname in ['trace-1.raw', 'trace-1.raw.gz']:
299 if (relacs_dir / fname).is_file():
300 has_trace = True
301 break
302 return has_stimuli and has_trace
305def relacs_trace_files(filepath):
306 """Expand file path for relacs data to appropriate trace*.raw file names.
308 Parameters
309 ----------
310 filepath: str or Path
311 Path to a relacs data directory, or a file in a relacs data directory.
313 Returns
314 -------
315 trace_filepaths: list of Path
316 List of relacs trace*.raw files.
317 """
318 relacs_dir = Path(filepath)
319 if not relacs_dir.is_dir():
320 relacs_dir = relacs_dir.parent
321 trace_filepaths = []
322 for k in range(10000):
323 trace_file = relacs_dir / f'trace-{k+1}.raw'
324 gz_trace_file = relacs_dir / f'trace-{k+1}.raw.gz'
325 if trace_file.is_file():
326 trace_filepaths.append(trace_file)
327 elif gz_trace_file.is_file():
328 trace_filepaths.append(gz_trace_file)
329 else:
330 break
331 return trace_filepaths
334def load_relacs(filepath, amax=1.0):
335 """Load traces that have been recorded with relacs (https://github.com/relacs/relacs).
337 Parameters
338 ----------
339 filepath: str of Path
340 Path to a relacs data directory, or a file in a relacs data directory.
341 amax: float
342 The amplitude range of the data.
344 Returns
345 -------
346 data: 2-D array
347 All data traces as an 2-D numpy array, even for single channel data.
348 First dimension is time, second is channel.
349 rate: float
350 Sampling rate of the data in Hz
351 unit: str
352 Unit of the data
353 amax: float
354 Maximum amplitude of data range.
356 Raises
357 ------
358 FileNotFoundError:
359 Invalid or non existing relacs files.
360 ValueError:
361 - Invalid name for relacs trace-*.raw file.
362 - Sampling rates of traces differ.
363 - Unit of traces differ.
364 """
365 trace_filepaths = relacs_trace_files(filepath)
366 if len(trace_filepaths) == 0:
367 raise FileNotFoundError(f'no relacs files found')
368 # load trace*.raw files:
369 nchannels = len(trace_filepaths)
370 data = None
371 nrows = 0
372 rate = None
373 unit = ''
374 for c, path in enumerate(sorted(trace_filepaths)):
375 if path.suffix == '.gz':
376 with gzip.open(path, 'rb') as sf:
377 x = np.frombuffer(sf.read(), dtype=np.float32)
378 else:
379 x = np.fromfile(path, np.float32)
380 if data is None:
381 nrows = len(x)
382 data = np.zeros((nrows, nchannels))
383 n = min(len(x), nrows)
384 data[:n,c] = x[:n]
385 # retrieve sampling rate and unit:
386 crate, us = relacs_samplerate_unit(path, c)
387 if rate is None:
388 rate = crate
389 elif crate != rate:
390 raise ValueError('sampling rates of traces differ')
391 if len(unit) == 0:
392 unit = us
393 elif us != unit:
394 raise ValueError('unit of traces differ')
395 return data, rate, unit, amax
398def metadata_relacs(filepath, store_empty=False, first_only=False,
399 lower_keys=False, flat=False, add_sections=False):
400 """ Read meta-data of a relacs data set.
402 Parameters
403 ----------
404 filepath: str or Path
405 A relacs data directory or a file therein.
406 store_empty: bool
407 If `False` do not add meta data with empty values.
408 first_only: bool
409 If `False` only store the first element of a list.
410 lower_keys: bool
411 Make all keys lower case.
412 flat: bool
413 Do not make a nested dictionary.
414 Use this option also to read in very old relacs metadata with
415 ragged left alignment.
416 add_sections: bool
417 If `True`, prepend keys with sections names separated by
418 '.' to make them unique.
420 Returns
421 -------
422 data: nested dict
423 Nested dictionary with key-value pairs of the meta data.
424 """
425 relacs_dir = Path(filepath)
426 if not relacs_dir.is_dir():
427 relacs_dir = relacs_dir.parent
428 info_path = relacs_dir / 'info.dat'
429 if not info_path.is_file():
430 return dict()
431 data = relacs_header(info_path, store_empty, first_only,
432 lower_keys, flat, add_sections)
433 return data
436def fishgrid_spacings(metadata, unit='m'):
437 """Spacing between grid electrodes.
439 Parameters
440 ----------
441 metadata: dict
442 Fishgrid metadata obtained from `metadata_fishgrid()`.
443 unit: str
444 Unit in which to return the spacings.
446 Returns
447 -------
448 grid_dist: list of tuple of float
449 For each grid the distances between rows and columns in `unit`.
450 """
451 grids_dist = []
452 for k in range(4):
453 row_dist = get_number(metadata, unit, f'RowDistance{k+1}', default=0)
454 col_dist = get_number(metadata, unit, f'ColumnDistance{k+1}', default=0)
455 rows = get_int(metadata, f'Rows{k+1}', default=0)
456 cols = get_int(metadata, f'Columns{k+1}', default=0)
457 if get_bool(metadata, f'Used{k+1}', default=False) or \
458 cols > 0 and rows > 0:
459 grids_dist.append((row_dist, col_dist))
460 return grids_dist
463def fishgrid_grids(metadata):
464 """Retrieve grid sizes from a fishgrid.cfg file.
466 Parameters
467 ----------
468 metadata: dict
469 Fishgrid metadata obtained from `metadata_fishgrid()`.
471 Returns
472 -------
473 grids: list of tuple of int
474 For each grid the number of rows and columns.
475 """
476 grids = []
477 for k in range(4):
478 rows = get_int(metadata, f'Rows{k+1}', default=0)
479 cols = get_int(metadata, f'Columns{k+1}', default=0)
480 if get_bool(metadata, f'Used{k+1}', default=False) or \
481 cols > 0 and rows > 0:
482 grids.append((rows, cols))
483 return grids
486def check_fishgrid(filepath):
487 """Check for valid fishgrid file (https://github.com/bendalab/fishgrid).
489 Parameters
490 ----------
491 filepath: str or Path
492 Path to a fishgrid data directory or a file in a fishgrid
493 data directory.
495 Returns
496 -------
497 is_fishgrid: bool
498 `True` if `filepath` is a valid fishgrid data directory or
499 a file therein.
500 """
501 # fishgrid data directory:
502 fishgrid_dir = Path(filepath)
503 if not fishgrid_dir.is_dir():
504 fishgrid_dir = fishgrid_dir.parent
505 # check for a valid fishgrid data directory:
506 return ((fishgrid_dir / 'fishgrid.cfg').is_file() and
507 ((fishgrid_dir / 'traces-grid1.raw').is_file() or
508 (fishgrid_dir / 'traces.raw').is_file()))
511def fishgrid_trace_files(filepath):
512 """Expand file paths for fishgrid data to appropriate traces*.raw file names.
514 Parameters
515 ----------
516 filepath: str or Path
517 Path to a fishgrid data directory, or a file therein.
519 Returns
520 -------
521 trace_filepaths: list of Path
522 List of fishgrid traces*.raw files.
523 """
524 fishgrid_dir = Path(filepath)
525 if not fishgrid_dir.is_dir():
526 fishgrid_dir = fishgrid_dir.parent
527 # find grids:
528 trace_filepaths = []
529 for k in range(10000):
530 trace_file = fishgrid_dir / f'traces-grid{k+1}.raw'
531 gz_trace_file = fishgrid_dir / f'traces-grid{k+1}.raw.gz'
532 if trace_file.is_file():
533 trace_filepaths.append(trace_file)
534 elif gz_trace_file.is_file():
535 trace_filepaths.append(gz_trace_file)
536 else:
537 break
538 if len(trace_filepaths) == 0:
539 trace_file = fishgrid_dir / f'traces.raw'
540 gz_trace_file = fishgrid_dir / f'traces.raw.gz'
541 if trace_file.is_file():
542 trace_filepaths.append(trace_file)
543 elif gz_trace_file.is_file():
544 trace_filepaths.append(gz_trace_file)
545 return trace_filepaths
548def load_fishgrid(filepath):
549 """Load traces that have been recorded with fishgrid (https://github.com/bendalab/fishgrid).
551 Parameters
552 ----------
553 filepath: str
554 Path to a fishgrid data directory, or a file therein.
556 Returns
557 -------
558 data: 2-D array
559 All data traces as an 2-D numpy array, even for single channel data.
560 First dimension is time, second is channel.
561 rate: float
562 Sampling rate of the data in Hz.
563 unit: str
564 Unit of the data.
565 amax: float
566 Maximum amplitude of data range.
568 Raises
569 ------
570 FileNotFoundError:
571 Invalid or non existing fishgrid files.
572 """
573 trace_filepaths = fishgrid_trace_files(filepath)
574 if len(trace_filepaths) == 0:
575 raise FileNotFoundError(f'no fishgrid files found')
576 md = metadata_fishgrid(filepath)
577 grids = fishgrid_grids(md)
578 grid_sizes = [r*c for r, c in grids]
580 # load traces-grid*.raw files:
581 grid_channels = []
582 nchannels = 0
583 for g, path in enumerate(trace_filepaths):
584 grid_channels.append(grid_sizes[g])
585 nchannels += grid_sizes[g]
586 data = None
587 nrows = 0
588 c = 0
589 rate = get_number(md, 'Hz', 'AISampleRate')
590 for path, channels in zip(trace_filepaths, grid_channels):
591 if path.suffix == '.gz':
592 with gzip.open(path, 'rb') as sf:
593 x = np.frombuffer(sf.read(), dtype=np.float32)
594 else:
595 x = np.fromfile(path, np.float32).reshape((-1, channels))
596 if data is None:
597 nrows = len(x)
598 data = np.zeros((nrows, nchannels))
599 n = min(len(x), nrows)
600 data[:n, c:c + channels] = x[:n, :]
601 c += channels
602 amax, unit = get_number_unit(md, 'AIMaxVolt')
603 return data, rate, unit, amax
606# add fishgrid keys:
607default_starttime_keys.append(['StartDate', 'StartTime'])
608default_gain_keys.insert(0, 'AIMaxVolt')
611def metadata_fishgrid(filepath):
612 """ Read meta-data of a fishgrid data set.
614 Parameters
615 ----------
616 filepath: str or Path
617 A fishgrid data directory or a file therein.
619 Returns
620 -------
621 data: nested dict
622 Nested dictionary with key-value pairs of the meta data.
623 """
624 fishgrid_dir = Path(filepath)
625 if not fishgrid_dir.is_dir():
626 fishgrid_dir = fishgrid_dir.parent
627 config_path = fishgrid_dir / 'fishgrid.cfg'
628 gz_config_path = fishgrid_dir / 'fishgrid.cfg.gz'
629 # read in header from file:
630 lines = []
631 if gz_config_path.is_file():
632 with gzip.open(gz_config_path, 'r', encoding='latin-1') as sf:
633 for line in sf:
634 lines.append(line)
635 elif config_path.is_file():
636 with open(config_path, 'r', encoding='latin-1') as sf:
637 for line in sf:
638 lines.append(line)
639 else:
640 return {}
641 # parse:
642 data = {}
643 cdatas = [data]
644 ident_offs = None
645 ident = None
646 old_style = False
647 grid_n = False
648 for line in lines:
649 if len(line.strip()) == 0:
650 continue
651 if line[0] == '*':
652 key = line[1:].strip()
653 data[key] = {}
654 cdatas = [data, data[key]]
655 elif '----' in line:
656 old_style = True
657 key = line.strip().strip(' -').replace('&', '')
658 if key.upper() == 'SETUP':
659 key = 'Grid 1'
660 grid_n = False
661 if key[:4].lower() == 'grid':
662 grid_n = key[5]
663 cdatas = cdatas[:2]
664 cdatas[1][key] = {}
665 cdatas.append(cdatas[1][key])
666 else:
667 words = line.split(':')
668 key = words[0].strip().strip('"')
669 value = None
670 if len(words) > 1 and (len(words[1].strip()) > 0 or old_style):
671 value = ':'.join(words[1:]).strip().strip('"')
672 if old_style:
673 if value is None:
674 cdatas = cdatas[:3]
675 cdatas[2][key] = {}
676 cdatas.append(cdatas[2][key])
677 else:
678 if grid_n and key[-1] != grid_n:
679 key = key + grid_n
680 cdatas[-1][key] = value
681 else:
682 # get section level:
683 level = 0
684 nident = len(line) - len(line.lstrip())
685 if ident_offs is None:
686 ident_offs = nident
687 elif ident is None:
688 if nident > ident_offs:
689 ident = nident - ident_offs
690 level = 1
691 else:
692 level = (nident - ident_offs)//ident
693 # close sections:
694 cdatas = cdatas[:2 + level]
695 if value is None:
696 # new section:
697 cdatas[-1][key] = {}
698 cdatas.append(cdatas[-1][key])
699 else:
700 # key-value pair:
701 cdatas[-1][key] = value.replace(r'\n', '\n')
702 # remove unused grids:
703 fgm = data.get('FishGrid', {})
704 for i in range(4):
705 gs = f'Grid {i+1}'
706 if gs in fgm:
707 gm = fgm[gs]
708 us = f'Used{i+1}'
709 if us in gm and gm[us].upper() == 'FALSE':
710 del fgm[gs]
711 return data
714def markers_fishgrid(filepath):
715 """ Read markers of a fishgrid data set.
717 Parameters
718 ----------
719 filepath: str or Path
720 A fishgrid data directory or a file therein.
722 Returns
723 -------
724 locs: 2-D array of ints
725 Marker positions (first column) and spans (second column)
726 for each marker (rows).
727 labels: 2-D array of string objects
728 Labels (first column) and texts (second column)
729 for each marker (rows).
730 """
731 def add_marker():
732 if 'index1' in marker:
733 index1 = int(marker['index1'])//nchannels
734 else:
735 index1 = int(marker['index'])//nchannels
736 span1 = int(marker.get('span1', 0))//nchannels
737 locs.append([index1, span1])
738 ls = marker.get('label', 'M')
739 cs = marker.get('comment', '')
740 labels.append([ls, cs])
742 fishgrid_dir = Path(filepath)
743 if not fishgrid_dir.is_dir():
744 fishgrid_dir = fishgrid_dir.parent
745 path = fishgrid_dir / 'timestamps.dat'
746 if not path.is_file():
747 return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
748 # get number of channels:
749 md = metadata_fishgrid(path.with_name('fishgrid.cfg'))
750 grids = fishgrid_grids(md)
751 nchannels = np.prod(grids[0])
752 # read timestamps:
753 locs = []
754 labels = []
755 marker = {}
756 with open(path, 'r') as sf:
757 for line in sf:
758 if len(line.strip()) == 0:
759 add_marker()
760 marker = {}
761 else:
762 words = line.split(':')
763 if len(words) > 1:
764 v = words[1].strip()
765 v = v.strip('"')
766 marker[words[0].strip().lower()] = v
767 if len(marker) > 0:
768 add_marker()
769 if len(locs) > 2:
770 return np.array(locs[1:-1]), np.array(labels[1:-1])
771 else:
772 return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
775def check_container(filepath):
776 """Check if file is a generic container file.
778 Supported file formats are:
780 - python pickle files (.pkl)
781 - numpy files (.npz)
782 - matlab files (.mat)
784 Parameters
785 ----------
786 filepath: str or Path
787 Path of the file to check.
789 Returns
790 -------
791 is_container: bool
792 `True`, if `filepath` is a supported container format.
793 """
794 ext = Path(filepath).suffix
795 return ext.lower() in ('.pkl', '.npz', '.mat')
798def extract_container_data(data_dict, datakey=None,
799 samplekey=['rate', 'Fs', 'fs'],
800 timekey=['time'], amplkey=['amax'], unitkey='unit',
801 amax=1.0, unit='a.u.'):
802 """Extract data from dictionary loaded from a container file.
804 Parameters
805 ----------
806 data_dict: dict
807 Dictionary of the data items contained in the container.
808 datakey: None, str, or list of str
809 Name of the variable holding the data. If `None` take the
810 variable that is an 2D array and has the largest number of
811 elements.
812 samplekey: str or list of str
813 Name of the variable holding the sampling rate.
814 timekey: str or list of str
815 Name of the variable holding sampling times.
816 If no sampling rate is available, the sampling rate is retrieved
817 from the sampling times.
818 amplkey: str or list of str
819 Name of the variable holding the amplitude range of the data.
820 unitkey: str
821 Name of the variable holding the unit of the data.
822 amax: None or float
823 If specified and no amplitude range has been found in `data_dict`,
824 then this is the amplitude range of the data.
825 unit: None or str
826 If specified and no unit has been found in `data_dict`,
827 then return this as the unit of the data.
829 Returns
830 -------
831 data: 2-D array of floats
832 All data traces as an 2-D numpy array, even for single channel data.
833 First dimension is time, second is channel.
834 rate: float
835 Sampling rate of the data in Hz.
836 unit: str
837 Unit of the data.
838 amax: float
839 Maximum amplitude of data range in `unit`.
841 Raises
842 ------
843 ValueError:
844 Invalid key requested.
845 """
846 # extract format data:
847 if not isinstance(samplekey, (list, tuple, np.ndarray)):
848 samplekey = (samplekey,)
849 if not isinstance(timekey, (list, tuple, np.ndarray)):
850 timekey = (timekey,)
851 if not isinstance(amplkey, (list, tuple, np.ndarray)):
852 amplkey = (amplkey,)
853 rate = 0.0
854 for skey in samplekey:
855 if skey in data_dict:
856 rate = float(data_dict[skey])
857 break
858 if rate == 0.0:
859 for tkey in timekey:
860 if tkey in data_dict:
861 rate = 1.0/(data_dict[tkey][1] - data_dict[tkey][0])
862 break
863 if rate == 0.0:
864 raise ValueError(f"invalid keys {', '.join(samplekey)} and {', '.join(timekey)} for requesting sampling rate or sampling times")
865 for akey in amplkey:
866 if akey in data_dict:
867 amax = float(data_dict[akey])
868 break
869 if unitkey in data_dict:
870 unit = data_dict[unitkey]
871 # get data array:
872 raw_data = np.array([])
873 if datakey:
874 # try data keys:
875 if not isinstance(datakey, (list, tuple, np.ndarray)):
876 datakey = (datakey,)
877 for dkey in datakey:
878 if dkey in data_dict:
879 raw_data = data_dict[dkey]
880 break
881 if len(raw_data) == 0:
882 raise ValueError(f"invalid key(s) {', '.join(datakey)} for requesting data")
883 else:
884 # find largest 2D array:
885 for d in data_dict:
886 if hasattr(data_dict[d], 'shape'):
887 if 1 <= len(data_dict[d].shape) <= 2 and \
888 np.max(data_dict[d].shape) > np.max(raw_data.shape):
889 raw_data = data_dict[d]
890 if len(raw_data) == 0:
891 raise ValueError('no data found')
892 # make 2D:
893 if len(raw_data.shape) == 1:
894 raw_data = raw_data.reshape(-1, 1)
895 # transpose if necessary:
896 if np.argmax(raw_data.shape) > 0:
897 raw_data = raw_data.T
898 # recode:
899 if raw_data.dtype == np.dtype('int16'):
900 data = raw_data.astype('float32')
901 data *= amax/2**15
902 elif raw_data.dtype == np.dtype('int32'):
903 data = raw_data.astype(float)
904 data *= amax/2**31
905 elif raw_data.dtype == np.dtype('int64'):
906 data = raw_data.astype(float)
907 data *= amax/2**63
908 else:
909 data = raw_data
910 return data, rate, unit, amax
913def load_container(filepath, datakey=None,
914 samplekey=['rate', 'Fs', 'fs'],
915 timekey=['time'], amplkey=['amax'], unitkey='unit',
916 amax=1.0, unit='a.u.'):
917 """Load data from a generic container file.
919 Supported file formats are:
921 - python pickle files (.pkl)
922 - numpy files (.npz)
923 - matlab files (.mat)
925 Parameters
926 ----------
927 filepath: str or Path
928 Path of the file to load.
929 datakey: None, str, or list of str
930 Name of the variable holding the data. If `None` take the
931 variable that is an 2D array and has the largest number of
932 elements.
933 samplekey: str or list of str
934 Name of the variable holding the sampling rate.
935 timekey: str or list of str
936 Name of the variable holding sampling times.
937 If no sampling rate is available, the sampling rate is retrieved
938 from the sampling times.
939 amplkey: str
940 Name of the variable holding the amplitude range of the data.
941 unitkey: str
942 Name of the variable holding the unit of the data.
943 If `unitkey` is not a valid key, then return `unitkey` as the `unit`.
944 amax: None or float
945 If specified and no amplitude range has been found in the data
946 container, then this is the amplitude range of the data.
947 unit: None or str
948 If specified and no unit has been found in the data container,
949 then return this as the unit of the data.
951 Returns
952 -------
953 data: 2-D array of floats
954 All data traces as an 2-D numpy array, even for single channel data.
955 First dimension is time, second is channel.
956 rate: float
957 Sampling rate of the data in Hz.
958 unit: str
959 Unit of the data.
960 amax: float
961 Maximum amplitude of data range.
963 Raises
964 ------
965 ValueError:
966 Invalid key requested.
967 """
968 # load data:
969 data_dict = {}
970 filepath = Path(filepath)
971 ext = filepath.suffix.lower()
972 if ext == '.pkl':
973 with open(filepath, 'rb') as f:
974 data_dict = pickle.load(f)
975 elif ext == '.npz':
976 data_dict = np.load(filepath)
977 elif ext == '.mat':
978 from scipy.io import loadmat
979 data_dict = loadmat(filepath, squeeze_me=True)
980 return extract_container_data(data_dict, datakey, samplekey,
981 timekey, amplkey, unitkey, amax, unit)
984def extract_container_metadata(data_dict, metadatakey=['metadata', 'info']):
985 """ Extract metadata from dictionary loaded from a container file.
987 Parameters
988 ----------
989 data_dict: dict
990 Dictionary of the data items contained in the container.
991 metadatakey: str or list of str
992 Name of the variable holding the metadata.
994 Returns
995 -------
996 metadata: nested dict
997 Nested dictionary with key-value pairs of the meta data.
998 """
999 if not isinstance(metadatakey, (list, tuple, np.ndarray)):
1000 metadatakey = (metadatakey,)
1001 # get single metadata dictionary:
1002 for mkey in metadatakey:
1003 if mkey in data_dict:
1004 return data_dict[mkey]
1005 # collect all keys starting with metadatakey:
1006 metadata = {}
1007 for mkey in metadatakey:
1008 mkey += '__'
1009 for dkey in data_dict:
1010 if dkey[:len(mkey)] == mkey:
1011 v = data_dict[dkey]
1012 if hasattr(v, 'size') and v.ndim == 0:
1013 v = v.item()
1014 metadata[dkey[len(mkey):]] = v
1015 if len(metadata) > 0:
1016 return unflatten_metadata(metadata, sep='__')
1017 return metadata
1020def metadata_container(filepath, metadatakey=['metadata', 'info']):
1021 """ Read meta-data of a container file.
1023 Parameters
1024 ----------
1025 filepath: str or Path
1026 A container file.
1027 metadatakey: str or list of str
1028 Name of the variable holding the metadata.
1030 Returns
1031 -------
1032 metadata: nested dict
1033 Nested dictionary with key-value pairs of the meta data.
1034 """
1035 data_dict = {}
1036 filepath = Path(filepath)
1037 ext = filepath.suffix.lower()
1038 if ext == '.pkl':
1039 with open(filepath, 'rb') as f:
1040 data_dict = pickle.load(f)
1041 elif ext == '.npz':
1042 data_dict = np.load(filepath)
1043 elif ext == '.mat':
1044 from scipy.io import loadmat
1045 data_dict = loadmat(filepath, squeeze_me=True)
1046 return extract_container_metadata(data_dict, metadatakey)
1049def extract_container_markers(data_dict, poskey=['positions'],
1050 spanskey=['spans'], labelskey=['labels'],
1051 descrkey=['descriptions']):
1052 """ Extract markers from dictionary loaded from a container file.
1054 Parameters
1055 ----------
1056 data_dict: dict
1057 Dictionary of the data items contained in the container.
1058 poskey: str or list of str
1059 Name of the variable holding positions of markers.
1060 spanskey: str or list of str
1061 Name of the variable holding spans of markers.
1062 labelskey: str or list of str
1063 Name of the variable holding labels of markers.
1064 descrkey: str or list of str
1065 Name of the variable holding descriptions of markers.
1067 Returns
1068 -------
1069 locs: 2-D array of ints
1070 Marker positions (first column) and spans (second column)
1071 for each marker (rows).
1072 labels: 2-D array of string objects
1073 Labels (first column) and texts (second column)
1074 for each marker (rows).
1075 """
1076 if not isinstance(poskey, (list, tuple, np.ndarray)):
1077 poskey = (poskey,)
1078 if not isinstance(spanskey, (list, tuple, np.ndarray)):
1079 spanskey = (spanskey,)
1080 if not isinstance(labelskey, (list, tuple, np.ndarray)):
1081 labelskey = (labelskey,)
1082 if not isinstance(descrkey, (list, tuple, np.ndarray)):
1083 descrkey = (descrkey,)
1084 locs = np.zeros((0, 2), dtype=int)
1085 for pkey in poskey:
1086 if pkey in data_dict:
1087 locs = np.zeros((len(data_dict[pkey]), 2), dtype=int)
1088 locs[:,0] = data_dict[pkey]
1089 break
1090 for skey in spanskey:
1091 if skey in data_dict:
1092 locs[:,1] = data_dict[skey]
1093 break
1094 labels = np.zeros((0, 2), dtype=object)
1095 for lkey in labelskey:
1096 if lkey in data_dict:
1097 labels = np.zeros((len(data_dict[lkey]), 2), dtype=object)
1098 labels[:,0] = data_dict[lkey]
1099 break
1100 for dkey in descrkey:
1101 if dkey in data_dict:
1102 labels[:,1] = data_dict[dkey]
1103 break
1104 return locs, labels
1107def markers_container(filepath, poskey=['positions'],
1108 spanskey=['spans'], labelskey=['labels'],
1109 descrkey=['descriptions']):
1110 """ Read markers of a container file.
1112 Parameters
1113 ----------
1114 filepath: str or Path
1115 A container file.
1116 poskey: str or list of str
1117 Name of the variable holding positions of markers.
1118 spanskey: str or list of str
1119 Name of the variable holding spans of markers.
1120 labelskey: str or list of str
1121 Name of the variable holding labels of markers.
1122 descrkey: str or list of str
1123 Name of the variable holding descriptions of markers.
1125 Returns
1126 -------
1127 locs: 2-D array of ints
1128 Marker positions (first column) and spans (second column)
1129 for each marker (rows).
1130 labels: 2-D array of string objects
1131 Labels (first column) and texts (second column)
1132 for each marker (rows).
1133 """
1134 data_dict = {}
1135 filepath = Path(filepath)
1136 ext = filepath.suffix.lower()
1137 if ext == '.pkl':
1138 with open(filepath, 'rb') as f:
1139 data_dict = pickle.load(f)
1140 elif ext == '.npz':
1141 data_dict = np.load(filepath)
1142 elif ext == '.mat':
1143 from scipy.io import loadmat
1144 data_dict = loadmat(filepath, squeeze_me=True)
1145 return extract_container_markers(data_dict, poskey, spanskey,
1146 labelskey, descrkey)
1149def check_raw(filepath):
1150 """Check if file is a raw file.
1152 The following extensions are interpreted as raw files:
1154 - raw files (*.raw)
1155 - LabView scandata (*.scandat)
1157 Parameters
1158 ----------
1159 filepath: str or Path
1160 Path of the file to check.
1162 Returns
1163 -------
1164 is_raw: bool
1165 `True`, if `filepath` is a raw format.
1166 """
1167 ext = Path(filepath).suffix
1168 return ext.lower() in ('.raw', '.scandat')
1171def load_raw(filepath, rate=44000, channels=1, encoding='FLOAT',
1172 amax=1.0, unit='a.u.'):
1173 """Load data from a raw file.
1175 Raw files just contain the data and absolutely no metadata, not
1176 even the sampling rate, number of channels, etc.
1177 Supported file formats are:
1179 - raw files (*.raw)
1180 - LabView scandata (*.scandat)
1182 Parameters
1183 ----------
1184 filepath: str or Path
1185 Path of the file to load.
1186 rate: float
1187 Sampling rate of the data in Hertz.
1188 channels: int
1189 Number of channels multiplexed in the data.
1190 encoding: str
1191 The encoding of the data stored in the file.
1192 Valid encodings are 'PCM_16', 'PCM_32', 'PCM_64', 'FLOAT', or
1193 'DOUBLE' or lower-case versions thereof.
1194 amax: float
1195 The amplitude range of the data.
1196 unit: str
1197 The unit of the data.
1199 Returns
1200 -------
1201 data: 2-D array of floats
1202 All data traces as an 2-D numpy array, even for single channel data.
1203 First dimension is time, second is channel.
1204 rate: float
1205 Sampling rate of the data in Hz.
1206 unit: str
1207 Unit of the data.
1208 amax: float
1209 Maximum amplitude of data range.
1211 Raises
1212 ------
1213 ValueError:
1214 Invalid encoding.
1216 """
1217 encodings = {'PCM_16': 'i2',
1218 'PCM_32': 'i4',
1219 'PCM_64': 'i8',
1220 'FLOAT': 'f',
1221 'DOUBLE': 'd'}
1222 encoding = encoding.upper()
1223 if not encoding in encodings:
1224 raise ValueError(f'invalid encoding {encoding} for raw file!')
1225 dtype = np.dtype(encodings[encoding])
1226 raw_data = np.fromfile(filepath, dtype=dtype).reshape(-1, channels)
1227 # recode:
1228 if dtype == np.dtype('int16'):
1229 data = raw_data.astype('float32')
1230 data *= amax/2**15
1231 elif dtype == np.dtype('int32'):
1232 data = raw_data.astype(float)
1233 data *= amax/2**31
1234 elif dtype == np.dtype('int64'):
1235 data = raw_data.astype(float)
1236 data *= amax/2**63
1237 else:
1238 data = raw_data
1239 return data, rate, unit, amax
1242def load_audioio(filepath, verbose=0, gainkey=default_gain_keys, sep='.',
1243 amax=1.0, unit='a.u.'):
1244 """Load data from an audio file.
1246 See the
1247 [`load_audio()`](https://bendalab.github.io/audioio/api/audioloader.html#audioio.audioloader.load_audio)
1248 function of the [`audioio`](https://github.com/bendalab/audioio)
1249 package for more infos.
1251 Parameters
1252 ----------
1253 filepath: str or Path
1254 Path of the file to load.
1255 verbose: int
1256 If > 0 show detailed error/warning messages.
1257 gainkey: str or list of str
1258 Key in the file's metadata that holds some gain information.
1259 If found, the data will be multiplied with the gain,
1260 and if available, the corresponding unit is returned.
1261 See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
1262 sep: str
1263 String that separates section names in `gainkey`.
1264 amax: float
1265 If specified and no gain has been found in the metadata,
1266 then use this as the amplitude range.
1267 unit: str
1268 If specified and no gain has been found in the metadata,
1269 then return this as the unit of the data.
1271 Returns
1272 -------
1273 data: 2-D array of floats
1274 All data traces as an 2-D numpy array, even for single channel data.
1275 First dimension is time, second is channel.
1276 rate: float
1277 Sampling rate of the data in Hz.
1278 unit: str
1279 Unit of the data if found in the metadata (see `gainkey`),
1280 otherwise `unit`.
1281 amax: float
1282 Maximum amplitude of data range.
1283 """
1284 # get gain:
1285 md = metadata_audioio(filepath)
1286 amax, unit = get_gain(md, gainkey, sep, amax, unit)
1287 # load data:
1288 data, rate = load_audio(filepath, verbose)
1289 if amax != 1.0:
1290 data *= amax
1291 return data, rate, unit, amax
1294data_loader_funcs = (
1295 ('relacs', check_relacs, load_relacs, metadata_relacs, None),
1296 ('fishgrid', check_fishgrid, load_fishgrid, metadata_fishgrid, markers_fishgrid),
1297 ('container', check_container, load_container, metadata_container, markers_container),
1298 ('raw', check_raw, load_raw, None, None),
1299 ('audioio', None, load_audioio, metadata_audioio, markers_audioio),
1300 )
1301"""List of implemented load functions.
1303Each element of the list is a tuple with the data format's name, its
1304check and its load function.
1306"""
1309def load_data(filepath, verbose=0, **kwargs):
1310 """Load time-series data from a file.
1312 Parameters
1313 ----------
1314 filepath: str or Path
1315 Path and name of the file to load.
1316 verbose: int
1317 If > 0 show detailed error/warning messages.
1318 **kwargs: dict
1319 Further keyword arguments that are passed on to the
1320 format specific loading functions.
1321 For example:
1322 - `amax`: the amplitude range of the data.
1323 - 'unit': the unit of the data.
1325 Returns
1326 -------
1327 data: 2-D array
1328 All data traces as an 2-D numpy array, even for single channel data.
1329 First dimension is time, second is channel.
1330 rate: float
1331 Sampling rate of the data in Hz.
1332 unit: str
1333 Unit of the data.
1334 amax: float
1335 Maximum amplitude of data range.
1336 """
1337 # load data:
1338 for name, check_file, load_file, _, _ in data_loader_funcs:
1339 if check_file is None or check_file(filepath):
1340 data, rate, unit, amax = load_file(filepath, **kwargs)
1341 if verbose > 0:
1342 print(f'loaded {name} data from file "{filepath}"')
1343 if verbose > 1:
1344 print(f' sampling rate: {rate:g} Hz')
1345 print(f' channels : {data.shape[1]}')
1346 print(f' frames : {len(data)}')
1347 print(f' range : {amax:g}{unit}')
1348 return data, rate, unit, amax
1349 return np.zeros((0, 1)), 0.0, '', 1.0
1352def metadata(filepath, **kwargs):
1353 """ Read meta-data from a data file.
1355 Parameters
1356 ----------
1357 filepath: str or Path
1358 The full path and name of the file to load. For some file
1359 formats several files can be provided in a list.
1360 **kwargs: dict
1361 Further keyword arguments that are passed on to the
1362 format specific loading functions.
1364 Returns
1365 -------
1366 meta_data: nested dict
1367 Meta data contained in the file. Keys of the nested
1368 dictionaries are always strings. If the corresponding
1369 values are dictionaries, then the key is the section name
1370 of the metadata contained in the dictionary. All other
1371 types of values are values for the respective key. In
1372 particular they are strings, or list of strings. But other
1373 simple types like ints or floats are also allowed.
1374 """
1375 # load metadata:
1376 for _, check_file, _, metadata_file, _ in data_loader_funcs:
1377 if check_file is None or check_file(filepath):
1378 if metadata_file is not None:
1379 return metadata_file(filepath, **kwargs)
1380 return {}
1383def markers(filepath):
1384 """ Read markers of a data file.
1386 Parameters
1387 ----------
1388 filepath: str or Path
1389 The data file.
1391 Returns
1392 -------
1393 locs: 2-D array of ints
1394 Marker positions (first column) and spans (second column)
1395 for each marker (rows).
1396 labels: 2-D array of string objects
1397 Labels (first column) and texts (second column)
1398 for each marker (rows).
1399 """
1400 # load markers:
1401 for _, check_file, _, _, markers_file in data_loader_funcs:
1402 if check_file is None or check_file(filepath):
1403 if markers_file is not None:
1404 return markers_file(filepath)
1405 return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
1408class DataLoader(AudioLoader):
1409 """Buffered reading of time-series data for random access of the data in the file.
1411 This allows for reading very large data files that do not fit into
1412 memory. A `DataLoader` instance can be used like a huge
1413 read-only numpy array, i.e.
1414 ```
1415 data = DataLoader('path/to/data/file.dat')
1416 x = data[10000:20000,0]
1417 ```
1418 The first index specifies the frame, the second one the channel.
1420 `DataLoader` first determines the format of the data file and then
1421 opens the file (first line). It then reads data from the file as
1422 necessary for the requested data (second line).
1424 Supported file formats are
1426 - python pickle files
1427 - numpy .npz files
1428 - matlab .mat files
1429 - audio files via [`audioio`](https://github.com/bendalab/audioio) package
1430 - LabView .scandat files
1431 - raw files
1432 - relacs files (https://www.relacs.net)
1433 - fishgrid files (https://github.com/bendalab/fishgrid)
1435 Reading sequentially through the file is always possible. If
1436 previous data are requested, then the file is read from the
1437 beginning. This might slow down access to previous data
1438 considerably. Use the `backsize` argument to the open functions to
1439 make sure some data are loaded before the requested frame. Then a
1440 subsequent access to the data within `backsize` seconds before that
1441 frame can still be handled without the need to reread the file
1442 from the beginning.
1444 Usage:
1445 ------
1446 ```
1447 import thunderlab.dataloader as dl
1448 with dl.DataLoader(filepath, 60.0, 10.0) as data:
1449 # do something with the content of the file:
1450 x = data[0:10000,0]
1451 y = data[10000:20000,0]
1452 z = x + y
1453 ```
1455 Normal open and close:
1456 ```
1457 data = dl.DataLoader(filepath, 60.0)
1458 x = data[:,:] # read the whole file
1459 data.close()
1460 ```
1461 that is the same as:
1462 ```
1463 data = dl.DataLoader()
1464 data.open(filepath, 60.0)
1465 ```
1467 Parameters
1468 ----------
1469 filepath: str or Path
1470 Path of the data file.
1471 buffersize: float
1472 Size of internal buffer in seconds.
1473 backsize: float
1474 Part of the buffer to be loaded before the requested start index in seconds.
1475 verbose: int
1476 If larger than zero show detailed error/warning messages.
1477 meta_kwargs: dict
1478 Keyword arguments that are passed on to the _load_metadata() function.
1479 **kwargs: dict
1480 Further keyword arguments that are passed on to the
1481 specific open() functions.
1483 Attributes
1484 ----------
1485 filepath: Path
1486 Name and path of the opened file. In case of many files, the first one.
1487 file_paths: list of Path
1488 List of pathes of the opened files that are made accessible
1489 as a single array.
1490 file_indices: list of int
1491 For each file the index of its first sample.
1492 rate: float
1493 The sampling rate of the data in Hertz.
1494 channels: int
1495 The number of channels that are read in.
1496 frames: int
1497 The number of frames in the file.
1498 format: str or None
1499 Format of the audio file.
1500 encoding: str or None
1501 Encoding/subtype of the audio file.
1502 shape: tuple
1503 Number of frames and channels of the data.
1504 ndim: int
1505 Number of dimensions: always 2 (frames and channels).
1506 offset: int
1507 Index of first frame in the current buffer.
1508 buffer: ndarray of floats
1509 The curently available data from the file.
1510 unit: str
1511 Unit of the data.
1512 ampl_min: float
1513 Minimum amplitude the file format supports.
1514 ampl_max: float
1515 Maximum amplitude the file format supports.
1517 Methods
1518 -------
1520 - `len()`: the number of frames
1521 - `open()`: open a data file.
1522 - `open_*()`: open a data file of a specific format.
1523 - `close()`: close the file.
1524 - `basename()`: Base name of the audio data.
1525 - `format_dict()`: technical infos about how the data are stored.
1526 - `metadata()`: metadata of the file.
1527 - `markers()`: markers of the file.
1528 - `set_unwrap()`: Set parameters for unwrapping clipped data.
1530 See audioio.audioloader.AudioLoader for more methods.
1532 """
1534 def __init__(self, filepath=None, buffersize=10.0, backsize=0.0,
1535 verbose=0, meta_kwargs={}, **kwargs):
1536 super().__init__(None, buffersize, backsize,
1537 verbose, meta_kwargs)
1538 if filepath is not None:
1539 self.open(filepath, buffersize, backsize, verbose, **kwargs)
1541 def __getitem__(self, key):
1542 return super(DataLoader, self).__getitem__(key)
1544 def __next__(self):
1545 return super(DataLoader, self).__next__()
1548 # relacs interface:
1549 def open_relacs(self, filepath, buffersize=10.0, backsize=0.0,
1550 verbose=0, amax=1.0):
1551 """Open relacs data files (www.relacs.net) for reading.
1553 Parameters
1554 ----------
1555 filepath: str
1556 Path to a relacs data directory or a file therein.
1557 buffersize: float
1558 Size of internal buffer in seconds.
1559 backsize: float
1560 Part of the buffer to be loaded before the requested start index in seconds.
1561 verbose: int
1562 If > 0 show detailed error/warning messages.
1563 amax: float
1564 The amplitude range of the data.
1566 Raises
1567 ------
1568 FileNotFoundError:
1569 Invalid or non existing fishgrid files.
1570 ValueError:
1571 .gz files not supported.
1572 """
1573 self.verbose = verbose
1575 # open trace files:
1576 filepath = Path(filepath)
1577 self.trace_filepaths = relacs_trace_files(filepath)
1578 if len(self.trace_filepaths) == 0:
1579 raise FileNotFoundError('no relacs files found')
1580 self.sf = []
1581 self.frames = None
1582 self.rate = None
1583 self.unit = ''
1584 self.filepath = filepath
1585 self.file_paths = [self.filepath]
1586 self.file_indices = [0]
1587 for path in self.trace_filepaths:
1588 if path.suffix == '.gz':
1589 raise ValueError('.gz files not supported')
1590 sf = open(path, 'rb')
1591 self.sf.append(sf)
1592 if self.verbose > 0:
1593 print(f'open_relacs("{path}")')
1594 # file size:
1595 sf.seek(0, os.SEEK_END)
1596 frames = sf.tell()//4
1597 if self.frames is None:
1598 self.frames = frames
1599 elif self.frames != frames:
1600 diff = self.frames - frames
1601 if diff > 1 or diff < -2:
1602 raise ValueError('number of frames of traces differ')
1603 elif diff >= 0:
1604 self.frames = frames
1605 sf.seek(0)
1606 # retrieve sampling rate and unit:
1607 rate, us = relacs_samplerate_unit(path)
1608 if self.rate is None:
1609 self.rate = rate
1610 elif rate != self.rate:
1611 raise ValueError('sampling rates of traces differ')
1612 if len(self.unit) == 0:
1613 self.unit = us
1614 elif us != self.unit:
1615 raise ValueError('unit of traces differ')
1616 self.channels = len(self.sf)
1617 self.shape = (self.frames, self.channels)
1618 self.size = self.frames * self.channels
1619 self.ndim = len(self.shape)
1620 self.format = 'RELACS'
1621 self.encoding = 'FLOAT'
1622 self.bufferframes = int(buffersize*self.rate)
1623 self.backframes = int(backsize*self.rate)
1624 self.init_buffer()
1625 self.offset = 0
1626 self.close = self._close_relacs
1627 self.load_audio_buffer = self._load_buffer_relacs
1628 self.basename = self._basename_relacs
1629 self.ampl_min = -amax
1630 self.ampl_max = +amax
1631 self._load_metadata = metadata_relacs
1632 # TODO: load markers:
1633 self._locs = np.zeros((0, 2), dtype=int)
1634 self._labels = np.zeros((0, 2), dtype=object)
1635 self._load_markers = None
1636 return self
1638 def _close_relacs(self):
1639 """Close the relacs data files.
1640 """
1641 for f in self.sf:
1642 f.close()
1643 self.sf = []
1645 def _load_buffer_relacs(self, r_offset, r_size, buffer):
1646 """Load new data from relacs data file.
1648 Parameters
1649 ----------
1650 r_offset: int
1651 First frame to be read from file.
1652 r_size: int
1653 Number of frames to be read from file.
1654 buffer: ndarray
1655 Buffer where to store the loaded data.
1656 """
1657 if len(self.sf) == 0 and len(self.trace_filepaths) > 0:
1658 for path in self.trace_filepaths:
1659 self.sf.append(open(path, 'rb'))
1660 for i, f in enumerate(self.sf):
1661 f.seek(r_offset*4)
1662 data = f.read(r_size*4)
1663 buffer[:, i] = np.frombuffer(data, dtype=np.float32)
1666 def _basename_relacs(self, path=None):
1667 """ Base name of the relacs data files.
1669 Parameters
1670 ----------
1671 path: str or None
1672 Path of a relacs data file (*.raw, info.dat, or just the directory).
1673 If `None`, use `self.filepath`.
1675 Returns
1676 -------
1677 s: str
1678 The base name, i.e. the name of the directory containing the
1679 relacs data files.
1681 """
1682 if path is None:
1683 path = self.filepath
1684 else:
1685 path = Path(path)
1686 if path.is_dir():
1687 return path.name
1688 else:
1689 return path.parent.name
1692 # fishgrid interface:
1693 def open_fishgrid(self, filepath, buffersize=10.0, backsize=0.0,
1694 verbose=0):
1695 """Open fishgrid data files (https://github.com/bendalab/fishgrid) for reading.
1697 Parameters
1698 ----------
1699 filepath: str
1700 Path to a fishgrid data directory, or a file therein.
1701 buffersize: float
1702 Size of internal buffer in seconds.
1703 backsize: float
1704 Part of the buffer to be loaded before the requested start index in seconds.
1705 verbose: int
1706 If > 0 show detailed error/warning messages.
1708 Raises
1709 ------
1710 FileNotFoundError:
1711 Invalid or non existing fishgrid files.
1712 ValueError:
1713 .gz files not supported.
1714 """
1715 self.verbose = verbose
1717 filepath = Path(filepath)
1718 self.trace_filepaths = fishgrid_trace_files(filepath)
1719 if len(self.trace_filepaths) == 0:
1720 raise FileNotFoundError(f'no fishgrid files found')
1721 self.filepath = filepath
1722 self.file_paths = [self.filepath]
1723 self.file_indices = [0]
1724 self._load_metadata = metadata_fishgrid
1725 self._load_markers = markers_fishgrid
1727 # open grid files:
1728 grids = fishgrid_grids(self.metadata())
1729 grid_sizes = [r*c for r,c in grids]
1730 self.channels = 0
1731 for g, path in enumerate(self.trace_filepaths):
1732 self.channels += grid_sizes[g]
1733 self.sf = []
1734 self.grid_channels = []
1735 self.grid_offs = []
1736 offs = 0
1737 self.frames = None
1738 self.rate = get_number(self.metadata(), 'Hz', 'AISampleRate')
1739 v, self.unit = get_number_unit(self.metadata(), 'AIMaxVolt')
1740 if v is not None:
1741 self.ampl_min = -v
1742 self.ampl_max = +v
1744 for g, path in enumerate(self.trace_filepaths):
1745 if path.suffix == '.gz':
1746 raise ValueError('.gz files not supported')
1747 sf = open(path, 'rb')
1748 self.sf.append(sf)
1749 if self.verbose > 0:
1750 print(f'open_fishgrid("{path}")')
1751 # grid channels:
1752 self.grid_channels.append(grid_sizes[g])
1753 self.grid_offs.append(offs)
1754 offs += grid_sizes[g]
1755 # file size:
1756 sf.seek(0, os.SEEK_END)
1757 frames = sf.tell()//4//grid_sizes[g]
1758 if self.frames is None:
1759 self.frames = frames
1760 elif self.frames != frames:
1761 diff = self.frames - frames
1762 if diff > 1 or diff < -2:
1763 raise ValueError('number of frames of traces differ')
1764 elif diff >= 0:
1765 self.frames = frames
1766 sf.seek(0)
1767 self.shape = (self.frames, self.channels)
1768 self.size = self.frames * self.channels
1769 self.ndim = len(self.shape)
1770 self.format = 'FISHGRID'
1771 self.encoding = 'FLOAT'
1772 self.bufferframes = int(buffersize*self.rate)
1773 self.backframes = int(backsize*self.rate)
1774 self.init_buffer()
1775 self.offset = 0
1776 self.close = self._close_fishgrid
1777 self.load_audio_buffer = self._load_buffer_fishgrid
1778 self.basename = self._basename_fishgrid
1779 return self
1781 def _close_fishgrid(self):
1782 """Close the fishgrid data files.
1783 """
1784 for file in self.sf:
1785 file.close()
1786 self.sf = []
1788 def _load_buffer_fishgrid(self, r_offset, r_size, buffer):
1789 """Load new data from relacs data file.
1791 Parameters
1792 ----------
1793 r_offset: int
1794 First frame to be read from file.
1795 r_size: int
1796 Number of frames to be read from file.
1797 buffer: ndarray
1798 Buffer where to store the loaded data.
1799 """
1800 if len(self.sf) == 0 and len(self.trace_filepaths) > 0:
1801 for path in self.trace_filepaths:
1802 self.sf.append(open(path, 'rb'))
1803 for file, gchannels, goffset in zip(self.sf, self.grid_channels, self.grid_offs):
1804 file.seek(r_offset*4*gchannels)
1805 data = file.read(r_size*4*gchannels)
1806 buffer[:, goffset:goffset+gchannels] = np.frombuffer(data, dtype=np.float32).reshape((-1, gchannels))
1808 def _basename_fishgrid(self, path=None):
1809 """ Base name of the fishgrid data files.
1811 Parameters
1812 ----------
1813 path: str or Path or None
1814 Path of a fishgrid data file
1815 (*.raw, fishgrid.cfg, or just the directory).
1816 If `None`, use `self.filepath`.
1818 Returns
1819 -------
1820 s: str
1821 The base name, i.e. the name of the directory containing the
1822 fishgrid data files.
1824 """
1825 if path is None:
1826 path = self.filepath
1827 else:
1828 path = Path(path)
1829 if path.is_dir():
1830 return path.name
1831 else:
1832 return path.parent.name
1836 # container interface:
1837 def open_container(self, filepath, buffersize=10.0,
1838 backsize=0.0, verbose=0, datakey=None,
1839 samplekey=['rate', 'Fs', 'fs'],
1840 timekey=['time'], amplkey=['amax'], unitkey='unit',
1841 metadatakey=['metadata', 'info'],
1842 poskey=['positions'],
1843 spanskey=['spans'], labelskey=['labels'],
1844 descrkey=['descriptions'],
1845 amax=1.0, unit='a.u.'):
1846 """Open generic container file.
1848 Supported file formats are:
1850 - python pickle files (.pkl)
1851 - numpy files (.npz)
1852 - matlab files (.mat)
1854 Parameters
1855 ----------
1856 filepath: str
1857 Path to a container file.
1858 buffersize: float
1859 Size of internal buffer in seconds.
1860 backsize: float
1861 Part of the buffer to be loaded before the requested start index in seconds.
1862 verbose: int
1863 If > 0 show detailed error/warning messages.
1864 datakey: None, str, or list of str
1865 Name of the variable holding the data. If `None` take the
1866 variable that is an 2D array and has the largest number of
1867 elements.
1868 samplekey: str or list of str
1869 Name of the variable holding the sampling rate.
1870 timekey: str or list of str
1871 Name of the variable holding sampling times.
1872 If no sampling rate is available, the sampling rate is retrieved
1873 from the sampling times.
1874 amplkey: str or list of str
1875 Name of the variable holding the amplitude range of the data.
1876 unitkey: str
1877 Name of the variable holding the unit of the data.
1878 metadatakey: str or list of str
1879 Name of the variable holding the metadata.
1880 poskey: str or list of str
1881 Name of the variable holding positions of markers.
1882 spanskey: str or list of str
1883 Name of the variable holding spans of markers.
1884 labelskey: str or list of str
1885 Name of the variable holding labels of markers.
1886 descrkey: str or list of str
1887 Name of the variable holding descriptions of markers.
1888 amax: None or float
1889 If specified and no amplitude range has been found in the data
1890 container, then this is the amplitude range of the data.
1891 unit: None or str
1892 If specified and no unit has been found in the data container,
1893 then return this as the unit of the data.
1895 Raises
1896 ------
1897 ValueError:
1898 Invalid key requested.
1899 """
1900 self.verbose = verbose
1901 data_dict = {}
1902 filepath = Path(filepath)
1903 ext = filepath.suffix.lower()
1904 if ext == '.pkl':
1905 with open(filepath, 'rb') as f:
1906 data_dict = pickle.load(f)
1907 self.format = 'PKL'
1908 elif ext == '.npz':
1909 data_dict = np.load(filepath)
1910 self.format = 'NPZ'
1911 elif ext == '.mat':
1912 from scipy.io import loadmat
1913 data_dict = loadmat(filepath, squeeze_me=True)
1914 self.format = 'MAT'
1915 if self.verbose > 0:
1916 print(f'open_container("{filepath}")')
1917 self.buffer, self.rate, self.unit, amax = \
1918 extract_container_data(data_dict, datakey, samplekey,
1919 timekey, amplkey, unitkey, amax, unit)
1920 self.filepath = filepath
1921 self.file_paths = [self.filepath]
1922 self.file_indices = [0]
1923 self.channels = self.buffer.shape[1]
1924 self.frames = self.buffer.shape[0]
1925 self.shape = self.buffer.shape
1926 self.ndim = self.buffer.ndim
1927 self.size = self.buffer.size
1928 self.encoding = self.numpy_encodings[self.buffer.dtype]
1929 self.ampl_min = -amax
1930 self.ampl_max = +amax
1931 self.offset = 0
1932 self.buffer_changed = np.zeros(self.channels, dtype=bool)
1933 self.bufferframes = self.frames
1934 self.backsize = 0
1935 self.close = self._close_container
1936 self.load_audio_buffer = self._load_buffer_container
1937 self._metadata = extract_container_metadata(data_dict, metadatakey)
1938 self._load_metadata = None
1939 self._locs, self._labels = extract_container_markers(data_dict,
1940 poskey,
1941 spanskey,
1942 labelskey,
1943 descrkey)
1944 self._load_markers = None
1946 def _close_container(self):
1947 """Close container. """
1948 pass
1950 def _load_buffer_container(self, r_offset, r_size, buffer):
1951 """Load new data from container."""
1952 buffer[:, :] = self.buffer[r_offset:r_offset + r_size, :]
1955 # raw data interface:
1956 def open_raw(self, filepath, buffersize=10.0, backsize=0.0,
1957 verbose=0, rate=44000, channels=1, encoding='FLOAT',
1958 amax=1.0, unit='a.u.'):
1959 """Load data from a raw file.
1961 Raw files just contain the data and absolutely no metadata, not
1962 even the smapling rate, number of channels, etc.
1963 Supported file formats are:
1965 - raw files (*.raw)
1966 - LabView scandata (*.scandat)
1968 Parameters
1969 ----------
1970 filepath: str or Path
1971 Path of the file to load.
1972 buffersize: float
1973 Size of internal buffer in seconds.
1974 backsize: float
1975 Part of the buffer to be loaded before the requested start index in seconds.
1976 verbose: int
1977 If > 0 show detailed error/warning messages.
1978 rate: float
1979 Sampling rate of the data in Hertz.
1980 channels: int
1981 Number of channels multiplexed in the data.
1982 encoding: str
1983 The encoding of the data stored in the file.
1984 Valid encodings are 'PCM_16', 'PCM_32', 'PCM_64', 'FLOAT', or
1985 'DOUBLE' or lower-case versions thereof.
1986 amax: float
1987 The amplitude range of the data.
1988 unit: str
1989 The unit of the data.
1990 """
1991 encodings = {'PCM_16': 'i2',
1992 'PCM_32': 'i4',
1993 'PCM_64': 'i8',
1994 'FLOAT': 'f',
1995 'DOUBLE': 'd'}
1996 encoding = encoding.upper()
1997 if not encoding in encodings:
1998 raise ValueError(f'invalid encoding {encoding} for raw file!')
1999 self.dtype = np.dtype(encodings[encoding])
2000 self.verbose = verbose
2001 self.filepath = Path(filepath)
2002 self.file_paths = [self.filepath]
2003 self.file_indices = [0]
2004 self.sf = open(self.filepath, 'rb')
2005 if self.verbose > 0:
2006 print(f'open_raw("{self.filepath}")')
2007 self.rate = float(rate)
2008 # file size:
2009 self.channels = int(channels)
2010 self.sf.seek(0, os.SEEK_END)
2011 self.frames = self.sf.tell()//self.dtype.itemsize//self.channels
2012 self.sf.seek(0)
2013 self.shape = (self.frames, self.channels)
2014 self.ndim = len(self.shape)
2015 self.size = self.frames*self.channels
2016 self.format = 'RAW'
2017 self.encoding = self.numpy_encodings.get(self.dtype, 'UNKNOWN')
2018 self.unit = unit
2019 self.ampl_max = float(amax)
2020 self.ampl_min = -self.ampl_max
2021 self.offset = 0
2022 self.bufferframes = int(buffersize*self.rate)
2023 self.backframes = int(backsize*self.rate)
2024 self.init_buffer()
2025 self.close = self._close_raw
2026 self.load_audio_buffer = self._load_buffer_raw
2027 self._metadata = None
2028 self._load_metadata = None
2029 self._locs = None
2030 self._labels = None
2031 self._load_markers = None
2033 def _close_raw(self):
2034 """Close raw file. """
2035 if self.sf is not None:
2036 self.sf.close()
2037 self.sf = None
2039 def _load_buffer_raw(self, r_offset, r_size, buffer):
2040 """Load new data from container."""
2041 if self.sf is None:
2042 self.sf = open(self.filepath, 'rb')
2043 self.sf.seek(r_offset*self.dtype.itemsize*self.channels)
2044 raw_data = self.sf.read(r_size*self.dtype.itemsize*self.channels)
2045 raw_data = np.frombuffer(raw_data, dtype=self.dtype)
2046 raw_data = raw_data.reshape(-1, self.channels)
2047 # recode:
2048 if self.dtype == np.dtype('int16'):
2049 data = raw_data.astype('float32')
2050 data *= self.ampl_max/2**15
2051 elif self.dtype == np.dtype('int32'):
2052 data = raw_data.astype(float)
2053 data *= self.ampl_max/2**31
2054 elif self.dtype == np.dtype('int64'):
2055 data = raw_data.astype(float)
2056 data *= self.ampl_max/2**63
2057 else:
2058 data = raw_data
2059 buffer[:, :] = data
2062 # audioio interface:
2063 def open_audioio(self, filepath, buffersize=10.0, backsize=0.0,
2064 verbose=0, gainkey=default_gain_keys, sep='.',
2065 amax=None, unit='a.u.'):
2066 """Open an audio file.
2068 See the [audioio](https://github.com/bendalab/audioio) package
2069 for details.
2071 Parameters
2072 ----------
2073 filepath: str
2074 Path to an audio file.
2075 buffersize: float
2076 Size of internal buffer in seconds.
2077 backsize: float
2078 Part of the buffer to be loaded before the requested start index
2079 in seconds.
2080 verbose: int
2081 If > 0 show detailed error/warning messages.
2082 gainkey: str or list of str
2083 Key in the file's metadata that holds some gain information.
2084 If found, the data will be multiplied with the gain,
2085 and if available, the corresponding unit is returned.
2086 See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
2087 sep: str
2088 String that separates section names in `gainkey`.
2089 amax: None or float
2090 If specified and no gain has been found in the metadata,
2091 then use this as the amplitude range.
2092 unit: None or str
2093 If specified and no gain has been found in the metadata,
2094 then this is the unit of the data.
2096 """
2097 self.verbose = verbose
2098 super(DataLoader, self).open(filepath, buffersize, backsize, verbose)
2099 md = self.metadata()
2100 fac, unit = get_gain(md, gainkey, sep, amax, unit)
2101 if fac is None:
2102 self.gain_fac = 1.0
2103 else:
2104 self.gain_fac = fac
2105 self._load_buffer_audio_org = self.load_audio_buffer
2106 self.load_audio_buffer = self._load_buffer_audioio
2107 self.ampl_min *= self.gain_fac
2108 self.ampl_max *= self.gain_fac
2109 self.unit = unit
2110 return self
2112 def _load_buffer_audioio(self, r_offset, r_size, buffer):
2113 """Load and scale new data from an audio file.
2115 Parameters
2116 ----------
2117 r_offset: int
2118 First frame to be read from file.
2119 r_size: int
2120 Number of frames to be read from file.
2121 buffer: ndarray
2122 Buffer where to store the loaded data.
2123 """
2124 self._load_buffer_audio_org(r_offset, r_size, buffer)
2125 buffer *= self.gain_fac
2128 # open multiple files as one:
2129 def open_multiple(self, filepaths, buffersize=10.0, backsize=0.0,
2130 verbose=0, mode='strict', rate=None, channels=None,
2131 unit=None, amax=None, end_indices=None):
2132 """Open multiple files as a single concatenated array.
2134 Parameters
2135 ----------
2136 filepaths: list of str or Path
2137 List of file paths of audio files.
2138 buffersize: float
2139 Size of internal buffer in seconds.
2140 backsize: float
2141 Part of the buffer to be loaded before the requested start index in seconds.
2142 verbose: int
2143 If larger than zero show detailed error/warning messages.
2144 mode: 'relaxed' or 'strict'
2145 If 'strict', only concatenate files if they contain
2146 a start time in their meta data.
2147 rate: float
2148 If provided, do a minimal initialization (no checking)
2149 using the provided sampling rate (in Hertz), channels,
2150 unit, maximum amplitude, and end_indices.
2151 channels: int
2152 If provided, do a minimal initialization (no checking)
2153 using the provided rate, number of channels,
2154 unit, maximum amplitude, and end_indices.
2155 unit: str
2156 If provided, do a minimal initialization (no checking)
2157 using the provided rate, number of channels,
2158 unit, maximum amplitude, and end_indices.
2159 amax: float
2160 If provided, do a minimal initialization (no checking)
2161 using the provided rate, number of channels,
2162 unit, maximum amplitude amax, and end_indices.
2163 end_indices: sequence of int
2164 If provided, do a minimal initialization (no checking)
2165 using the provided rate, channels,
2166 unit, maximum amplitude, and end_indices.
2168 Raises
2169 ------
2170 TypeError
2171 `filepaths` must be a sequence.
2172 ValueError
2173 Empty `filepaths`.
2174 FileNotFoundError
2175 `filepaths` does not contain a single valid file.
2177 """
2178 if not isinstance(filepaths, (list, tuple, np.ndarray)):
2179 raise TypeError('input argument filepaths is not a sequence!')
2180 if len(filepaths) == 0:
2181 raise ValueError('input argument filepaths is empy sequence!')
2182 self.buffersize = buffersize
2183 self.backsize = backsize
2184 self.filepath = None
2185 self.file_paths = []
2186 self.open_files = []
2187 self.open_loaders = []
2188 self.data_files = []
2189 self.collect_counter = 0
2190 self.frames = 0
2191 self.start_indices = []
2192 self.end_indices = []
2193 self.start_time = None
2194 start_time = None
2195 self._metadata = {}
2196 self._locs = np.zeros((0, 2), dtype=int)
2197 self._labels = np.zeros((0, 2), dtype=object)
2198 if end_indices is not None:
2199 self.file_paths = [Path(fp) for fp in filepaths]
2200 self.filepath = self.file_paths[0]
2201 self.data_files = [None] * len(self.file_paths)
2202 self.frames = end_indices[-1]
2203 self.start_indices = [0] + list(end_indices[:-1])
2204 self.end_indices = end_indices
2205 self.format = None
2206 self.encoding = None
2207 self.rate = rate
2208 self.channels = channels
2209 self.unit = unit
2210 self.ampl_max = amax
2211 self.ampl_min = -amax
2212 else:
2213 for filepath in filepaths:
2214 try:
2215 a = DataLoader(filepath, buffersize, backsize, verbose)
2216 except Exception as e:
2217 if verbose > 0:
2218 print(e)
2219 continue
2220 # collect metadata:
2221 md = a.metadata()
2222 fmd = flatten_metadata(md, True)
2223 add_metadata(self._metadata, fmd)
2224 if self.filepath is None:
2225 # first file:
2226 self.filepath = a.filepath
2227 self.format = a.format
2228 self.encoding = a.encoding
2229 self.rate = a.rate
2230 self.channels = a.channels
2231 self.unit = a.unit
2232 self.ampl_max = a.ampl_max
2233 self.ampl_min = a.ampl_min
2234 self.start_time = get_datetime(md)
2235 start_time = self.start_time
2236 stime = self.start_time
2237 else:
2238 # check channels, rate, and amplitudes:
2239 error_str = None
2240 if a.channels != self.channels:
2241 error_str = f'number of channels differs: ' \
2242 f'{a.channels} in {a.filepath} versus ' \
2243 f'{self.channels} in {self.filepath}'
2244 if a.rate != self.rate:
2245 error_str = f'sampling rates differ: ' \
2246 f'{a.rate} in {a.filepath} versus ' \
2247 f'{self.rate} in {self.filepath}'
2248 if a.ampl_min != self.ampl_min:
2249 error_str = f'minimum amplitudes differ: ' \
2250 f'{a.ampl_min} in {a.filepath} versus ' \
2251 f'{self.ampl_min} in {self.filepath}'
2252 if a.ampl_max != self.ampl_max:
2253 error_Str = f'maximum amplitudes differ: ' \
2254 f'{a.ampl_max} in {a.filepath} versus ' \
2255 f'{self.ampl_max} in {self.filepath}'
2256 # check start time of recording:
2257 stime = get_datetime(md)
2258 if mode == 'strict' and (start_time is None or stime is None):
2259 error_str = 'file does not contain a start time in its meta data'
2260 if start_time is not None and stime is not None and \
2261 abs(start_time - stime) > timedelta(seconds=self._max_time_diff):
2262 error_str = f'start time does not indicate continuous recording: ' \
2263 f'expected {start_time} instead of ' \
2264 f'{stime} in {a.filepath}'
2265 if error_str is not None:
2266 if verbose > 0:
2267 print(error_str)
2268 a.close()
2269 del a
2270 break
2271 # markers:
2272 locs, labels = a.markers()
2273 locs[:,0] += self.frames
2274 self._locs = np.vstack((self._locs, locs))
2275 self._labels = np.vstack((self._labels, labels))
2276 # indices:
2277 self.start_indices.append(self.frames)
2278 self.frames += a.frames
2279 self.end_indices.append(self.frames)
2280 if stime is not None:
2281 start_time = stime + timedelta(seconds=a.frames/a.rate)
2282 # add file to lists:
2283 self.file_paths.append(a.filepath)
2284 if len(self.open_files) < AudioLoader.max_open_files:
2285 self.open_files.append(a)
2286 else:
2287 a.close()
2288 if len(self.open_loaders) < AudioLoader.max_open_loaders:
2289 self.data_files.append(a)
2290 self.open_loaders.append(a)
2291 else:
2292 a.close()
2293 del a
2294 self.data_files.append(None)
2295 if len(self.data_files) == 0:
2296 raise FileNotFoundError('input argument filepaths does not contain any valid audio file!')
2297 # set startime from first file:
2298 if self.start_time is not None:
2299 set_starttime(self._metadata, self.start_time)
2300 # setup infrastructure:
2301 self.file_indices = self.start_indices
2302 self.start_indices = np.array(self.start_indices)
2303 self.end_indices = np.array(self.end_indices)
2304 self.shape = (self.frames, self.channels)
2305 self.bufferframes = int(buffersize*self.rate)
2306 self.backframes = int(backsize*self.rate)
2307 self.init_buffer()
2308 self.close = self._close_multiple
2309 self.load_audio_buffer = self._load_buffer_multiple
2310 self._load_metadata = None
2311 self._load_markers = None
2312 return self
2314 def _close_multiple(self):
2315 """Close all the data files. """
2316 self.open_files = []
2317 self.open_loaders = []
2318 if hasattr(self, 'data_files'):
2319 for a in self.data_files:
2320 if a is not None:
2321 a.close()
2322 self.data_files = []
2323 self.filepath = None
2324 self.file_paths = []
2325 self.file_indices = []
2326 self.start_indices = []
2327 self.end_indices = []
2328 del self.data_files
2329 del self.open_files
2330 del self.open_loaders
2331 del self.start_indices
2332 del self.end_indices
2334 def _load_buffer_multiple(self, r_offset, r_size, buffer):
2335 """Load new data from the underlying files.
2337 Parameters
2338 ----------
2339 r_offset: int
2340 First frame to be read from file.
2341 r_size: int
2342 Number of frames to be read from file.
2343 buffer: ndarray
2344 Buffer where to store the loaded data.
2345 """
2346 offs = r_offset
2347 size = r_size
2348 boffs = 0
2349 ai = np.searchsorted(self.end_indices, offs, side='right')
2350 while size > 0:
2351 if self.data_files[ai] is None:
2352 a = DataLoader(self.file_paths[ai],
2353 self.buffersize, self.backsize, 0)
2354 self.data_files[ai] = a
2355 self.open_loaders.append(a)
2356 self.open_files.append(a)
2357 if len(self.open_files) > AudioLoader.max_open_files:
2358 a0 = self.open_files.pop(0)
2359 a0.close()
2360 if len(self.open_loaders) > AudioLoader.max_open_loaders:
2361 a0 = self.open_loaders.pop(0)
2362 self.data_files[self.data_files.index(a0)] = None
2363 a0.close()
2364 del a0
2365 self.collect_counter += 1
2366 if self.collect_counter > AudioLoader.max_open_loaders//2:
2367 gc.collect() # takes time!
2368 self.collect_counter = 0
2369 else:
2370 self.open_loaders.pop(self.open_loaders.index(self.data_files[ai]))
2371 self.open_loaders.append(self.data_files[ai])
2372 ai0 = offs - self.start_indices[ai]
2373 ai1 = offs + size
2374 if ai1 > self.end_indices[ai]:
2375 ai1 = self.end_indices[ai]
2376 ai1 -= self.start_indices[ai]
2377 n = ai1 - ai0
2378 self.data_files[ai].load_audio_buffer(ai0, n,
2379 buffer[boffs:boffs + n,:])
2380 if self.data_files[ai] in self.open_files:
2381 self.open_files.pop(self.open_files.index(self.data_files[ai]))
2382 self.open_files.append(self.data_files[ai])
2383 if len(self.open_files) > AudioLoader.max_open_files:
2384 self.open_files[0].close()
2385 self.open_files.pop(0)
2386 boffs += n
2387 offs += n
2388 size -= n
2389 ai += 1
2392 def open(self, filepath, buffersize=10.0, backsize=0.0,
2393 verbose=0, **kwargs):
2394 """Open file with time-series data for reading.
2396 Parameters
2397 ----------
2398 filepath: str or list of str
2399 Name of the file or list of many file names that should be
2400 made accessible as a single array.
2401 buffersize: float
2402 Size of internal buffer in seconds.
2403 backsize: float
2404 Part of the buffer to be loaded before the requested start index
2405 in seconds.
2406 verbose: int
2407 If > 0 show detailed error/warning messages.
2408 **kwargs: dict
2409 Further keyword arguments that are passed on to the
2410 format specific opening functions.
2411 For example:
2412 - `amax`: the amplitude range of the data.
2413 - 'unit': the unit of the data.
2415 Raises
2416 ------
2417 ValueError:
2418 `filepath` is empty string.
2419 """
2420 # list of implemented open functions:
2421 data_open_funcs = (
2422 ('relacs', check_relacs, self.open_relacs, 1),
2423 ('fishgrid', check_fishgrid, self.open_fishgrid, 1),
2424 ('container', check_container, self.open_container, 1),
2425 ('raw', check_raw, self.open_raw, 1),
2426 ('audioio', None, self.open_audioio, 0),
2427 )
2429 self.buffer = np.array([])
2430 self.rate = 0.0
2431 if not filepath:
2432 raise ValueError('input argument filepath is empty string.')
2433 if isinstance(filepath, (list, tuple, np.ndarray)):
2434 if len(filepath) > 1:
2435 self.open_multiple(filepath, buffersize, backsize,
2436 verbose, **kwargs)
2437 if len(self.file_paths) > 1:
2438 return self
2439 filepath = self.file_paths[0]
2440 self.close()
2441 else:
2442 filepath = filepath[0]
2443 # open data:
2444 for name, check_file, open_file, v in data_open_funcs:
2445 if check_file is None or check_file(filepath):
2446 open_file(filepath, buffersize, backsize, verbose, **kwargs)
2447 if v*verbose > 1:
2448 if self.format is not None:
2449 print(f' format : {self.format}')
2450 if self.encoding is not None:
2451 print(f' encoding : {self.encoding}')
2452 print(f' sampling rate: {self.rate} Hz')
2453 print(f' channels : {self.channels}')
2454 print(f' frames : {self.frames}')
2455 print(f' range : {self.ampl_max:g}{self.unit}')
2456 break
2457 return self
2460def demo(filepath, plot=False):
2461 print("try load_data:")
2462 data, rate, unit, amax = load_data(filepath, verbose=2)
2463 if plot:
2464 fig, ax = plt.subplots()
2465 time = np.arange(len(data))/rate
2466 for c in range(data.shape[1]):
2467 ax.plot(time, data[:,c])
2468 ax.set_xlabel('Time [s]')
2469 ax.set_ylabel(f'[{unit}]')
2470 if amax is not None and np.isfinite(amax):
2471 ax.set_ylim(-amax, +amax)
2472 plt.show()
2473 return
2475 print('')
2476 print("try DataLoader:")
2477 with DataLoader(filepath, 2.0, 1.0, 1) as data:
2478 print('sampling rate: %g' % data.rate)
2479 print('frames : %d %d' % (len(data), data.shape[0]))
2480 nframes = int(1.0 * data.rate)
2481 # forward:
2482 for i in range(0, len(data), nframes):
2483 print('forward %d-%d' % (i, i + nframes))
2484 x = data[i:i + nframes, 0]
2485 if plot:
2486 fig, ax = plt.subplots()
2487 ax.plot((i + np.arange(len(x)))/data.rate, x)
2488 ax.set_xlabel('Time [s]')
2489 ax.set_ylabel(f'[{data.unit}]')
2490 plt.show()
2491 # and backwards:
2492 for i in reversed(range(0, len(data), nframes)):
2493 print('backward %d-%d' % (i, i + nframes))
2494 x = data[i:i + nframes, 0]
2495 if plot:
2496 fig, ax = plt.subplots()
2497 ax.plot((i + np.arange(len(x)))/data.rate, x)
2498 ax.set_xlabel('Time [s]')
2499 ax.set_ylabel(f'[{data.unit}]')
2500 plt.show()
2503def main(*cargs):
2504 """Call demo with command line arguments.
2506 Parameters
2507 ----------
2508 cargs: list of str
2509 Command line arguments as provided by sys.argv[1:]
2510 """
2511 import argparse
2512 parser = argparse.ArgumentParser(description=
2513 'Checking thunderlab.dataloader module.')
2514 parser.add_argument('-p', dest='plot', action='store_true',
2515 help='plot loaded data')
2516 parser.add_argument('file', nargs=1, default='', type=str,
2517 help='name of data file')
2518 args = parser.parse_args(cargs)
2519 demo(args.file[0], args.plot)
2522if __name__ == "__main__":
2523 main(*sys.argv[1:])