Coverage for src/thunderlab/dataloader.py: 77%
1077 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-03-18 22:36 +0000
« prev ^ index » next coverage.py v7.7.0, created at 2025-03-18 22:36 +0000
1"""Load time-series data from files.
3```
4data, rate, unit, amax = load_data('data/file.wav')
5```
7The function `load_data()` loads the whole time-series from the file
8as a numpy array of floats. First dimension is frames, second is
9channels. In contrast to the `audioio.load_audio()` function, the
10values of the data array are not restricted between -1 and 1. They can
11 assume any value within the range `-amax` to `+amax` with the returned
12`unit`.
14```
15data = DataLoader('data/file.wav', 60.0)
16```
17or
18```
19with DataLoader('data/file.wav', 60.0) as data:
20```
21Create a `DataLoader` object that loads chunks of 60 seconds long data
22on demand. `data` can be used like a read-only numpy array of floats.
25## Supported file formats
27- python pickle files
28- numpy .npz files
29- matlab .mat files
30- audio files via [`audioio`](https://github.com/bendalab/audioio) package
31- LabView .scandat files
32- relacs trace*.raw files (https://www.relacs.net)
33- fishgrid traces-*.raw files (https://github.com/bendalab/fishgrid)
36## Metadata
38Many file formats allow to store metadata that further describe the
39stored time series data. We handle them as nested dictionary of key-value
40pairs. Load them with the `metadata()` function:
41```
42metadata = metadata('data/file.mat')
43```
45## Markers
47Some file formats also allow to store markers that mark specific
48positions in the time series data. Load marker positions and spans (in
49the 2-D array `locs`) and label and text strings (in the 2-D array
50`labels`) with the `markers()` function:
51```
52locs, labels = markers('data.wav')
53```
55## Additional, format specific functions
57- `extract_container_metadata()`: extract metadata from dictionary loaded from a container file.
58- `relacs_samplerate_unit()`: retrieve sampling rate and unit from a relacs stimuli.dat file.
59- `relacs_header()`: read key-value pairs from relacs *.dat file headers.
60- `fishgrid_grids()`: retrieve grid sizes from a fishgrid.cfg file.
61- `fishgrid_spacings()`: spacing between grid electrodes.
63"""
65import gc
66import os
67import sys
68import glob
69import gzip
70import numpy as np
71try:
72 import matplotlib.pyplot as plt
73except ImportError:
74 pass
75from datetime import timedelta
76from audioio import load_audio, AudioLoader, unflatten_metadata
77from audioio import get_number_unit, get_number, get_int, get_bool, get_gain
78from audioio import default_starttime_keys, default_gain_keys
79from audioio import get_datetime, flatten_metadata, add_metadata, set_starttime
80from audioio import metadata as metadata_audioio
81from audioio import markers as markers_audioio
def relacs_samplerate_unit(filepath, channel=0):
    """Retrieve sampling rate and unit from a relacs stimuli.dat file.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.
    channel: int
        Channel (trace) number, if `filepath` does not specify a
        trace-*.raw file.

    Returns
    -------
    samplerate: float
        Sampling rate in Hertz.
    unit: str
        Unit of the trace, can be empty if not found.

    Raises
    ------
    IOError/FileNotFoundError:
        If the stimuli.dat file does not exist.
    ValueError:
        stimuli.dat file does not contain sampling rate.
    """
    trace = channel + 1
    relacs_dir = filepath
    # check for relacs data directory:
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
        bn = os.path.basename(filepath).lower()
        i = bn.find('.raw')
        # parse the trace number from a trace-*.raw file name:
        if len(bn) > 5 and bn[0:5] == 'trace' and i > 6:
            trace = int(bn[6:i])

    # retrieve sampling rate and unit from stimuli.dat file:
    samplerate = None
    sampleinterval = None
    unit = ""

    # read all initial header lines starting with '#':
    lines = []
    stimuli_file = os.path.join(relacs_dir, 'stimuli.dat')
    if os.path.isfile(stimuli_file + '.gz'):
        stimuli_file += '.gz'
    if stimuli_file[-3:] == '.gz':
        # 'rt' is required for text mode: gzip.open() defaults to binary
        # mode for 'r' and then rejects the encoding argument:
        with gzip.open(stimuli_file, 'rt', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    else:
        with open(stimuli_file, 'r', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)

    for line in lines:
        if "unit%d" % trace in line:
            unit = line.split(':')[1].strip()
        if "sampling rate%d" % trace in line:
            value = line.split(':')[1].strip()
            samplerate = float(value.replace('Hz',''))
        elif "sample interval%d" % trace in line:
            value = line.split(':')[1].strip()
            sampleinterval = float(value.replace('ms',''))

    if samplerate is not None:
        return samplerate, unit
    if sampleinterval is not None:
        # sample interval is given in milliseconds:
        return 1000/sampleinterval, unit
    raise ValueError(f'could not retrieve sampling rate from {stimuli_file}')
def relacs_header(filepath, store_empty=False, first_only=False,
                  lower_keys=False, flat=False,
                  add_sections=False):
    """Read key-value pairs from a relacs *.dat file header.

    Parameters
    ----------
    filepath: str
        A relacs *.dat file, can be also a zipped .gz file.
    store_empty: bool
        If `False` do not add meta data with empty values.
    first_only: bool
        If `True` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with sections names separated by
        '.' to make them unique.

    Returns
    -------
    data: dict
        Nested dictionary with key-value pairs of the file header.

    Raises
    ------
    IOError/FileNotFoundError:
        If `filepath` cannot be opened.
    """
    # read in header from file (all initial lines starting with '#'):
    lines = []
    if os.path.isfile(filepath + '.gz'):
        filepath += '.gz'
    if filepath[-3:] == '.gz':
        # 'rt' is required for text mode: gzip.open() defaults to binary
        # mode for 'r' and then rejects the encoding argument:
        with gzip.open(filepath, 'rt', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    else:
        with open(filepath, 'r', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    # parse:
    data = {}
    cdatas = [data]       # stack of nested dictionaries
    sections = ['']       # stack of section names
    ident_offs = None     # indentation of the top level
    ident = None          # indentation step per section level
    for line in lines:
        words = line.split(':')
        value = ':'.join(words[1:]).strip() if len(words) > 1 else ''
        if len(words) >= 1:
            key = words[0].strip('#')
            # get section level from indentation:
            level = 0
            if not flat or len(value) == 0:
                nident = len(key) - len(key.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
            # close sections deeper than the current level:
            if not flat:
                while len(cdatas) > level + 1:
                    cdatas[-1][sections.pop()] = cdatas.pop()
            else:
                while len(sections) > level + 1:
                    sections.pop()
            # key:
            key = key.strip().strip('"')
            if lower_keys:
                key = key.lower()
            skey = key
            if add_sections:
                key = '.'.join(sections[1:] + [key])
            if len(value) == 0:
                # new sub-section:
                if flat:
                    if store_empty:
                        cdatas[-1][key] = None
                else:
                    cdatas.append({})
                sections.append(skey)
            else:
                # key-value pair:
                value = value.strip('"')
                # a value of '-' marks an empty entry and is skipped
                # unless store_empty is set (the original condition
                # `len(value) > 0 or value != '-' or store_empty` was
                # always True in this branch):
                if value != '-' or store_empty:
                    if len(value) > 0 and value[0] == '[' and value[-1] == ']':
                        # split list values:
                        value = [v.strip() for v in value.lstrip('[').rstrip(']').split(',')]
                        if first_only:
                            value = value[0]
                    cdatas[-1][key] = value
    # close all remaining open sections:
    while len(cdatas) > 1:
        cdatas[-1][sections.pop()] = cdatas.pop()
    return data
def check_relacs(filepath):
    """Check for valid relacs file.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    is_relacs: boolean
        `True` if `filepath` is a valid relacs directory or is a file therein.
    """
    # directory that should contain the relacs files:
    relacs_dir = filepath if os.path.isdir(filepath) else os.path.dirname(filepath)
    # a valid relacs data directory has both a stimuli.dat file and a
    # trace-1.raw file (either may be gzipped):
    has_stimuli = any(os.path.isfile(os.path.join(relacs_dir, fname))
                      for fname in ('stimuli.dat', 'stimuli.dat.gz'))
    has_trace = any(os.path.isfile(os.path.join(relacs_dir, fname))
                    for fname in ('trace-1.raw', 'trace-1.raw.gz'))
    return has_stimuli and has_trace
def relacs_trace_files(filepath):
    """Expand file path for relacs data to appropriate trace*.raw file names.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    trace_filepaths: list of str
        List of relacs trace*.raw files.
    """
    relacs_dir = filepath if os.path.isdir(filepath) else os.path.dirname(filepath)
    trace_filepaths = []
    # collect consecutively numbered trace files and stop at the first gap:
    for k in range(1, 10001):
        fname = os.path.join(relacs_dir, f'trace-{k}.raw')
        if os.path.isfile(fname):
            trace_filepaths.append(fname)
        elif os.path.isfile(fname + '.gz'):
            trace_filepaths.append(fname + '.gz')
        else:
            break
    return trace_filepaths
def load_relacs(filepath, amax=1.0):
    """Load traces that have been recorded with relacs (https://github.com/relacs/relacs).

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.
    amax: float
        The amplitude range of the data.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz
    unit: str
        Unit of the data
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    FileNotFoundError:
        Invalid or non existing relacs files.
    ValueError:
        - Invalid name for relacs trace-*.raw file.
        - Sampling rates of traces differ.
        - Unit of traces differ.
    """
    trace_filepaths = relacs_trace_files(filepath)
    if len(trace_filepaths) == 0:
        raise FileNotFoundError('no relacs files found')
    # load trace*.raw files:
    # trace_filepaths is already in numeric trace order - do not sort
    # it lexically, that would put e.g. trace-10 before trace-2:
    nchannels = len(trace_filepaths)
    data = None
    nrows = 0
    rate = None
    unit = ''
    for c, path in enumerate(trace_filepaths):
        if path[-3:] == '.gz':
            with gzip.open(path, 'rb') as sf:
                x = np.frombuffer(sf.read(), dtype=np.float32)
        else:
            x = np.fromfile(path, np.float32)
        if data is None:
            # allocate on the first trace; all traces are expected to
            # have the same number of samples:
            nrows = len(x)
            data = np.zeros((nrows, nchannels))
        n = min(len(x), nrows)
        data[:n,c] = x[:n]
        # retrieve sampling rate and unit:
        crate, us = relacs_samplerate_unit(path, c)
        if rate is None:
            rate = crate
        elif crate != rate:
            raise ValueError('sampling rates of traces differ')
        if len(unit) == 0:
            unit = us
        elif us != unit:
            raise ValueError('unit of traces differ')
    return data, rate, unit, amax
def metadata_relacs(filepath, store_empty=False, first_only=False,
                    lower_keys=False, flat=False, add_sections=False):
    """ Read meta-data of a relacs data set.

    Parameters
    ----------
    filepath: str
        A relacs data directory or a file therein.
    store_empty: bool
        If `False` do not add meta data with empty values.
    first_only: bool
        If `True` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with sections names separated by
        '.' to make them unique.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    relacs_dir = filepath
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
    info_path = os.path.join(relacs_dir, 'info.dat')
    if not os.path.exists(info_path):
        # return an empty dictionary (the original returned the tuple
        # `dict(), []` here, which is inconsistent with the dict
        # returned when info.dat exists):
        return {}
    data = relacs_header(info_path, store_empty, first_only,
                         lower_keys, flat, add_sections)
    return data
def fishgrid_spacings(metadata, unit='m'):
    """Spacing between grid electrodes.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.
    unit: str
        Unit in which to return the spacings.

    Returns
    -------
    grid_dist: list of tuple of float
        For each grid the distances between rows and columns in `unit`.
    """
    grids_dist = []
    # up to four grids are supported:
    for grid in range(1, 5):
        row_dist = get_number(metadata, unit, f'RowDistance{grid}', default=0)
        col_dist = get_number(metadata, unit, f'ColumnDistance{grid}', default=0)
        nrows = get_int(metadata, f'Rows{grid}', default=0)
        ncols = get_int(metadata, f'Columns{grid}', default=0)
        # a grid counts if it is marked used or has a non-empty layout:
        if get_bool(metadata, f'Used{grid}', default=False) or \
           (ncols > 0 and nrows > 0):
            grids_dist.append((row_dist, col_dist))
    return grids_dist
def fishgrid_grids(metadata):
    """Retrieve grid sizes from a fishgrid.cfg file.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.

    Returns
    -------
    grids: list of tuple of int
        For each grid the number of rows and columns.
    """
    grids = []
    # up to four grids are supported:
    for grid in range(1, 5):
        nrows = get_int(metadata, f'Rows{grid}', default=0)
        ncols = get_int(metadata, f'Columns{grid}', default=0)
        # a grid counts if it is marked used or has a non-empty layout:
        if get_bool(metadata, f'Used{grid}', default=False) or \
           (ncols > 0 and nrows > 0):
            grids.append((nrows, ncols))
    return grids
def check_fishgrid(filepath):
    """Check for valid fishgrid file (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    filepath: str
        Path to a fishgrid data directory or a file in a fishgrid
        data directory.

    Returns
    -------
    is_fishgrid: bool
        `True` if `filepath` is a valid fishgrid data directory or
        a file therein.
    """
    # directory that should contain the fishgrid files:
    fishgrid_dir = filepath if os.path.isdir(filepath) else os.path.dirname(filepath)
    # a configuration file is mandatory:
    if not os.path.isfile(os.path.join(fishgrid_dir, 'fishgrid.cfg')):
        return False
    # and at least one traces file must be present:
    return (os.path.isfile(os.path.join(fishgrid_dir, 'traces-grid1.raw'))
            or os.path.isfile(os.path.join(fishgrid_dir, 'traces.raw')))
def fishgrid_trace_files(filepath):
    """Expand file paths for fishgrid data to appropriate traces*.raw file names.

    Parameters
    ----------
    filepath: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    trace_filepaths: list of str
        List of fishgrid traces*.raw files.
    """
    # find grids:
    fishgrid_dir = filepath if os.path.isdir(filepath) else os.path.dirname(filepath)
    trace_filepaths = []
    # collect consecutively numbered per-grid files, stop at the first gap:
    for grid in range(1, 10001):
        file = os.path.join(fishgrid_dir, f'traces-grid{grid}.raw')
        if not os.path.isfile(file):
            break
        trace_filepaths.append(file)
    # fall back on the single traces.raw file of older recordings:
    if not trace_filepaths:
        file = os.path.join(fishgrid_dir, 'traces.raw')
        if os.path.isfile(file):
            trace_filepaths.append(file)
    return trace_filepaths
def load_fishgrid(filepath):
    """Load traces that have been recorded with fishgrid (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    filepath: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    FileNotFoundError:
        Invalid or non existing fishgrid files.
    """
    trace_filepaths = fishgrid_trace_files(filepath)
    if len(trace_filepaths) == 0:
        raise FileNotFoundError(f'no fishgrid files found')
    md = metadata_fishgrid(filepath)
    # number of channels stored in each grid file:
    grid_sizes = [nrows*ncols for nrows, ncols in fishgrid_grids(md)]
    grid_channels = [grid_sizes[g] for g in range(len(trace_filepaths))]
    nchannels = sum(grid_channels)
    rate = get_number(md, 'Hz', 'AISampleRate')
    # assemble the channels of all grids into one array:
    data = None
    nrows = 0
    offs = 0
    for path, channels in zip(trace_filepaths, grid_channels):
        x = np.fromfile(path, np.float32).reshape((-1, channels))
        if data is None:
            # allocate on the first grid file:
            nrows = len(x)
            data = np.zeros((nrows, nchannels))
        n = min(len(x), nrows)
        data[:n, offs:offs + channels] = x[:n, :]
        offs += channels
    amax, unit = get_number_unit(md, 'AIMaxVolt')
    return data, rate, unit, amax
# Register fishgrid-specific metadata keys with audioio's default search
# lists, so that generic audioio helpers can find start time and gain in
# fishgrid metadata as well:
default_starttime_keys.append(['StartDate', 'StartTime'])
# try the fishgrid amplitude-range key first when looking up the gain:
default_gain_keys.insert(0, 'AIMaxVolt')
def metadata_fishgrid(filepath):
    """ Read meta-data of a fishgrid data set.

    Parameters
    ----------
    filepath: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    fishgrid_dir = filepath
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(filepath)
    path = os.path.join(fishgrid_dir, 'fishgrid.cfg')
    # read in header from file:
    lines = []
    if os.path.isfile(path + '.gz'):
        path += '.gz'
    if not os.path.exists(path):
        return {}
    if path[-3:] == '.gz':
        # 'rt' is required for text mode: gzip.open() defaults to binary
        # mode for 'r' and then rejects the encoding argument:
        with gzip.open(path, 'rt', encoding='latin-1') as sf:
            for line in sf:
                lines.append(line)
    else:
        with open(path, 'r', encoding='latin-1') as sf:
            for line in sf:
                lines.append(line)
    # parse:
    data = {}
    cdatas = [data]        # stack of nested dictionaries
    ident_offs = None      # indentation of the top level
    ident = None           # indentation step per section level
    old_style = False      # set if '----' section markers are found
    grid_n = False         # grid number appended to keys of old-style files
    for line in lines:
        if len(line.strip()) == 0:
            continue
        if line[0] == '*':
            # new-style top-level section:
            key = line[1:].strip()
            data[key] = {}
            cdatas = [data, data[key]]
        elif '----' in line:
            # old-style section marker:
            old_style = True
            key = line.strip().strip(' -').replace('&', '')
            if key.upper() == 'SETUP':
                key = 'Grid 1'
            grid_n = False
            if key[:4].lower() == 'grid':
                grid_n = key[5]
            cdatas = cdatas[:2]
            cdatas[1][key] = {}
            cdatas.append(cdatas[1][key])
        else:
            words = line.split(':')
            key = words[0].strip().strip('"')
            value = None
            if len(words) > 1 and (len(words[1].strip()) > 0 or old_style):
                value = ':'.join(words[1:]).strip().strip('"')
            if old_style:
                if value is None:
                    # new sub-section:
                    cdatas = cdatas[:3]
                    cdatas[2][key] = {}
                    cdatas.append(cdatas[2][key])
                else:
                    # make keys unique by appending the grid number:
                    if grid_n and key[-1] != grid_n:
                        key = key + grid_n
                    cdatas[-1][key] = value
            else:
                # get section level from indentation:
                level = 0
                nident = len(line) - len(line.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
                # close sections deeper than the current level:
                cdatas = cdatas[:2 + level]
                if value is None:
                    # new section:
                    cdatas[-1][key] = {}
                    cdatas.append(cdatas[-1][key])
                else:
                    # key-value pair:
                    cdatas[-1][key] = value.replace(r'\n', '\n')
    # remove metadata of unused grids:
    fgm = data.get('FishGrid', {})
    for i in range(4):
        gs = f'Grid {i+1}'
        if gs in fgm:
            gm = fgm[gs]
            us = f'Used{i+1}'
            if us in gm and gm[us].upper() == 'FALSE':
                del fgm[gs]
    return data
def markers_fishgrid(filepath):
    """ Read markers of a fishgrid data set.

    Markers are read from the timestamps.dat file next to the trace
    files.  That file contains blank-line separated blocks of
    'key: value' pairs, one block per timestamp.

    Parameters
    ----------
    filepath: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    def add_marker():
        # convert the currently accumulated marker block into a
        # locs/labels entry; indices in timestamps.dat count single
        # samples over all channels, so divide by the channel count:
        if 'index1' in marker:
            index1 = int(marker['index1'])//nchannels
        else:
            index1 = int(marker['index'])//nchannels
        span1 = int(marker.get('span1', 0))//nchannels
        locs.append([index1, span1])
        ls = marker.get('label', 'M')
        cs = marker.get('comment', '')
        labels.append([ls, cs])

    fishgrid_dir = filepath
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(filepath)
    path = os.path.join(fishgrid_dir, 'timestamps.dat')
    if not os.path.isfile(path):
        # no timestamps file - no markers:
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
    # get number of channels from the configuration of the first grid:
    md = metadata_fishgrid(path.replace('timestamps.dat', 'fishgrid.cfg'))
    grids = fishgrid_grids(md)
    nchannels = np.prod(grids[0])
    # read timestamps (blank lines separate marker blocks):
    locs = []
    labels = []
    marker = {}
    with open(path, 'r') as sf:
        for line in sf:
            if len(line.strip()) == 0:
                add_marker()
                marker = {}
            else:
                words = line.split(':')
                if len(words) > 1:
                    v = words[1].strip()
                    v = v.strip('"')
                    marker[words[0].strip().lower()] = v
    # flush a final block that is not followed by a blank line:
    if len(marker) > 0:
        add_marker()
    if len(locs) > 2:
        # the first and last timestamps are discarded - presumably they
        # mark recording start and stop rather than events (TODO confirm):
        return np.array(locs[1:-1]), np.array(labels[1:-1])
    else:
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
def check_container(filepath):
    """Check if file is a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_container: bool
        `True`, if `filepath` is a supported container format.
    """
    # compare the file extension case-insensitively:
    _, ext = os.path.splitext(filepath)
    return ext.lower() in {'.pkl', '.npz', '.mat'}
def extract_container_data(data_dict, datakey=None,
                           samplekey=['rate', 'Fs', 'fs'],
                           timekey=['time'], amplkey=['amax'], unitkey='unit',
                           amax=1.0, unit='a.u.'):
    """Extract data from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is an 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str or list of str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
    amax: None or float
        If specified and no amplitude range has been found in `data_dict`,
        then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in `data_dict`,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range in `unit`.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    def as_keylist(key):
        # accept a single key as well as a sequence of keys:
        return key if isinstance(key, (list, tuple, np.ndarray)) else (key,)

    samplekey = as_keylist(samplekey)
    timekey = as_keylist(timekey)
    amplkey = as_keylist(amplkey)
    # sampling rate, either given directly or computed from sampling times:
    rate = 0.0
    for key in samplekey:
        if key in data_dict:
            rate = float(data_dict[key])
            break
    if rate == 0.0:
        for key in timekey:
            if key in data_dict:
                times = data_dict[key]
                rate = 1.0/(times[1] - times[0])
                break
    if rate == 0.0:
        raise ValueError(f"invalid keys {', '.join(samplekey)} and {', '.join(timekey)} for requesting sampling rate or sampling times")
    # amplitude range and unit:
    for key in amplkey:
        if key in data_dict:
            amax = float(data_dict[key])
            break
    if unitkey in data_dict:
        unit = data_dict[unitkey]
    # get data array:
    raw_data = np.array([])
    if datakey:
        # try the requested data keys:
        datakey = as_keylist(datakey)
        for key in datakey:
            if key in data_dict:
                raw_data = data_dict[key]
                break
        if len(raw_data) == 0:
            raise ValueError(f"invalid key(s) {', '.join(datakey)} for requesting data")
    else:
        # no key given: take the largest 1- or 2-dimensional array:
        for key in data_dict:
            item = data_dict[key]
            if hasattr(item, 'shape') and 1 <= len(item.shape) <= 2 and \
               np.max(item.shape) > np.max(raw_data.shape):
                raw_data = item
        if len(raw_data) == 0:
            raise ValueError('no data found')
    # ensure a 2D time x channel layout:
    if len(raw_data.shape) == 1:
        raw_data = raw_data.reshape(-1, 1)
    if np.argmax(raw_data.shape) > 0:
        # time is assumed to be the longer dimension:
        raw_data = raw_data.T
    # rescale integer data to the amplitude range:
    if raw_data.dtype == np.dtype('int16'):
        data = raw_data.astype('float32')
        data *= amax/2**15
    elif raw_data.dtype == np.dtype('int32'):
        data = raw_data.astype(float)
        data *= amax/2**31
    elif raw_data.dtype == np.dtype('int64'):
        data = raw_data.astype(float)
        data *= amax/2**63
    else:
        data = raw_data
    return data, rate, unit, amax
def load_container(filepath, datakey=None,
                   samplekey=['rate', 'Fs', 'fs'],
                   timekey=['time'], amplkey=['amax'], unitkey='unit',
                   amax=1.0, unit='a.u.'):
    """Load data from a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    filepath: str
        Path of the file to load.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is an 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
        If `unitkey` is not a valid key, then return `unitkey` as the `unit`.
    amax: None or float
        If specified and no amplitude range has been found in the data
        container, then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in the data container,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    # read the container into a dictionary, depending on file type:
    ext = os.path.splitext(filepath)[1]
    data_dict = {}
    if ext == '.pkl':
        import pickle
        with open(filepath, 'rb') as sf:
            data_dict = pickle.load(sf)
    elif ext == '.npz':
        data_dict = np.load(filepath)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(filepath, squeeze_me=True)
    return extract_container_data(data_dict, datakey, samplekey,
                                  timekey, amplkey, unitkey, amax, unit)
def extract_container_metadata(data_dict, metadatakey=['metadata', 'info']):
    """ Extract metadata from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    if not isinstance(metadatakey, (list, tuple, np.ndarray)):
        metadatakey = (metadatakey,)
    # a single variable holding the whole metadata dictionary:
    for mkey in metadatakey:
        if mkey in data_dict:
            return data_dict[mkey]
    # otherwise collect flattened keys like 'metadata__section__key':
    metadata = {}
    for mkey in metadatakey:
        prefix = mkey + '__'
        for dkey in data_dict:
            if dkey[:len(prefix)] == prefix:
                v = data_dict[dkey]
                if hasattr(v, 'size') and v.ndim == 0:
                    # unwrap 0-d arrays to plain scalars:
                    v = v.item()
                metadata[dkey[len(prefix):]] = v
    if len(metadata) > 0:
        return unflatten_metadata(metadata, sep='__')
    return metadata
def metadata_container(filepath, metadatakey=['metadata', 'info']):
    """ Read meta-data of a container file.

    Parameters
    ----------
    filepath: str
        A container file.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    # read the container into a dictionary, depending on file type:
    ext = os.path.splitext(filepath)[1]
    data_dict = {}
    if ext == '.pkl':
        import pickle
        with open(filepath, 'rb') as sf:
            data_dict = pickle.load(sf)
    elif ext == '.npz':
        data_dict = np.load(filepath)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(filepath, squeeze_me=True)
    return extract_container_metadata(data_dict, metadatakey)
def extract_container_markers(data_dict, poskey=['positions'],
                              spanskey=['spans'], labelskey=['labels'],
                              descrkey=['descriptions']):
    """ Extract markers from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    _missing = object()

    def as_keylist(key):
        # accept a single key as well as a sequence of keys:
        return key if isinstance(key, (list, tuple, np.ndarray)) else (key,)

    def first_match(keys):
        # return the value of the first key present in data_dict:
        for key in keys:
            if key in data_dict:
                return data_dict[key]
        return _missing

    positions = first_match(as_keylist(poskey))
    locs = np.zeros((0, 2), dtype=int)
    if positions is not _missing:
        locs = np.zeros((len(positions), 2), dtype=int)
        locs[:,0] = positions
    spans = first_match(as_keylist(spanskey))
    if spans is not _missing:
        locs[:,1] = spans
    marker_labels = first_match(as_keylist(labelskey))
    labels = np.zeros((0, 2), dtype=object)
    if marker_labels is not _missing:
        labels = np.zeros((len(marker_labels), 2), dtype=object)
        labels[:,0] = marker_labels
    descriptions = first_match(as_keylist(descrkey))
    if descriptions is not _missing:
        labels[:,1] = descriptions
    return locs, labels
def markers_container(filepath, poskey=['positions'],
                      spanskey=['spans'], labelskey=['labels'],
                      descrkey=['descriptions']):
    """ Read markers of a container file.

    Parameters
    ----------
    filepath: str
        A container file.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    # read the container into a dictionary, depending on file type:
    ext = os.path.splitext(filepath)[1]
    data_dict = {}
    if ext == '.pkl':
        import pickle
        with open(filepath, 'rb') as sf:
            data_dict = pickle.load(sf)
    elif ext == '.npz':
        data_dict = np.load(filepath)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(filepath, squeeze_me=True)
    return extract_container_markers(data_dict, poskey, spanskey,
                                     labelskey, descrkey)
def check_raw(filepath):
    """Check if file is a raw file.

    The following extensions are interpreted as raw files:

    - raw files (*.raw)
    - LabView scandata (*.scandat)

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_raw: bool
        `True`, if `filepath` is a raw format.
    """
    # NOTE(review): '.mat' is also accepted here although it is primarily
    # handled as a container format — presumably a fallback for matlab
    # files that do not parse as containers; confirm intent.
    extension = os.path.splitext(filepath)[1].lower()
    return extension in ('.raw', '.scandat', '.mat')
def load_raw(filepath, rate=44000, channels=1, dtype=np.float32,
             amax=1.0, unit='a.u.'):
    """Load data from a raw file.

    Raw files just contain the data and absolutely no metadata, not
    even the sampling rate, number of channels, etc.
    Supported file formats are:

    - raw files (*.raw)
    - LabView scandata (*.scandat)

    Parameters
    ----------
    filepath: str
        Path of the file to load.
    rate: float
        Sampling rate of the data in Hertz.
    channels: int
        Number of channels multiplexed in the data.
    dtype: str or numpy.dtype
        The data type stored in the file.
    amax: float
        The amplitude range of the data.
    unit: str
        The unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.
    """
    raw_data = np.fromfile(filepath, dtype=dtype).reshape(-1, channels)
    # rescale integer samples to floats within -amax to +amax;
    # maps signed integer dtype to (float type, full-scale divisor):
    int_scales = {np.dtype('int16'): ('float32', 2**15),
                  np.dtype('int32'): (float, 2**31),
                  np.dtype('int64'): (float, 2**63)}
    scale = int_scales.get(np.dtype(dtype))
    if scale is None:
        # not an integer type we rescale — return samples as stored:
        data = raw_data
    else:
        float_type, full_scale = scale
        data = raw_data.astype(float_type)
        data *= amax/full_scale
    return data, rate, unit, amax
def load_audioio(filepath, verbose=0, gainkey=default_gain_keys, sep='.',
                 amax=1.0, unit='a.u.'):
    """Load data from an audio file.

    See the
    [`load_audio()`](https://bendalab.github.io/audioio/api/audioloader.html#audioio.audioloader.load_audio)
    function of the [`audioio`](https://github.com/bendalab/audioio)
    package for more infos.

    Parameters
    ----------
    filepath: str
        Path of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    gainkey: str or list of str
        Key in the file's metadata that holds some gain information.
        If found, the data will be multiplied with the gain,
        and if available, the corresponding unit is returned.
        See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
    sep: str
        String that separates section names in `gainkey`.
    amax: float
        If specified and no gain has been found in the metadata,
        then use this as the amplitude range.
    unit: str
        If specified and no gain has been found in the metadata,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data if found in the metadata (see `gainkey`),
        otherwise `unit`.
    amax: float
        Maximum amplitude of data range.
    """
    # retrieve gain factor and unit from the file's metadata:
    meta = metadata_audioio(filepath)
    amax, unit = get_gain(meta, gainkey, sep, amax, unit)
    # load the audio data:
    data, rate = load_audio(filepath, verbose)
    # scale the data to the amplitude range:
    if amax != 1.0:
        data *= amax
    return data, rate, unit, amax
# Table of supported data formats. Each entry is a 5-tuple of
# (format name, check function, load function, metadata function,
#  markers function); `None` marks a function that is not available
# for that format. The audioio entry has no check function and thereby
# acts as the fallback for all files not claimed by an earlier entry.
data_loader_funcs = (
    ('relacs', check_relacs, load_relacs, metadata_relacs, None),
    ('fishgrid', check_fishgrid, load_fishgrid, metadata_fishgrid, markers_fishgrid),
    ('container', check_container, load_container, metadata_container, markers_container),
    ('raw', check_raw, load_raw, None, None),
    ('audioio', None, load_audioio, metadata_audioio, markers_audioio),
    )
"""List of implemented load functions.

Each element of the list is a tuple with the data format's name, its
check and its load function.
"""
def load_data(filepath, verbose=0, **kwargs):
    """Load time-series data from a file.

    Parameters
    ----------
    filepath: str
        Path and name of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.
        For example:
        - `amax`: the amplitude range of the data.
        - `unit`: the unit of the data.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        `filepath` is empty string.
    """
    if len(filepath) == 0:
        raise ValueError('input argument filepath is empty string.')
    # try each supported format; the audioio entry has no check function
    # and therefore matches everything that was not claimed earlier:
    for name, check_file, load_file, _, _ in data_loader_funcs:
        if check_file is not None and not check_file(filepath):
            continue
        data, rate, unit, amax = load_file(filepath, **kwargs)
        if verbose > 0:
            print(f'loaded {name} data from file "{filepath}"')
            if verbose > 1:
                print(f' sampling rate: {rate:g} Hz')
                print(f' channels : {data.shape[1]}')
                print(f' frames : {len(data)}')
                print(f' range : {amax:g}{unit}')
        return data, rate, unit, amax
    # no loader matched — return empty data:
    return np.zeros((0, 1)), 0.0, '', 1.0
def metadata(filepath, **kwargs):
    """ Read meta-data from a data file.

    Parameters
    ----------
    filepath: str
        The full path and name of the file to load. For some file
        formats several files can be provided in a list.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.

    Returns
    -------
    meta_data: nested dict
        Meta data contained in the file. Keys of the nested
        dictionaries are always strings. If the corresponding
        values are dictionaries, then the key is the section name
        of the metadata contained in the dictionary. All other
        types of values are values for the respective key. In
        particular they are strings, or list of strings. But other
        simple types like ints or floats are also allowed.

    Raises
    ------
    ValueError:
        `filepath` is empty string.
    """
    if len(filepath) == 0:
        raise ValueError('input argument filepath is empty string.')
    # take the first matching format that provides a metadata function;
    # formats without one are skipped so that later ones can serve:
    for _, check_file, _, metadata_file, _ in data_loader_funcs:
        if check_file is not None and not check_file(filepath):
            continue
        if metadata_file is None:
            continue
        return metadata_file(filepath, **kwargs)
    return {}
def markers(filepath):
    """ Read markers of a data file.

    Parameters
    ----------
    filepath: str or file handle
        The data file.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).

    Raises
    ------
    ValueError:
        `filepath` is empty string.
    """
    if len(filepath) == 0:
        raise ValueError('input argument filepath is empty string.')
    # take the first matching format that provides a markers function;
    # formats without one are skipped so that later ones can serve:
    for _, check_file, _, _, markers_file in data_loader_funcs:
        if check_file is not None and not check_file(filepath):
            continue
        if markers_file is None:
            continue
        return markers_file(filepath)
    return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
1397class DataLoader(AudioLoader):
1398 """Buffered reading of time-series data for random access of the data in the file.
1400 This allows for reading very large data files that do not fit into
1401 memory. A `DataLoader` instance can be used like a huge
1402 read-only numpy array, i.e.
1403 ```
1404 data = DataLoader('path/to/data/file.dat')
1405 x = data[10000:20000,0]
1406 ```
1407 The first index specifies the frame, the second one the channel.
1409 `DataLoader` first determines the format of the data file and then
1410 opens the file (first line). It then reads data from the file as
1411 necessary for the requested data (second line).
1413 Supported file formats are
1415 - audio files via `audioio` package
1416 - python pickle files
1417 - numpy .npz files
1418 - matlab .mat files
1419 - relacs trace*.raw files (www.relacs.net)
1420 - fishgrid traces-*.raw files
1422 Reading sequentially through the file is always possible. If
1423 previous data are requested, then the file is read from the
1424 beginning. This might slow down access to previous data
1425 considerably. Use the `backsize` argument to the open functions to
1426 make sure some data are loaded before the requested frame. Then a
1427 subsequent access to the data within `backsize` seconds before that
1428 frame can still be handled without the need to reread the file
1429 from the beginning.
1431 Usage:
1432 ------
1433 ```
1434 import thunderlab.dataloader as dl
1435 with dl.DataLoader(filepath, 60.0, 10.0) as data:
1436 # do something with the content of the file:
1437 x = data[0:10000,0]
1438 y = data[10000:20000,0]
1439 z = x + y
1440 ```
1442 Normal open and close:
1443 ```
1444 data = dl.DataLoader(filepath, 60.0)
1445 x = data[:,:] # read the whole file
1446 data.close()
1447 ```
1448 that is the same as:
1449 ```
1450 data = dl.DataLoader()
1451 data.open(filepath, 60.0)
1452 ```
1454 Parameters
1455 ----------
1456 filepath: str
1457 Name of the file.
1458 buffersize: float
1459 Size of internal buffer in seconds.
1460 backsize: float
1461 Part of the buffer to be loaded before the requested start index in seconds.
1462 verbose: int
1463 If larger than zero show detailed error/warning messages.
1464 meta_kwargs: dict
1465 Keyword arguments that are passed on to the _load_metadata() function.
1467 Attributes
1468 ----------
1469 rate: float
1470 The sampling rate of the data in Hertz.
1471 channels: int
1472 The number of channels that are read in.
1473 frames: int
1474 The number of frames in the file.
1475 format: str or None
1476 Format of the audio file.
1477 encoding: str or None
1478 Encoding/subtype of the audio file.
1479 shape: tuple
1480 Number of frames and channels of the data.
1481 ndim: int
1482 Number of dimensions: always 2 (frames and channels).
1483 unit: str
1484 Unit of the data.
1485 ampl_min: float
1486 Minimum amplitude the file format supports.
1487 ampl_max: float
1488 Maximum amplitude the file format supports.
1490 Methods
1491 -------
1493 - `len()`: the number of frames
1494 - `open()`: open a data file.
1495 - `open_*()`: open a data file of a specific format.
1496 - `close()`: close the file.
1497 - `metadata()`: metadata of the file.
1498 - `markers()`: markers of the file.
1499 - `set_unwrap()`: Set parameters for unwrapping clipped data.
1501 """
    def __init__(self, filepath=None, buffersize=10.0, backsize=0.0,
                 verbose=0, **meta_kwargs):
        """Initialize the DataLoader and optionally open a data file."""
        # initialize the AudioLoader base class without opening a file:
        super().__init__(None, buffersize, backsize,
                         verbose, **meta_kwargs)
        # if a file path is provided, open it right away:
        if filepath is not None:
            self.open(filepath, buffersize, backsize, verbose, **meta_kwargs)
1510 def __getitem__(self, key):
1511 return super(DataLoader, self).__getitem__(key)
1513 def __next__(self):
1514 return super(DataLoader, self).__next__()
1517 # relacs interface:
    def open_relacs(self, filepath, buffersize=10.0, backsize=0.0,
                    verbose=0, amax=1.0):
        """Open relacs data files (www.relacs.net) for reading.

        Parameters
        ----------
        filepath: str
            Path to a relacs data directory or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        amax: float
            The amplitude range of the data.

        Raises
        ------
        FileNotFoundError:
            Invalid or non existing relacs files.
        ValueError:
            .gz files not supported.
        """
        self.verbose = verbose
        # open trace files:
        self.trace_filepaths = relacs_trace_files(filepath)
        if len(self.trace_filepaths) == 0:
            raise FileNotFoundError(f'no relacs files found')
        self.sf = []
        self.frames = None
        self.rate = None
        self.unit = ''
        self.filepath = os.path.dirname(self.trace_filepaths[0])
        self.file_paths = [self.filepath]
        self.file_indices = [0]
        # each trace file holds one channel as consecutive 4-byte floats:
        for path in self.trace_filepaths:
            if path[-3:] == '.gz':
                raise ValueError('.gz files not supported')
            sf = open(path, 'rb')
            self.sf.append(sf)
            if verbose > 0:
                print(f'open_relacs(filepath) with filepath={path}')
            # file size:
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                # tolerate frame counts differing by a few samples between
                # traces and use the smaller count; otherwise fail:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
            # retrieve sampling rate and unit:
            rate, us = relacs_samplerate_unit(path)
            # all traces must agree on sampling rate and unit:
            if self.rate is None:
                self.rate = rate
            elif rate != self.rate:
                raise ValueError('sampling rates of traces differ')
            if len(self.unit) == 0:
                self.unit = us
            elif us != self.unit:
                raise ValueError('unit of traces differ')
        self.channels = len(self.sf)
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'RELACS'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        # install relacs-specific close, buffer-load, and metadata hooks:
        self.close = self._close_relacs
        self.load_audio_buffer = self._load_buffer_relacs
        self.ampl_min = -amax
        self.ampl_max = +amax
        self._load_metadata = self._metadata_relacs
        # TODO: load markers:
        self._locs = np.zeros((0, 2), dtype=int)
        self._labels = np.zeros((0, 2), dtype=object)
        self._load_markers = None
        return self
1605 def _close_relacs(self):
1606 """Close the relacs data files.
1607 """
1608 for file in self.sf:
1609 file.close()
1610 self.sf = []
1612 def _load_buffer_relacs(self, r_offset, r_size, buffer):
1613 """Load new data from relacs data file.
1615 Parameters
1616 ----------
1617 r_offset: int
1618 First frame to be read from file.
1619 r_size: int
1620 Number of frames to be read from file.
1621 buffer: ndarray
1622 Buffer where to store the loaded data.
1623 """
1624 if len(self.sf) == 0 and len(self.trace_filepaths) > 0:
1625 for path in self.trace_filepaths:
1626 self.sf.append(open(path, 'rb'))
1627 for i, file in enumerate(self.sf):
1628 file.seek(r_offset*4)
1629 data = file.read(r_size*4)
1630 buffer[:, i] = np.frombuffer(data, dtype=np.float32)
1633 def _metadata_relacs(self, store_empty=False, first_only=False):
1634 """ Load meta-data of a relacs data set.
1635 """
1636 info_path = os.path.join(self.filepath, 'info.dat')
1637 if not os.path.exists(info_path):
1638 return {}
1639 return relacs_header(info_path, store_empty, first_only)
1642 # fishgrid interface:
    def open_fishgrid(self, filepath, buffersize=10.0, backsize=0.0,
                      verbose=0):
        """Open fishgrid data files (https://github.com/bendalab/fishgrid) for reading.

        Parameters
        ----------
        filepath: str
            Path to a fishgrid data directory, or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.

        Raises
        ------
        FileNotFoundError:
            Invalid or non existing fishgrid files.
        """
        self.verbose = verbose
        self.trace_filepaths = fishgrid_trace_files(filepath)
        if len(self.trace_filepaths) == 0:
            raise FileNotFoundError(f'no fishgrid files found')
        self.filepath = os.path.dirname(self.trace_filepaths[0])
        self.file_paths = [self.filepath]
        self.file_indices = [0]
        # metadata and markers are loaded lazily via these hooks:
        self._load_metadata = metadata_fishgrid
        self._load_markers = markers_fishgrid
        # open grid files:
        grids = fishgrid_grids(self.metadata())
        grid_sizes = [r*c for r,c in grids]   # channels per grid = rows*columns
        self.channels = 0
        for g, path in enumerate(self.trace_filepaths):
            self.channels += grid_sizes[g]
        self.sf = []
        self.grid_channels = []
        self.grid_offs = []
        offs = 0
        self.frames = None
        # sampling rate and amplitude range come from the metadata:
        self.rate = get_number(self.metadata(), 'Hz', 'AISampleRate')
        v, self.unit = get_number_unit(self.metadata(), 'AIMaxVolt')
        if v is not None:
            self.ampl_min = -v
            self.ampl_max = +v
        # each trace file holds the multiplexed channels of one grid,
        # stored as 4-byte floats:
        for g, path in enumerate(self.trace_filepaths):
            sf = open(path, 'rb')
            self.sf.append(sf)
            if verbose > 0:
                print(f'open_fishgrid(filepath) with filepath={path}')
            # grid channels:
            self.grid_channels.append(grid_sizes[g])
            self.grid_offs.append(offs)
            offs += grid_sizes[g]
            # file size:
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4//grid_sizes[g]
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                # tolerate frame counts differing by a few samples between
                # grids and use the smaller count; otherwise fail:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'FISHGRID'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        # install fishgrid-specific close and buffer-load hooks:
        self.close = self._close_fishgrid
        self.load_audio_buffer = self._load_buffer_fishgrid
        return self
1725 def _close_fishgrid(self):
1726 """Close the fishgrid data files.
1727 """
1728 for file in self.sf:
1729 file.close()
1730 self.sf = []
1732 def _load_buffer_fishgrid(self, r_offset, r_size, buffer):
1733 """Load new data from relacs data file.
1735 Parameters
1736 ----------
1737 r_offset: int
1738 First frame to be read from file.
1739 r_size: int
1740 Number of frames to be read from file.
1741 buffer: ndarray
1742 Buffer where to store the loaded data.
1743 """
1744 if len(self.sf) == 0 and len(self.trace_filepaths) > 0:
1745 for path in self.trace_filepaths:
1746 self.sf.append(open(path, 'rb'))
1747 for file, gchannels, goffset in zip(self.sf, self.grid_channels, self.grid_offs):
1748 file.seek(r_offset*4*gchannels)
1749 data = file.read(r_size*4*gchannels)
1750 buffer[:, goffset:goffset+gchannels] = np.frombuffer(data, dtype=np.float32).reshape((-1, gchannels))
1753 # container interface:
    def open_container(self, filepath, buffersize=10.0,
                       backsize=0.0, verbose=0, datakey=None,
                       samplekey=['rate', 'Fs', 'fs'],
                       timekey=['time'], amplkey=['amax'], unitkey='unit',
                       metadatakey=['metadata', 'info'],
                       poskey=['positions'],
                       spanskey=['spans'], labelskey=['labels'],
                       descrkey=['descriptions'],
                       amax=1.0, unit='a.u.'):
        """Open generic container file.

        Supported file formats are:

        - python pickle files (.pkl)
        - numpy files (.npz)
        - matlab files (.mat)

        Parameters
        ----------
        filepath: str
            Path to a container file.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        datakey: None, str, or list of str
            Name of the variable holding the data. If `None` take the
            variable that is an 2D array and has the largest number of
            elements.
        samplekey: str or list of str
            Name of the variable holding the sampling rate.
        timekey: str or list of str
            Name of the variable holding sampling times.
            If no sampling rate is available, the sampling rate is retrieved
            from the sampling times.
        amplkey: str or list of str
            Name of the variable holding the amplitude range of the data.
        unitkey: str
            Name of the variable holding the unit of the data.
        metadatakey: str or list of str
            Name of the variable holding the metadata.
        poskey: str or list of str
            Name of the variable holding positions of markers.
        spanskey: str or list of str
            Name of the variable holding spans of markers.
        labelskey: str or list of str
            Name of the variable holding labels of markers.
        descrkey: str or list of str
            Name of the variable holding descriptions of markers.
        amax: None or float
            If specified and no amplitude range has been found in the data
            container, then this is the amplitude range of the data.
        unit: None or str
            If specified and no unit has been found in the data container,
            then return this as the unit of the data.

        Raises
        ------
        ValueError:
            Invalid key requested.
        """
        self.verbose = verbose
        # read the whole container into a dictionary, depending on extension:
        data_dict = {}
        ext = os.path.splitext(filepath)[1]
        if ext == '.pkl':
            import pickle
            with open(filepath, 'rb') as f:
                data_dict = pickle.load(f)
            self.format = 'PKL'
        elif ext == '.npz':
            data_dict = np.load(filepath)
            self.format = 'NPZ'
        elif ext == '.mat':
            from scipy.io import loadmat
            data_dict = loadmat(filepath, squeeze_me=True)
            self.format = 'MAT'
        # the whole data array is kept in memory as the buffer:
        self.buffer, self.rate, self.unit, amax = \
            extract_container_data(data_dict, datakey, samplekey,
                                   timekey, amplkey, unitkey, amax, unit)
        self.filepath = filepath
        self.file_paths = [self.filepath]
        self.file_indices = [0]
        self.channels = self.buffer.shape[1]
        self.frames = self.buffer.shape[0]
        self.shape = self.buffer.shape
        self.ndim = self.buffer.ndim
        self.size = self.buffer.size
        self.encoding = self.numpy_encodings[self.buffer.dtype]
        self.ampl_min = -amax
        self.ampl_max = +amax
        self.offset = 0
        self.buffer_changed = np.zeros(self.channels, dtype=bool)
        # buffer covers the complete file:
        self.bufferframes = self.frames
        self.backsize = 0
        self.close = self._close_container
        self.load_audio_buffer = self._load_buffer_container
        # metadata and markers are extracted right away, no lazy loading:
        self._metadata = extract_container_metadata(data_dict, metadatakey)
        self._load_metadata = None
        self._locs, self._labels = extract_container_markers(data_dict,
                                                             poskey,
                                                             spanskey,
                                                             labelskey,
                                                             descrkey)
        self._load_markers = None
1861 def _close_container(self):
1862 """Close container. """
1863 pass
1865 def _load_buffer_container(self, r_offset, r_size, buffer):
1866 """Load new data from container."""
1867 buffer[:, :] = self.buffer[r_offset:r_offset + r_size, :]
1870 # raw data interface:
1871 def open_raw(self, filepath, buffersize=10.0, backsize=0.0,
1872 verbose=0, rate=44000, channels=1, dtype=np.float32,
1873 amax=1.0, unit='a.u.'):
1874 """Load data from a raw file.
1876 Raw files just contain the data and absolutely no metadata, not
1877 even the smapling rate, number of channels, etc.
1878 Supported file formats are:
1880 - raw files (*.raw)
1881 - LabView scandata (*.scandat)
1883 Parameters
1884 ----------
1885 filepath: str
1886 Path of the file to load.
1887 buffersize: float
1888 Size of internal buffer in seconds.
1889 backsize: float
1890 Part of the buffer to be loaded before the requested start index in seconds.
1891 verbose: int
1892 If > 0 show detailed error/warning messages.
1893 rate: float
1894 Sampling rate of the data in Hertz.
1895 channels: int
1896 Number of channels multiplexed in the data.
1897 dtype: str or numpy.dtype
1898 The data type stored in the file.
1899 amax: float
1900 The amplitude range of the data.
1901 unit: str
1902 The unit of the data.
1903 """
1904 self.verbose = verbose
1905 self.filepath = filepath
1906 self.file_paths = [self.filepath]
1907 self.file_indices = [0]
1908 self.sf = open(self.filepath, 'rb')
1909 if verbose > 0:
1910 print(f'open_raw(filepath) with filepath={filepath}')
1911 self.dtype = np.dtype(dtype)
1912 self.rate = float(rate)
1913 # file size:
1914 self.sf.seek(0, os.SEEK_END)
1915 self.frames = self.sf.tell()//self.dtype.itemsize
1916 self.sf.seek(0)
1917 self.channels = int(channels)
1918 self.shape = (self.frames, self.channels)
1919 self.ndim = len(self.shape)
1920 self.size = self.frames*self.channels
1921 self.format = 'RAW'
1922 self.encoding = self.numpy_encodings.get(self.dtype, 'UNKNOWN')
1923 self.unit = unit
1924 self.ampl_max = float(amax)
1925 self.ampl_min = -self.ampl_max
1926 self.offset = 0
1927 self.bufferframes = int(buffersize*self.rate)
1928 self.backframes = int(backsize*self.rate)
1929 self.init_buffer()
1930 self.close = self._close_raw
1931 self.load_audio_buffer = self._load_buffer_raw
1932 self._metadata = None
1933 self._load_metadata = None
1934 self._locs = None
1935 self._labels = None
1936 self._load_markers = None
1938 def _close_raw(self):
1939 """Close raw file. """
1940 self.sf.close()
1941 self.sf = None
1943 def _load_buffer_raw(self, r_offset, r_size, buffer):
1944 """Load new data from container."""
1945 if self.sf is None:
1946 self.sf = open(self.filepath, 'rb')
1947 self.sf.seek(r_offset*self.dtype.itemsize)
1948 raw_data = self.sf.read(r_size*self.dtype.itemsize)
1949 raw_data = np.frombuffer(raw_data, dtype=self.dtype)
1950 raw_data = raw_data.reshape(-1, self.channels)
1951 # recode:
1952 if self.dtype == np.dtype('int16'):
1953 data = raw_data.astype('float32')
1954 data *= self.ampl_max/2**15
1955 elif self.dtype == np.dtype('int32'):
1956 data = raw_data.astype(float)
1957 data *= self.ampl_max/2**31
1958 elif self.dtype == np.dtype('int64'):
1959 data = raw_data.astype(float)
1960 data *= self.ampl_max/2**63
1961 else:
1962 data = raw_data
1963 buffer[:, :] = data
1966 # audioio interface:
    def open_audioio(self, filepath, buffersize=10.0, backsize=0.0,
                     verbose=0, gainkey=default_gain_keys, sep='.',
                     amax=None, unit='a.u.'):
        """Open an audio file.

        See the [audioio](https://github.com/bendalab/audioio) package
        for details.

        Parameters
        ----------
        filepath: str
            Path to an audio file.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index
            in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        gainkey: str or list of str
            Key in the file's metadata that holds some gain information.
            If found, the data will be multiplied with the gain,
            and if available, the corresponding unit is returned.
            See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
        sep: str
            String that separates section names in `gainkey`.
        amax: None or float
            If specified and no gain has been found in the metadata,
            then use this as the amplitude range.
        unit: None or str
            If specified and no gain has been found in the metadata,
            then this is the unit of the data.
        """
        self.verbose = verbose
        # let the AudioLoader base class open the file:
        super(DataLoader, self).open(filepath, buffersize, backsize, verbose)
        md = self.metadata()
        # look up a gain factor and its unit in the metadata:
        fac, unit = get_gain(md, gainkey, sep, amax, unit)
        if fac is None:
            self.gain_fac = 1.0
        else:
            self.gain_fac = fac
        # wrap the base class' buffer loader so that loaded data
        # get scaled by the gain factor:
        self._load_buffer_audio_org = self.load_audio_buffer
        self.load_audio_buffer = self._load_buffer_audioio
        self.ampl_min *= self.gain_fac
        self.ampl_max *= self.gain_fac
        self.unit = unit
        return self
2016 def _load_buffer_audioio(self, r_offset, r_size, buffer):
2017 """Load and scale new data from an audio file.
2019 Parameters
2020 ----------
2021 r_offset: int
2022 First frame to be read from file.
2023 r_size: int
2024 Number of frames to be read from file.
2025 buffer: ndarray
2026 Buffer where to store the loaded data.
2027 """
2028 self._load_buffer_audio_org(r_offset, r_size, buffer)
2029 buffer *= self.gain_fac
2032 # open multiple files as one:
2033 def open_multiple(self, filepaths, buffersize=10.0, backsize=0.0,
2034 verbose=0, rate=None, channels=None,
2035 unit=None, amax=None, end_indices=None):
2036 """Open multiple files as a single concatenated array.
2038 Parameters
2039 ----------
2040 filepaths: list of str
2041 List of file names of audio files.
2042 buffersize: float
2043 Size of internal buffer in seconds.
2044 backsize: float
2045 Part of the buffer to be loaded before the requested start index in seconds.
2046 verbose: int
2047 If larger than zero show detailed error/warning messages.
2048 rate: float
2049 If provided, do a minimal initialization (no checking)
2050 using the provided sampling rate (in Hertz), channels,
2051 unit, maximum amplitude, and end_indices.
2052 channels: int
2053 If provided, do a minimal initialization (no checking)
2054 using the provided rate, number of channels,
2055 unit, maximum amplitude, and end_indices.
2056 unit: str
2057 If provided, do a minimal initialization (no checking)
2058 using the provided rate, number of channels,
2059 unit, maximum amplitude, and end_indices.
2060 amax: float
2061 If provided, do a minimal initialization (no checking)
2062 using the provided rate, number of channels,
2063 unit, maximum amplitude amax, and end_indices.
2064 end_indices: sequence of int
2065 If provided, do a minimal initialization (no checking)
2066 using the provided rate, channels,
2067 unit, maximum amplitude, and end_indices.
2069 Raises
2070 ------
2071 TypeError
2072 `filepaths` must be a sequence.
2073 ValueError
2074 Empty `filepaths`.
2075 FileNotFoundError
2076 `filepaths` does not contain a single valid file.
2078 """
2079 if not isinstance(filepaths, (list, tuple, np.ndarray)):
2080 raise TypeError('input argument filepaths is not a sequence!')
2081 if len(filepaths) == 0:
2082 raise ValueError('input argument filepaths is empy sequence!')
2083 self.buffersize = buffersize
2084 self.backsize = backsize
2085 self.filepath = None
2086 self.file_paths = []
2087 self.open_files = []
2088 self.open_loaders = []
2089 self.data_files = []
2090 self.collect_counter = 0
2091 self.frames = 0
2092 self.start_indices = []
2093 self.end_indices = []
2094 self.start_time = None
2095 start_time = None
2096 self._metadata = {}
2097 self._locs = np.zeros((0, 2), dtype=int)
2098 self._labels = np.zeros((0, 2), dtype=object)
2099 if end_indices is not None:
2100 self.filepath = filepaths[0]
2101 self.file_paths = filepaths
2102 self.data_files = [None] * len(filepaths)
2103 self.frames = end_indices[-1]
2104 self.start_indices = [0] + list(end_indices[:-1])
2105 self.end_indices = end_indices
2106 self.format = None
2107 self.encoding = None
2108 self.rate = rate
2109 self.channels = channels
2110 self.unit = unit
2111 self.ampl_max = amax
2112 self.ampl_min = -amax
2113 else:
2114 for filepath in filepaths:
2115 try:
2116 a = DataLoader(filepath, buffersize, backsize, verbose)
2117 except Exception as e:
2118 if verbose > 0:
2119 print(e)
2120 continue
2121 # collect metadata:
2122 md = a.metadata()
2123 fmd = flatten_metadata(md, True)
2124 add_metadata(self._metadata, fmd)
2125 if self.filepath is None:
2126 # first file:
2127 self.filepath = a.filepath
2128 self.format = a.format
2129 self.encoding = a.encoding
2130 self.rate = a.rate
2131 self.channels = a.channels
2132 self.unit = a.unit
2133 self.ampl_max = a.ampl_max
2134 self.ampl_min = a.ampl_min
2135 self.start_time = get_datetime(md)
2136 start_time = self.start_time
2137 else:
2138 # check channels, rate, and amplitudes:
2139 error_str = None
2140 if a.channels != self.channels:
2141 error_str = f'number of channels differs: ' \
2142 f'{a.channels} in {a.filepath} versus ' \
2143 f'{self.channels} in {self.filepath}'
2144 if a.rate != self.rate:
2145 error_str = f'sampling rates differ: ' \
2146 f'{a.rate} in {a.filepath} versus ' \
2147 f'{self.rate} in {self.filepath}'
2148 if a.ampl_min != self.ampl_min:
2149 error_str = f'minimum amplitudes differ: ' \
2150 f'{a.ampl_min} in {a.filepath} versus ' \
2151 f'{self.ampl_min} in {self.filepath}'
2152 if a.ampl_max != self.ampl_max:
2153 error_Str = f'maximum amplitudes differ: ' \
2154 f'{a.ampl_max} in {a.filepath} versus ' \
2155 f'{self.ampl_max} in {self.filepath}'
2156 # check start time of recording:
2157 stime = get_datetime(md)
2158 if start_time is None or stime is None or \
2159 abs(start_time - stime) > timedelta(seconds=1):
2160 error_str = f'start time does not indicate continuous recording: ' \
2161 f'expected {start_time} instead of ' \
2162 f'{stime} in {a.filepath}'
2163 if error_str is not None:
2164 if verbose > 0:
2165 print(error_str)
2166 a.close()
2167 del a
2168 break
2169 # markers:
2170 locs, labels = a.markers()
2171 locs[:,0] += self.frames
2172 self._locs = np.vstack((self._locs, locs))
2173 self._labels = np.vstack((self._labels, labels))
2174 # indices:
2175 self.start_indices.append(self.frames)
2176 self.frames += a.frames
2177 self.end_indices.append(self.frames)
2178 if start_time is not None:
2179 start_time += timedelta(seconds=a.frames/a.rate)
2180 # add file to lists:
2181 self.file_paths.append(filepath)
2182 if len(self.open_files) < AudioLoader.max_open_files:
2183 self.open_files.append(a)
2184 else:
2185 a.close()
2186 if len(self.open_loaders) < AudioLoader.max_open_loaders:
2187 self.data_files.append(a)
2188 self.open_loaders.append(a)
2189 else:
2190 a.close()
2191 del a
2192 self.data_files.append(None)
2193 if len(self.data_files) == 0:
2194 raise FileNotFoundError('input argument filepaths does not contain any valid audio file!')
2195 # set startime from first file:
2196 if self.start_time is not None:
2197 set_starttime(self._metadata, self.start_time)
2198 # setup infrastructure:
2199 self.file_indices = self.start_indices
2200 self.start_indices = np.array(self.start_indices)
2201 self.end_indices = np.array(self.end_indices)
2202 self.shape = (self.frames, self.channels)
2203 self.bufferframes = int(buffersize*self.rate)
2204 self.backframes = int(backsize*self.rate)
2205 self.init_buffer()
2206 self.close = self._close_multiple
2207 self.load_audio_buffer = self._load_buffer_multiple
2208 self._load_metadata = None
2209 self._load_markers = None
2210 return self
    def _close_multiple(self):
        """Close all the data files opened by `open_multiple()`.

        Closes every still-open `DataLoader`, clears the per-file
        bookkeeping, and then *deletes* the bookkeeping attributes,
        so that any later buffer access fails loudly with an
        `AttributeError` instead of silently reading stale state.
        """
        self.open_files = []
        self.open_loaders = []
        if hasattr(self, 'data_files'):
            for a in self.data_files:
                if a is not None:
                    a.close()
        self.data_files = []
        self.filepath = None
        self.file_paths = []
        self.file_indices = []
        self.start_indices = []
        self.end_indices = []
        # The attributes below were all re-assigned above, so deleting
        # them here cannot raise even if close() is called repeatedly:
        del self.data_files
        del self.open_files
        del self.open_loaders
        del self.start_indices
        del self.end_indices
    def _load_buffer_multiple(self, r_offset, r_size, buffer):
        """Load new data from the underlying files.

        Walks over the concatenated files covering the requested frame
        range and fills `buffer` piecewise.  Files are reopened on
        demand; the number of simultaneously open files and loaders is
        bounded by the `max_open_files` / `max_open_loaders` class
        attributes inherited from audioio's `AudioLoader` (least
        recently used ones are closed first).

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        offs = r_offset
        size = r_size
        boffs = 0
        # index of the file containing the first requested frame:
        ai = np.searchsorted(self.end_indices, offs, side='right')
        while size > 0:
            if self.data_files[ai] is None:
                # this file had been evicted: reopen it
                # (verbosity 0 — errors were already reported on first open):
                a = DataLoader(self.file_paths[ai],
                               self.buffersize, self.backsize, 0)
                self.data_files[ai] = a
                self.open_loaders.append(a)
                self.open_files.append(a)
                # enforce the limit on open files (close oldest first):
                if len(self.open_files) > AudioLoader.max_open_files:
                    a0 = self.open_files.pop(0)
                    a0.close()
                # enforce the limit on live loader objects; the evicted
                # loader's slot in data_files is reset to None so it
                # gets reopened on the next access:
                if len(self.open_loaders) > AudioLoader.max_open_loaders:
                    a0 = self.open_loaders.pop(0)
                    self.data_files[self.data_files.index(a0)] = None
                    a0.close()
                    del a0
                    self.collect_counter += 1
                    if self.collect_counter > AudioLoader.max_open_loaders//2:
                        # reclaim the dropped loaders' buffers every
                        # max_open_loaders//2 evictions:
                        gc.collect()   # takes time!
                        self.collect_counter = 0
            else:
                # mark this loader as most recently used:
                self.open_loaders.pop(self.open_loaders.index(self.data_files[ai]))
                self.open_loaders.append(self.data_files[ai])
            # frame range within file ai (clipped to the file's end):
            ai0 = offs - self.start_indices[ai]
            ai1 = offs + size
            if ai1 > self.end_indices[ai]:
                ai1 = self.end_indices[ai]
            ai1 -= self.start_indices[ai]
            n = ai1 - ai0
            self.data_files[ai].load_audio_buffer(ai0, n,
                                                  buffer[boffs:boffs + n,:])
            # mark the underlying file as most recently used and
            # enforce the open-files limit:
            if self.data_files[ai] in self.open_files:
                self.open_files.pop(self.open_files.index(self.data_files[ai]))
                self.open_files.append(self.data_files[ai])
            if len(self.open_files) > AudioLoader.max_open_files:
                self.open_files[0].close()
                self.open_files.pop(0)
            # advance to the next file:
            boffs += n
            offs += n
            size -= n
            ai += 1
2290 def open(self, filepath, buffersize=10.0, backsize=0.0,
2291 verbose=0, **kwargs):
2292 """Open file with time-series data for reading.
2294 Parameters
2295 ----------
2296 filepath: str or list of str
2297 Name of the file or list of many file names that should be
2298 made accessible as a single array.
2299 buffersize: float
2300 Size of internal buffer in seconds.
2301 backsize: float
2302 Part of the buffer to be loaded before the requested start index
2303 in seconds.
2304 verbose: int
2305 If > 0 show detailed error/warning messages.
2306 **kwargs: dict
2307 Further keyword arguments that are passed on to the
2308 format specific opening functions.
2309 For example:
2310 - `amax`: the amplitude range of the data.
2311 - 'unit': the unit of the data.
2313 Raises
2314 ------
2315 ValueError:
2316 `filepath` is empty string.
2317 """
2318 # list of implemented open functions:
2319 data_open_funcs = (
2320 ('relacs', check_relacs, self.open_relacs, 1),
2321 ('fishgrid', check_fishgrid, self.open_fishgrid, 1),
2322 ('container', check_container, self.open_container, 1),
2323 ('raw', check_raw, self.open_raw, 1),
2324 ('audioio', None, self.open_audioio, 0),
2325 )
2327 self.buffer = np.array([])
2328 self.rate = 0.0
2329 if not filepath:
2330 raise ValueError('input argument filepath is empty string.')
2331 if isinstance(filepath, (list, tuple, np.ndarray)):
2332 self.open_multiple(filepath, buffersize, backsize,
2333 verbose, **kwargs)
2334 if len(self.file_paths) > 1:
2335 return self
2336 filepath = self.file_paths[0]
2337 self.close()
2338 # open data:
2339 for name, check_file, open_file, v in data_open_funcs:
2340 if check_file is None or check_file(filepath):
2341 open_file(filepath, buffersize, backsize, verbose, **kwargs)
2342 if v*verbose > 1:
2343 if self.format is not None:
2344 print(f' format : {self.format}')
2345 if self.encoding is not None:
2346 print(f' encoding : {self.encoding}')
2347 print(f' sampling rate: {self.rate} Hz')
2348 print(f' channels : {self.channels}')
2349 print(f' frames : {self.frames}')
2350 print(f' range : {self.ampl_max:g}{self.unit}')
2351 break
2352 return self
def demo(filepath, plot=False):
    """Demonstrate the usage of `load_data()` and `DataLoader`.

    Parameters
    ----------
    filepath: str
        Name of the data file to load.
    plot: bool
        If True, plot the data loaded via `load_data()` and return;
        otherwise additionally demonstrate chunked access via
        `DataLoader`.
    """
    print("try load_data:")
    data, rate, unit, amax = load_data(filepath, verbose=2)
    if plot:
        fig, ax = plt.subplots()
        time = np.arange(len(data))/rate
        for c in range(data.shape[1]):
            ax.plot(time, data[:,c])
        ax.set_xlabel('Time [s]')
        ax.set_ylabel(f'[{unit}]')
        if amax is not None and np.isfinite(amax):
            ax.set_ylim(-amax, +amax)
        plt.show()
        return
    print('')
    print("try DataLoader:")
    with DataLoader(filepath, 2.0, 1.0, 1) as data:
        print('sampling rate: %g' % data.rate)
        print('frames       : %d %d' % (len(data), data.shape[0]))
        nframes = int(1.0 * data.rate)
        # step through the data in 1-second chunks, forward:
        for i in range(0, len(data), nframes):
            print('forward %d-%d' % (i, i + nframes))
            x = data[i:i + nframes, 0]
            # NOTE(review): this branch looks unreachable — with
            # plot=True the function already returned above; confirm
            # whether the early return is intended:
            if plot:
                fig, ax = plt.subplots()
                ax.plot((i + np.arange(len(x)))/data.rate, x)
                ax.set_xlabel('Time [s]')
                ax.set_ylabel(f'[{data.unit}]')
                plt.show()
        # and backwards:
        for i in reversed(range(0, len(data), nframes)):
            print('backward %d-%d' % (i, i + nframes))
            x = data[i:i + nframes, 0]
            if plot:
                fig, ax = plt.subplots()
                ax.plot((i + np.arange(len(x)))/data.rate, x)
                ax.set_xlabel('Time [s]')
                ax.set_ylabel(f'[{data.unit}]')
                plt.show()
def main(*cargs):
    """Call demo with command line arguments.

    Parameters
    ----------
    cargs: list of str
        Command line arguments as provided by sys.argv[1:]
    """
    import argparse
    ap = argparse.ArgumentParser(
        description='Checking thunderlab.dataloader module.')
    ap.add_argument('-p', dest='plot', action='store_true',
                    help='plot loaded data')
    ap.add_argument('file', nargs=1, default='', type=str,
                    help='name of data file')
    opts = ap.parse_args(cargs)
    demo(opts.file[0], opts.plot)
# run the demo on the file given on the command line when this
# module is executed as a script:
if __name__ == "__main__":
    main(*sys.argv[1:])