1"""Load time-series data from files.
3```
4data, rate, unit, amax = load_data('data/file.wav')
5```
7The function `data_loader()` loads the whole time-series from the file
8as a numpy array of floats. First dimension is frames, second is
9channels. In contrast to the `audioio.load_audio()` function, the
10values of the data array are not restricted between -1 and 1. They can
11assume any value wihin the range `-amax` to `+amax` with the returned
12`unit`.
14```
15data = DataLoader('data/file.wav', 60.0)
16```
17or
18```
19with DataLoader('data/file.wav', 60.0) as data:
20```
21Create an `DataLoader` object that loads chuncks of 60 seconds long data
22on demand. `data` can be used like a read-only numpy array of floats.
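
For example, a chunked analysis loop might look like this (a minimal
sketch; `data/file.wav` and the one-second chunk size are just
placeholders):
```
import thunderlab.dataloader as dl
with dl.DataLoader('data/file.wav', 60.0) as data:
    step = int(data.rate)   # one second per chunk
    for i in range(0, len(data), step):
        chunk = data[i:i + step, 0]  # process channel 0 chunk by chunk
```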

## Supported file formats

- python pickle files
- numpy .npz files
- matlab .mat files
- audio files via the [`audioio`](https://github.com/bendalab/audioio) package
- LabView .scandat files
- relacs trace*.raw files (https://www.relacs.net)
- fishgrid traces-*.raw files (https://github.com/bendalab/fishgrid)


## Metadata

Many file formats allow storing metadata that further describe the
stored time series data. We handle them as a nested dictionary of
key-value pairs. Load them with the `metadata()` function:
```
metadata = metadata('data/file.mat')
```

## Markers

Some file formats also allow storing markers that mark specific
positions in the time series data. Load marker positions and spans (in
the 2-D array `locs`) and label and text strings (in the 2-D array
`labels`) with the `markers()` function:
```
locs, labels = markers('data.wav')
```

## Additional, format specific functions

- `extract_container_metadata()`: extract metadata from dictionary loaded from a container file.
- `relacs_samplerate_unit()`: retrieve sampling rate and unit from a relacs stimuli.dat file.
- `relacs_header()`: read key-value pairs from relacs *.dat file headers.
- `fishgrid_grids()`: retrieve grid sizes from a fishgrid.cfg file.
- `fishgrid_spacings()`: spacing between grid electrodes.

"""

import gc
import os
import sys
import glob
import gzip
import numpy as np
try:
    import matplotlib.pyplot as plt
except ImportError:
    pass
from pathlib import Path
from datetime import timedelta
from audioio import load_audio, AudioLoader, unflatten_metadata
from audioio import get_number_unit, get_number, get_int, get_bool, get_gain
from audioio import default_starttime_keys, default_gain_keys
from audioio import get_datetime, flatten_metadata, add_metadata, set_starttime
from audioio import metadata as metadata_audioio
from audioio import markers as markers_audioio


def relacs_samplerate_unit(filepath, channel=0):
    """Retrieve sampling rate and unit from a relacs stimuli.dat file.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.
    channel: int
        Channel (trace) number, if `filepath` does not specify a
        trace-*.raw file.

    Returns
    -------
    samplerate: float
        Sampling rate in Hertz.
    unit: str
        Unit of the trace, can be empty if not found.

    Raises
    ------
    IOError/FileNotFoundError:
        If the stimuli.dat file does not exist.
    ValueError:
        stimuli.dat file does not contain a sampling rate.
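
    Example (the relacs directory `data/2019-05-06-aa` is a
    hypothetical placeholder path):
    ```
    rate, unit = relacs_samplerate_unit('data/2019-05-06-aa/trace-2.raw')
    dt = 1.0/rate   # sample interval of trace 2 in seconds
    ```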
109 """
110 trace = channel + 1
111 relacs_dir = filepath
112 # check for relacs data directory:
113 if not os.path.isdir(filepath):
114 relacs_dir = os.path.dirname(filepath)
115 bn = os.path.basename(filepath).lower()
116 i = bn.find('.raw')
117 if len(bn) > 5 and bn[0:5] == 'trace' and i > 6:
118 trace = int(bn[6:i])
120 # retreive sampling rate and unit from stimuli.dat file:
121 samplerate = None
122 sampleinterval = None
123 unit = ""
125 lines = []
126 stimuli_file = os.path.join(relacs_dir, 'stimuli.dat')
127 if os.path.isfile(stimuli_file + '.gz'):
128 stimuli_file += '.gz'
129 if stimuli_file[-3:] == '.gz':
130 with gzip.open(stimuli_file, 'r', encoding='latin-1') as sf:
131 for line in sf:
132 line = line.strip()
133 if len(line) == 0 or line[0] != '#':
134 break
135 lines.append(line)
136 else:
137 with open(stimuli_file, 'r', encoding='latin-1') as sf:
138 for line in sf:
139 line = line.strip()
140 if len(line) == 0 or line[0] != '#':
141 break
142 lines.append(line)
144 for line in lines:
145 if "unit%d" % trace in line:
146 unit = line.split(':')[1].strip()
147 if "sampling rate%d" % trace in line:
148 value = line.split(':')[1].strip()
149 samplerate = float(value.replace('Hz',''))
150 elif "sample interval%d" % trace in line:
151 value = line.split(':')[1].strip()
152 sampleinterval = float(value.replace('ms',''))
154 if samplerate is not None:
155 return samplerate, unit
156 if sampleinterval is not None:
157 return 1000/sampleinterval, unit
158 raise ValueError(f'could not retrieve sampling rate from {stimuli_file}')


def relacs_header(filepath, store_empty=False, first_only=False,
                  lower_keys=False, flat=False,
                  add_sections=False):
    """Read key-value pairs from a relacs *.dat file header.

    Parameters
    ----------
    filepath: str
        A relacs *.dat file, can also be a zipped .gz file.
    store_empty: bool
        If `False` do not add metadata with empty values.
    first_only: bool
        If `True` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with section names separated by
        '.' to make them unique.

    Returns
    -------
    data: dict
        Nested dictionary with key-value pairs of the file header.

    Raises
    ------
    IOError/FileNotFoundError:
        If `filepath` cannot be opened.
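
    Example (hypothetical relacs file path):
    ```
    md = relacs_header('data/2019-05-06-aa/info.dat')
    # alternatively, as a flat dictionary with unique keys:
    md = relacs_header('data/2019-05-06-aa/info.dat',
                       flat=True, add_sections=True)
    ```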
193 """
194 # read in header from file:
195 lines = []
196 if os.path.isfile(filepath + '.gz'):
197 filepath += '.gz'
198 if filepath[-3:] == '.gz':
199 with gzip.open(filepath, 'r', encoding='latin-1') as sf:
200 for line in sf:
201 line = line.strip()
202 if len(line) == 0 or line[0] != '#':
203 break
204 lines.append(line)
205 else:
206 with open(filepath, 'r', encoding='latin-1') as sf:
207 for line in sf:
208 line = line.strip()
209 if len(line) == 0 or line[0] != '#':
210 break
211 lines.append(line)
212 # parse:
213 data = {}
214 cdatas = [data]
215 sections = ['']
216 ident_offs = None
217 ident = None
218 for line in lines:
219 words = line.split(':')
220 value = ':'.join(words[1:]).strip() if len(words) > 1 else ''
221 if len(words) >= 1:
222 key = words[0].strip('#')
223 # get section level:
224 level = 0
225 if not flat or len(value) == 0:
226 nident = len(key) - len(key.lstrip())
227 if ident_offs is None:
228 ident_offs = nident
229 elif ident is None:
230 if nident > ident_offs:
231 ident = nident - ident_offs
232 level = 1
233 else:
234 level = (nident - ident_offs)//ident
235 # close sections:
236 if not flat:
237 while len(cdatas) > level + 1:
238 cdatas[-1][sections.pop()] = cdatas.pop()
239 else:
240 while len(sections) > level + 1:
241 sections.pop()
242 # key:
243 key = key.strip().strip('"')
244 if lower_keys:
245 key = key.lower()
246 skey = key
247 if add_sections:
248 key = '.'.join(sections[1:] + [key])
249 if len(value) == 0:
250 # new sub-section:
251 if flat:
252 if store_empty:
253 cdatas[-1][key] = None
254 else:
255 cdatas.append({})
256 sections.append(skey)
257 else:
258 # key-value pair:
259 value = value.strip('"')
260 if len(value) > 0 or value != '-' or store_empty:
261 if len(value) > 0 and value[0] == '[' and value[-1] == ']':
262 value = [v.strip() for v in value.lstrip('[').rstrip(']').split(',')]
263 if first_only:
264 value = value[0]
265 cdatas[-1][key] = value
266 while len(cdatas) > 1:
267 cdatas[-1][sections.pop()] = cdatas.pop()
268 return data


def check_relacs(filepath):
    """Check for valid relacs file.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    is_relacs: boolean
        `True` if `filepath` is a valid relacs directory or is a file therein.
    """
    # relacs data directory:
    relacs_dir = filepath
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
    # check for a valid relacs data directory:
    has_stimuli = False
    has_trace = False
    for fname in ['stimuli.dat', 'stimuli.dat.gz']:
        if os.path.isfile(os.path.join(relacs_dir, fname)):
            has_stimuli = True
    for fname in ['trace-1.raw', 'trace-1.raw.gz']:
        if os.path.isfile(os.path.join(relacs_dir, fname)):
            has_trace = True
    return has_stimuli and has_trace


def relacs_trace_files(filepath):
    """Expand file path for relacs data to appropriate trace*.raw file names.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    trace_filepaths: list of str
        List of relacs trace*.raw files.
    """
    relacs_dir = filepath
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
    trace_filepaths = []
    for k in range(10000):
        fname = os.path.join(relacs_dir, f'trace-{k+1}.raw')
        if os.path.isfile(fname):
            trace_filepaths.append(fname)
        elif os.path.isfile(fname + '.gz'):
            trace_filepaths.append(fname + '.gz')
        else:
            break
    return trace_filepaths


def load_relacs(filepath, amax=1.0):
    """Load traces that have been recorded with relacs (https://github.com/relacs/relacs).

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.
    amax: float
        The amplitude range of the data.

    Returns
    -------
    data: 2-D array
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    FileNotFoundError:
        Invalid or non-existing relacs files.
    ValueError:
        - Invalid name for relacs trace-*.raw file.
        - Sampling rates of the traces differ.
        - Units of the traces differ.
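
    Example (hypothetical relacs data directory):
    ```
    data, rate, unit, amax = load_relacs('data/2019-05-06-aa', amax=10.0)
    print(f'{data.shape[1]} channels sampled at {rate:g} Hz in {unit}')
    ```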
358 """
359 trace_filepaths = relacs_trace_files(filepath)
360 if len(trace_filepaths) == 0:
361 raise FileNotFoundError(f'no relacs files found')
362 # load trace*.raw files:
363 nchannels = len(trace_filepaths)
364 data = None
365 nrows = 0
366 rate = None
367 unit = ''
368 for c, path in enumerate(sorted(trace_filepaths)):
369 if path[-3:] == '.gz':
370 with gzip.open(path, 'rb') as sf:
371 x = np.frombuffer(sf.read(), dtype=np.float32)
372 else:
373 x = np.fromfile(path, np.float32)
374 if data is None:
375 nrows = len(x)
376 data = np.zeros((nrows, nchannels))
377 n = min(len(x), nrows)
378 data[:n,c] = x[:n]
379 # retrieve sampling rate and unit:
380 crate, us = relacs_samplerate_unit(path, c)
381 if rate is None:
382 rate = crate
383 elif crate != rate:
384 raise ValueError('sampling rates of traces differ')
385 if len(unit) == 0:
386 unit = us
387 elif us != unit:
388 raise ValueError('unit of traces differ')
389 return data, rate, unit, amax


def metadata_relacs(filepath, store_empty=False, first_only=False,
                    lower_keys=False, flat=False, add_sections=False):
    """ Read meta-data of a relacs data set.

    Parameters
    ----------
    filepath: str
        A relacs data directory or a file therein.
    store_empty: bool
        If `False` do not add metadata with empty values.
    first_only: bool
        If `True` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with section names separated by
        '.' to make them unique.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    relacs_dir = filepath
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
    info_path = os.path.join(relacs_dir, 'info.dat')
    if not os.path.exists(info_path):
        return {}
    data = relacs_header(info_path, store_empty, first_only,
                         lower_keys, flat, add_sections)
    return data


def fishgrid_spacings(metadata, unit='m'):
    """Spacing between grid electrodes.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.
    unit: str
        Unit in which to return the spacings.

    Returns
    -------
    grid_dist: list of tuple of float
        For each grid the distances between rows and columns in `unit`.
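
    Example (hypothetical fishgrid data directory, return values are
    illustrative):
    ```
    md = metadata_fishgrid('data/grid1-recording')
    grids = fishgrid_grids(md)           # e.g. [(4, 8)]
    dists = fishgrid_spacings(md, 'cm')  # e.g. [(50.0, 50.0)]
    ```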
444 """
445 grids_dist = []
446 for k in range(4):
447 row_dist = get_number(metadata, unit, f'RowDistance{k+1}', default=0)
448 col_dist = get_number(metadata, unit, f'ColumnDistance{k+1}', default=0)
449 rows = get_int(metadata, f'Rows{k+1}', default=0)
450 cols = get_int(metadata, f'Columns{k+1}', default=0)
451 if get_bool(metadata, f'Used{k+1}', default=False) or \
452 cols > 0 and rows > 0:
453 grids_dist.append((row_dist, col_dist))
454 return grids_dist


def fishgrid_grids(metadata):
    """Retrieve grid sizes from a fishgrid.cfg file.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.

    Returns
    -------
    grids: list of tuple of int
        For each grid the number of rows and columns.
469 """
470 grids = []
471 for k in range(4):
472 rows = get_int(metadata, f'Rows{k+1}', default=0)
473 cols = get_int(metadata, f'Columns{k+1}', default=0)
474 if get_bool(metadata, f'Used{k+1}', default=False) or \
475 cols > 0 and rows > 0:
476 grids.append((rows, cols))
477 return grids


def check_fishgrid(filepath):
    """Check for valid fishgrid file (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    filepath: str
        Path to a fishgrid data directory or a file in a fishgrid
        data directory.

    Returns
    -------
    is_fishgrid: bool
        `True` if `filepath` is a valid fishgrid data directory or
        a file therein.
    """
    # fishgrid data directory:
    fishgrid_dir = filepath
    if not os.path.isdir(filepath):
        fishgrid_dir = os.path.dirname(filepath)
    # check for a valid fishgrid data directory:
    return (os.path.isfile(os.path.join(fishgrid_dir, 'fishgrid.cfg')) and
            (os.path.isfile(os.path.join(fishgrid_dir, 'traces-grid1.raw')) or
             os.path.isfile(os.path.join(fishgrid_dir, 'traces.raw'))))


def fishgrid_trace_files(filepath):
    """Expand file paths for fishgrid data to appropriate traces*.raw file names.

    Parameters
    ----------
    filepath: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    trace_filepaths: list of str
        List of fishgrid traces*.raw files.
    """
    # find grids:
    fishgrid_dir = filepath
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(filepath)
    trace_filepaths = []
    for k in range(10000):
        file = os.path.join(fishgrid_dir, f'traces-grid{k+1}.raw')
        if os.path.isfile(file):
            trace_filepaths.append(file)
        else:
            break
    if len(trace_filepaths) == 0:
        file = os.path.join(fishgrid_dir, 'traces.raw')
        if os.path.isfile(file):
            trace_filepaths.append(file)
    return trace_filepaths


def load_fishgrid(filepath):
    """Load traces that have been recorded with fishgrid (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    filepath: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    data: 2-D array
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    FileNotFoundError:
        Invalid or non-existing fishgrid files.
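
    Example (hypothetical fishgrid data directory):
    ```
    data, rate, unit, amax = load_fishgrid('data/grid1-recording')
    print(f'{data.shape[1]} electrodes, {len(data)/rate:.1f} s at {rate:g} Hz')
    ```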
560 """
561 trace_filepaths = fishgrid_trace_files(filepath)
562 if len(trace_filepaths) == 0:
563 raise FileNotFoundError(f'no fishgrid files found')
564 md = metadata_fishgrid(filepath)
565 grids = fishgrid_grids(md)
566 grid_sizes = [r*c for r, c in grids]
568 # load traces-grid*.raw files:
569 grid_channels = []
570 nchannels = 0
571 for g, path in enumerate(trace_filepaths):
572 grid_channels.append(grid_sizes[g])
573 nchannels += grid_sizes[g]
574 data = None
575 nrows = 0
576 c = 0
577 rate = get_number(md, 'Hz', 'AISampleRate')
578 for path, channels in zip(trace_filepaths, grid_channels):
579 x = np.fromfile(path, np.float32).reshape((-1, channels))
580 if data is None:
581 nrows = len(x)
582 data = np.zeros((nrows, nchannels))
583 n = min(len(x), nrows)
584 data[:n,c:c+channels] = x[:n,:]
585 c += channels
586 amax, unit = get_number_unit(md, 'AIMaxVolt')
587 return data, rate, unit, amax
590# add fishgrid keys:
591default_starttime_keys.append(['StartDate', 'StartTime'])
592default_gain_keys.insert(0, 'AIMaxVolt')


def metadata_fishgrid(filepath):
    """ Read meta-data of a fishgrid data set.

    Parameters
    ----------
    filepath: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    fishgrid_dir = filepath
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(filepath)
    path = os.path.join(fishgrid_dir, 'fishgrid.cfg')
    # read in header from file:
    lines = []
    if os.path.isfile(path + '.gz'):
        path += '.gz'
    if not os.path.exists(path):
        return {}
    if path[-3:] == '.gz':
        with gzip.open(path, 'rt', encoding='latin-1') as sf:
            for line in sf:
                lines.append(line)
    else:
        with open(path, 'r', encoding='latin-1') as sf:
            for line in sf:
                lines.append(line)
    # parse:
    data = {}
    cdatas = [data]
    ident_offs = None
    ident = None
    old_style = False
    grid_n = False
    for line in lines:
        if len(line.strip()) == 0:
            continue
        if line[0] == '*':
            key = line[1:].strip()
            data[key] = {}
            cdatas = [data, data[key]]
        elif '----' in line:
            old_style = True
            key = line.strip().strip(' -').replace('&', '')
            if key.upper() == 'SETUP':
                key = 'Grid 1'
            grid_n = False
            if key[:4].lower() == 'grid':
                grid_n = key[5]
            cdatas = cdatas[:2]
            cdatas[1][key] = {}
            cdatas.append(cdatas[1][key])
        else:
            words = line.split(':')
            key = words[0].strip().strip('"')
            value = None
            if len(words) > 1 and (len(words[1].strip()) > 0 or old_style):
                value = ':'.join(words[1:]).strip().strip('"')
            if old_style:
                if value is None:
                    cdatas = cdatas[:3]
                    cdatas[2][key] = {}
                    cdatas.append(cdatas[2][key])
                else:
                    if grid_n and key[-1] != grid_n:
                        key = key + grid_n
                    cdatas[-1][key] = value
            else:
                # get section level:
                level = 0
                nident = len(line) - len(line.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
                # close sections:
                cdatas = cdatas[:2 + level]
                if value is None:
                    # new section:
                    cdatas[-1][key] = {}
                    cdatas.append(cdatas[-1][key])
                else:
                    # key-value pair:
                    cdatas[-1][key] = value.replace(r'\n', '\n')
    # remove unused grids:
    fgm = data.get('FishGrid', {})
    for i in range(4):
        gs = f'Grid {i+1}'
        if gs in fgm:
            gm = fgm[gs]
            us = f'Used{i+1}'
            if us in gm and gm[us].upper() == 'FALSE':
                del fgm[gs]
    return data


def markers_fishgrid(filepath):
    """ Read markers of a fishgrid data set.

    Parameters
    ----------
    filepath: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    def add_marker():
        if 'index1' in marker:
            index1 = int(marker['index1'])//nchannels
        else:
            index1 = int(marker['index'])//nchannels
        span1 = int(marker.get('span1', 0))//nchannels
        locs.append([index1, span1])
        ls = marker.get('label', 'M')
        cs = marker.get('comment', '')
        labels.append([ls, cs])

    fishgrid_dir = filepath
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(filepath)
    path = os.path.join(fishgrid_dir, 'timestamps.dat')
    if not os.path.isfile(path):
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
    # get number of channels:
    md = metadata_fishgrid(path.replace('timestamps.dat', 'fishgrid.cfg'))
    grids = fishgrid_grids(md)
    nchannels = np.prod(grids[0])
    # read timestamps:
    locs = []
    labels = []
    marker = {}
    with open(path, 'r') as sf:
        for line in sf:
            if len(line.strip()) == 0:
                add_marker()
                marker = {}
            else:
                words = line.split(':')
                if len(words) > 1:
                    v = words[1].strip()
                    v = v.strip('"')
                    marker[words[0].strip().lower()] = v
    if len(marker) > 0:
        add_marker()
    if len(locs) > 2:
        return np.array(locs[1:-1]), np.array(labels[1:-1])
    else:
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)


def check_container(filepath):
    """Check if file is a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_container: bool
        `True`, if `filepath` is a supported container format.
    """
    ext = os.path.splitext(filepath)[1]
    return ext.lower() in ('.pkl', '.npz', '.mat')


def extract_container_data(data_dict, datakey=None,
                           samplekey=['rate', 'Fs', 'fs'],
                           timekey=['time'], amplkey=['amax'], unitkey='unit',
                           amax=1.0, unit='a.u.'):
    """Extract data from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is a 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str or list of str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
    amax: None or float
        If specified and no amplitude range has been found in `data_dict`,
        then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in `data_dict`,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range in `unit`.

    Raises
    ------
    ValueError:
        Invalid key requested.
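
    Example, extracting a one second long 100 Hz sine wave sampled at
    20 kHz (all keys except the data key `x` are the default key names):
    ```
    t = np.arange(0.0, 1.0, 1.0/20000.0)
    data_dict = dict(rate=20000.0, amax=1.0, unit='mV',
                     x=np.sin(2.0*np.pi*100.0*t))
    data, rate, unit, amax = extract_container_data(data_dict, 'x')
    ```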
830 """
831 # extract format data:
832 if not isinstance(samplekey, (list, tuple, np.ndarray)):
833 samplekey = (samplekey,)
834 if not isinstance(timekey, (list, tuple, np.ndarray)):
835 timekey = (timekey,)
836 if not isinstance(amplkey, (list, tuple, np.ndarray)):
837 amplkey = (amplkey,)
838 rate = 0.0
839 for skey in samplekey:
840 if skey in data_dict:
841 rate = float(data_dict[skey])
842 break
843 if rate == 0.0:
844 for tkey in timekey:
845 if tkey in data_dict:
846 rate = 1.0/(data_dict[tkey][1] - data_dict[tkey][0])
847 break
848 if rate == 0.0:
849 raise ValueError(f"invalid keys {', '.join(samplekey)} and {', '.join(timekey)} for requesting sampling rate or sampling times")
850 for akey in amplkey:
851 if akey in data_dict:
852 amax = float(data_dict[akey])
853 break
854 if unitkey in data_dict:
855 unit = data_dict[unitkey]
856 # get data array:
857 raw_data = np.array([])
858 if datakey:
859 # try data keys:
860 if not isinstance(datakey, (list, tuple, np.ndarray)):
861 datakey = (datakey,)
862 for dkey in datakey:
863 if dkey in data_dict:
864 raw_data = data_dict[dkey]
865 break
866 if len(raw_data) == 0:
867 raise ValueError(f"invalid key(s) {', '.join(datakey)} for requesting data")
868 else:
869 # find largest 2D array:
870 for d in data_dict:
871 if hasattr(data_dict[d], 'shape'):
872 if 1 <= len(data_dict[d].shape) <= 2 and \
873 np.max(data_dict[d].shape) > np.max(raw_data.shape):
874 raw_data = data_dict[d]
875 if len(raw_data) == 0:
876 raise ValueError('no data found')
877 # make 2D:
878 if len(raw_data.shape) == 1:
879 raw_data = raw_data.reshape(-1, 1)
880 # transpose if necessary:
881 if np.argmax(raw_data.shape) > 0:
882 raw_data = raw_data.T
883 # recode:
884 if raw_data.dtype == np.dtype('int16'):
885 data = raw_data.astype('float32')
886 data *= amax/2**15
887 elif raw_data.dtype == np.dtype('int32'):
888 data = raw_data.astype(float)
889 data *= amax/2**31
890 elif raw_data.dtype == np.dtype('int64'):
891 data = raw_data.astype(float)
892 data *= amax/2**63
893 else:
894 data = raw_data
895 return data, rate, unit, amax


def load_container(filepath, datakey=None,
                   samplekey=['rate', 'Fs', 'fs'],
                   timekey=['time'], amplkey=['amax'], unitkey='unit',
                   amax=1.0, unit='a.u.'):
    """Load data from a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    filepath: str
        Path of the file to load.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is a 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
        If `unitkey` is not a valid key, then return `unitkey` as the `unit`.
    amax: None or float
        If specified and no amplitude range has been found in the data
        container, then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in the data container,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        Invalid key requested.
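
    Example of a round trip through a numpy .npz file (the file name
    `data.npz` is a placeholder, the variable names are the default keys):
    ```
    x = np.random.randn(20000, 2)    # some data to be saved
    np.savez('data.npz', data=x, rate=20000.0, amax=1.0, unit='mV')
    data, rate, unit, amax = load_container('data.npz')
    ```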
952 """
953 # load data:
954 data_dict = {}
955 ext = os.path.splitext(filepath)[1]
956 if ext == '.pkl':
957 import pickle
958 with open(filepath, 'rb') as f:
959 data_dict = pickle.load(f)
960 elif ext == '.npz':
961 data_dict = np.load(filepath)
962 elif ext == '.mat':
963 from scipy.io import loadmat
964 data_dict = loadmat(filepath, squeeze_me=True)
965 return extract_container_data(data_dict, datakey, samplekey,
966 timekey, amplkey, unitkey, amax, unit)


def extract_container_metadata(data_dict, metadatakey=['metadata', 'info']):
    """ Extract metadata from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
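
    Example with metadata flattened into keys with '__'-separated
    section names, as they result from saving nested dictionaries
    to .npz files:
    ```
    data_dict = {'metadata__Recording__Comment': 'good fish'}
    md = extract_container_metadata(data_dict)
    # {'Recording': {'Comment': 'good fish'}}
    ```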
983 """
984 if not isinstance(metadatakey, (list, tuple, np.ndarray)):
985 metadatakey = (metadatakey,)
986 # get single metadata dictionary:
987 for mkey in metadatakey:
988 if mkey in data_dict:
989 return data_dict[mkey]
990 # collect all keys starting with metadatakey:
991 metadata = {}
992 for mkey in metadatakey:
993 mkey += '__'
994 for dkey in data_dict:
995 if dkey[:len(mkey)] == mkey:
996 v = data_dict[dkey]
997 if hasattr(v, 'size') and v.ndim == 0:
998 v = v.item()
999 metadata[dkey[len(mkey):]] = v
1000 if len(metadata) > 0:
1001 return unflatten_metadata(metadata, sep='__')
1002 return metadata


def metadata_container(filepath, metadatakey=['metadata', 'info']):
    """ Read meta-data of a container file.

    Parameters
    ----------
    filepath: str
        A container file.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    data_dict = {}
    ext = os.path.splitext(filepath)[1]
    if ext == '.pkl':
        import pickle
        with open(filepath, 'rb') as f:
            data_dict = pickle.load(f)
    elif ext == '.npz':
        data_dict = np.load(filepath)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(filepath, squeeze_me=True)
    return extract_container_metadata(data_dict, metadatakey)


def extract_container_markers(data_dict, poskey=['positions'],
                              spanskey=['spans'], labelskey=['labels'],
                              descrkey=['descriptions']):
    """ Extract markers from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
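
    Example with the default variable names:
    ```
    data_dict = dict(positions=[100, 500], spans=[0, 50],
                     labels=['M', 'S'], descriptions=['first', 'second'])
    locs, labels = extract_container_markers(data_dict)
    ```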
1060 """
1061 if not isinstance(poskey, (list, tuple, np.ndarray)):
1062 poskey = (poskey,)
1063 if not isinstance(spanskey, (list, tuple, np.ndarray)):
1064 spanskey = (spanskey,)
1065 if not isinstance(labelskey, (list, tuple, np.ndarray)):
1066 labelskey = (labelskey,)
1067 if not isinstance(descrkey, (list, tuple, np.ndarray)):
1068 descrkey = (descrkey,)
1069 locs = np.zeros((0, 2), dtype=int)
1070 for pkey in poskey:
1071 if pkey in data_dict:
1072 locs = np.zeros((len(data_dict[pkey]), 2), dtype=int)
1073 locs[:,0] = data_dict[pkey]
1074 break
1075 for skey in spanskey:
1076 if skey in data_dict:
1077 locs[:,1] = data_dict[skey]
1078 break
1079 labels = np.zeros((0, 2), dtype=object)
1080 for lkey in labelskey:
1081 if lkey in data_dict:
1082 labels = np.zeros((len(data_dict[lkey]), 2), dtype=object)
1083 labels[:,0] = data_dict[lkey]
1084 break
1085 for dkey in descrkey:
1086 if dkey in data_dict:
1087 labels[:,1] = data_dict[dkey]
1088 break
1089 return locs, labels


def markers_container(filepath, poskey=['positions'],
                      spanskey=['spans'], labelskey=['labels'],
                      descrkey=['descriptions']):
    """ Read markers of a container file.

    Parameters
    ----------
    filepath: str
        A container file.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    data_dict = {}
    ext = os.path.splitext(filepath)[1]
    if ext == '.pkl':
        import pickle
        with open(filepath, 'rb') as f:
            data_dict = pickle.load(f)
    elif ext == '.npz':
        data_dict = np.load(filepath)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(filepath, squeeze_me=True)
    return extract_container_markers(data_dict, poskey, spanskey,
                                     labelskey, descrkey)


def check_raw(filepath):
    """Check if file is a raw file.

    The following extensions are interpreted as raw files:

    - raw files (*.raw)
    - LabView scandata (*.scandat)

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_raw: bool
        `True`, if `filepath` is a raw format.
    """
    ext = os.path.splitext(filepath)[1]
    return ext.lower() in ('.raw', '.scandat', '.mat')


def load_raw(filepath, rate=44000, channels=1, dtype=np.float32,
             amax=1.0, unit='a.u.'):
    """Load data from a raw file.

    Raw files just contain the data and absolutely no metadata, not
    even the sampling rate, number of channels, etc.
    Supported file formats are:

    - raw files (*.raw)
    - LabView scandata (*.scandat)

    Parameters
    ----------
    filepath: str
        Path of the file to load.
    rate: float
        Sampling rate of the data in Hertz.
    channels: int
        Number of channels multiplexed in the data.
    dtype: str or numpy.dtype
        The data type stored in the file.
    amax: float
        The amplitude range of the data.
    unit: str
        The unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.
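
    Example (hypothetical file of 16-bit integers from two multiplexed
    channels sampled at 20 kHz):
    ```
    data, rate, unit, amax = load_raw('data.raw', rate=20000, channels=2,
                                      dtype=np.int16, amax=10.0, unit='mV')
    ```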
1194 """
1195 raw_data = np.fromfile(filepath, dtype=dtype).reshape(-1, channels)
1196 # recode:
1197 if dtype == np.dtype('int16'):
1198 data = raw_data.astype('float32')
1199 data *= amax/2**15
1200 elif dtype == np.dtype('int32'):
1201 data = raw_data.astype(float)
1202 data *= amax/2**31
1203 elif dtype == np.dtype('int64'):
1204 data = raw_data.astype(float)
1205 data *= amax/2**63
1206 else:
1207 data = raw_data
1208 return data, rate, unit, amax


def load_audioio(filepath, verbose=0, gainkey=default_gain_keys, sep='.',
                 amax=1.0, unit='a.u.'):
    """Load data from an audio file.

    See the
    [`load_audio()`](https://bendalab.github.io/audioio/api/audioloader.html#audioio.audioloader.load_audio)
    function of the [`audioio`](https://github.com/bendalab/audioio)
    package for more info.

    Parameters
    ----------
    filepath: str
        Path of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    gainkey: str or list of str
        Key in the file's metadata that holds some gain information.
        If found, the data will be multiplied with the gain,
        and if available, the corresponding unit is returned.
        See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
    sep: str
        String that separates section names in `gainkey`.
    amax: float
        If specified and no gain has been found in the metadata,
        then use this as the amplitude range.
    unit: str
        If specified and no gain has been found in the metadata,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data if found in the metadata (see `gainkey`),
        otherwise `unit`.
    amax: float
        Maximum amplitude of data range.
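
    Example (hypothetical wave file; if its metadata contain a gain,
    the data are scaled accordingly):
    ```
    data, rate, unit, amax = load_audioio('data/file.wav')
    ```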
1252 """
1253 # get gain:
1254 md = metadata_audioio(filepath)
1255 amax, unit = get_gain(md, gainkey, sep, amax, unit)
1256 # load data:
1257 data, rate = load_audio(filepath, verbose)
1258 if amax != 1.0:
1259 data *= amax
1260 return data, rate, unit, amax


data_loader_funcs = (
    ('relacs', check_relacs, load_relacs, metadata_relacs, None),
    ('fishgrid', check_fishgrid, load_fishgrid, metadata_fishgrid, markers_fishgrid),
    ('container', check_container, load_container, metadata_container, markers_container),
    ('raw', check_raw, load_raw, None, None),
    ('audioio', None, load_audioio, metadata_audioio, markers_audioio),
    )
"""List of implemented load functions.

Each element of the list is a tuple with the data format's name, its
check function, and its load, metadata, and markers functions.
"""


def load_data(filepath, verbose=0, **kwargs):
    """Load time-series data from a file.

    Parameters
    ----------
    filepath: str
        Path and name of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.
        For example:
        - `amax`: the amplitude range of the data.
        - `unit`: the unit of the data.

    Returns
    -------
    data: 2-D array
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        `filepath` is an empty string.
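
    Example (hypothetical file; with `verbose=2` the file format,
    sampling rate, number of channels, frames, and amplitude range
    are printed):
    ```
    data, rate, unit, amax = load_data('data/file.wav', verbose=2)
    ```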
1310 """
1311 if len(filepath) == 0:
1312 raise ValueError('input argument filepath is empty string.')
1313 # load data:
1314 for name, check_file, load_file, _, _ in data_loader_funcs:
1315 if check_file is None or check_file(filepath):
1316 data, rate, unit, amax = load_file(filepath, **kwargs)
1317 if verbose > 0:
1318 print(f'loaded {name} data from file "{filepath}"')
1319 if verbose > 1:
1320 print(f' sampling rate: {rate:g} Hz')
1321 print(f' channels : {data.shape[1]}')
1322 print(f' frames : {len(data)}')
1323 print(f' range : {amax:g}{unit}')
1324 return data, rate, unit, amax
1325 return np.zeros((0, 1)), 0.0, '', 1.0


def metadata(filepath, **kwargs):
    """ Read meta-data from a data file.

    Parameters
    ----------
    filepath: str
        The full path and name of the file to load. For some file
        formats several files can be provided in a list.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.

    Returns
    -------
    meta_data: nested dict
        Meta data contained in the file. Keys of the nested
        dictionaries are always strings. If the corresponding
        values are dictionaries, then the key is the section name
        of the metadata contained in the dictionary. All other
        types of values are values for the respective key. In
        particular they are strings, or lists of strings. But other
        simple types like ints or floats are also allowed.

    Raises
    ------
    ValueError:
        `filepath` is an empty string.
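
    Example, printing the top-level section names of the nested
    dictionary (hypothetical file):
    ```
    md = metadata('data/file.wav')
    for section in md:
        print(section)
    ```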
1355 """
1356 if len(filepath) == 0:
1357 raise ValueError('input argument filepath is empty string.')
1358 # load metadata:
1359 for _, check_file, _, metadata_file, _ in data_loader_funcs:
1360 if check_file is None or check_file(filepath):
1361 if metadata_file is not None:
1362 return metadata_file(filepath, **kwargs)
1363 return {}


def markers(filepath):
    """ Read markers of a data file.

    Parameters
    ----------
    filepath: str or file handle
        The data file.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).

    Raises
    ------
    ValueError:
        `filepath` is an empty string.
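
    Example (hypothetical file):
    ```
    locs, labels = markers('data.wav')
    for (pos, span), (label, text) in zip(locs, labels):
        print(f'{label} at frame {pos} spanning {span} frames: {text}')
    ```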
1387 """
1388 if len(filepath) == 0:
1389 raise ValueError('input argument filepath is empty string.')
1390 # load markers:
1391 for _, check_file, _, _, markers_file in data_loader_funcs:
1392 if check_file is None or check_file(filepath):
1393 if markers_file is not None:
1394 return markers_file(filepath)
1395 return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)


class DataLoader(AudioLoader):
    """Buffered reading of time-series data for random access of the data in the file.

    This allows for reading very large data files that do not fit into
    memory. A `DataLoader` instance can be used like a huge
    read-only numpy array, i.e.
    ```
    data = DataLoader('path/to/data/file.dat')
    x = data[10000:20000,0]
    ```
    The first index specifies the frame, the second one the channel.

    `DataLoader` first determines the format of the data file and then
    opens the file (first line). It then reads data from the file as
    necessary for the requested data (second line).

    Supported file formats are

    - audio files via the `audioio` package
    - python pickle files
    - numpy .npz files
    - matlab .mat files
    - relacs trace*.raw files (https://www.relacs.net)
    - fishgrid traces-*.raw files (https://github.com/bendalab/fishgrid)

    Reading sequentially through the file is always possible. If
    previous data are requested, then the file is read from the
    beginning. This might slow down access to previous data
    considerably. Use the `backsize` argument to the open functions to
    make sure some data are loaded before the requested frame. Then a
    subsequent access to the data within `backsize` seconds before that
    frame can still be handled without the need to reread the file
    from the beginning.

    Usage:
    ------
    ```
    import thunderlab.dataloader as dl
    with dl.DataLoader(filepath, 60.0, 10.0) as data:
        # do something with the content of the file:
        x = data[0:10000,0]
        y = data[10000:20000,0]
        z = x + y
    ```

    Normal open and close:
    ```
    data = dl.DataLoader(filepath, 60.0)
    x = data[:,:]  # read the whole file
    data.close()
    ```
    that is the same as:
    ```
    data = dl.DataLoader()
    data.open(filepath, 60.0)
    ```

    Parameters
    ----------
    filepath: str
        Name of the file.
    buffersize: float
        Size of internal buffer in seconds.
    backsize: float
        Part of the buffer to be loaded before the requested start index in seconds.
    verbose: int
        If larger than zero show detailed error/warning messages.
    meta_kwargs: dict
        Keyword arguments that are passed on to the _load_metadata() function.

    Attributes
    ----------
    rate: float
        The sampling rate of the data in Hertz.
    channels: int
        The number of channels that are read in.
    frames: int
        The number of frames in the file.
    format: str or None
        Format of the audio file.
    encoding: str or None
        Encoding/subtype of the audio file.
    shape: tuple
        Number of frames and channels of the data.
    ndim: int
        Number of dimensions: always 2 (frames and channels).
    unit: str
        Unit of the data.
    ampl_min: float
        Minimum amplitude the file format supports.
    ampl_max: float
        Maximum amplitude the file format supports.

    Methods
    -------

    - `len()`: the number of frames
    - `open()`: open a data file.
    - `open_*()`: open a data file of a specific format.
    - `close()`: close the file.
    - `metadata()`: metadata of the file.
    - `markers()`: markers of the file.
    - `set_unwrap()`: set parameters for unwrapping clipped data.

    """

    def __init__(self, filepath=None, buffersize=10.0, backsize=0.0,
                 verbose=0, **meta_kwargs):
        super().__init__(None, buffersize, backsize,
                         verbose, **meta_kwargs)
        if filepath is not None:
            self.open(filepath, buffersize, backsize, verbose, **meta_kwargs)

    def __getitem__(self, key):
        return super(DataLoader, self).__getitem__(key)

    def __next__(self):
        return super(DataLoader, self).__next__()


    # relacs interface:
    def open_relacs(self, filepath, buffersize=10.0, backsize=0.0,
                    verbose=0, amax=1.0):
        """Open relacs data files (https://www.relacs.net) for reading.

        Parameters
        ----------
        filepath: str
            Path to a relacs data directory or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        amax: float
            The amplitude range of the data.

        Raises
        ------
        FileNotFoundError:
            Invalid or non-existing relacs files.
        ValueError:
            .gz files not supported.
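
        Example (hypothetical relacs data directory):
        ```
        data = DataLoader()
        data.open_relacs('data/2019-05-06-aa', buffersize=60.0)
        ```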
1542 """
1543 self.verbose = verbose
1545 # open trace files:
1546 self.trace_filepaths = relacs_trace_files(filepath)
1547 if len(self.trace_filepaths) == 0:
1548 raise FileNotFoundError(f'no relacs files found')
1549 self.sf = []
1550 self.frames = None
1551 self.rate = None
1552 self.unit = ''
1553 self.filepath = filepath
1554 self.file_paths = [self.filepath]
1555 self.file_indices = [0]
1556 for path in self.trace_filepaths:
1557 if path[-3:] == '.gz':
1558 raise ValueError('.gz files not supported')
1559 sf = open(path, 'rb')
1560 self.sf.append(sf)
1561 if verbose > 0:
1562 print(f'open_relacs(filepath) with filepath={path}')
1563 # file size:
1564 sf.seek(0, os.SEEK_END)
1565 frames = sf.tell()//4
1566 if self.frames is None:
1567 self.frames = frames
1568 elif self.frames != frames:
1569 diff = self.frames - frames
1570 if diff > 1 or diff < -2:
1571 raise ValueError('number of frames of traces differ')
1572 elif diff >= 0:
1573 self.frames = frames
1574 sf.seek(0)
1575 # retrieve sampling rate and unit:
1576 rate, us = relacs_samplerate_unit(path)
1577 if self.rate is None:
1578 self.rate = rate
1579 elif rate != self.rate:
1580 raise ValueError('sampling rates of traces differ')
1581 if len(self.unit) == 0:
1582 self.unit = us
1583 elif us != self.unit:
1584 raise ValueError('unit of traces differ')
1585 self.channels = len(self.sf)
1586 self.shape = (self.frames, self.channels)
1587 self.size = self.frames * self.channels
1588 self.ndim = len(self.shape)
1589 self.format = 'RELACS'
1590 self.encoding = 'FLOAT'
1591 self.bufferframes = int(buffersize*self.rate)
1592 self.backframes = int(backsize*self.rate)
1593 self.init_buffer()
1594 self.offset = 0
1595 self.close = self._close_relacs
1596 self.load_audio_buffer = self._load_buffer_relacs
1597 self.basename = self._basename_relacs
1598 self.ampl_min = -amax
1599 self.ampl_max = +amax
1600 self._load_metadata = self._metadata_relacs
1601 # TODO: load markers:
1602 self._locs = np.zeros((0, 2), dtype=int)
1603 self._labels = np.zeros((0, 2), dtype=object)
1604 self._load_markers = None
1605 return self

    def _close_relacs(self):
        """Close the relacs data files.
        """
        for file in self.sf:
            file.close()
        self.sf = []

    def _load_buffer_relacs(self, r_offset, r_size, buffer):
        """Load new data from relacs data files.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        if len(self.sf) == 0 and len(self.trace_filepaths) > 0:
            for path in self.trace_filepaths:
                self.sf.append(open(path, 'rb'))
        for i, file in enumerate(self.sf):
            file.seek(r_offset*4)
            data = file.read(r_size*4)
            buffer[:, i] = np.frombuffer(data, dtype=np.float32)

    def _metadata_relacs(self, store_empty=False, first_only=False):
        """ Load meta-data of a relacs data set.
        """
        path = os.path.dirname(self.filepath)
        info_path = os.path.join(path, 'info.dat')
        if not os.path.exists(info_path):
            return {}
        return relacs_header(info_path, store_empty, first_only)

    def _basename_relacs(self, path=None):
        """ Base name of the relacs data files.

        Parameters
        ----------
        path: str or None
            Path of a relacs data file (*.raw, info.dat, or just the directory).
            If `None`, use `self.filepath`.

        Returns
        -------
        s: str
            The base name, i.e. the name of the directory containing the
            relacs data files.

        """
        if path is None:
            path = self.filepath
        path = Path(path)
        if path.is_dir():
            return path.name
        else:
            return path.parent.name


    # fishgrid interface:
    def open_fishgrid(self, filepath, buffersize=10.0, backsize=0.0,
                      verbose=0):
        """Open fishgrid data files (https://github.com/bendalab/fishgrid) for reading.

        Parameters
        ----------
        filepath: str
            Path to a fishgrid data directory, or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.

        Raises
        ------
        FileNotFoundError:
            Invalid or non-existing fishgrid files.
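
        Example (hypothetical fishgrid data directory):
        ```
        data = DataLoader()
        data.open_fishgrid('data/grid1-recording', buffersize=60.0)
        ```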
1689 """
1690 self.verbose = verbose
1692 self.trace_filepaths = fishgrid_trace_files(filepath)
1693 if len(self.trace_filepaths) == 0:
1694 raise FileNotFoundError(f'no fishgrid files found')
1695 self.filepath = filepath
1696 self.file_paths = [self.filepath]
1697 self.file_indices = [0]
1698 self._load_metadata = metadata_fishgrid
1699 self._load_markers = markers_fishgrid
1701 # open grid files:
1702 grids = fishgrid_grids(self.metadata())
1703 grid_sizes = [r*c for r,c in grids]
1704 self.channels = 0
1705 for g, path in enumerate(self.trace_filepaths):
1706 self.channels += grid_sizes[g]
1707 self.sf = []
1708 self.grid_channels = []
1709 self.grid_offs = []
1710 offs = 0
1711 self.frames = None
1712 self.rate = get_number(self.metadata(), 'Hz', 'AISampleRate')
1713 v, self.unit = get_number_unit(self.metadata(), 'AIMaxVolt')
1714 if v is not None:
1715 self.ampl_min = -v
1716 self.ampl_max = +v
1718 for g, path in enumerate(self.trace_filepaths):
1719 sf = open(path, 'rb')
1720 self.sf.append(sf)
1721 if verbose > 0:
1722 print(f'open_fishgrid(filepath) with filepath={path}')
1723 # grid channels:
1724 self.grid_channels.append(grid_sizes[g])
1725 self.grid_offs.append(offs)
1726 offs += grid_sizes[g]
1727 # file size:
1728 sf.seek(0, os.SEEK_END)
1729 frames = sf.tell()//4//grid_sizes[g]
1730 if self.frames is None:
1731 self.frames = frames
1732 elif self.frames != frames:
1733 diff = self.frames - frames
1734 if diff > 1 or diff < -2:
1735 raise ValueError('number of frames of traces differ')
1736 elif diff >= 0:
1737 self.frames = frames
1738 sf.seek(0)
1739 self.shape = (self.frames, self.channels)
1740 self.size = self.frames * self.channels
1741 self.ndim = len(self.shape)
1742 self.format = 'FISHGRID'
1743 self.encoding = 'FLOAT'
1744 self.bufferframes = int(buffersize*self.rate)
1745 self.backframes = int(backsize*self.rate)
1746 self.init_buffer()
1747 self.offset = 0
1748 self.close = self._close_fishgrid
1749 self.load_audio_buffer = self._load_buffer_fishgrid
1750 self.basename = self._basename_fishgrid
1751 return self

    def _close_fishgrid(self):
        """Close the fishgrid data files.
        """
        for file in self.sf:
            file.close()
        self.sf = []

    def _load_buffer_fishgrid(self, r_offset, r_size, buffer):
        """Load new data from fishgrid data files.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        if len(self.sf) == 0 and len(self.trace_filepaths) > 0:
            for path in self.trace_filepaths:
                self.sf.append(open(path, 'rb'))
        for file, gchannels, goffset in zip(self.sf, self.grid_channels, self.grid_offs):
            file.seek(r_offset*4*gchannels)
            data = file.read(r_size*4*gchannels)
            buffer[:, goffset:goffset+gchannels] = np.frombuffer(data, dtype=np.float32).reshape((-1, gchannels))

    def _basename_fishgrid(self, path=None):
        """ Base name of the fishgrid data files.

        Parameters
        ----------
        path: str or None
            Path of a fishgrid data file
            (*.raw, fishgrid.cfg, or just the directory).
            If `None`, use `self.filepath`.

        Returns
        -------
        s: str
            The base name, i.e. the name of the directory containing the
            fishgrid data files.

        """
        if path is None:
            path = self.filepath
        path = Path(path)
        if path.is_dir():
            return path.name
        else:
            return path.parent.name


    # container interface:
    def open_container(self, filepath, buffersize=10.0,
                       backsize=0.0, verbose=0, datakey=None,
                       samplekey=['rate', 'Fs', 'fs'],
                       timekey=['time'], amplkey=['amax'], unitkey='unit',
                       metadatakey=['metadata', 'info'],
                       poskey=['positions'],
                       spanskey=['spans'], labelskey=['labels'],
                       descrkey=['descriptions'],
                       amax=1.0, unit='a.u.'):
        """Open generic container file.

        Supported file formats are:

        - python pickle files (.pkl)
        - numpy files (.npz)
        - matlab files (.mat)

        Parameters
        ----------
        filepath: str
            Path to a container file.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        datakey: None, str, or list of str
            Name of the variable holding the data. If `None` take the
            variable that is a 2D array and has the largest number of
            elements.
        samplekey: str or list of str
            Name of the variable holding the sampling rate.
        timekey: str or list of str
            Name of the variable holding sampling times.
            If no sampling rate is available, the sampling rate is retrieved
            from the sampling times.
        amplkey: str or list of str
            Name of the variable holding the amplitude range of the data.
        unitkey: str
            Name of the variable holding the unit of the data.
        metadatakey: str or list of str
            Name of the variable holding the metadata.
        poskey: str or list of str
            Name of the variable holding positions of markers.
        spanskey: str or list of str
            Name of the variable holding spans of markers.
        labelskey: str or list of str
            Name of the variable holding labels of markers.
        descrkey: str or list of str
            Name of the variable holding descriptions of markers.
        amax: None or float
            If specified and no amplitude range has been found in the data
            container, then this is the amplitude range of the data.
        unit: None or str
            If specified and no unit has been found in the data container,
            then return this as the unit of the data.

        Raises
        ------
        ValueError:
            Invalid key requested.
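
        Example (hypothetical .npz file with default variable names):
        ```
        data = DataLoader()
        data.open_container('data/file.npz')
        ```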
1870 """
1871 self.verbose = verbose
1872 data_dict = {}
1873 ext = os.path.splitext(filepath)[1]
1874 if ext == '.pkl':
1875 import pickle
1876 with open(filepath, 'rb') as f:
1877 data_dict = pickle.load(f)
1878 self.format = 'PKL'
1879 elif ext == '.npz':
1880 data_dict = np.load(filepath)
1881 self.format = 'NPZ'
1882 elif ext == '.mat':
1883 from scipy.io import loadmat
1884 data_dict = loadmat(filepath, squeeze_me=True)
1885 self.format = 'MAT'
1886 self.buffer, self.rate, self.unit, amax = \
1887 extract_container_data(data_dict, datakey, samplekey,
1888 timekey, amplkey, unitkey, amax, unit)
1889 self.filepath = filepath
1890 self.file_paths = [self.filepath]
1891 self.file_indices = [0]
1892 self.channels = self.buffer.shape[1]
1893 self.frames = self.buffer.shape[0]
1894 self.shape = self.buffer.shape
1895 self.ndim = self.buffer.ndim
1896 self.size = self.buffer.size
1897 self.encoding = self.numpy_encodings.get(self.buffer.dtype, 'UNKNOWN')
1898 self.ampl_min = -amax
1899 self.ampl_max = +amax
1900 self.offset = 0
1901 self.buffer_changed = np.zeros(self.channels, dtype=bool)
1902 self.bufferframes = self.frames
1903 self.backsize = 0
1904 self.close = self._close_container
1905 self.load_audio_buffer = self._load_buffer_container
1906 self._metadata = extract_container_metadata(data_dict, metadatakey)
1907 self._load_metadata = None
1908 self._locs, self._labels = extract_container_markers(data_dict,
1909 poskey,
1910 spanskey,
1911 labelskey,
1912 descrkey)
1913 self._load_markers = None
1915 def _close_container(self):
1916 """Close container. """
1917 pass
1919 def _load_buffer_container(self, r_offset, r_size, buffer):
1920 """Load new data from container."""
1921 buffer[:, :] = self.buffer[r_offset:r_offset + r_size, :]
1924 # raw data interface:
1925 def open_raw(self, filepath, buffersize=10.0, backsize=0.0,
1926 verbose=0, rate=44000, channels=1, dtype=np.float32,
1927 amax=1.0, unit='a.u.'):
1928 """Load data from a raw file.
1930 Raw files just contain the data and absolutely no metadata, not
1931 even the sampling rate, number of channels, etc.
1932 Supported file formats are:
1934 - raw files (*.raw)
1935 - LabView scandata (*.scandat)
1937 Parameters
1938 ----------
1939 filepath: str
1940 Path of the file to load.
1941 buffersize: float
1942 Size of internal buffer in seconds.
1943 backsize: float
1944 Part of the buffer to be loaded before the requested start index in seconds.
1945 verbose: int
1946 If > 0 show detailed error/warning messages.
1947 rate: float
1948 Sampling rate of the data in Hertz.
1949 channels: int
1950 Number of channels multiplexed in the data.
1951 dtype: str or numpy.dtype
1952 The data type stored in the file.
1953 amax: float
1954 The amplitude range of the data.
1955 unit: str
1956 The unit of the data.
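Examples
--------
A minimal sketch (file name and parameter values are made up; assuming a
`DataLoader` instance `data`); since raw files carry no header, sampling
rate, channel count, and dtype need to be supplied:
```
data.open_raw('recording.raw', 10.0, rate=20000.0,
              channels=2, dtype=np.int16, amax=1.0, unit='V')
```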
1957 """
1958 self.verbose = verbose
1959 self.filepath = filepath
1960 self.file_paths = [self.filepath]
1961 self.file_indices = [0]
1962 self.sf = open(self.filepath, 'rb')
1963 if verbose > 0:
1964 print(f'open_raw(filepath) with filepath={filepath}')
1965 self.dtype = np.dtype(dtype)
1966 self.rate = float(rate)
1967 # file size in frames:
1968 self.channels = int(channels)
1969 self.sf.seek(0, os.SEEK_END)
1970 self.frames = self.sf.tell()//(self.dtype.itemsize*self.channels)
1971 self.sf.seek(0)
1972 self.shape = (self.frames, self.channels)
1973 self.ndim = len(self.shape)
1974 self.size = self.frames*self.channels
1975 self.format = 'RAW'
1976 self.encoding = self.numpy_encodings.get(self.dtype, 'UNKNOWN')
1977 self.unit = unit
1978 self.ampl_max = float(amax)
1979 self.ampl_min = -self.ampl_max
1980 self.offset = 0
1981 self.bufferframes = int(buffersize*self.rate)
1982 self.backframes = int(backsize*self.rate)
1983 self.init_buffer()
1984 self.close = self._close_raw
1985 self.load_audio_buffer = self._load_buffer_raw
1986 self._metadata = None
1987 self._load_metadata = None
1988 self._locs = None
1989 self._labels = None
1990 self._load_markers = None
1992 def _close_raw(self):
1993 """Close raw file. """
1994 self.sf.close()
1995 self.sf = None
1997 def _load_buffer_raw(self, r_offset, r_size, buffer):
1998 """Load new data from container."""
1999 if self.sf is None:
2000 self.sf = open(self.filepath, 'rb')
2001 self.sf.seek(r_offset*self.channels*self.dtype.itemsize)
2002 raw_data = self.sf.read(r_size*self.channels*self.dtype.itemsize)
2003 raw_data = np.frombuffer(raw_data, dtype=self.dtype)
2004 raw_data = raw_data.reshape(-1, self.channels)
2005 # recode: scale integer data to floats spanning -amax to +amax:
2006 if self.dtype == np.dtype('int16'):
2007 data = raw_data.astype('float32')
2008 data *= self.ampl_max/2**15
2009 elif self.dtype == np.dtype('int32'):
2010 data = raw_data.astype(float)
2011 data *= self.ampl_max/2**31
2012 elif self.dtype == np.dtype('int64'):
2013 data = raw_data.astype(float)
2014 data *= self.ampl_max/2**63
2015 else:
2016 data = raw_data
2017 buffer[:, :] = data
2020 # audioio interface:
2021 def open_audioio(self, filepath, buffersize=10.0, backsize=0.0,
2022 verbose=0, gainkey=default_gain_keys, sep='.',
2023 amax=None, unit='a.u.'):
2024 """Open an audio file.
2026 See the [audioio](https://github.com/bendalab/audioio) package
2027 for details.
2029 Parameters
2030 ----------
2031 filepath: str
2032 Path to an audio file.
2033 buffersize: float
2034 Size of internal buffer in seconds.
2035 backsize: float
2036 Part of the buffer to be loaded before the requested start index
2037 in seconds.
2038 verbose: int
2039 If > 0 show detailed error/warning messages.
2040 gainkey: str or list of str
2041 Key in the file's metadata that holds some gain information.
2042 If found, the data will be multiplied with the gain,
2043 and if available, the corresponding unit is returned.
2044 See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
2045 sep: str
2046 String that separates section names in `gainkey`.
2047 amax: None or float
2048 If specified and no gain has been found in the metadata,
2049 then use this as the amplitude range.
2050 unit: None or str
2051 If specified and no gain has been found in the metadata,
2052 then this is the unit of the data.
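Examples
--------
A minimal sketch ('recording.wav' is a made-up file name); if a gain entry
is found in the file's metadata, data and amplitude range are rescaled to
the corresponding unit:
```
with DataLoader('recording.wav', 60.0) as data:
    print(data.unit)      # e.g. 'mV' if a gain was found
    print(data.ampl_max)  # amplitude range with the gain applied
```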
2054 """
2055 self.verbose = verbose
2056 super().open(filepath, buffersize, backsize, verbose)
2057 md = self.metadata()
2058 fac, unit = get_gain(md, gainkey, sep, amax, unit)
2059 if fac is None:
2060 self.gain_fac = 1.0
2061 else:
2062 self.gain_fac = fac
2063 self._load_buffer_audio_org = self.load_audio_buffer
2064 self.load_audio_buffer = self._load_buffer_audioio
2065 self.ampl_min *= self.gain_fac
2066 self.ampl_max *= self.gain_fac
2067 self.unit = unit
2068 return self
2070 def _load_buffer_audioio(self, r_offset, r_size, buffer):
2071 """Load and scale new data from an audio file.
2073 Parameters
2074 ----------
2075 r_offset: int
2076 First frame to be read from file.
2077 r_size: int
2078 Number of frames to be read from file.
2079 buffer: ndarray
2080 Buffer where to store the loaded data.
2081 """
2082 self._load_buffer_audio_org(r_offset, r_size, buffer)
2083 buffer *= self.gain_fac
2086 # open multiple files as one:
2087 def open_multiple(self, filepaths, buffersize=10.0, backsize=0.0,
2088 verbose=0, rate=None, channels=None,
2089 unit=None, amax=None, end_indices=None):
2090 """Open multiple files as a single concatenated array.
2092 Parameters
2093 ----------
2094 filepaths: list of str
2095 List of file names of data files.
2096 buffersize: float
2097 Size of internal buffer in seconds.
2098 backsize: float
2099 Part of the buffer to be loaded before the requested start index in seconds.
2100 verbose: int
2101 If > 0 show detailed error/warning messages.
2102 rate: float
2103 Sampling rate of the data in Hertz, used for a minimal
2104 initialization that skips checking the files.
2105 channels: int
2106 Number of channels, used for minimal initialization.
2107 unit: str
2108 Unit of the data, used for minimal initialization.
2109 amax: float
2110 Maximum amplitude of the data, used for minimal initialization.
2111 end_indices: sequence of int
2112 Cumulative indices (in frames) of the ends of the single files.
2113 Minimal initialization without checking the files is done only
2114 if `end_indices` is provided; then `rate`, `channels`, `unit`,
2115 and `amax` need to be provided as well.
2123 Raises
2124 ------
2125 TypeError
2126 `filepaths` must be a sequence.
2127 ValueError
2128 Empty `filepaths`.
2129 FileNotFoundError
2130 `filepaths` does not contain a single valid file.
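Examples
--------
A minimal sketch (file names are made up; assuming a list passed to the
`DataLoader` constructor is forwarded to `open()`); successive recordings
are then accessed as one continuous array:
```
with DataLoader(['rec-01.wav', 'rec-02.wav'], 60.0) as data:
    x = data[:, 0]  # both files concatenated along the time axis
```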
2132 """
2133 if not isinstance(filepaths, (list, tuple, np.ndarray)):
2134 raise TypeError('input argument filepaths is not a sequence!')
2135 if len(filepaths) == 0:
2136 raise ValueError('input argument filepaths is an empty sequence!')
2137 self.buffersize = buffersize
2138 self.backsize = backsize
2139 self.filepath = None
2140 self.file_paths = []
2141 self.open_files = []
2142 self.open_loaders = []
2143 self.data_files = []
2144 self.collect_counter = 0
2145 self.frames = 0
2146 self.start_indices = []
2147 self.end_indices = []
2148 self.start_time = None
2149 start_time = None
2150 self._metadata = {}
2151 self._locs = np.zeros((0, 2), dtype=int)
2152 self._labels = np.zeros((0, 2), dtype=object)
2153 if end_indices is not None:
2154 self.filepath = filepaths[0]
2155 self.file_paths = filepaths
2156 self.data_files = [None] * len(filepaths)
2157 self.frames = end_indices[-1]
2158 self.start_indices = [0] + list(end_indices[:-1])
2159 self.end_indices = end_indices
2160 self.format = None
2161 self.encoding = None
2162 self.rate = rate
2163 self.channels = channels
2164 self.unit = unit
2165 self.ampl_max = amax
2166 self.ampl_min = -amax
2167 else:
2168 for filepath in filepaths:
2169 try:
2170 a = DataLoader(filepath, buffersize, backsize, verbose)
2171 except Exception as e:
2172 if verbose > 0:
2173 print(e)
2174 continue
2175 # collect metadata:
2176 md = a.metadata()
2177 fmd = flatten_metadata(md, True)
2178 add_metadata(self._metadata, fmd)
2179 if self.filepath is None:
2180 # first file:
2181 self.filepath = a.filepath
2182 self.format = a.format
2183 self.encoding = a.encoding
2184 self.rate = a.rate
2185 self.channels = a.channels
2186 self.unit = a.unit
2187 self.ampl_max = a.ampl_max
2188 self.ampl_min = a.ampl_min
2189 self.start_time = get_datetime(md)
2190 start_time = self.start_time
2191 else:
2192 # check channels, rate, and amplitudes:
2193 error_str = None
2194 if a.channels != self.channels:
2195 error_str = f'number of channels differs: ' \
2196 f'{a.channels} in {a.filepath} versus ' \
2197 f'{self.channels} in {self.filepath}'
2198 if a.rate != self.rate:
2199 error_str = f'sampling rates differ: ' \
2200 f'{a.rate} in {a.filepath} versus ' \
2201 f'{self.rate} in {self.filepath}'
2202 if a.ampl_min != self.ampl_min:
2203 error_str = f'minimum amplitudes differ: ' \
2204 f'{a.ampl_min} in {a.filepath} versus ' \
2205 f'{self.ampl_min} in {self.filepath}'
2206 if a.ampl_max != self.ampl_max:
2207 error_str = f'maximum amplitudes differ: ' \
2208 f'{a.ampl_max} in {a.filepath} versus ' \
2209 f'{self.ampl_max} in {self.filepath}'
2210 # check start time of recording:
2211 stime = get_datetime(md)
2212 if start_time is None or stime is None or \
2213 abs(start_time - stime) > timedelta(seconds=1):
2214 error_str = f'start time does not indicate continuous recording: ' \
2215 f'expected {start_time} instead of ' \
2216 f'{stime} in {a.filepath}'
2217 if error_str is not None:
2218 if verbose > 0:
2219 print(error_str)
2220 a.close()
2221 del a
2222 break
2223 # markers:
2224 locs, labels = a.markers()
2225 locs[:,0] += self.frames
2226 self._locs = np.vstack((self._locs, locs))
2227 self._labels = np.vstack((self._labels, labels))
2228 # indices:
2229 self.start_indices.append(self.frames)
2230 self.frames += a.frames
2231 self.end_indices.append(self.frames)
2232 if start_time is not None:
2233 start_time += timedelta(seconds=a.frames/a.rate)
2234 # add file to lists:
2235 self.file_paths.append(filepath)
2236 if len(self.open_files) < AudioLoader.max_open_files:
2237 self.open_files.append(a)
2238 else:
2239 a.close()
2240 if len(self.open_loaders) < AudioLoader.max_open_loaders:
2241 self.data_files.append(a)
2242 self.open_loaders.append(a)
2243 else:
2244 a.close()
2245 del a
2246 self.data_files.append(None)
2247 if len(self.data_files) == 0:
2248 raise FileNotFoundError('input argument filepaths does not contain any valid data file!')
2249 # set start time from first file:
2250 if self.start_time is not None:
2251 set_starttime(self._metadata, self.start_time)
2252 # setup infrastructure:
2253 self.file_indices = self.start_indices
2254 self.start_indices = np.array(self.start_indices)
2255 self.end_indices = np.array(self.end_indices)
2256 self.shape = (self.frames, self.channels)
2257 self.bufferframes = int(buffersize*self.rate)
2258 self.backframes = int(backsize*self.rate)
2259 self.init_buffer()
2260 self.close = self._close_multiple
2261 self.load_audio_buffer = self._load_buffer_multiple
2262 self._load_metadata = None
2263 self._load_markers = None
2264 return self
2266 def _close_multiple(self):
2267 """Close all the data files. """
2268 self.open_files = []
2269 self.open_loaders = []
2270 if hasattr(self, 'data_files'):
2271 for a in self.data_files:
2272 if a is not None:
2273 a.close()
2274 self.data_files = []
2275 self.filepath = None
2276 self.file_paths = []
2277 self.file_indices = []
2278 self.start_indices = []
2279 self.end_indices = []
2280 del self.data_files
2281 del self.open_files
2282 del self.open_loaders
2283 del self.start_indices
2284 del self.end_indices
2286 def _load_buffer_multiple(self, r_offset, r_size, buffer):
2287 """Load new data from the underlying files.
2289 Parameters
2290 ----------
2291 r_offset: int
2292 First frame to be read from file.
2293 r_size: int
2294 Number of frames to be read from file.
2295 buffer: ndarray
2296 Buffer where to store the loaded data.
2297 """
2298 offs = r_offset
2299 size = r_size
2300 boffs = 0
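# find the index of the file that contains the first requested frame: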
2301 ai = np.searchsorted(self.end_indices, offs, side='right')
2302 while size > 0:
2303 if self.data_files[ai] is None:
2304 a = DataLoader(self.file_paths[ai],
2305 self.buffersize, self.backsize, 0)
2306 self.data_files[ai] = a
2307 self.open_loaders.append(a)
2308 self.open_files.append(a)
2309 if len(self.open_files) > AudioLoader.max_open_files:
2310 a0 = self.open_files.pop(0)
2311 a0.close()
2312 if len(self.open_loaders) > AudioLoader.max_open_loaders:
2313 a0 = self.open_loaders.pop(0)
2314 self.data_files[self.data_files.index(a0)] = None
2315 a0.close()
2316 del a0
2317 self.collect_counter += 1
2318 if self.collect_counter > AudioLoader.max_open_loaders//2:
2319 gc.collect() # takes time!
2320 self.collect_counter = 0
2321 else:
2322 self.open_loaders.pop(self.open_loaders.index(self.data_files[ai]))
2323 self.open_loaders.append(self.data_files[ai])
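# read the part of this file that overlaps the requested range: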
2324 ai0 = offs - self.start_indices[ai]
2325 ai1 = offs + size
2326 if ai1 > self.end_indices[ai]:
2327 ai1 = self.end_indices[ai]
2328 ai1 -= self.start_indices[ai]
2329 n = ai1 - ai0
2330 self.data_files[ai].load_audio_buffer(ai0, n,
2331 buffer[boffs:boffs + n,:])
2332 if self.data_files[ai] in self.open_files:
2333 self.open_files.pop(self.open_files.index(self.data_files[ai]))
2334 self.open_files.append(self.data_files[ai])
2335 if len(self.open_files) > AudioLoader.max_open_files:
2336 self.open_files[0].close()
2337 self.open_files.pop(0)
2338 boffs += n
2339 offs += n
2340 size -= n
2341 ai += 1
2344 def open(self, filepath, buffersize=10.0, backsize=0.0,
2345 verbose=0, **kwargs):
2346 """Open file with time-series data for reading.
2348 Parameters
2349 ----------
2350 filepath: str or list of str
2351 Name of the file or list of many file names that should be
2352 made accessible as a single array.
2353 buffersize: float
2354 Size of internal buffer in seconds.
2355 backsize: float
2356 Part of the buffer to be loaded before the requested start index
2357 in seconds.
2358 verbose: int
2359 If > 0 show detailed error/warning messages.
2360 **kwargs: dict
2361 Further keyword arguments that are passed on to the
2362 format specific opening functions.
2363 For example:
2364 - `amax`: the amplitude range of the data.
2365 - `unit`: the unit of the data.
2367 Raises
2368 ------
2369 ValueError:
2370 `filepath` is an empty string.
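Examples
--------
A minimal sketch (file name and values are made up; assuming a
`DataLoader` instance `data`); format-specific keyword arguments are
passed on, e.g. `rate` and `channels` for a raw file:
```
data.open('recording.raw', 10.0, rate=20000.0, channels=2,
          amax=1.0, unit='V')
```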
2371 """
2372 # list of implemented open functions:
2373 data_open_funcs = (
2374 ('relacs', check_relacs, self.open_relacs, 1),
2375 ('fishgrid', check_fishgrid, self.open_fishgrid, 1),
2376 ('container', check_container, self.open_container, 1),
2377 ('raw', check_raw, self.open_raw, 1),
2378 ('audioio', None, self.open_audioio, 0),
2379 )
2381 self.buffer = np.array([])
2382 self.rate = 0.0
2383 if not filepath:
2384 raise ValueError('input argument filepath is an empty string!')
2385 if isinstance(filepath, (list, tuple, np.ndarray)):
2386 if len(filepath) > 1:
2387 self.open_multiple(filepath, buffersize, backsize,
2388 verbose, **kwargs)
2389 if len(self.file_paths) > 1:
2390 return self
2391 filepath = self.file_paths[0]
2392 self.close()
2393 else:
2394 filepath = filepath[0]
2395 # open data:
2396 for name, check_file, open_file, v in data_open_funcs:
2397 if check_file is None or check_file(filepath):
2398 open_file(filepath, buffersize, backsize, verbose, **kwargs)
2399 if v*verbose > 1:
2400 if self.format is not None:
2401 print(f' format : {self.format}')
2402 if self.encoding is not None:
2403 print(f' encoding : {self.encoding}')
2404 print(f' sampling rate: {self.rate} Hz')
2405 print(f' channels : {self.channels}')
2406 print(f' frames : {self.frames}')
2407 print(f' range : {self.ampl_max:g}{self.unit}')
2408 break
2409 return self
2412def demo(filepath, plot=False):
2413 print("try load_data:")
2414 data, rate, unit, amax = load_data(filepath, verbose=2)
2415 if plot:
2416 fig, ax = plt.subplots()
2417 time = np.arange(len(data))/rate
2418 for c in range(data.shape[1]):
2419 ax.plot(time, data[:,c])
2420 ax.set_xlabel('Time [s]')
2421 ax.set_ylabel(f'[{unit}]')
2422 if amax is not None and np.isfinite(amax):
2423 ax.set_ylim(-amax, +amax)
2424 plt.show()
2425 return
2427 print()
2428 print("try DataLoader:")
2429 with DataLoader(filepath, 2.0, 1.0, 1) as data:
2430 print(f'sampling rate: {data.rate:g}')
2431 print(f'frames : {len(data)} {data.shape[0]}')
2432 nframes = int(1.0 * data.rate)
2433 # forward:
2434 for i in range(0, len(data), nframes):
2435 print(f'forward {i}-{i + nframes}')
2436 x = data[i:i + nframes, 0]
2437 if plot:
2438 fig, ax = plt.subplots()
2439 ax.plot((i + np.arange(len(x)))/data.rate, x)
2440 ax.set_xlabel('Time [s]')
2441 ax.set_ylabel(f'[{data.unit}]')
2442 plt.show()
2443 # and backwards:
2444 for i in reversed(range(0, len(data), nframes)):
2445 print(f'backward {i}-{i + nframes}')
2446 x = data[i:i + nframes, 0]
2447 if plot:
2448 fig, ax = plt.subplots()
2449 ax.plot((i + np.arange(len(x)))/data.rate, x)
2450 ax.set_xlabel('Time [s]')
2451 ax.set_ylabel(f'[{data.unit}]')
2452 plt.show()
2455def main(*cargs):
2456 """Call demo with command line arguments.
2458 Parameters
2459 ----------
2460 cargs: list of str
2461 Command line arguments as provided by sys.argv[1:]
2462 """
2463 import argparse
2464 parser = argparse.ArgumentParser(
2465 description='Checking thunderlab.dataloader module.')
2466 parser.add_argument('-p', dest='plot', action='store_true',
2467 help='plot loaded data')
2468 parser.add_argument('file', nargs=1, default='', type=str,
2469 help='name of data file')
2470 args = parser.parse_args(cargs)
2471 demo(args.file[0], args.plot)
2474if __name__ == "__main__":
2475 main(*sys.argv[1:])