Coverage for src/thunderlab/dataloader.py: 76%
885 statements
« prev ^ index » next coverage.py v7.6.2, created at 2024-10-09 16:02 +0000
« prev ^ index » next coverage.py v7.6.2, created at 2024-10-09 16:02 +0000
1"""Load time-series data from files.
3```
4data, rate, unit, amax = load_data('data/file.wav')
5```
7The function `load_data()` loads the whole time-series from the file
8as a numpy array of floats. First dimension is frames, second is
9channels. In contrast to the `audioio.load_audio()` function, the
10values of the data array are not restricted between -1 and 1. They can
11assume any value wihin the range `-amax` to `+amax` with the returned
12`unit`.
14```
15data = DataLoader('data/file.wav', 60.0)
16```
17or
18```
19with DataLoader('data/file.wav', 60.0) as data:
20```
21 Create a `DataLoader` object that loads chunks of 60 seconds long data
22on demand. `data` can be used like a read-only numpy array of floats.
25## Supported file formats
27- python pickle files
28- numpy .npz files
29- matlab .mat files
30- audio files via [`audioio`](https://github.com/bendalab/audioio) package
31- LabView .scandat files
32- relacs trace*.raw files (https://www.relacs.net)
33- fishgrid traces-*.raw files (https://github.com/bendalab/fishgrid)
36## Metadata
38Many file formats allow to store metadata that further describe the
39stored time series data. We handle them as nested dictionary of key-value
40pairs. Load them with the `metadata()` function:
41```
42metadata = metadata('data/file.mat')
43```
45## Markers
47Some file formats also allow to store markers that mark specific
48positions in the time series data. Load marker positions and spans (in
49the 2-D array `locs`) and label and text strings (in the 2-D array
50`labels`) with the `markers()` function:
51```
52locs, labels = markers('data.wav')
53```
55 ## Additional, format-specific functions
57- `relacs_samplerate_unit()`: retrieve sampling rate and unit from a relacs stimuli.dat file.
58- `relacs_header()`: read key-value pairs from relacs *.dat file headers.
59- `fishgrid_grids()`: retrieve grid sizes from a fishgrid.cfg file.
60- `fishgrid_spacings()`: spacing between grid electrodes.
62"""
64import os
65import sys
66import glob
67import gzip
68import numpy as np
69try:
70 import matplotlib.pyplot as plt
71except ImportError:
72 pass
73from audioio import load_audio, AudioLoader, unflatten_metadata
74from audioio import get_number_unit, get_number, get_int, get_bool, get_gain
75from audioio import default_starttime_keys, default_gain_keys
76from audioio import metadata as metadata_audioio
77from audioio import markers as markers_audioio
def relacs_samplerate_unit(filepath, channel=0):
    """Retrieve sampling rate and unit from a relacs stimuli.dat file.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.
    channel: int
        Channel (trace) number, if `filepath` does not specify a
        trace-*.raw file.

    Returns
    -------
    samplerate: float
        Sampling rate in Hertz
    unit: str
        Unit of the trace, can be empty if not found

    Raises
    ------
    IOError/FileNotFoundError:
        If the stimuli.dat file does not exist.
    ValueError:
        stimuli.dat file does not contain sampling rate.
    """
    trace = channel + 1
    relacs_dir = filepath
    # check for relacs data directory:
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
        bn = os.path.basename(filepath).lower()
        i = bn.find('.raw')
        # take the trace number from a trace-*.raw file name:
        if len(bn) > 5 and bn[0:5] == 'trace' and i > 6:
            trace = int(bn[6:i])

    # retrieve sampling rate and unit from stimuli.dat file:
    samplerate = None
    sampleinterval = None
    unit = ""

    lines = []
    stimuli_file = os.path.join(relacs_dir, 'stimuli.dat')
    if os.path.isfile(stimuli_file + '.gz'):
        stimuli_file += '.gz'
    # BUGFIX: gzip.open() with mode 'r' opens in *binary* mode and then
    # rejects the encoding argument with a ValueError. Text mode 'rt' is
    # required; it also works for the plain open() used otherwise:
    open_file = gzip.open if stimuli_file[-3:] == '.gz' else open
    with open_file(stimuli_file, 'rt', encoding='latin-1') as sf:
        # collect the header comment lines only:
        for line in sf:
            line = line.strip()
            if len(line) == 0 or line[0] != '#':
                break
            lines.append(line)

    for line in lines:
        if "unit%d" % trace in line:
            unit = line.split(':')[1].strip()
        if "sampling rate%d" % trace in line:
            value = line.split(':')[1].strip()
            samplerate = float(value.replace('Hz', ''))
        elif "sample interval%d" % trace in line:
            value = line.split(':')[1].strip()
            sampleinterval = float(value.replace('ms', ''))

    if samplerate is not None:
        return samplerate, unit
    if sampleinterval is not None:
        # sample interval is stored in milliseconds:
        return 1000/sampleinterval, unit
    raise ValueError(f'could not retrieve sampling rate from {stimuli_file}')
def relacs_header(filepath, store_empty=False, first_only=False,
                  lower_keys=False, flat=False,
                  add_sections=False):
    """Read key-value pairs from a relacs *.dat file header.

    Parameters
    ----------
    filepath: str
        A relacs *.dat file, can be also a zipped .gz file.
    store_empty: bool
        If `False` do not add meta data with empty values.
    first_only: bool
        If `False` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with sections names separated by
        '.' to make them unique.

    Returns
    -------
    data: dict
        Nested dictionary with key-value pairs of the file header.

    Raises
    ------
    IOError/FileNotFoundError:
        If `filepath` cannot be opened.
    """
    # read in header from file:
    lines = []
    if os.path.isfile(filepath + '.gz'):
        filepath += '.gz'
    # BUGFIX: gzip.open() with mode 'r' opens in binary mode and rejects
    # the encoding argument - text mode 'rt' is required (and also works
    # for the plain open() used otherwise):
    open_file = gzip.open if filepath[-3:] == '.gz' else open
    with open_file(filepath, 'rt', encoding='latin-1') as sf:
        # collect the header comment lines only:
        for line in sf:
            line = line.strip()
            if len(line) == 0 or line[0] != '#':
                break
            lines.append(line)
    # parse:
    data = {}
    cdatas = [data]        # stack of nested dictionaries
    sections = ['']        # stack of section names
    ident_offs = None      # indentation of the top level
    ident = None           # indentation increment per section level
    for line in lines:
        words = line.split(':')
        value = ':'.join(words[1:]).strip() if len(words) > 1 else ''
        if len(words) >= 1:
            key = words[0].strip('#')
            # get section level from the key's indentation:
            level = 0
            if not flat or len(value) == 0:
                nident = len(key) - len(key.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
                # close sections deeper than the current level:
                if not flat:
                    while len(cdatas) > level + 1:
                        cdatas[-1][sections.pop()] = cdatas.pop()
                else:
                    while len(sections) > level + 1:
                        sections.pop()
            # key:
            key = key.strip().strip('"')
            if lower_keys:
                key = key.lower()
            skey = key
            if add_sections:
                key = '.'.join(sections[1:] + [key])
            if len(value) == 0:
                # new sub-section:
                if flat:
                    if store_empty:
                        cdatas[-1][key] = None
                else:
                    cdatas.append({})
                    sections.append(skey)
            else:
                # key-value pair:
                value = value.strip('"')
                # NOTE(review): this condition is always True (an empty
                # string already differs from '-'); presumably
                # `len(value) > 0 and value != '-'` was intended -
                # kept as is to preserve behavior:
                if len(value) > 0 or value != '-' or store_empty:
                    if len(value) > 0 and value[0] == '[' and value[-1] == ']':
                        # split bracketed values into a list of strings:
                        value = [v.strip() for v in value.lstrip('[').rstrip(']').split(',')]
                        if first_only:
                            value = value[0]
                    cdatas[-1][key] = value
    # close all remaining open sections:
    while len(cdatas) > 1:
        cdatas[-1][sections.pop()] = cdatas.pop()
    return data
def check_relacs(file_path):
    """Check for valid relacs file.

    Parameters
    ----------
    file_path: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    is_relacs: boolean
        `True` if `file_path` is a valid relacs directory or is a file therein.
    """
    # resolve the relacs data directory:
    data_dir = file_path if os.path.isdir(file_path) \
        else os.path.dirname(file_path)
    # a valid relacs directory contains a stimuli.dat and a trace-1.raw
    # file, either plain or gzipped:
    has_stimuli = any(os.path.isfile(os.path.join(data_dir, name))
                      for name in ('stimuli.dat', 'stimuli.dat.gz'))
    has_trace = any(os.path.isfile(os.path.join(data_dir, name))
                    for name in ('trace-1.raw', 'trace-1.raw.gz'))
    return has_stimuli and has_trace
def relacs_trace_files(file_path):
    """Expand file path for relacs data to appropriate trace*.raw file names.

    Parameters
    ----------
    file_path: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    trace_file_paths: list of str
        List of relacs trace*.raw files.
    """
    data_dir = file_path if os.path.isdir(file_path) \
        else os.path.dirname(file_path)
    paths = []
    # collect consecutively numbered trace files, plain or gzipped,
    # and stop at the first gap:
    for k in range(1, 10001):
        base = os.path.join(data_dir, f'trace-{k}.raw')
        if os.path.isfile(base):
            paths.append(base)
        elif os.path.isfile(base + '.gz'):
            paths.append(base + '.gz')
        else:
            break
    return paths
def load_relacs(file_path, amax=1.0):
    """Load traces that have been recorded with relacs (https://github.com/relacs/relacs).

    Parameters
    ----------
    file_path: str
        Path to a relacs data directory, or a file in a relacs data directory.
    amax: float
        The amplitude range of the data.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz
    unit: str
        Unit of the data
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        - Invalid name for relacs trace-*.raw file.
        - Sampling rates of traces differ.
        - Unit of traces differ.
    """
    paths = relacs_trace_files(file_path)
    nchannels = len(paths)
    data = None
    nframes = 0
    rate = None
    unit = ''
    for channel, path in enumerate(sorted(paths)):
        # each trace file holds raw little-endian float32 samples:
        if path[-3:] == '.gz':
            with gzip.open(path, 'rb') as sf:
                trace = np.frombuffer(sf.read(), dtype=np.float32)
        else:
            trace = np.fromfile(path, np.float32)
        if data is None:
            # allocate the output on the first trace:
            nframes = len(trace)
            data = np.zeros((nframes, nchannels))
        n = min(len(trace), nframes)
        data[:n, channel] = trace[:n]
        # sampling rate and unit must agree between all traces:
        crate, cunit = relacs_samplerate_unit(path, channel)
        if rate is None:
            rate = crate
        elif crate != rate:
            raise ValueError('sampling rates of traces differ')
        if len(unit) == 0:
            unit = cunit
        elif cunit != unit:
            raise ValueError('unit of traces differ')
    return data, rate, unit, amax
def metadata_relacs(file_path, store_empty=False, first_only=False,
                    lower_keys=False, flat=False, add_sections=False):
    """ Read meta-data of a relacs data set.

    Parameters
    ----------
    file_path: str
        A relacs data directory or a file therein.
    store_empty: bool
        If `False` do not add meta data with empty values.
    first_only: bool
        If `False` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with sections names separated by
        '.' to make them unique.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
        Empty if no info.dat file is found.
    """
    relacs_dir = file_path
    if not os.path.isdir(file_path):
        relacs_dir = os.path.dirname(file_path)
    info_path = os.path.join(relacs_dir, 'info.dat')
    if not os.path.exists(info_path):
        # BUGFIX: previously returned a tuple `dict(), []`, inconsistent
        # with the documented dict return of the normal path below:
        return {}
    data = relacs_header(info_path, store_empty, first_only,
                         lower_keys, flat, add_sections)
    return data
def fishgrid_spacings(metadata, unit='m'):
    """Spacing between grid electrodes.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.
    unit: str
        Unit in which to return the spacings.

    Returns
    -------
    grid_dist: list of tuple of float
        For each grid the distances between rows and columns in `unit`.
    """
    spacings = []
    # fishgrid supports up to four grids:
    for grid in range(1, 5):
        row_dist = get_number(metadata, unit, f'RowDistance{grid}', default=0)
        col_dist = get_number(metadata, unit, f'ColumnDistance{grid}', default=0)
        nrows = get_int(metadata, f'Rows{grid}', default=0)
        ncols = get_int(metadata, f'Columns{grid}', default=0)
        # a grid counts if it is marked used or has a non-empty layout:
        if get_bool(metadata, f'Used{grid}', default=False) or \
                (ncols > 0 and nrows > 0):
            spacings.append((row_dist, col_dist))
    return spacings
def fishgrid_grids(metadata):
    """Retrieve grid sizes from a fishgrid.cfg file.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.

    Returns
    -------
    grids: list of tuple of int
        For each grid the number of rows and columns.
    """
    grids = []
    # fishgrid supports up to four grids:
    for grid in range(1, 5):
        nrows = get_int(metadata, f'Rows{grid}', default=0)
        ncols = get_int(metadata, f'Columns{grid}', default=0)
        # a grid counts if it is marked used or has a non-empty layout:
        if get_bool(metadata, f'Used{grid}', default=False) or \
                (ncols > 0 and nrows > 0):
            grids.append((nrows, ncols))
    return grids
def check_fishgrid(file_path):
    """Check for valid fishgrid file (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    file_path: str
        Path to a fishgrid data directory or a file in a fishgrid
        data directory.

    Returns
    -------
    is_fishgrid: bool
        `True` if `file_path` is a valid fishgrid data directory or
        a file therein.
    """
    # resolve the fishgrid data directory:
    data_dir = file_path if os.path.isdir(file_path) \
        else os.path.dirname(file_path)
    # a valid fishgrid directory has a config file ...
    if not os.path.isfile(os.path.join(data_dir, 'fishgrid.cfg')):
        return False
    # ... and at least one raw trace file:
    return (os.path.isfile(os.path.join(data_dir, 'traces-grid1.raw'))
            or os.path.isfile(os.path.join(data_dir, 'traces.raw')))
def fishgrid_trace_files(file_path):
    """Expand file paths for fishgrid data to appropriate traces*.raw file names.

    Parameters
    ----------
    file_path: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    trace_file_paths: list of str
        List of fishgrid traces*.raw files.
    """
    # resolve the fishgrid data directory:
    data_dir = file_path if os.path.isdir(file_path) \
        else os.path.dirname(file_path)
    # collect consecutively numbered per-grid trace files:
    paths = []
    for grid in range(1, 10001):
        fname = os.path.join(data_dir, f'traces-grid{grid}.raw')
        if not os.path.isfile(fname):
            break
        paths.append(fname)
    # fall back to a single traces.raw file:
    if not paths:
        fname = os.path.join(data_dir, 'traces.raw')
        if os.path.isfile(fname):
            paths.append(fname)
    return paths
def load_fishgrid(file_path):
    """Load traces that have been recorded with fishgrid (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    file_path: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    FileNotFoundError:
        Invalid or not existing fishgrid files.
    """
    paths = fishgrid_trace_files(file_path)
    if len(paths) == 0:
        raise FileNotFoundError(f'no fishgrid files specified')
    md = metadata_fishgrid(file_path)
    # number of channels per grid from the metadata:
    grid_sizes = [rows*cols for rows, cols in fishgrid_grids(md)]
    grid_channels = [grid_sizes[g] for g in range(len(paths))]
    nchannels = sum(grid_channels)

    # load traces-grid*.raw files:
    data = None
    nframes = 0
    offs = 0
    rate = get_number(md, 'Hz', 'AISampleRate')
    for path, nchan in zip(paths, grid_channels):
        # each file holds multiplexed float32 samples of one grid:
        traces = np.fromfile(path, np.float32).reshape((-1, nchan))
        if data is None:
            # allocate the output on the first grid:
            nframes = len(traces)
            data = np.zeros((nframes, nchannels))
        n = min(len(traces), nframes)
        data[:n, offs:offs + nchan] = traces[:n, :]
        offs += nchan
    amax, unit = get_number_unit(md, 'AIMaxVolt')
    return data, rate, unit, amax
# add fishgrid keys:
# Make audioio's metadata helpers aware of fishgrid-specific keys:
# the recording start time is stored under 'StartDate'/'StartTime',
# and the gain under 'AIMaxVolt' (inserted first so it takes
# precedence over audioio's default gain keys).
default_starttime_keys.append(['StartDate', 'StartTime'])
default_gain_keys.insert(0, 'AIMaxVolt')
def metadata_fishgrid(file_path):
    """ Read meta-data of a fishgrid data set.

    Parses the fishgrid.cfg file. Both the old '----'-section style and
    the newer '*'-section style are supported.

    Parameters
    ----------
    file_path: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
        Empty if no fishgrid.cfg file is found.
    """
    fishgrid_dir = file_path
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(file_path)
    path = os.path.join(fishgrid_dir, 'fishgrid.cfg')
    # read in header from file:
    lines = []
    if os.path.isfile(path + '.gz'):
        path += '.gz'
    if not os.path.exists(path):
        return {}
    # BUGFIX: gzip.open() with mode 'r' opens in binary mode and rejects
    # the encoding argument - text mode 'rt' is required (and also works
    # for the plain open() used otherwise):
    open_file = gzip.open if path[-3:] == '.gz' else open
    with open_file(path, 'rt', encoding='latin-1') as sf:
        for line in sf:
            lines.append(line)
    # parse:
    data = {}
    cdatas = [data]        # stack of nested dictionaries
    ident_offs = None      # indentation of the top section level
    ident = None           # indentation increment per section level
    old_style = False      # file uses the old '----' section headers
    grid_n = False         # grid number of the current old-style section
    for line in lines:
        if len(line.strip()) == 0:
            continue
        if line[0] == '*':
            # new-style top-level section:
            key = line[1:].strip()
            data[key] = {}
            cdatas = [data, data[key]]
        elif '----' in line:
            # old-style section header:
            old_style = True
            key = line.strip().strip(' -').replace('&', '')
            if key.upper() == 'SETUP':
                key = 'Grid 1'
            grid_n = False
            if key[:4].lower() == 'grid':
                grid_n = key[5]
            cdatas = cdatas[:2]
            cdatas[1][key] = {}
            cdatas.append(cdatas[1][key])
        else:
            words = line.split(':')
            key = words[0].strip().strip('"')
            value = None
            if len(words) > 1 and (len(words[1].strip()) > 0 or old_style):
                value = ':'.join(words[1:]).strip().strip('"')
            if old_style:
                if value is None:
                    # new subsection:
                    cdatas = cdatas[:3]
                    cdatas[2][key] = {}
                    cdatas.append(cdatas[2][key])
                else:
                    # key-value pair; make keys unique across grids:
                    if grid_n and key[-1] != grid_n:
                        key = key + grid_n
                    cdatas[-1][key] = value
            else:
                # get section level from indentation:
                level = 0
                nident = len(line) - len(line.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
                # close sections deeper than the current level:
                cdatas = cdatas[:2 + level]
                if value is None:
                    # new section:
                    cdatas[-1][key] = {}
                    cdatas.append(cdatas[-1][key])
                else:
                    # key-value pair:
                    cdatas[-1][key] = value.replace(r'\n', '\n')
    # remove unused grids:
    fgm = data.get('FishGrid', {})
    for i in range(4):
        gs = f'Grid {i+1}'
        if gs in fgm:
            gm = fgm[gs]
            us = f'Used{i+1}'
            if us in gm and gm[us].upper() == 'FALSE':
                del fgm[gs]
    return data
def markers_fishgrid(file_path):
    """ Read markers of a fishgrid data set.

    Markers are read from the timestamps.dat file next to the
    fishgrid.cfg file.

    Parameters
    ----------
    file_path: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    def add_marker():
        # Append the currently collected marker to locs/labels.
        # Sample indices in timestamps.dat count multiplexed samples,
        # so divide by the number of channels to get frame indices:
        if 'index1' in marker:
            index1 = int(marker['index1'])//nchannels
        else:
            index1 = int(marker['index'])//nchannels
        span1 = int(marker.get('span1', 0))//nchannels
        locs.append([index1, span1])
        ls = marker.get('label', 'M')
        cs = marker.get('comment', '')
        labels.append([ls, cs])

    fishgrid_dir = file_path
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(file_path)
    path = os.path.join(fishgrid_dir, 'timestamps.dat')
    if not os.path.isfile(path):
        # no timestamps file: return empty marker arrays:
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
    # get number of channels:
    md = metadata_fishgrid(path.replace('timestamps.dat', 'fishgrid.cfg'))
    grids = fishgrid_grids(md)
    nchannels = np.prod(grids[0])
    # read timestamps:
    locs = []
    labels = []
    marker = {}
    with open(path, 'r') as sf:
        for line in sf:
            if len(line.strip()) == 0:
                # blank line terminates a marker entry:
                add_marker()
                marker = {}
            else:
                # collect "key: value" pairs of the current marker:
                words = line.split(':')
                if len(words) > 1:
                    v = words[1].strip()
                    v = v.strip('"')
                    marker[words[0].strip().lower()] = v
    # flush a final marker that was not followed by a blank line:
    if len(marker) > 0:
        add_marker()
    if len(locs) > 2:
        # drop the first and last entry - presumably the recording
        # start/stop timestamps (TODO: confirm against fishgrid docs):
        return np.array(locs[1:-1]), np.array(labels[1:-1])
    else:
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
def check_container(filepath):
    """Check if file is a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_container: bool
        `True`, if `filepath` is a supported container format.
    """
    # compare the file extension case-insensitively:
    _, ext = os.path.splitext(filepath)
    return ext.lower() in ('.pkl', '.npz', '.mat')
def extract_container_data(data_dict, datakey=None,
                           samplekey=['rate', 'Fs', 'fs'],
                           timekey=['time'], amplkey=['amax'], unitkey='unit',
                           amax=1.0, unit='a.u.'):
    """Extract data from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is an 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str or list of str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
    amax: None or float
        If specified and no amplitude range has been found in `data_dict`,
        then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in `data_dict`,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range in `unit`.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    def as_keys(key):
        # normalize a single key to a sequence of keys:
        return key if isinstance(key, (list, tuple, np.ndarray)) else (key,)

    samplekey = as_keys(samplekey)
    timekey = as_keys(timekey)
    amplkey = as_keys(amplkey)
    # sampling rate, directly or from sampling times:
    rate = 0.0
    for skey in samplekey:
        if skey in data_dict:
            rate = float(data_dict[skey])
            break
    if rate == 0.0:
        for tkey in timekey:
            if tkey in data_dict:
                rate = 1.0/(data_dict[tkey][1] - data_dict[tkey][0])
                break
    if rate == 0.0:
        raise ValueError(f"invalid keys {', '.join(samplekey)} and {', '.join(timekey)} for requesting sampling rate or sampling times")
    # amplitude range and unit:
    for akey in amplkey:
        if akey in data_dict:
            amax = float(data_dict[akey])
            break
    if unitkey in data_dict:
        unit = data_dict[unitkey]
    # get data array:
    raw_data = np.array([])
    if datakey:
        # try the requested data keys:
        for dkey in as_keys(datakey):
            if dkey in data_dict:
                raw_data = data_dict[dkey]
                break
        if len(raw_data) == 0:
            raise ValueError(f"invalid key(s) {', '.join(as_keys(datakey))} for requesting data")
    else:
        # take the 1D/2D array with the largest dimension:
        for dkey in data_dict:
            item = data_dict[dkey]
            if hasattr(item, 'shape') and 1 <= len(item.shape) <= 2 and \
                    np.max(item.shape) > np.max(raw_data.shape):
                raw_data = item
        if len(raw_data) == 0:
            raise ValueError('no data found')
    # make 2D with time along the first dimension:
    if len(raw_data.shape) == 1:
        raw_data = raw_data.reshape(-1, 1)
    if np.argmax(raw_data.shape) > 0:
        raw_data = raw_data.T
    # scale integer data to floats in the range -amax to +amax:
    if raw_data.dtype == np.dtype('int16'):
        data = raw_data.astype('float32')
        data *= amax/2**15
    elif raw_data.dtype == np.dtype('int32'):
        data = raw_data.astype(float)
        data *= amax/2**31
    elif raw_data.dtype == np.dtype('int64'):
        data = raw_data.astype(float)
        data *= amax/2**63
    else:
        data = raw_data
    return data, rate, unit, amax
def load_container(file_path, datakey=None,
                   samplekey=['rate', 'Fs', 'fs'],
                   timekey=['time'], amplkey=['amax'], unitkey='unit',
                   amax=1.0, unit='a.u.'):
    """Load data from a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    file_path: str
        Path of the file to load.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is an 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
        If `unitkey` is not a valid key, then return `unitkey` as the `unit`.
    amax: None or float
        If specified and no amplitude range has been found in the data
        container, then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in the data container,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    # read the container into a dictionary of variables:
    ext = os.path.splitext(file_path)[1]
    if ext == '.pkl':
        import pickle
        with open(file_path, 'rb') as f:
            data_dict = pickle.load(f)
    elif ext == '.npz':
        data_dict = np.load(file_path)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(file_path, squeeze_me=True)
    else:
        data_dict = {}
    return extract_container_data(data_dict, datakey, samplekey,
                                  timekey, amplkey, unitkey, amax, unit)
def extract_container_metadata(data_dict, metadatakey=['metadata', 'info']):
    """ Extract metadata from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    if not isinstance(metadatakey, (list, tuple, np.ndarray)):
        metadatakey = (metadatakey,)
    # a single variable holding all the metadata:
    for mkey in metadatakey:
        if mkey in data_dict:
            return data_dict[mkey]
    # otherwise collect flattened keys like 'metadata__section__key':
    flat_md = {}
    for mkey in metadatakey:
        prefix = mkey + '__'
        for dkey in data_dict:
            if dkey[:len(prefix)] == prefix:
                v = data_dict[dkey]
                # unwrap 0-dim numpy scalars:
                if hasattr(v, 'size') and v.ndim == 0:
                    v = v.item()
                flat_md[dkey[len(prefix):]] = v
    if len(flat_md) > 0:
        return unflatten_metadata(flat_md, sep='__')
    return flat_md
def metadata_container(file_path, metadatakey=['metadata', 'info']):
    """ Read meta-data of a container file.

    Parameters
    ----------
    file_path: str
        A container file.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    # read the container into a dictionary of variables:
    ext = os.path.splitext(file_path)[1]
    if ext == '.pkl':
        import pickle
        with open(file_path, 'rb') as f:
            data_dict = pickle.load(f)
    elif ext == '.npz':
        data_dict = np.load(file_path)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(file_path, squeeze_me=True)
    else:
        data_dict = {}
    return extract_container_metadata(data_dict, metadatakey)
def extract_container_markers(data_dict, poskey=['positions'],
                              spanskey=['spans'], labelskey=['labels'],
                              descrkey=['descriptions']):
    """ Extract markers from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    def as_keys(key):
        # normalize a single key to a sequence of keys:
        return key if isinstance(key, (list, tuple, np.ndarray)) else (key,)

    # positions define the number of markers:
    locs = np.zeros((0, 2), dtype=int)
    for key in as_keys(poskey):
        if key in data_dict:
            locs = np.zeros((len(data_dict[key]), 2), dtype=int)
            locs[:, 0] = data_dict[key]
            break
    for key in as_keys(spanskey):
        if key in data_dict:
            locs[:, 1] = data_dict[key]
            break
    # labels define the number of label rows:
    labels = np.zeros((0, 2), dtype=object)
    for key in as_keys(labelskey):
        if key in data_dict:
            labels = np.zeros((len(data_dict[key]), 2), dtype=object)
            labels[:, 0] = data_dict[key]
            break
    for key in as_keys(descrkey):
        if key in data_dict:
            labels[:, 1] = data_dict[key]
            break
    return locs, labels
def markers_container(file_path, poskey=['positions'],
                      spanskey=['spans'], labelskey=['labels'],
                      descrkey=['descriptions']):
    """ Read markers of a container file.

    Parameters
    ----------
    file_path: str
        A container file.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    # read the container into a dictionary of variables:
    ext = os.path.splitext(file_path)[1]
    if ext == '.pkl':
        import pickle
        with open(file_path, 'rb') as f:
            data_dict = pickle.load(f)
    elif ext == '.npz':
        data_dict = np.load(file_path)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(file_path, squeeze_me=True)
    else:
        data_dict = {}
    return extract_container_markers(data_dict, poskey, spanskey,
                                     labelskey, descrkey)
def check_raw(filepath):
    """Check if file is a raw file.

    The following extensions are interpreted as raw files:

    - raw files (*.raw)
    - LabView scandata (*.scandat)
    - matlab files (*.mat) are also accepted here as a fallback

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_raw: bool
        `True`, if `filepath` is a raw format.
    """
    # case-insensitive comparison of the file extension:
    return os.path.splitext(filepath)[1].lower() in ('.raw', '.scandat', '.mat')
def load_raw(file_path, rate=44000, channels=1, dtype=np.float32,
             amax=1.0, unit='a.u.'):
    """Load data from a raw file.

    Raw files just contain the data and absolutely no metadata, not
    even the sampling rate, number of channels, etc.
    Supported file formats are:

    - raw files (*.raw)
    - LabView scandata (*.scandat)

    Parameters
    ----------
    file_path: str
        Path of the file to load.
    rate: float
        Sampling rate of the data in Hertz.
    channels: int
        Number of channels multiplexed in the data.
    dtype: str or numpy.dtype
        The data type stored in the file.
    amax: float
        The amplitude range of the data.
    unit: str
        The unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.
    """
    # normalize so that both dtype strings and dtype objects work below:
    dtype = np.dtype(dtype)
    raw_data = np.fromfile(file_path, dtype=dtype).reshape(-1, channels)
    # recode integer data to floats in the range -amax to +amax:
    if dtype == np.dtype('int16'):
        data = raw_data.astype('float32')
        data *= amax/2**15
    elif dtype == np.dtype('int32'):
        data = raw_data.astype(float)
        data *= amax/2**31
    elif dtype == np.dtype('int64'):
        data = raw_data.astype(float)
        data *= amax/2**63
    else:
        data = raw_data
    return data, rate, unit, amax
def load_audioio(file_path, verbose=0, gainkey=default_gain_keys, sep='.',
                 amax=1.0, unit='a.u.'):
    """Load data from an audio file.

    See the
    [`load_audio()`](https://bendalab.github.io/audioio/api/audioloader.html#audioio.audioloader.load_audio)
    function of the [`audioio`](https://github.com/bendalab/audioio)
    package for more infos.

    Parameters
    ----------
    file_path: str
        Path of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    gainkey: str or list of str
        Key in the file's metadata that holds some gain information.
        If found, the data will be multiplied with the gain,
        and if available, the corresponding unit is returned.
        See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
    sep: str
        String that separates section names in `gainkey`.
    amax: float
        If specified and no gain has been found in the metadata,
        then use this as the amplitude range.
    unit: str
        If specified and no gain has been found in the metadata,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data if found in the metadata (see `gainkey`),
        otherwise `unit`.
    amax: float
        Maximum amplitude of data range.
    """
    # load the raw audio data:
    data, rate = load_audio(file_path, verbose)
    # retrieve gain and unit from the file's metadata:
    meta = metadata_audioio(file_path)
    amax, unit = get_gain(meta, gainkey, sep, amax, unit)
    # scale data only if a non-unity gain applies:
    if amax != 1.0:
        data *= amax
    return data, rate, unit, amax
data_loader_funcs = (
    ('relacs', check_relacs, load_relacs, metadata_relacs, None),
    ('fishgrid', check_fishgrid, load_fishgrid, metadata_fishgrid, markers_fishgrid),
    ('container', check_container, load_container, metadata_container, markers_container),
    ('raw', check_raw, load_raw, None, None),
    ('audioio', None, load_audioio, metadata_audioio, markers_audioio),
    )
"""List of implemented load functions.

Each element is a tuple with five entries: the data format's name, its
check function (or None to match any file), its load function, its
metadata function (or None if not available), and its markers function
(or None if not available). The formats are tried in this order;
'audioio' serves as the final fallback since its check function is None.
"""
def load_data(file_path, verbose=0, **kwargs):
    """Load time-series data from a file.

    Parameters
    ----------
    file_path: str
        Path and name of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.
        For example:
        - `amax`: the amplitude range of the data.
        - `unit`: the unit of the data.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        `file_path` is empty string.
    """
    if len(file_path) == 0:
        raise ValueError('input argument file_path is empty string.')
    # try the formats in order; the first one that matches loads the data:
    for fmt, check_fn, load_fn, _, _ in data_loader_funcs:
        if check_fn is not None and not check_fn(file_path):
            continue
        data, rate, unit, amax = load_fn(file_path, **kwargs)
        if verbose > 0:
            print(f'loaded {fmt} data from file "{file_path}"')
            if verbose > 1:
                print(f'  sampling rate: {rate:g} Hz')
                print(f'  channels     : {data.shape[1]}')
                print(f'  frames       : {len(data)}')
                print(f'  range        : {amax:g}{unit}')
        return data, rate, unit, amax
    # unreachable in practice, since the audioio entry matches any file:
    return np.zeros((0, 1)), 0.0, '', 1.0
def metadata(file_path, **kwargs):
    """ Read meta-data from a data file.

    Parameters
    ----------
    file_path: str
        The full path and name of the file to load. For some file
        formats several files can be provided in a list.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.

    Returns
    -------
    meta_data: nested dict
        Meta data contained in the file. Keys of the nested
        dictionaries are always strings. If the corresponding
        values are dictionaries, then the key is the section name
        of the metadata contained in the dictionary. All other
        types of values are values for the respective key. In
        particular they are strings, or list of strings. But other
        simple types like ints or floats are also allowed.

    Raises
    ------
    ValueError:
        `file_path` is empty string.
    """
    if len(file_path) == 0:
        raise ValueError('input argument file_path is empty string.')
    # the first matching format that provides a metadata function wins;
    # formats without a metadata function are skipped even if they match:
    for _, check_fn, _, metadata_fn, _ in data_loader_funcs:
        if metadata_fn is None:
            continue
        if check_fn is None or check_fn(file_path):
            return metadata_fn(file_path, **kwargs)
    return {}
def markers(file_path):
    """ Read markers of a data file.

    Parameters
    ----------
    file_path: str or file handle
        The data file.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).

    Raises
    ------
    ValueError:
        `file_path` is empty string.
    """
    if len(file_path) == 0:
        raise ValueError('input argument file_path is empty string.')
    # the first matching format that provides a markers function wins;
    # formats without a markers function are skipped even if they match:
    for _, check_fn, _, _, markers_fn in data_loader_funcs:
        if markers_fn is None:
            continue
        if check_fn is None or check_fn(file_path):
            return markers_fn(file_path)
    # no markers available:
    return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
class DataLoader(AudioLoader):
    """Buffered reading of time-series data for random access of the data in the file.

    This allows for reading very large data files that do not fit into
    memory. A `DataLoader` instance can be used like a huge
    read-only numpy array, i.e.
    ```
    data = DataLoader('path/to/data/file.dat')
    x = data[10000:20000,0]
    ```
    The first index specifies the frame, the second one the channel.

    `DataLoader` first determines the format of the data file and then
    opens the file (first line). It then reads data from the file as
    necessary for the requested data (second line).

    Supported file formats are

    - audio files via `audioio` package
    - python pickle files
    - numpy .npz files
    - matlab .mat files
    - relacs trace*.raw files (www.relacs.net)
    - fishgrid traces-*.raw files

    Reading sequentially through the file is always possible. If
    previous data are requested, then the file is read from the
    beginning. This might slow down access to previous data
    considerably. Use the `backsize` argument to the open functions to
    make sure some data are loaded before the requested frame. Then a
    subsequent access to the data within `backsize` seconds before that
    frame can still be handled without the need to reread the file
    from the beginning.

    Usage:
    ------
    ```
    import thunderlab.dataloader as dl
    with dl.DataLoader(file_path, 60.0, 10.0) as data:
        # do something with the content of the file:
        x = data[0:10000,0]
        y = data[10000:20000,0]
        z = x + y
    ```

    Normal open and close:
    ```
    data = dl.DataLoader(file_path, 60.0)
    x = data[:,:] # read the whole file
    data.close()
    ```
    that is the same as:
    ```
    data = dl.DataLoader()
    data.open(file_path, 60.0)
    ```

    Parameters
    ----------
    file_path: str
        Name of the file.
    buffersize: float
        Size of internal buffer in seconds.
    backsize: float
        Part of the buffer to be loaded before the requested start index in seconds.
    verbose: int
        If larger than zero show detailed error/warning messages.
    meta_kwargs: dict
        Keyword arguments that are passed on to the _load_metadata() function.

    Attributes
    ----------
    rate: float
        The sampling rate of the data in Hertz.
    channels: int
        The number of channels that are read in.
    frames: int
        The number of frames in the file.
    format: str or None
        Format of the audio file.
    encoding: str or None
        Encoding/subtype of the audio file.
    shape: tuple
        Number of frames and channels of the data.
    ndim: int
        Number of dimensions: always 2 (frames and channels).
    unit: str
        Unit of the data.
    ampl_min: float
        Minimum amplitude the file format supports.
    ampl_max: float
        Maximum amplitude the file format supports.

    Methods
    -------

    - `len()`: the number of frames
    - `open()`: open a data file.
    - `open_*()`: open a data file of a specific format.
    - `close()`: close the file.
    - `metadata()`: metadata of the file.
    - `markers()`: markers of the file.
    - `set_unwrap()`: Set parameters for unwrapping clipped data.

    """

    def __init__(self, file_path=None, buffersize=10.0, backsize=0.0,
                 verbose=0, **meta_kwargs):
        super().__init__(None, buffersize, backsize,
                         verbose, **meta_kwargs)
        if file_path is not None:
            self.open(file_path, buffersize, backsize, verbose, **meta_kwargs)

    def __getitem__(self, key):
        # delegate buffered array access to AudioLoader:
        return super(DataLoader, self).__getitem__(key)

    def __next__(self):
        # delegate iteration to AudioLoader:
        return super(DataLoader, self).__next__()

    # relacs interface:        
    def open_relacs(self, file_path, buffersize=10.0, backsize=0.0,
                    verbose=0, amax=1.0):
        """Open relacs data files (www.relacs.net) for reading.

        Parameters
        ----------
        file_path: str
            Path to a relacs data directory or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        amax: float
            The amplitude range of the data.

        Raises
        ------
        ValueError: .gz files not supported.
        """
        self.verbose = verbose

        if self.sf is not None:
            self._close_relacs()

        trace_file_paths = relacs_trace_files(file_path)

        # open trace files:
        self.sf = []
        self.frames = None
        self.rate = None
        self.unit = ''
        self.filepath = None
        if len(trace_file_paths) > 0:
            self.filepath = os.path.dirname(trace_file_paths[0])
        for path in sorted(trace_file_paths):
            if path[-3:] == '.gz':
                raise ValueError('.gz files not supported')
            sf = open(path, 'rb')
            self.sf.append(sf)
            if verbose > 0:
                print(f'open_relacs(file_path) with file_path={path}')
            # file size in frames (4 bytes per float sample):
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                # traces may differ by a few frames; use the smallest count:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
            # retrieve sampling rate and unit:
            rate, us = relacs_samplerate_unit(path)
            if self.rate is None:
                self.rate = rate
            elif rate != self.rate:
                raise ValueError('sampling rates of traces differ')
            if len(self.unit) == 0:
                self.unit = us
            elif us != self.unit:
                raise ValueError('unit of traces differ')
        self.channels = len(self.sf)
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'RELACS'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        self.close = self._close_relacs
        self.load_audio_buffer = self._load_buffer_relacs
        self.ampl_min = -amax
        self.ampl_max = +amax
        self._load_metadata = self._metadata_relacs
        # TODO: load markers:
        self._locs = np.zeros((0, 2), dtype=int)
        self._labels = np.zeros((0, 2), dtype=object)
        self._load_markers = None
        return self

    def _close_relacs(self):
        """Close the relacs data files.
        """
        if self.sf is not None:
            for file in self.sf:
                file.close()
            self.sf = None

    def _load_buffer_relacs(self, r_offset, r_size, buffer):
        """Load new data from relacs data files.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        # one file per channel, 4 bytes per float sample:
        for i, file in enumerate(self.sf):
            file.seek(r_offset*4)
            data = file.read(r_size*4)
            buffer[:, i] = np.frombuffer(data, dtype=np.float32)

    def _metadata_relacs(self, store_empty=False, first_only=False):
        """ Load meta-data of a relacs data set.
        """
        info_path = os.path.join(self.filepath, 'info.dat')
        if not os.path.exists(info_path):
            return {}
        return relacs_header(info_path, store_empty, first_only)

    # fishgrid interface:
    def open_fishgrid(self, file_path, buffersize=10.0, backsize=0.0,
                      verbose=0):
        """Open fishgrid data files (https://github.com/bendalab/fishgrid) for reading.

        Parameters
        ----------
        file_path: str
            Path to a fishgrid data directory, or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        """
        self.verbose = verbose

        if self.sf is not None:
            self._close_fishgrid()

        trace_file_paths = fishgrid_trace_files(file_path)
        self.filepath = None
        if len(trace_file_paths) > 0:
            self.filepath = os.path.dirname(trace_file_paths[0])
        self._load_metadata = metadata_fishgrid
        self._load_markers = markers_fishgrid

        # open grid files:
        grids = fishgrid_grids(self.metadata())
        grid_sizes = [r*c for r,c in grids]
        self.channels = 0
        for g, path in enumerate(trace_file_paths):
            self.channels += grid_sizes[g]
        self.sf = []
        self.grid_channels = []
        self.grid_offs = []
        offs = 0
        self.frames = None
        self.rate = get_number(self.metadata(), 'Hz', 'AISampleRate')
        v, self.unit = get_number_unit(self.metadata(), 'AIMaxVolt')
        if v is not None:
            self.ampl_min = -v
            self.ampl_max = +v

        for g, path in enumerate(trace_file_paths):
            sf = open(path, 'rb')
            self.sf.append(sf)
            if verbose > 0:
                print(f'open_fishgrid(file_path) with file_path={path}')
            # channels of and offset into this grid:
            self.grid_channels.append(grid_sizes[g])
            self.grid_offs.append(offs)
            offs += grid_sizes[g]
            # file size in frames (4 bytes per float sample, all grid
            # channels multiplexed):
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4//grid_sizes[g]
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                # grids may differ by a few frames; use the smallest count:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'FISHGRID'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        self.close = self._close_fishgrid
        self.load_audio_buffer = self._load_buffer_fishgrid
        return self

    def _close_fishgrid(self):
        """Close the fishgrid data files.
        """
        if self.sf is not None:
            for file in self.sf:
                file.close()
            self.sf = None

    def _load_buffer_fishgrid(self, r_offset, r_size, buffer):
        """Load new data from fishgrid data files.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        # one file per grid, grid channels multiplexed within each file:
        for file, gchannels, goffset in zip(self.sf, self.grid_channels, self.grid_offs):
            file.seek(r_offset*4*gchannels)
            data = file.read(r_size*4*gchannels)
            buffer[:, goffset:goffset+gchannels] = np.frombuffer(data, dtype=np.float32).reshape((-1, gchannels))

    # container interface:
    def open_container(self, file_path, buffersize=10.0,
                       backsize=0.0, verbose=0, datakey=None,
                       samplekey=['rate', 'Fs', 'fs'],
                       timekey=['time'], amplkey=['amax'], unitkey='unit',
                       metadatakey=['metadata', 'info'],
                       poskey=['positions'],
                       spanskey=['spans'], labelskey=['labels'],
                       descrkey=['descriptions'],
                       amax=1.0, unit='a.u.'):
        """Open generic container file.

        Supported file formats are:

        - python pickle files (.pkl)
        - numpy files (.npz)
        - matlab files (.mat)

        Parameters
        ----------
        file_path: str
            Path to a container file.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        datakey: None, str, or list of str
            Name of the variable holding the data. If `None` take the
            variable that is an 2D array and has the largest number of
            elements.
        samplekey: str or list of str
            Name of the variable holding the sampling rate.
        timekey: str or list of str
            Name of the variable holding sampling times.
            If no sampling rate is available, the sampling rate is retrieved
            from the sampling times.
        amplkey: str or list of str
            Name of the variable holding the amplitude range of the data.
        unitkey: str
            Name of the variable holding the unit of the data.
        metadatakey: str or list of str
            Name of the variable holding the metadata.
        poskey: str or list of str
            Name of the variable holding positions of markers.
        spanskey: str or list of str
            Name of the variable holding spans of markers.
        labelskey: str or list of str
            Name of the variable holding labels of markers.
        descrkey: str or list of str
            Name of the variable holding descriptions of markers.
        amax: None or float
            If specified and no amplitude range has been found in the data
            container, then this is the amplitude range of the data.
        unit: None or str
            If specified and no unit has been found in the data container,
            then return this as the unit of the data.

        Raises
        ------
        ValueError:
            Invalid key requested.
        """
        self.verbose = verbose
        data_dict = {}
        ext = os.path.splitext(file_path)[1]
        if ext == '.pkl':
            import pickle
            with open(file_path, 'rb') as f:
                data_dict = pickle.load(f)
            self.format = 'PKL'
        elif ext == '.npz':
            data_dict = np.load(file_path)
            self.format = 'NPZ'
        elif ext == '.mat':
            from scipy.io import loadmat
            data_dict = loadmat(file_path, squeeze_me=True)
            self.format = 'MAT'
        # the whole data set is loaded into the buffer at once:
        self.buffer, self.rate, self.unit, amax = \
            extract_container_data(data_dict, datakey, samplekey,
                                   timekey, amplkey, unitkey, amax, unit)
        self.filepath = file_path
        self.channels = self.buffer.shape[1]
        self.frames = self.buffer.shape[0]
        self.shape = self.buffer.shape
        self.ndim = self.buffer.ndim
        self.size = self.buffer.size
        self.encoding = self.numpy_encodings[self.buffer.dtype]
        self.ampl_min = -amax
        self.ampl_max = +amax
        self.offset = 0
        self.buffer_changed = np.zeros(self.channels, dtype=bool)
        self.bufferframes = self.frames
        self.backsize = 0
        self.close = self._close_container
        self.load_audio_buffer = self._load_buffer_container
        self._metadata = extract_container_metadata(data_dict, metadatakey)
        self._load_metadata = None
        self._locs, self._labels = extract_container_markers(data_dict,
                                                             poskey,
                                                             spanskey,
                                                             labelskey,
                                                             descrkey)
        self._load_markers = None

    def _close_container(self):
        """Close container.

        Nothing to do, the data were read in completely by
        open_container().
        """
        pass

    def _load_buffer_container(self, r_offset, r_size, buffer):
        """Load new data from the in-memory container buffer."""
        buffer[:, :] = self.buffer[r_offset:r_offset + r_size, :]

    # raw data interface:
    def open_raw(self, file_path, buffersize=10.0, backsize=0.0,
                 verbose=0, rate=44000, channels=1, dtype=np.float32,
                 amax=1.0, unit='a.u.'):
        """Load data from a raw file.

        Raw files just contain the data and absolutely no metadata, not
        even the sampling rate, number of channels, etc.
        Supported file formats are:

        - raw files (*.raw)
        - LabView scandata (*.scandat)

        Parameters
        ----------
        file_path: str
            Path of the file to load.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        rate: float
            Sampling rate of the data in Hertz.
        channels: int
            Number of channels multiplexed in the data.
        dtype: str or numpy.dtype
            The data type stored in the file.
        amax: float
            The amplitude range of the data.
        unit: str
            The unit of the data.
        """
        self.verbose = verbose
        self.filepath = file_path
        self.sf = open(file_path, 'rb')
        if verbose > 0:
            print(f'open_raw(file_path) with file_path={file_path}')
        self.dtype = np.dtype(dtype)
        self.rate = float(rate)
        self.channels = int(channels)
        # file size in frames: all channels of one time step form one frame
        # (fix: the sample count was previously not divided by channels):
        self.sf.seek(0, os.SEEK_END)
        self.frames = self.sf.tell()//self.dtype.itemsize//self.channels
        self.sf.seek(0)
        self.shape = (self.frames, self.channels)
        self.ndim = len(self.shape)
        self.size = self.frames*self.channels
        self.format = 'RAW'
        self.encoding = self.numpy_encodings.get(self.dtype, 'UNKNOWN')
        self.unit = unit
        self.ampl_max = float(amax)
        self.ampl_min = -self.ampl_max
        self.offset = 0
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.close = self._close_raw
        self.load_audio_buffer = self._load_buffer_raw
        # raw files have no metadata and no markers:
        self._metadata = None
        self._load_metadata = None
        self._locs = None
        self._labels = None
        self._load_markers = None

    def _close_raw(self):
        """Close raw file. """
        self.sf.close()
        self.sf = None

    def _load_buffer_raw(self, r_offset, r_size, buffer):
        """Load new data from the raw file.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        # channels are multiplexed, so a frame is channels samples
        # (fix: seek and read previously ignored the channel count):
        frame_size = self.channels*self.dtype.itemsize
        self.sf.seek(r_offset*frame_size)
        raw_data = self.sf.read(r_size*frame_size)
        raw_data = np.frombuffer(raw_data, dtype=self.dtype)
        raw_data = raw_data.reshape(-1, self.channels)
        # recode integer data to floats in the range -ampl_max to +ampl_max
        # (fix: this previously referenced an undefined name `amax`):
        if self.dtype == np.dtype('int16'):
            data = raw_data.astype('float32')
            data *= self.ampl_max/2**15
        elif self.dtype == np.dtype('int32'):
            data = raw_data.astype(float)
            data *= self.ampl_max/2**31
        elif self.dtype == np.dtype('int64'):
            data = raw_data.astype(float)
            data *= self.ampl_max/2**63
        else:
            data = raw_data
        buffer[:, :] = data

    # audioio interface:
    def open_audioio(self, file_path, buffersize=10.0, backsize=0.0,
                     verbose=0, gainkey=default_gain_keys, sep='.',
                     amax=None, unit='a.u.'):
        """Open an audio file.

        See the [audioio](https://github.com/bendalab/audioio) package
        for details.

        Parameters
        ----------
        file_path: str
            Path to an audio file.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index
            in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        gainkey: str or list of str
            Key in the file's metadata that holds some gain information.
            If found, the data will be multiplied with the gain,
            and if available, the corresponding unit is returned.
            See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
        sep: str
            String that separates section names in `gainkey`.
        amax: None or float
            If specified and no gain has been found in the metadata,
            then use this as the amplitude range.
        unit: None or str
            If specified and no gain has been found in the metadata,
            then this is the unit of the data.
        """
        self.verbose = verbose
        super(DataLoader, self).open(file_path, buffersize, backsize, verbose)
        md = self.metadata()
        fac, unit = get_gain(md, gainkey, sep, amax, unit)
        if fac is None:
            self.gain_fac = 1.0
        else:
            self.gain_fac = fac
        # wrap the audio buffer loader so that the gain is applied:
        self._load_buffer_audio_org = self.load_audio_buffer
        self.load_audio_buffer = self._load_buffer_audioio
        self.ampl_min *= self.gain_fac
        self.ampl_max *= self.gain_fac
        self.unit = unit
        return self

    def _load_buffer_audioio(self, r_offset, r_size, buffer):
        """Load and scale new data from an audio file.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        self._load_buffer_audio_org(r_offset, r_size, buffer)
        buffer *= self.gain_fac

    def open(self, file_path, buffersize=10.0, backsize=0.0,
             verbose=0, **kwargs):
        """Open file with time-series data for reading.

        Parameters
        ----------
        file_path: str or list of str
            Path to a data files or directory.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index
            in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        **kwargs: dict
            Further keyword arguments that are passed on to the
            format specific opening functions.
            For example:
            - `amax`: the amplitude range of the data.
            - `unit`: the unit of the data.

        Raises
        ------
        ValueError:
            `file_path` is empty string.
        """
        # list of implemented open functions; the last flag controls
        # whether the verbose summary is printed here (audioio prints
        # its own summary):
        data_open_funcs = (
            ('relacs', check_relacs, self.open_relacs, 1),
            ('fishgrid', check_fishgrid, self.open_fishgrid, 1),
            ('container', check_container, self.open_container, 1),
            ('raw', check_raw, self.open_raw, 1),
            ('audioio', None, self.open_audioio, 0),
            )
        if len(file_path) == 0:
            raise ValueError('input argument file_path is empty string.')
        # open data:
        for name, check_file, open_file, v in data_open_funcs:
            if check_file is None or check_file(file_path):
                open_file(file_path, buffersize, backsize, verbose, **kwargs)
                if v*verbose > 1:
                    if self.format is not None:
                        print(f'  format       : {self.format}')
                    if self.encoding is not None:
                        print(f'  encoding     : {self.encoding}')
                    print(f'  sampling rate: {self.rate} Hz')
                    print(f'  channels     : {self.channels}')
                    print(f'  frames       : {self.frames}')
                    # fix: this previously referenced undefined names
                    # `amax` and `unit` instead of the instance attributes:
                    print(f'  range        : {self.ampl_max:g}{self.unit}')
                break
        return self
def demo(file_path, plot=False):
    """Demonstrate the use of `load_data()` and `DataLoader`.

    Parameters
    ----------
    file_path: str
        Path of the data file to load.
    plot: bool
        If True, plot the data loaded by `load_data()`.
    """
    print("try load_data:")
    data, rate, unit, amax = load_data(file_path, verbose=2)
    if plot:
        fig, ax = plt.subplots()
        # convert frame indices to time in seconds:
        time = np.arange(len(data))/rate
        for c in range(data.shape[1]):
            ax.plot(time, data[:,c])
        ax.set_xlabel('Time [s]')
        ax.set_ylabel(f'[{unit}]')
        if amax is not None and np.isfinite(amax):
            ax.set_ylim(-amax, +amax)
        plt.show()
        # NOTE(review): this early return skips the DataLoader demo below
        # whenever plotting was requested, making the `if plot:` branches
        # in the loops below unreachable — confirm this is intended.
        return
    print('')
    print("try DataLoader:")
    with DataLoader(file_path, 2.0, 1.0, 1) as data:
        print('sampling rate: %g' % data.rate)
        print('frames : %d %d' % (len(data), data.shape[0]))
        # read the data in chunks of one second:
        nframes = int(1.0 * data.rate)
        # forward:
        for i in range(0, len(data), nframes):
            print('forward %d-%d' % (i, i + nframes))
            x = data[i:i + nframes, 0]
            if plot:
                fig, ax = plt.subplots()
                ax.plot((i + np.arange(len(x)))/data.rate, x)
                ax.set_xlabel('Time [s]')
                ax.set_ylabel(f'[{data.unit}]')
                plt.show()
        # and backwards:
        for i in reversed(range(0, len(data), nframes)):
            print('backward %d-%d' % (i, i + nframes))
            x = data[i:i + nframes, 0]
            if plot:
                fig, ax = plt.subplots()
                ax.plot((i + np.arange(len(x)))/data.rate, x)
                ax.set_xlabel('Time [s]')
                ax.set_ylabel(f'[{data.unit}]')
                plt.show()
def main(*cargs):
    """Call demo with command line arguments.

    Parameters
    ----------
    cargs: list of str
        Command line arguments as provided by sys.argv[1:]
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Checking thunderlab.dataloader module.')
    parser.add_argument('-p', dest='plot', action='store_true',
                        help='plot loaded data')
    parser.add_argument('file', nargs=1, default='', type=str,
                        help='name of data file')
    ns = parser.parse_args(cargs)
    demo(ns.file[0], ns.plot)
if __name__ == "__main__":
    # run the command line demo on the arguments passed to the script:
    main(*sys.argv[1:])