Coverage for src/thunderlab/dataloader.py: 76%
885 statements
« prev ^ index » next coverage.py v7.6.8, created at 2024-11-29 17:59 +0000
« prev ^ index » next coverage.py v7.6.8, created at 2024-11-29 17:59 +0000
1"""Load time-series data from files.
3```
4data, rate, unit, amax = load_data('data/file.wav')
5```
The function `load_data()` loads the whole time-series from the file
8as a numpy array of floats. First dimension is frames, second is
9channels. In contrast to the `audioio.load_audio()` function, the
10values of the data array are not restricted between -1 and 1. They can
assume any value within the range `-amax` to `+amax` with the returned
12`unit`.
14```
15data = DataLoader('data/file.wav', 60.0)
16```
17or
18```
19with DataLoader('data/file.wav', 60.0) as data:
20```
Create a `DataLoader` object that loads chunks of data 60 seconds long
22on demand. `data` can be used like a read-only numpy array of floats.
25## Supported file formats
27- python pickle files
28- numpy .npz files
29- matlab .mat files
30- audio files via [`audioio`](https://github.com/bendalab/audioio) package
31- LabView .scandat files
32- relacs trace*.raw files (https://www.relacs.net)
33- fishgrid traces-*.raw files (https://github.com/bendalab/fishgrid)
36## Metadata
38Many file formats allow to store metadata that further describe the
39stored time series data. We handle them as nested dictionary of key-value
40pairs. Load them with the `metadata()` function:
41```
42metadata = metadata('data/file.mat')
43```
45## Markers
47Some file formats also allow to store markers that mark specific
48positions in the time series data. Load marker positions and spans (in
49the 2-D array `locs`) and label and text strings (in the 2-D array
50`labels`) with the `markers()` function:
51```
52locs, labels = markers('data.wav')
53```
## Additional, format-specific functions
57- `extract_container_metadata()`: extract metadata from dictionary loaded from a container file.
58- `relacs_samplerate_unit()`: retrieve sampling rate and unit from a relacs stimuli.dat file.
59- `relacs_header()`: read key-value pairs from relacs *.dat file headers.
60- `fishgrid_grids()`: retrieve grid sizes from a fishgrid.cfg file.
61- `fishgrid_spacings()`: spacing between grid electrodes.
63"""
65import os
66import sys
67import glob
68import gzip
69import numpy as np
70try:
71 import matplotlib.pyplot as plt
72except ImportError:
73 pass
74from audioio import load_audio, AudioLoader, unflatten_metadata
75from audioio import get_number_unit, get_number, get_int, get_bool, get_gain
76from audioio import default_starttime_keys, default_gain_keys
77from audioio import metadata as metadata_audioio
78from audioio import markers as markers_audioio
def relacs_samplerate_unit(filepath, channel=0):
    """Retrieve sampling rate and unit from a relacs stimuli.dat file.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.
    channel: int
        Channel (trace) number, if `filepath` does not specify a
        trace-*.raw file.

    Returns
    -------
    samplerate: float
        Sampling rate in Hertz.
    unit: str
        Unit of the trace, can be empty if not found.

    Raises
    ------
    IOError/FileNotFoundError:
        If the stimuli.dat file does not exist.
    ValueError:
        stimuli.dat file does not contain sampling rate.
    """
    trace = channel + 1
    relacs_dir = filepath
    # check for relacs data directory:
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
        bn = os.path.basename(filepath).lower()
        i = bn.find('.raw')
        # a trace-*.raw file name overrides the channel argument:
        if len(bn) > 5 and bn[0:5] == 'trace' and i > 6:
            trace = int(bn[6:i])

    # retrieve sampling rate and unit from stimuli.dat file:
    samplerate = None
    sampleinterval = None
    unit = ""

    # read the initial '#' comment lines of the stimuli.dat header:
    lines = []
    stimuli_file = os.path.join(relacs_dir, 'stimuli.dat')
    if os.path.isfile(stimuli_file + '.gz'):
        stimuli_file += '.gz'
    if stimuli_file[-3:] == '.gz':
        # BUGFIX: mode must be 'rt' - gzip.open() in binary mode ('r')
        # does not accept an encoding argument and raises a ValueError.
        with gzip.open(stimuli_file, 'rt', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    else:
        with open(stimuli_file, 'r', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)

    for line in lines:
        if "unit%d" % trace in line:
            unit = line.split(':')[1].strip()
        if "sampling rate%d" % trace in line:
            value = line.split(':')[1].strip()
            samplerate = float(value.replace('Hz',''))
        elif "sample interval%d" % trace in line:
            value = line.split(':')[1].strip()
            sampleinterval = float(value.replace('ms',''))

    if samplerate is not None:
        return samplerate, unit
    if sampleinterval is not None:
        # sample interval is given in milliseconds:
        return 1000/sampleinterval, unit
    raise ValueError(f'could not retrieve sampling rate from {stimuli_file}')
def relacs_header(filepath, store_empty=False, first_only=False,
                  lower_keys=False, flat=False,
                  add_sections=False):
    """Read key-value pairs from a relacs *.dat file header.

    Parameters
    ----------
    filepath: str
        A relacs *.dat file, can be also a zipped .gz file.
    store_empty: bool
        If `False` do not add meta data with empty values.
    first_only: bool
        If `False` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with sections names separated by
        '.' to make them unique.

    Returns
    -------
    data: dict
        Nested dictionary with key-value pairs of the file header.

    Raises
    ------
    IOError/FileNotFoundError:
        If `filepath` cannot be opened.
    """
    # read in the initial '#' comment lines of the file:
    lines = []
    if os.path.isfile(filepath + '.gz'):
        filepath += '.gz'
    if filepath[-3:] == '.gz':
        # BUGFIX: mode must be 'rt' - gzip.open() in binary mode ('r')
        # does not accept an encoding argument and raises a ValueError.
        with gzip.open(filepath, 'rt', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    else:
        with open(filepath, 'r', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    # parse:
    data = {}
    cdatas = [data]       # stack of nested dictionaries
    sections = ['']       # stack of section names
    ident_offs = None     # indentation of the top level
    ident = None          # indentation increment per section level
    for line in lines:
        words = line.split(':')
        value = ':'.join(words[1:]).strip() if len(words) > 1 else ''
        if len(words) >= 1:
            key = words[0].strip('#')
            # get section level from indentation:
            level = 0
            if not flat or len(value) == 0:
                nident = len(key) - len(key.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
            # close sections:
            if not flat:
                while len(cdatas) > level + 1:
                    cdatas[-1][sections.pop()] = cdatas.pop()
            else:
                while len(sections) > level + 1:
                    sections.pop()
            # key:
            key = key.strip().strip('"')
            if lower_keys:
                key = key.lower()
            skey = key
            if add_sections:
                key = '.'.join(sections[1:] + [key])
            if len(value) == 0:
                # new sub-section:
                if flat:
                    if store_empty:
                        cdatas[-1][key] = None
                else:
                    cdatas.append({})
                    sections.append(skey)
            else:
                # key-value pair:
                value = value.strip('"')
                # BUGFIX: the original used 'or' between the first two
                # conditions, which made the test always true and stored
                # '-' placeholder values even without store_empty:
                if len(value) > 0 and value != '-' or store_empty:
                    if len(value) > 0 and value[0] == '[' and value[-1] == ']':
                        # list of values:
                        value = [v.strip() for v in value.lstrip('[').rstrip(']').split(',')]
                        if first_only:
                            value = value[0]
                    cdatas[-1][key] = value
    # close all remaining open sections:
    while len(cdatas) > 1:
        cdatas[-1][sections.pop()] = cdatas.pop()
    return data
def check_relacs(file_path):
    """Check for valid relacs file.

    Parameters
    ----------
    file_path: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    is_relacs: boolean
        `True` if `file_path` is a valid relacs directory or is a file therein.
    """
    # directory that should contain the relacs files:
    data_dir = file_path if os.path.isdir(file_path) else os.path.dirname(file_path)
    # a relacs data set needs a stimuli.dat and a trace-1.raw file,
    # each either plain or gzipped:
    has_stimuli = any(os.path.isfile(os.path.join(data_dir, name))
                      for name in ('stimuli.dat', 'stimuli.dat.gz'))
    has_trace = any(os.path.isfile(os.path.join(data_dir, name))
                    for name in ('trace-1.raw', 'trace-1.raw.gz'))
    return has_stimuli and has_trace
def relacs_trace_files(file_path):
    """Expand file path for relacs data to appropriate trace*.raw file names.

    Parameters
    ----------
    file_path: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    trace_file_paths: list of str
        List of relacs trace*.raw files.
    """
    data_dir = file_path
    if not os.path.isdir(data_dir):
        data_dir = os.path.dirname(file_path)
    paths = []
    # collect consecutively numbered trace files, plain or gzipped,
    # and stop at the first gap:
    for n in range(1, 10001):
        base = os.path.join(data_dir, f'trace-{n}.raw')
        if os.path.isfile(base):
            paths.append(base)
        elif os.path.isfile(base + '.gz'):
            paths.append(base + '.gz')
        else:
            break
    return paths
def load_relacs(file_path, amax=1.0):
    """Load traces that have been recorded with relacs (https://github.com/relacs/relacs).

    Parameters
    ----------
    file_path: str
        Path to a relacs data directory, or a file in a relacs data directory.
    amax: float
        The amplitude range of the data.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        - Invalid name for relacs trace-*.raw file.
        - Sampling rates of traces differ.
        - Unit of traces differ.
    """
    trace_file_paths = relacs_trace_files(file_path)
    # load trace*.raw files:
    # BUGFIX: relacs_trace_files() already returns the traces in numeric
    # order (trace-1, trace-2, ..., trace-10, ...).  Sorting the paths
    # lexicographically (as was done before) would place trace-10 before
    # trace-2 and assign the data of recordings with ten or more traces
    # to the wrong channels.
    nchannels = len(trace_file_paths)
    data = None
    nrows = 0
    rate = None
    unit = ''
    for c, path in enumerate(trace_file_paths):
        if path[-3:] == '.gz':
            with gzip.open(path, 'rb') as sf:
                x = np.frombuffer(sf.read(), dtype=np.float32)
        else:
            x = np.fromfile(path, np.float32)
        if data is None:
            # first trace defines the number of frames:
            nrows = len(x)
            data = np.zeros((nrows, nchannels))
        n = min(len(x), nrows)
        data[:n,c] = x[:n]
        # retrieve sampling rate and unit:
        crate, us = relacs_samplerate_unit(path, c)
        if rate is None:
            rate = crate
        elif crate != rate:
            raise ValueError('sampling rates of traces differ')
        if len(unit) == 0:
            unit = us
        elif us != unit:
            raise ValueError('unit of traces differ')
    return data, rate, unit, amax
def metadata_relacs(file_path, store_empty=False, first_only=False,
                    lower_keys=False, flat=False, add_sections=False):
    """ Read meta-data of a relacs data set.

    Parameters
    ----------
    file_path: str
        A relacs data directory or a file therein.
    store_empty: bool
        If `False` do not add meta data with empty values.
    first_only: bool
        If `False` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with sections names separated by
        '.' to make them unique.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    relacs_dir = file_path
    if not os.path.isdir(file_path):
        relacs_dir = os.path.dirname(file_path)
    info_path = os.path.join(relacs_dir, 'info.dat')
    if not os.path.exists(info_path):
        # BUGFIX: always return a plain dictionary - the previous
        # `return dict(), []` returned a tuple here, inconsistent with
        # the documented return value and the success path below.
        return {}
    data = relacs_header(info_path, store_empty, first_only,
                         lower_keys, flat, add_sections)
    return data
def fishgrid_spacings(metadata, unit='m'):
    """Spacing between grid electrodes.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.
    unit: str
        Unit in which to return the spacings.

    Returns
    -------
    grid_dist: list of tuple of float
        For each grid the distances between rows and columns in `unit`.
    """
    distances = []
    # fishgrid supports up to four grids:
    for grid in range(1, 5):
        row_dist = get_number(metadata, unit, f'RowDistance{grid}', default=0)
        col_dist = get_number(metadata, unit, f'ColumnDistance{grid}', default=0)
        nrows = get_int(metadata, f'Rows{grid}', default=0)
        ncols = get_int(metadata, f'Columns{grid}', default=0)
        used = get_bool(metadata, f'Used{grid}', default=False)
        # report a grid if it is marked used or has a non-empty layout:
        if used or (ncols > 0 and nrows > 0):
            distances.append((row_dist, col_dist))
    return distances
def fishgrid_grids(metadata):
    """Retrieve grid sizes from a fishgrid.cfg file.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.

    Returns
    -------
    grids: list of tuple of int
        For each grid the number of rows and columns.
    """
    grids = []
    # fishgrid supports up to four grids:
    for grid in range(1, 5):
        nrows = get_int(metadata, f'Rows{grid}', default=0)
        ncols = get_int(metadata, f'Columns{grid}', default=0)
        used = get_bool(metadata, f'Used{grid}', default=False)
        # report a grid if it is marked used or has a non-empty layout:
        if used or (ncols > 0 and nrows > 0):
            grids.append((nrows, ncols))
    return grids
def check_fishgrid(file_path):
    """Check for valid fishgrid file (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    file_path: str
        Path to a fishgrid data directory or a file in a fishgrid
        data directory.

    Returns
    -------
    is_fishgrid: bool
        `True` if `file_path` is a valid fishgrid data directory or
        a file therein.
    """
    data_dir = file_path
    if not os.path.isdir(data_dir):
        data_dir = os.path.dirname(file_path)
    # a fishgrid data set needs the configuration file...
    if not os.path.isfile(os.path.join(data_dir, 'fishgrid.cfg')):
        return False
    # ...plus at least one of the raw trace files:
    return (os.path.isfile(os.path.join(data_dir, 'traces-grid1.raw'))
            or os.path.isfile(os.path.join(data_dir, 'traces.raw')))
def fishgrid_trace_files(file_path):
    """Expand file paths for fishgrid data to appropriate traces*.raw file names.

    Parameters
    ----------
    file_path: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    trace_file_paths: list of str
        List of fishgrid traces*.raw files.
    """
    data_dir = file_path
    if not os.path.isdir(data_dir):
        data_dir = os.path.dirname(file_path)
    # collect consecutively numbered per-grid trace files:
    paths = []
    for grid in range(1, 10001):
        fname = os.path.join(data_dir, f'traces-grid{grid}.raw')
        if not os.path.isfile(fname):
            break
        paths.append(fname)
    # fall back to a single traces.raw file:
    if not paths:
        fname = os.path.join(data_dir, 'traces.raw')
        if os.path.isfile(fname):
            paths.append(fname)
    return paths
def load_fishgrid(file_path):
    """Load traces that have been recorded with fishgrid (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    file_path: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    FileNotFoundError:
        Invalid or not existing fishgrid files.
    """
    trace_file_paths = fishgrid_trace_files(file_path)
    if len(trace_file_paths) == 0:
        raise FileNotFoundError(f'no fishgrid files specified')
    md = metadata_fishgrid(file_path)
    # number of channels stored in each traces-grid*.raw file:
    grid_sizes = [nrows*ncols for nrows, ncols in fishgrid_grids(md)]
    grid_channels = grid_sizes[:len(trace_file_paths)]
    nchannels = sum(grid_channels)
    rate = get_number(md, 'Hz', 'AISampleRate')
    # load traces-grid*.raw files side by side into one array:
    data = None
    nrows = 0
    offs = 0
    for path, channels in zip(trace_file_paths, grid_channels):
        x = np.fromfile(path, np.float32).reshape((-1, channels))
        if data is None:
            # first grid defines the number of frames:
            nrows = len(x)
            data = np.zeros((nrows, nchannels))
        n = min(len(x), nrows)
        data[:n, offs:offs + channels] = x[:n, :]
        offs += channels
    amax, unit = get_number_unit(md, 'AIMaxVolt')
    return data, rate, unit, amax
# add fishgrid keys:
# Extend audioio's default metadata search keys so that its generic
# helpers also find the recording start time and the input gain under
# the key names used by fishgrid configuration files.
default_starttime_keys.append(['StartDate', 'StartTime'])
default_gain_keys.insert(0, 'AIMaxVolt')
def metadata_fishgrid(file_path):
    """ Read meta-data of a fishgrid data set.

    Parameters
    ----------
    file_path: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    fishgrid_dir = file_path
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(file_path)
    path = os.path.join(fishgrid_dir, 'fishgrid.cfg')
    # read in header from file:
    lines = []
    if os.path.isfile(path + '.gz'):
        path += '.gz'
    if not os.path.exists(path):
        return {}
    if path[-3:] == '.gz':
        # BUGFIX: mode must be 'rt' - gzip.open() in binary mode ('r')
        # does not accept an encoding argument and raises a ValueError.
        with gzip.open(path, 'rt', encoding='latin-1') as sf:
            for line in sf:
                lines.append(line)
    else:
        with open(path, 'r', encoding='latin-1') as sf:
            for line in sf:
                lines.append(line)
    # parse:
    data = {}
    cdatas = [data]       # stack of nested dictionaries
    ident_offs = None     # indentation of the top level
    ident = None          # indentation increment per section level
    old_style = False     # '----' headers indicate the old file format
    grid_n = False        # grid number to append to keys in old format
    for line in lines:
        if len(line.strip()) == 0:
            continue
        if line[0] == '*':
            # top-level section:
            key = line[1:].strip()
            data[key] = {}
            cdatas = [data, data[key]]
        elif '----' in line:
            # old-style second-level section:
            old_style = True
            key = line.strip().strip(' -').replace('&', '')
            if key.upper() == 'SETUP':
                key = 'Grid 1'
            grid_n = False
            if key[:4].lower() == 'grid':
                # NOTE: assumes section names of the form 'Grid N' - TODO confirm
                grid_n = key[5]
            cdatas = cdatas[:2]
            cdatas[1][key] = {}
            cdatas.append(cdatas[1][key])
        else:
            words = line.split(':')
            key = words[0].strip().strip('"')
            value = None
            if len(words) > 1 and (len(words[1].strip()) > 0 or old_style):
                value = ':'.join(words[1:]).strip().strip('"')
            if old_style:
                if value is None:
                    # third-level section:
                    cdatas = cdatas[:3]
                    cdatas[2][key] = {}
                    cdatas.append(cdatas[2][key])
                else:
                    # make keys of old-style grids unique:
                    if grid_n and key[-1] != grid_n:
                        key = key + grid_n
                    cdatas[-1][key] = value
            else:
                # get section level from indentation:
                level = 0
                nident = len(line) - len(line.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
                # close sections:
                cdatas = cdatas[:2 + level]
                if value is None:
                    # new section:
                    cdatas[-1][key] = {}
                    cdatas.append(cdatas[-1][key])
                else:
                    # key-value pair:
                    cdatas[-1][key] = value.replace(r'\n', '\n')
    # remove unused grids:
    fgm = data.get('FishGrid', {})
    for i in range(4):
        gs = f'Grid {i+1}'
        if gs in fgm:
            gm = fgm[gs]
            us = f'Used{i+1}'
            if us in gm and gm[us].upper() == 'FALSE':
                del fgm[gs]
    return data
def markers_fishgrid(file_path):
    """ Read markers of a fishgrid data set.

    Parameters
    ----------
    file_path: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    def store(marker):
        # indices in timestamps.dat count samples over all channels,
        # convert them to frames:
        if 'index1' in marker:
            pos = int(marker['index1'])//nchannels
        else:
            pos = int(marker['index'])//nchannels
        span = int(marker.get('span1', 0))//nchannels
        locs.append([pos, span])
        labels.append([marker.get('label', 'M'), marker.get('comment', '')])

    data_dir = file_path
    if not os.path.isdir(data_dir):
        data_dir = os.path.dirname(file_path)
    path = os.path.join(data_dir, 'timestamps.dat')
    if not os.path.isfile(path):
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
    # get number of channels of the first grid:
    md = metadata_fishgrid(path.replace('timestamps.dat', 'fishgrid.cfg'))
    grids = fishgrid_grids(md)
    nchannels = np.prod(grids[0])
    # read timestamps:
    locs = []
    labels = []
    marker = {}
    with open(path, 'r') as sf:
        for line in sf:
            if len(line.strip()) == 0:
                # a blank line terminates a marker entry:
                store(marker)
                marker = {}
            else:
                words = line.split(':')
                if len(words) > 1:
                    marker[words[0].strip().lower()] = words[1].strip().strip('"')
    if len(marker) > 0:
        store(marker)
    # the first and last timestamps mark begin and end of the recording
    # and are not reported:
    if len(locs) > 2:
        return np.array(locs[1:-1]), np.array(labels[1:-1])
    return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
def check_container(filepath):
    """Check if file is a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_container: bool
        `True`, if `filepath` is a supported container format.
    """
    # decide by file extension only (case insensitive):
    return os.path.splitext(filepath)[1].lower() in ('.pkl', '.npz', '.mat')
def extract_container_data(data_dict, datakey=None,
                           samplekey=['rate', 'Fs', 'fs'],
                           timekey=['time'], amplkey=['amax'], unitkey='unit',
                           amax=1.0, unit='a.u.'):
    """Extract data from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is an 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str or list of str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
    amax: None or float
        If specified and no amplitude range has been found in `data_dict`,
        then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in `data_dict`,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range in `unit`.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    # normalize key arguments to sequences:
    if not isinstance(samplekey, (list, tuple, np.ndarray)):
        samplekey = (samplekey,)
    if not isinstance(timekey, (list, tuple, np.ndarray)):
        timekey = (timekey,)
    if not isinstance(amplkey, (list, tuple, np.ndarray)):
        amplkey = (amplkey,)
    # sampling rate:
    rate = 0.0
    for skey in samplekey:
        if skey in data_dict:
            rate = float(data_dict[skey])
            break
    if rate == 0.0:
        # fall back to the first two sampling times:
        for tkey in timekey:
            if tkey in data_dict:
                rate = 1.0/(data_dict[tkey][1] - data_dict[tkey][0])
                break
    if rate == 0.0:
        raise ValueError(f"invalid keys {', '.join(samplekey)} and {', '.join(timekey)} for requesting sampling rate or sampling times")
    # amplitude range and unit:
    for akey in amplkey:
        if akey in data_dict:
            amax = float(data_dict[akey])
            break
    if unitkey in data_dict:
        unit = data_dict[unitkey]
    # get data array:
    raw_data = np.array([])
    if datakey:
        # try data keys:
        if not isinstance(datakey, (list, tuple, np.ndarray)):
            datakey = (datakey,)
        for dkey in datakey:
            if dkey in data_dict:
                raw_data = data_dict[dkey]
                break
        if len(raw_data) == 0:
            raise ValueError(f"invalid key(s) {', '.join(datakey)} for requesting data")
    else:
        # find the 1D or 2D array with the largest number of elements:
        # BUGFIX: compare the total element count (size), not the largest
        # dimension - otherwise an (N, 2) data array ties with an N-long
        # time vector and the wrong variable can be picked.
        for d in data_dict:
            if hasattr(data_dict[d], 'shape'):
                if 1 <= len(data_dict[d].shape) <= 2 and \
                   data_dict[d].size > raw_data.size:
                    raw_data = data_dict[d]
    if len(raw_data) == 0:
        raise ValueError('no data found')
    # make 2D:
    if len(raw_data.shape) == 1:
        raw_data = raw_data.reshape(-1, 1)
    # transpose if necessary, first dimension should be time:
    if np.argmax(raw_data.shape) > 0:
        raw_data = raw_data.T
    # recode integer data to floats scaled to the amplitude range:
    if raw_data.dtype == np.dtype('int16'):
        data = raw_data.astype('float32')
        data *= amax/2**15
    elif raw_data.dtype == np.dtype('int32'):
        data = raw_data.astype(float)
        data *= amax/2**31
    elif raw_data.dtype == np.dtype('int64'):
        data = raw_data.astype(float)
        data *= amax/2**63
    else:
        data = raw_data
    return data, rate, unit, amax
def load_container(file_path, datakey=None,
                   samplekey=['rate', 'Fs', 'fs'],
                   timekey=['time'], amplkey=['amax'], unitkey='unit',
                   amax=1.0, unit='a.u.'):
    """Load data from a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    file_path: str
        Path of the file to load.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is an 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
        If `unitkey` is not a valid key, then return `unitkey` as the `unit`.
    amax: None or float
        If specified and no amplitude range has been found in the data
        container, then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in the data container,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    # dispatch on the file extension to load all container variables:
    data_dict = {}
    ext = os.path.splitext(file_path)[1]
    if ext == '.pkl':
        import pickle
        with open(file_path, 'rb') as sf:
            data_dict = pickle.load(sf)
    elif ext == '.npz':
        data_dict = np.load(file_path)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(file_path, squeeze_me=True)
    return extract_container_data(data_dict, datakey, samplekey,
                                  timekey, amplkey, unitkey, amax, unit)
def extract_container_metadata(data_dict, metadatakey=['metadata', 'info']):
    """ Extract metadata from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    if not isinstance(metadatakey, (list, tuple, np.ndarray)):
        metadatakey = (metadatakey,)
    # a directly present key holds the complete metadata dictionary:
    for mkey in metadatakey:
        if mkey in data_dict:
            return data_dict[mkey]
    # otherwise collect all flattened entries '<mkey>__section__key':
    metadata = {}
    for mkey in metadatakey:
        prefix = mkey + '__'
        for dkey in data_dict:
            if dkey[:len(prefix)] == prefix:
                v = data_dict[dkey]
                # unwrap 0-dimensional arrays to scalars:
                if hasattr(v, 'size') and v.ndim == 0:
                    v = v.item()
                metadata[dkey[len(prefix):]] = v
    if len(metadata) > 0:
        return unflatten_metadata(metadata, sep='__')
    return metadata
def metadata_container(file_path, metadatakey=['metadata', 'info']):
    """ Read meta-data of a container file.

    Parameters
    ----------
    file_path: str
        A container file.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    # dispatch on the file extension to load all container variables:
    data_dict = {}
    ext = os.path.splitext(file_path)[1]
    if ext == '.pkl':
        import pickle
        with open(file_path, 'rb') as sf:
            data_dict = pickle.load(sf)
    elif ext == '.npz':
        data_dict = np.load(file_path)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(file_path, squeeze_me=True)
    return extract_container_metadata(data_dict, metadatakey)
def extract_container_markers(data_dict, poskey=['positions'],
                              spanskey=['spans'], labelskey=['labels'],
                              descrkey=['descriptions']):
    """ Extract markers from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    def as_keys(key):
        # normalize a key argument to a sequence of keys:
        return key if isinstance(key, (list, tuple, np.ndarray)) else (key,)

    # positions define the number of markers:
    locs = np.zeros((0, 2), dtype=int)
    for key in as_keys(poskey):
        if key in data_dict:
            locs = np.zeros((len(data_dict[key]), 2), dtype=int)
            locs[:, 0] = data_dict[key]
            break
    for key in as_keys(spanskey):
        if key in data_dict:
            locs[:, 1] = data_dict[key]
            break
    # labels define the number of label rows:
    labels = np.zeros((0, 2), dtype=object)
    for key in as_keys(labelskey):
        if key in data_dict:
            labels = np.zeros((len(data_dict[key]), 2), dtype=object)
            labels[:, 0] = data_dict[key]
            break
    for key in as_keys(descrkey):
        if key in data_dict:
            labels[:, 1] = data_dict[key]
            break
    return locs, labels
def markers_container(file_path, poskey=['positions'],
                      spanskey=['spans'], labelskey=['labels'],
                      descrkey=['descriptions']):
    """ Read markers of a container file.

    Parameters
    ----------
    file_path: str
        A container file.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    # dispatch on the file extension to load all container variables:
    data_dict = {}
    ext = os.path.splitext(file_path)[1]
    if ext == '.pkl':
        import pickle
        with open(file_path, 'rb') as sf:
            data_dict = pickle.load(sf)
    elif ext == '.npz':
        data_dict = np.load(file_path)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(file_path, squeeze_me=True)
    return extract_container_markers(data_dict, poskey, spanskey,
                                     labelskey, descrkey)
def check_raw(filepath):
    """Check if file is a raw file.

    The following extensions are interpreted as raw files:

    - raw files (*.raw)
    - LabView scandata (*.scandat)
    - matlab files (*.mat)

    The check is case-insensitive and based on the file extension only.

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_raw: bool
        `True`, if `filepath` is a raw format.
    """
    _, ext = os.path.splitext(filepath)
    return ext.lower() in ('.raw', '.scandat', '.mat')
def load_raw(file_path, rate=44000, channels=1, dtype=np.float32,
             amax=1.0, unit='a.u.'):
    """Load data from a raw file.

    Raw files just contain the data and absolutely no metadata, not
    even the sampling rate, number of channels, etc.
    Supported file formats are:

    - raw files (*.raw)
    - LabView scandata (*.scandat)

    Parameters
    ----------
    file_path: str
        Path of the file to load.
    rate: float
        Sampling rate of the data in Hertz.
    channels: int
        Number of channels multiplexed in the data.
    dtype: str or numpy.dtype
        The data type stored in the file.
    amax: float
        The amplitude range of the data.
    unit: str
        The unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    """
    raw_data = np.fromfile(file_path, dtype=dtype).reshape(-1, channels)
    # map integer types to a float type and the scaling divisor that
    # brings full-scale integers to the amplitude range +-amax:
    recode = {np.dtype('int16'): ('float32', 2**15),
              np.dtype('int32'): (float, 2**31),
              np.dtype('int64'): (float, 2**63)}
    key = np.dtype(dtype)
    if key in recode:
        float_type, full_scale = recode[key]
        data = raw_data.astype(float_type)
        data *= amax/full_scale
    else:
        # already floating point, use as is:
        data = raw_data
    return data, rate, unit, amax
def load_audioio(file_path, verbose=0, gainkey=default_gain_keys, sep='.',
                 amax=1.0, unit='a.u.'):
    """Load data from an audio file.

    See the
    [`load_audio()`](https://bendalab.github.io/audioio/api/audioloader.html#audioio.audioloader.load_audio)
    function of the [`audioio`](https://github.com/bendalab/audioio)
    package for more infos.

    Parameters
    ----------
    file_path: str
        Path of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    gainkey: str or list of str
        Key in the file's metadata that holds some gain information.
        If found, the data will be multiplied with the gain,
        and if available, the corresponding unit is returned.
        See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
    sep: str
        String that separates section names in `gainkey`.
    amax: float
        If specified and no gain has been found in the metadata,
        then use this as the amplitude range.
    unit: str
        If specified and no gain has been found in the metadata,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data if found in the metadata (see `gainkey`),
        otherwise `unit`.
    amax: float
        Maximum amplitude of data range.
    """
    # retrieve gain factor and unit from the file's metadata:
    file_metadata = metadata_audioio(file_path)
    amax, unit = get_gain(file_metadata, gainkey, sep, amax, unit)
    # load and scale the data:
    data, rate = load_audio(file_path, verbose)
    if amax != 1.0:
        data *= amax
    return data, rate, unit, amax
data_loader_funcs = (
    ('relacs', check_relacs, load_relacs, metadata_relacs, None),
    ('fishgrid', check_fishgrid, load_fishgrid, metadata_fishgrid, markers_fishgrid),
    ('container', check_container, load_container, metadata_container, markers_container),
    ('raw', check_raw, load_raw, None, None),
    ('audioio', None, load_audioio, metadata_audioio, markers_audioio),
    )
"""List of implemented load functions.

Each element of the list is a tuple with the data format's name, its
check function (`None` matches any file), its load function, its
metadata function (`None` if the format stores no metadata), and its
markers function (`None` if the format stores no markers).

The list is tried in order; the final 'audioio' entry has no check
function and therefore serves as the fallback for all other files.
"""
def load_data(file_path, verbose=0, **kwargs):
    """Load time-series data from a file.

    Parameters
    ----------
    file_path: str
        Path and name of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.
        For example:
        - `amax`: the amplitude range of the data.
        - 'unit': the unit of the data.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        `file_path` is empty string.
    """
    if len(file_path) == 0:
        raise ValueError('input argument file_path is empty string.')
    # try the formats in order, the first matching one loads the data:
    for name, check_file, load_file, _, _ in data_loader_funcs:
        if check_file is not None and not check_file(file_path):
            continue
        data, rate, unit, amax = load_file(file_path, **kwargs)
        if verbose > 0:
            print(f'loaded {name} data from file "{file_path}"')
            if verbose > 1:
                print(f'  sampling rate: {rate:g} Hz')
                print(f'  channels     : {data.shape[1]}')
                print(f'  frames       : {len(data)}')
                print(f'  range        : {amax:g}{unit}')
        return data, rate, unit, amax
    # no format matched:
    return np.zeros((0, 1)), 0.0, '', 1.0
def metadata(file_path, **kwargs):
    """ Read meta-data from a data file.

    Parameters
    ----------
    file_path: str
        The full path and name of the file to load. For some file
        formats several files can be provided in a list.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.

    Returns
    -------
    meta_data: nested dict
        Meta data contained in the file. Keys of the nested
        dictionaries are always strings. If the corresponding
        values are dictionaries, then the key is the section name
        of the metadata contained in the dictionary. All other
        types of values are values for the respective key. In
        particular they are strings, or list of strings. But other
        simple types like ints or floats are also allowed.

    Raises
    ------
    ValueError:
        `file_path` is empty string.
    """
    if len(file_path) == 0:
        raise ValueError('input argument file_path is empty string.')
    # take the first matching format that provides a metadata function;
    # formats that match but have no metadata function are skipped, so
    # later formats get a chance:
    for _, check_file, _, metadata_file, _ in data_loader_funcs:
        if check_file is not None and not check_file(file_path):
            continue
        if metadata_file is not None:
            return metadata_file(file_path, **kwargs)
    return {}
def markers(file_path):
    """ Read markers of a data file.

    Parameters
    ----------
    file_path: str or file handle
        The data file.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).

    Raises
    ------
    ValueError:
        `file_path` is empty string.
    """
    if len(file_path) == 0:
        raise ValueError('input argument file_path is empty string.')
    # take the first matching format that provides a markers function;
    # formats that match but have no markers function are skipped, so
    # later formats get a chance:
    for _, check_file, _, _, markers_file in data_loader_funcs:
        if check_file is not None and not check_file(file_path):
            continue
        if markers_file is not None:
            return markers_file(file_path)
    return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
class DataLoader(AudioLoader):
    """Buffered reading of time-series data for random access of the data in the file.

    This allows for reading very large data files that do not fit into
    memory. A `DataLoader` instance can be used like a huge
    read-only numpy array, i.e.
    ```
    data = DataLoader('path/to/data/file.dat')
    x = data[10000:20000,0]
    ```
    The first index specifies the frame, the second one the channel.

    `DataLoader` first determines the format of the data file and then
    opens the file (first line). It then reads data from the file as
    necessary for the requested data (second line).

    Supported file formats are

    - audio files via `audioio` package
    - python pickle files
    - numpy .npz files
    - matlab .mat files
    - relacs trace*.raw files (www.relacs.net)
    - fishgrid traces-*.raw files

    Reading sequentially through the file is always possible. If
    previous data are requested, then the file is read from the
    beginning. This might slow down access to previous data
    considerably. Use the `backsize` argument to the open functions to
    make sure some data are loaded before the requested frame. Then a
    subsequent access to the data within `backsize` seconds before that
    frame can still be handled without the need to reread the file
    from the beginning.

    Usage:
    ------
    ```
    import thunderlab.dataloader as dl
    with dl.DataLoader(file_path, 60.0, 10.0) as data:
        # do something with the content of the file:
        x = data[0:10000,0]
        y = data[10000:20000,0]
        z = x + y
    ```

    Normal open and close:
    ```
    data = dl.DataLoader(file_path, 60.0)
    x = data[:,:] # read the whole file
    data.close()
    ```
    that is the same as:
    ```
    data = dl.DataLoader()
    data.open(file_path, 60.0)
    ```

    Parameters
    ----------
    file_path: str
        Name of the file.
    buffersize: float
        Size of internal buffer in seconds.
    backsize: float
        Part of the buffer to be loaded before the requested start index in seconds.
    verbose: int
        If larger than zero show detailed error/warning messages.
    meta_kwargs: dict
        Keyword arguments that are passed on to the _load_metadata() function.

    Attributes
    ----------
    rate: float
        The sampling rate of the data in Hertz.
    channels: int
        The number of channels that are read in.
    frames: int
        The number of frames in the file.
    format: str or None
        Format of the audio file.
    encoding: str or None
        Encoding/subtype of the audio file.
    shape: tuple
        Number of frames and channels of the data.
    ndim: int
        Number of dimensions: always 2 (frames and channels).
    unit: str
        Unit of the data.
    ampl_min: float
        Minimum amplitude the file format supports.
    ampl_max: float
        Maximum amplitude the file format supports.

    Methods
    -------

    - `len()`: the number of frames
    - `open()`: open a data file.
    - `open_*()`: open a data file of a specific format.
    - `close()`: close the file.
    - `metadata()`: metadata of the file.
    - `markers()`: markers of the file.
    - `set_unwrap()`: Set parameters for unwrapping clipped data.

    """

    def __init__(self, file_path=None, buffersize=10.0, backsize=0.0,
                 verbose=0, **meta_kwargs):
        super().__init__(None, buffersize, backsize,
                         verbose, **meta_kwargs)
        if file_path is not None:
            self.open(file_path, buffersize, backsize, verbose, **meta_kwargs)

    def __getitem__(self, key):
        return super(DataLoader, self).__getitem__(key)

    def __next__(self):
        return super(DataLoader, self).__next__()

    # relacs interface:
    def open_relacs(self, file_path, buffersize=10.0, backsize=0.0,
                    verbose=0, amax=1.0):
        """Open relacs data files (www.relacs.net) for reading.

        Parameters
        ----------
        file_path: str
            Path to a relacs data directory or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        amax: float
            The amplitude range of the data.

        Raises
        ------
        ValueError: .gz files not supported.
        """
        self.verbose = verbose

        if self.sf is not None:
            self._close_relacs()

        trace_file_paths = relacs_trace_files(file_path)

        # open trace files:
        self.sf = []
        self.frames = None
        self.rate = None
        self.unit = ''
        self.filepath = None
        if len(trace_file_paths) > 0:
            self.filepath = os.path.dirname(trace_file_paths[0])
        for path in sorted(trace_file_paths):
            if path[-3:] == '.gz':
                raise ValueError('.gz files not supported')
            sf = open(path, 'rb')
            self.sf.append(sf)
            if verbose > 0:
                print(f'open_relacs(file_path) with file_path={path}')
            # file size (traces are stored as 4-byte floats):
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                # traces may differ by a few frames; use the smallest:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
            # retrieve sampling rate and unit:
            rate, us = relacs_samplerate_unit(path)
            if self.rate is None:
                self.rate = rate
            elif rate != self.rate:
                raise ValueError('sampling rates of traces differ')
            if len(self.unit) == 0:
                self.unit = us
            elif us != self.unit:
                raise ValueError('unit of traces differ')
        self.channels = len(self.sf)
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'RELACS'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        self.close = self._close_relacs
        self.load_audio_buffer = self._load_buffer_relacs
        self.ampl_min = -amax
        self.ampl_max = +amax
        self._load_metadata = self._metadata_relacs
        # TODO: load markers:
        self._locs = np.zeros((0, 2), dtype=int)
        self._labels = np.zeros((0, 2), dtype=object)
        self._load_markers = None
        return self

    def _close_relacs(self):
        """Close the relacs data files.
        """
        if self.sf is not None:
            for file in self.sf:
                file.close()
            self.sf = None

    def _load_buffer_relacs(self, r_offset, r_size, buffer):
        """Load new data from relacs data file.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        # each trace file holds one channel of 4-byte floats:
        for i, file in enumerate(self.sf):
            file.seek(r_offset*4)
            data = file.read(r_size*4)
            buffer[:, i] = np.frombuffer(data, dtype=np.float32)

    def _metadata_relacs(self, store_empty=False, first_only=False):
        """ Load meta-data of a relacs data set.
        """
        info_path = os.path.join(self.filepath, 'info.dat')
        if not os.path.exists(info_path):
            return {}
        return relacs_header(info_path, store_empty, first_only)

    # fishgrid interface:
    def open_fishgrid(self, file_path, buffersize=10.0, backsize=0.0,
                      verbose=0):
        """Open fishgrid data files (https://github.com/bendalab/fishgrid) for reading.

        Parameters
        ----------
        file_path: str
            Path to a fishgrid data directory, or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        """
        self.verbose = verbose

        if self.sf is not None:
            self._close_fishgrid()

        trace_file_paths = fishgrid_trace_files(file_path)
        self.filepath = None
        if len(trace_file_paths) > 0:
            self.filepath = os.path.dirname(trace_file_paths[0])
        self._load_metadata = metadata_fishgrid
        self._load_markers = markers_fishgrid

        # open grid files:
        grids = fishgrid_grids(self.metadata())
        grid_sizes = [r*c for r,c in grids]
        self.channels = 0
        for g, path in enumerate(trace_file_paths):
            self.channels += grid_sizes[g]
        self.sf = []
        self.grid_channels = []
        self.grid_offs = []
        offs = 0
        self.frames = None
        self.rate = get_number(self.metadata(), 'Hz', 'AISampleRate')
        v, self.unit = get_number_unit(self.metadata(), 'AIMaxVolt')
        if v is not None:
            self.ampl_min = -v
            self.ampl_max = +v

        for g, path in enumerate(trace_file_paths):
            sf = open(path, 'rb')
            self.sf.append(sf)
            if verbose > 0:
                print(f'open_fishgrid(file_path) with file_path={path}')
            # grid channels:
            self.grid_channels.append(grid_sizes[g])
            self.grid_offs.append(offs)
            offs += grid_sizes[g]
            # file size (channels are multiplexed as 4-byte floats):
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4//grid_sizes[g]
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                # grids may differ by a few frames; use the smallest:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'FISHGRID'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        self.close = self._close_fishgrid
        self.load_audio_buffer = self._load_buffer_fishgrid
        return self

    def _close_fishgrid(self):
        """Close the fishgrid data files.
        """
        if self.sf is not None:
            for file in self.sf:
                file.close()
            self.sf = None

    def _load_buffer_fishgrid(self, r_offset, r_size, buffer):
        """Load new data from fishgrid data files.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        # each grid file holds its channels interleaved as 4-byte floats:
        for file, gchannels, goffset in zip(self.sf, self.grid_channels, self.grid_offs):
            file.seek(r_offset*4*gchannels)
            data = file.read(r_size*4*gchannels)
            buffer[:, goffset:goffset+gchannels] = np.frombuffer(data, dtype=np.float32).reshape((-1, gchannels))

    # container interface:
    def open_container(self, file_path, buffersize=10.0,
                       backsize=0.0, verbose=0, datakey=None,
                       samplekey=['rate', 'Fs', 'fs'],
                       timekey=['time'], amplkey=['amax'], unitkey='unit',
                       metadatakey=['metadata', 'info'],
                       poskey=['positions'],
                       spanskey=['spans'], labelskey=['labels'],
                       descrkey=['descriptions'],
                       amax=1.0, unit='a.u.'):
        """Open generic container file.

        Supported file formats are:

        - python pickle files (.pkl)
        - numpy files (.npz)
        - matlab files (.mat)

        Parameters
        ----------
        file_path: str
            Path to a container file.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        datakey: None, str, or list of str
            Name of the variable holding the data. If `None` take the
            variable that is an 2D array and has the largest number of
            elements.
        samplekey: str or list of str
            Name of the variable holding the sampling rate.
        timekey: str or list of str
            Name of the variable holding sampling times.
            If no sampling rate is available, the sampling rate is retrieved
            from the sampling times.
        amplkey: str or list of str
            Name of the variable holding the amplitude range of the data.
        unitkey: str
            Name of the variable holding the unit of the data.
        metadatakey: str or list of str
            Name of the variable holding the metadata.
        poskey: str or list of str
            Name of the variable holding positions of markers.
        spanskey: str or list of str
            Name of the variable holding spans of markers.
        labelskey: str or list of str
            Name of the variable holding labels of markers.
        descrkey: str or list of str
            Name of the variable holding descriptions of markers.
        amax: None or float
            If specified and no amplitude range has been found in the data
            container, then this is the amplitude range of the data.
        unit: None or str
            If specified and no unit has been found in the data container,
            then return this as the unit of the data.

        Raises
        ------
        ValueError:
            Invalid key requested.
        """
        self.verbose = verbose
        data_dict = {}
        ext = os.path.splitext(file_path)[1]
        if ext == '.pkl':
            import pickle
            with open(file_path, 'rb') as f:
                data_dict = pickle.load(f)
            self.format = 'PKL'
        elif ext == '.npz':
            data_dict = np.load(file_path)
            self.format = 'NPZ'
        elif ext == '.mat':
            from scipy.io import loadmat
            data_dict = loadmat(file_path, squeeze_me=True)
            self.format = 'MAT'
        # container data are small enough to be kept in memory entirely,
        # the whole data set becomes the buffer:
        self.buffer, self.rate, self.unit, amax = \
            extract_container_data(data_dict, datakey, samplekey,
                                   timekey, amplkey, unitkey, amax, unit)
        self.filepath = file_path
        self.channels = self.buffer.shape[1]
        self.frames = self.buffer.shape[0]
        self.shape = self.buffer.shape
        self.ndim = self.buffer.ndim
        self.size = self.buffer.size
        self.encoding = self.numpy_encodings[self.buffer.dtype]
        self.ampl_min = -amax
        self.ampl_max = +amax
        self.offset = 0
        self.buffer_changed = np.zeros(self.channels, dtype=bool)
        self.bufferframes = self.frames
        # NOTE(review): this sets `backsize`, not `backframes` as the other
        # open_*() functions do — confirm which attribute AudioLoader uses.
        self.backsize = 0
        self.close = self._close_container
        self.load_audio_buffer = self._load_buffer_container
        self._metadata = extract_container_metadata(data_dict, metadatakey)
        self._load_metadata = None
        self._locs, self._labels = extract_container_markers(data_dict,
                                                             poskey,
                                                             spanskey,
                                                             labelskey,
                                                             descrkey)
        self._load_markers = None

    def _close_container(self):
        """Close container. """
        pass

    def _load_buffer_container(self, r_offset, r_size, buffer):
        """Load new data from container."""
        buffer[:, :] = self.buffer[r_offset:r_offset + r_size, :]

    # raw data interface:
    def open_raw(self, file_path, buffersize=10.0, backsize=0.0,
                 verbose=0, rate=44000, channels=1, dtype=np.float32,
                 amax=1.0, unit='a.u.'):
        """Load data from a raw file.

        Raw files just contain the data and absolutely no metadata, not
        even the sampling rate, number of channels, etc.
        Supported file formats are:

        - raw files (*.raw)
        - LabView scandata (*.scandat)

        Parameters
        ----------
        file_path: str
            Path of the file to load.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        rate: float
            Sampling rate of the data in Hertz.
        channels: int
            Number of channels multiplexed in the data.
        dtype: str or numpy.dtype
            The data type stored in the file.
        amax: float
            The amplitude range of the data.
        unit: str
            The unit of the data.
        """
        self.verbose = verbose
        self.filepath = file_path
        self.sf = open(file_path, 'rb')
        if verbose > 0:
            print(f'open_raw(file_path) with file_path={file_path}')
        self.dtype = np.dtype(dtype)
        self.rate = float(rate)
        self.channels = int(channels)
        # file size: channels are multiplexed, so the number of frames
        # is the total number of samples divided by the channel count
        # (consistent with load_raw(), which reshapes by channels):
        self.sf.seek(0, os.SEEK_END)
        self.frames = self.sf.tell()//self.dtype.itemsize//self.channels
        self.sf.seek(0)
        self.shape = (self.frames, self.channels)
        self.ndim = len(self.shape)
        self.size = self.frames*self.channels
        self.format = 'RAW'
        self.encoding = self.numpy_encodings.get(self.dtype, 'UNKNOWN')
        self.unit = unit
        self.ampl_max = float(amax)
        self.ampl_min = -self.ampl_max
        self.offset = 0
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.close = self._close_raw
        self.load_audio_buffer = self._load_buffer_raw
        self._metadata = None
        self._load_metadata = None
        self._locs = None
        self._labels = None
        self._load_markers = None

    def _close_raw(self):
        """Close raw file. """
        self.sf.close()
        self.sf = None

    def _load_buffer_raw(self, r_offset, r_size, buffer):
        """Load new data from raw file."""
        # offsets and sizes are in frames; one frame holds
        # `channels` interleaved samples:
        framesize = self.dtype.itemsize*self.channels
        self.sf.seek(r_offset*framesize)
        raw_data = self.sf.read(r_size*framesize)
        raw_data = np.frombuffer(raw_data, dtype=self.dtype)
        raw_data = raw_data.reshape(-1, self.channels)
        # recode integer types to floats in the range +-ampl_max:
        if self.dtype == np.dtype('int16'):
            data = raw_data.astype('float32')
            data *= self.ampl_max/2**15
        elif self.dtype == np.dtype('int32'):
            data = raw_data.astype(float)
            data *= self.ampl_max/2**31
        elif self.dtype == np.dtype('int64'):
            data = raw_data.astype(float)
            data *= self.ampl_max/2**63
        else:
            data = raw_data
        buffer[:, :] = data

    # audioio interface:
    def open_audioio(self, file_path, buffersize=10.0, backsize=0.0,
                     verbose=0, gainkey=default_gain_keys, sep='.',
                     amax=None, unit='a.u.'):
        """Open an audio file.

        See the [audioio](https://github.com/bendalab/audioio) package
        for details.

        Parameters
        ----------
        file_path: str
            Path to an audio file.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index
            in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        gainkey: str or list of str
            Key in the file's metadata that holds some gain information.
            If found, the data will be multiplied with the gain,
            and if available, the corresponding unit is returned.
            See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
        sep: str
            String that separates section names in `gainkey`.
        amax: None or float
            If specified and no gain has been found in the metadata,
            then use this as the amplitude range.
        unit: None or str
            If specified and no gain has been found in the metadata,
            then this is the unit of the data.

        """
        self.verbose = verbose
        super(DataLoader, self).open(file_path, buffersize, backsize, verbose)
        md = self.metadata()
        fac, unit = get_gain(md, gainkey, sep, amax, unit)
        if fac is None:
            self.gain_fac = 1.0
        else:
            self.gain_fac = fac
        # wrap the inherited buffer loader so that loaded data get scaled:
        self._load_buffer_audio_org = self.load_audio_buffer
        self.load_audio_buffer = self._load_buffer_audioio
        self.ampl_min *= self.gain_fac
        self.ampl_max *= self.gain_fac
        self.unit = unit
        return self

    def _load_buffer_audioio(self, r_offset, r_size, buffer):
        """Load and scale new data from an audio file.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        self._load_buffer_audio_org(r_offset, r_size, buffer)
        buffer *= self.gain_fac

    def open(self, file_path, buffersize=10.0, backsize=0.0,
             verbose=0, **kwargs):
        """Open file with time-series data for reading.

        Parameters
        ----------
        file_path: str or list of str
            Path to a data files or directory.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index
            in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        **kwargs: dict
            Further keyword arguments that are passed on to the
            format specific opening functions.
            For example:
            - `amax`: the amplitude range of the data.
            - 'unit': the unit of the data.

        Raises
        ------
        ValueError:
            `file_path` is empty string.
        """
        # list of implemented open functions:
        data_open_funcs = (
            ('relacs', check_relacs, self.open_relacs, 1),
            ('fishgrid', check_fishgrid, self.open_fishgrid, 1),
            ('container', check_container, self.open_container, 1),
            ('raw', check_raw, self.open_raw, 1),
            ('audioio', None, self.open_audioio, 0),
            )
        if len(file_path) == 0:
            raise ValueError('input argument file_path is empty string.')
        # open data:
        for name, check_file, open_file, v in data_open_funcs:
            if check_file is None or check_file(file_path):
                open_file(file_path, buffersize, backsize, verbose, **kwargs)
                if v*verbose > 1:
                    if self.format is not None:
                        print(f'  format       : {self.format}')
                    if self.encoding is not None:
                        print(f'  encoding     : {self.encoding}')
                    print(f'  sampling rate: {self.rate} Hz')
                    print(f'  channels     : {self.channels}')
                    print(f'  frames       : {self.frames}')
                    print(f'  range        : {self.ampl_max:g}{self.unit}')
                break
        return self
def demo(file_path, plot=False):
    """Demonstrate the dataloader module on a data file.

    First loads the whole file with `load_data()`, then reads it
    chunk-wise forwards and backwards with `DataLoader`.

    Parameters
    ----------
    file_path: str
        Path of the data file to load.
    plot: bool
        If True, plot the loaded data with matplotlib.
    """
    print("try load_data:")
    data, rate, unit, amax = load_data(file_path, verbose=2)
    if plot:
        fig, ax = plt.subplots()
        time = np.arange(len(data))/rate
        for c in range(data.shape[1]):
            ax.plot(time, data[:,c])
        ax.set_xlabel('Time [s]')
        ax.set_ylabel(f'[{unit}]')
        if amax is not None and np.isfinite(amax):
            ax.set_ylim(-amax, +amax)
        plt.show()
        # NOTE(review): returning here skips the DataLoader demo below
        # whenever plotting is requested, which also makes the `if plot:`
        # branches in the loops below unreachable — confirm intended.
        return
    print('')
    print("try DataLoader:")
    with DataLoader(file_path, 2.0, 1.0, 1) as data:
        print('sampling rate: %g' % data.rate)
        print('frames       : %d %d' % (len(data), data.shape[0]))
        nframes = int(1.0 * data.rate)
        # forward:
        for i in range(0, len(data), nframes):
            print('forward %d-%d' % (i, i + nframes))
            x = data[i:i + nframes, 0]
            if plot:
                fig, ax = plt.subplots()
                ax.plot((i + np.arange(len(x)))/data.rate, x)
                ax.set_xlabel('Time [s]')
                ax.set_ylabel(f'[{data.unit}]')
                plt.show()
        # and backwards:
        for i in reversed(range(0, len(data), nframes)):
            print('backward %d-%d' % (i, i + nframes))
            x = data[i:i + nframes, 0]
            if plot:
                fig, ax = plt.subplots()
                ax.plot((i + np.arange(len(x)))/data.rate, x)
                ax.set_xlabel('Time [s]')
                ax.set_ylabel(f'[{data.unit}]')
                plt.show()
def main(*cargs):
    """Call demo with command line arguments.

    Parameters
    ----------
    cargs: list of str
        Command line arguments as provided by sys.argv[1:]
    """
    import argparse
    ap = argparse.ArgumentParser(description=
                                 'Checking thunderlab.dataloader module.')
    ap.add_argument('-p', dest='plot', action='store_true',
                    help='plot loaded data')
    ap.add_argument('file', nargs=1, default='', type=str,
                    help='name of data file')
    opts = ap.parse_args(cargs)
    demo(opts.file[0], opts.plot)
# run the demo when executed as a script:
if __name__ == "__main__":
    main(*sys.argv[1:])