
1"""Load time-series data from files. 

2 

3``` 

4data, rate, unit, amax = load_data('data/file.wav') 

5``` 

6 

7The function `data_loader()` loads the whole time-series from the file 

8as a numpy array of floats. First dimension is frames, second is 

9channels. In contrast to the `audioio.load_audio()` function, the 

10values of the data array are not restricted between -1 and 1. They can 

11assume any value wihin the range `-amax` to `+amax` with the returned 

12`unit`. 

13 

14``` 

15data = DataLoader('data/file.wav', 60.0) 

16``` 

17or 

18``` 

19with DataLoader('data/file.wav', 60.0) as data: 

20``` 

21Create an `DataLoader` object that loads chuncks of 60 seconds long data 

22on demand. `data` can be used like a read-only numpy array of floats. 

23 

24 

25## Supported file formats 

26 

27- python pickle files 

28- numpy .npz files 

29- matlab .mat files 

30- audio files via [`audioio`](https://github.com/bendalab/audioio) package 

31- LabView .scandat files 

32- relacs trace*.raw files (https://www.relacs.net) 

33- fishgrid traces-*.raw files (https://github.com/bendalab/fishgrid) 

34 

35 

36## Metadata 

37 

38Many file formats allow to store metadata that further describe the 

39stored time series data. We handle them as nested dictionary of key-value 

40pairs. Load them with the `metadata()` function: 

41``` 

42metadata = metadata('data/file.mat') 

43``` 

44 

45## Markers 

46 

47Some file formats also allow to store markers that mark specific 

48positions in the time series data. Load marker positions and spans (in 

49the 2-D array `locs`) and label and text strings (in the 2-D array 

50`labels`) with the `markers()` function: 

51``` 

52locs, labels = markers('data.wav') 

53``` 

54 

55## Aditional, format specific functions 

56 

57- `extract_container_metadata()`: extract metadata from dictionary loaded from a container file. 

58- `relacs_samplerate_unit()`: retrieve sampling rate and unit from a relacs stimuli.dat file. 

59- `relacs_header()`: read key-value pairs from relacs *.dat file headers. 

60- `fishgrid_grids()`: retrieve grid sizes from a fishgrid.cfg file. 

61- `fishgrid_spacings()`: spacing between grid electrodes. 

62 

63""" 

import gc
import os
import sys
import glob
import gzip
import numpy as np
try:
    import matplotlib.pyplot as plt
except ImportError:
    pass
from datetime import timedelta
from audioio import load_audio, AudioLoader, unflatten_metadata
from audioio import get_number_unit, get_number, get_int, get_bool, get_gain
from audioio import default_starttime_keys, default_gain_keys
from audioio import get_datetime, flatten_metadata, add_metadata, set_starttime
from audioio import metadata as metadata_audioio
from audioio import markers as markers_audioio

def relacs_samplerate_unit(filepath, channel=0):
    """Retrieve sampling rate and unit from a relacs stimuli.dat file.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.
    channel: int
        Channel (trace) number, if `filepath` does not specify a
        trace-*.raw file.

    Returns
    -------
    samplerate: float
        Sampling rate in Hertz.
    unit: str
        Unit of the trace, can be empty if not found.

    Raises
    ------
    IOError/FileNotFoundError:
        If the stimuli.dat file does not exist.
    ValueError:
        stimuli.dat file does not contain the sampling rate.
    """
    trace = channel + 1
    relacs_dir = filepath
    # check for relacs data directory:
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
        bn = os.path.basename(filepath).lower()
        i = bn.find('.raw')
        if len(bn) > 5 and bn[0:5] == 'trace' and i > 6:
            trace = int(bn[6:i])

    # retrieve sampling rate and unit from stimuli.dat file:
    samplerate = None
    sampleinterval = None
    unit = ""

    lines = []
    stimuli_file = os.path.join(relacs_dir, 'stimuli.dat')
    if os.path.isfile(stimuli_file + '.gz'):
        stimuli_file += '.gz'
    if stimuli_file[-3:] == '.gz':
        with gzip.open(stimuli_file, 'rt', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    else:
        with open(stimuli_file, 'r', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)

    for line in lines:
        if "unit%d" % trace in line:
            unit = line.split(':')[1].strip()
        if "sampling rate%d" % trace in line:
            value = line.split(':')[1].strip()
            samplerate = float(value.replace('Hz', ''))
        elif "sample interval%d" % trace in line:
            value = line.split(':')[1].strip()
            sampleinterval = float(value.replace('ms', ''))

    if samplerate is not None:
        return samplerate, unit
    if sampleinterval is not None:
        return 1000/sampleinterval, unit
    raise ValueError(f'could not retrieve sampling rate from {stimuli_file}')
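
# Usage sketch ('2021-01-01-aa' is a hypothetical relacs data
# directory):
#
#     rate, unit = relacs_samplerate_unit('2021-01-01-aa/trace-2.raw')
#     print(f'trace-2.raw: {rate:g} Hz in {unit}')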

def relacs_header(filepath, store_empty=False, first_only=False,
                  lower_keys=False, flat=False,
                  add_sections=False):
    """Read key-value pairs from a relacs *.dat file header.

    Parameters
    ----------
    filepath: str
        A relacs *.dat file, can also be a zipped .gz file.
    store_empty: bool
        If `False` do not add metadata with empty values.
    first_only: bool
        If `True` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with section names separated by
        '.' to make them unique.

    Returns
    -------
    data: dict
        Nested dictionary with key-value pairs of the file header.

    Raises
    ------
    IOError/FileNotFoundError:
        If `filepath` cannot be opened.
    """
    # read in header from file:
    lines = []
    if os.path.isfile(filepath + '.gz'):
        filepath += '.gz'
    if filepath[-3:] == '.gz':
        with gzip.open(filepath, 'rt', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    else:
        with open(filepath, 'r', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    # parse:
    data = {}
    cdatas = [data]
    sections = ['']
    ident_offs = None
    ident = None
    for line in lines:
        words = line.split(':')
        value = ':'.join(words[1:]).strip() if len(words) > 1 else ''
        if len(words) >= 1:
            key = words[0].strip('#')
            # get section level:
            level = 0
            if not flat or len(value) == 0:
                nident = len(key) - len(key.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
                # close sections:
                if not flat:
                    while len(cdatas) > level + 1:
                        cdatas[-1][sections.pop()] = cdatas.pop()
                else:
                    while len(sections) > level + 1:
                        sections.pop()
            # key:
            key = key.strip().strip('"')
            if lower_keys:
                key = key.lower()
            skey = key
            if add_sections:
                key = '.'.join(sections[1:] + [key])
            if len(value) == 0:
                # new sub-section:
                if flat:
                    if store_empty:
                        cdatas[-1][key] = None
                else:
                    cdatas.append({})
                    sections.append(skey)
            else:
                # key-value pair:
                value = value.strip('"')
                if (len(value) > 0 and value != '-') or store_empty:
                    if len(value) > 0 and value[0] == '[' and value[-1] == ']':
                        value = [v.strip() for v in value.lstrip('[').rstrip(']').split(',')]
                        if first_only:
                            value = value[0]
                    cdatas[-1][key] = value
    while len(cdatas) > 1:
        cdatas[-1][sections.pop()] = cdatas.pop()
    return data
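
# Usage sketch ('2021-01-01-aa' is a hypothetical relacs data
# directory; the header is the initial block of '#' lines of a
# *.dat file):
#
#     md = relacs_header('2021-01-01-aa/stimuli.dat')
#     for key, value in md.items():
#         print(key, value)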

def check_relacs(filepath):
    """Check for valid relacs file.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    is_relacs: boolean
        `True` if `filepath` is a valid relacs directory or is a file therein.
    """
    # relacs data directory:
    relacs_dir = filepath
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
    # check for a valid relacs data directory:
    has_stimuli = False
    has_trace = False
    for fname in ['stimuli.dat', 'stimuli.dat.gz']:
        if os.path.isfile(os.path.join(relacs_dir, fname)):
            has_stimuli = True
    for fname in ['trace-1.raw', 'trace-1.raw.gz']:
        if os.path.isfile(os.path.join(relacs_dir, fname)):
            has_trace = True
    return has_stimuli and has_trace

def relacs_trace_files(filepath):
    """Expand file path for relacs data to appropriate trace*.raw file names.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    trace_filepaths: list of str
        List of relacs trace*.raw files.
    """
    relacs_dir = filepath
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
    trace_filepaths = []
    for k in range(10000):
        fname = os.path.join(relacs_dir, f'trace-{k+1}.raw')
        if os.path.isfile(fname):
            trace_filepaths.append(fname)
        elif os.path.isfile(fname + '.gz'):
            trace_filepaths.append(fname + '.gz')
        else:
            break
    return trace_filepaths

def load_relacs(filepath, amax=1.0):
    """Load traces that have been recorded with relacs (https://github.com/relacs/relacs).

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.
    amax: float
        The amplitude range of the data.

    Returns
    -------
    data: 2-D array
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    FileNotFoundError:
        Invalid or non-existing relacs files.
    ValueError:
        - Invalid name for relacs trace-*.raw file.
        - Sampling rates of traces differ.
        - Units of traces differ.
    """
    trace_filepaths = relacs_trace_files(filepath)
    if len(trace_filepaths) == 0:
        raise FileNotFoundError('no relacs files found')
    # load trace*.raw files:
    nchannels = len(trace_filepaths)
    data = None
    nrows = 0
    rate = None
    unit = ''
    for c, path in enumerate(sorted(trace_filepaths)):
        if path[-3:] == '.gz':
            with gzip.open(path, 'rb') as sf:
                x = np.frombuffer(sf.read(), dtype=np.float32)
        else:
            x = np.fromfile(path, np.float32)
        if data is None:
            nrows = len(x)
            data = np.zeros((nrows, nchannels))
        n = min(len(x), nrows)
        data[:n, c] = x[:n]
        # retrieve sampling rate and unit:
        crate, us = relacs_samplerate_unit(path, c)
        if rate is None:
            rate = crate
        elif crate != rate:
            raise ValueError('sampling rates of traces differ')
        if len(unit) == 0:
            unit = us
        elif us != unit:
            raise ValueError('units of traces differ')
    return data, rate, unit, amax
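
# Usage sketch ('2021-01-01-aa' is a hypothetical relacs data
# directory):
#
#     data, rate, unit, amax = load_relacs('2021-01-01-aa')
#     print(f'{data.shape[1]} channels, {len(data)/rate:.1f}s at {rate:g} Hz in {unit}')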

def metadata_relacs(filepath, store_empty=False, first_only=False,
                    lower_keys=False, flat=False, add_sections=False):
    """Read metadata of a relacs data set.

    Parameters
    ----------
    filepath: str
        A relacs data directory or a file therein.
    store_empty: bool
        If `False` do not add metadata with empty values.
    first_only: bool
        If `True` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with section names separated by
        '.' to make them unique.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the metadata.
    """
    relacs_dir = filepath
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
    info_path = os.path.join(relacs_dir, 'info.dat')
    if not os.path.exists(info_path):
        return {}
    data = relacs_header(info_path, store_empty, first_only,
                         lower_keys, flat, add_sections)
    return data

def fishgrid_spacings(metadata, unit='m'):
    """Spacing between grid electrodes.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.
    unit: str
        Unit in which to return the spacings.

    Returns
    -------
    grids_dist: list of tuple of float
        For each grid the distances between rows and columns in `unit`.
    """
    grids_dist = []
    for k in range(4):
        row_dist = get_number(metadata, unit, f'RowDistance{k+1}', default=0)
        col_dist = get_number(metadata, unit, f'ColumnDistance{k+1}', default=0)
        rows = get_int(metadata, f'Rows{k+1}', default=0)
        cols = get_int(metadata, f'Columns{k+1}', default=0)
        if get_bool(metadata, f'Used{k+1}', default=False) or \
           (cols > 0 and rows > 0):
            grids_dist.append((row_dist, col_dist))
    return grids_dist

def fishgrid_grids(metadata):
    """Retrieve grid sizes from a fishgrid.cfg file.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.

    Returns
    -------
    grids: list of tuple of int
        For each grid the number of rows and columns.
    """
    grids = []
    for k in range(4):
        rows = get_int(metadata, f'Rows{k+1}', default=0)
        cols = get_int(metadata, f'Columns{k+1}', default=0)
        if get_bool(metadata, f'Used{k+1}', default=False) or \
           (cols > 0 and rows > 0):
            grids.append((rows, cols))
    return grids

def check_fishgrid(filepath):
    """Check for valid fishgrid file (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    filepath: str
        Path to a fishgrid data directory or a file in a fishgrid
        data directory.

    Returns
    -------
    is_fishgrid: bool
        `True` if `filepath` is a valid fishgrid data directory or
        a file therein.
    """
    # fishgrid data directory:
    fishgrid_dir = filepath
    if not os.path.isdir(filepath):
        fishgrid_dir = os.path.dirname(filepath)
    # check for a valid fishgrid data directory:
    return (os.path.isfile(os.path.join(fishgrid_dir, 'fishgrid.cfg')) and
            (os.path.isfile(os.path.join(fishgrid_dir, 'traces-grid1.raw')) or
             os.path.isfile(os.path.join(fishgrid_dir, 'traces.raw'))))

def fishgrid_trace_files(filepath):
    """Expand file paths for fishgrid data to appropriate traces*.raw file names.

    Parameters
    ----------
    filepath: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    trace_filepaths: list of str
        List of fishgrid traces*.raw files.
    """
    # find grids:
    fishgrid_dir = filepath
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(filepath)
    trace_filepaths = []
    for k in range(10000):
        file = os.path.join(fishgrid_dir, f'traces-grid{k+1}.raw')
        if os.path.isfile(file):
            trace_filepaths.append(file)
        else:
            break
    if len(trace_filepaths) == 0:
        file = os.path.join(fishgrid_dir, 'traces.raw')
        if os.path.isfile(file):
            trace_filepaths.append(file)
    return trace_filepaths

def load_fishgrid(filepath):
    """Load traces that have been recorded with fishgrid (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    filepath: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    data: 2-D array
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    FileNotFoundError:
        Invalid or non-existing fishgrid files.
    """
    trace_filepaths = fishgrid_trace_files(filepath)
    if len(trace_filepaths) == 0:
        raise FileNotFoundError('no fishgrid files found')
    md = metadata_fishgrid(filepath)
    grids = fishgrid_grids(md)
    grid_sizes = [r*c for r, c in grids]

    # load traces-grid*.raw files:
    grid_channels = []
    nchannels = 0
    for g, path in enumerate(trace_filepaths):
        grid_channels.append(grid_sizes[g])
        nchannels += grid_sizes[g]
    data = None
    nrows = 0
    c = 0
    rate = get_number(md, 'Hz', 'AISampleRate')
    for path, channels in zip(trace_filepaths, grid_channels):
        x = np.fromfile(path, np.float32).reshape((-1, channels))
        if data is None:
            nrows = len(x)
            data = np.zeros((nrows, nchannels))
        n = min(len(x), nrows)
        data[:n, c:c+channels] = x[:n, :]
        c += channels
    amax, unit = get_number_unit(md, 'AIMaxVolt')
    return data, rate, unit, amax
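
# Usage sketch ('grid-recording' is a hypothetical fishgrid data
# directory):
#
#     data, rate, unit, amax = load_fishgrid('grid-recording')
#     md = metadata_fishgrid('grid-recording')
#     print(fishgrid_grids(md), fishgrid_spacings(md))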

# add fishgrid keys:
default_starttime_keys.append(['StartDate', 'StartTime'])
default_gain_keys.insert(0, 'AIMaxVolt')

def metadata_fishgrid(filepath):
    """Read metadata of a fishgrid data set.

    Parameters
    ----------
    filepath: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the metadata.
    """
    fishgrid_dir = filepath
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(filepath)
    path = os.path.join(fishgrid_dir, 'fishgrid.cfg')
    # read in header from file:
    lines = []
    if os.path.isfile(path + '.gz'):
        path += '.gz'
    if not os.path.exists(path):
        return {}
    if path[-3:] == '.gz':
        with gzip.open(path, 'rt', encoding='latin-1') as sf:
            for line in sf:
                lines.append(line)
    else:
        with open(path, 'r', encoding='latin-1') as sf:
            for line in sf:
                lines.append(line)
    # parse:
    data = {}
    cdatas = [data]
    ident_offs = None
    ident = None
    old_style = False
    grid_n = False
    for line in lines:
        if len(line.strip()) == 0:
            continue
        if line[0] == '*':
            key = line[1:].strip()
            data[key] = {}
            cdatas = [data, data[key]]
        elif '----' in line:
            old_style = True
            key = line.strip().strip(' -').replace('&', '')
            if key.upper() == 'SETUP':
                key = 'Grid 1'
            grid_n = False
            if key[:4].lower() == 'grid':
                grid_n = key[5]
            cdatas = cdatas[:2]
            cdatas[1][key] = {}
            cdatas.append(cdatas[1][key])
        else:
            words = line.split(':')
            key = words[0].strip().strip('"')
            value = None
            if len(words) > 1 and (len(words[1].strip()) > 0 or old_style):
                value = ':'.join(words[1:]).strip().strip('"')
            if old_style:
                if value is None:
                    cdatas = cdatas[:3]
                    cdatas[2][key] = {}
                    cdatas.append(cdatas[2][key])
                else:
                    if grid_n and key[-1] != grid_n:
                        key = key + grid_n
                    cdatas[-1][key] = value
            else:
                # get section level:
                level = 0
                nident = len(line) - len(line.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
                # close sections:
                cdatas = cdatas[:2 + level]
                if value is None:
                    # new section:
                    cdatas[-1][key] = {}
                    cdatas.append(cdatas[-1][key])
                else:
                    # key-value pair:
                    cdatas[-1][key] = value.replace(r'\n', '\n')
    # remove unused grids:
    fgm = data.get('FishGrid', {})
    for i in range(4):
        gs = f'Grid {i+1}'
        if gs in fgm:
            gm = fgm[gs]
            us = f'Used{i+1}'
            if us in gm and gm[us].upper() == 'FALSE':
                del fgm[gs]
    return data

def markers_fishgrid(filepath):
    """Read markers of a fishgrid data set.

    Parameters
    ----------
    filepath: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    def add_marker():
        if 'index1' in marker:
            index1 = int(marker['index1'])//nchannels
        else:
            index1 = int(marker['index'])//nchannels
        span1 = int(marker.get('span1', 0))//nchannels
        locs.append([index1, span1])
        ls = marker.get('label', 'M')
        cs = marker.get('comment', '')
        labels.append([ls, cs])

    fishgrid_dir = filepath
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(filepath)
    path = os.path.join(fishgrid_dir, 'timestamps.dat')
    if not os.path.isfile(path):
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
    # get number of channels:
    md = metadata_fishgrid(path.replace('timestamps.dat', 'fishgrid.cfg'))
    grids = fishgrid_grids(md)
    nchannels = np.prod(grids[0])
    # read timestamps:
    locs = []
    labels = []
    marker = {}
    with open(path, 'r') as sf:
        for line in sf:
            if len(line.strip()) == 0:
                add_marker()
                marker = {}
            else:
                words = line.split(':')
                if len(words) > 1:
                    v = words[1].strip()
                    v = v.strip('"')
                    marker[words[0].strip().lower()] = v
    if len(marker) > 0:
        add_marker()
    if len(locs) > 2:
        return np.array(locs[1:-1]), np.array(labels[1:-1])
    else:
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
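
# Usage sketch ('grid-recording' is a hypothetical fishgrid data
# directory containing a timestamps.dat file):
#
#     locs, labels = markers_fishgrid('grid-recording')
#     for (pos, span), (label, text) in zip(locs, labels):
#         print(pos, span, label, text)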

def check_container(filepath):
    """Check if file is a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_container: bool
        `True`, if `filepath` is a supported container format.
    """
    ext = os.path.splitext(filepath)[1]
    return ext.lower() in ('.pkl', '.npz', '.mat')

def extract_container_data(data_dict, datakey=None,
                           samplekey=['rate', 'Fs', 'fs'],
                           timekey=['time'], amplkey=['amax'], unitkey='unit',
                           amax=1.0, unit='a.u.'):
    """Extract data from a dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is a 2-D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str or list of str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
    amax: None or float
        If specified and no amplitude range has been found in `data_dict`,
        then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in `data_dict`,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range in `unit`.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    # extract format data:
    if not isinstance(samplekey, (list, tuple, np.ndarray)):
        samplekey = (samplekey,)
    if not isinstance(timekey, (list, tuple, np.ndarray)):
        timekey = (timekey,)
    if not isinstance(amplkey, (list, tuple, np.ndarray)):
        amplkey = (amplkey,)
    rate = 0.0
    for skey in samplekey:
        if skey in data_dict:
            rate = float(data_dict[skey])
            break
    if rate == 0.0:
        for tkey in timekey:
            if tkey in data_dict:
                rate = 1.0/(data_dict[tkey][1] - data_dict[tkey][0])
                break
    if rate == 0.0:
        raise ValueError(f"invalid keys {', '.join(samplekey)} and {', '.join(timekey)} for requesting sampling rate or sampling times")
    for akey in amplkey:
        if akey in data_dict:
            amax = float(data_dict[akey])
            break
    if unitkey in data_dict:
        unit = data_dict[unitkey]
    # get data array:
    raw_data = np.array([])
    if datakey:
        # try data keys:
        if not isinstance(datakey, (list, tuple, np.ndarray)):
            datakey = (datakey,)
        for dkey in datakey:
            if dkey in data_dict:
                raw_data = data_dict[dkey]
                break
        if len(raw_data) == 0:
            raise ValueError(f"invalid key(s) {', '.join(datakey)} for requesting data")
    else:
        # find the largest 1-D or 2-D array:
        for d in data_dict:
            if hasattr(data_dict[d], 'shape'):
                if 1 <= len(data_dict[d].shape) <= 2 and \
                   np.max(data_dict[d].shape) > np.max(raw_data.shape):
                    raw_data = data_dict[d]
        if len(raw_data) == 0:
            raise ValueError('no data found')
    # make 2D:
    if len(raw_data.shape) == 1:
        raw_data = raw_data.reshape(-1, 1)
    # transpose if necessary:
    if np.argmax(raw_data.shape) > 0:
        raw_data = raw_data.T
    # recode:
    if raw_data.dtype == np.dtype('int16'):
        data = raw_data.astype('float32')
        data *= amax/2**15
    elif raw_data.dtype == np.dtype('int32'):
        data = raw_data.astype(float)
        data *= amax/2**31
    elif raw_data.dtype == np.dtype('int64'):
        data = raw_data.astype(float)
        data *= amax/2**63
    else:
        data = raw_data
    return data, rate, unit, amax
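
# Usage sketch with an in-memory dictionary as it would be loaded
# from a container file:
#
#     dd = dict(rate=20000.0, amax=1.0, unit='V',
#               data=np.sin(2.0*np.pi*100.0*np.arange(0.0, 1.0, 1.0/20000.0)))
#     data, rate, unit, amax = extract_container_data(dd)
#     print(data.shape, rate, unit, amax)  # (20000, 1) 20000.0 V 1.0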

def load_container(filepath, datakey=None,
                   samplekey=['rate', 'Fs', 'fs'],
                   timekey=['time'], amplkey=['amax'], unitkey='unit',
                   amax=1.0, unit='a.u.'):
    """Load data from a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    filepath: str
        Path of the file to load.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is a 2-D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
    amax: None or float
        If specified and no amplitude range has been found in the data
        container, then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in the data container,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    # load data:
    data_dict = {}
    ext = os.path.splitext(filepath)[1]
    if ext == '.pkl':
        import pickle
        with open(filepath, 'rb') as f:
            data_dict = pickle.load(f)
    elif ext == '.npz':
        data_dict = np.load(filepath)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(filepath, squeeze_me=True)
    return extract_container_data(data_dict, datakey, samplekey,
                                  timekey, amplkey, unitkey, amax, unit)
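
# Usage sketch: write a numpy .npz container and read it back
# ('test.npz' is a hypothetical file name):
#
#     x = np.random.randn(1000, 2)
#     np.savez('test.npz', data=x, rate=1000.0, amax=1.0, unit='mV')
#     data, rate, unit, amax = load_container('test.npz')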

def extract_container_metadata(data_dict, metadatakey=['metadata', 'info']):
    """Extract metadata from a dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the metadata.
    """
    if not isinstance(metadatakey, (list, tuple, np.ndarray)):
        metadatakey = (metadatakey,)
    # get single metadata dictionary:
    for mkey in metadatakey:
        if mkey in data_dict:
            return data_dict[mkey]
    # collect all keys starting with metadatakey:
    metadata = {}
    for mkey in metadatakey:
        mkey += '__'
        for dkey in data_dict:
            if dkey[:len(mkey)] == mkey:
                v = data_dict[dkey]
                if hasattr(v, 'size') and v.ndim == 0:
                    v = v.item()
                metadata[dkey[len(mkey):]] = v
    if len(metadata) > 0:
        return unflatten_metadata(metadata, sep='__')
    return metadata

def metadata_container(filepath, metadatakey=['metadata', 'info']):
    """Read metadata of a container file.

    Parameters
    ----------
    filepath: str
        A container file.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the metadata.
    """
    data_dict = {}
    ext = os.path.splitext(filepath)[1]
    if ext == '.pkl':
        import pickle
        with open(filepath, 'rb') as f:
            data_dict = pickle.load(f)
    elif ext == '.npz':
        data_dict = np.load(filepath)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(filepath, squeeze_me=True)
    return extract_container_metadata(data_dict, metadatakey)

def extract_container_markers(data_dict, poskey=['positions'],
                              spanskey=['spans'], labelskey=['labels'],
                              descrkey=['descriptions']):
    """Extract markers from a dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    if not isinstance(poskey, (list, tuple, np.ndarray)):
        poskey = (poskey,)
    if not isinstance(spanskey, (list, tuple, np.ndarray)):
        spanskey = (spanskey,)
    if not isinstance(labelskey, (list, tuple, np.ndarray)):
        labelskey = (labelskey,)
    if not isinstance(descrkey, (list, tuple, np.ndarray)):
        descrkey = (descrkey,)
    locs = np.zeros((0, 2), dtype=int)
    for pkey in poskey:
        if pkey in data_dict:
            locs = np.zeros((len(data_dict[pkey]), 2), dtype=int)
            locs[:, 0] = data_dict[pkey]
            break
    for skey in spanskey:
        if skey in data_dict:
            locs[:, 1] = data_dict[skey]
            break
    labels = np.zeros((0, 2), dtype=object)
    for lkey in labelskey:
        if lkey in data_dict:
            labels = np.zeros((len(data_dict[lkey]), 2), dtype=object)
            labels[:, 0] = data_dict[lkey]
            break
    for dkey in descrkey:
        if dkey in data_dict:
            labels[:, 1] = data_dict[dkey]
            break
    return locs, labels

def markers_container(filepath, poskey=['positions'],
                      spanskey=['spans'], labelskey=['labels'],
                      descrkey=['descriptions']):
    """Read markers of a container file.

    Parameters
    ----------
    filepath: str
        A container file.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    data_dict = {}
    ext = os.path.splitext(filepath)[1]
    if ext == '.pkl':
        import pickle
        with open(filepath, 'rb') as f:
            data_dict = pickle.load(f)
    elif ext == '.npz':
        data_dict = np.load(filepath)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(filepath, squeeze_me=True)
    return extract_container_markers(data_dict, poskey, spanskey,
                                     labelskey, descrkey)

def check_raw(filepath):
    """Check if file is a raw file.

    The following extensions are interpreted as raw files:

    - raw files (*.raw)
    - LabView scandata (*.scandat)
    - matlab data files (*.mat)

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_raw: bool
        `True`, if `filepath` is a raw format.
    """
    ext = os.path.splitext(filepath)[1]
    return ext.lower() in ('.raw', '.scandat', '.mat')

def load_raw(filepath, rate=44000, channels=1, dtype=np.float32,
             amax=1.0, unit='a.u.'):
    """Load data from a raw file.

    Raw files just contain the data and absolutely no metadata, not
    even the sampling rate, number of channels, etc.
    Supported file formats are:

    - raw files (*.raw)
    - LabView scandata (*.scandat)

    Parameters
    ----------
    filepath: str
        Path of the file to load.
    rate: float
        Sampling rate of the data in Hertz.
    channels: int
        Number of channels multiplexed in the data.
    dtype: str or numpy.dtype
        The data type stored in the file.
    amax: float
        The amplitude range of the data.
    unit: str
        The unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.
    """
    raw_data = np.fromfile(filepath, dtype=dtype).reshape(-1, channels)
    # recode:
    if dtype == np.dtype('int16'):
        data = raw_data.astype('float32')
        data *= amax/2**15
    elif dtype == np.dtype('int32'):
        data = raw_data.astype(float)
        data *= amax/2**31
    elif dtype == np.dtype('int64'):
        data = raw_data.astype(float)
        data *= amax/2**63
    else:
        data = raw_data
    return data, rate, unit, amax
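
# Usage sketch: dump int16 samples to a raw file and load them back
# ('test.raw' is a hypothetical file name):
#
#     x = 2**14*np.sin(2.0*np.pi*440.0*np.arange(0.0, 0.1, 1.0/44000.0))
#     x.astype(np.int16).tofile('test.raw')
#     data, rate, unit, amax = load_raw('test.raw', rate=44000,
#                                       channels=1, dtype=np.int16)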

def load_audioio(filepath, verbose=0, gainkey=default_gain_keys, sep='.',
                 amax=1.0, unit='a.u.'):
    """Load data from an audio file.

    See the
    [`load_audio()`](https://bendalab.github.io/audioio/api/audioloader.html#audioio.audioloader.load_audio)
    function of the [`audioio`](https://github.com/bendalab/audioio)
    package for more information.

    Parameters
    ----------
    filepath: str
        Path of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    gainkey: str or list of str
        Key in the file's metadata that holds some gain information.
        If found, the data will be multiplied with the gain,
        and if available, the corresponding unit is returned.
        See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
    sep: str
        String that separates section names in `gainkey`.
    amax: float
        If specified and no gain has been found in the metadata,
        then use this as the amplitude range.
    unit: str
        If specified and no gain has been found in the metadata,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data if found in the metadata (see `gainkey`),
        otherwise `unit`.
    amax: float
        Maximum amplitude of data range.
    """
    # get gain:
    md = metadata_audioio(filepath)
    amax, unit = get_gain(md, gainkey, sep, amax, unit)
    # load data:
    data, rate = load_audio(filepath, verbose)
    if amax != 1.0:
        data *= amax
    return data, rate, unit, amax

data_loader_funcs = (
    ('relacs', check_relacs, load_relacs, metadata_relacs, None),
    ('fishgrid', check_fishgrid, load_fishgrid, metadata_fishgrid, markers_fishgrid),
    ('container', check_container, load_container, metadata_container, markers_container),
    ('raw', check_raw, load_raw, None, None),
    ('audioio', None, load_audioio, metadata_audioio, markers_audioio),
    )
"""List of implemented load functions.

Each element of the list is a tuple with the data format's name and
its check, load, metadata, and markers functions.
"""

def load_data(filepath, verbose=0, **kwargs):
    """Load time-series data from a file.

    Parameters
    ----------
    filepath: str
        Path and name of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.
        For example:
        - `amax`: the amplitude range of the data.
        - `unit`: the unit of the data.

    Returns
    -------
    data: 2-D array
        All data traces as a 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        `filepath` is an empty string.
    """
    if len(filepath) == 0:
        raise ValueError('input argument filepath is empty string.')
    # load data:
    for name, check_file, load_file, _, _ in data_loader_funcs:
        if check_file is None or check_file(filepath):
            data, rate, unit, amax = load_file(filepath, **kwargs)
            if verbose > 0:
                print(f'loaded {name} data from file "{filepath}"')
                if verbose > 1:
                    print(f'  sampling rate: {rate:g} Hz')
                    print(f'  channels     : {data.shape[1]}')
                    print(f'  frames       : {len(data)}')
                    print(f'  range        : {amax:g}{unit}')
            return data, rate, unit, amax
    return np.zeros((0, 1)), 0.0, '', 1.0
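
# Usage sketch ('data/file.wav' is a hypothetical file; plotting
# needs matplotlib, which is imported above if available):
#
#     data, rate, unit, amax = load_data('data/file.wav', verbose=2)
#     time = np.arange(len(data))/rate
#     plt.plot(time, data[:, 0])
#     plt.ylabel(f'amplitude [{unit}]')
#     plt.show()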

def metadata(filepath, **kwargs):
    """Read metadata from a data file.

    Parameters
    ----------
    filepath: str
        The full path and name of the file to load. For some file
        formats several files can be provided in a list.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.

    Returns
    -------
    meta_data: nested dict
        Metadata contained in the file. Keys of the nested
        dictionaries are always strings. If the corresponding
        values are dictionaries, then the key is the section name
        of the metadata contained in the dictionary. All other
        types of values are values for the respective key. In
        particular they are strings, or lists of strings. But other
        simple types like ints or floats are also allowed.

    Raises
    ------
    ValueError:
        `filepath` is an empty string.
    """
    if len(filepath) == 0:
        raise ValueError('input argument filepath is empty string.')
    # load metadata:
    for _, check_file, _, metadata_file, _ in data_loader_funcs:
        if check_file is None or check_file(filepath):
            if metadata_file is not None:
                return metadata_file(filepath, **kwargs)
            return {}
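
# Usage sketch: recursively print a nested metadata dictionary
# ('data/file.wav' is hypothetical):
#
#     def print_meta(md, indent=0):
#         for key, value in md.items():
#             if isinstance(value, dict):
#                 print(f'{" "*indent}{key}:')
#                 print_meta(value, indent + 4)
#             else:
#                 print(f'{" "*indent}{key}: {value}')
#     print_meta(metadata('data/file.wav'))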

def markers(filepath):
    """Read markers of a data file.

    Parameters
    ----------
    filepath: str or file handle
        The data file.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).

    Raises
    ------
    ValueError:
        `filepath` is an empty string.
    """
    if len(filepath) == 0:
        raise ValueError('input argument filepath is empty string.')
    # load markers:
    for _, check_file, _, _, markers_file in data_loader_funcs:
        if check_file is None or check_file(filepath):
            if markers_file is not None:
                return markers_file(filepath)
            return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
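
# Usage sketch: report marker positions in seconds
# ('data/file.wav' is hypothetical):
#
#     data, rate, unit, amax = load_data('data/file.wav')
#     locs, labels = markers('data/file.wav')
#     for (pos, span), (label, text) in zip(locs, labels):
#         print(f'{pos/rate:.3f}s, span {span/rate:.3f}s: {label} "{text}"')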

class DataLoader(AudioLoader):
    """Buffered reading of time-series data for random access of the data in the file.

    This allows for reading very large data files that do not fit into
    memory. A `DataLoader` instance can be used like a huge
    read-only numpy array, i.e.
    ```
    data = DataLoader('path/to/data/file.dat')
    x = data[10000:20000, 0]
    ```
    The first index specifies the frame, the second one the channel.

    `DataLoader` first determines the format of the data file and then
    opens the file (first line). It then reads data from the file as
    necessary for the requested data (second line).

    Supported file formats are

    - audio files via the `audioio` package
    - python pickle files
    - numpy .npz files
    - matlab .mat files
    - relacs trace*.raw files (https://www.relacs.net)
    - fishgrid traces-*.raw files (https://github.com/bendalab/fishgrid)

    Reading sequentially through the file is always possible. If
    previous data are requested, then the file is read from the
    beginning. This might slow down access to previous data
    considerably. Use the `backsize` argument of the open functions to
    make sure some data are loaded before the requested frame. Then a
    subsequent access to the data within `backsize` seconds before that
    frame can still be handled without the need to reread the file
    from the beginning.

    Usage
    -----
    ```
    import thunderlab.dataloader as dl
    with dl.DataLoader(filepath, 60.0, 10.0) as data:
        # do something with the content of the file:
        x = data[0:10000, 0]
        y = data[10000:20000, 0]
        z = x + y
    ```

    Normal open and close:
    ```
    data = dl.DataLoader(filepath, 60.0)
    x = data[:, :]  # read the whole file
    data.close()
    ```
    that is the same as:
    ```
    data = dl.DataLoader()
    data.open(filepath, 60.0)
    ```

    Parameters
    ----------
    filepath: str
        Name of the file.
    buffersize: float
        Size of internal buffer in seconds.
    backsize: float
        Part of the buffer to be loaded before the requested start index in seconds.
    verbose: int
        If larger than zero show detailed error/warning messages.
    meta_kwargs: dict
        Keyword arguments that are passed on to the _load_metadata() function.

    Attributes
    ----------
    rate: float
        The sampling rate of the data in Hertz.
    channels: int
        The number of channels that are read in.
    frames: int
        The number of frames in the file.
    format: str or None
        Format of the audio file.
    encoding: str or None
        Encoding/subtype of the audio file.
    shape: tuple
        Number of frames and channels of the data.
    ndim: int
        Number of dimensions: always 2 (frames and channels).
    unit: str
        Unit of the data.
    ampl_min: float
        Minimum amplitude the file format supports.
    ampl_max: float
        Maximum amplitude the file format supports.

    Methods
    -------
    - `len()`: the number of frames.
    - `open()`: open a data file.
    - `open_*()`: open a data file of a specific format.
    - `close()`: close the file.
    - `metadata()`: metadata of the file.
    - `markers()`: markers of the file.
    - `set_unwrap()`: set parameters for unwrapping clipped data.
    """

    def __init__(self, filepath=None, buffersize=10.0, backsize=0.0,
                 verbose=0, **meta_kwargs):
        super().__init__(None, buffersize, backsize,
                         verbose, **meta_kwargs)
        if filepath is not None:
            self.open(filepath, buffersize, backsize, verbose, **meta_kwargs)

    def __getitem__(self, key):
        return super().__getitem__(key)

    def __next__(self):
        return super().__next__()
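
    # Usage sketch: process a long recording in one-minute chunks
    # without ever holding the whole file in memory
    # ('data/file.wav' is hypothetical):
    #
    #     with DataLoader('data/file.wav', buffersize=60.0) as data:
    #         step = int(60.0*data.rate)
    #         for i in range(0, len(data), step):
    #             print(i/data.rate, np.std(data[i:i + step, 0]))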

    # relacs interface:
    def open_relacs(self, filepath, buffersize=10.0, backsize=0.0,
                    verbose=0, amax=1.0):
        """Open relacs data files (https://www.relacs.net) for reading.

        Parameters
        ----------
        filepath: str
            Path to a relacs data directory or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        amax: float
            The amplitude range of the data.

        Raises
        ------
        FileNotFoundError:
            Invalid or non-existing relacs files.
        ValueError:
            .gz files not supported.
        """
        self.verbose = verbose

        # open trace files:
        self.trace_filepaths = relacs_trace_files(filepath)
        if len(self.trace_filepaths) == 0:
            raise FileNotFoundError('no relacs files found')
        self.sf = []
        self.frames = None
        self.rate = None
        self.unit = ''
        self.filepath = os.path.dirname(self.trace_filepaths[0])
        self.file_paths = [self.filepath]
        self.file_indices = [0]
        for path in self.trace_filepaths:
            if path[-3:] == '.gz':
                raise ValueError('.gz files not supported')
            sf = open(path, 'rb')
            self.sf.append(sf)
            if verbose > 0:
                print(f'open_relacs(filepath) with filepath={path}')
            # file size:
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
            # retrieve sampling rate and unit:
            rate, us = relacs_samplerate_unit(path)
            if self.rate is None:
                self.rate = rate
            elif rate != self.rate:
                raise ValueError('sampling rates of traces differ')
            if len(self.unit) == 0:
                self.unit = us
            elif us != self.unit:
                raise ValueError('units of traces differ')
        self.channels = len(self.sf)
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'RELACS'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        self.close = self._close_relacs
        self.load_audio_buffer = self._load_buffer_relacs
        self.ampl_min = -amax
        self.ampl_max = +amax
        self._load_metadata = self._metadata_relacs
        # TODO: load markers:
        self._locs = np.zeros((0, 2), dtype=int)
        self._labels = np.zeros((0, 2), dtype=object)
        self._load_markers = None
        return self

    def _close_relacs(self):
        """Close the relacs data files.
        """
        for file in self.sf:
            file.close()
        self.sf = []

    def _load_buffer_relacs(self, r_offset, r_size, buffer):
        """Load new data from relacs data files.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        if len(self.sf) == 0 and len(self.trace_filepaths) > 0:
            for path in self.trace_filepaths:
                self.sf.append(open(path, 'rb'))
        for i, file in enumerate(self.sf):
            file.seek(r_offset*4)
            data = file.read(r_size*4)
            buffer[:, i] = np.frombuffer(data, dtype=np.float32)

    def _metadata_relacs(self, store_empty=False, first_only=False):
        """Load metadata of a relacs data set.
        """
        info_path = os.path.join(self.filepath, 'info.dat')
        if not os.path.exists(info_path):
            return {}
        return relacs_header(info_path, store_empty, first_only)

    # fishgrid interface:
    def open_fishgrid(self, filepath, buffersize=10.0, backsize=0.0,
                      verbose=0):
        """Open fishgrid data files (https://github.com/bendalab/fishgrid) for reading.

        Parameters
        ----------
        filepath: str
            Path to a fishgrid data directory, or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.

        Raises
        ------
        FileNotFoundError:
            Invalid or non-existing fishgrid files.
        """
        self.verbose = verbose

        self.trace_filepaths = fishgrid_trace_files(filepath)
        if len(self.trace_filepaths) == 0:
            raise FileNotFoundError('no fishgrid files found')
        self.filepath = os.path.dirname(self.trace_filepaths[0])
        self.file_paths = [self.filepath]
        self.file_indices = [0]
        self._load_metadata = metadata_fishgrid
        self._load_markers = markers_fishgrid

        # open grid files:
        grids = fishgrid_grids(self.metadata())
        grid_sizes = [r*c for r, c in grids]
        self.channels = 0
        for g, path in enumerate(self.trace_filepaths):
            self.channels += grid_sizes[g]
        self.sf = []
        self.grid_channels = []
        self.grid_offs = []
        offs = 0
        self.frames = None
        self.rate = get_number(self.metadata(), 'Hz', 'AISampleRate')
        v, self.unit = get_number_unit(self.metadata(), 'AIMaxVolt')
        if v is not None:
            self.ampl_min = -v
            self.ampl_max = +v

        for g, path in enumerate(self.trace_filepaths):
            sf = open(path, 'rb')
            self.sf.append(sf)
            if verbose > 0:
                print(f'open_fishgrid(filepath) with filepath={path}')
            # grid channels:
            self.grid_channels.append(grid_sizes[g])
            self.grid_offs.append(offs)
            offs += grid_sizes[g]
            # file size:
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4//grid_sizes[g]
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'FISHGRID'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        self.close = self._close_fishgrid
        self.load_audio_buffer = self._load_buffer_fishgrid
        return self

    def _close_fishgrid(self):
        """Close the fishgrid data files.
        """
        for file in self.sf:
            file.close()
        self.sf = []

    def _load_buffer_fishgrid(self, r_offset, r_size, buffer):
        """Load new data from fishgrid data files.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        if len(self.sf) == 0 and len(self.trace_filepaths) > 0:
            for path in self.trace_filepaths:
                self.sf.append(open(path, 'rb'))
        for file, gchannels, goffset in zip(self.sf, self.grid_channels, self.grid_offs):
            file.seek(r_offset*4*gchannels)
            data = file.read(r_size*4*gchannels)
            buffer[:, goffset:goffset+gchannels] = np.frombuffer(data, dtype=np.float32).reshape((-1, gchannels))

1753 # container interface: 

1754 def open_container(self, filepath, buffersize=10.0, 

1755 backsize=0.0, verbose=0, datakey=None, 

1756 samplekey=['rate', 'Fs', 'fs'], 

1757 timekey=['time'], amplkey=['amax'], unitkey='unit', 

1758 metadatakey=['metadata', 'info'], 

1759 poskey=['positions'], 

1760 spanskey=['spans'], labelskey=['labels'], 

1761 descrkey=['descriptions'], 

1762 amax=1.0, unit='a.u.'): 

1763 """Open generic container file. 

1764 

1765 Supported file formats are: 

1766 

1767 - python pickle files (.pkl) 

1768 - numpy files (.npz) 

1769 - matlab files (.mat) 

1770 

1771 Parameters 

1772 ---------- 

1773 filepath: str 

1774 Path to a container file. 

1775 buffersize: float 

1776 Size of internal buffer in seconds. 

1777 backsize: float 

1778 Part of the buffer to be loaded before the requested start index in seconds. 

1779 verbose: int 

1780 If > 0 show detailed error/warning messages. 

1781 datakey: None, str, or list of str 

1782 Name of the variable holding the data. If `None`, take the 

1783 variable that is a 2D array and has the largest number of 

1784 elements. 

1785 samplekey: str or list of str 

1786 Name of the variable holding the sampling rate. 

1787 timekey: str or list of str 

1788 Name of the variable holding sampling times. 

1789 If no sampling rate is available, the sampling rate is retrieved 

1790 from the sampling times. 

1791 amplkey: str or list of str 

1792 Name of the variable holding the amplitude range of the data. 

1793 unitkey: str 

1794 Name of the variable holding the unit of the data. 

1795 metadatakey: str or list of str 

1796 Name of the variable holding the metadata. 

1797 poskey: str or list of str 

1798 Name of the variable holding positions of markers. 

1799 spanskey: str or list of str 

1800 Name of the variable holding spans of markers. 

1801 labelskey: str or list of str 

1802 Name of the variable holding labels of markers. 

1803 descrkey: str or list of str 

1804 Name of the variable holding descriptions of markers. 

1805 amax: None or float 

1806 If specified and no amplitude range has been found in the data 

1807 container, then this is the amplitude range of the data. 

1808 unit: None or str 

1809 If specified and no unit has been found in the data container, 

1810 then return this as the unit of the data. 

1811 

1812 Raises 

1813 ------ 

1814 ValueError: 

1815 Invalid key requested. 
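
Examples
--------
A minimal sketch for a numpy container; the variable names
stored in the file are assumptions:
```
x = np.random.randn(20000, 2)  # 1 s of data on two channels
np.savez('data.npz', data=x, rate=20000.0, unit='mV')
data = DataLoader('data.npz')
print(data.rate, data.unit, data.shape)
```
Since 'rate' and 'unit' match the default `samplekey` and
`unitkey`, no further arguments are needed.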

1816 """ 

1817 self.verbose = verbose 

1818 data_dict = {} 

1819 ext = os.path.splitext(filepath)[1] 

1820 if ext == '.pkl': 

1821 import pickle 

1822 with open(filepath, 'rb') as f: 

1823 data_dict = pickle.load(f) 

1824 self.format = 'PKL' 

1825 elif ext == '.npz': 

1826 data_dict = np.load(filepath) 

1827 self.format = 'NPZ' 

1828 elif ext == '.mat': 

1829 from scipy.io import loadmat 

1830 data_dict = loadmat(filepath, squeeze_me=True) 

1831 self.format = 'MAT' 

1832 self.buffer, self.rate, self.unit, amax = \ 

1833 extract_container_data(data_dict, datakey, samplekey, 

1834 timekey, amplkey, unitkey, amax, unit) 

1835 self.filepath = filepath 

1836 self.file_paths = [self.filepath] 

1837 self.file_indices = [0] 

1838 self.channels = self.buffer.shape[1] 

1839 self.frames = self.buffer.shape[0] 

1840 self.shape = self.buffer.shape 

1841 self.ndim = self.buffer.ndim 

1842 self.size = self.buffer.size 

1843 self.encoding = self.numpy_encodings[self.buffer.dtype] 

1844 self.ampl_min = -amax 

1845 self.ampl_max = +amax 

1846 self.offset = 0 

1847 self.buffer_changed = np.zeros(self.channels, dtype=bool) 

1848 self.bufferframes = self.frames 

1849 self.backsize = 0 

1850 self.close = self._close_container 

1851 self.load_audio_buffer = self._load_buffer_container 

1852 self._metadata = extract_container_metadata(data_dict, metadatakey) 

1853 self._load_metadata = None 

1854 self._locs, self._labels = extract_container_markers(data_dict, 

1855 poskey, 

1856 spanskey, 

1857 labelskey, 

1858 descrkey) 

1859 self._load_markers = None 

1860 

1861 def _close_container(self): 

1862 """Close container. """ 

1863 pass 

1864 

1865 def _load_buffer_container(self, r_offset, r_size, buffer): 

1866 """Load new data from container.""" 

1867 buffer[:, :] = self.buffer[r_offset:r_offset + r_size, :] 

1868 

1869 

1870 # raw data interface: 

1871 def open_raw(self, filepath, buffersize=10.0, backsize=0.0, 

1872 verbose=0, rate=44000, channels=1, dtype=np.float32, 

1873 amax=1.0, unit='a.u.'): 

1874 """Load data from a raw file. 

1875 

1876 Raw files contain just the data and no metadata at all, not 

1877 even the sampling rate or the number of channels. 

1878 Supported file formats are: 

1879 

1880 - raw files (*.raw) 

1881 - LabView scandata (*.scandat) 

1882 

1883 Parameters 

1884 ---------- 

1885 filepath: str 

1886 Path of the file to load. 

1887 buffersize: float 

1888 Size of internal buffer in seconds. 

1889 backsize: float 

1890 Part of the buffer to be loaded before the requested start index in seconds. 

1891 verbose: int 

1892 If > 0 show detailed error/warning messages. 

1893 rate: float 

1894 Sampling rate of the data in Hertz. 

1895 channels: int 

1896 Number of channels multiplexed in the data. 

1897 dtype: str or numpy.dtype 

1898 The data type stored in the file. 

1899 amax: float 

1900 The amplitude range of the data. 

1901 unit: str 

1902 The unit of the data. 
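
Examples
--------
A sketch for a multiplexed int16 file; since the file itself
stores nothing but the data, all of these values are assumptions
the caller has to provide, and the keyword arguments are assumed
to be passed through to `open_raw()` (the file name is
hypothetical):
```
data = DataLoader('data.raw', rate=48000, channels=2,
                  dtype=np.int16, amax=5.0, unit='V')
```
The integer samples are rescaled on load, so the returned floats
span `-amax` to `+amax`.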

1903 """ 

1904 self.verbose = verbose 

1905 self.filepath = filepath 

1906 self.file_paths = [self.filepath] 

1907 self.file_indices = [0] 

1908 self.sf = open(self.filepath, 'rb') 

1909 if verbose > 0: 

1910 print(f'open_raw(filepath) with filepath={filepath}') 

1911 self.dtype = np.dtype(dtype) 

1912 self.rate = float(rate) 

1913 # file size (a frame holds one sample for each channel): 

1914 self.channels = int(channels) 

1915 self.sf.seek(0, os.SEEK_END) 

1916 self.frames = self.sf.tell()//self.dtype.itemsize//self.channels 

1917 self.sf.seek(0) 

1918 self.shape = (self.frames, self.channels) 

1919 self.ndim = len(self.shape) 

1920 self.size = self.frames*self.channels 

1921 self.format = 'RAW' 

1922 self.encoding = self.numpy_encodings.get(self.dtype, 'UNKNOWN') 

1923 self.unit = unit 

1924 self.ampl_max = float(amax) 

1925 self.ampl_min = -self.ampl_max 

1926 self.offset = 0 

1927 self.bufferframes = int(buffersize*self.rate) 

1928 self.backframes = int(backsize*self.rate) 

1929 self.init_buffer() 

1930 self.close = self._close_raw 

1931 self.load_audio_buffer = self._load_buffer_raw 

1932 self._metadata = None 

1933 self._load_metadata = None 

1934 self._locs = None 

1935 self._labels = None 

1936 self._load_markers = None 

1937 

1938 def _close_raw(self): 

1939 """Close raw file. """ 

1940 self.sf.close() 

1941 self.sf = None 

1942 

1943 def _load_buffer_raw(self, r_offset, r_size, buffer): 

1944 """Load new data from container.""" 

1945 if self.sf is None: 

1946 self.sf = open(self.filepath, 'rb') 

1947 self.sf.seek(r_offset*self.dtype.itemsize*self.channels) 

1948 raw_data = self.sf.read(r_size*self.dtype.itemsize*self.channels) 

1949 raw_data = np.frombuffer(raw_data, dtype=self.dtype) 

1950 raw_data = raw_data.reshape(-1, self.channels) 

1951 # recode: scale integer samples to floats spanning -amax to +amax: 

1952 if self.dtype == np.dtype('int16'): 

1953 data = raw_data.astype('float32') 

1954 data *= self.ampl_max/2**15 

1955 elif self.dtype == np.dtype('int32'): 

1956 data = raw_data.astype(float) 

1957 data *= self.ampl_max/2**31 

1958 elif self.dtype == np.dtype('int64'): 

1959 data = raw_data.astype(float) 

1960 data *= self.ampl_max/2**63 

1961 else: 

1962 data = raw_data 

1963 buffer[:, :] = data 
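
# A worked instance of the recoding above: with amax = 5.0 an int16
# sample of 16384 becomes 16384/2**15 * 5.0 = 2.5 and -32768 becomes
# -5.0, so the loaded floats always span -amax to +amax.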

1964 

1965 

1966 # audioio interface:  

1967 def open_audioio(self, filepath, buffersize=10.0, backsize=0.0, 

1968 verbose=0, gainkey=default_gain_keys, sep='.', 

1969 amax=None, unit='a.u.'): 

1970 """Open an audio file. 

1971 

1972 See the [audioio](https://github.com/bendalab/audioio) package 

1973 for details. 

1974 

1975 Parameters 

1976 ---------- 

1977 filepath: str 

1978 Path to an audio file. 

1979 buffersize: float 

1980 Size of internal buffer in seconds. 

1981 backsize: float 

1982 Part of the buffer to be loaded before the requested start index 

1983 in seconds. 

1984 verbose: int 

1985 If > 0 show detailed error/warning messages. 

1986 gainkey: str or list of str 

1987 Key in the file's metadata that holds some gain information. 

1988 If found, the data will be multiplied with the gain, 

1989 and if available, the corresponding unit is returned. 

1990 See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details. 

1991 sep: str 

1992 String that separates section names in `gainkey`. 

1993 amax: None or float 

1994 If specified and no gain has been found in the metadata, 

1995 then use this as the amplitude range. 

1996 unit: None or str 

1997 If specified and no gain has been found in the metadata, 

1998 then this is the unit of the data. 

1999 
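Examples
--------
A sketch: if the file's metadata contain a gain entry such as
'0.42mV' under one of the `gainkey` keys, every sample is
multiplied by 0.42 and the unit becomes 'mV' (the file name is
hypothetical):
```
data = DataLoader('recording.wav')
print(data.unit, data.ampl_max)
```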

2000 """ 

2001 self.verbose = verbose 

2002 super().open(filepath, buffersize, backsize, verbose) 

2003 md = self.metadata() 

2004 fac, unit = get_gain(md, gainkey, sep, amax, unit) 

2005 if fac is None: 

2006 self.gain_fac = 1.0 

2007 else: 

2008 self.gain_fac = fac 

2009 self._load_buffer_audio_org = self.load_audio_buffer 

2010 self.load_audio_buffer = self._load_buffer_audioio 

2011 self.ampl_min *= self.gain_fac 

2012 self.ampl_max *= self.gain_fac 

2013 self.unit = unit 

2014 return self 

2015 

2016 def _load_buffer_audioio(self, r_offset, r_size, buffer): 

2017 """Load and scale new data from an audio file. 

2018 

2019 Parameters 

2020 ---------- 

2021 r_offset: int 

2022 First frame to be read from file. 

2023 r_size: int 

2024 Number of frames to be read from file. 

2025 buffer: ndarray 

2026 Buffer where to store the loaded data. 

2027 """ 

2028 self._load_buffer_audio_org(r_offset, r_size, buffer) 

2029 buffer *= self.gain_fac 

2030 

2031 

2032 # open multiple files as one: 

2033 def open_multiple(self, filepaths, buffersize=10.0, backsize=0.0, 

2034 verbose=0, rate=None, channels=None, 

2035 unit=None, amax=None, end_indices=None): 

2036 """Open multiple files as a single concatenated array. 

2037 

2038 Parameters 

2039 ---------- 

2040 filepaths: list of str 

2041 List of file names of audio files. 

2042 buffersize: float 

2043 Size of internal buffer in seconds. 

2044 backsize: float 

2045 Part of the buffer to be loaded before the requested start index in seconds. 

2046 verbose: int 

2047 If > 0 show detailed error/warning messages. 

2048 rate: float 

2049 If provided, do a minimal initialization (no checking) 

2050 using the provided sampling rate (in Hertz), channels, 

2051 unit, maximum amplitude, and end_indices. 

2052 channels: int 

2053 If provided, do a minimal initialization (no checking) 

2054 using the provided rate, number of channels, 

2055 unit, maximum amplitude, and end_indices. 

2056 unit: str 

2057 If provided, do a minimal initialization (no checking) 

2058 using the provided rate, number of channels, 

2059 unit, maximum amplitude, and end_indices. 

2060 amax: float 

2061 If provided, do a minimal initialization (no checking) 

2062 using the provided rate, number of channels, 

2063 unit, maximum amplitude amax, and end_indices. 

2064 end_indices: sequence of int 

2065 If provided, do a minimal initialization (no checking) 

2066 using the provided rate, channels, 

2067 unit, maximum amplitude, and end_indices. 

2068 

2069 Raises 

2070 ------ 

2071 TypeError 

2072 `filepaths` must be a sequence. 

2073 ValueError 

2074 Empty `filepaths`. 

2075 FileNotFoundError 

2076 `filepaths` does not contain a single valid file. 

2077 
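Examples
--------
A sketch of virtually concatenating two consecutive recordings
(file names are hypothetical); the files must agree in sampling
rate, number of channels, and amplitude range, and their start
times must indicate a continuous recording:
```
data = DataLoader(['rec-0001.wav', 'rec-0002.wav'])
print(len(data))  # sum of the frames of both files
```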

2078 """ 

2079 if not isinstance(filepaths, (list, tuple, np.ndarray)): 

2080 raise TypeError('input argument filepaths is not a sequence!') 

2081 if len(filepaths) == 0: 

2082 raise ValueError('input argument filepaths is an empty sequence!') 

2083 self.buffersize = buffersize 

2084 self.backsize = backsize 

2085 self.filepath = None 

2086 self.file_paths = [] 

2087 self.open_files = [] 

2088 self.open_loaders = [] 

2089 self.data_files = [] 

2090 self.collect_counter = 0 

2091 self.frames = 0 

2092 self.start_indices = [] 

2093 self.end_indices = [] 

2094 self.start_time = None 

2095 start_time = None 

2096 self._metadata = {} 

2097 self._locs = np.zeros((0, 2), dtype=int) 

2098 self._labels = np.zeros((0, 2), dtype=object) 

2099 if end_indices is not None: 

2100 self.filepath = filepaths[0] 

2101 self.file_paths = filepaths 

2102 self.data_files = [None] * len(filepaths) 

2103 self.frames = end_indices[-1] 

2104 self.start_indices = [0] + list(end_indices[:-1]) 

2105 self.end_indices = end_indices 

2106 self.format = None 

2107 self.encoding = None 

2108 self.rate = rate 

2109 self.channels = channels 

2110 self.unit = unit 

2111 self.ampl_max = amax 

2112 self.ampl_min = -amax 

2113 else: 

2114 for filepath in filepaths: 

2115 try: 

2116 a = DataLoader(filepath, buffersize, backsize, verbose) 

2117 except Exception as e: 

2118 if verbose > 0: 

2119 print(e) 

2120 continue 

2121 # collect metadata: 

2122 md = a.metadata() 

2123 fmd = flatten_metadata(md, True) 

2124 add_metadata(self._metadata, fmd) 

2125 if self.filepath is None: 

2126 # first file: 

2127 self.filepath = a.filepath 

2128 self.format = a.format 

2129 self.encoding = a.encoding 

2130 self.rate = a.rate 

2131 self.channels = a.channels 

2132 self.unit = a.unit 

2133 self.ampl_max = a.ampl_max 

2134 self.ampl_min = a.ampl_min 

2135 self.start_time = get_datetime(md) 

2136 start_time = self.start_time 

2137 else: 

2138 # check channels, rate, and amplitudes: 

2139 error_str = None 

2140 if a.channels != self.channels: 

2141 error_str = f'number of channels differs: ' \ 

2142 f'{a.channels} in {a.filepath} versus ' \ 

2143 f'{self.channels} in {self.filepath}' 

2144 if a.rate != self.rate: 

2145 error_str = f'sampling rates differ: ' \ 

2146 f'{a.rate} in {a.filepath} versus ' \ 

2147 f'{self.rate} in {self.filepath}' 

2148 if a.ampl_min != self.ampl_min: 

2149 error_str = f'minimum amplitudes differ: ' \ 

2150 f'{a.ampl_min} in {a.filepath} versus ' \ 

2151 f'{self.ampl_min} in {self.filepath}' 

2152 if a.ampl_max != self.ampl_max: 

2153 error_str = f'maximum amplitudes differ: ' \ 

2154 f'{a.ampl_max} in {a.filepath} versus ' \ 

2155 f'{self.ampl_max} in {self.filepath}' 

2156 # check start time of recording: 

2157 stime = get_datetime(md) 

2158 if start_time is None or stime is None or \ 

2159 abs(start_time - stime) > timedelta(seconds=1): 

2160 error_str = f'start time does not indicate continuous recording: ' \ 

2161 f'expected {start_time} instead of ' \ 

2162 f'{stime} in {a.filepath}' 

2163 if error_str is not None: 

2164 if verbose > 0: 

2165 print(error_str) 

2166 a.close() 

2167 del a 

2168 break 

2169 # markers: 

2170 locs, labels = a.markers() 

2171 locs[:,0] += self.frames 

2172 self._locs = np.vstack((self._locs, locs)) 

2173 self._labels = np.vstack((self._labels, labels)) 

2174 # indices: 

2175 self.start_indices.append(self.frames) 

2176 self.frames += a.frames 

2177 self.end_indices.append(self.frames) 

2178 if start_time is not None: 

2179 start_time += timedelta(seconds=a.frames/a.rate) 

2180 # add file to lists: 

2181 self.file_paths.append(filepath) 

2182 if len(self.open_files) < AudioLoader.max_open_files: 

2183 self.open_files.append(a) 

2184 else: 

2185 a.close() 

2186 if len(self.open_loaders) < AudioLoader.max_open_loaders: 

2187 self.data_files.append(a) 

2188 self.open_loaders.append(a) 

2189 else: 

2190 a.close() 

2191 del a 

2192 self.data_files.append(None) 

2193 if len(self.data_files) == 0: 

2194 raise FileNotFoundError('input argument filepaths does not contain any valid data file!') 

2195 # set start time from first file: 

2196 if self.start_time is not None: 

2197 set_starttime(self._metadata, self.start_time) 

2198 # setup infrastructure: 

2199 self.file_indices = self.start_indices 

2200 self.start_indices = np.array(self.start_indices) 

2201 self.end_indices = np.array(self.end_indices) 

2202 self.shape = (self.frames, self.channels) 

2203 self.bufferframes = int(buffersize*self.rate) 

2204 self.backframes = int(backsize*self.rate) 

2205 self.init_buffer() 

2206 self.close = self._close_multiple 

2207 self.load_audio_buffer = self._load_buffer_multiple 

2208 self._load_metadata = None 

2209 self._load_markers = None 

2210 return self 

2211 

2212 def _close_multiple(self): 

2213 """Close all the data files. """ 

2214 self.open_files = [] 

2215 self.open_loaders = [] 

2216 if hasattr(self, 'data_files'): 

2217 for a in self.data_files: 

2218 if a is not None: 

2219 a.close() 

2220 self.data_files = [] 

2221 self.filepath = None 

2222 self.file_paths = [] 

2223 self.file_indices = [] 

2224 self.start_indices = [] 

2225 self.end_indices = [] 

2226 del self.data_files 

2227 del self.open_files 

2228 del self.open_loaders 

2229 del self.start_indices 

2230 del self.end_indices 

2231 

2232 def _load_buffer_multiple(self, r_offset, r_size, buffer): 

2233 """Load new data from the underlying files. 

2234 

2235 Parameters 

2236 ---------- 

2237 r_offset: int 

2238 First frame to be read from file. 

2239 r_size: int 

2240 Number of frames to be read from file. 

2241 buffer: ndarray 

2242 Buffer where to store the loaded data. 

2243 """ 

2244 offs = r_offset 

2245 size = r_size 

2246 boffs = 0 

2247 ai = np.searchsorted(self.end_indices, offs, side='right') 

2248 while size > 0: 

2249 if self.data_files[ai] is None: 

2250 a = DataLoader(self.file_paths[ai], 

2251 self.buffersize, self.backsize, 0) 

2252 self.data_files[ai] = a 

2253 self.open_loaders.append(a) 

2254 self.open_files.append(a) 

2255 if len(self.open_files) > AudioLoader.max_open_files: 

2256 a0 = self.open_files.pop(0) 

2257 a0.close() 

2258 if len(self.open_loaders) > AudioLoader.max_open_loaders: 

2259 a0 = self.open_loaders.pop(0) 

2260 self.data_files[self.data_files.index(a0)] = None 

2261 a0.close() 

2262 del a0 

2263 self.collect_counter += 1 

2264 if self.collect_counter > AudioLoader.max_open_loaders//2: 

2265 gc.collect() # takes time! 

2266 self.collect_counter = 0 

2267 else: 

2268 self.open_loaders.pop(self.open_loaders.index(self.data_files[ai])) 

2269 self.open_loaders.append(self.data_files[ai]) 

2270 ai0 = offs - self.start_indices[ai] 

2271 ai1 = offs + size 

2272 if ai1 > self.end_indices[ai]: 

2273 ai1 = self.end_indices[ai] 

2274 ai1 -= self.start_indices[ai] 

2275 n = ai1 - ai0 

2276 self.data_files[ai].load_audio_buffer(ai0, n, 

2277 buffer[boffs:boffs + n,:]) 

2278 if self.data_files[ai] in self.open_files: 

2279 self.open_files.pop(self.open_files.index(self.data_files[ai])) 

2280 self.open_files.append(self.data_files[ai]) 

2281 if len(self.open_files) > AudioLoader.max_open_files: 

2282 self.open_files[0].close() 

2283 self.open_files.pop(0) 

2284 boffs += n 

2285 offs += n 

2286 size -= n 

2287 ai += 1 

2288 

2289 

2290 def open(self, filepath, buffersize=10.0, backsize=0.0, 

2291 verbose=0, **kwargs): 

2292 """Open file with time-series data for reading. 

2293 

2294 Parameters 

2295 ---------- 

2296 filepath: str or list of str 

2297 Name of the file or list of many file names that should be 

2298 made accessible as a single array. 

2299 buffersize: float 

2300 Size of internal buffer in seconds. 

2301 backsize: float 

2302 Part of the buffer to be loaded before the requested start index 

2303 in seconds. 

2304 verbose: int 

2305 If > 0 show detailed error/warning messages. 

2306 **kwargs: dict 

2307 Further keyword arguments that are passed on to the 

2308 format-specific opening functions. 

2309 For example: 

2310 - `amax`: the amplitude range of the data. 

2311 - `unit`: the unit of the data. 

2312 

2313 Raises 

2314 ------ 

2315 ValueError: 

2316 `filepath` is an empty string. 
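
Examples
--------
A minimal sketch: format-specific keyword arguments such as
`amax` and `unit` are assumed to be forwarded to the respective
opener (the file name is hypothetical):
```
data = DataLoader('recording.wav', 10.0, 0.0, 0,
                  amax=0.1, unit='V')
```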

2317 """ 

2318 # list of implemented open functions: 

2319 data_open_funcs = ( 

2320 ('relacs', check_relacs, self.open_relacs, 1), 

2321 ('fishgrid', check_fishgrid, self.open_fishgrid, 1), 

2322 ('container', check_container, self.open_container, 1), 

2323 ('raw', check_raw, self.open_raw, 1), 

2324 ('audioio', None, self.open_audioio, 0), 

2325 ) 

2326 

2327 self.buffer = np.array([]) 

2328 self.rate = 0.0 

2329 if not filepath: 

2330 raise ValueError('input argument filepath is an empty string!') 

2331 if isinstance(filepath, (list, tuple, np.ndarray)): 

2332 self.open_multiple(filepath, buffersize, backsize, 

2333 verbose, **kwargs) 

2334 if len(self.file_paths) > 1: 

2335 return self 

2336 filepath = self.file_paths[0] 

2337 self.close() 

2338 # open data: 

2339 for name, check_file, open_file, v in data_open_funcs: 

2340 if check_file is None or check_file(filepath): 

2341 open_file(filepath, buffersize, backsize, verbose, **kwargs) 

2342 if v*verbose > 1: 

2343 if self.format is not None: 

2344 print(f' format : {self.format}') 

2345 if self.encoding is not None: 

2346 print(f' encoding : {self.encoding}') 

2347 print(f' sampling rate: {self.rate} Hz') 

2348 print(f' channels : {self.channels}') 

2349 print(f' frames : {self.frames}') 

2350 print(f' range : {self.ampl_max:g}{self.unit}') 

2351 break 

2352 return self 

2353 

2354 

2355def demo(filepath, plot=False): 

2356 print("try load_data:") 

2357 data, rate, unit, amax = load_data(filepath, verbose=2) 

2358 if plot: 

2359 fig, ax = plt.subplots() 

2360 time = np.arange(len(data))/rate 

2361 for c in range(data.shape[1]): 

2362 ax.plot(time, data[:,c]) 

2363 ax.set_xlabel('Time [s]') 

2364 ax.set_ylabel(f'[{unit}]') 

2365 if amax is not None and np.isfinite(amax): 

2366 ax.set_ylim(-amax, +amax) 

2367 plt.show() 

2368 return 

2369 

2370 print('') 

2371 print("try DataLoader:") 

2372 with DataLoader(filepath, 2.0, 1.0, 1) as data: 

2373 print('sampling rate: %g' % data.rate) 

2374 print('frames : %d %d' % (len(data), data.shape[0])) 

2375 nframes = int(1.0 * data.rate) 

2376 # forward: 

2377 for i in range(0, len(data), nframes): 

2378 print('forward %d-%d' % (i, i + nframes)) 

2379 x = data[i:i + nframes, 0] 

2380 if plot: 

2381 fig, ax = plt.subplots() 

2382 ax.plot((i + np.arange(len(x)))/data.rate, x) 

2383 ax.set_xlabel('Time [s]') 

2384 ax.set_ylabel(f'[{data.unit}]') 

2385 plt.show() 

2386 # and backwards: 

2387 for i in reversed(range(0, len(data), nframes)): 

2388 print('backward %d-%d' % (i, i + nframes)) 

2389 x = data[i:i + nframes, 0] 

2390 if plot: 

2391 fig, ax = plt.subplots() 

2392 ax.plot((i + np.arange(len(x)))/data.rate, x) 

2393 ax.set_xlabel('Time [s]') 

2394 ax.set_ylabel(f'[{data.unit}]') 

2395 plt.show() 

2396 

2397 

2398def main(*cargs): 

2399 """Call demo with command line arguments. 

2400 

2401 Parameters 

2402 ---------- 

2403 cargs: list of str 

2404 Command line arguments as provided by sys.argv[1:] 

2405 """ 

2406 import argparse 

2407 parser = argparse.ArgumentParser(description= 

2408 'Checking thunderlab.dataloader module.') 

2409 parser.add_argument('-p', dest='plot', action='store_true', 

2410 help='plot loaded data') 

2411 parser.add_argument('file', nargs=1, default='', type=str, 

2412 help='name of data file') 

2413 args = parser.parse_args(cargs) 

2414 demo(args.file[0], args.plot) 

2415 

2416 

2417if __name__ == "__main__": 

2418 main(*sys.argv[1:])