Coverage for src/thunderlab/dataloader.py: 77%

1097 statements  

coverage.py v7.9.2, created at 2025-07-18 22:10 +0000

1"""Load time-series data from files. 

2 

3``` 

4data, rate, unit, amax = load_data('data/file.wav') 

5``` 

6 

7The function `load_data()` loads the whole time-series from the file

8as a numpy array of floats. First dimension is frames, second is 

9channels. In contrast to the `audioio.load_audio()` function, the 

10 values of the data array are not restricted to the range -1 to 1. They can

11 assume any value within the range `-amax` to `+amax` in the returned

12`unit`. 

13 

14``` 

15data = DataLoader('data/file.wav', 60.0) 

16``` 

17or 

18``` 

19with DataLoader('data/file.wav', 60.0) as data: 

20``` 

21 Create a `DataLoader` object that loads chunks of data 60 seconds long

22on demand. `data` can be used like a read-only numpy array of floats. 
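For example, read a slice of the data without loading the entire file
(the index bounds are arbitrary):
```
x = data[10000:20000, 0]  # frames 10000-20000 of the first channel
```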

23 

24 

25## Supported file formats 

26 

27- python pickle files 

28- numpy .npz files 

29- matlab .mat files 

30- audio files via [`audioio`](https://github.com/bendalab/audioio) package 

31- LabView .scandat files 

32- relacs trace*.raw files (https://www.relacs.net) 

33- fishgrid traces-*.raw files (https://github.com/bendalab/fishgrid) 

34 

35 

36## Metadata 

37 

38 Many file formats can store metadata that further describe the

39 stored time-series data. We handle them as a nested dictionary of key-value

40 pairs. Load them with the `metadata()` function:

41``` 

42 md = metadata('data/file.mat')

43``` 

44 

45## Markers 

46 

47 Some file formats can also store markers that mark specific

48positions in the time series data. Load marker positions and spans (in 

49the 2-D array `locs`) and label and text strings (in the 2-D array 

50`labels`) with the `markers()` function: 

51``` 

52locs, labels = markers('data.wav') 

53``` 

54 

55 ## Additional, format-specific functions

56 

57- `extract_container_metadata()`: extract metadata from dictionary loaded from a container file. 

58- `relacs_samplerate_unit()`: retrieve sampling rate and unit from a relacs stimuli.dat file. 

59- `relacs_header()`: read key-value pairs from relacs *.dat file headers. 

60- `fishgrid_grids()`: retrieve grid sizes from a fishgrid.cfg file. 

61- `fishgrid_spacings()`: spacing between grid electrodes. 
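A minimal sketch of calling two of these helpers (the paths are
hypothetical and must point to actual relacs/fishgrid recordings):
```
rate, unit = relacs_samplerate_unit('recording/stimuli.dat')
grids = fishgrid_grids(metadata_fishgrid('recording/fishgrid.cfg'))
```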

62 

63""" 

64 

65import gc 

66import os 

67import sys 

68import glob 

69import gzip 

70import numpy as np 

71try: 

72 import matplotlib.pyplot as plt 

73except ImportError: 

74 pass 

75from pathlib import Path 

76from datetime import timedelta 

77from audioio import load_audio, AudioLoader, unflatten_metadata 

78from audioio import get_number_unit, get_number, get_int, get_bool, get_gain 

79from audioio import default_starttime_keys, default_gain_keys 

80from audioio import get_datetime, flatten_metadata, add_metadata, set_starttime 

81from audioio import metadata as metadata_audioio 

82from audioio import markers as markers_audioio 

83 

84 

85def relacs_samplerate_unit(filepath, channel=0): 

86 """Retrieve sampling rate and unit from a relacs stimuli.dat file. 

87 

88 Parameters 

89 ---------- 

90 filepath: str 

91 Path to a relacs data directory, or a file in a relacs data directory. 

92 channel: int 

93 Channel (trace) number, if `filepath` does not specify a 

94 trace-*.raw file. 

95 

96 Returns 

97 ------- 

98 samplerate: float 

99 Sampling rate in Hertz.

100 unit: str

101 Unit of the trace; empty if not found.

102 

103 Raises 

104 ------ 

105 IOError/FileNotFoundError: 

106 If the stimuli.dat file does not exist. 

107 ValueError: 

108 stimuli.dat file does not contain sampling rate. 
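Examples
--------
A minimal sketch ('recording/' is a hypothetical relacs data directory):
```
rate, unit = relacs_samplerate_unit('recording/trace-1.raw')
```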

109 """ 

110 trace = channel + 1 

111 relacs_dir = filepath 

112 # check for relacs data directory: 

113 if not os.path.isdir(filepath): 

114 relacs_dir = os.path.dirname(filepath) 

115 bn = os.path.basename(filepath).lower() 

116 i = bn.find('.raw') 

117 if len(bn) > 5 and bn[0:5] == 'trace' and i > 6: 

118 trace = int(bn[6:i]) 

119 

120 # retrieve sampling rate and unit from stimuli.dat file:

121 samplerate = None 

122 sampleinterval = None 

123 unit = "" 

124 

125 lines = [] 

126 stimuli_file = os.path.join(relacs_dir, 'stimuli.dat') 

127 if os.path.isfile(stimuli_file + '.gz'): 

128 stimuli_file += '.gz' 

129 if stimuli_file[-3:] == '.gz': 

130 with gzip.open(stimuli_file, 'rt', encoding='latin-1') as sf:

131 for line in sf: 

132 line = line.strip() 

133 if len(line) == 0 or line[0] != '#': 

134 break 

135 lines.append(line) 

136 else: 

137 with open(stimuli_file, 'r', encoding='latin-1') as sf: 

138 for line in sf: 

139 line = line.strip() 

140 if len(line) == 0 or line[0] != '#': 

141 break 

142 lines.append(line) 

143 

144 for line in lines: 

145 if "unit%d" % trace in line: 

146 unit = line.split(':')[1].strip() 

147 if "sampling rate%d" % trace in line: 

148 value = line.split(':')[1].strip() 

149 samplerate = float(value.replace('Hz','')) 

150 elif "sample interval%d" % trace in line: 

151 value = line.split(':')[1].strip() 

152 sampleinterval = float(value.replace('ms','')) 

153 

154 if samplerate is not None: 

155 return samplerate, unit 

156 if sampleinterval is not None: 

157 return 1000/sampleinterval, unit 

158 raise ValueError(f'could not retrieve sampling rate from {stimuli_file}') 

159 

160 

161def relacs_header(filepath, store_empty=False, first_only=False, 

162 lower_keys=False, flat=False, 

163 add_sections=False): 

164 """Read key-value pairs from a relacs *.dat file header. 

165 

166 Parameters 

167 ---------- 

168 filepath: str 

169 A relacs *.dat file, can be also a zipped .gz file. 

170 store_empty: bool 

171 If `False`, do not add metadata with empty values.

172 first_only: bool

173 If `True`, only store the first element of a list value.

174 lower_keys: bool 

175 Make all keys lower case. 

176 flat: bool 

177 Do not make a nested dictionary. 

178 Use this option also to read in very old relacs metadata with 

179 ragged left alignment. 

180 add_sections: bool 

181 If `True`, prepend keys with section names separated by

182 '.' to make them unique. 

183 

184 Returns 

185 ------- 

186 data: dict 

187 Nested dictionary with key-value pairs of the file header. 

188  

189 Raises 

190 ------ 

191 IOError/FileNotFoundError: 

192 If `filepath` cannot be opened. 
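Examples
--------
A minimal sketch (the path is hypothetical):
```
md = relacs_header('recording/info.dat')
for section in md:
    print(section)
```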

193 """ 

194 # read in header from file: 

195 lines = [] 

196 if os.path.isfile(filepath + '.gz'): 

197 filepath += '.gz' 

198 if filepath[-3:] == '.gz': 

199 with gzip.open(filepath, 'rt', encoding='latin-1') as sf:

200 for line in sf: 

201 line = line.strip() 

202 if len(line) == 0 or line[0] != '#': 

203 break 

204 lines.append(line) 

205 else: 

206 with open(filepath, 'r', encoding='latin-1') as sf: 

207 for line in sf: 

208 line = line.strip() 

209 if len(line) == 0 or line[0] != '#': 

210 break 

211 lines.append(line) 

212 # parse: 

213 data = {} 

214 cdatas = [data] 

215 sections = [''] 

216 ident_offs = None 

217 ident = None 

218 for line in lines: 

219 words = line.split(':') 

220 value = ':'.join(words[1:]).strip() if len(words) > 1 else '' 

221 if len(words) >= 1: 

222 key = words[0].strip('#') 

223 # get section level: 

224 level = 0 

225 if not flat or len(value) == 0: 

226 nident = len(key) - len(key.lstrip()) 

227 if ident_offs is None: 

228 ident_offs = nident 

229 elif ident is None: 

230 if nident > ident_offs: 

231 ident = nident - ident_offs 

232 level = 1 

233 else: 

234 level = (nident - ident_offs)//ident 

235 # close sections: 

236 if not flat: 

237 while len(cdatas) > level + 1: 

238 cdatas[-1][sections.pop()] = cdatas.pop() 

239 else: 

240 while len(sections) > level + 1: 

241 sections.pop() 

242 # key: 

243 key = key.strip().strip('"') 

244 if lower_keys: 

245 key = key.lower() 

246 skey = key 

247 if add_sections: 

248 key = '.'.join(sections[1:] + [key]) 

249 if len(value) == 0: 

250 # new sub-section: 

251 if flat: 

252 if store_empty: 

253 cdatas[-1][key] = None 

254 else: 

255 cdatas.append({}) 

256 sections.append(skey) 

257 else: 

258 # key-value pair: 

259 value = value.strip('"') 

260 if (len(value) > 0 and value != '-') or store_empty:

261 if len(value) > 0 and value[0] == '[' and value[-1] == ']': 

262 value = [v.strip() for v in value.lstrip('[').rstrip(']').split(',')] 

263 if first_only: 

264 value = value[0] 

265 cdatas[-1][key] = value 

266 while len(cdatas) > 1: 

267 cdatas[-1][sections.pop()] = cdatas.pop() 

268 return data 

269 

270 

271def check_relacs(filepath): 

272 """Check for valid relacs file. 

273 

274 Parameters 

275 ---------- 

276 filepath: str 

277 Path to a relacs data directory, or a file in a relacs data directory. 

278 

279 Returns 

280 ------- 

281 is_relacs: boolean 

282 `True` if `filepath` is a valid relacs directory or is a file therein. 

283 """ 

284 # relacs data directory: 

285 relacs_dir = filepath 

286 if not os.path.isdir(filepath): 

287 relacs_dir = os.path.dirname(filepath) 

288 # check for a valid relacs data directory: 

289 has_stimuli = False 

290 has_trace = False 

291 for fname in ['stimuli.dat', 'stimuli.dat.gz']: 

292 if os.path.isfile(os.path.join(relacs_dir, fname)): 

293 has_stimuli = True 

294 for fname in ['trace-1.raw', 'trace-1.raw.gz']: 

295 if os.path.isfile(os.path.join(relacs_dir, fname)): 

296 has_trace = True 

297 return has_stimuli and has_trace 

298 

299 

300def relacs_trace_files(filepath): 

301 """Expand file path for relacs data to appropriate trace*.raw file names. 

302 

303 Parameters 

304 ---------- 

305 filepath: str 

306 Path to a relacs data directory, or a file in a relacs data directory. 

307  

308 Returns 

309 ------- 

310 trace_filepaths: list of str 

311 List of relacs trace*.raw files. 

312 """ 

313 relacs_dir = filepath 

314 if not os.path.isdir(filepath): 

315 relacs_dir = os.path.dirname(filepath) 

316 trace_filepaths = [] 

317 for k in range(10000): 

318 fname = os.path.join(relacs_dir, f'trace-{k+1}.raw') 

319 if os.path.isfile(fname): 

320 trace_filepaths.append(fname) 

321 elif os.path.isfile(fname + '.gz'): 

322 trace_filepaths.append(fname + '.gz') 

323 else: 

324 break 

325 return trace_filepaths 

326 

327 

328def load_relacs(filepath, amax=1.0): 

329 """Load traces that have been recorded with relacs (https://github.com/relacs/relacs). 

330 

331 Parameters 

332 ---------- 

333 filepath: str 

334 Path to a relacs data directory, or a file in a relacs data directory. 

335 amax: float 

336 The amplitude range of the data. 

337 

338 Returns 

339 ------- 

340 data: 2-D array 

341 All data traces as a 2-D numpy array, even for single-channel data.

342 First dimension is time, second is channel. 

343 rate: float 

344 Sampling rate of the data in Hz 

345 unit: str 

346 Unit of the data 

347 amax: float 

348 Maximum amplitude of data range. 

349 

350 Raises 

351 ------ 

352 FileNotFoundError: 

353 Invalid or non-existing relacs files.

354 ValueError: 

355 - Invalid name for relacs trace-*.raw file. 

356 - Sampling rates of traces differ. 

357 - Unit of traces differ. 

358 """ 

359 trace_filepaths = relacs_trace_files(filepath) 

360 if len(trace_filepaths) == 0: 

361 raise FileNotFoundError('no relacs files found')

362 # load trace*.raw files: 

363 nchannels = len(trace_filepaths) 

364 data = None 

365 nrows = 0 

366 rate = None 

367 unit = '' 

368 for c, path in enumerate(sorted(trace_filepaths)): 

369 if path[-3:] == '.gz': 

370 with gzip.open(path, 'rb') as sf: 

371 x = np.frombuffer(sf.read(), dtype=np.float32) 

372 else: 

373 x = np.fromfile(path, np.float32) 

374 if data is None: 

375 nrows = len(x) 

376 data = np.zeros((nrows, nchannels)) 

377 n = min(len(x), nrows) 

378 data[:n,c] = x[:n] 

379 # retrieve sampling rate and unit: 

380 crate, us = relacs_samplerate_unit(path, c) 

381 if rate is None: 

382 rate = crate 

383 elif crate != rate: 

384 raise ValueError('sampling rates of traces differ') 

385 if len(unit) == 0: 

386 unit = us 

387 elif us != unit: 

388 raise ValueError('unit of traces differ') 

389 return data, rate, unit, amax 

390 

391 

392def metadata_relacs(filepath, store_empty=False, first_only=False, 

393 lower_keys=False, flat=False, add_sections=False): 

394 """ Read meta-data of a relacs data set. 

395 

396 Parameters 

397 ---------- 

398 filepath: str 

399 A relacs data directory or a file therein. 

400 store_empty: bool 

401 If `False`, do not add metadata with empty values.

402 first_only: bool

403 If `True`, only store the first element of a list value.

404 lower_keys: bool 

405 Make all keys lower case. 

406 flat: bool 

407 Do not make a nested dictionary. 

408 Use this option also to read in very old relacs metadata with 

409 ragged left alignment. 

410 add_sections: bool 

411 If `True`, prepend keys with section names separated by

412 '.' to make them unique. 

413 

414 Returns 

415 ------- 

416 data: nested dict 

417 Nested dictionary with key-value pairs of the meta data. 

418 """ 

419 relacs_dir = filepath 

420 if not os.path.isdir(filepath): 

421 relacs_dir = os.path.dirname(filepath) 

422 info_path = os.path.join(relacs_dir, 'info.dat') 

423 if not os.path.exists(info_path): 

424 return {}

425 data = relacs_header(info_path, store_empty, first_only, 

426 lower_keys, flat, add_sections) 

427 return data 

428 

429 

430def fishgrid_spacings(metadata, unit='m'): 

431 """Spacing between grid electrodes. 

432 

433 Parameters 

434 ---------- 

435 metadata: dict 

436 Fishgrid metadata obtained from `metadata_fishgrid()`. 

437 unit: str 

438 Unit in which to return the spacings. 

439 

440 Returns 

441 ------- 

442 grid_dist: list of tuple of float 

443 For each grid the distances between rows and columns in `unit`. 

444 """ 

445 grids_dist = [] 

446 for k in range(4): 

447 row_dist = get_number(metadata, unit, f'RowDistance{k+1}', default=0) 

448 col_dist = get_number(metadata, unit, f'ColumnDistance{k+1}', default=0) 

449 rows = get_int(metadata, f'Rows{k+1}', default=0) 

450 cols = get_int(metadata, f'Columns{k+1}', default=0) 

451 if get_bool(metadata, f'Used{k+1}', default=False) or \

452 (cols > 0 and rows > 0):

453 grids_dist.append((row_dist, col_dist)) 

454 return grids_dist 

455 

456 

457def fishgrid_grids(metadata): 

458 """Retrieve grid sizes from a fishgrid.cfg file. 

459 

460 Parameters 

461 ---------- 

462 metadata: dict 

463 Fishgrid metadata obtained from `metadata_fishgrid()`. 

464 

465 Returns 

466 ------- 

467 grids: list of tuple of int 

468 For each grid the number of rows and columns. 
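Examples
--------
A sketch with a hand-built metadata dictionary; the key names follow the
fishgrid.cfg conventions used above, and `audioio`'s `get_int()`/`get_bool()`
are assumed to search nested dictionaries:
```
md = {'FishGrid': {'Grid 1': {'Used1': 'true', 'Rows1': '4', 'Columns1': '8'}}}
fishgrid_grids(md)  # [(4, 8)]
```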

469 """ 

470 grids = [] 

471 for k in range(4): 

472 rows = get_int(metadata, f'Rows{k+1}', default=0) 

473 cols = get_int(metadata, f'Columns{k+1}', default=0) 

474 if get_bool(metadata, f'Used{k+1}', default=False) or \

475 (cols > 0 and rows > 0):

476 grids.append((rows, cols)) 

477 return grids 

478 

479 

480def check_fishgrid(filepath): 

481 """Check for valid fishgrid file (https://github.com/bendalab/fishgrid). 

482 

483 Parameters 

484 ---------- 

485 filepath: str 

486 Path to a fishgrid data directory or a file in a fishgrid 

487 data directory. 

488 

489 Returns 

490 ------- 

491 is_fishgrid: bool 

492 `True` if `filepath` is a valid fishgrid data directory or 

493 a file therein. 

494 """ 

495 # fishgrid data directory: 

496 fishgrid_dir = filepath 

497 if not os.path.isdir(filepath): 

498 fishgrid_dir = os.path.dirname(filepath) 

499 # check for a valid fishgrid data directory: 

500 return (os.path.isfile(os.path.join(fishgrid_dir, 'fishgrid.cfg')) and 

501 (os.path.isfile(os.path.join(fishgrid_dir, 'traces-grid1.raw')) or 

502 os.path.isfile(os.path.join(fishgrid_dir, 'traces.raw')))) 

503 

504 

505def fishgrid_trace_files(filepath): 

506 """Expand file paths for fishgrid data to appropriate traces*.raw file names. 

507 

508 Parameters 

509 ---------- 

510 filepath: str 

511 Path to a fishgrid data directory, or a file therein. 

512  

513 Returns 

514 ------- 

515 trace_filepaths: list of str 

516 List of fishgrid traces*.raw files. 

517 """ 

518 # find grids: 

519 fishgrid_dir = filepath 

520 if not os.path.isdir(fishgrid_dir): 

521 fishgrid_dir = os.path.dirname(filepath) 

522 trace_filepaths = [] 

523 for k in range(10000): 

524 file = os.path.join(fishgrid_dir, f'traces-grid{k+1}.raw') 

525 if os.path.isfile(file): 

526 trace_filepaths.append(file) 

527 else: 

528 break 

529 if len(trace_filepaths) == 0: 

530 file = os.path.join(fishgrid_dir, 'traces.raw')

531 if os.path.isfile(file): 

532 trace_filepaths.append(file) 

533 return trace_filepaths 

534 

535 

536def load_fishgrid(filepath): 

537 """Load traces that have been recorded with fishgrid (https://github.com/bendalab/fishgrid). 

538 

539 Parameters 

540 ---------- 

541 filepath: str 

542 Path to a fishgrid data directory, or a file therein. 

543 

544 Returns 

545 ------- 

546 data: 2-D array 

547 All data traces as a 2-D numpy array, even for single-channel data.

548 First dimension is time, second is channel. 

549 rate: float 

550 Sampling rate of the data in Hz. 

551 unit: str 

552 Unit of the data. 

553 amax: float 

554 Maximum amplitude of data range. 

555 

556 Raises 

557 ------ 

558 FileNotFoundError: 

559 Invalid or non-existing fishgrid files.

560 """ 

561 trace_filepaths = fishgrid_trace_files(filepath) 

562 if len(trace_filepaths) == 0: 

563 raise FileNotFoundError('no fishgrid files found')

564 md = metadata_fishgrid(filepath) 

565 grids = fishgrid_grids(md) 

566 grid_sizes = [r*c for r, c in grids] 

567 

568 # load traces-grid*.raw files: 

569 grid_channels = [] 

570 nchannels = 0 

571 for g, path in enumerate(trace_filepaths): 

572 grid_channels.append(grid_sizes[g]) 

573 nchannels += grid_sizes[g] 

574 data = None 

575 nrows = 0 

576 c = 0 

577 rate = get_number(md, 'Hz', 'AISampleRate') 

578 for path, channels in zip(trace_filepaths, grid_channels): 

579 x = np.fromfile(path, np.float32).reshape((-1, channels)) 

580 if data is None: 

581 nrows = len(x) 

582 data = np.zeros((nrows, nchannels)) 

583 n = min(len(x), nrows) 

584 data[:n,c:c+channels] = x[:n,:] 

585 c += channels 

586 amax, unit = get_number_unit(md, 'AIMaxVolt') 

587 return data, rate, unit, amax 

588 

589 

590# add fishgrid keys: 

591default_starttime_keys.append(['StartDate', 'StartTime']) 

592default_gain_keys.insert(0, 'AIMaxVolt') 

593 

594 

595def metadata_fishgrid(filepath): 

596 """ Read meta-data of a fishgrid data set. 

597 

598 Parameters 

599 ---------- 

600 filepath: str 

601 A fishgrid data directory or a file therein. 

602 

603 Returns 

604 ------- 

605 data: nested dict 

606 Nested dictionary with key-value pairs of the meta data. 

607 """ 

608 fishgrid_dir = filepath 

609 if not os.path.isdir(fishgrid_dir): 

610 fishgrid_dir = os.path.dirname(filepath) 

611 path = os.path.join(fishgrid_dir, 'fishgrid.cfg') 

612 # read in header from file: 

613 lines = [] 

614 if os.path.isfile(path + '.gz'): 

615 path += '.gz' 

616 if not os.path.exists(path): 

617 return {} 

618 if path[-3:] == '.gz': 

619 with gzip.open(path, 'rt', encoding='latin-1') as sf:

620 for line in sf: 

621 lines.append(line) 

622 else: 

623 with open(path, 'r', encoding='latin-1') as sf: 

624 for line in sf: 

625 lines.append(line) 

626 # parse: 

627 data = {} 

628 cdatas = [data] 

629 ident_offs = None 

630 ident = None 

631 old_style = False 

632 grid_n = False 

633 for line in lines: 

634 if len(line.strip()) == 0: 

635 continue 

636 if line[0] == '*': 

637 key = line[1:].strip() 

638 data[key] = {} 

639 cdatas = [data, data[key]] 

640 elif '----' in line: 

641 old_style = True 

642 key = line.strip().strip(' -').replace('&', '') 

643 if key.upper() == 'SETUP': 

644 key = 'Grid 1' 

645 grid_n = False 

646 if key[:4].lower() == 'grid': 

647 grid_n = key[5] 

648 cdatas = cdatas[:2] 

649 cdatas[1][key] = {} 

650 cdatas.append(cdatas[1][key]) 

651 else: 

652 words = line.split(':') 

653 key = words[0].strip().strip('"') 

654 value = None 

655 if len(words) > 1 and (len(words[1].strip()) > 0 or old_style): 

656 value = ':'.join(words[1:]).strip().strip('"') 

657 if old_style: 

658 if value is None: 

659 cdatas = cdatas[:3] 

660 cdatas[2][key] = {} 

661 cdatas.append(cdatas[2][key]) 

662 else: 

663 if grid_n and key[-1] != grid_n: 

664 key = key + grid_n 

665 cdatas[-1][key] = value 

666 else: 

667 # get section level: 

668 level = 0 

669 nident = len(line) - len(line.lstrip()) 

670 if ident_offs is None: 

671 ident_offs = nident 

672 elif ident is None: 

673 if nident > ident_offs: 

674 ident = nident - ident_offs 

675 level = 1 

676 else: 

677 level = (nident - ident_offs)//ident 

678 # close sections: 

679 cdatas = cdatas[:2 + level] 

680 if value is None: 

681 # new section: 

682 cdatas[-1][key] = {} 

683 cdatas.append(cdatas[-1][key]) 

684 else: 

685 # key-value pair: 

686 cdatas[-1][key] = value.replace(r'\n', '\n') 

687 # remove unused grids: 

688 fgm = data.get('FishGrid', {}) 

689 for i in range(4): 

690 gs = f'Grid {i+1}' 

691 if gs in fgm: 

692 gm = fgm[gs] 

693 us = f'Used{i+1}' 

694 if us in gm and gm[us].upper() == 'FALSE': 

695 del fgm[gs] 

696 return data 

697 

698 

699def markers_fishgrid(filepath): 

700 """ Read markers of a fishgrid data set. 

701 

702 Parameters 

703 ---------- 

704 filepath: str 

705 A fishgrid data directory or a file therein. 

706 

707 Returns 

708 ------- 

709 locs: 2-D array of ints 

710 Marker positions (first column) and spans (second column) 

711 for each marker (rows). 

712 labels: 2-D array of string objects 

713 Labels (first column) and texts (second column) 

714 for each marker (rows). 

715 """ 

716 def add_marker(): 

717 if 'index1' in marker: 

718 index1 = int(marker['index1'])//nchannels 

719 else: 

720 index1 = int(marker['index'])//nchannels 

721 span1 = int(marker.get('span1', 0))//nchannels 

722 locs.append([index1, span1]) 

723 ls = marker.get('label', 'M') 

724 cs = marker.get('comment', '') 

725 labels.append([ls, cs]) 

726 

727 fishgrid_dir = filepath 

728 if not os.path.isdir(fishgrid_dir): 

729 fishgrid_dir = os.path.dirname(filepath) 

730 path = os.path.join(fishgrid_dir, 'timestamps.dat') 

731 if not os.path.isfile(path): 

732 return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object) 

733 # get number of channels: 

734 md = metadata_fishgrid(path.replace('timestamps.dat', 'fishgrid.cfg')) 

735 grids = fishgrid_grids(md) 

736 nchannels = np.prod(grids[0]) 

737 # read timestamps: 

738 locs = [] 

739 labels = [] 

740 marker = {} 

741 with open(path, 'r') as sf: 

742 for line in sf: 

743 if len(line.strip()) == 0: 

744 add_marker() 

745 marker = {} 

746 else: 

747 words = line.split(':') 

748 if len(words) > 1: 

749 v = words[1].strip() 

750 v = v.strip('"') 

751 marker[words[0].strip().lower()] = v 

752 if len(marker) > 0: 

753 add_marker() 

754 if len(locs) > 2: 

755 return np.array(locs[1:-1]), np.array(labels[1:-1]) 

756 else: 

757 return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object) 

758 

759 

760def check_container(filepath): 

761 """Check if file is a generic container file. 

762 

763 Supported file formats are: 

764 

765 - python pickle files (.pkl) 

766 - numpy files (.npz) 

767 - matlab files (.mat) 

768 

769 Parameters 

770 ---------- 

771 filepath: str 

772 Path of the file to check. 

773  

774 Returns 

775 ------- 

776 is_container: bool 

777 `True`, if `filepath` is a supported container format. 

778 """ 

779 ext = os.path.splitext(filepath)[1] 

780 return ext.lower() in ('.pkl', '.npz', '.mat') 

781 

782 

783def extract_container_data(data_dict, datakey=None, 

784 samplekey=['rate', 'Fs', 'fs'], 

785 timekey=['time'], amplkey=['amax'], unitkey='unit', 

786 amax=1.0, unit='a.u.'): 

787 """Extract data from dictionary loaded from a container file. 

788 

789 Parameters 

790 ---------- 

791 data_dict: dict 

792 Dictionary of the data items contained in the container. 

793 datakey: None, str, or list of str 

794 Name of the variable holding the data. If `None` take the 

795 variable that is an 2D array and has the largest number of 

796 elements. 

797 samplekey: str or list of str 

798 Name of the variable holding the sampling rate. 

799 timekey: str or list of str 

800 Name of the variable holding sampling times. 

801 If no sampling rate is available, the sampling rate is retrieved 

802 from the sampling times. 

803 amplkey: str or list of str 

804 Name of the variable holding the amplitude range of the data. 

805 unitkey: str 

806 Name of the variable holding the unit of the data. 

807 amax: None or float 

808 If specified and no amplitude range has been found in `data_dict`, 

809 then this is the amplitude range of the data. 

810 unit: None or str 

811 If specified and no unit has been found in `data_dict`, 

812 then return this as the unit of the data. 

813 

814 Returns 

815 ------- 

816 data: 2-D array of floats 

817 All data traces as a 2-D numpy array, even for single-channel data.

818 First dimension is time, second is channel. 

819 rate: float 

820 Sampling rate of the data in Hz. 

821 unit: str 

822 Unit of the data. 

823 amax: float 

824 Maximum amplitude of data range in `unit`. 

825 

826 Raises 

827 ------ 

828 ValueError: 

829 Invalid key requested. 
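Examples
--------
A minimal sketch with a hand-built container dictionary:
```
d = {'rate': 20000.0, 'data': np.zeros((1000, 2)), 'unit': 'mV'}
data, rate, unit, amax = extract_container_data(d)
# data.shape == (1000, 2), rate == 20000.0, unit == 'mV', amax == 1.0
```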

830 """ 

831 # extract format data: 

832 if not isinstance(samplekey, (list, tuple, np.ndarray)): 

833 samplekey = (samplekey,) 

834 if not isinstance(timekey, (list, tuple, np.ndarray)): 

835 timekey = (timekey,) 

836 if not isinstance(amplkey, (list, tuple, np.ndarray)): 

837 amplkey = (amplkey,) 

838 rate = 0.0 

839 for skey in samplekey: 

840 if skey in data_dict: 

841 rate = float(data_dict[skey]) 

842 break 

843 if rate == 0.0: 

844 for tkey in timekey: 

845 if tkey in data_dict: 

846 rate = 1.0/(data_dict[tkey][1] - data_dict[tkey][0]) 

847 break 

848 if rate == 0.0: 

849 raise ValueError(f"invalid keys {', '.join(samplekey)} and {', '.join(timekey)} for requesting sampling rate or sampling times") 

850 for akey in amplkey: 

851 if akey in data_dict: 

852 amax = float(data_dict[akey]) 

853 break 

854 if unitkey in data_dict: 

855 unit = data_dict[unitkey] 

856 # get data array: 

857 raw_data = np.array([]) 

858 if datakey: 

859 # try data keys: 

860 if not isinstance(datakey, (list, tuple, np.ndarray)): 

861 datakey = (datakey,) 

862 for dkey in datakey: 

863 if dkey in data_dict: 

864 raw_data = data_dict[dkey] 

865 break 

866 if len(raw_data) == 0: 

867 raise ValueError(f"invalid key(s) {', '.join(datakey)} for requesting data") 

868 else: 

869 # find largest 2D array: 

870 for d in data_dict: 

871 if hasattr(data_dict[d], 'shape'): 

872 if 1 <= len(data_dict[d].shape) <= 2 and \ 

873 np.max(data_dict[d].shape) > np.max(raw_data.shape): 

874 raw_data = data_dict[d] 

875 if len(raw_data) == 0: 

876 raise ValueError('no data found') 

877 # make 2D: 

878 if len(raw_data.shape) == 1: 

879 raw_data = raw_data.reshape(-1, 1) 

880 # transpose if necessary: 

881 if np.argmax(raw_data.shape) > 0: 

882 raw_data = raw_data.T 

883 # recode: 

884 if raw_data.dtype == np.dtype('int16'): 

885 data = raw_data.astype('float32') 

886 data *= amax/2**15 

887 elif raw_data.dtype == np.dtype('int32'): 

888 data = raw_data.astype(float) 

889 data *= amax/2**31 

890 elif raw_data.dtype == np.dtype('int64'): 

891 data = raw_data.astype(float) 

892 data *= amax/2**63 

893 else: 

894 data = raw_data 

895 return data, rate, unit, amax 

896 

897 

898def load_container(filepath, datakey=None, 

899 samplekey=['rate', 'Fs', 'fs'], 

900 timekey=['time'], amplkey=['amax'], unitkey='unit', 

901 amax=1.0, unit='a.u.'): 

902 """Load data from a generic container file. 

903 

904 Supported file formats are: 

905 

906 - python pickle files (.pkl) 

907 - numpy files (.npz) 

908 - matlab files (.mat) 

909 

910 Parameters 

911 ---------- 

912 filepath: str 

913 Path of the file to load. 

914 datakey: None, str, or list of str 

915 Name of the variable holding the data. If `None`, take the

916 variable that is a 1-D or 2-D array with the largest number of

917 elements.

918 samplekey: str or list of str 

919 Name of the variable holding the sampling rate. 

920 timekey: str or list of str 

921 Name of the variable holding sampling times. 

922 If no sampling rate is available, the sampling rate is retrieved 

923 from the sampling times. 

924 amplkey: str or list of str

925 Name of the variable holding the amplitude range of the data. 

926 unitkey: str 

927 Name of the variable holding the unit of the data. 

928 If `unitkey` is not a valid key, then return `unitkey` as the `unit`. 

929 amax: None or float 

930 If specified and no amplitude range has been found in the data 

931 container, then this is the amplitude range of the data. 

932 unit: None or str 

933 If specified and no unit has been found in the data container, 

934 then return this as the unit of the data. 

935 

936 Returns 

937 ------- 

938 data: 2-D array of floats 

939 All data traces as a 2-D numpy array, even for single-channel data.

940 First dimension is time, second is channel. 

941 rate: float 

942 Sampling rate of the data in Hz. 

943 unit: str 

944 Unit of the data. 

945 amax: float 

946 Maximum amplitude of data range. 

947 

948 Raises 

949 ------ 

950 ValueError: 

951 Invalid key requested. 
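Examples
--------
A minimal sketch (the file name and the non-default sampling-rate key
are hypothetical):
```
data, rate, unit, amax = load_container('data.npz', samplekey='sfreq')
```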

952 """ 

953 # load data: 

954 data_dict = {} 

955 ext = os.path.splitext(filepath)[1] 

956 if ext == '.pkl': 

957 import pickle 

958 with open(filepath, 'rb') as f: 

959 data_dict = pickle.load(f) 

960 elif ext == '.npz': 

961 data_dict = np.load(filepath) 

962 elif ext == '.mat': 

963 from scipy.io import loadmat 

964 data_dict = loadmat(filepath, squeeze_me=True) 

965 return extract_container_data(data_dict, datakey, samplekey, 

966 timekey, amplkey, unitkey, amax, unit) 

967 

968 

969def extract_container_metadata(data_dict, metadatakey=['metadata', 'info']): 

970 """ Extract metadata from dictionary loaded from a container file. 

971 

972 Parameters 

973 ---------- 

974 data_dict: dict 

975 Dictionary of the data items contained in the container. 

976 metadatakey: str or list of str 

977 Name of the variable holding the metadata. 

978 

979 Returns 

980 ------- 

981 metadata: nested dict 

982 Nested dictionary with key-value pairs of the meta data. 
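Examples
--------
A sketch of the flattened-key convention handled below (the keys are
hypothetical):
```
d = {'metadata__Recording__Gain': '20mV'}
extract_container_metadata(d)  # {'Recording': {'Gain': '20mV'}}
```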

983 """ 

984 if not isinstance(metadatakey, (list, tuple, np.ndarray)): 

985 metadatakey = (metadatakey,) 

986 # get single metadata dictionary: 

987 for mkey in metadatakey: 

988 if mkey in data_dict: 

989 return data_dict[mkey] 

990 # collect all keys starting with metadatakey: 

991 metadata = {} 

992 for mkey in metadatakey: 

993 mkey += '__' 

994 for dkey in data_dict: 

995 if dkey[:len(mkey)] == mkey: 

996 v = data_dict[dkey] 

997 if hasattr(v, 'size') and v.ndim == 0: 

998 v = v.item() 

999 metadata[dkey[len(mkey):]] = v 

1000 if len(metadata) > 0: 

1001 return unflatten_metadata(metadata, sep='__') 

1002 return metadata 

1003 

1004 

1005def metadata_container(filepath, metadatakey=['metadata', 'info']): 

1006 """ Read meta-data of a container file. 

1007 

1008 Parameters 

1009 ---------- 

1010 filepath: str 

1011 A container file. 

1012 metadatakey: str or list of str 

1013 Name of the variable holding the metadata. 

1014 

1015 Returns 

1016 ------- 

1017 metadata: nested dict 

1018 Nested dictionary with key-value pairs of the meta data. 

1019 """ 

1020 data_dict = {} 

1021 ext = os.path.splitext(filepath)[1] 

1022 if ext == '.pkl': 

1023 import pickle 

1024 with open(filepath, 'rb') as f: 

1025 data_dict = pickle.load(f) 

1026 elif ext == '.npz': 

1027 data_dict = np.load(filepath) 

1028 elif ext == '.mat': 

1029 from scipy.io import loadmat 

1030 data_dict = loadmat(filepath, squeeze_me=True) 

1031 return extract_container_metadata(data_dict, metadatakey) 

1032 

1033 

1034def extract_container_markers(data_dict, poskey=['positions'], 

1035 spanskey=['spans'], labelskey=['labels'], 

1036 descrkey=['descriptions']): 

1037 """ Extract markers from dictionary loaded from a container file. 

1038 

1039 Parameters 

1040 ---------- 

1041 data_dict: dict 

1042 Dictionary of the data items contained in the container. 

1043 poskey: str or list of str 

1044 Name of the variable holding positions of markers. 

1045 spanskey: str or list of str 

1046 Name of the variable holding spans of markers. 

1047 labelskey: str or list of str 

1048 Name of the variable holding labels of markers. 

1049 descrkey: str or list of str 

1050 Name of the variable holding descriptions of markers. 

1051 

1052 Returns 

1053 ------- 

1054 locs: 2-D array of ints 

1055 Marker positions (first column) and spans (second column) 

1056 for each marker (rows). 

1057 labels: 2-D array of string objects 

1058 Labels (first column) and texts (second column) 

1059 for each marker (rows). 

1060 """ 

1061 if not isinstance(poskey, (list, tuple, np.ndarray)): 

1062 poskey = (poskey,) 

1063 if not isinstance(spanskey, (list, tuple, np.ndarray)): 

1064 spanskey = (spanskey,) 

1065 if not isinstance(labelskey, (list, tuple, np.ndarray)): 

1066 labelskey = (labelskey,) 

1067 if not isinstance(descrkey, (list, tuple, np.ndarray)): 

1068 descrkey = (descrkey,) 

1069 locs = np.zeros((0, 2), dtype=int) 

1070 for pkey in poskey: 

1071 if pkey in data_dict: 

1072 locs = np.zeros((len(data_dict[pkey]), 2), dtype=int) 

1073 locs[:,0] = data_dict[pkey] 

1074 break 

1075 for skey in spanskey: 

1076 if skey in data_dict: 

1077 locs[:,1] = data_dict[skey] 

1078 break 

1079 labels = np.zeros((0, 2), dtype=object) 

1080 for lkey in labelskey: 

1081 if lkey in data_dict: 

1082 labels = np.zeros((len(data_dict[lkey]), 2), dtype=object) 

1083 labels[:,0] = data_dict[lkey] 

1084 break 

1085 for dkey in descrkey: 

1086 if dkey in data_dict: 

1087 labels[:,1] = data_dict[dkey] 

1088 break 

1089 return locs, labels 

1090 

1091 

1092def markers_container(filepath, poskey=['positions'], 

1093 spanskey=['spans'], labelskey=['labels'], 

1094 descrkey=['descriptions']): 

1095 """ Read markers of a container file. 

1096 

1097 Parameters 

1098 ---------- 

1099 filepath: str 

1100 A container file. 

1101 poskey: str or list of str 

1102 Name of the variable holding positions of markers. 

1103 spanskey: str or list of str 

1104 Name of the variable holding spans of markers. 

1105 labelskey: str or list of str 

1106 Name of the variable holding labels of markers. 

1107 descrkey: str or list of str 

1108 Name of the variable holding descriptions of markers. 

1109 

1110 Returns 

1111 ------- 

1112 locs: 2-D array of ints 

1113 Marker positions (first column) and spans (second column) 

1114 for each marker (rows). 

1115 labels: 2-D array of string objects 

1116 Labels (first column) and texts (second column) 

1117 for each marker (rows). 

1118 """ 

1119 data_dict = {} 

1120 ext = os.path.splitext(filepath)[1] 

1121 if ext == '.pkl': 

1122 import pickle 

1123 with open(filepath, 'rb') as f: 

1124 data_dict = pickle.load(f) 

1125 elif ext == '.npz': 

1126 data_dict = np.load(filepath) 

1127 elif ext == '.mat': 

1128 from scipy.io import loadmat 

1129 data_dict = loadmat(filepath, squeeze_me=True) 

1130 return extract_container_markers(data_dict, poskey, spanskey, 

1131 labelskey, descrkey) 

1132 

1133 

1134def check_raw(filepath): 

1135 """Check if file is a raw file. 

1136 

1137 The following extensions are interpreted as raw files: 

1138 

1139 - raw files (*.raw) 

1140 - LabView scandata (*.scandat)
- matlab .mat files (*.mat)

1141 

1142 Parameters 

1143 ---------- 

1144 filepath: str 

1145 Path of the file to check. 

1146  

1147 Returns 

1148 ------- 

1149 is_raw: bool 

1150 `True`, if `filepath` is a raw format. 

1151 """ 

1152 ext = os.path.splitext(filepath)[1] 

1153 return ext.lower() in ('.raw', '.scandat', '.mat') 

1154 

1155 

1156def load_raw(filepath, rate=44000, channels=1, dtype=np.float32, 

1157 amax=1.0, unit='a.u.'): 

1158 """Load data from a raw file. 

1159 

1160 Raw files just contain the data and absolutely no metadata, not 

1161 even the sampling rate, number of channels, etc.

1162 Supported file formats are: 

1163 

1164 - raw files (*.raw) 

1165 - LabView scandata (*.scandat) 

1166 

1167 Parameters 

1168 ---------- 

1169 filepath: str 

1170 Path of the file to load. 

1171 rate: float 

1172 Sampling rate of the data in Hertz. 

1173 channels: int 

1174 Number of channels multiplexed in the data. 

1175 dtype: str or numpy.dtype 

1176 The data type stored in the file. 

1177 amax: float 

1178 The amplitude range of the data. 

1179 unit: str 

1180 The unit of the data. 

1181 

1182 Returns 

1183 ------- 

1184 data: 2-D array of floats 

1185 All data traces as a 2-D numpy array, even for single-channel data.

1186 First dimension is time, second is channel. 

1187 rate: float 

1188 Sampling rate of the data in Hz. 

1189 unit: str 

1190 Unit of the data. 

1191 amax: float 

1192 Maximum amplitude of data range. 
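Examples
--------
A minimal sketch; the file name is hypothetical, and rate, channels,
and dtype must be known beforehand:
```
data, rate, unit, amax = load_raw('recording.raw', rate=20000,
                                  channels=4, dtype=np.int16,
                                  amax=10.0, unit='mV')
```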

1193 

1194 """ 

1195 raw_data = np.fromfile(filepath, dtype=dtype).reshape(-1, channels) 

1196 # recode: 

1197 if dtype == np.dtype('int16'): 

1198 data = raw_data.astype('float32') 

1199 data *= amax/2**15 

1200 elif dtype == np.dtype('int32'): 

1201 data = raw_data.astype(float) 

1202 data *= amax/2**31 

1203 elif dtype == np.dtype('int64'): 

1204 data = raw_data.astype(float) 

1205 data *= amax/2**63 

1206 else: 

1207 data = raw_data 

1208 return data, rate, unit, amax 

1209 

1210 

1211def load_audioio(filepath, verbose=0, gainkey=default_gain_keys, sep='.', 

1212 amax=1.0, unit='a.u.'): 

1213 """Load data from an audio file. 

1214 

1215 See the 

1216 [`load_audio()`](https://bendalab.github.io/audioio/api/audioloader.html#audioio.audioloader.load_audio) 

1217 function of the [`audioio`](https://github.com/bendalab/audioio) 

1218 package for more infos. 

1219 

1220 Parameters 

1221 ---------- 

1222 filepath: str 

1223 Path of the file to load. 

1224 verbose: int 

1225 If > 0 show detailed error/warning messages. 

1226 gainkey: str or list of str 

1227 Key in the file's metadata that holds some gain information. 

1228 If found, the data will be multiplied with the gain, 

1229 and if available, the corresponding unit is returned. 

1230 See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details. 

1231 sep: str 

1232 String that separates section names in `gainkey`. 

1233 amax: float 

1234 If specified and no gain has been found in the metadata, 

1235 then use this as the amplitude range. 

1236 unit: str 

1237 If specified and no gain has been found in the metadata, 

1238 then return this as the unit of the data. 

1239 

1240 Returns 

1241 ------- 

1242 data: 2-D array of floats 

1243 All data traces as a 2-D numpy array, even for single-channel data.

1244 First dimension is time, second is channel. 

1245 rate: float 

1246 Sampling rate of the data in Hz. 

1247 unit: str 

1248 Unit of the data if found in the metadata (see `gainkey`), 

1249 otherwise `unit`. 

1250 amax: float 

1251 Maximum amplitude of data range. 
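Examples
--------
A minimal sketch (the file name and the gain key are hypothetical):
```
data, rate, unit, amax = load_audioio('recording.wav', gainkey='Gain')
```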

1252 """ 

1253 # get gain: 

1254 md = metadata_audioio(filepath) 

1255 amax, unit = get_gain(md, gainkey, sep, amax, unit) 

1256 # load data: 

1257 data, rate = load_audio(filepath, verbose) 

1258 if amax != 1.0: 

1259 data *= amax 

1260 return data, rate, unit, amax 

1261 

1262 

1263data_loader_funcs = ( 

1264 ('relacs', check_relacs, load_relacs, metadata_relacs, None), 

1265 ('fishgrid', check_fishgrid, load_fishgrid, metadata_fishgrid, markers_fishgrid), 

1266 ('container', check_container, load_container, metadata_container, markers_container), 

1267 ('raw', check_raw, load_raw, None, None), 

1268 ('audioio', None, load_audioio, metadata_audioio, markers_audioio), 

1269 ) 

1270"""List of implemented load functions. 

1271 

1272 Each element of the list is a tuple with the data format's name and its

1273 check, load, metadata, and markers functions.

1274 

1275""" 

1276 

1277 

1278def load_data(filepath, verbose=0, **kwargs): 

1279 """Load time-series data from a file. 

1280 

1281 Parameters 

1282 ---------- 

1283 filepath: str 

1284 Path and name of the file to load. 

1285 verbose: int 

1286 If > 0 show detailed error/warning messages. 

1287 **kwargs: dict 

1288 Further keyword arguments that are passed on to the  

1289 format specific loading functions. 

1290 For example: 

1291 - `amax`: the amplitude range of the data. 

1292 - `unit`: the unit of the data.

1293 

1294 Returns 

1295 ------- 

1296 data: 2-D array 

1297 All data traces as a 2-D numpy array, even for single-channel data.

1298 First dimension is time, second is channel. 

1299 rate: float 

1300 Sampling rate of the data in Hz. 

1301 unit: str 

1302 Unit of the data. 

1303 amax: float 

1304 Maximum amplitude of data range. 

1305 

1306 Raises 

1307 ------ 

1308 ValueError: 

1309 `filepath` is empty string. 
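Examples
--------
Keyword arguments are passed through to the format-specific loader;
a sketch for a raw file (all parameters are hypothetical):
```
data, rate, unit, amax = load_data('recording.raw', verbose=1,
                                   rate=20000, channels=4, dtype=np.int16)
```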

1310 """ 

1311 if len(filepath) == 0: 

1312 raise ValueError('input argument filepath is empty string.') 

1313 # load data: 

1314 for name, check_file, load_file, _, _ in data_loader_funcs: 

1315 if check_file is None or check_file(filepath): 

1316 data, rate, unit, amax = load_file(filepath, **kwargs) 

1317 if verbose > 0: 

1318 print(f'loaded {name} data from file "{filepath}"') 

1319 if verbose > 1: 

1320 print(f' sampling rate: {rate:g} Hz') 

1321 print(f' channels : {data.shape[1]}') 

1322 print(f' frames : {len(data)}') 

1323 print(f' range : {amax:g}{unit}') 

1324 return data, rate, unit, amax 

1325 return np.zeros((0, 1)), 0.0, '', 1.0 

1326 

1327 

1328def metadata(filepath, **kwargs): 

1329 """ Read meta-data from a data file. 

1330 

1331 Parameters 

1332 ---------- 

1333 filepath: str 

1334 The full path and name of the file to load. For some file 

1335 formats several files can be provided in a list. 

1336 **kwargs: dict 

1337 Further keyword arguments that are passed on to the  

1338 format specific loading functions. 

1339 

1340 Returns 

1341 ------- 

1342 meta_data: nested dict 

1343 Meta data contained in the file. Keys of the nested 

1344 dictionaries are always strings. If the corresponding 

1345 values are dictionaries, then the key is the section name 

1346 of the metadata contained in the dictionary. All other 

1347 types of values are values for the respective key. These are

1348 usually strings or lists of strings, but other simple types

1349 like ints and floats are also allowed.

1350 

1351 Raises 

1352 ------ 

1353 ValueError: 

1354 `filepath` is empty string. 
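Examples
--------
A minimal sketch printing the top-level metadata sections of a file:
```
md = metadata('data/file.mat')
for key in md:
    print(key)
```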

1355 """ 

1356 if len(filepath) == 0: 

1357 raise ValueError('input argument filepath is empty string.') 

1358 # load metadata: 

1359 for _, check_file, _, metadata_file, _ in data_loader_funcs: 

1360 if check_file is None or check_file(filepath): 

1361 if metadata_file is not None: 

1362 return metadata_file(filepath, **kwargs) 

1363 return {} 

1364 

1365 

1366def markers(filepath): 

1367 """ Read markers of a data file. 

1368 

1369 Parameters 

1370 ---------- 

1371 filepath: str or file handle 

1372 The data file. 

1373 

1374 Returns 

1375 ------- 

1376 locs: 2-D array of ints 

1377 Marker positions (first column) and spans (second column) 

1378 for each marker (rows). 

1379 labels: 2-D array of string objects 

1380 Labels (first column) and texts (second column) 

1381 for each marker (rows). 

1382 

1383 Raises 

1384 ------ 

1385 ValueError: 

1386 `filepath` is empty string. 
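Examples
--------
A minimal sketch printing all markers of a file:
```
locs, labels = markers('data/file.wav')
for (pos, span), (label, text) in zip(locs, labels):
    print(pos, span, label, text)
```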

1387 """ 

1388 if len(filepath) == 0: 

1389 raise ValueError('input argument filepath is empty string.') 

1390 # load markers: 

1391 for _, check_file, _, _, markers_file in data_loader_funcs: 

1392 if check_file is None or check_file(filepath): 

1393 if markers_file is not None: 

1394 return markers_file(filepath) 

1395 return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object) 

1396 

1397 

1398class DataLoader(AudioLoader): 

1399 """Buffered reading of time-series data for random access to the data in the file.

1400  

1401 This allows for reading very large data files that do not fit into 

1402 memory. A `DataLoader` instance can be used like a huge 

1403 read-only numpy array, i.e. 

1404 ``` 

1405 data = DataLoader('path/to/data/file.dat') 

1406 x = data[10000:20000,0] 

1407 ``` 

1408 The first index specifies the frame, the second one the channel. 

1409 

1410 `DataLoader` first determines the format of the data file and then 

1411 opens the file (first line). It then reads data from the file as 

1412 necessary for the requested data (second line). 

1413 

1414 Supported file formats are 

1415 

1416 - audio files via `audioio` package 

1417 - python pickle files 

1418 - numpy .npz files 

1419 - matlab .mat files 

1420 - relacs trace*.raw files (www.relacs.net) 

1421 - fishgrid traces-*.raw files 

1422 

1423 Reading sequentially through the file is always possible. If 

1424 previous data are requested, then the file is read from the 

1425 beginning. This might slow down access to previous data 

1426 considerably. Use the `backsize` argument to the open functions to 

1427 make sure some data are loaded before the requested frame. Then a 

1428 subsequent access to the data within `backsize` seconds before that 

1429 frame can still be handled without the need to reread the file 

1430 from the beginning. 

1431 

1432 Usage: 

1433 ------ 

1434 ``` 

1435 import thunderlab.dataloader as dl 

1436 with dl.DataLoader(filepath, 60.0, 10.0) as data: 

1437 # do something with the content of the file: 

1438 x = data[0:10000,0] 

1439 y = data[10000:20000,0] 

1440 z = x + y 

1441 ``` 

1442  

1443 Normal open and close: 

1444 ``` 

1445 data = dl.DataLoader(filepath, 60.0) 

1446 x = data[:,:] # read the whole file 

1447 data.close() 

1448 ```  

1449 that is the same as: 

1450 ``` 

1451 data = dl.DataLoader() 

1452 data.open(filepath, 60.0) 

1453 ``` 

1454  

1455 Parameters 

1456 ---------- 

1457 filepath: str 

1458 Name of the file. 

1459 buffersize: float 

1460 Size of internal buffer in seconds. 

1461 backsize: float 

1462 Part of the buffer to be loaded before the requested start index in seconds. 

1463 verbose: int 

1464 If larger than zero show detailed error/warning messages. 

1465 meta_kwargs: dict 

1466 Keyword arguments that are passed on to the _load_metadata() function. 

1467 

1468 Attributes 

1469 ---------- 

1470 rate: float 

1471 The sampling rate of the data in Hertz. 

1472 channels: int 

1473 The number of channels that are read in. 

1474 frames: int 

1475 The number of frames in the file. 

1476 format: str or None 

1477 Format of the audio file. 

1478 encoding: str or None 

1479 Encoding/subtype of the audio file. 

1480 shape: tuple 

1481 Number of frames and channels of the data. 

1482 ndim: int 

1483 Number of dimensions: always 2 (frames and channels). 

1484 unit: str 

1485 Unit of the data. 

1486 ampl_min: float 

1487 Minimum amplitude the file format supports. 

1488 ampl_max: float 

1489 Maximum amplitude the file format supports. 

1490 

1491 Methods 

1492 ------- 

1493 

1494 - `len()`: the number of frames 

1495 - `open()`: open a data file. 

1496 - `open_*()`: open a data file of a specific format. 

1497 - `close()`: close the file. 

1498 - `metadata()`: metadata of the file. 

1499 - `markers()`: markers of the file. 

1500 - `set_unwrap()`: Set parameters for unwrapping clipped data. 

1501 

1502 """ 

1503 

1504 def __init__(self, filepath=None, buffersize=10.0, backsize=0.0, 

1505 verbose=0, **meta_kwargs): 

1506 super().__init__(None, buffersize, backsize, 

1507 verbose, **meta_kwargs) 

1508 if filepath is not None: 

1509 self.open(filepath, buffersize, backsize, verbose, **meta_kwargs) 

1510 

1511 def __getitem__(self, key): 

1512 return super().__getitem__(key)

1513 

1514 def __next__(self): 

1515 return super().__next__()

1516 

1517 

1518 # relacs interface:  

1519 def open_relacs(self, filepath, buffersize=10.0, backsize=0.0, 

1520 verbose=0, amax=1.0): 

1521 """Open relacs data files (www.relacs.net) for reading. 

1522 

1523 Parameters 

1524 ---------- 

1525 filepath: str 

1526 Path to a relacs data directory or a file therein. 

1527 buffersize: float 

1528 Size of internal buffer in seconds. 

1529 backsize: float 

1530 Part of the buffer to be loaded before the requested start index in seconds. 

1531 verbose: int 

1532 If > 0 show detailed error/warning messages. 

1533 amax: float 

1534 The amplitude range of the data. 

1535 

1536 Raises 

1537 ------ 

1538 FileNotFoundError: 

1539 Invalid or non-existing relacs files.

1540 ValueError: 

1541 .gz files not supported. 

1542 """ 

1543 self.verbose = verbose 

1544 

1545 # open trace files: 

1546 self.trace_filepaths = relacs_trace_files(filepath) 

1547 if len(self.trace_filepaths) == 0: 

1548 raise FileNotFoundError('no relacs files found')

1549 self.sf = [] 

1550 self.frames = None 

1551 self.rate = None 

1552 self.unit = '' 

1553 self.filepath = filepath 

1554 self.file_paths = [self.filepath] 

1555 self.file_indices = [0] 

1556 for path in self.trace_filepaths: 

1557 if path[-3:] == '.gz': 

1558 raise ValueError('.gz files not supported') 

1559 sf = open(path, 'rb') 

1560 self.sf.append(sf) 

1561 if verbose > 0: 

1562 print(f'open_relacs(filepath) with filepath={path}') 

1563 # file size: 

1564 sf.seek(0, os.SEEK_END) 

1565 frames = sf.tell()//4 

1566 if self.frames is None: 

1567 self.frames = frames 

1568 elif self.frames != frames: 

1569 diff = self.frames - frames 

1570 if diff > 1 or diff < -2: 

1571 raise ValueError('number of frames of traces differ') 

1572 elif diff >= 0: 

1573 self.frames = frames 

1574 sf.seek(0) 

1575 # retrieve sampling rate and unit: 

1576 rate, us = relacs_samplerate_unit(path) 

1577 if self.rate is None: 

1578 self.rate = rate 

1579 elif rate != self.rate: 

1580 raise ValueError('sampling rates of traces differ') 

1581 if len(self.unit) == 0: 

1582 self.unit = us 

1583 elif us != self.unit: 

1584 raise ValueError('unit of traces differ') 

1585 self.channels = len(self.sf) 

1586 self.shape = (self.frames, self.channels) 

1587 self.size = self.frames * self.channels 

1588 self.ndim = len(self.shape) 

1589 self.format = 'RELACS' 

1590 self.encoding = 'FLOAT' 

1591 self.bufferframes = int(buffersize*self.rate) 

1592 self.backframes = int(backsize*self.rate) 

1593 self.init_buffer() 

1594 self.offset = 0 

1595 self.close = self._close_relacs 

1596 self.load_audio_buffer = self._load_buffer_relacs 

1597 self.basename = self._basename_relacs 

1598 self.ampl_min = -amax 

1599 self.ampl_max = +amax 

1600 self._load_metadata = self._metadata_relacs 

1601 # TODO: load markers: 

1602 self._locs = np.zeros((0, 2), dtype=int) 

1603 self._labels = np.zeros((0, 2), dtype=object) 

1604 self._load_markers = None 

1605 return self 

1606 

1607 def _close_relacs(self): 

1608 """Close the relacs data files. 

1609 """ 

1610 for file in self.sf: 

1611 file.close() 

1612 self.sf = [] 

1613 

1614 def _load_buffer_relacs(self, r_offset, r_size, buffer): 

1615 """Load new data from relacs data file. 

1616 

1617 Parameters 

1618 ---------- 

1619 r_offset: int 

1620 First frame to be read from file. 

1621 r_size: int 

1622 Number of frames to be read from file. 

1623 buffer: ndarray 

1624 Buffer where to store the loaded data. 

1625 """ 

1626 if len(self.sf) == 0 and len(self.trace_filepaths) > 0: 

1627 for path in self.trace_filepaths: 

1628 self.sf.append(open(path, 'rb')) 

1629 for i, file in enumerate(self.sf): 

1630 file.seek(r_offset*4) 

1631 data = file.read(r_size*4) 

1632 buffer[:, i] = np.frombuffer(data, dtype=np.float32) 

1633 

1634 

1635 def _metadata_relacs(self, store_empty=False, first_only=False): 

1636 """ Load meta-data of a relacs data set. 

1637 """ 

1638 path = os.path.dirname(self.filepath) 

1639 info_path = os.path.join(path, 'info.dat') 

1640 if not os.path.exists(info_path): 

1641 return {} 

1642 return relacs_header(info_path, store_empty, first_only) 

1643 

1644 def _basename_relacs(self, path=None): 

1645 """ Base name of the relacs data files. 

1646 

1647 Parameters 

1648 ---------- 

1649 path: str or None 

1650 Path of a relacs data file (*.raw, info.dat, or just the directory). 

1651 If `None`, use `self.filepath`. 

1652 

1653 Returns 

1654 ------- 

1655 s: str 

1656 The base name, i.e. the name of the directory containing the 

1657 relacs data files. 

1658 

1659 """ 

1660 if path is None: 

1661 path = self.filepath 

1662 path = Path(path) 

1663 if path.is_dir(): 

1664 return path.name 

1665 else: 

1666 return path.parent.name 

1667 

1668 

1669 # fishgrid interface:  

1670 def open_fishgrid(self, filepath, buffersize=10.0, backsize=0.0, 

1671 verbose=0): 

1672 """Open fishgrid data files (https://github.com/bendalab/fishgrid) for reading. 

1673 

1674 Parameters 

1675 ---------- 

1676 filepath: str 

1677 Path to a fishgrid data directory, or a file therein. 

1678 buffersize: float 

1679 Size of internal buffer in seconds. 

1680 backsize: float 

1681 Part of the buffer to be loaded before the requested start index in seconds. 

1682 verbose: int 

1683 If > 0 show detailed error/warning messages. 

1684 

1685 Raises 

1686 ------ 

1687 FileNotFoundError: 

1688 Invalid or non-existing fishgrid files.

1689 """ 

1690 self.verbose = verbose 

1691 

1692 self.trace_filepaths = fishgrid_trace_files(filepath) 

1693 if len(self.trace_filepaths) == 0: 

1694 raise FileNotFoundError('no fishgrid files found')

1695 self.filepath = filepath 

1696 self.file_paths = [self.filepath] 

1697 self.file_indices = [0] 

1698 self._load_metadata = metadata_fishgrid 

1699 self._load_markers = markers_fishgrid 

1700 

1701 # open grid files: 

1702 grids = fishgrid_grids(self.metadata()) 

1703 grid_sizes = [r*c for r,c in grids] 

1704 self.channels = 0 

1705 for g, path in enumerate(self.trace_filepaths): 

1706 self.channels += grid_sizes[g] 

1707 self.sf = [] 

1708 self.grid_channels = [] 

1709 self.grid_offs = [] 

1710 offs = 0 

1711 self.frames = None 

1712 self.rate = get_number(self.metadata(), 'Hz', 'AISampleRate') 

1713 v, self.unit = get_number_unit(self.metadata(), 'AIMaxVolt') 

1714 if v is not None: 

1715 self.ampl_min = -v 

1716 self.ampl_max = +v 

1717 

1718 for g, path in enumerate(self.trace_filepaths): 

1719 sf = open(path, 'rb') 

1720 self.sf.append(sf) 

1721 if verbose > 0: 

1722 print(f'open_fishgrid(filepath) with filepath={path}') 

1723 # grid channels: 

1724 self.grid_channels.append(grid_sizes[g]) 

1725 self.grid_offs.append(offs) 

1726 offs += grid_sizes[g] 

1727 # file size: 

1728 sf.seek(0, os.SEEK_END) 

1729 frames = sf.tell()//4//grid_sizes[g] 

1730 if self.frames is None: 

1731 self.frames = frames 

1732 elif self.frames != frames: 

1733 diff = self.frames - frames 

1734 if diff > 1 or diff < -2: 

1735 raise ValueError('number of frames of traces differ') 

1736 elif diff >= 0: 

1737 self.frames = frames 

1738 sf.seek(0) 

1739 self.shape = (self.frames, self.channels) 

1740 self.size = self.frames * self.channels 

1741 self.ndim = len(self.shape) 

1742 self.format = 'FISHGRID' 

1743 self.encoding = 'FLOAT' 

1744 self.bufferframes = int(buffersize*self.rate) 

1745 self.backframes = int(backsize*self.rate) 

1746 self.init_buffer() 

1747 self.offset = 0 

1748 self.close = self._close_fishgrid 

1749 self.load_audio_buffer = self._load_buffer_fishgrid 

1750 self.basename = self._basename_fishgrid 

1751 return self 

1752 

1753 def _close_fishgrid(self): 

1754 """Close the fishgrid data files. 

1755 """ 

1756 for file in self.sf: 

1757 file.close() 

1758 self.sf = [] 

1759 

1760 def _load_buffer_fishgrid(self, r_offset, r_size, buffer): 

1761 """Load new data from fishgrid data files.

1762 

1763 Parameters 

1764 ---------- 

1765 r_offset: int 

1766 First frame to be read from file. 

1767 r_size: int 

1768 Number of frames to be read from file. 

1769 buffer: ndarray 

1770 Buffer where to store the loaded data. 

1771 """ 

1772 if len(self.sf) == 0 and len(self.trace_filepaths) > 0: 

1773 for path in self.trace_filepaths: 

1774 self.sf.append(open(path, 'rb')) 
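        # each trace file stores float32 samples interleaved over the
        # channels of its grid, so frame r_offset of a grid starts at
        # byte r_offset*4*gchannels of the corresponding file: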

1775 for file, gchannels, goffset in zip(self.sf, self.grid_channels, self.grid_offs): 

1776 file.seek(r_offset*4*gchannels) 

1777 data = file.read(r_size*4*gchannels) 

1778 buffer[:, goffset:goffset+gchannels] = np.frombuffer(data, dtype=np.float32).reshape((-1, gchannels)) 

1779 

1780    def _basename_fishgrid(self, path=None):

1781 """ Base name of the fishgrid data files. 

1782 

1783 Parameters 

1784 ---------- 

1785 path: str or None 

1786 Path of a fishgrid data file 

1787 (*.raw, fishgrid.cfg, or just the directory). 

1788 If `None`, use `self.filepath`. 

1789 

1790 Returns 

1791 ------- 

1792 s: str 

1793 The base name, i.e. the name of the directory containing the 

1794 fishgrid data files. 

1795 

1796 """ 

1797 if path is None: 

1798 path = self.filepath 

1799 path = Path(path) 

1800 if path.is_dir(): 

1801 return path.name 

1802 else: 

1803 return path.parent.name 

1804 

1805 

1806 

1807 # container interface: 

1808 def open_container(self, filepath, buffersize=10.0, 

1809 backsize=0.0, verbose=0, datakey=None, 

1810 samplekey=['rate', 'Fs', 'fs'], 

1811 timekey=['time'], amplkey=['amax'], unitkey='unit', 

1812 metadatakey=['metadata', 'info'], 

1813 poskey=['positions'], 

1814 spanskey=['spans'], labelskey=['labels'], 

1815 descrkey=['descriptions'], 

1816 amax=1.0, unit='a.u.'): 

1817 """Open generic container file. 

1818 

1819 Supported file formats are: 

1820 

1821 - python pickle files (.pkl) 

1822 - numpy files (.npz) 

1823 - matlab files (.mat) 

1824 

1825 Parameters 

1826 ---------- 

1827 filepath: str 

1828 Path to a container file. 

1829 buffersize: float 

1830 Size of internal buffer in seconds. 

1831 backsize: float 

1832 Part of the buffer to be loaded before the requested start index in seconds. 

1833 verbose: int 

1834 If > 0 show detailed error/warning messages. 

1835 datakey: None, str, or list of str 

1836            Name of the variable holding the data. If `None`, take the

1837            variable that is a 2D array and has the largest number of

1838            elements.

1839 samplekey: str or list of str 

1840 Name of the variable holding the sampling rate. 

1841 timekey: str or list of str 

1842 Name of the variable holding sampling times. 

1843 If no sampling rate is available, the sampling rate is retrieved 

1844 from the sampling times. 

1845 amplkey: str or list of str 

1846 Name of the variable holding the amplitude range of the data. 

1847 unitkey: str 

1848 Name of the variable holding the unit of the data. 

1849 metadatakey: str or list of str 

1850 Name of the variable holding the metadata. 

1851 poskey: str or list of str 

1852 Name of the variable holding positions of markers. 

1853 spanskey: str or list of str 

1854 Name of the variable holding spans of markers. 

1855 labelskey: str or list of str 

1856 Name of the variable holding labels of markers. 

1857 descrkey: str or list of str 

1858 Name of the variable holding descriptions of markers. 

1859 amax: None or float 

1860 If specified and no amplitude range has been found in the data 

1861 container, then this is the amplitude range of the data. 

1862 unit: None or str 

1863 If specified and no unit has been found in the data container, 

1864 then return this as the unit of the data. 

1865 

1866 Raises 

1867 ------ 

1868 ValueError: 

1869 Invalid key requested. 
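        Examples
        --------
        A minimal sketch; the file name and the keys are assumptions
        that match the default `datakey`, `samplekey`, `amplkey`, and
        `unitkey`:
        ```
        import numpy as np
        x = np.sin(2.0*np.pi*440.0*np.arange(0.0, 1.0, 1.0/44100.0))
        np.savez('tone.npz', data=x.reshape(-1, 1), rate=44100,
                 amax=1.0, unit='V')
        data = DataLoader('tone.npz')
        ```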

1870 """ 

1871 self.verbose = verbose 

1872 data_dict = {} 

1873 ext = os.path.splitext(filepath)[1] 

1874 if ext == '.pkl': 

1875 import pickle 

1876 with open(filepath, 'rb') as f: 

1877 data_dict = pickle.load(f) 

1878 self.format = 'PKL' 

1879 elif ext == '.npz': 

1880 data_dict = np.load(filepath) 

1881 self.format = 'NPZ' 

1882 elif ext == '.mat': 

1883 from scipy.io import loadmat 

1884 data_dict = loadmat(filepath, squeeze_me=True) 

1885 self.format = 'MAT' 

1886 self.buffer, self.rate, self.unit, amax = \ 

1887 extract_container_data(data_dict, datakey, samplekey, 

1888 timekey, amplkey, unitkey, amax, unit) 

1889 self.filepath = filepath 

1890 self.file_paths = [self.filepath] 

1891 self.file_indices = [0] 

1892 self.channels = self.buffer.shape[1] 

1893 self.frames = self.buffer.shape[0] 

1894 self.shape = self.buffer.shape 

1895 self.ndim = self.buffer.ndim 

1896 self.size = self.buffer.size 

1897 self.encoding = self.numpy_encodings[self.buffer.dtype] 

1898 self.ampl_min = -amax 

1899 self.ampl_max = +amax 

1900 self.offset = 0 

1901 self.buffer_changed = np.zeros(self.channels, dtype=bool) 

1902 self.bufferframes = self.frames 

1903 self.backsize = 0 

1904 self.close = self._close_container 

1905 self.load_audio_buffer = self._load_buffer_container 

1906 self._metadata = extract_container_metadata(data_dict, metadatakey) 

1907 self._load_metadata = None 

1908 self._locs, self._labels = extract_container_markers(data_dict, 

1909 poskey, 

1910 spanskey, 

1911 labelskey, 

1912 descrkey) 

1913 self._load_markers = None 

1914 

1915 def _close_container(self): 

1916 """Close container. """ 

1917 pass 

1918 

1919 def _load_buffer_container(self, r_offset, r_size, buffer): 

1920 """Load new data from container.""" 

1921 buffer[:, :] = self.buffer[r_offset:r_offset + r_size, :] 

1922 

1923 

1924 # raw data interface: 

1925 def open_raw(self, filepath, buffersize=10.0, backsize=0.0, 

1926 verbose=0, rate=44000, channels=1, dtype=np.float32, 

1927 amax=1.0, unit='a.u.'): 

1928 """Load data from a raw file. 

1929 

1930 Raw files just contain the data and absolutely no metadata, not 

1931        even the sampling rate, number of channels, etc.

1932 Supported file formats are: 

1933 

1934 - raw files (*.raw) 

1935 - LabView scandata (*.scandat) 

1936 

1937 Parameters 

1938 ---------- 

1939 filepath: str 

1940 Path of the file to load. 

1941 buffersize: float 

1942 Size of internal buffer in seconds. 

1943 backsize: float 

1944 Part of the buffer to be loaded before the requested start index in seconds. 

1945 verbose: int 

1946 If > 0 show detailed error/warning messages. 

1947 rate: float 

1948 Sampling rate of the data in Hertz. 

1949 channels: int 

1950 Number of channels multiplexed in the data. 

1951 dtype: str or numpy.dtype 

1952 The data type stored in the file. 

1953 amax: float 

1954 The amplitude range of the data. 

1955 unit: str 

1956 The unit of the data. 
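        Examples
        --------
        A minimal sketch that writes two channels of float32 samples
        to a file and reads them back (the file name is an assumption):
        ```
        import numpy as np
        x = np.random.randn(44100, 2).astype(np.float32)
        x.tofile('data.raw')
        data = DataLoader()
        data.open_raw('data.raw', rate=44100, channels=2,
                      dtype=np.float32)
        ```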

1957 """ 

1958 self.verbose = verbose 

1959 self.filepath = filepath 

1960 self.file_paths = [self.filepath] 

1961 self.file_indices = [0] 

1962 self.sf = open(self.filepath, 'rb') 

1963 if verbose > 0: 

1964 print(f'open_raw(filepath) with filepath={filepath}') 

1965 self.dtype = np.dtype(dtype) 

1966 self.rate = float(rate) 

1967 # file size: 

1968 self.sf.seek(0, os.SEEK_END) 

1969        self.frames = self.sf.tell()//self.dtype.itemsize//int(channels)

1970 self.sf.seek(0) 

1971 self.channels = int(channels) 

1972 self.shape = (self.frames, self.channels) 

1973 self.ndim = len(self.shape) 

1974 self.size = self.frames*self.channels 

1975 self.format = 'RAW' 

1976 self.encoding = self.numpy_encodings.get(self.dtype, 'UNKNOWN') 

1977 self.unit = unit 

1978 self.ampl_max = float(amax) 

1979 self.ampl_min = -self.ampl_max 

1980 self.offset = 0 

1981 self.bufferframes = int(buffersize*self.rate) 

1982 self.backframes = int(backsize*self.rate) 

1983 self.init_buffer() 

1984 self.close = self._close_raw 

1985 self.load_audio_buffer = self._load_buffer_raw 

1986 self._metadata = None 

1987 self._load_metadata = None 

1988 self._locs = None 

1989 self._labels = None 

1990 self._load_markers = None 

1991 

1992 def _close_raw(self): 

1993 """Close raw file. """ 

1994 self.sf.close() 

1995 self.sf = None 

1996 

1997 def _load_buffer_raw(self, r_offset, r_size, buffer): 

1998 """Load new data from container.""" 

1999 if self.sf is None: 

2000 self.sf = open(self.filepath, 'rb') 

2001        self.sf.seek(r_offset*self.dtype.itemsize*self.channels)

2002        raw_data = self.sf.read(r_size*self.dtype.itemsize*self.channels)

2003 raw_data = np.frombuffer(raw_data, dtype=self.dtype) 

2004 raw_data = raw_data.reshape(-1, self.channels) 

2005 # recode: 
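        # integer samples are scaled such that the full integer range
        # maps onto -ampl_max to +ampl_max: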

2006 if self.dtype == np.dtype('int16'): 

2007 data = raw_data.astype('float32') 

2008 data *= self.ampl_max/2**15 

2009 elif self.dtype == np.dtype('int32'): 

2010 data = raw_data.astype(float) 

2011 data *= self.ampl_max/2**31 

2012 elif self.dtype == np.dtype('int64'): 

2013 data = raw_data.astype(float) 

2014 data *= self.ampl_max/2**63 

2015 else: 

2016 data = raw_data 

2017 buffer[:, :] = data 

2018 

2019 

2020 # audioio interface:  

2021 def open_audioio(self, filepath, buffersize=10.0, backsize=0.0, 

2022 verbose=0, gainkey=default_gain_keys, sep='.', 

2023 amax=None, unit='a.u.'): 

2024 """Open an audio file. 

2025 

2026 See the [audioio](https://github.com/bendalab/audioio) package 

2027 for details. 

2028 

2029 Parameters 

2030 ---------- 

2031 filepath: str 

2032 Path to an audio file. 

2033 buffersize: float 

2034 Size of internal buffer in seconds. 

2035 backsize: float 

2036 Part of the buffer to be loaded before the requested start index 

2037 in seconds. 

2038 verbose: int 

2039 If > 0 show detailed error/warning messages. 

2040 gainkey: str or list of str 

2041 Key in the file's metadata that holds some gain information. 

2042 If found, the data will be multiplied with the gain, 

2043 and if available, the corresponding unit is returned. 

2044 See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details. 

2045 sep: str 

2046 String that separates section names in `gainkey`. 

2047 amax: None or float 

2048 If specified and no gain has been found in the metadata, 

2049 then use this as the amplitude range. 

2050 unit: None or str 

2051 If specified and no gain has been found in the metadata, 

2052 then this is the unit of the data. 
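        Examples
        --------
        A minimal sketch; the file name, the metadata key 'Gain', and
        the fallback unit are assumptions:
        ```
        data = DataLoader()
        data.open_audioio('data.wav', gainkey=['Gain'], unit='mV')
        print(data.ampl_max, data.unit)
        ```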

2053 

2054 """ 

2055 self.verbose = verbose 

2056 super(DataLoader, self).open(filepath, buffersize, backsize, verbose) 

2057 md = self.metadata() 

2058 fac, unit = get_gain(md, gainkey, sep, amax, unit) 

2059 if fac is None: 

2060 self.gain_fac = 1.0 

2061 else: 

2062 self.gain_fac = fac 

2063 self._load_buffer_audio_org = self.load_audio_buffer 

2064 self.load_audio_buffer = self._load_buffer_audioio 

2065 self.ampl_min *= self.gain_fac 

2066 self.ampl_max *= self.gain_fac 

2067 self.unit = unit 

2068 return self 

2069 

2070 def _load_buffer_audioio(self, r_offset, r_size, buffer): 

2071 """Load and scale new data from an audio file. 

2072 

2073 Parameters 

2074 ---------- 

2075 r_offset: int 

2076 First frame to be read from file. 

2077 r_size: int 

2078 Number of frames to be read from file. 

2079 buffer: ndarray 

2080 Buffer where to store the loaded data. 

2081 """ 

2082 self._load_buffer_audio_org(r_offset, r_size, buffer) 

2083 buffer *= self.gain_fac 

2084 

2085 

2086 # open multiple files as one: 

2087 def open_multiple(self, filepaths, buffersize=10.0, backsize=0.0, 

2088 verbose=0, rate=None, channels=None, 

2089 unit=None, amax=None, end_indices=None): 

2090 """Open multiple files as a single concatenated array. 

2091 

2092 Parameters 

2093 ---------- 

2094 filepaths: list of str 

2095 List of file names of audio files. 

2096 buffersize: float 

2097 Size of internal buffer in seconds. 

2098 backsize: float 

2099 Part of the buffer to be loaded before the requested start index in seconds. 

2100 verbose: int 

2101            If > 0 show detailed error/warning messages.

2102        rate: float

2103            Sampling rate of the data in Hertz. Only used for the

2104            minimal initialization (no opening and checking of files)

2105            that is triggered by providing `end_indices`.

2106        channels: int

2107            Number of channels of the data

2108            (only used for the minimal initialization,

2109            see `end_indices`).

2110        unit: str

2111            Unit of the data

2112            (only used for the minimal initialization,

2113            see `end_indices`).

2114        amax: float

2115            Maximum amplitude of the data

2116            (only used for the minimal initialization,

2117            see `end_indices`).

2118        end_indices: sequence of int

2119            If provided, do a minimal initialization without opening

2120            and checking the files: frame indices of the ends of each

2121            file in the concatenated data.

2122 

2123 Raises 

2124 ------ 

2125 TypeError 

2126 `filepaths` must be a sequence. 

2127 ValueError 

2128 Empty `filepaths`. 

2129 FileNotFoundError 

2130 `filepaths` does not contain a single valid file. 

2131 
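        Examples
        --------
        A minimal sketch; the file names are assumptions, and all
        files need matching number of channels, sampling rate, and
        amplitude range:
        ```
        import glob
        files = sorted(glob.glob('recording-*.wav'))
        data = DataLoader(files, 60.0)  # lists are dispatched to open_multiple()
        ```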

2132 """ 

2133 if not isinstance(filepaths, (list, tuple, np.ndarray)): 

2134 raise TypeError('input argument filepaths is not a sequence!') 

2135 if len(filepaths) == 0: 

2136            raise ValueError('input argument filepaths is an empty sequence!')

2137 self.buffersize = buffersize 

2138 self.backsize = backsize 

2139 self.filepath = None 

2140 self.file_paths = [] 

2141 self.open_files = [] 

2142 self.open_loaders = [] 

2143 self.data_files = [] 

2144 self.collect_counter = 0 

2145 self.frames = 0 

2146 self.start_indices = [] 

2147 self.end_indices = [] 

2148 self.start_time = None 

2149 start_time = None 

2150 self._metadata = {} 

2151 self._locs = np.zeros((0, 2), dtype=int) 

2152 self._labels = np.zeros((0, 2), dtype=object) 

2153 if end_indices is not None: 

2154 self.filepath = filepaths[0] 

2155 self.file_paths = filepaths 

2156 self.data_files = [None] * len(filepaths) 

2157 self.frames = end_indices[-1] 

2158 self.start_indices = [0] + list(end_indices[:-1]) 

2159 self.end_indices = end_indices 

2160 self.format = None 

2161 self.encoding = None 

2162 self.rate = rate 

2163 self.channels = channels 

2164 self.unit = unit 

2165 self.ampl_max = amax 

2166 self.ampl_min = -amax 

2167 else: 

2168 for filepath in filepaths: 

2169 try: 

2170 a = DataLoader(filepath, buffersize, backsize, verbose) 

2171 except Exception as e: 

2172 if verbose > 0: 

2173 print(e) 

2174 continue 

2175 # collect metadata: 

2176 md = a.metadata() 

2177 fmd = flatten_metadata(md, True) 

2178 add_metadata(self._metadata, fmd) 

2179 if self.filepath is None: 

2180 # first file: 

2181 self.filepath = a.filepath 

2182 self.format = a.format 

2183 self.encoding = a.encoding 

2184 self.rate = a.rate 

2185 self.channels = a.channels 

2186 self.unit = a.unit 

2187 self.ampl_max = a.ampl_max 

2188 self.ampl_min = a.ampl_min 

2189 self.start_time = get_datetime(md) 

2190 start_time = self.start_time 

2191 else: 

2192 # check channels, rate, and amplitudes: 

2193 error_str = None 

2194 if a.channels != self.channels: 

2195 error_str = f'number of channels differs: ' \ 

2196 f'{a.channels} in {a.filepath} versus ' \ 

2197 f'{self.channels} in {self.filepath}' 

2198 if a.rate != self.rate: 

2199 error_str = f'sampling rates differ: ' \ 

2200 f'{a.rate} in {a.filepath} versus ' \ 

2201 f'{self.rate} in {self.filepath}' 

2202 if a.ampl_min != self.ampl_min: 

2203 error_str = f'minimum amplitudes differ: ' \ 

2204 f'{a.ampl_min} in {a.filepath} versus ' \ 

2205 f'{self.ampl_min} in {self.filepath}' 

2206 if a.ampl_max != self.ampl_max: 

2207                        error_str = f'maximum amplitudes differ: ' \

2208 f'{a.ampl_max} in {a.filepath} versus ' \ 

2209 f'{self.ampl_max} in {self.filepath}' 

2210 # check start time of recording: 

2211 stime = get_datetime(md) 

2212 if start_time is None or stime is None or \ 

2213 abs(start_time - stime) > timedelta(seconds=1): 

2214                        error_str = f'start time does not indicate a continuous recording: ' \

2215                            f'expected {start_time} but got ' \

2216                            f'{stime} in {a.filepath}'

2217 if error_str is not None: 

2218 if verbose > 0: 

2219 print(error_str) 

2220 a.close() 

2221 del a 

2222 break 

2223 # markers: 

2224 locs, labels = a.markers() 

2225 locs[:,0] += self.frames 

2226 self._locs = np.vstack((self._locs, locs)) 

2227 self._labels = np.vstack((self._labels, labels)) 

2228 # indices: 

2229 self.start_indices.append(self.frames) 

2230 self.frames += a.frames 

2231 self.end_indices.append(self.frames) 

2232 if start_time is not None: 

2233 start_time += timedelta(seconds=a.frames/a.rate) 

2234 # add file to lists: 

2235 self.file_paths.append(filepath) 

2236 if len(self.open_files) < AudioLoader.max_open_files: 

2237 self.open_files.append(a) 

2238 else: 

2239 a.close() 

2240 if len(self.open_loaders) < AudioLoader.max_open_loaders: 

2241 self.data_files.append(a) 

2242 self.open_loaders.append(a) 

2243 else: 

2244 a.close() 

2245 del a 

2246 self.data_files.append(None) 

2247 if len(self.data_files) == 0: 

2248            raise FileNotFoundError('input argument filepaths does not contain a single valid data file!')

2249        # set start time from first file:

2250 if self.start_time is not None: 

2251 set_starttime(self._metadata, self.start_time) 

2252 # setup infrastructure: 

2253 self.file_indices = self.start_indices 

2254 self.start_indices = np.array(self.start_indices) 

2255 self.end_indices = np.array(self.end_indices) 

2256 self.shape = (self.frames, self.channels) 

2257 self.bufferframes = int(buffersize*self.rate) 

2258 self.backframes = int(backsize*self.rate) 

2259 self.init_buffer() 

2260 self.close = self._close_multiple 

2261 self.load_audio_buffer = self._load_buffer_multiple 

2262 self._load_metadata = None 

2263 self._load_markers = None 

2264 return self 

2265 

2266 def _close_multiple(self): 

2267 """Close all the data files. """ 

2268 self.open_files = [] 

2269 self.open_loaders = [] 

2270 if hasattr(self, 'data_files'): 

2271 for a in self.data_files: 

2272 if a is not None: 

2273 a.close() 

2274 self.data_files = [] 

2275 self.filepath = None 

2276 self.file_paths = [] 

2277 self.file_indices = [] 

2278 self.start_indices = [] 

2279 self.end_indices = [] 

2280 del self.data_files 

2281 del self.open_files 

2282 del self.open_loaders 

2283 del self.start_indices 

2284 del self.end_indices 

2285 

2286 def _load_buffer_multiple(self, r_offset, r_size, buffer): 

2287 """Load new data from the underlying files. 

2288 

2289 Parameters 

2290 ---------- 

2291 r_offset: int 

2292 First frame to be read from file. 

2293 r_size: int 

2294 Number of frames to be read from file. 

2295 buffer: ndarray 

2296 Buffer where to store the loaded data. 

2297 """ 

2298 offs = r_offset 

2299 size = r_size 

2300 boffs = 0 
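        # find the file containing the first requested frame, e.g. with
        # end_indices [100, 250] an offset of 120 yields ai = 1, i.e.
        # the second file: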

2301 ai = np.searchsorted(self.end_indices, offs, side='right') 

2302 while size > 0: 

2303 if self.data_files[ai] is None: 

2304 a = DataLoader(self.file_paths[ai], 

2305 self.buffersize, self.backsize, 0) 

2306 self.data_files[ai] = a 

2307 self.open_loaders.append(a) 

2308 self.open_files.append(a) 

2309 if len(self.open_files) > AudioLoader.max_open_files: 

2310 a0 = self.open_files.pop(0) 

2311 a0.close() 

2312 if len(self.open_loaders) > AudioLoader.max_open_loaders: 

2313 a0 = self.open_loaders.pop(0) 

2314 self.data_files[self.data_files.index(a0)] = None 

2315 a0.close() 

2316 del a0 

2317 self.collect_counter += 1 

2318 if self.collect_counter > AudioLoader.max_open_loaders//2: 

2319 gc.collect() # takes time! 

2320 self.collect_counter = 0 

2321 else: 

2322 self.open_loaders.pop(self.open_loaders.index(self.data_files[ai])) 

2323 self.open_loaders.append(self.data_files[ai]) 

2324 ai0 = offs - self.start_indices[ai] 

2325 ai1 = offs + size 

2326 if ai1 > self.end_indices[ai]: 

2327 ai1 = self.end_indices[ai] 

2328 ai1 -= self.start_indices[ai] 

2329 n = ai1 - ai0 

2330 self.data_files[ai].load_audio_buffer(ai0, n, 

2331 buffer[boffs:boffs + n,:]) 

2332 if self.data_files[ai] in self.open_files: 

2333 self.open_files.pop(self.open_files.index(self.data_files[ai])) 

2334 self.open_files.append(self.data_files[ai]) 

2335 if len(self.open_files) > AudioLoader.max_open_files: 

2336 self.open_files[0].close() 

2337 self.open_files.pop(0) 

2338 boffs += n 

2339 offs += n 

2340 size -= n 

2341 ai += 1 

2342 

2343 

2344 def open(self, filepath, buffersize=10.0, backsize=0.0, 

2345 verbose=0, **kwargs): 

2346 """Open file with time-series data for reading. 

2347 

2348 Parameters 

2349 ---------- 

2350 filepath: str or list of str 

2351 Name of the file or list of many file names that should be 

2352 made accessible as a single array. 

2353 buffersize: float 

2354 Size of internal buffer in seconds. 

2355 backsize: float 

2356 Part of the buffer to be loaded before the requested start index 

2357 in seconds. 

2358 verbose: int 

2359 If > 0 show detailed error/warning messages. 

2360 **kwargs: dict 

2361 Further keyword arguments that are passed on to the  

2362 format specific opening functions. 

2363 For example: 

2364 - `amax`: the amplitude range of the data. 

2365            - `unit`: the unit of the data.

2366 

2367 Raises 

2368 ------ 

2369 ValueError: 

2370            `filepath` is an empty string.
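        Examples
        --------
        A minimal sketch; the file name is an assumption:
        ```
        data = DataLoader()
        data.open('data.npz', buffersize=60.0)
        ```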

2371 """ 

2372 # list of implemented open functions: 
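        # each entry: format name, check function (None matches any
        # file), the method that opens the file, and a factor applied
        # to verbose for the info printout below: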

2373 data_open_funcs = ( 

2374 ('relacs', check_relacs, self.open_relacs, 1), 

2375 ('fishgrid', check_fishgrid, self.open_fishgrid, 1), 

2376 ('container', check_container, self.open_container, 1), 

2377 ('raw', check_raw, self.open_raw, 1), 

2378 ('audioio', None, self.open_audioio, 0), 

2379 ) 

2380 

2381 self.buffer = np.array([]) 

2382 self.rate = 0.0 

2383 if not filepath: 

2384            raise ValueError('input argument filepath is an empty string!')

2385 if isinstance(filepath, (list, tuple, np.ndarray)): 

2386 if len(filepath) > 1: 

2387 self.open_multiple(filepath, buffersize, backsize, 

2388 verbose, **kwargs) 

2389 if len(self.file_paths) > 1: 

2390 return self 

2391 filepath = self.file_paths[0] 

2392 self.close() 

2393 else: 

2394 filepath = filepath[0] 

2395 # open data: 

2396 for name, check_file, open_file, v in data_open_funcs: 

2397 if check_file is None or check_file(filepath): 

2398 open_file(filepath, buffersize, backsize, verbose, **kwargs) 

2399 if v*verbose > 1: 

2400 if self.format is not None: 

2401 print(f' format : {self.format}') 

2402 if self.encoding is not None: 

2403 print(f' encoding : {self.encoding}') 

2404 print(f' sampling rate: {self.rate} Hz') 

2405 print(f' channels : {self.channels}') 

2406 print(f' frames : {self.frames}') 

2407 print(f' range : {self.ampl_max:g}{self.unit}') 

2408 break 

2409 return self 

2410 

2411 

2412def demo(filepath, plot=False): 

2413 print("try load_data:") 

2414 data, rate, unit, amax = load_data(filepath, verbose=2) 

2415 if plot: 

2416 fig, ax = plt.subplots() 

2417 time = np.arange(len(data))/rate 

2418 for c in range(data.shape[1]): 

2419 ax.plot(time, data[:,c]) 

2420 ax.set_xlabel('Time [s]') 

2421 ax.set_ylabel(f'[{unit}]') 

2422 if amax is not None and np.isfinite(amax): 

2423 ax.set_ylim(-amax, +amax) 

2424 plt.show() 

2425 return 

2426 

2427 print('') 

2428 print("try DataLoader:") 

2429 with DataLoader(filepath, 2.0, 1.0, 1) as data: 

2430 print('sampling rate: %g' % data.rate) 

2431 print('frames : %d %d' % (len(data), data.shape[0])) 

2432 nframes = int(1.0 * data.rate) 

2433 # forward: 

2434 for i in range(0, len(data), nframes): 

2435 print('forward %d-%d' % (i, i + nframes)) 

2436 x = data[i:i + nframes, 0] 

2437 if plot: 

2438 fig, ax = plt.subplots() 

2439 ax.plot((i + np.arange(len(x)))/data.rate, x) 

2440 ax.set_xlabel('Time [s]') 

2441 ax.set_ylabel(f'[{data.unit}]') 

2442 plt.show() 

2443 # and backwards: 

2444 for i in reversed(range(0, len(data), nframes)): 

2445 print('backward %d-%d' % (i, i + nframes)) 

2446 x = data[i:i + nframes, 0] 

2447 if plot: 

2448 fig, ax = plt.subplots() 

2449 ax.plot((i + np.arange(len(x)))/data.rate, x) 

2450 ax.set_xlabel('Time [s]') 

2451 ax.set_ylabel(f'[{data.unit}]') 

2452 plt.show() 

2453 

2454 

2455def main(*cargs): 

2456 """Call demo with command line arguments. 

2457 

2458 Parameters 

2459 ---------- 

2460 cargs: list of str 

2461 Command line arguments as provided by sys.argv[1:] 

2462 """ 

2463 import argparse 

2464 parser = argparse.ArgumentParser(description= 

2465 'Checking thunderlab.dataloader module.') 

2466 parser.add_argument('-p', dest='plot', action='store_true', 

2467 help='plot loaded data') 

2468 parser.add_argument('file', nargs=1, default='', type=str, 

2469 help='name of data file') 

2470 args = parser.parse_args(cargs) 

2471 demo(args.file[0], args.plot) 

2472 

2473 

2474if __name__ == "__main__": 

2475 main(*sys.argv[1:])