Coverage for src / thunderlab / dataloader.py: 84%

1114 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-10 21:21 +0000

1"""Load time-series data from files. 

2 

3``` 

4data, rate, unit, amax = load_data('data/file.wav') 

5``` 

6 

7The function `load_data()` loads the whole time-series from the file 

8as a numpy array of floats. First dimension is frames, second is 

9channels. In contrast to the `audioio.load_audio()` function, the 

10values of the data array are not restricted between -1 and 1. They can 

11assume any value within the range `-amax` to `+amax` with the returned 

12`unit`. 

13 

14``` 

15data = DataLoader('data/file.wav', 60.0) 

16``` 

17or 

18``` 

19with DataLoader('data/file.wav', 60.0) as data: 

20``` 

21Create a `DataLoader` object that loads chunks of 60 seconds long data 

22on demand. `data` can be used like a read-only numpy array of floats. 

23 

24 

25## Supported file formats 

26 

27- python pickle files 

28- numpy .npz files 

29- matlab .mat files 

30- audio files via [`audioio`](https://github.com/bendalab/audioio) package 

31- LabView .scandat files 

32- raw files 

33- relacs files (https://www.relacs.net) 

34- fishgrid files (https://github.com/bendalab/fishgrid) 

35 

36 

37## Metadata 

38 

39Many file formats allow to store metadata that further describe the 

40stored time series data. We handle them as nested dictionary of key-value 

41pairs. Load them with the `metadata()` function: 

42``` 

43metadata = metadata('data/file.mat') 

44``` 

45 

46## Markers 

47 

48Some file formats also allow to store markers that mark specific 

49positions in the time series data. Load marker positions and spans (in 

50the 2-D array `locs`) and label and text strings (in the 2-D array 

51`labels`) with the `markers()` function: 

52``` 

53locs, labels = markers('data.wav') 

54``` 

55 

56## Additional, format-specific functions 

57 

58- `extract_container_metadata()`: extract metadata from dictionary loaded from a container file. 

59- `relacs_samplerate_unit()`: retrieve sampling rate and unit from a relacs stimuli.dat file. 

60- `relacs_header()`: read key-value pairs from relacs *.dat file headers. 

61- `fishgrid_grids()`: retrieve grid sizes from a fishgrid.cfg file. 

62- `fishgrid_spacings()`: spacing between grid electrodes. 

63 

64""" 

65 

66import gc 

67import os 

68import sys 

69import gzip 

70import pickle 

71import numpy as np 

72try: 

73 import matplotlib.pyplot as plt 

74except ImportError: 

75 pass 

76 

77from pathlib import Path 

78from datetime import timedelta 

79 

80from audioio import load_audio, AudioLoader, unflatten_metadata 

81from audioio import get_number_unit, get_number, get_int, get_bool, get_gain 

82from audioio import default_starttime_keys, default_gain_keys 

83from audioio import get_datetime, flatten_metadata, add_metadata, set_starttime 

84from audioio import metadata as metadata_audioio 

85from audioio import markers as markers_audioio 

86 

87 

def relacs_samplerate_unit(filepath, channel=0):
    """Retrieve sampling rate and unit from a relacs stimuli.dat file.

    Parameters
    ----------
    filepath: str or Path
        Path to a relacs data directory, or a file in a relacs data directory.
    channel: int
        Channel (trace) number, if `filepath` does not specify a
        trace-*.raw file.

    Returns
    -------
    samplerate: float
        Sampling rate in Hertz
    unit: str
        Unit of the trace, can be empty if not found

    Raises
    ------
    IOError/FileNotFoundError:
        If the stimuli.dat file does not exist.
    ValueError:
        stimuli.dat file does not contain sampling rate.
    """
    trace = channel + 1
    relacs_dir = Path(filepath)
    # check for relacs data directory:
    if not relacs_dir.is_dir():
        bn = relacs_dir.stem.lower()
        relacs_dir = relacs_dir.parent
        # a trace-N.raw file overrides the channel argument:
        if len(bn) > 6 and bn[:6] == 'trace-':
            trace = int(bn[6:])

    # retrieve sampling rate and unit from stimuli.dat file:
    samplerate = None
    sampleinterval = None
    unit = ""

    # load stimuli.dat header (comment lines starting with '#'):
    lines = []
    stimuli_file = relacs_dir / 'stimuli.dat.gz'
    if stimuli_file.is_file():
        # fix: gzip must be opened in text mode 'rt' — mode 'r' is binary
        # for gzip.open() and rejects the encoding argument:
        with gzip.open(stimuli_file, 'rt', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    else:
        stimuli_file = relacs_dir / 'stimuli.dat'
        with open(stimuli_file, 'r', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    # extract unit and sampling rate:
    for line in lines:
        if f'unit{trace}' in line:
            unit = line.split(':')[1].strip()
        if f'sampling rate{trace}' in line:
            value = line.split(':')[1].strip()
            samplerate = float(value.replace('Hz',''))
        elif f'sample interval{trace}' in line:
            value = line.split(':')[1].strip()
            sampleinterval = float(value.replace('ms',''))

    if samplerate is not None:
        return samplerate, unit
    if sampleinterval is not None:
        # sample interval is given in milliseconds:
        return 1000/sampleinterval, unit
    raise ValueError(f'could not retrieve sampling rate from {stimuli_file}')

162 

163 

def relacs_header(filepath, store_empty=False, first_only=False,
                  lower_keys=False, flat=False,
                  add_sections=False):
    """Read key-value pairs from a relacs *.dat file header.

    Parameters
    ----------
    filepath: str or Path
        A relacs *.dat file, can be also a zipped .gz file.
    store_empty: bool
        If `False` do not add meta data with empty values.
    first_only: bool
        If `False` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with sections names separated by
        '.' to make them unique.

    Returns
    -------
    data: dict
        Nested dictionary with key-value pairs of the file header.

    Raises
    ------
    IOError/FileNotFoundError:
        If `filepath` cannot be opened.
    """
    filepath = Path(filepath)
    # read in header from file (prefer a zipped version if present):
    lines = []
    gzfilepath = filepath.with_suffix(filepath.suffix + '.gz')
    if gzfilepath.is_file():
        # fix: gzip must be opened in text mode 'rt' — mode 'r' is binary
        # for gzip.open() and rejects the encoding argument:
        with gzip.open(gzfilepath, 'rt', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    else:
        with open(filepath, 'r', encoding='latin-1') as sf:
            for line in sf:
                line = line.strip()
                if len(line) == 0 or line[0] != '#':
                    break
                lines.append(line)
    # parse:
    data = {}
    cdatas = [data]          # stack of nested dictionaries
    sections = ['']          # stack of section names
    ident_offs = None        # indentation of the top level
    ident = None             # indentation per section level
    for line in lines:
        words = line.split(':')
        value = ':'.join(words[1:]).strip() if len(words) > 1 else ''
        if len(words) >= 1:
            key = words[0].strip('#')
            # get section level from indentation:
            level = 0
            if not flat or len(value) == 0:
                nident = len(key) - len(key.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
                # close sections:
                if not flat:
                    while len(cdatas) > level + 1:
                        cdatas[-1][sections.pop()] = cdatas.pop()
                else:
                    while len(sections) > level + 1:
                        sections.pop()
            # key:
            key = key.strip().strip('"')
            if lower_keys:
                key = key.lower()
            skey = key
            if add_sections:
                key = '.'.join(sections[1:] + [key])
            if len(value) == 0:
                # new sub-section:
                if flat:
                    if store_empty:
                        cdatas[-1][key] = None
                else:
                    cdatas.append({})
                sections.append(skey)
            else:
                # key-value pair:
                value = value.strip('"')
                # fix: the original condition `len(value) > 0 or
                # value != '-' or store_empty` was always True; per the
                # store_empty contract, skip empty and '-' placeholders:
                if (len(value) > 0 and value != '-') or store_empty:
                    if len(value) > 0 and value[0] == '[' and value[-1] == ']':
                        value = [v.strip() for v in value.lstrip('[').rstrip(']').split(',')]
                        if first_only:
                            value = value[0]
                    cdatas[-1][key] = value
    # close all remaining sections:
    while len(cdatas) > 1:
        cdatas[-1][sections.pop()] = cdatas.pop()
    return data

272 

273 

def check_relacs(filepath):
    """Check for valid relacs file.

    Parameters
    ----------
    filepath: str or Path
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    is_relacs: boolean
        `True` if `filepath` is a valid relacs directory or is a file therein.
    """
    # resolve to the containing relacs data directory:
    folder = Path(filepath)
    if not folder.is_dir():
        folder = folder.parent
    # a valid relacs directory has a stimuli file and at least one trace:
    has_stimuli = any((folder / name).is_file()
                      for name in ('stimuli.dat', 'stimuli.dat.gz'))
    has_trace = any((folder / name).is_file()
                    for name in ('trace-1.raw', 'trace-1.raw.gz'))
    return has_stimuli and has_trace

303 

304 

def relacs_trace_files(filepath):
    """Expand file path for relacs data to appropriate trace*.raw file names.

    Parameters
    ----------
    filepath: str or Path
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    trace_filepaths: list of Path
        List of relacs trace*.raw files.
    """
    folder = Path(filepath)
    if not folder.is_dir():
        folder = folder.parent
    # collect consecutively numbered trace files, plain or gzipped,
    # stopping at the first gap:
    trace_paths = []
    for n in range(1, 10001):
        plain = folder / f'trace-{n}.raw'
        packed = folder / f'trace-{n}.raw.gz'
        if plain.is_file():
            trace_paths.append(plain)
        elif packed.is_file():
            trace_paths.append(packed)
        else:
            break
    return trace_paths

332 

333 

def load_relacs(filepath, amax=1.0):
    """Load traces that have been recorded with relacs (https://github.com/relacs/relacs).

    Parameters
    ----------
    filepath: str of Path
        Path to a relacs data directory, or a file in a relacs data directory.
    amax: float
        The amplitude range of the data.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz
    unit: str
        Unit of the data
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    FileNotFoundError:
        Invalid or non existing relacs files.
    ValueError:
        - Invalid name for relacs trace-*.raw file.
        - Sampling rates of traces differ.
        - Unit of traces differ.
    """
    trace_filepaths = relacs_trace_files(filepath)
    if len(trace_filepaths) == 0:
        raise FileNotFoundError('no relacs files found')
    # load trace*.raw files:
    nchannels = len(trace_filepaths)
    data = None
    nrows = 0
    rate = None
    unit = ''
    # fix: relacs_trace_files() already returns the traces in numeric
    # order; the previous lexicographic sorted() put trace-10 before
    # trace-2 and scrambled channel order for more than 9 channels.
    for c, path in enumerate(trace_filepaths):
        if path.suffix == '.gz':
            with gzip.open(path, 'rb') as sf:
                x = np.frombuffer(sf.read(), dtype=np.float32)
        else:
            x = np.fromfile(path, np.float32)
        if data is None:
            # size the output from the first trace:
            nrows = len(x)
            data = np.zeros((nrows, nchannels))
        n = min(len(x), nrows)
        data[:n,c] = x[:n]
        # retrieve sampling rate and unit and check consistency:
        crate, us = relacs_samplerate_unit(path, c)
        if rate is None:
            rate = crate
        elif crate != rate:
            raise ValueError('sampling rates of traces differ')
        if len(unit) == 0:
            unit = us
        elif us != unit:
            raise ValueError('unit of traces differ')
    return data, rate, unit, amax

396 

397 

def metadata_relacs(filepath, store_empty=False, first_only=False,
                    lower_keys=False, flat=False, add_sections=False):
    """ Read meta-data of a relacs data set.

    Parameters
    ----------
    filepath: str or Path
        A relacs data directory or a file therein.
    store_empty: bool
        If `False` do not add meta data with empty values.
    first_only: bool
        If `False` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with sections names separated by
        '.' to make them unique.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    folder = Path(filepath)
    if not folder.is_dir():
        folder = folder.parent
    # metadata live in the info.dat file of the data directory:
    info_file = folder / 'info.dat'
    if info_file.is_file():
        return relacs_header(info_file, store_empty, first_only,
                             lower_keys, flat, add_sections)
    return dict()

434 

435 

def fishgrid_spacings(metadata, unit='m'):
    """Spacing between grid electrodes.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.
    unit: str
        Unit in which to return the spacings.

    Returns
    -------
    grid_dist: list of tuple of float
        For each grid the distances between rows and columns in `unit`.
    """
    spacings = []
    # up to four grids may be configured:
    for grid in range(1, 5):
        dist_rows = get_number(metadata, unit, f'RowDistance{grid}', default=0)
        dist_cols = get_number(metadata, unit, f'ColumnDistance{grid}', default=0)
        nrows = get_int(metadata, f'Rows{grid}', default=0)
        ncols = get_int(metadata, f'Columns{grid}', default=0)
        # a grid counts if marked used or if it has a non-empty layout:
        if get_bool(metadata, f'Used{grid}', default=False) or \
           (ncols > 0 and nrows > 0):
            spacings.append((dist_rows, dist_cols))
    return spacings

461 

462 

def fishgrid_grids(metadata):
    """Retrieve grid sizes from a fishgrid.cfg file.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.

    Returns
    -------
    grids: list of tuple of int
        For each grid the number of rows and columns.
    """
    sizes = []
    # up to four grids may be configured:
    for grid in range(1, 5):
        nrows = get_int(metadata, f'Rows{grid}', default=0)
        ncols = get_int(metadata, f'Columns{grid}', default=0)
        # a grid counts if marked used or if it has a non-empty layout:
        if get_bool(metadata, f'Used{grid}', default=False) or \
           (ncols > 0 and nrows > 0):
            sizes.append((nrows, ncols))
    return sizes

484 

485 

def check_fishgrid(filepath):
    """Check for valid fishgrid file (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    filepath: str or Path
        Path to a fishgrid data directory or a file in a fishgrid
        data directory.

    Returns
    -------
    is_fishgrid: bool
        `True` if `filepath` is a valid fishgrid data directory or
        a file therein.
    """
    # resolve to the containing fishgrid data directory:
    folder = Path(filepath)
    if not folder.is_dir():
        folder = folder.parent
    # need the configuration file plus at least one traces file:
    if not (folder / 'fishgrid.cfg').is_file():
        return False
    return ((folder / 'traces-grid1.raw').is_file() or
            (folder / 'traces.raw').is_file())

509 

510 

def fishgrid_trace_files(filepath):
    """Expand file paths for fishgrid data to appropriate traces*.raw file names.

    Parameters
    ----------
    filepath: str or Path
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    trace_filepaths: list of Path
        List of fishgrid traces*.raw files.
    """
    folder = Path(filepath)
    if not folder.is_dir():
        folder = folder.parent

    def find(name):
        # return the plain or gzipped file of that name, or None:
        for candidate in (folder / name, folder / (name + '.gz')):
            if candidate.is_file():
                return candidate
        return None

    # collect consecutively numbered per-grid trace files:
    trace_paths = []
    for grid in range(1, 10001):
        path = find(f'traces-grid{grid}.raw')
        if path is None:
            break
        trace_paths.append(path)
    # fall back to the single traces.raw file:
    if not trace_paths:
        path = find('traces.raw')
        if path is not None:
            trace_paths.append(path)
    return trace_paths

546 

547 

def load_fishgrid(filepath):
    """Load traces that have been recorded with fishgrid (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    filepath: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    FileNotFoundError:
        Invalid or non existing fishgrid files.
    """
    trace_filepaths = fishgrid_trace_files(filepath)
    if len(trace_filepaths) == 0:
        raise FileNotFoundError('no fishgrid files found')
    md = metadata_fishgrid(filepath)
    grids = fishgrid_grids(md)
    grid_sizes = [r*c for r, c in grids]

    # load traces-grid*.raw files:
    grid_channels = []
    nchannels = 0
    for g, path in enumerate(trace_filepaths):
        grid_channels.append(grid_sizes[g])
        nchannels += grid_sizes[g]
    data = None
    nrows = 0
    c = 0
    rate = get_number(md, 'Hz', 'AISampleRate')
    for path, channels in zip(trace_filepaths, grid_channels):
        if path.suffix == '.gz':
            with gzip.open(path, 'rb') as sf:
                # fix: reshape the flat buffer into (frames, channels) like
                # the uncompressed branch — otherwise the 2-D assignment
                # below fails for gzipped traces:
                x = np.frombuffer(sf.read(),
                                  dtype=np.float32).reshape((-1, channels))
        else:
            x = np.fromfile(path, np.float32).reshape((-1, channels))
        if data is None:
            # size the output from the first grid's traces:
            nrows = len(x)
            data = np.zeros((nrows, nchannels))
        n = min(len(x), nrows)
        data[:n, c:c + channels] = x[:n, :]
        c += channels
    amax, unit = get_number_unit(md, 'AIMaxVolt')
    return data, rate, unit, amax

604 

605 

# add fishgrid keys:
# register the metadata keys used by fishgrid files with audioio's
# default key lists (imported above), so audioio's generic metadata
# helpers also find start time and gain in fishgrid metadata:
default_starttime_keys.append(['StartDate', 'StartTime'])
default_gain_keys.insert(0, 'AIMaxVolt')

609 

610 

def metadata_fishgrid(filepath):
    """ Read meta-data of a fishgrid data set.

    Parses both the new-style fishgrid.cfg format (sections introduced
    by '*' and indentation) and the old style (sections framed by
    '----' lines).

    Parameters
    ----------
    filepath: str or Path
        A fishgrid data directory or a file therein.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    fishgrid_dir = Path(filepath)
    if not fishgrid_dir.is_dir():
        fishgrid_dir = fishgrid_dir.parent
    config_path = fishgrid_dir / 'fishgrid.cfg'
    gz_config_path = fishgrid_dir / 'fishgrid.cfg.gz'
    # read in header from file:
    lines = []
    if gz_config_path.is_file():
        # fix: gzip must be opened in text mode 'rt' — mode 'r' is binary
        # for gzip.open() and rejects the encoding argument:
        with gzip.open(gz_config_path, 'rt', encoding='latin-1') as sf:
            for line in sf:
                lines.append(line)
    elif config_path.is_file():
        with open(config_path, 'r', encoding='latin-1') as sf:
            for line in sf:
                lines.append(line)
    else:
        return {}
    # parse:
    data = {}
    cdatas = [data]          # stack of nested dictionaries
    ident_offs = None        # indentation of the top level
    ident = None             # indentation per section level
    old_style = False
    grid_n = False           # grid number to append to old-style keys
    for line in lines:
        if len(line.strip()) == 0:
            continue
        if line[0] == '*':
            # new top-level section:
            key = line[1:].strip()
            data[key] = {}
            cdatas = [data, data[key]]
        elif '----' in line:
            # old-style section marker:
            old_style = True
            key = line.strip().strip(' -').replace('&', '')
            if key.upper() == 'SETUP':
                key = 'Grid 1'
                grid_n = False
            if key[:4].lower() == 'grid':
                # NOTE(review): assumes section name 'Grid N' with a
                # single-digit grid number at index 5:
                grid_n = key[5]
            cdatas = cdatas[:2]
            cdatas[1][key] = {}
            cdatas.append(cdatas[1][key])
        else:
            words = line.split(':')
            key = words[0].strip().strip('"')
            value = None
            if len(words) > 1 and (len(words[1].strip()) > 0 or old_style):
                value = ':'.join(words[1:]).strip().strip('"')
            if old_style:
                if value is None:
                    # new sub-section:
                    cdatas = cdatas[:3]
                    cdatas[2][key] = {}
                    cdatas.append(cdatas[2][key])
                else:
                    # make keys unique per grid:
                    if grid_n and key[-1] != grid_n:
                        key = key + grid_n
                    cdatas[-1][key] = value
            else:
                # get section level from indentation:
                level = 0
                nident = len(line) - len(line.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
                # close sections:
                cdatas = cdatas[:2 + level]
                if value is None:
                    # new section:
                    cdatas[-1][key] = {}
                    cdatas.append(cdatas[-1][key])
                else:
                    # key-value pair:
                    cdatas[-1][key] = value.replace(r'\n', '\n')
    # remove unused grids:
    fgm = data.get('FishGrid', {})
    for i in range(4):
        gs = f'Grid {i+1}'
        if gs in fgm:
            gm = fgm[gs]
            us = f'Used{i+1}'
            if us in gm and gm[us].upper() == 'FALSE':
                del fgm[gs]
    return data

712 

713 

def markers_fishgrid(filepath):
    """ Read markers of a fishgrid data set.

    Markers are read from the timestamps.dat file next to the fishgrid
    configuration. Positions and spans are stored in the file as flat
    sample counts over all channels and are converted here to frame
    indices by dividing by the number of channels of the first grid.

    Parameters
    ----------
    filepath: str or Path
        A fishgrid data directory or a file therein.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    def add_marker():
        # convert flat sample counts to frame indices:
        if 'index1' in marker:
            index1 = int(marker['index1'])//nchannels
        else:
            index1 = int(marker['index'])//nchannels
        span1 = int(marker.get('span1', 0))//nchannels
        locs.append([index1, span1])
        ls = marker.get('label', 'M')
        cs = marker.get('comment', '')
        labels.append([ls, cs])

    fishgrid_dir = Path(filepath)
    if not fishgrid_dir.is_dir():
        fishgrid_dir = fishgrid_dir.parent
    path = fishgrid_dir / 'timestamps.dat'
    if not path.is_file():
        # no timestamps file: no markers.
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
    # get number of channels:
    md = metadata_fishgrid(path.with_name('fishgrid.cfg'))
    grids = fishgrid_grids(md)
    nchannels = np.prod(grids[0])
    # read timestamps: blocks of 'key: value' lines separated by blank lines,
    # each block describing one marker:
    locs = []
    labels = []
    marker = {}
    with open(path, 'r') as sf:
        for line in sf:
            if len(line.strip()) == 0:
                add_marker()
                marker = {}
            else:
                words = line.split(':')
                if len(words) > 1:
                    v = words[1].strip()
                    v = v.strip('"')
                    marker[words[0].strip().lower()] = v
    # flush the last block if the file does not end with a blank line:
    if len(marker) > 0:
        add_marker()
    # the first and last timestamps are dropped — presumably they mark
    # recording start and stop rather than user events (TODO confirm):
    if len(locs) > 2:
        return np.array(locs[1:-1]), np.array(labels[1:-1])
    else:
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)

773 

774 

def check_container(filepath):
    """Check if file is a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    filepath: str or Path
        Path of the file to check.

    Returns
    -------
    is_container: bool
        `True`, if `filepath` is a supported container format.
    """
    # decide by file extension only:
    return Path(filepath).suffix.lower() in ('.pkl', '.npz', '.mat')

796 

797 

def extract_container_data(data_dict, datakey=None,
                           samplekey=['rate', 'Fs', 'fs'],
                           timekey=['time'], amplkey=['amax'], unitkey='unit',
                           amax=1.0, unit='a.u.'):
    """Extract data from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is an 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str or list of str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
    amax: None or float
        If specified and no amplitude range has been found in `data_dict`,
        then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in `data_dict`,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range in `unit`.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    def as_keys(key):
        # wrap a single key into a tuple:
        return key if isinstance(key, (list, tuple, np.ndarray)) else (key,)

    samplekey = as_keys(samplekey)
    timekey = as_keys(timekey)
    amplkey = as_keys(amplkey)
    # sampling rate, either directly or derived from sampling times:
    rate = 0.0
    for skey in samplekey:
        if skey in data_dict:
            rate = float(data_dict[skey])
            break
    if rate == 0.0:
        for tkey in timekey:
            if tkey in data_dict:
                times = data_dict[tkey]
                rate = 1.0/(times[1] - times[0])
                break
    if rate == 0.0:
        raise ValueError(f"invalid keys {', '.join(samplekey)} and {', '.join(timekey)} for requesting sampling rate or sampling times")
    # amplitude range and unit:
    for akey in amplkey:
        if akey in data_dict:
            amax = float(data_dict[akey])
            break
    if unitkey in data_dict:
        unit = data_dict[unitkey]
    # pick the data array:
    raw_data = np.array([])
    if datakey:
        # try data keys:
        for dkey in as_keys(datakey):
            if dkey in data_dict:
                raw_data = data_dict[dkey]
                break
        if len(raw_data) == 0:
            raise ValueError(f"invalid key(s) {', '.join(as_keys(datakey))} for requesting data")
    else:
        # find largest 1D or 2D array:
        for name in data_dict:
            item = data_dict[name]
            if hasattr(item, 'shape') and 1 <= len(item.shape) <= 2 and \
               np.max(item.shape) > np.max(raw_data.shape):
                raw_data = item
        if len(raw_data) == 0:
            raise ValueError('no data found')
    # make 2D with time as first dimension:
    if len(raw_data.shape) == 1:
        raw_data = raw_data.reshape(-1, 1)
    if np.argmax(raw_data.shape) > 0:
        raw_data = raw_data.T
    # rescale integer data to floats in [-amax, amax]:
    int_scales = {np.dtype('int16'): ('float32', 2**15),
                  np.dtype('int32'): (float, 2**31),
                  np.dtype('int64'): (float, 2**63)}
    if raw_data.dtype in int_scales:
        ftype, denom = int_scales[raw_data.dtype]
        data = raw_data.astype(ftype)
        data *= amax/denom
    else:
        data = raw_data
    return data, rate, unit, amax

911 

912 

def load_container(filepath, datakey=None,
                   samplekey=['rate', 'Fs', 'fs'],
                   timekey=['time'], amplkey=['amax'], unitkey='unit',
                   amax=1.0, unit='a.u.'):
    """Load data from a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    filepath: str or Path
        Path of the file to load.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is an 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
        If `unitkey` is not a valid key, then return `unitkey` as the `unit`.
    amax: None or float
        If specified and no amplitude range has been found in the data
        container, then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in the data container,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    # load the container into a dictionary, depending on file type:
    filepath = Path(filepath)
    suffix = filepath.suffix.lower()
    data_dict = {}
    if suffix == '.pkl':
        # NOTE: pickle.load() executes arbitrary code — only open trusted files.
        with open(filepath, 'rb') as f:
            data_dict = pickle.load(f)
    elif suffix == '.npz':
        data_dict = np.load(filepath)
    elif suffix == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(filepath, squeeze_me=True)
    return extract_container_data(data_dict, datakey, samplekey,
                                  timekey, amplkey, unitkey, amax, unit)

982 

983 

def extract_container_metadata(data_dict, metadatakey=['metadata', 'info']):
    """ Extract metadata from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    if not isinstance(metadatakey, (list, tuple, np.ndarray)):
        metadatakey = (metadatakey,)
    # metadata stored as a single dictionary:
    for mkey in metadatakey:
        if mkey in data_dict:
            return data_dict[mkey]
    # otherwise collect all flattened keys starting with a metadata key:
    collected = {}
    for mkey in metadatakey:
        prefix = mkey + '__'
        for dkey in data_dict:
            if dkey.startswith(prefix):
                value = data_dict[dkey]
                # unwrap 0-d arrays to plain scalars:
                if hasattr(value, 'size') and value.ndim == 0:
                    value = value.item()
                collected[dkey[len(prefix):]] = value
    if len(collected) > 0:
        return unflatten_metadata(collected, sep='__')
    return collected

1018 

1019 

def metadata_container(filepath, metadatakey=['metadata', 'info']):
    """ Read meta-data of a container file.

    Parameters
    ----------
    filepath: str or Path
        A container file.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    # load all variables stored in the container:
    path = Path(filepath)
    suffix = path.suffix.lower()
    data_dict = {}
    if suffix == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(path, squeeze_me=True)
    elif suffix == '.npz':
        data_dict = np.load(path)
    elif suffix == '.pkl':
        with open(path, 'rb') as pf:
            data_dict = pickle.load(pf)
    # pick out the metadata:
    return extract_container_metadata(data_dict, metadatakey)

1047 

1048 

def extract_container_markers(data_dict, poskey=['positions'],
                              spanskey=['spans'], labelskey=['labels'],
                              descrkey=['descriptions']):
    """ Extract markers from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    _missing = object()   # sentinel: distinguishes absent keys from None values

    def first_hit(keys):
        # return the value of the first key present in data_dict:
        if not isinstance(keys, (list, tuple, np.ndarray)):
            keys = (keys,)
        for key in keys:
            if key in data_dict:
                return data_dict[key]
        return _missing

    locs = np.zeros((0, 2), dtype=int)
    positions = first_hit(poskey)
    if positions is not _missing:
        locs = np.zeros((len(positions), 2), dtype=int)
        locs[:, 0] = positions
    spans = first_hit(spanskey)
    if spans is not _missing:
        locs[:, 1] = spans
    labels = np.zeros((0, 2), dtype=object)
    names = first_hit(labelskey)
    if names is not _missing:
        labels = np.zeros((len(names), 2), dtype=object)
        labels[:, 0] = names
    descriptions = first_hit(descrkey)
    if descriptions is not _missing:
        labels[:, 1] = descriptions
    return locs, labels

1105 

1106 

def markers_container(filepath, poskey=['positions'],
                      spanskey=['spans'], labelskey=['labels'],
                      descrkey=['descriptions']):
    """ Read markers of a container file.

    Parameters
    ----------
    filepath: str or Path
        A container file.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    # load all variables stored in the container:
    path = Path(filepath)
    suffix = path.suffix.lower()
    data_dict = {}
    if suffix == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(path, squeeze_me=True)
    elif suffix == '.npz':
        data_dict = np.load(path)
    elif suffix == '.pkl':
        with open(path, 'rb') as pf:
            data_dict = pickle.load(pf)
    # pick out the markers:
    return extract_container_markers(data_dict, poskey, spanskey,
                                     labelskey, descrkey)

1147 

1148 

def check_raw(filepath):
    """Check if file is a raw file.

    The following extensions are interpreted as raw files:

    - raw files (*.raw)
    - LabView scandata (*.scandat)

    Parameters
    ----------
    filepath: str or Path
        Path of the file to check.

    Returns
    -------
    is_raw: bool
        `True`, if `filepath` is a raw format.
    """
    suffix = Path(filepath).suffix.lower()
    return suffix in ('.raw', '.scandat')

1169 

1170 

def load_raw(filepath, rate=44000, channels=1, encoding='FLOAT',
             amax=1.0, unit='a.u.'):
    """Load data from a raw file.

    Raw files just contain the data and absolutely no metadata, not
    even the sampling rate, number of channels, etc.
    Supported file formats are:

    - raw files (*.raw)
    - LabView scandata (*.scandat)

    Parameters
    ----------
    filepath: str or Path
        Path of the file to load.
    rate: float
        Sampling rate of the data in Hertz.
    channels: int
        Number of channels multiplexed in the data.
    encoding: str
        The encoding of the data stored in the file.
        Valid encodings are 'PCM_16', 'PCM_32', 'PCM_64', 'FLOAT', or
        'DOUBLE' or lower-case versions thereof.
    amax: float
        The amplitude range of the data.
    unit: str
        The unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        Invalid encoding.
    """
    codes = dict(PCM_16='i2',
                 PCM_32='i4',
                 PCM_64='i8',
                 FLOAT='f',
                 DOUBLE='d')
    encoding = encoding.upper()
    if encoding not in codes:
        raise ValueError(f'invalid encoding {encoding} for raw file!')
    dtype = np.dtype(codes[encoding])
    raw_data = np.fromfile(filepath, dtype=dtype).reshape(-1, channels)
    # number of significant bits of the integer encodings,
    # used to rescale integer samples to the amplitude range:
    bits = {np.dtype('int16'): 15,
            np.dtype('int32'): 31,
            np.dtype('int64'): 63}
    if dtype in bits:
        # 16-bit samples fit into single precision, wider ones need double:
        target = 'float32' if dtype == np.dtype('int16') else float
        data = raw_data.astype(target)
        data *= amax/2**bits[dtype]
    else:
        # float and double data are taken as is:
        data = raw_data
    return data, rate, unit, amax

1240 

1241 

def load_audioio(filepath, verbose=0, gainkey=default_gain_keys, sep='.',
                 amax=1.0, unit='a.u.'):
    """Load data from an audio file.

    See the
    [`load_audio()`](https://bendalab.github.io/audioio/api/audioloader.html#audioio.audioloader.load_audio)
    function of the [`audioio`](https://github.com/bendalab/audioio)
    package for more infos.

    Parameters
    ----------
    filepath: str or Path
        Path of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    gainkey: str or list of str
        Key in the file's metadata that holds some gain information.
        If found, the data will be multiplied with the gain,
        and if available, the corresponding unit is returned.
        See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
    sep: str
        String that separates section names in `gainkey`.
    amax: float
        If specified and no gain has been found in the metadata,
        then use this as the amplitude range.
    unit: str
        If specified and no gain has been found in the metadata,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data if found in the metadata (see `gainkey`),
        otherwise `unit`.
    amax: float
        Maximum amplitude of data range.
    """
    # retrieve gain factor and its unit from the file's metadata:
    meta_data = metadata_audioio(filepath)
    amax, unit = get_gain(meta_data, gainkey, sep, amax, unit)
    # read the audio data and scale it to the amplitude range:
    data, rate = load_audio(filepath, verbose)
    if amax != 1.0:
        data *= amax
    return data, rate, unit, amax

1292 

1293 

data_loader_funcs = (
    ('relacs', check_relacs, load_relacs, metadata_relacs, None),
    ('fishgrid', check_fishgrid, load_fishgrid, metadata_fishgrid, markers_fishgrid),
    ('container', check_container, load_container, metadata_container, markers_container),
    ('raw', check_raw, load_raw, None, None),
    ('audioio', None, load_audioio, metadata_audioio, markers_audioio),
    )
"""List of implemented load functions.

Each element of the list is a tuple with the data format's name, its
check function, its load function, its metadata function, and its
markers function.  A `None` entry means that this functionality is
not implemented for the format.  A `None` check function matches any
file, so the 'audioio' entry serves as the final fallback.
"""

1307 

1308 

def load_data(filepath, verbose=0, **kwargs):
    """Load time-series data from a file.

    Parameters
    ----------
    filepath: str or Path
        Path and name of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.
        For example:
        - `amax`: the amplitude range of the data.
        - 'unit': the unit of the data.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.
    """
    # the first format whose check passes (a None check matches any file)
    # loads the data:
    for name, check_file, load_file, _, _ in data_loader_funcs:
        if check_file is not None and not check_file(filepath):
            continue
        data, rate, unit, amax = load_file(filepath, **kwargs)
        if verbose > 0:
            print(f'loaded {name} data from file "{filepath}"')
            if verbose > 1:
                print(f' sampling rate: {rate:g} Hz')
                print(f' channels : {data.shape[1]}')
                print(f' frames : {len(data)}')
                print(f' range : {amax:g}{unit}')
        return data, rate, unit, amax
    # no format matched:
    return np.zeros((0, 1)), 0.0, '', 1.0

1350 

1351 

def metadata(filepath, **kwargs):
    """ Read meta-data from a data file.

    Parameters
    ----------
    filepath: str or Path
        The full path and name of the file to load. For some file
        formats several files can be provided in a list.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.

    Returns
    -------
    meta_data: nested dict
        Meta data contained in the file. Keys of the nested
        dictionaries are always strings. If the corresponding
        values are dictionaries, then the key is the section name
        of the metadata contained in the dictionary. All other
        types of values are values for the respective key. In
        particular they are strings, or list of strings. But other
        simple types like ints or floats are also allowed.
    """
    # the first matching format that implements metadata loading wins;
    # formats without a metadata function are skipped:
    for entry in data_loader_funcs:
        check_file, metadata_file = entry[1], entry[3]
        if check_file is not None and not check_file(filepath):
            continue
        if metadata_file is not None:
            return metadata_file(filepath, **kwargs)
    return {}

1381 

1382 

def markers(filepath):
    """ Read markers of a data file.

    Parameters
    ----------
    filepath: str or Path
        The data file.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    # the first matching format that implements marker loading wins;
    # formats without a markers function are skipped:
    for entry in data_loader_funcs:
        check_file, markers_file = entry[1], entry[4]
        if check_file is not None and not check_file(filepath):
            continue
        if markers_file is not None:
            return markers_file(filepath)
    return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)

1406 

1407 

1408class DataLoader(AudioLoader): 

1409 """Buffered reading of time-series data for random access of the data in the file. 

1410  

1411 This allows for reading very large data files that do not fit into 

1412 memory. A `DataLoader` instance can be used like a huge 

1413 read-only numpy array, i.e. 

1414 ``` 

1415 data = DataLoader('path/to/data/file.dat') 

1416 x = data[10000:20000,0] 

1417 ``` 

1418 The first index specifies the frame, the second one the channel. 

1419 

1420 `DataLoader` first determines the format of the data file and then 

1421 opens the file (first line). It then reads data from the file as 

1422 necessary for the requested data (second line). 

1423 

1424 Supported file formats are 

1425  

1426 - python pickle files 

1427 - numpy .npz files 

1428 - matlab .mat files 

1429 - audio files via [`audioio`](https://github.com/bendalab/audioio) package 

1430 - LabView .scandat files 

1431 - raw files 

1432 - relacs files (https://www.relacs.net) 

1433 - fishgrid files (https://github.com/bendalab/fishgrid) 

1434 

1435 Reading sequentially through the file is always possible. If 

1436 previous data are requested, then the file is read from the 

1437 beginning. This might slow down access to previous data 

1438 considerably. Use the `backsize` argument to the open functions to 

1439 make sure some data are loaded before the requested frame. Then a 

1440 subsequent access to the data within `backsize` seconds before that 

1441 frame can still be handled without the need to reread the file 

1442 from the beginning. 

1443 

1444 Usage: 

1445 ------ 

1446 ``` 

1447 import thunderlab.dataloader as dl 

1448 with dl.DataLoader(filepath, 60.0, 10.0) as data: 

1449 # do something with the content of the file: 

1450 x = data[0:10000,0] 

1451 y = data[10000:20000,0] 

1452 z = x + y 

1453 ``` 

1454  

1455 Normal open and close: 

1456 ``` 

1457 data = dl.DataLoader(filepath, 60.0) 

1458 x = data[:,:] # read the whole file 

1459 data.close() 

1460 ```  

1461 that is the same as: 

1462 ``` 

1463 data = dl.DataLoader() 

1464 data.open(filepath, 60.0) 

1465 ``` 

1466  

1467 Parameters 

1468 ---------- 

1469 filepath: str or Path 

1470 Path of the data file. 

1471 buffersize: float 

1472 Size of internal buffer in seconds. 

1473 backsize: float 

1474 Part of the buffer to be loaded before the requested start index in seconds. 

1475 verbose: int 

1476 If larger than zero show detailed error/warning messages. 

1477 meta_kwargs: dict 

1478 Keyword arguments that are passed on to the _load_metadata() function. 

1479 **kwargs: dict 

1480 Further keyword arguments that are passed on to the  

1481 specific open() functions. 

1482 

1483 Attributes 

1484 ---------- 

1485 filepath: Path 

1486 Name and path of the opened file. In case of many files, the first one. 

1487 file_paths: list of Path 

1488 List of pathes of the opened files that are made accessible 

1489 as a single array. 

1490 file_indices: list of int 

1491 For each file the index of its first sample. 

1492 rate: float 

1493 The sampling rate of the data in Hertz. 

1494 channels: int 

1495 The number of channels that are read in. 

1496 frames: int 

1497 The number of frames in the file. 

1498 format: str or None 

1499 Format of the audio file. 

1500 encoding: str or None 

1501 Encoding/subtype of the audio file. 

1502 shape: tuple 

1503 Number of frames and channels of the data. 

1504 ndim: int 

1505 Number of dimensions: always 2 (frames and channels). 

1506 offset: int 

1507 Index of first frame in the current buffer. 

1508 buffer: ndarray of floats 

1509 The curently available data from the file. 

1510 unit: str 

1511 Unit of the data. 

1512 ampl_min: float 

1513 Minimum amplitude the file format supports. 

1514 ampl_max: float 

1515 Maximum amplitude the file format supports. 

1516 

1517 Methods 

1518 ------- 

1519 

1520 - `len()`: the number of frames 

1521 - `open()`: open a data file. 

1522 - `open_*()`: open a data file of a specific format. 

1523 - `close()`: close the file. 

1524 - `basename()`: Base name of the audio data. 

1525 - `format_dict()`: technical infos about how the data are stored. 

1526 - `metadata()`: metadata of the file. 

1527 - `markers()`: markers of the file. 

1528 - `set_unwrap()`: Set parameters for unwrapping clipped data. 

1529 

1530 See audioio.audioloader.AudioLoader for more methods. 

1531 

1532 """ 

1533 

1534 def __init__(self, filepath=None, buffersize=10.0, backsize=0.0, 

1535 verbose=0, meta_kwargs={}, **kwargs): 

1536 super().__init__(None, buffersize, backsize, 

1537 verbose, meta_kwargs) 

1538 if filepath is not None: 

1539 self.open(filepath, buffersize, backsize, verbose, **kwargs) 

1540 

1541 def __getitem__(self, key): 

1542 return super(DataLoader, self).__getitem__(key) 

1543 

1544 def __next__(self): 

1545 return super(DataLoader, self).__next__() 

1546 

1547 

1548 # relacs interface:  

    def open_relacs(self, filepath, buffersize=10.0, backsize=0.0,
                    verbose=0, amax=1.0):
        """Open relacs data files (www.relacs.net) for reading.

        Each relacs trace file holds one channel of raw 4-byte float
        samples. All traces must agree in sampling rate and unit.

        Parameters
        ----------
        filepath: str
            Path to a relacs data directory or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        amax: float
            The amplitude range of the data.

        Raises
        ------
        FileNotFoundError:
            Invalid or non existing relacs files.
        ValueError:
            .gz files not supported.
        """
        self.verbose = verbose

        # open trace files:
        filepath = Path(filepath)
        self.trace_filepaths = relacs_trace_files(filepath)
        if len(self.trace_filepaths) == 0:
            raise FileNotFoundError('no relacs files found')
        self.sf = []        # one open file object per trace
        self.frames = None  # common number of frames of all traces
        self.rate = None    # common sampling rate of all traces
        self.unit = ''      # common unit of all traces
        self.filepath = filepath
        self.file_paths = [self.filepath]
        self.file_indices = [0]
        for path in self.trace_filepaths:
            if path.suffix == '.gz':
                raise ValueError('.gz files not supported')
            sf = open(path, 'rb')
            self.sf.append(sf)
            if self.verbose > 0:
                print(f'open_relacs("{path}")')
            # file size: 4 bytes per sample, one channel per file:
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                # tolerate traces that differ by a few frames and keep
                # the smallest frame count; larger differences indicate
                # inconsistent files:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
            # retrieve sampling rate and unit:
            rate, us = relacs_samplerate_unit(path)
            if self.rate is None:
                self.rate = rate
            elif rate != self.rate:
                raise ValueError('sampling rates of traces differ')
            if len(self.unit) == 0:
                self.unit = us
            elif us != self.unit:
                raise ValueError('unit of traces differ')
        self.channels = len(self.sf)  # one channel per trace file
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'RELACS'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        # install relacs-specific implementations:
        self.close = self._close_relacs
        self.load_audio_buffer = self._load_buffer_relacs
        self.basename = self._basename_relacs
        self.ampl_min = -amax
        self.ampl_max = +amax
        self._load_metadata = metadata_relacs
        # TODO: load markers:
        self._locs = np.zeros((0, 2), dtype=int)
        self._labels = np.zeros((0, 2), dtype=object)
        self._load_markers = None
        return self

1637 

1638 def _close_relacs(self): 

1639 """Close the relacs data files. 

1640 """ 

1641 for f in self.sf: 

1642 f.close() 

1643 self.sf = [] 

1644 

1645 def _load_buffer_relacs(self, r_offset, r_size, buffer): 

1646 """Load new data from relacs data file. 

1647 

1648 Parameters 

1649 ---------- 

1650 r_offset: int 

1651 First frame to be read from file. 

1652 r_size: int 

1653 Number of frames to be read from file. 

1654 buffer: ndarray 

1655 Buffer where to store the loaded data. 

1656 """ 

1657 if len(self.sf) == 0 and len(self.trace_filepaths) > 0: 

1658 for path in self.trace_filepaths: 

1659 self.sf.append(open(path, 'rb')) 

1660 for i, f in enumerate(self.sf): 

1661 f.seek(r_offset*4) 

1662 data = f.read(r_size*4) 

1663 buffer[:, i] = np.frombuffer(data, dtype=np.float32) 

1664 

1665 

1666 def _basename_relacs(self, path=None): 

1667 """ Base name of the relacs data files. 

1668 

1669 Parameters 

1670 ---------- 

1671 path: str or None 

1672 Path of a relacs data file (*.raw, info.dat, or just the directory). 

1673 If `None`, use `self.filepath`. 

1674 

1675 Returns 

1676 ------- 

1677 s: str 

1678 The base name, i.e. the name of the directory containing the 

1679 relacs data files. 

1680 

1681 """ 

1682 if path is None: 

1683 path = self.filepath 

1684 else: 

1685 path = Path(path) 

1686 if path.is_dir(): 

1687 return path.name 

1688 else: 

1689 return path.parent.name 

1690 

1691 

1692 # fishgrid interface:  

    def open_fishgrid(self, filepath, buffersize=10.0, backsize=0.0,
                      verbose=0):
        """Open fishgrid data files (https://github.com/bendalab/fishgrid) for reading.

        Each trace file holds the channels of one electrode grid,
        multiplexed as 4-byte float samples. Sampling rate and
        amplitude range are read from the fishgrid metadata.

        Parameters
        ----------
        filepath: str
            Path to a fishgrid data directory, or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.

        Raises
        ------
        FileNotFoundError:
            Invalid or non existing fishgrid files.
        ValueError:
            .gz files not supported.
        """
        self.verbose = verbose

        filepath = Path(filepath)
        self.trace_filepaths = fishgrid_trace_files(filepath)
        if len(self.trace_filepaths) == 0:
            raise FileNotFoundError(f'no fishgrid files found')
        self.filepath = filepath
        self.file_paths = [self.filepath]
        self.file_indices = [0]
        # metadata and markers are loaded on demand via these hooks:
        self._load_metadata = metadata_fishgrid
        self._load_markers = markers_fishgrid

        # open grid files:
        grids = fishgrid_grids(self.metadata())
        grid_sizes = [r*c for r,c in grids]  # channels per grid (rows x columns)
        self.channels = 0
        for g, path in enumerate(self.trace_filepaths):
            self.channels += grid_sizes[g]
        self.sf = []             # one open file object per grid
        self.grid_channels = []  # number of channels of each grid
        self.grid_offs = []      # channel offset of each grid in the full array
        offs = 0
        self.frames = None
        # sampling rate and amplitude range from the fishgrid metadata:
        self.rate = get_number(self.metadata(), 'Hz', 'AISampleRate')
        v, self.unit = get_number_unit(self.metadata(), 'AIMaxVolt')
        if v is not None:
            self.ampl_min = -v
            self.ampl_max = +v

        for g, path in enumerate(self.trace_filepaths):
            if path.suffix == '.gz':
                raise ValueError('.gz files not supported')
            sf = open(path, 'rb')
            self.sf.append(sf)
            if self.verbose > 0:
                print(f'open_fishgrid("{path}")')
            # grid channels:
            self.grid_channels.append(grid_sizes[g])
            self.grid_offs.append(offs)
            offs += grid_sizes[g]
            # file size: 4 bytes per sample, multiplexed over the grid channels:
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4//grid_sizes[g]
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                # tolerate traces that differ by a few frames and keep
                # the smallest frame count; larger differences indicate
                # inconsistent files:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'FISHGRID'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        # install fishgrid-specific implementations:
        self.close = self._close_fishgrid
        self.load_audio_buffer = self._load_buffer_fishgrid
        self.basename = self._basename_fishgrid
        return self

1780 

1781 def _close_fishgrid(self): 

1782 """Close the fishgrid data files. 

1783 """ 

1784 for file in self.sf: 

1785 file.close() 

1786 self.sf = [] 

1787 

1788 def _load_buffer_fishgrid(self, r_offset, r_size, buffer): 

1789 """Load new data from relacs data file. 

1790 

1791 Parameters 

1792 ---------- 

1793 r_offset: int 

1794 First frame to be read from file. 

1795 r_size: int 

1796 Number of frames to be read from file. 

1797 buffer: ndarray 

1798 Buffer where to store the loaded data. 

1799 """ 

1800 if len(self.sf) == 0 and len(self.trace_filepaths) > 0: 

1801 for path in self.trace_filepaths: 

1802 self.sf.append(open(path, 'rb')) 

1803 for file, gchannels, goffset in zip(self.sf, self.grid_channels, self.grid_offs): 

1804 file.seek(r_offset*4*gchannels) 

1805 data = file.read(r_size*4*gchannels) 

1806 buffer[:, goffset:goffset+gchannels] = np.frombuffer(data, dtype=np.float32).reshape((-1, gchannels)) 

1807 

1808 def _basename_fishgrid(self, path=None): 

1809 """ Base name of the fishgrid data files. 

1810 

1811 Parameters 

1812 ---------- 

1813 path: str or Path or None 

1814 Path of a fishgrid data file 

1815 (*.raw, fishgrid.cfg, or just the directory). 

1816 If `None`, use `self.filepath`. 

1817 

1818 Returns 

1819 ------- 

1820 s: str 

1821 The base name, i.e. the name of the directory containing the 

1822 fishgrid data files. 

1823 

1824 """ 

1825 if path is None: 

1826 path = self.filepath 

1827 else: 

1828 path = Path(path) 

1829 if path.is_dir(): 

1830 return path.name 

1831 else: 

1832 return path.parent.name 

1833 

1834 

1835 

1836 # container interface: 

    def open_container(self, filepath, buffersize=10.0,
                       backsize=0.0, verbose=0, datakey=None,
                       samplekey=['rate', 'Fs', 'fs'],
                       timekey=['time'], amplkey=['amax'], unitkey='unit',
                       metadatakey=['metadata', 'info'],
                       poskey=['positions'],
                       spanskey=['spans'], labelskey=['labels'],
                       descrkey=['descriptions'],
                       amax=1.0, unit='a.u.'):
        """Open generic container file.

        The whole file content is loaded into memory at once; the
        internal buffer then simply holds all the data.

        Supported file formats are:

        - python pickle files (.pkl)
        - numpy files (.npz)
        - matlab files (.mat)

        Parameters
        ----------
        filepath: str
            Path to a container file.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        datakey: None, str, or list of str
            Name of the variable holding the data. If `None` take the
            variable that is an 2D array and has the largest number of
            elements.
        samplekey: str or list of str
            Name of the variable holding the sampling rate.
        timekey: str or list of str
            Name of the variable holding sampling times.
            If no sampling rate is available, the sampling rate is retrieved
            from the sampling times.
        amplkey: str or list of str
            Name of the variable holding the amplitude range of the data.
        unitkey: str
            Name of the variable holding the unit of the data.
        metadatakey: str or list of str
            Name of the variable holding the metadata.
        poskey: str or list of str
            Name of the variable holding positions of markers.
        spanskey: str or list of str
            Name of the variable holding spans of markers.
        labelskey: str or list of str
            Name of the variable holding labels of markers.
        descrkey: str or list of str
            Name of the variable holding descriptions of markers.
        amax: None or float
            If specified and no amplitude range has been found in the data
            container, then this is the amplitude range of the data.
        unit: None or str
            If specified and no unit has been found in the data container,
            then return this as the unit of the data.

        Raises
        ------
        ValueError:
            Invalid key requested.
        """
        self.verbose = verbose
        # load all variables stored in the container:
        data_dict = {}
        filepath = Path(filepath)
        ext = filepath.suffix.lower()
        if ext == '.pkl':
            with open(filepath, 'rb') as f:
                data_dict = pickle.load(f)
            self.format = 'PKL'
        elif ext == '.npz':
            data_dict = np.load(filepath)
            self.format = 'NPZ'
        elif ext == '.mat':
            from scipy.io import loadmat
            data_dict = loadmat(filepath, squeeze_me=True)
            self.format = 'MAT'
        if self.verbose > 0:
            print(f'open_container("{filepath}")')
        # the buffer holds the complete data set:
        self.buffer, self.rate, self.unit, amax = \
            extract_container_data(data_dict, datakey, samplekey,
                                   timekey, amplkey, unitkey, amax, unit)
        self.filepath = filepath
        self.file_paths = [self.filepath]
        self.file_indices = [0]
        self.channels = self.buffer.shape[1]
        self.frames = self.buffer.shape[0]
        self.shape = self.buffer.shape
        self.ndim = self.buffer.ndim
        self.size = self.buffer.size
        self.encoding = self.numpy_encodings[self.buffer.dtype]
        self.ampl_min = -amax
        self.ampl_max = +amax
        self.offset = 0
        self.buffer_changed = np.zeros(self.channels, dtype=bool)
        self.bufferframes = self.frames  # buffer always holds all frames
        self.backsize = 0
        self.close = self._close_container
        self.load_audio_buffer = self._load_buffer_container
        # metadata and markers are extracted right away, so the
        # on-demand loader hooks are disabled:
        self._metadata = extract_container_metadata(data_dict, metadatakey)
        self._load_metadata = None
        self._locs, self._labels = extract_container_markers(data_dict,
                                                             poskey,
                                                             spanskey,
                                                             labelskey,
                                                             descrkey)
        self._load_markers = None

1945 

1946 def _close_container(self): 

1947 """Close container. """ 

1948 pass 

1949 

1950 def _load_buffer_container(self, r_offset, r_size, buffer): 

1951 """Load new data from container.""" 

1952 buffer[:, :] = self.buffer[r_offset:r_offset + r_size, :] 

1953 

1954 

1955 # raw data interface: 

1956 def open_raw(self, filepath, buffersize=10.0, backsize=0.0, 

1957 verbose=0, rate=44000, channels=1, encoding='FLOAT', 

1958 amax=1.0, unit='a.u.'): 

1959 """Load data from a raw file. 

1960 

1961 Raw files just contain the data and absolutely no metadata, not 

1962 even the smapling rate, number of channels, etc. 

1963 Supported file formats are: 

1964 

1965 - raw files (*.raw) 

1966 - LabView scandata (*.scandat) 

1967 

1968 Parameters 

1969 ---------- 

1970 filepath: str or Path 

1971 Path of the file to load. 

1972 buffersize: float 

1973 Size of internal buffer in seconds. 

1974 backsize: float 

1975 Part of the buffer to be loaded before the requested start index in seconds. 

1976 verbose: int 

1977 If > 0 show detailed error/warning messages. 

1978 rate: float 

1979 Sampling rate of the data in Hertz. 

1980 channels: int 

1981 Number of channels multiplexed in the data. 

1982 encoding: str 

1983 The encoding of the data stored in the file. 

1984 Valid encodings are 'PCM_16', 'PCM_32', 'PCM_64', 'FLOAT', or 

1985 'DOUBLE' or lower-case versions thereof. 

1986 amax: float 

1987 The amplitude range of the data. 

1988 unit: str 

1989 The unit of the data. 

1990 """ 

1991 encodings = {'PCM_16': 'i2', 

1992 'PCM_32': 'i4', 

1993 'PCM_64': 'i8', 

1994 'FLOAT': 'f', 

1995 'DOUBLE': 'd'} 

1996 encoding = encoding.upper() 

1997 if not encoding in encodings: 

1998 raise ValueError(f'invalid encoding {encoding} for raw file!') 

1999 self.dtype = np.dtype(encodings[encoding]) 

2000 self.verbose = verbose 

2001 self.filepath = Path(filepath) 

2002 self.file_paths = [self.filepath] 

2003 self.file_indices = [0] 

2004 self.sf = open(self.filepath, 'rb') 

2005 if self.verbose > 0: 

2006 print(f'open_raw("{self.filepath}")') 

2007 self.rate = float(rate) 

2008 # file size: 

2009 self.channels = int(channels) 

2010 self.sf.seek(0, os.SEEK_END) 

2011 self.frames = self.sf.tell()//self.dtype.itemsize//self.channels 

2012 self.sf.seek(0) 

2013 self.shape = (self.frames, self.channels) 

2014 self.ndim = len(self.shape) 

2015 self.size = self.frames*self.channels 

2016 self.format = 'RAW' 

2017 self.encoding = self.numpy_encodings.get(self.dtype, 'UNKNOWN') 

2018 self.unit = unit 

2019 self.ampl_max = float(amax) 

2020 self.ampl_min = -self.ampl_max 

2021 self.offset = 0 

2022 self.bufferframes = int(buffersize*self.rate) 

2023 self.backframes = int(backsize*self.rate) 

2024 self.init_buffer() 

2025 self.close = self._close_raw 

2026 self.load_audio_buffer = self._load_buffer_raw 

2027 self._metadata = None 

2028 self._load_metadata = None 

2029 self._locs = None 

2030 self._labels = None 

2031 self._load_markers = None 

2032 

2033 def _close_raw(self): 

2034 """Close raw file. """ 

2035 if self.sf is not None: 

2036 self.sf.close() 

2037 self.sf = None 

2038 

2039 def _load_buffer_raw(self, r_offset, r_size, buffer): 

2040 """Load new data from container.""" 

2041 if self.sf is None: 

2042 self.sf = open(self.filepath, 'rb') 

2043 self.sf.seek(r_offset*self.dtype.itemsize*self.channels) 

2044 raw_data = self.sf.read(r_size*self.dtype.itemsize*self.channels) 

2045 raw_data = np.frombuffer(raw_data, dtype=self.dtype) 

2046 raw_data = raw_data.reshape(-1, self.channels) 

2047 # recode: 

2048 if self.dtype == np.dtype('int16'): 

2049 data = raw_data.astype('float32') 

2050 data *= self.ampl_max/2**15 

2051 elif self.dtype == np.dtype('int32'): 

2052 data = raw_data.astype(float) 

2053 data *= self.ampl_max/2**31 

2054 elif self.dtype == np.dtype('int64'): 

2055 data = raw_data.astype(float) 

2056 data *= self.ampl_max/2**63 

2057 else: 

2058 data = raw_data 

2059 buffer[:, :] = data 

2060 

2061 

2062 # audioio interface:  

2063 def open_audioio(self, filepath, buffersize=10.0, backsize=0.0, 

2064 verbose=0, gainkey=default_gain_keys, sep='.', 

2065 amax=None, unit='a.u.'): 

2066 """Open an audio file. 

2067 

2068 See the [audioio](https://github.com/bendalab/audioio) package 

2069 for details. 

2070 

2071 Parameters 

2072 ---------- 

2073 filepath: str 

2074 Path to an audio file. 

2075 buffersize: float 

2076 Size of internal buffer in seconds. 

2077 backsize: float 

2078 Part of the buffer to be loaded before the requested start index 

2079 in seconds. 

2080 verbose: int 

2081 If > 0 show detailed error/warning messages. 

2082 gainkey: str or list of str 

2083 Key in the file's metadata that holds some gain information. 

2084 If found, the data will be multiplied with the gain, 

2085 and if available, the corresponding unit is returned. 

2086 See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details. 

2087 sep: str 

2088 String that separates section names in `gainkey`. 

2089 amax: None or float 

2090 If specified and no gain has been found in the metadata, 

2091 then use this as the amplitude range. 

2092 unit: None or str 

2093 If specified and no gain has been found in the metadata, 

2094 then this is the unit of the data. 

2095 

2096 """ 

2097 self.verbose = verbose 

2098 super(DataLoader, self).open(filepath, buffersize, backsize, verbose) 

2099 md = self.metadata() 

2100 fac, unit = get_gain(md, gainkey, sep, amax, unit) 

2101 if fac is None: 

2102 self.gain_fac = 1.0 

2103 else: 

2104 self.gain_fac = fac 

2105 self._load_buffer_audio_org = self.load_audio_buffer 

2106 self.load_audio_buffer = self._load_buffer_audioio 

2107 self.ampl_min *= self.gain_fac 

2108 self.ampl_max *= self.gain_fac 

2109 self.unit = unit 

2110 return self 

2111 

    def _load_buffer_audioio(self, r_offset, r_size, buffer):
        """Load and scale new data from an audio file.

        Delegates loading to the original audioio buffer loader stored
        in `self._load_buffer_audio_org` (see `open_audioio()`) and then
        multiplies the buffer in place with `self.gain_fac`.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        self._load_buffer_audio_org(r_offset, r_size, buffer)
        buffer *= self.gain_fac

2126 

2127 

2128 # open multiple files as one: 

2129 def open_multiple(self, filepaths, buffersize=10.0, backsize=0.0, 

2130 verbose=0, mode='strict', rate=None, channels=None, 

2131 unit=None, amax=None, end_indices=None): 

2132 """Open multiple files as a single concatenated array. 

2133 

2134 Parameters 

2135 ---------- 

2136 filepaths: list of str or Path 

2137 List of file paths of audio files. 

2138 buffersize: float 

2139 Size of internal buffer in seconds. 

2140 backsize: float 

2141 Part of the buffer to be loaded before the requested start index in seconds. 

2142 verbose: int 

2143 If larger than zero show detailed error/warning messages. 

2144 mode: 'relaxed' or 'strict' 

2145 If 'strict', only concatenate files if they contain 

2146 a start time in their meta data. 

2147 rate: float 

2148 If provided, do a minimal initialization (no checking) 

2149 using the provided sampling rate (in Hertz), channels, 

2150 unit, maximum amplitude, and end_indices. 

2151 channels: int 

2152 If provided, do a minimal initialization (no checking) 

2153 using the provided rate, number of channels, 

2154 unit, maximum amplitude, and end_indices. 

2155 unit: str 

2156 If provided, do a minimal initialization (no checking) 

2157 using the provided rate, number of channels, 

2158 unit, maximum amplitude, and end_indices. 

2159 amax: float 

2160 If provided, do a minimal initialization (no checking) 

2161 using the provided rate, number of channels, 

2162 unit, maximum amplitude amax, and end_indices. 

2163 end_indices: sequence of int 

2164 If provided, do a minimal initialization (no checking) 

2165 using the provided rate, channels, 

2166 unit, maximum amplitude, and end_indices. 

2167 

2168 Raises 

2169 ------ 

2170 TypeError 

2171 `filepaths` must be a sequence. 

2172 ValueError 

2173 Empty `filepaths`. 

2174 FileNotFoundError 

2175 `filepaths` does not contain a single valid file. 

2176 

2177 """ 

2178 if not isinstance(filepaths, (list, tuple, np.ndarray)): 

2179 raise TypeError('input argument filepaths is not a sequence!') 

2180 if len(filepaths) == 0: 

2181 raise ValueError('input argument filepaths is empy sequence!') 

2182 self.buffersize = buffersize 

2183 self.backsize = backsize 

2184 self.filepath = None 

2185 self.file_paths = [] 

2186 self.open_files = [] 

2187 self.open_loaders = [] 

2188 self.data_files = [] 

2189 self.collect_counter = 0 

2190 self.frames = 0 

2191 self.start_indices = [] 

2192 self.end_indices = [] 

2193 self.start_time = None 

2194 start_time = None 

2195 self._metadata = {} 

2196 self._locs = np.zeros((0, 2), dtype=int) 

2197 self._labels = np.zeros((0, 2), dtype=object) 

2198 if end_indices is not None: 

2199 self.file_paths = [Path(fp) for fp in filepaths] 

2200 self.filepath = self.file_paths[0] 

2201 self.data_files = [None] * len(self.file_paths) 

2202 self.frames = end_indices[-1] 

2203 self.start_indices = [0] + list(end_indices[:-1]) 

2204 self.end_indices = end_indices 

2205 self.format = None 

2206 self.encoding = None 

2207 self.rate = rate 

2208 self.channels = channels 

2209 self.unit = unit 

2210 self.ampl_max = amax 

2211 self.ampl_min = -amax 

2212 else: 

2213 for filepath in filepaths: 

2214 try: 

2215 a = DataLoader(filepath, buffersize, backsize, verbose) 

2216 except Exception as e: 

2217 if verbose > 0: 

2218 print(e) 

2219 continue 

2220 # collect metadata: 

2221 md = a.metadata() 

2222 fmd = flatten_metadata(md, True) 

2223 add_metadata(self._metadata, fmd) 

2224 if self.filepath is None: 

2225 # first file: 

2226 self.filepath = a.filepath 

2227 self.format = a.format 

2228 self.encoding = a.encoding 

2229 self.rate = a.rate 

2230 self.channels = a.channels 

2231 self.unit = a.unit 

2232 self.ampl_max = a.ampl_max 

2233 self.ampl_min = a.ampl_min 

2234 self.start_time = get_datetime(md) 

2235 start_time = self.start_time 

2236 stime = self.start_time 

2237 else: 

2238 # check channels, rate, and amplitudes: 

2239 error_str = None 

2240 if a.channels != self.channels: 

2241 error_str = f'number of channels differs: ' \ 

2242 f'{a.channels} in {a.filepath} versus ' \ 

2243 f'{self.channels} in {self.filepath}' 

2244 if a.rate != self.rate: 

2245 error_str = f'sampling rates differ: ' \ 

2246 f'{a.rate} in {a.filepath} versus ' \ 

2247 f'{self.rate} in {self.filepath}' 

2248 if a.ampl_min != self.ampl_min: 

2249 error_str = f'minimum amplitudes differ: ' \ 

2250 f'{a.ampl_min} in {a.filepath} versus ' \ 

2251 f'{self.ampl_min} in {self.filepath}' 

2252 if a.ampl_max != self.ampl_max: 

2253 error_Str = f'maximum amplitudes differ: ' \ 

2254 f'{a.ampl_max} in {a.filepath} versus ' \ 

2255 f'{self.ampl_max} in {self.filepath}' 

2256 # check start time of recording: 

2257 stime = get_datetime(md) 

2258 if mode == 'strict' and (start_time is None or stime is None): 

2259 error_str = 'file does not contain a start time in its meta data' 

2260 if start_time is not None and stime is not None and \ 

2261 abs(start_time - stime) > timedelta(seconds=self._max_time_diff): 

2262 error_str = f'start time does not indicate continuous recording: ' \ 

2263 f'expected {start_time} instead of ' \ 

2264 f'{stime} in {a.filepath}' 

2265 if error_str is not None: 

2266 if verbose > 0: 

2267 print(error_str) 

2268 a.close() 

2269 del a 

2270 break 

2271 # markers: 

2272 locs, labels = a.markers() 

2273 locs[:,0] += self.frames 

2274 self._locs = np.vstack((self._locs, locs)) 

2275 self._labels = np.vstack((self._labels, labels)) 

2276 # indices: 

2277 self.start_indices.append(self.frames) 

2278 self.frames += a.frames 

2279 self.end_indices.append(self.frames) 

2280 if stime is not None: 

2281 start_time = stime + timedelta(seconds=a.frames/a.rate) 

2282 # add file to lists: 

2283 self.file_paths.append(a.filepath) 

2284 if len(self.open_files) < AudioLoader.max_open_files: 

2285 self.open_files.append(a) 

2286 else: 

2287 a.close() 

2288 if len(self.open_loaders) < AudioLoader.max_open_loaders: 

2289 self.data_files.append(a) 

2290 self.open_loaders.append(a) 

2291 else: 

2292 a.close() 

2293 del a 

2294 self.data_files.append(None) 

2295 if len(self.data_files) == 0: 

2296 raise FileNotFoundError('input argument filepaths does not contain any valid audio file!') 

2297 # set startime from first file: 

2298 if self.start_time is not None: 

2299 set_starttime(self._metadata, self.start_time) 

2300 # setup infrastructure: 

2301 self.file_indices = self.start_indices 

2302 self.start_indices = np.array(self.start_indices) 

2303 self.end_indices = np.array(self.end_indices) 

2304 self.shape = (self.frames, self.channels) 

2305 self.bufferframes = int(buffersize*self.rate) 

2306 self.backframes = int(backsize*self.rate) 

2307 self.init_buffer() 

2308 self.close = self._close_multiple 

2309 self.load_audio_buffer = self._load_buffer_multiple 

2310 self._load_metadata = None 

2311 self._load_markers = None 

2312 return self 

2313 

    def _close_multiple(self):
        """Close all the data files. """
        # reset the LRU bookkeeping lists first:
        self.open_files = []
        self.open_loaders = []
        if hasattr(self, 'data_files'):
            # close every loader that is still open:
            for a in self.data_files:
                if a is not None:
                    a.close()
        self.data_files = []
        self.filepath = None
        self.file_paths = []
        self.file_indices = []
        self.start_indices = []
        self.end_indices = []
        # delete the bookkeeping attributes to release all references;
        # they are re-created on the next open_multiple():
        del self.data_files
        del self.open_files
        del self.open_loaders
        del self.start_indices
        del self.end_indices

2333 

    def _load_buffer_multiple(self, r_offset, r_size, buffer):
        """Load new data from the underlying files.

        Walks over the concatenated files that cover the requested
        frame range, (re)opening loaders on demand and keeping the
        number of simultaneously open files and loader instances
        bounded by `AudioLoader.max_open_files` and
        `AudioLoader.max_open_loaders` in LRU fashion.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        offs = r_offset
        size = r_size
        boffs = 0
        # index of the file containing frame offs:
        ai = np.searchsorted(self.end_indices, offs, side='right')
        while size > 0:
            if self.data_files[ai] is None:
                # loader for this file was evicted - reopen it:
                a = DataLoader(self.file_paths[ai],
                               self.buffersize, self.backsize, 0)
                self.data_files[ai] = a
                self.open_loaders.append(a)
                self.open_files.append(a)
                # enforce the limit on open file handles (evict oldest):
                if len(self.open_files) > AudioLoader.max_open_files:
                    a0 = self.open_files.pop(0)
                    a0.close()
                # enforce the limit on loader instances (evict oldest):
                if len(self.open_loaders) > AudioLoader.max_open_loaders:
                    a0 = self.open_loaders.pop(0)
                    self.data_files[self.data_files.index(a0)] = None
                    a0.close()
                    del a0
                    self.collect_counter += 1
                    # garbage collect only every few evictions:
                    if self.collect_counter > AudioLoader.max_open_loaders//2:
                        gc.collect()   # takes time!
                        self.collect_counter = 0
            else:
                # loader already open - move it to the fresh end of the LRU list:
                self.open_loaders.pop(self.open_loaders.index(self.data_files[ai]))
                self.open_loaders.append(self.data_files[ai])
            # frame range within file ai:
            ai0 = offs - self.start_indices[ai]
            ai1 = offs + size
            if ai1 > self.end_indices[ai]:
                ai1 = self.end_indices[ai]
            ai1 -= self.start_indices[ai]
            n = ai1 - ai0
            self.data_files[ai].load_audio_buffer(ai0, n,
                                                  buffer[boffs:boffs + n,:])
            # refresh LRU position of this file handle:
            if self.data_files[ai] in self.open_files:
                self.open_files.pop(self.open_files.index(self.data_files[ai]))
                self.open_files.append(self.data_files[ai])
            if len(self.open_files) > AudioLoader.max_open_files:
                self.open_files[0].close()
                self.open_files.pop(0)
            # advance to the next file:
            boffs += n
            offs += n
            size -= n
            ai += 1

2390 

2391 

2392 def open(self, filepath, buffersize=10.0, backsize=0.0, 

2393 verbose=0, **kwargs): 

2394 """Open file with time-series data for reading. 

2395 

2396 Parameters 

2397 ---------- 

2398 filepath: str or list of str 

2399 Name of the file or list of many file names that should be 

2400 made accessible as a single array. 

2401 buffersize: float 

2402 Size of internal buffer in seconds. 

2403 backsize: float 

2404 Part of the buffer to be loaded before the requested start index 

2405 in seconds. 

2406 verbose: int 

2407 If > 0 show detailed error/warning messages. 

2408 **kwargs: dict 

2409 Further keyword arguments that are passed on to the  

2410 format specific opening functions. 

2411 For example: 

2412 - `amax`: the amplitude range of the data. 

2413 - 'unit': the unit of the data. 

2414 

2415 Raises 

2416 ------ 

2417 ValueError: 

2418 `filepath` is empty string. 

2419 """ 

2420 # list of implemented open functions: 

2421 data_open_funcs = ( 

2422 ('relacs', check_relacs, self.open_relacs, 1), 

2423 ('fishgrid', check_fishgrid, self.open_fishgrid, 1), 

2424 ('container', check_container, self.open_container, 1), 

2425 ('raw', check_raw, self.open_raw, 1), 

2426 ('audioio', None, self.open_audioio, 0), 

2427 ) 

2428 

2429 self.buffer = np.array([]) 

2430 self.rate = 0.0 

2431 if not filepath: 

2432 raise ValueError('input argument filepath is empty string.') 

2433 if isinstance(filepath, (list, tuple, np.ndarray)): 

2434 if len(filepath) > 1: 

2435 self.open_multiple(filepath, buffersize, backsize, 

2436 verbose, **kwargs) 

2437 if len(self.file_paths) > 1: 

2438 return self 

2439 filepath = self.file_paths[0] 

2440 self.close() 

2441 else: 

2442 filepath = filepath[0] 

2443 # open data: 

2444 for name, check_file, open_file, v in data_open_funcs: 

2445 if check_file is None or check_file(filepath): 

2446 open_file(filepath, buffersize, backsize, verbose, **kwargs) 

2447 if v*verbose > 1: 

2448 if self.format is not None: 

2449 print(f' format : {self.format}') 

2450 if self.encoding is not None: 

2451 print(f' encoding : {self.encoding}') 

2452 print(f' sampling rate: {self.rate} Hz') 

2453 print(f' channels : {self.channels}') 

2454 print(f' frames : {self.frames}') 

2455 print(f' range : {self.ampl_max:g}{self.unit}') 

2456 break 

2457 return self 

2458 

2459 

def demo(filepath, plot=False):
    """Demonstrate the usage of `load_data()` and `DataLoader`.

    Parameters
    ----------
    filepath: str
        Path of the data file to load.
    plot: bool
        If True, plot the data loaded by `load_data()` and return.
        Otherwise additionally step forwards and backwards through
        the file by means of a `DataLoader`.
    """
    print("try load_data:")
    # load the complete file at once:
    data, rate, unit, amax = load_data(filepath, verbose=2)
    if plot:
        fig, ax = plt.subplots()
        time = np.arange(len(data))/rate
        for c in range(data.shape[1]):
            ax.plot(time, data[:,c])
        ax.set_xlabel('Time [s]')
        ax.set_ylabel(f'[{unit}]')
        if amax is not None and np.isfinite(amax):
            ax.set_ylim(-amax, +amax)
        plt.show()
        # NOTE(review): with plot=True the function returns here, so the
        # plotting branches in the loops below are never reached -
        # confirm whether this early return is intended.
        return

    print('')
    print("try DataLoader:")
    # access the file in buffered chunks via a DataLoader:
    with DataLoader(filepath, 2.0, 1.0, 1) as data:
        print('sampling rate: %g' % data.rate)
        print('frames : %d %d' % (len(data), data.shape[0]))
        nframes = int(1.0 * data.rate)
        # forward:
        for i in range(0, len(data), nframes):
            print('forward %d-%d' % (i, i + nframes))
            x = data[i:i + nframes, 0]
            if plot:
                fig, ax = plt.subplots()
                ax.plot((i + np.arange(len(x)))/data.rate, x)
                ax.set_xlabel('Time [s]')
                ax.set_ylabel(f'[{data.unit}]')
                plt.show()
        # and backwards:
        for i in reversed(range(0, len(data), nframes)):
            print('backward %d-%d' % (i, i + nframes))
            x = data[i:i + nframes, 0]
            if plot:
                fig, ax = plt.subplots()
                ax.plot((i + np.arange(len(x)))/data.rate, x)
                ax.set_xlabel('Time [s]')
                ax.set_ylabel(f'[{data.unit}]')
                plt.show()

2501 

2502 

def main(*cargs):
    """Call demo with command line arguments.

    Parameters
    ----------
    cargs: list of str
        Command line arguments as provided by sys.argv[1:]
    """
    import argparse
    ap = argparse.ArgumentParser(
        description='Checking thunderlab.dataloader module.')
    ap.add_argument('-p', dest='plot', action='store_true',
                    help='plot loaded data')
    ap.add_argument('file', nargs=1, default='', type=str,
                    help='name of data file')
    ns = ap.parse_args(cargs)
    demo(ns.file[0], ns.plot)

2520 

2521 

if __name__ == "__main__":
    # run the command line interface when executed as a script:
    main(*sys.argv[1:])