Coverage for src/thunderlab/dataloader.py: 77%

981 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-16 21:22 +0000

1"""Load time-series data from files. 

2 

3``` 

4data, rate, unit, amax = load_data('data/file.wav') 

5``` 

6 

7The function `data_loader()` loads the whole time-series from the file 

8as a numpy array of floats. First dimension is frames, second is 

9channels. In contrast to the `audioio.load_audio()` function, the 

10values of the data array are not restricted between -1 and 1. They can 

11assume any value wihin the range `-amax` to `+amax` with the returned 

12`unit`. 

13 

14``` 

15data = DataLoader('data/file.wav', 60.0) 

16``` 

17or 

18``` 

19with DataLoader('data/file.wav', 60.0) as data: 

20``` 

21Create an `DataLoader` object that loads chuncks of 60 seconds long data 

22on demand. `data` can be used like a read-only numpy array of floats. 

23 

24 

25## Supported file formats 

26 

27- python pickle files 

28- numpy .npz files 

29- matlab .mat files 

30- audio files via [`audioio`](https://github.com/bendalab/audioio) package 

31- LabView .scandat files 

32- relacs trace*.raw files (https://www.relacs.net) 

33- fishgrid traces-*.raw files (https://github.com/bendalab/fishgrid) 

34 

35 

36## Metadata 

37 

38Many file formats allow to store metadata that further describe the 

39stored time series data. We handle them as nested dictionary of key-value 

40pairs. Load them with the `metadata()` function: 

41``` 

42metadata = metadata('data/file.mat') 

43``` 

44 

45## Markers 

46 

47Some file formats also allow to store markers that mark specific 

48positions in the time series data. Load marker positions and spans (in 

49the 2-D array `locs`) and label and text strings (in the 2-D array 

50`labels`) with the `markers()` function: 

51``` 

52locs, labels = markers('data.wav') 

53``` 

54 

55## Aditional, format specific functions 

56 

57- `extract_container_metadata()`: extract metadata from dictionary loaded from a container file. 

58- `relacs_samplerate_unit()`: retrieve sampling rate and unit from a relacs stimuli.dat file. 

59- `relacs_header()`: read key-value pairs from relacs *.dat file headers. 

60- `fishgrid_grids()`: retrieve grid sizes from a fishgrid.cfg file. 

61- `fishgrid_spacings()`: spacing between grid electrodes. 

62 

63""" 

64 

65import os 

66import sys 

67import glob 

68import gzip 

69import numpy as np 

70try: 

71 import matplotlib.pyplot as plt 

72except ImportError: 

73 pass 

74from datetime import timedelta 

75from audioio import load_audio, AudioLoader, unflatten_metadata 

76from audioio import get_number_unit, get_number, get_int, get_bool, get_gain 

77from audioio import default_starttime_keys, default_gain_keys 

78from audioio import get_datetime, flatten_metadata, add_metadata, set_starttime 

79from audioio import metadata as metadata_audioio 

80from audioio import markers as markers_audioio 

81 

82 

83def relacs_samplerate_unit(filepath, channel=0): 

84 """Retrieve sampling rate and unit from a relacs stimuli.dat file. 

85 

86 Parameters 

87 ---------- 

88 filepath: str 

89 Path to a relacs data directory, or a file in a relacs data directory. 

90 channel: int 

91 Channel (trace) number, if `filepath` does not specify a 

92 trace-*.raw file. 

93 

94 Returns 

95 ------- 

96 samplerate: float 

97 Sampling rate in Hertz 

98 unit: str 

99 Unit of the trace, can be empty if not found 

100 

101 Raises 

102 ------ 

103 IOError/FileNotFoundError: 

104 If the stimuli.dat file does not exist. 

105 ValueError: 

106 stimuli.dat file does not contain sampling rate. 

107 """ 

108 trace = channel + 1 

109 relacs_dir = filepath 

110 # check for relacs data directory: 

111 if not os.path.isdir(filepath): 

112 relacs_dir = os.path.dirname(filepath) 

113 bn = os.path.basename(filepath).lower() 

114 i = bn.find('.raw') 

115 if len(bn) > 5 and bn[0:5] == 'trace' and i > 6: 

116 trace = int(bn[6:i]) 

117 

118 # retreive sampling rate and unit from stimuli.dat file: 

119 samplerate = None 

120 sampleinterval = None 

121 unit = "" 

122 

123 lines = [] 

124 stimuli_file = os.path.join(relacs_dir, 'stimuli.dat') 

125 if os.path.isfile(stimuli_file + '.gz'): 

126 stimuli_file += '.gz' 

127 if stimuli_file[-3:] == '.gz': 

128 with gzip.open(stimuli_file, 'r', encoding='latin-1') as sf: 

129 for line in sf: 

130 line = line.strip() 

131 if len(line) == 0 or line[0] != '#': 

132 break 

133 lines.append(line) 

134 else: 

135 with open(stimuli_file, 'r', encoding='latin-1') as sf: 

136 for line in sf: 

137 line = line.strip() 

138 if len(line) == 0 or line[0] != '#': 

139 break 

140 lines.append(line) 

141 

142 for line in lines: 

143 if "unit%d" % trace in line: 

144 unit = line.split(':')[1].strip() 

145 if "sampling rate%d" % trace in line: 

146 value = line.split(':')[1].strip() 

147 samplerate = float(value.replace('Hz','')) 

148 elif "sample interval%d" % trace in line: 

149 value = line.split(':')[1].strip() 

150 sampleinterval = float(value.replace('ms','')) 

151 

152 if samplerate is not None: 

153 return samplerate, unit 

154 if sampleinterval is not None: 

155 return 1000/sampleinterval, unit 

156 raise ValueError(f'could not retrieve sampling rate from {stimuli_file}') 

157 

158 

159def relacs_header(filepath, store_empty=False, first_only=False, 

160 lower_keys=False, flat=False, 

161 add_sections=False): 

162 """Read key-value pairs from a relacs *.dat file header. 

163 

164 Parameters 

165 ---------- 

166 filepath: str 

167 A relacs *.dat file, can be also a zipped .gz file. 

168 store_empty: bool 

169 If `False` do not add meta data with empty values. 

170 first_only: bool 

171 If `False` only store the first element of a list. 

172 lower_keys: bool 

173 Make all keys lower case. 

174 flat: bool 

175 Do not make a nested dictionary. 

176 Use this option also to read in very old relacs metadata with 

177 ragged left alignment. 

178 add_sections: bool 

179 If `True`, prepend keys with sections names separated by 

180 '.' to make them unique. 

181 

182 Returns 

183 ------- 

184 data: dict 

185 Nested dictionary with key-value pairs of the file header. 

186  

187 Raises 

188 ------ 

189 IOError/FileNotFoundError: 

190 If `filepath` cannot be opened. 

191 """ 

192 # read in header from file: 

193 lines = [] 

194 if os.path.isfile(filepath + '.gz'): 

195 filepath += '.gz' 

196 if filepath[-3:] == '.gz': 

197 with gzip.open(filepath, 'r', encoding='latin-1') as sf: 

198 for line in sf: 

199 line = line.strip() 

200 if len(line) == 0 or line[0] != '#': 

201 break 

202 lines.append(line) 

203 else: 

204 with open(filepath, 'r', encoding='latin-1') as sf: 

205 for line in sf: 

206 line = line.strip() 

207 if len(line) == 0 or line[0] != '#': 

208 break 

209 lines.append(line) 

210 # parse: 

211 data = {} 

212 cdatas = [data] 

213 sections = [''] 

214 ident_offs = None 

215 ident = None 

216 for line in lines: 

217 words = line.split(':') 

218 value = ':'.join(words[1:]).strip() if len(words) > 1 else '' 

219 if len(words) >= 1: 

220 key = words[0].strip('#') 

221 # get section level: 

222 level = 0 

223 if not flat or len(value) == 0: 

224 nident = len(key) - len(key.lstrip()) 

225 if ident_offs is None: 

226 ident_offs = nident 

227 elif ident is None: 

228 if nident > ident_offs: 

229 ident = nident - ident_offs 

230 level = 1 

231 else: 

232 level = (nident - ident_offs)//ident 

233 # close sections: 

234 if not flat: 

235 while len(cdatas) > level + 1: 

236 cdatas[-1][sections.pop()] = cdatas.pop() 

237 else: 

238 while len(sections) > level + 1: 

239 sections.pop() 

240 # key: 

241 key = key.strip().strip('"') 

242 if lower_keys: 

243 key = key.lower() 

244 skey = key 

245 if add_sections: 

246 key = '.'.join(sections[1:] + [key]) 

247 if len(value) == 0: 

248 # new sub-section: 

249 if flat: 

250 if store_empty: 

251 cdatas[-1][key] = None 

252 else: 

253 cdatas.append({}) 

254 sections.append(skey) 

255 else: 

256 # key-value pair: 

257 value = value.strip('"') 

258 if len(value) > 0 or value != '-' or store_empty: 

259 if len(value) > 0 and value[0] == '[' and value[-1] == ']': 

260 value = [v.strip() for v in value.lstrip('[').rstrip(']').split(',')] 

261 if first_only: 

262 value = value[0] 

263 cdatas[-1][key] = value 

264 while len(cdatas) > 1: 

265 cdatas[-1][sections.pop()] = cdatas.pop() 

266 return data 

267 

268 

269def check_relacs(filepath): 

270 """Check for valid relacs file. 

271 

272 Parameters 

273 ---------- 

274 filepath: str 

275 Path to a relacs data directory, or a file in a relacs data directory. 

276 

277 Returns 

278 ------- 

279 is_relacs: boolean 

280 `True` if `filepath` is a valid relacs directory or is a file therein. 

281 """ 

282 # relacs data directory: 

283 relacs_dir = filepath 

284 if not os.path.isdir(filepath): 

285 relacs_dir = os.path.dirname(filepath) 

286 # check for a valid relacs data directory: 

287 has_stimuli = False 

288 has_trace = False 

289 for fname in ['stimuli.dat', 'stimuli.dat.gz']: 

290 if os.path.isfile(os.path.join(relacs_dir, fname)): 

291 has_stimuli = True 

292 for fname in ['trace-1.raw', 'trace-1.raw.gz']: 

293 if os.path.isfile(os.path.join(relacs_dir, fname)): 

294 has_trace = True 

295 return has_stimuli and has_trace 

296 

297 

298def relacs_trace_files(filepath): 

299 """Expand file path for relacs data to appropriate trace*.raw file names. 

300 

301 Parameters 

302 ---------- 

303 filepath: str 

304 Path to a relacs data directory, or a file in a relacs data directory. 

305  

306 Returns 

307 ------- 

308 trace_filepaths: list of str 

309 List of relacs trace*.raw files. 

310 """ 

311 relacs_dir = filepath 

312 if not os.path.isdir(filepath): 

313 relacs_dir = os.path.dirname(filepath) 

314 trace_filepaths = [] 

315 for k in range(10000): 

316 fname = os.path.join(relacs_dir, f'trace-{k+1}.raw') 

317 if os.path.isfile(fname): 

318 trace_filepaths.append(fname) 

319 elif os.path.isfile(fname + '.gz'): 

320 trace_filepaths.append(fname + '.gz') 

321 else: 

322 break 

323 return trace_filepaths 

324 

325 

326def load_relacs(filepath, amax=1.0): 

327 """Load traces that have been recorded with relacs (https://github.com/relacs/relacs). 

328 

329 Parameters 

330 ---------- 

331 filepath: str 

332 Path to a relacs data directory, or a file in a relacs data directory. 

333 amax: float 

334 The amplitude range of the data. 

335 

336 Returns 

337 ------- 

338 data: 2-D array 

339 All data traces as an 2-D numpy array, even for single channel data. 

340 First dimension is time, second is channel. 

341 rate: float 

342 Sampling rate of the data in Hz 

343 unit: str 

344 Unit of the data 

345 amax: float 

346 Maximum amplitude of data range. 

347 

348 Raises 

349 ------ 

350 ValueError: 

351 - Invalid name for relacs trace-*.raw file. 

352 - Sampling rates of traces differ. 

353 - Unit of traces differ. 

354 """ 

355 trace_filepaths = relacs_trace_files(filepath) 

356 # load trace*.raw files: 

357 nchannels = len(trace_filepaths) 

358 data = None 

359 nrows = 0 

360 rate = None 

361 unit = '' 

362 for c, path in enumerate(sorted(trace_filepaths)): 

363 if path[-3:] == '.gz': 

364 with gzip.open(path, 'rb') as sf: 

365 x = np.frombuffer(sf.read(), dtype=np.float32) 

366 else: 

367 x = np.fromfile(path, np.float32) 

368 if data is None: 

369 nrows = len(x) 

370 data = np.zeros((nrows, nchannels)) 

371 n = min(len(x), nrows) 

372 data[:n,c] = x[:n] 

373 # retrieve sampling rate and unit: 

374 crate, us = relacs_samplerate_unit(path, c) 

375 if rate is None: 

376 rate = crate 

377 elif crate != rate: 

378 raise ValueError('sampling rates of traces differ') 

379 if len(unit) == 0: 

380 unit = us 

381 elif us != unit: 

382 raise ValueError('unit of traces differ') 

383 return data, rate, unit, amax 

384 

385 

386def metadata_relacs(filepath, store_empty=False, first_only=False, 

387 lower_keys=False, flat=False, add_sections=False): 

388 """ Read meta-data of a relacs data set. 

389 

390 Parameters 

391 ---------- 

392 filepath: str 

393 A relacs data directory or a file therein. 

394 store_empty: bool 

395 If `False` do not add meta data with empty values. 

396 first_only: bool 

397 If `False` only store the first element of a list. 

398 lower_keys: bool 

399 Make all keys lower case. 

400 flat: bool 

401 Do not make a nested dictionary. 

402 Use this option also to read in very old relacs metadata with 

403 ragged left alignment. 

404 add_sections: bool 

405 If `True`, prepend keys with sections names separated by 

406 '.' to make them unique. 

407 

408 Returns 

409 ------- 

410 data: nested dict 

411 Nested dictionary with key-value pairs of the meta data. 

412 """ 

413 relacs_dir = filepath 

414 if not os.path.isdir(filepath): 

415 relacs_dir = os.path.dirname(filepath) 

416 info_path = os.path.join(relacs_dir, 'info.dat') 

417 if not os.path.exists(info_path): 

418 return dict(), [] 

419 data = relacs_header(info_path, store_empty, first_only, 

420 lower_keys, flat, add_sections) 

421 return data 

422 

423 

424def fishgrid_spacings(metadata, unit='m'): 

425 """Spacing between grid electrodes. 

426 

427 Parameters 

428 ---------- 

429 metadata: dict 

430 Fishgrid metadata obtained from `metadata_fishgrid()`. 

431 unit: str 

432 Unit in which to return the spacings. 

433 

434 Returns 

435 ------- 

436 grid_dist: list of tuple of float 

437 For each grid the distances between rows and columns in `unit`. 

438 """ 

439 grids_dist = [] 

440 for k in range(4): 

441 row_dist = get_number(metadata, unit, f'RowDistance{k+1}', default=0) 

442 col_dist = get_number(metadata, unit, f'ColumnDistance{k+1}', default=0) 

443 rows = get_int(metadata, f'Rows{k+1}', default=0) 

444 cols = get_int(metadata, f'Columns{k+1}', default=0) 

445 if get_bool(metadata, f'Used{k+1}', default=False) or \ 

446 cols > 0 and rows > 0: 

447 grids_dist.append((row_dist, col_dist)) 

448 return grids_dist 

449 

450 

451def fishgrid_grids(metadata): 

452 """Retrieve grid sizes from a fishgrid.cfg file. 

453 

454 Parameters 

455 ---------- 

456 metadata: dict 

457 Fishgrid metadata obtained from `metadata_fishgrid()`. 

458 

459 Returns 

460 ------- 

461 grids: list of tuple of int 

462 For each grid the number of rows and columns. 

463 """ 

464 grids = [] 

465 for k in range(4): 

466 rows = get_int(metadata, f'Rows{k+1}', default=0) 

467 cols = get_int(metadata, f'Columns{k+1}', default=0) 

468 if get_bool(metadata, f'Used{k+1}', default=False) or \ 

469 cols > 0 and rows > 0: 

470 grids.append((rows, cols)) 

471 return grids 

472 

473 

474def check_fishgrid(filepath): 

475 """Check for valid fishgrid file (https://github.com/bendalab/fishgrid). 

476 

477 Parameters 

478 ---------- 

479 filepath: str 

480 Path to a fishgrid data directory or a file in a fishgrid 

481 data directory. 

482 

483 Returns 

484 ------- 

485 is_fishgrid: bool 

486 `True` if `filepath` is a valid fishgrid data directory or 

487 a file therein. 

488 """ 

489 # fishgrid data directory: 

490 fishgrid_dir = filepath 

491 if not os.path.isdir(filepath): 

492 fishgrid_dir = os.path.dirname(filepath) 

493 # check for a valid fishgrid data directory: 

494 return (os.path.isfile(os.path.join(fishgrid_dir, 'fishgrid.cfg')) and 

495 (os.path.isfile(os.path.join(fishgrid_dir, 'traces-grid1.raw')) or 

496 os.path.isfile(os.path.join(fishgrid_dir, 'traces.raw')))) 

497 

498 

499def fishgrid_trace_files(filepath): 

500 """Expand file paths for fishgrid data to appropriate traces*.raw file names. 

501 

502 Parameters 

503 ---------- 

504 filepath: str 

505 Path to a fishgrid data directory, or a file therein. 

506  

507 Returns 

508 ------- 

509 trace_filepaths: list of str 

510 List of fishgrid traces*.raw files. 

511 """ 

512 # find grids: 

513 fishgrid_dir = filepath 

514 if not os.path.isdir(fishgrid_dir): 

515 fishgrid_dir = os.path.dirname(filepath) 

516 trace_filepaths = [] 

517 for k in range(10000): 

518 file = os.path.join(fishgrid_dir, f'traces-grid{k+1}.raw') 

519 if os.path.isfile(file): 

520 trace_filepaths.append(file) 

521 else: 

522 break 

523 if len(trace_filepaths) == 0: 

524 file = os.path.join(fishgrid_dir, f'traces.raw') 

525 if os.path.isfile(file): 

526 trace_filepaths.append(file) 

527 return trace_filepaths 

528 

529 

530def load_fishgrid(filepath): 

531 """Load traces that have been recorded with fishgrid (https://github.com/bendalab/fishgrid). 

532 

533 Parameters 

534 ---------- 

535 filepath: str 

536 Path to a fishgrid data directory, or a file therein. 

537 

538 Returns 

539 ------- 

540 data: 2-D array 

541 All data traces as an 2-D numpy array, even for single channel data. 

542 First dimension is time, second is channel. 

543 rate: float 

544 Sampling rate of the data in Hz. 

545 unit: str 

546 Unit of the data. 

547 amax: float 

548 Maximum amplitude of data range. 

549 

550 Raises 

551 ------ 

552 FileNotFoundError: 

553 Invalid or not existing fishgrid files. 

554 """ 

555 trace_filepaths = fishgrid_trace_files(filepath) 

556 if len(trace_filepaths) == 0: 

557 raise FileNotFoundError(f'no fishgrid files specified') 

558 md = metadata_fishgrid(filepath) 

559 grids = fishgrid_grids(md) 

560 grid_sizes = [r*c for r, c in grids] 

561 

562 # load traces-grid*.raw files: 

563 grid_channels = [] 

564 nchannels = 0 

565 for g, path in enumerate(trace_filepaths): 

566 grid_channels.append(grid_sizes[g]) 

567 nchannels += grid_sizes[g] 

568 data = None 

569 nrows = 0 

570 c = 0 

571 rate = get_number(md, 'Hz', 'AISampleRate') 

572 for path, channels in zip(trace_filepaths, grid_channels): 

573 x = np.fromfile(path, np.float32).reshape((-1, channels)) 

574 if data is None: 

575 nrows = len(x) 

576 data = np.zeros((nrows, nchannels)) 

577 n = min(len(x), nrows) 

578 data[:n,c:c+channels] = x[:n,:] 

579 c += channels 

580 amax, unit = get_number_unit(md, 'AIMaxVolt') 

581 return data, rate, unit, amax 

582 

583 

584# add fishgrid keys: 

585default_starttime_keys.append(['StartDate', 'StartTime']) 

586default_gain_keys.insert(0, 'AIMaxVolt') 

587 

588 

589def metadata_fishgrid(filepath): 

590 """ Read meta-data of a fishgrid data set. 

591 

592 Parameters 

593 ---------- 

594 filepath: str 

595 A fishgrid data directory or a file therein. 

596 

597 Returns 

598 ------- 

599 data: nested dict 

600 Nested dictionary with key-value pairs of the meta data. 

601 """ 

602 fishgrid_dir = filepath 

603 if not os.path.isdir(fishgrid_dir): 

604 fishgrid_dir = os.path.dirname(filepath) 

605 path = os.path.join(fishgrid_dir, 'fishgrid.cfg') 

606 # read in header from file: 

607 lines = [] 

608 if os.path.isfile(path + '.gz'): 

609 path += '.gz' 

610 if not os.path.exists(path): 

611 return {} 

612 if path[-3:] == '.gz': 

613 with gzip.open(path, 'r', encoding='latin-1') as sf: 

614 for line in sf: 

615 lines.append(line) 

616 else: 

617 with open(path, 'r', encoding='latin-1') as sf: 

618 for line in sf: 

619 lines.append(line) 

620 # parse: 

621 data = {} 

622 cdatas = [data] 

623 ident_offs = None 

624 ident = None 

625 old_style = False 

626 grid_n = False 

627 for line in lines: 

628 if len(line.strip()) == 0: 

629 continue 

630 if line[0] == '*': 

631 key = line[1:].strip() 

632 data[key] = {} 

633 cdatas = [data, data[key]] 

634 elif '----' in line: 

635 old_style = True 

636 key = line.strip().strip(' -').replace('&', '') 

637 if key.upper() == 'SETUP': 

638 key = 'Grid 1' 

639 grid_n = False 

640 if key[:4].lower() == 'grid': 

641 grid_n = key[5] 

642 cdatas = cdatas[:2] 

643 cdatas[1][key] = {} 

644 cdatas.append(cdatas[1][key]) 

645 else: 

646 words = line.split(':') 

647 key = words[0].strip().strip('"') 

648 value = None 

649 if len(words) > 1 and (len(words[1].strip()) > 0 or old_style): 

650 value = ':'.join(words[1:]).strip().strip('"') 

651 if old_style: 

652 if value is None: 

653 cdatas = cdatas[:3] 

654 cdatas[2][key] = {} 

655 cdatas.append(cdatas[2][key]) 

656 else: 

657 if grid_n and key[-1] != grid_n: 

658 key = key + grid_n 

659 cdatas[-1][key] = value 

660 else: 

661 # get section level: 

662 level = 0 

663 nident = len(line) - len(line.lstrip()) 

664 if ident_offs is None: 

665 ident_offs = nident 

666 elif ident is None: 

667 if nident > ident_offs: 

668 ident = nident - ident_offs 

669 level = 1 

670 else: 

671 level = (nident - ident_offs)//ident 

672 # close sections: 

673 cdatas = cdatas[:2 + level] 

674 if value is None: 

675 # new section: 

676 cdatas[-1][key] = {} 

677 cdatas.append(cdatas[-1][key]) 

678 else: 

679 # key-value pair: 

680 cdatas[-1][key] = value.replace(r'\n', '\n') 

681 # remove unused grids: 

682 fgm = data.get('FishGrid', {}) 

683 for i in range(4): 

684 gs = f'Grid {i+1}' 

685 if gs in fgm: 

686 gm = fgm[gs] 

687 us = f'Used{i+1}' 

688 if us in gm and gm[us].upper() == 'FALSE': 

689 del fgm[gs] 

690 return data 

691 

692 

693def markers_fishgrid(filepath): 

694 """ Read markers of a fishgrid data set. 

695 

696 Parameters 

697 ---------- 

698 filepath: str 

699 A fishgrid data directory or a file therein. 

700 

701 Returns 

702 ------- 

703 locs: 2-D array of ints 

704 Marker positions (first column) and spans (second column) 

705 for each marker (rows). 

706 labels: 2-D array of string objects 

707 Labels (first column) and texts (second column) 

708 for each marker (rows). 

709 """ 

710 def add_marker(): 

711 if 'index1' in marker: 

712 index1 = int(marker['index1'])//nchannels 

713 else: 

714 index1 = int(marker['index'])//nchannels 

715 span1 = int(marker.get('span1', 0))//nchannels 

716 locs.append([index1, span1]) 

717 ls = marker.get('label', 'M') 

718 cs = marker.get('comment', '') 

719 labels.append([ls, cs]) 

720 

721 fishgrid_dir = filepath 

722 if not os.path.isdir(fishgrid_dir): 

723 fishgrid_dir = os.path.dirname(filepath) 

724 path = os.path.join(fishgrid_dir, 'timestamps.dat') 

725 if not os.path.isfile(path): 

726 return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object) 

727 # get number of channels: 

728 md = metadata_fishgrid(path.replace('timestamps.dat', 'fishgrid.cfg')) 

729 grids = fishgrid_grids(md) 

730 nchannels = np.prod(grids[0]) 

731 # read timestamps: 

732 locs = [] 

733 labels = [] 

734 marker = {} 

735 with open(path, 'r') as sf: 

736 for line in sf: 

737 if len(line.strip()) == 0: 

738 add_marker() 

739 marker = {} 

740 else: 

741 words = line.split(':') 

742 if len(words) > 1: 

743 v = words[1].strip() 

744 v = v.strip('"') 

745 marker[words[0].strip().lower()] = v 

746 if len(marker) > 0: 

747 add_marker() 

748 if len(locs) > 2: 

749 return np.array(locs[1:-1]), np.array(labels[1:-1]) 

750 else: 

751 return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object) 

752 

753 

754def check_container(filepath): 

755 """Check if file is a generic container file. 

756 

757 Supported file formats are: 

758 

759 - python pickle files (.pkl) 

760 - numpy files (.npz) 

761 - matlab files (.mat) 

762 

763 Parameters 

764 ---------- 

765 filepath: str 

766 Path of the file to check. 

767  

768 Returns 

769 ------- 

770 is_container: bool 

771 `True`, if `filepath` is a supported container format. 

772 """ 

773 ext = os.path.splitext(filepath)[1] 

774 return ext.lower() in ('.pkl', '.npz', '.mat') 

775 

776 

777def extract_container_data(data_dict, datakey=None, 

778 samplekey=['rate', 'Fs', 'fs'], 

779 timekey=['time'], amplkey=['amax'], unitkey='unit', 

780 amax=1.0, unit='a.u.'): 

781 """Extract data from dictionary loaded from a container file. 

782 

783 Parameters 

784 ---------- 

785 data_dict: dict 

786 Dictionary of the data items contained in the container. 

787 datakey: None, str, or list of str 

788 Name of the variable holding the data. If `None` take the 

789 variable that is an 2D array and has the largest number of 

790 elements. 

791 samplekey: str or list of str 

792 Name of the variable holding the sampling rate. 

793 timekey: str or list of str 

794 Name of the variable holding sampling times. 

795 If no sampling rate is available, the sampling rate is retrieved 

796 from the sampling times. 

797 amplkey: str or list of str 

798 Name of the variable holding the amplitude range of the data. 

799 unitkey: str 

800 Name of the variable holding the unit of the data. 

801 amax: None or float 

802 If specified and no amplitude range has been found in `data_dict`, 

803 then this is the amplitude range of the data. 

804 unit: None or str 

805 If specified and no unit has been found in `data_dict`, 

806 then return this as the unit of the data. 

807 

808 Returns 

809 ------- 

810 data: 2-D array of floats 

811 All data traces as an 2-D numpy array, even for single channel data. 

812 First dimension is time, second is channel. 

813 rate: float 

814 Sampling rate of the data in Hz. 

815 unit: str 

816 Unit of the data. 

817 amax: float 

818 Maximum amplitude of data range in `unit`. 

819 

820 Raises 

821 ------ 

822 ValueError: 

823 Invalid key requested. 

824 """ 

825 # extract format data: 

826 if not isinstance(samplekey, (list, tuple, np.ndarray)): 

827 samplekey = (samplekey,) 

828 if not isinstance(timekey, (list, tuple, np.ndarray)): 

829 timekey = (timekey,) 

830 if not isinstance(amplkey, (list, tuple, np.ndarray)): 

831 amplkey = (amplkey,) 

832 rate = 0.0 

833 for skey in samplekey: 

834 if skey in data_dict: 

835 rate = float(data_dict[skey]) 

836 break 

837 if rate == 0.0: 

838 for tkey in timekey: 

839 if tkey in data_dict: 

840 rate = 1.0/(data_dict[tkey][1] - data_dict[tkey][0]) 

841 break 

842 if rate == 0.0: 

843 raise ValueError(f"invalid keys {', '.join(samplekey)} and {', '.join(timekey)} for requesting sampling rate or sampling times") 

844 for akey in amplkey: 

845 if akey in data_dict: 

846 amax = float(data_dict[akey]) 

847 break 

848 if unitkey in data_dict: 

849 unit = data_dict[unitkey] 

850 # get data array: 

851 raw_data = np.array([]) 

852 if datakey: 

853 # try data keys: 

854 if not isinstance(datakey, (list, tuple, np.ndarray)): 

855 datakey = (datakey,) 

856 for dkey in datakey: 

857 if dkey in data_dict: 

858 raw_data = data_dict[dkey] 

859 break 

860 if len(raw_data) == 0: 

861 raise ValueError(f"invalid key(s) {', '.join(datakey)} for requesting data") 

862 else: 

863 # find largest 2D array: 

864 for d in data_dict: 

865 if hasattr(data_dict[d], 'shape'): 

866 if 1 <= len(data_dict[d].shape) <= 2 and \ 

867 np.max(data_dict[d].shape) > np.max(raw_data.shape): 

868 raw_data = data_dict[d] 

869 if len(raw_data) == 0: 

870 raise ValueError('no data found') 

871 # make 2D: 

872 if len(raw_data.shape) == 1: 

873 raw_data = raw_data.reshape(-1, 1) 

874 # transpose if necessary: 

875 if np.argmax(raw_data.shape) > 0: 

876 raw_data = raw_data.T 

877 # recode: 

878 if raw_data.dtype == np.dtype('int16'): 

879 data = raw_data.astype('float32') 

880 data *= amax/2**15 

881 elif raw_data.dtype == np.dtype('int32'): 

882 data = raw_data.astype(float) 

883 data *= amax/2**31 

884 elif raw_data.dtype == np.dtype('int64'): 

885 data = raw_data.astype(float) 

886 data *= amax/2**63 

887 else: 

888 data = raw_data 

889 return data, rate, unit, amax 

890 

891 

892def load_container(filepath, datakey=None, 

893 samplekey=['rate', 'Fs', 'fs'], 

894 timekey=['time'], amplkey=['amax'], unitkey='unit', 

895 amax=1.0, unit='a.u.'): 

896 """Load data from a generic container file. 

897 

898 Supported file formats are: 

899 

900 - python pickle files (.pkl) 

901 - numpy files (.npz) 

902 - matlab files (.mat) 

903 

904 Parameters 

905 ---------- 

906 filepath: str 

907 Path of the file to load. 

908 datakey: None, str, or list of str 

909 Name of the variable holding the data. If `None` take the 

910 variable that is an 2D array and has the largest number of 

911 elements. 

912 samplekey: str or list of str 

913 Name of the variable holding the sampling rate. 

914 timekey: str or list of str 

915 Name of the variable holding sampling times. 

916 If no sampling rate is available, the sampling rate is retrieved 

917 from the sampling times. 

918 amplkey: str 

919 Name of the variable holding the amplitude range of the data. 

920 unitkey: str 

921 Name of the variable holding the unit of the data. 

922 If `unitkey` is not a valid key, then return `unitkey` as the `unit`. 

923 amax: None or float 

924 If specified and no amplitude range has been found in the data 

925 container, then this is the amplitude range of the data. 

926 unit: None or str 

927 If specified and no unit has been found in the data container, 

928 then return this as the unit of the data. 

929 

930 Returns 

931 ------- 

932 data: 2-D array of floats 

933 All data traces as an 2-D numpy array, even for single channel data. 

934 First dimension is time, second is channel. 

935 rate: float 

936 Sampling rate of the data in Hz. 

937 unit: str 

938 Unit of the data. 

939 amax: float 

940 Maximum amplitude of data range. 

941 

942 Raises 

943 ------ 

944 ValueError: 

945 Invalid key requested. 

946 """ 

947 # load data: 

948 data_dict = {} 

949 ext = os.path.splitext(filepath)[1] 

950 if ext == '.pkl': 

951 import pickle 

952 with open(filepath, 'rb') as f: 

953 data_dict = pickle.load(f) 

954 elif ext == '.npz': 

955 data_dict = np.load(filepath) 

956 elif ext == '.mat': 

957 from scipy.io import loadmat 

958 data_dict = loadmat(filepath, squeeze_me=True) 

959 return extract_container_data(data_dict, datakey, samplekey, 

960 timekey, amplkey, unitkey, amax, unit) 

961 

962 

963def extract_container_metadata(data_dict, metadatakey=['metadata', 'info']): 

964 """ Extract metadata from dictionary loaded from a container file. 

965 

966 Parameters 

967 ---------- 

968 data_dict: dict 

969 Dictionary of the data items contained in the container. 

970 metadatakey: str or list of str 

971 Name of the variable holding the metadata. 

972 

973 Returns 

974 ------- 

975 metadata: nested dict 

976 Nested dictionary with key-value pairs of the meta data. 

977 """ 

978 if not isinstance(metadatakey, (list, tuple, np.ndarray)): 

979 metadatakey = (metadatakey,) 

980 # get single metadata dictionary: 

981 for mkey in metadatakey: 

982 if mkey in data_dict: 

983 return data_dict[mkey] 

984 # collect all keys starting with metadatakey: 

985 metadata = {} 

986 for mkey in metadatakey: 

987 mkey += '__' 

988 for dkey in data_dict: 

989 if dkey[:len(mkey)] == mkey: 

990 v = data_dict[dkey] 

991 if hasattr(v, 'size') and v.ndim == 0: 

992 v = v.item() 

993 metadata[dkey[len(mkey):]] = v 

994 if len(metadata) > 0: 

995 return unflatten_metadata(metadata, sep='__') 

996 return metadata 

997 

998 

999def metadata_container(filepath, metadatakey=['metadata', 'info']): 

1000 """ Read meta-data of a container file. 

1001 

1002 Parameters 

1003 ---------- 

1004 filepath: str 

1005 A container file. 

1006 metadatakey: str or list of str 

1007 Name of the variable holding the metadata. 

1008 

1009 Returns 

1010 ------- 

1011 metadata: nested dict 

1012 Nested dictionary with key-value pairs of the meta data. 

1013 """ 

1014 data_dict = {} 

1015 ext = os.path.splitext(filepath)[1] 

1016 if ext == '.pkl': 

1017 import pickle 

1018 with open(filepath, 'rb') as f: 

1019 data_dict = pickle.load(f) 

1020 elif ext == '.npz': 

1021 data_dict = np.load(filepath) 

1022 elif ext == '.mat': 

1023 from scipy.io import loadmat 

1024 data_dict = loadmat(filepath, squeeze_me=True) 

1025 return extract_container_metadata(data_dict, metadatakey) 

1026 

1027 

1028def extract_container_markers(data_dict, poskey=['positions'], 

1029 spanskey=['spans'], labelskey=['labels'], 

1030 descrkey=['descriptions']): 

1031 """ Extract markers from dictionary loaded from a container file. 

1032 

1033 Parameters 

1034 ---------- 

1035 data_dict: dict 

1036 Dictionary of the data items contained in the container. 

1037 poskey: str or list of str 

1038 Name of the variable holding positions of markers. 

1039 spanskey: str or list of str 

1040 Name of the variable holding spans of markers. 

1041 labelskey: str or list of str 

1042 Name of the variable holding labels of markers. 

1043 descrkey: str or list of str 

1044 Name of the variable holding descriptions of markers. 

1045 

1046 Returns 

1047 ------- 

1048 locs: 2-D array of ints 

1049 Marker positions (first column) and spans (second column) 

1050 for each marker (rows). 

1051 labels: 2-D array of string objects 

1052 Labels (first column) and texts (second column) 

1053 for each marker (rows). 

1054 """ 

1055 if not isinstance(poskey, (list, tuple, np.ndarray)): 

1056 poskey = (poskey,) 

1057 if not isinstance(spanskey, (list, tuple, np.ndarray)): 

1058 spanskey = (spanskey,) 

1059 if not isinstance(labelskey, (list, tuple, np.ndarray)): 

1060 labelskey = (labelskey,) 

1061 if not isinstance(descrkey, (list, tuple, np.ndarray)): 

1062 descrkey = (descrkey,) 

1063 locs = np.zeros((0, 2), dtype=int) 

1064 for pkey in poskey: 

1065 if pkey in data_dict: 

1066 locs = np.zeros((len(data_dict[pkey]), 2), dtype=int) 

1067 locs[:,0] = data_dict[pkey] 

1068 break 

1069 for skey in spanskey: 

1070 if skey in data_dict: 

1071 locs[:,1] = data_dict[skey] 

1072 break 

1073 labels = np.zeros((0, 2), dtype=object) 

1074 for lkey in labelskey: 

1075 if lkey in data_dict: 

1076 labels = np.zeros((len(data_dict[lkey]), 2), dtype=object) 

1077 labels[:,0] = data_dict[lkey] 

1078 break 

1079 for dkey in descrkey: 

1080 if dkey in data_dict: 

1081 labels[:,1] = data_dict[dkey] 

1082 break 

1083 return locs, labels 

1084 

1085 

1086def markers_container(filepath, poskey=['positions'], 

1087 spanskey=['spans'], labelskey=['labels'], 

1088 descrkey=['descriptions']): 

1089 """ Read markers of a container file. 

1090 

1091 Parameters 

1092 ---------- 

1093 filepath: str 

1094 A container file. 

1095 poskey: str or list of str 

1096 Name of the variable holding positions of markers. 

1097 spanskey: str or list of str 

1098 Name of the variable holding spans of markers. 

1099 labelskey: str or list of str 

1100 Name of the variable holding labels of markers. 

1101 descrkey: str or list of str 

1102 Name of the variable holding descriptions of markers. 

1103 

1104 Returns 

1105 ------- 

1106 locs: 2-D array of ints 

1107 Marker positions (first column) and spans (second column) 

1108 for each marker (rows). 

1109 labels: 2-D array of string objects 

1110 Labels (first column) and texts (second column) 

1111 for each marker (rows). 

1112 """ 

1113 data_dict = {} 

1114 ext = os.path.splitext(filepath)[1] 

1115 if ext == '.pkl': 

1116 import pickle 

1117 with open(filepath, 'rb') as f: 

1118 data_dict = pickle.load(f) 

1119 elif ext == '.npz': 

1120 data_dict = np.load(filepath) 

1121 elif ext == '.mat': 

1122 from scipy.io import loadmat 

1123 data_dict = loadmat(filepath, squeeze_me=True) 

1124 return extract_container_markers(data_dict, poskey, spanskey, 

1125 labelskey, descrkey) 

1126 

1127 

1128def check_raw(filepath): 

1129 """Check if file is a raw file. 

1130 

1131 The following extensions are interpreted as raw files: 

1132 

1133 - raw files (*.raw) 

1134 - LabView scandata (*.scandat) 

1135 

1136 Parameters 

1137 ---------- 

1138 filepath: str 

1139 Path of the file to check. 

1140  

1141 Returns 

1142 ------- 

1143 is_raw: bool 

1144 `True`, if `filepath` is a raw format. 

1145 """ 

1146 ext = os.path.splitext(filepath)[1] 

1147 return ext.lower() in ('.raw', '.scandat', '.mat') 

1148 

1149 

1150def load_raw(filepath, rate=44000, channels=1, dtype=np.float32, 

1151 amax=1.0, unit='a.u.'): 

1152 """Load data from a raw file. 

1153 

1154 Raw files just contain the data and absolutely no metadata, not 

1155 even the smapling rate, number of channels, etc. 

1156 Supported file formats are: 

1157 

1158 - raw files (*.raw) 

1159 - LabView scandata (*.scandat) 

1160 

1161 Parameters 

1162 ---------- 

1163 filepath: str 

1164 Path of the file to load. 

1165 rate: float 

1166 Sampling rate of the data in Hertz. 

1167 channels: int 

1168 Number of channels multiplexed in the data. 

1169 dtype: str or numpy.dtype 

1170 The data type stored in the file. 

1171 amax: float 

1172 The amplitude range of the data. 

1173 unit: str 

1174 The unit of the data. 

1175 

1176 Returns 

1177 ------- 

1178 data: 2-D array of floats 

1179 All data traces as an 2-D numpy array, even for single channel data. 

1180 First dimension is time, second is channel. 

1181 rate: float 

1182 Sampling rate of the data in Hz. 

1183 unit: str 

1184 Unit of the data. 

1185 amax: float 

1186 Maximum amplitude of data range. 

1187 

1188 """ 

1189 raw_data = np.fromfile(filepath, dtype=dtype).reshape(-1, channels) 

1190 # recode: 

1191 if dtype == np.dtype('int16'): 

1192 data = raw_data.astype('float32') 

1193 data *= amax/2**15 

1194 elif dtype == np.dtype('int32'): 

1195 data = raw_data.astype(float) 

1196 data *= amax/2**31 

1197 elif dtype == np.dtype('int64'): 

1198 data = raw_data.astype(float) 

1199 data *= amax/2**63 

1200 else: 

1201 data = raw_data 

1202 return data, rate, unit, amax 

1203 

1204 

1205def load_audioio(filepath, verbose=0, gainkey=default_gain_keys, sep='.', 

1206 amax=1.0, unit='a.u.'): 

1207 """Load data from an audio file. 

1208 

1209 See the 

1210 [`load_audio()`](https://bendalab.github.io/audioio/api/audioloader.html#audioio.audioloader.load_audio) 

1211 function of the [`audioio`](https://github.com/bendalab/audioio) 

1212 package for more infos. 

1213 

1214 Parameters 

1215 ---------- 

1216 filepath: str 

1217 Path of the file to load. 

1218 verbose: int 

1219 If > 0 show detailed error/warning messages. 

1220 gainkey: str or list of str 

1221 Key in the file's metadata that holds some gain information. 

1222 If found, the data will be multiplied with the gain, 

1223 and if available, the corresponding unit is returned. 

1224 See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details. 

1225 sep: str 

1226 String that separates section names in `gainkey`. 

1227 amax: float 

1228 If specified and no gain has been found in the metadata, 

1229 then use this as the amplitude range. 

1230 unit: str 

1231 If specified and no gain has been found in the metadata, 

1232 then return this as the unit of the data. 

1233 

1234 Returns 

1235 ------- 

1236 data: 2-D array of floats 

1237 All data traces as an 2-D numpy array, even for single channel data. 

1238 First dimension is time, second is channel. 

1239 rate: float 

1240 Sampling rate of the data in Hz. 

1241 unit: str 

1242 Unit of the data if found in the metadata (see `gainkey`), 

1243 otherwise `unit`. 

1244 amax: float 

1245 Maximum amplitude of data range. 

1246 """ 

1247 # get gain: 

1248 md = metadata_audioio(filepath) 

1249 amax, unit = get_gain(md, gainkey, sep, amax, unit) 

1250 # load data: 

1251 data, rate = load_audio(filepath, verbose) 

1252 if amax != 1.0: 

1253 data *= amax 

1254 return data, rate, unit, amax 

1255 

1256 

1257data_loader_funcs = ( 

1258 ('relacs', check_relacs, load_relacs, metadata_relacs, None), 

1259 ('fishgrid', check_fishgrid, load_fishgrid, metadata_fishgrid, markers_fishgrid), 

1260 ('container', check_container, load_container, metadata_container, markers_container), 

1261 ('raw', check_raw, load_raw, None, None), 

1262 ('audioio', None, load_audioio, metadata_audioio, markers_audioio), 

1263 ) 

1264"""List of implemented load functions. 

1265 

1266Each element of the list is a tuple with the data format's name, its 

1267check and its load function. 

1268 

1269""" 

1270 

1271 

1272def load_data(filepath, verbose=0, **kwargs): 

1273 """Load time-series data from a file. 

1274 

1275 Parameters 

1276 ---------- 

1277 filepath: str 

1278 Path and name of the file to load. 

1279 verbose: int 

1280 If > 0 show detailed error/warning messages. 

1281 **kwargs: dict 

1282 Further keyword arguments that are passed on to the  

1283 format specific loading functions. 

1284 For example: 

1285 - `amax`: the amplitude range of the data. 

1286 - 'unit': the unit of the data. 

1287 

1288 Returns 

1289 ------- 

1290 data: 2-D array 

1291 All data traces as an 2-D numpy array, even for single channel data. 

1292 First dimension is time, second is channel. 

1293 rate: float 

1294 Sampling rate of the data in Hz. 

1295 unit: str 

1296 Unit of the data. 

1297 amax: float 

1298 Maximum amplitude of data range. 

1299 

1300 Raises 

1301 ------ 

1302 ValueError: 

1303 `filepath` is empty string. 

1304 """ 

1305 if len(filepath) == 0: 

1306 raise ValueError('input argument filepath is empty string.') 

1307 # load data: 

1308 for name, check_file, load_file, _, _ in data_loader_funcs: 

1309 if check_file is None or check_file(filepath): 

1310 data, rate, unit, amax = load_file(filepath, **kwargs) 

1311 if verbose > 0: 

1312 print(f'loaded {name} data from file "{filepath}"') 

1313 if verbose > 1: 

1314 print(f' sampling rate: {rate:g} Hz') 

1315 print(f' channels : {data.shape[1]}') 

1316 print(f' frames : {len(data)}') 

1317 print(f' range : {amax:g}{unit}') 

1318 return data, rate, unit, amax 

1319 return np.zeros((0, 1)), 0.0, '', 1.0 

1320 

1321 

1322def metadata(filepath, **kwargs): 

1323 """ Read meta-data from a data file. 

1324 

1325 Parameters 

1326 ---------- 

1327 filepath: str 

1328 The full path and name of the file to load. For some file 

1329 formats several files can be provided in a list. 

1330 **kwargs: dict 

1331 Further keyword arguments that are passed on to the  

1332 format specific loading functions. 

1333 

1334 Returns 

1335 ------- 

1336 meta_data: nested dict 

1337 Meta data contained in the file. Keys of the nested 

1338 dictionaries are always strings. If the corresponding 

1339 values are dictionaries, then the key is the section name 

1340 of the metadata contained in the dictionary. All other 

1341 types of values are values for the respective key. In 

1342 particular they are strings, or list of strings. But other 

1343 simple types like ints or floats are also allowed. 

1344 

1345 Raises 

1346 ------ 

1347 ValueError: 

1348 `filepath` is empty string. 

1349 """ 

1350 if len(filepath) == 0: 

1351 raise ValueError('input argument filepath is empty string.') 

1352 # load metadata: 

1353 for _, check_file, _, metadata_file, _ in data_loader_funcs: 

1354 if check_file is None or check_file(filepath): 

1355 if metadata_file is not None: 

1356 return metadata_file(filepath, **kwargs) 

1357 return {} 

1358 

1359 

1360def markers(filepath): 

1361 """ Read markers of a data file. 

1362 

1363 Parameters 

1364 ---------- 

1365 filepath: str or file handle 

1366 The data file. 

1367 

1368 Returns 

1369 ------- 

1370 locs: 2-D array of ints 

1371 Marker positions (first column) and spans (second column) 

1372 for each marker (rows). 

1373 labels: 2-D array of string objects 

1374 Labels (first column) and texts (second column) 

1375 for each marker (rows). 

1376 

1377 Raises 

1378 ------ 

1379 ValueError: 

1380 `filepath` is empty string. 

1381 """ 

1382 if len(filepath) == 0: 

1383 raise ValueError('input argument filepath is empty string.') 

1384 # load markers: 

1385 for _, check_file, _, _, markers_file in data_loader_funcs: 

1386 if check_file is None or check_file(filepath): 

1387 if markers_file is not None: 

1388 return markers_file(filepath) 

1389 return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object) 

1390 

1391 

1392class DataLoader(AudioLoader): 

1393 """Buffered reading of time-series data for random access of the data in the file. 

1394  

1395 This allows for reading very large data files that do not fit into 

1396 memory. A `DataLoader` instance can be used like a huge 

1397 read-only numpy array, i.e. 

1398 ``` 

1399 data = DataLoader('path/to/data/file.dat') 

1400 x = data[10000:20000,0] 

1401 ``` 

1402 The first index specifies the frame, the second one the channel. 

1403 

1404 `DataLoader` first determines the format of the data file and then 

1405 opens the file (first line). It then reads data from the file as 

1406 necessary for the requested data (second line). 

1407 

1408 Supported file formats are 

1409 

1410 - audio files via `audioio` package 

1411 - python pickle files 

1412 - numpy .npz files 

1413 - matlab .mat files 

1414 - relacs trace*.raw files (www.relacs.net) 

1415 - fishgrid traces-*.raw files 

1416 

1417 Reading sequentially through the file is always possible. If 

1418 previous data are requested, then the file is read from the 

1419 beginning. This might slow down access to previous data 

1420 considerably. Use the `backsize` argument to the open functions to 

1421 make sure some data are loaded before the requested frame. Then a 

1422 subsequent access to the data within `backsize` seconds before that 

1423 frame can still be handled without the need to reread the file 

1424 from the beginning. 

1425 

1426 Usage: 

1427 ------ 

1428 ``` 

1429 import thunderlab.dataloader as dl 

1430 with dl.DataLoader(filepath, 60.0, 10.0) as data: 

1431 # do something with the content of the file: 

1432 x = data[0:10000,0] 

1433 y = data[10000:20000,0] 

1434 z = x + y 

1435 ``` 

1436  

1437 Normal open and close: 

1438 ``` 

1439 data = dl.DataLoader(filepath, 60.0) 

1440 x = data[:,:] # read the whole file 

1441 data.close() 

1442 ```  

1443 that is the same as: 

1444 ``` 

1445 data = dl.DataLoader() 

1446 data.open(filepath, 60.0) 

1447 ``` 

1448  

1449 Parameters 

1450 ---------- 

1451 filepath: str 

1452 Name of the file. 

1453 buffersize: float 

1454 Size of internal buffer in seconds. 

1455 backsize: float 

1456 Part of the buffer to be loaded before the requested start index in seconds. 

1457 verbose: int 

1458 If larger than zero show detailed error/warning messages. 

1459 meta_kwargs: dict 

1460 Keyword arguments that are passed on to the _load_metadata() function. 

1461 

1462 Attributes 

1463 ---------- 

1464 rate: float 

1465 The sampling rate of the data in Hertz. 

1466 channels: int 

1467 The number of channels that are read in. 

1468 frames: int 

1469 The number of frames in the file. 

1470 format: str or None 

1471 Format of the audio file. 

1472 encoding: str or None 

1473 Encoding/subtype of the audio file. 

1474 shape: tuple 

1475 Number of frames and channels of the data. 

1476 ndim: int 

1477 Number of dimensions: always 2 (frames and channels). 

1478 unit: str 

1479 Unit of the data. 

1480 ampl_min: float 

1481 Minimum amplitude the file format supports. 

1482 ampl_max: float 

1483 Maximum amplitude the file format supports. 

1484 

1485 Methods 

1486 ------- 

1487 

1488 - `len()`: the number of frames 

1489 - `open()`: open a data file. 

1490 - `open_*()`: open a data file of a specific format. 

1491 - `close()`: close the file. 

1492 - `metadata()`: metadata of the file. 

1493 - `markers()`: markers of the file. 

1494 - `set_unwrap()`: Set parameters for unwrapping clipped data. 

1495 

1496 """ 

1497 

1498 def __init__(self, filepath=None, buffersize=10.0, backsize=0.0, 

1499 verbose=0, **meta_kwargs): 

1500 super().__init__(None, buffersize, backsize, 

1501 verbose, **meta_kwargs) 

1502 if filepath is not None: 

1503 self.open(filepath, buffersize, backsize, verbose, **meta_kwargs) 

1504 

1505 def __getitem__(self, key): 

1506 return super(DataLoader, self).__getitem__(key) 

1507 

1508 def __next__(self): 

1509 return super(DataLoader, self).__next__() 

1510 

1511 

1512 # relacs interface:  

1513 def open_relacs(self, filepath, buffersize=10.0, backsize=0.0, 

1514 verbose=0, amax=1.0): 

1515 """Open relacs data files (www.relacs.net) for reading. 

1516 

1517 Parameters 

1518 ---------- 

1519 filepath: str 

1520 Path to a relacs data directory or a file therein. 

1521 buffersize: float 

1522 Size of internal buffer in seconds. 

1523 backsize: float 

1524 Part of the buffer to be loaded before the requested start index in seconds. 

1525 verbose: int 

1526 If > 0 show detailed error/warning messages. 

1527 amax: float 

1528 The amplitude range of the data. 

1529 

1530 Raises 

1531 ------ 

1532 ValueError: .gz files not supported. 

1533 """ 

1534 self.verbose = verbose 

1535 

1536 if self.sf is not None: 

1537 self._close_relacs() 

1538 

1539 trace_filepaths = relacs_trace_files(filepath) 

1540 

1541 # open trace files: 

1542 self.sf = [] 

1543 self.frames = None 

1544 self.rate = None 

1545 self.unit = '' 

1546 self.filepath = None 

1547 if len(trace_filepaths) > 0: 

1548 self.filepath = os.path.dirname(trace_filepaths[0]) 

1549 for path in sorted(trace_filepaths): 

1550 if path[-3:] == '.gz': 

1551 raise ValueError('.gz files not supported') 

1552 sf = open(path, 'rb') 

1553 self.sf.append(sf) 

1554 if verbose > 0: 

1555 print(f'open_relacs(filepath) with filepath={path}') 

1556 # file size: 

1557 sf.seek(0, os.SEEK_END) 

1558 frames = sf.tell()//4 

1559 if self.frames is None: 

1560 self.frames = frames 

1561 elif self.frames != frames: 

1562 diff = self.frames - frames 

1563 if diff > 1 or diff < -2: 

1564 raise ValueError('number of frames of traces differ') 

1565 elif diff >= 0: 

1566 self.frames = frames 

1567 sf.seek(0) 

1568 # retrieve sampling rate and unit: 

1569 rate, us = relacs_samplerate_unit(path) 

1570 if self.rate is None: 

1571 self.rate = rate 

1572 elif rate != self.rate: 

1573 raise ValueError('sampling rates of traces differ') 

1574 if len(self.unit) == 0: 

1575 self.unit = us 

1576 elif us != self.unit: 

1577 raise ValueError('unit of traces differ') 

1578 self.channels = len(self.sf) 

1579 self.shape = (self.frames, self.channels) 

1580 self.size = self.frames * self.channels 

1581 self.ndim = len(self.shape) 

1582 self.format = 'RELACS' 

1583 self.encoding = 'FLOAT' 

1584 self.bufferframes = int(buffersize*self.rate) 

1585 self.backframes = int(backsize*self.rate) 

1586 self.init_buffer() 

1587 self.offset = 0 

1588 self.close = self._close_relacs 

1589 self.load_audio_buffer = self._load_buffer_relacs 

1590 self.ampl_min = -amax 

1591 self.ampl_max = +amax 

1592 self._load_metadata = self._metadata_relacs 

1593 # TODO: load markers: 

1594 self._locs = np.zeros((0, 2), dtype=int) 

1595 self._labels = np.zeros((0, 2), dtype=object) 

1596 self._load_markers = None 

1597 return self 

1598 

1599 def _close_relacs(self): 

1600 """Close the relacs data files. 

1601 """ 

1602 if self.sf is not None: 

1603 for file in self.sf: 

1604 file.close() 

1605 self.sf = None 

1606 

1607 def _load_buffer_relacs(self, r_offset, r_size, buffer): 

1608 """Load new data from relacs data file. 

1609 

1610 Parameters 

1611 ---------- 

1612 r_offset: int 

1613 First frame to be read from file. 

1614 r_size: int 

1615 Number of frames to be read from file. 

1616 buffer: ndarray 

1617 Buffer where to store the loaded data. 

1618 """ 

1619 for i, file in enumerate(self.sf): 

1620 file.seek(r_offset*4) 

1621 data = file.read(r_size*4) 

1622 buffer[:, i] = np.frombuffer(data, dtype=np.float32) 

1623 

1624 

1625 def _metadata_relacs(self, store_empty=False, first_only=False): 

1626 """ Load meta-data of a relacs data set. 

1627 """ 

1628 info_path = os.path.join(self.filepath, 'info.dat') 

1629 if not os.path.exists(info_path): 

1630 return {} 

1631 return relacs_header(info_path, store_empty, first_only) 

1632 

1633 

1634 # fishgrid interface:  

1635 def open_fishgrid(self, filepath, buffersize=10.0, backsize=0.0, 

1636 verbose=0): 

1637 """Open fishgrid data files (https://github.com/bendalab/fishgrid) for reading. 

1638 

1639 Parameters 

1640 ---------- 

1641 filepath: str 

1642 Path to a fishgrid data directory, or a file therein. 

1643 buffersize: float 

1644 Size of internal buffer in seconds. 

1645 backsize: float 

1646 Part of the buffer to be loaded before the requested start index in seconds. 

1647 verbose: int 

1648 If > 0 show detailed error/warning messages. 

1649 """ 

1650 self.verbose = verbose 

1651 

1652 if self.sf is not None: 

1653 self._close_fishgrid() 

1654 

1655 trace_filepaths = fishgrid_trace_files(filepath) 

1656 self.filepath = None 

1657 if len(trace_filepaths) > 0: 

1658 self.filepath = os.path.dirname(trace_filepaths[0]) 

1659 self._load_metadata = metadata_fishgrid 

1660 self._load_markers = markers_fishgrid 

1661 

1662 # open grid files: 

1663 grids = fishgrid_grids(self.metadata()) 

1664 grid_sizes = [r*c for r,c in grids] 

1665 self.channels = 0 

1666 for g, path in enumerate(trace_filepaths): 

1667 self.channels += grid_sizes[g] 

1668 self.sf = [] 

1669 self.grid_channels = [] 

1670 self.grid_offs = [] 

1671 offs = 0 

1672 self.frames = None 

1673 self.rate = get_number(self.metadata(), 'Hz', 'AISampleRate') 

1674 v, self.unit = get_number_unit(self.metadata(), 'AIMaxVolt') 

1675 if v is not None: 

1676 self.ampl_min = -v 

1677 self.ampl_max = +v 

1678 

1679 for g, path in enumerate(trace_filepaths): 

1680 sf = open(path, 'rb') 

1681 self.sf.append(sf) 

1682 if verbose > 0: 

1683 print(f'open_fishgrid(filepath) with filepath={path}') 

1684 # grid channels: 

1685 self.grid_channels.append(grid_sizes[g]) 

1686 self.grid_offs.append(offs) 

1687 offs += grid_sizes[g] 

1688 # file size: 

1689 sf.seek(0, os.SEEK_END) 

1690 frames = sf.tell()//4//grid_sizes[g] 

1691 if self.frames is None: 

1692 self.frames = frames 

1693 elif self.frames != frames: 

1694 diff = self.frames - frames 

1695 if diff > 1 or diff < -2: 

1696 raise ValueError('number of frames of traces differ') 

1697 elif diff >= 0: 

1698 self.frames = frames 

1699 sf.seek(0) 

1700 self.shape = (self.frames, self.channels) 

1701 self.size = self.frames * self.channels 

1702 self.ndim = len(self.shape) 

1703 self.format = 'FISHGRID' 

1704 self.encoding = 'FLOAT' 

1705 self.bufferframes = int(buffersize*self.rate) 

1706 self.backframes = int(backsize*self.rate) 

1707 self.init_buffer() 

1708 self.offset = 0 

1709 self.close = self._close_fishgrid 

1710 self.load_audio_buffer = self._load_buffer_fishgrid 

1711 return self 

1712 

1713 def _close_fishgrid(self): 

1714 """Close the fishgrid data files. 

1715 """ 

1716 if self.sf is not None: 

1717 for file in self.sf: 

1718 file.close() 

1719 self.sf = None 

1720 

1721 def _load_buffer_fishgrid(self, r_offset, r_size, buffer): 

1722 """Load new data from relacs data file. 

1723 

1724 Parameters 

1725 ---------- 

1726 r_offset: int 

1727 First frame to be read from file. 

1728 r_size: int 

1729 Number of frames to be read from file. 

1730 buffer: ndarray 

1731 Buffer where to store the loaded data. 

1732 """ 

1733 for file, gchannels, goffset in zip(self.sf, self.grid_channels, self.grid_offs): 

1734 file.seek(r_offset*4*gchannels) 

1735 data = file.read(r_size*4*gchannels) 

1736 buffer[:, goffset:goffset+gchannels] = np.frombuffer(data, dtype=np.float32).reshape((-1, gchannels)) 

1737 

1738 

1739 # container interface: 

1740 def open_container(self, filepath, buffersize=10.0, 

1741 backsize=0.0, verbose=0, datakey=None, 

1742 samplekey=['rate', 'Fs', 'fs'], 

1743 timekey=['time'], amplkey=['amax'], unitkey='unit', 

1744 metadatakey=['metadata', 'info'], 

1745 poskey=['positions'], 

1746 spanskey=['spans'], labelskey=['labels'], 

1747 descrkey=['descriptions'], 

1748 amax=1.0, unit='a.u.'): 

1749 """Open generic container file. 

1750 

1751 Supported file formats are: 

1752 

1753 - python pickle files (.pkl) 

1754 - numpy files (.npz) 

1755 - matlab files (.mat) 

1756 

1757 Parameters 

1758 ---------- 

1759 filepath: str 

1760 Path to a container file. 

1761 buffersize: float 

1762 Size of internal buffer in seconds. 

1763 backsize: float 

1764 Part of the buffer to be loaded before the requested start index in seconds. 

1765 verbose: int 

1766 If > 0 show detailed error/warning messages. 

1767 datakey: None, str, or list of str 

1768 Name of the variable holding the data. If `None` take the 

1769 variable that is an 2D array and has the largest number of 

1770 elements. 

1771 samplekey: str or list of str 

1772 Name of the variable holding the sampling rate. 

1773 timekey: str or list of str 

1774 Name of the variable holding sampling times. 

1775 If no sampling rate is available, the sampling rate is retrieved 

1776 from the sampling times. 

1777 amplkey: str or list of str 

1778 Name of the variable holding the amplitude range of the data. 

1779 unitkey: str 

1780 Name of the variable holding the unit of the data. 

1781 metadatakey: str or list of str 

1782 Name of the variable holding the metadata. 

1783 poskey: str or list of str 

1784 Name of the variable holding positions of markers. 

1785 spanskey: str or list of str 

1786 Name of the variable holding spans of markers. 

1787 labelskey: str or list of str 

1788 Name of the variable holding labels of markers. 

1789 descrkey: str or list of str 

1790 Name of the variable holding descriptions of markers. 

1791 amax: None or float 

1792 If specified and no amplitude range has been found in the data 

1793 container, then this is the amplitude range of the data. 

1794 unit: None or str 

1795 If specified and no unit has been found in the data container, 

1796 then return this as the unit of the data. 

1797 

1798 Raises 

1799 ------ 

1800 ValueError: 

1801 Invalid key requested. 

1802 """ 

1803 self.verbose = verbose 

1804 data_dict = {} 

1805 ext = os.path.splitext(filepath)[1] 

1806 if ext == '.pkl': 

1807 import pickle 

1808 with open(filepath, 'rb') as f: 

1809 data_dict = pickle.load(f) 

1810 self.format = 'PKL' 

1811 elif ext == '.npz': 

1812 data_dict = np.load(filepath) 

1813 self.format = 'NPZ' 

1814 elif ext == '.mat': 

1815 from scipy.io import loadmat 

1816 data_dict = loadmat(filepath, squeeze_me=True) 

1817 self.format = 'MAT' 

1818 self.buffer, self.rate, self.unit, amax = \ 

1819 extract_container_data(data_dict, datakey, samplekey, 

1820 timekey, amplkey, unitkey, amax, unit) 

1821 self.filepath = filepath 

1822 self.channels = self.buffer.shape[1] 

1823 self.frames = self.buffer.shape[0] 

1824 self.shape = self.buffer.shape 

1825 self.ndim = self.buffer.ndim 

1826 self.size = self.buffer.size 

1827 self.encoding = self.numpy_encodings[self.buffer.dtype] 

1828 self.ampl_min = -amax 

1829 self.ampl_max = +amax 

1830 self.offset = 0 

1831 self.buffer_changed = np.zeros(self.channels, dtype=bool) 

1832 self.bufferframes = self.frames 

1833 self.backsize = 0 

1834 self.close = self._close_container 

1835 self.load_audio_buffer = self._load_buffer_container 

1836 self._metadata = extract_container_metadata(data_dict, metadatakey) 

1837 self._load_metadata = None 

1838 self._locs, self._labels = extract_container_markers(data_dict, 

1839 poskey, 

1840 spanskey, 

1841 labelskey, 

1842 descrkey) 

1843 self._load_markers = None 

1844 

1845 def _close_container(self): 

1846 """Close container. """ 

1847 pass 

1848 

1849 def _load_buffer_container(self, r_offset, r_size, buffer): 

1850 """Load new data from container.""" 

1851 buffer[:, :] = self.buffer[r_offset:r_offset + r_size, :] 

1852 

1853 

1854 # raw data interface: 

1855 def open_raw(self, filepath, buffersize=10.0, backsize=0.0, 

1856 verbose=0, rate=44000, channels=1, dtype=np.float32, 

1857 amax=1.0, unit='a.u.'): 

1858 """Load data from a raw file. 

1859 

1860 Raw files just contain the data and absolutely no metadata, not 

1861 even the smapling rate, number of channels, etc. 

1862 Supported file formats are: 

1863 

1864 - raw files (*.raw) 

1865 - LabView scandata (*.scandat) 

1866 

1867 Parameters 

1868 ---------- 

1869 filepath: str 

1870 Path of the file to load. 

1871 buffersize: float 

1872 Size of internal buffer in seconds. 

1873 backsize: float 

1874 Part of the buffer to be loaded before the requested start index in seconds. 

1875 verbose: int 

1876 If > 0 show detailed error/warning messages. 

1877 rate: float 

1878 Sampling rate of the data in Hertz. 

1879 channels: int 

1880 Number of channels multiplexed in the data. 

1881 dtype: str or numpy.dtype 

1882 The data type stored in the file. 

1883 amax: float 

1884 The amplitude range of the data. 

1885 unit: str 

1886 The unit of the data. 

1887 """ 

1888 self.verbose = verbose 

1889 self.filepath = filepath 

1890 self.sf = open(filepath, 'rb') 

1891 if verbose > 0: 

1892 print(f'open_raw(filepath) with filepath={filepath}') 

1893 self.dtype = np.dtype(dtype) 

1894 self.rate = float(rate) 

1895 # file size: 

1896 self.sf.seek(0, os.SEEK_END) 

1897 self.frames = self.sf.tell()//self.dtype.itemsize 

1898 self.sf.seek(0) 

1899 self.channels = int(channels) 

1900 self.shape = (self.frames, self.channels) 

1901 self.ndim = len(self.shape) 

1902 self.size = self.frames*self.channels 

1903 self.format = 'RAW' 

1904 self.encoding = self.numpy_encodings.get(self.dtype, 'UNKNOWN') 

1905 self.unit = unit 

1906 self.ampl_max = float(amax) 

1907 self.ampl_min = -self.ampl_max 

1908 self.offset = 0 

1909 self.bufferframes = int(buffersize*self.rate) 

1910 self.backframes = int(backsize*self.rate) 

1911 self.init_buffer() 

1912 self.close = self._close_raw 

1913 self.load_audio_buffer = self._load_buffer_raw 

1914 self._metadata = None 

1915 self._load_metadata = None 

1916 self._locs = None 

1917 self._labels = None 

1918 self._load_markers = None 

1919 

1920 def _close_raw(self): 

1921 """Close raw file. """ 

1922 self.sf.close() 

1923 self.sf = None 

1924 

1925 def _load_buffer_raw(self, r_offset, r_size, buffer): 

1926 """Load new data from container.""" 

1927 self.sf.seek(r_offset*self.dtype.itemsize) 

1928 raw_data = self.sf.read(r_size*self.dtype.itemsize) 

1929 raw_data = np.frombuffer(raw_data, dtype=self.dtype) 

1930 raw_data = raw_data.reshape(-1, self.channels) 

1931 # recode: 

1932 if self.dtype == np.dtype('int16'): 

1933 data = raw_data.astype('float32') 

1934 data *= self.ampl_max/2**15 

1935 elif self.dtype == np.dtype('int32'): 

1936 data = raw_data.astype(float) 

1937 data *= self.ampl_max/2**31 

1938 elif self.dtype == np.dtype('int64'): 

1939 data = raw_data.astype(float) 

1940 data *= self.ampl_max/2**63 

1941 else: 

1942 data = raw_data 

1943 buffer[:, :] = data 

1944 

1945 

1946 # audioio interface:  

1947 def open_audioio(self, filepath, buffersize=10.0, backsize=0.0, 

1948 verbose=0, gainkey=default_gain_keys, sep='.', 

1949 amax=None, unit='a.u.'): 

1950 """Open an audio file. 

1951 

1952 See the [audioio](https://github.com/bendalab/audioio) package 

1953 for details. 

1954 

1955 Parameters 

1956 ---------- 

1957 filepath: str 

1958 Path to an audio file. 

1959 buffersize: float 

1960 Size of internal buffer in seconds. 

1961 backsize: float 

1962 Part of the buffer to be loaded before the requested start index 

1963 in seconds. 

1964 verbose: int 

1965 If > 0 show detailed error/warning messages. 

1966 gainkey: str or list of str 

1967 Key in the file's metadata that holds some gain information. 

1968 If found, the data will be multiplied with the gain, 

1969 and if available, the corresponding unit is returned. 

1970 See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details. 

1971 sep: str 

1972 String that separates section names in `gainkey`. 

1973 amax: None or float 

1974 If specified and no gain has been found in the metadata, 

1975 then use this as the amplitude range. 

1976 unit: None or str 

1977 If specified and no gain has been found in the metadata, 

1978 then this is the unit of the data. 

1979 

1980 """ 

1981 self.verbose = verbose 

1982 super(DataLoader, self).open(filepath, buffersize, backsize, verbose) 

1983 md = self.metadata() 

1984 fac, unit = get_gain(md, gainkey, sep, amax, unit) 

1985 if fac is None: 

1986 self.gain_fac = 1.0 

1987 else: 

1988 self.gain_fac = fac 

1989 self._load_buffer_audio_org = self.load_audio_buffer 

1990 self.load_audio_buffer = self._load_buffer_audioio 

1991 self.ampl_min *= self.gain_fac 

1992 self.ampl_max *= self.gain_fac 

1993 self.unit = unit 

1994 return self 

1995 

1996 def _load_buffer_audioio(self, r_offset, r_size, buffer): 

1997 """Load and scale new data from an audio file. 

1998 

1999 Parameters 

2000 ---------- 

2001 r_offset: int 

2002 First frame to be read from file. 

2003 r_size: int 

2004 Number of frames to be read from file. 

2005 buffer: ndarray 

2006 Buffer where to store the loaded data. 

2007 """ 

2008 self._load_buffer_audio_org(r_offset, r_size, buffer) 

2009 buffer *= self.gain_fac 

2010 

2011 

2012 # open multiple files as one: 

2013 def open_multiple(self, filepaths, buffersize=10.0, backsize=0.0, 

2014 verbose=0): 

2015 """Open multiple files as a single concatenated array. 

2016 

2017 Parameters 

2018 ---------- 

2019 filepaths: list of str 

2020 List of file names of audio files. 

2021 buffersize: float 

2022 Size of internal buffer in seconds. 

2023 backsize: float 

2024 Part of the buffer to be loaded before the requested start index in seconds. 

2025 verbose: int 

2026 If larger than zero show detailed error/warning messages. 

2027 

2028 Raises 

2029 ------ 

2030 TypeError 

2031 `filepaths` must be a sequence. 

2032 ValueError 

2033 Empty `filepaths`. 

2034 FileNotFoundError 

2035 `filepaths` does not contain a single valid file. 

2036 

2037 """ 

2038 if not isinstance(filepaths, (list, tuple, np.ndarray)): 

2039 raise TypeError('input argument filepaths is not a sequence!') 

2040 if len(filepaths) == 0: 

2041 raise ValueError('input argument filepaths is empy sequence!') 

2042 self.data_files = [] 

2043 self.start_indices = [] 

2044 for filepath in filepaths: 

2045 try: 

2046 a = DataLoader(filepath, buffersize, backsize, verbose) 

2047 self.data_files. append(a) 

2048 except Exception as e: 

2049 if verbose > 0: 

2050 print(e) 

2051 if len(self.data_files) == 0: 

2052 raise FileNotFoundError('input argument filepaths does not contain any valid audio file!') 

2053 # check contingency and set start indices: 

2054 a0 = self.data_files[0] 

2055 self.filepath = a0.filepath 

2056 self.format = a0.format 

2057 self.encoding = a0.encoding 

2058 self.rate = a0.rate 

2059 self.channels = a0.channels 

2060 self.unit = a0.unit 

2061 self.ampl_max = a0.ampl_max 

2062 self.ampl_min = a0.ampl_min 

2063 self.frames = 0 

2064 self.start_indices = [] 

2065 self.end_indices = [] 

2066 md = a0.metadata() 

2067 start_time = get_datetime(md) 

2068 self._metadata = {} 

2069 self._locs = np.zeros((0, 2), dtype=int) 

2070 self._labels = np.zeros((0, 2), dtype=object) 

2071 for a in self.data_files: 

2072 if a.channels != self.channels: 

2073 raise ValueError(f'number of channels differs: ' 

2074 f'{a.channels} in {a.filepath} versus ' 

2075 f'{self.channels} in {self.filepath}') 

2076 if a.rate != self.rate: 

2077 raise ValueError(f'sampling rates differ: ' 

2078 f'{a.rate} in {a.filepath} versus ' 

2079 f'{self.rate} in {self.filepath}') 

2080 if a.ampl_min != self.ampl_min: 

2081 raise ValueError(f'minimum amplitudes differ: ' 

2082 f'{a.ampl_min} in {a.filepath} versus ' 

2083 f'{self.ampl_min} in {self.filepath}') 

2084 if a.ampl_max != self.ampl_max: 

2085 raise ValueError(f'maximum amplitudes differ: ' 

2086 f'{a.ampl_max} in {a.filepath} versus ' 

2087 f'{self.ampl_max} in {self.filepath}') 

2088 # metadata: 

2089 md = a.metadata() 

2090 fmd = flatten_metadata(md, True) 

2091 add_metadata(self._metadata, fmd) 

2092 # check start time of recording: 

2093 stime = get_datetime(md) 

2094 if start_time is not None and stime is not None and \ 

2095 abs(start_time - stime) > timedelta(seconds=1): 

2096 raise ValueError(f'start time does not indicate continuous recording: ' 

2097 f'expected {start_time} instead of ' 

2098 f'{stime} in {a.filepath}') 

2099 # markers: 

2100 locs, labels = a.markers() 

2101 locs[:,0] += self.frames 

2102 self._locs = np.vstack((self._locs, locs)) 

2103 self._labels = np.vstack((self._labels, labels)) 

2104 # indices: 

2105 self.start_indices.append(self.frames) 

2106 self.frames += a.frames 

2107 self.end_indices.append(self.frames) 

2108 start_time += timedelta(seconds=a.frames/a.rate) 

2109 self.start_indices = np.array(self.start_indices) 

2110 self.end_indices = np.array(self.end_indices) 

2111 # set startime from first file: 

2112 start_time = get_datetime(a0.metadata()) 

2113 set_starttime(self._metadata, start_time) 

2114 # setup infrastructure: 

2115 self.shape = (self.frames, self.channels) 

2116 self.bufferframes = int(buffersize*self.rate) 

2117 self.backframes = int(backsize*self.rate) 

2118 self.init_buffer() 

2119 self.close = self._close_multiple 

2120 self.load_audio_buffer = self._load_buffer_multiple 

2121 return self 

2122 

2123 def _close_multiple(self): 

2124 """Close all the data files. """ 

2125 for a in self.data_files: 

2126 a.close() 

2127 self.data_files = [] 

2128 self.start_indices = [] 

2129 self.end_indices = [] 

2130 

2131 def _load_buffer_multiple(self, r_offset, r_size, buffer): 

2132 """Load new data from the underlying files. 

2133 

2134 Parameters 

2135 ---------- 

2136 r_offset: int 

2137 First frame to be read from file. 

2138 r_size: int 

2139 Number of frames to be read from file. 

2140 buffer: ndarray 

2141 Buffer where to store the loaded data. 

2142 """ 

2143 offs = r_offset 

2144 size = r_size 

2145 boffs = 0 

2146 ai = np.searchsorted(self.end_indices, offs, side='right') 

2147 while size > 0: 

2148 ai0 = offs - self.start_indices[ai] 

2149 ai1 = offs + size 

2150 if ai1 > self.end_indices[ai]: 

2151 ai1 = self.end_indices[ai] 

2152 ai1 -= self.start_indices[ai] 

2153 n = ai1 - ai0 

2154 self.data_files[ai].load_audio_buffer(ai0, n, 

2155 buffer[boffs:boffs + n,:]) 

2156 boffs += n 

2157 offs += n 

2158 size -= n 

2159 ai += 1 

2160 

2161 

2162 def open(self, filepath, buffersize=10.0, backsize=0.0, 

2163 verbose=0, **kwargs): 

2164 """Open file with time-series data for reading. 

2165 

2166 Parameters 

2167 ---------- 

2168 filepath: str or list of str 

2169 Name of the file or list of many file names that should be 

2170 made accessible as a single array. 

2171 buffersize: float 

2172 Size of internal buffer in seconds. 

2173 backsize: float 

2174 Part of the buffer to be loaded before the requested start index 

2175 in seconds. 

2176 verbose: int 

2177 If > 0 show detailed error/warning messages. 

2178 **kwargs: dict 

2179 Further keyword arguments that are passed on to the  

2180 format specific opening functions. 

2181 For example: 

2182 - `amax`: the amplitude range of the data. 

2183 - 'unit': the unit of the data. 

2184 

2185 Raises 

2186 ------ 

2187 ValueError: 

2188 `filepath` is empty string. 

2189 """ 

2190 # list of implemented open functions: 

2191 data_open_funcs = ( 

2192 ('relacs', check_relacs, self.open_relacs, 1), 

2193 ('fishgrid', check_fishgrid, self.open_fishgrid, 1), 

2194 ('container', check_container, self.open_container, 1), 

2195 ('raw', check_raw, self.open_raw, 1), 

2196 ('audioio', None, self.open_audioio, 0), 

2197 ) 

2198 

2199 self.buffer = np.array([]) 

2200 self.rate = 0.0 

2201 if not filepath: 

2202 raise ValueError('input argument filepath is empty string.') 

2203 if isinstance(filepath, (list, tuple, np.ndarray)): 

2204 return self.open_multiple(filepath, buffersize, backsize, verbose) 

2205 # open data: 

2206 for name, check_file, open_file, v in data_open_funcs: 

2207 if check_file is None or check_file(filepath): 

2208 open_file(filepath, buffersize, backsize, verbose, **kwargs) 

2209 if v*verbose > 1: 

2210 if self.format is not None: 

2211 print(f' format : {self.format}') 

2212 if self.encoding is not None: 

2213 print(f' encoding : {self.encoding}') 

2214 print(f' sampling rate: {self.rate} Hz') 

2215 print(f' channels : {self.channels}') 

2216 print(f' frames : {self.frames}') 

2217 print(f' range : {self.ampl_max:g}{self.unit}') 

2218 break 

2219 return self 

2220 

2221 

2222def demo(filepath, plot=False): 

2223 print("try load_data:") 

2224 data, rate, unit, amax = load_data(filepath, verbose=2) 

2225 if plot: 

2226 fig, ax = plt.subplots() 

2227 time = np.arange(len(data))/rate 

2228 for c in range(data.shape[1]): 

2229 ax.plot(time, data[:,c]) 

2230 ax.set_xlabel('Time [s]') 

2231 ax.set_ylabel(f'[{unit}]') 

2232 if amax is not None and np.isfinite(amax): 

2233 ax.set_ylim(-amax, +amax) 

2234 plt.show() 

2235 return 

2236 

2237 print('') 

2238 print("try DataLoader:") 

2239 with DataLoader(filepath, 2.0, 1.0, 1) as data: 

2240 print('sampling rate: %g' % data.rate) 

2241 print('frames : %d %d' % (len(data), data.shape[0])) 

2242 nframes = int(1.0 * data.rate) 

2243 # forward: 

2244 for i in range(0, len(data), nframes): 

2245 print('forward %d-%d' % (i, i + nframes)) 

2246 x = data[i:i + nframes, 0] 

2247 if plot: 

2248 fig, ax = plt.subplots() 

2249 ax.plot((i + np.arange(len(x)))/data.rate, x) 

2250 ax.set_xlabel('Time [s]') 

2251 ax.set_ylabel(f'[{data.unit}]') 

2252 plt.show() 

2253 # and backwards: 

2254 for i in reversed(range(0, len(data), nframes)): 

2255 print('backward %d-%d' % (i, i + nframes)) 

2256 x = data[i:i + nframes, 0] 

2257 if plot: 

2258 fig, ax = plt.subplots() 

2259 ax.plot((i + np.arange(len(x)))/data.rate, x) 

2260 ax.set_xlabel('Time [s]') 

2261 ax.set_ylabel(f'[{data.unit}]') 

2262 plt.show() 

2263 

2264 

2265def main(*cargs): 

2266 """Call demo with command line arguments. 

2267 

2268 Parameters 

2269 ---------- 

2270 cargs: list of str 

2271 Command line arguments as provided by sys.argv[1:] 

2272 """ 

2273 import argparse 

2274 parser = argparse.ArgumentParser(description= 

2275 'Checking thunderlab.dataloader module.') 

2276 parser.add_argument('-p', dest='plot', action='store_true', 

2277 help='plot loaded data') 

2278 parser.add_argument('file', nargs=1, default='', type=str, 

2279 help='name of data file') 

2280 args = parser.parse_args(cargs) 

2281 demo(args.file[0], args.plot) 

2282 

2283 

2284if __name__ == "__main__": 

2285 main(*sys.argv[1:])