Coverage for src/thunderlab/dataloader.py: 76%
885 statements
« prev ^ index » next coverage.py v7.6.2, created at 2024-10-09 16:02 +0000
« prev ^ index » next coverage.py v7.6.2, created at 2024-10-09 16:02 +0000
1"""Load time-series data from files.
3```
4data, rate, unit, amax = load_data('data/file.wav')
5```
7The function `load_data()` loads the whole time-series from the file
8as a numpy array of floats. First dimension is frames, second is
9channels. In contrast to the `audioio.load_audio()` function, the
10values of the data array are not restricted between -1 and 1. They can
11assume any value wihin the range `-amax` to `+amax` with the returned
12`unit`.
14```
15data = DataLoader('data/file.wav', 60.0)
16```
17or
18```
19with DataLoader('data/file.wav', 60.0) as data:
20```
21 Create a `DataLoader` object that loads chunks of 60 seconds long data
22on demand. `data` can be used like a read-only numpy array of floats.
25## Supported file formats
27- python pickle files
28- numpy .npz files
29- matlab .mat files
30- audio files via [`audioio`](https://github.com/bendalab/audioio) package
31- LabView .scandat files
32- relacs trace*.raw files (https://www.relacs.net)
33- fishgrid traces-*.raw files (https://github.com/bendalab/fishgrid)
36## Metadata
38Many file formats allow to store metadata that further describe the
39stored time series data. We handle them as nested dictionary of key-value
40pairs. Load them with the `metadata()` function:
41```
42metadata = metadata('data/file.mat')
43```
45## Markers
47Some file formats also allow to store markers that mark specific
48positions in the time series data. Load marker positions and spans (in
49the 2-D array `locs`) and label and text strings (in the 2-D array
50`labels`) with the `markers()` function:
51```
52locs, labels = markers('data.wav')
53```
55 ## Additional, format-specific functions
57- `relacs_samplerate_unit()`: retrieve sampling rate and unit from a relacs stimuli.dat file.
58- `relacs_header()`: read key-value pairs from relacs *.dat file headers.
59- `fishgrid_grids()`: retrieve grid sizes from a fishgrid.cfg file.
60- `fishgrid_spacings()`: spacing between grid electrodes.
62"""
64import os
65import sys
66import glob
67import gzip
68import numpy as np
69try:
70 import matplotlib.pyplot as plt
71except ImportError:
72 pass
73from audioio import load_audio, AudioLoader, unflatten_metadata
74from audioio import get_number_unit, get_number, get_int, get_bool, get_gain
75from audioio import default_starttime_keys, default_gain_keys
76from audioio import metadata as metadata_audioio
77from audioio import markers as markers_audioio
def relacs_samplerate_unit(filepath, channel=0):
    """Retrieve sampling rate and unit from a relacs stimuli.dat file.

    Parameters
    ----------
    filepath: str
        Path to a relacs data directory, or a file in a relacs data directory.
    channel: int
        Channel (trace) number, if `filepath` does not specify a
        trace-*.raw file.

    Returns
    -------
    samplerate: float
        Sampling rate in Hertz
    unit: str
        Unit of the trace, can be empty if not found

    Raises
    ------
    IOError/FileNotFoundError:
        If the stimuli.dat file does not exist.
    ValueError:
        stimuli.dat file does not contain sampling rate.
    """
    trace = channel + 1
    relacs_dir = filepath
    # check for relacs data directory:
    if not os.path.isdir(filepath):
        relacs_dir = os.path.dirname(filepath)
        bn = os.path.basename(filepath).lower()
        i = bn.find('.raw')
        # take the trace number from a trace-*.raw file name:
        if len(bn) > 5 and bn[0:5] == 'trace' and i > 6:
            trace = int(bn[6:i])

    # retrieve sampling rate and unit from stimuli.dat file:
    samplerate = None
    sampleinterval = None
    unit = ""

    lines = []
    stimuli_file = os.path.join(relacs_dir, 'stimuli.dat')
    if os.path.isfile(stimuli_file + '.gz'):
        stimuli_file += '.gz'
    # BUGFIX: gzip.open() with mode 'r' opens in *binary* mode and then
    # rejects the encoding argument with a ValueError. Text mode 'rt' is
    # required; it also works for the plain open() used otherwise:
    open_file = gzip.open if stimuli_file[-3:] == '.gz' else open
    with open_file(stimuli_file, 'rt', encoding='latin-1') as sf:
        # collect the header comment lines only:
        for line in sf:
            line = line.strip()
            if len(line) == 0 or line[0] != '#':
                break
            lines.append(line)

    for line in lines:
        if "unit%d" % trace in line:
            unit = line.split(':')[1].strip()
        if "sampling rate%d" % trace in line:
            value = line.split(':')[1].strip()
            samplerate = float(value.replace('Hz', ''))
        elif "sample interval%d" % trace in line:
            value = line.split(':')[1].strip()
            sampleinterval = float(value.replace('ms', ''))

    if samplerate is not None:
        return samplerate, unit
    if sampleinterval is not None:
        # sample interval is stored in milliseconds:
        return 1000/sampleinterval, unit
    raise ValueError(f'could not retrieve sampling rate from {stimuli_file}')
def relacs_header(filepath, store_empty=False, first_only=False,
                  lower_keys=False, flat=False,
                  add_sections=False):
    """Read key-value pairs from a relacs *.dat file header.

    Parameters
    ----------
    filepath: str
        A relacs *.dat file, can be also a zipped .gz file.
    store_empty: bool
        If `False` do not add meta data with empty values.
    first_only: bool
        If `False` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with sections names separated by
        '.' to make them unique.

    Returns
    -------
    data: dict
        Nested dictionary with key-value pairs of the file header.

    Raises
    ------
    IOError/FileNotFoundError:
        If `filepath` cannot be opened.
    """
    # read in header from file:
    lines = []
    if os.path.isfile(filepath + '.gz'):
        filepath += '.gz'
    # BUGFIX: gzip.open() with mode 'r' opens in binary mode and rejects
    # the encoding argument - text mode 'rt' is required (and also works
    # for the plain open() used otherwise):
    open_file = gzip.open if filepath[-3:] == '.gz' else open
    with open_file(filepath, 'rt', encoding='latin-1') as sf:
        # collect the header comment lines only:
        for line in sf:
            line = line.strip()
            if len(line) == 0 or line[0] != '#':
                break
            lines.append(line)
    # parse:
    data = {}
    cdatas = [data]        # stack of nested dictionaries
    sections = ['']        # stack of section names
    ident_offs = None      # indentation of the top level
    ident = None           # indentation increment per section level
    for line in lines:
        words = line.split(':')
        value = ':'.join(words[1:]).strip() if len(words) > 1 else ''
        if len(words) >= 1:
            key = words[0].strip('#')
            # get section level from the key's indentation:
            level = 0
            if not flat or len(value) == 0:
                nident = len(key) - len(key.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
                # close sections deeper than the current level:
                if not flat:
                    while len(cdatas) > level + 1:
                        cdatas[-1][sections.pop()] = cdatas.pop()
                else:
                    while len(sections) > level + 1:
                        sections.pop()
            # key:
            key = key.strip().strip('"')
            if lower_keys:
                key = key.lower()
            skey = key
            if add_sections:
                key = '.'.join(sections[1:] + [key])
            if len(value) == 0:
                # new sub-section:
                if flat:
                    if store_empty:
                        cdatas[-1][key] = None
                else:
                    cdatas.append({})
                    sections.append(skey)
            else:
                # key-value pair:
                value = value.strip('"')
                # NOTE(review): this condition is always True (an empty
                # string already differs from '-'); presumably
                # `len(value) > 0 and value != '-'` was intended -
                # kept as is to preserve behavior:
                if len(value) > 0 or value != '-' or store_empty:
                    if len(value) > 0 and value[0] == '[' and value[-1] == ']':
                        # split bracketed values into a list of strings:
                        value = [v.strip() for v in value.lstrip('[').rstrip(']').split(',')]
                        if first_only:
                            value = value[0]
                    cdatas[-1][key] = value
    # close all remaining open sections:
    while len(cdatas) > 1:
        cdatas[-1][sections.pop()] = cdatas.pop()
    return data
def check_relacs(file_path):
    """Check for valid relacs file.

    Parameters
    ----------
    file_path: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    is_relacs: boolean
        `True` if `file_path` is a valid relacs directory or is a file therein.
    """
    # resolve the relacs data directory:
    data_dir = file_path if os.path.isdir(file_path) \
        else os.path.dirname(file_path)
    # a valid relacs directory contains a stimuli.dat and a trace-1.raw
    # file, either plain or gzipped:
    has_stimuli = any(os.path.isfile(os.path.join(data_dir, name))
                      for name in ('stimuli.dat', 'stimuli.dat.gz'))
    has_trace = any(os.path.isfile(os.path.join(data_dir, name))
                    for name in ('trace-1.raw', 'trace-1.raw.gz'))
    return has_stimuli and has_trace
def relacs_trace_files(file_path):
    """Expand file path for relacs data to appropriate trace*.raw file names.

    Parameters
    ----------
    file_path: str
        Path to a relacs data directory, or a file in a relacs data directory.

    Returns
    -------
    trace_file_paths: list of str
        List of relacs trace*.raw files.
    """
    data_dir = file_path if os.path.isdir(file_path) \
        else os.path.dirname(file_path)
    paths = []
    # collect consecutively numbered trace files, plain or gzipped,
    # and stop at the first gap:
    for k in range(1, 10001):
        base = os.path.join(data_dir, f'trace-{k}.raw')
        if os.path.isfile(base):
            paths.append(base)
        elif os.path.isfile(base + '.gz'):
            paths.append(base + '.gz')
        else:
            break
    return paths
def load_relacs(file_path, amax=1.0):
    """Load traces that have been recorded with relacs (https://github.com/relacs/relacs).

    Parameters
    ----------
    file_path: str
        Path to a relacs data directory, or a file in a relacs data directory.
    amax: float
        The amplitude range of the data.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz
    unit: str
        Unit of the data
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        - Invalid name for relacs trace-*.raw file.
        - Sampling rates of traces differ.
        - Unit of traces differ.
    """
    paths = relacs_trace_files(file_path)
    nchannels = len(paths)
    data = None
    nframes = 0
    rate = None
    unit = ''
    for channel, path in enumerate(sorted(paths)):
        # each trace file holds raw little-endian float32 samples:
        if path[-3:] == '.gz':
            with gzip.open(path, 'rb') as sf:
                trace = np.frombuffer(sf.read(), dtype=np.float32)
        else:
            trace = np.fromfile(path, np.float32)
        if data is None:
            # allocate the output on the first trace:
            nframes = len(trace)
            data = np.zeros((nframes, nchannels))
        n = min(len(trace), nframes)
        data[:n, channel] = trace[:n]
        # sampling rate and unit must agree between all traces:
        crate, cunit = relacs_samplerate_unit(path, channel)
        if rate is None:
            rate = crate
        elif crate != rate:
            raise ValueError('sampling rates of traces differ')
        if len(unit) == 0:
            unit = cunit
        elif cunit != unit:
            raise ValueError('unit of traces differ')
    return data, rate, unit, amax
def metadata_relacs(file_path, store_empty=False, first_only=False,
                    lower_keys=False, flat=False, add_sections=False):
    """ Read meta-data of a relacs data set.

    Parameters
    ----------
    file_path: str
        A relacs data directory or a file therein.
    store_empty: bool
        If `False` do not add meta data with empty values.
    first_only: bool
        If `False` only store the first element of a list.
    lower_keys: bool
        Make all keys lower case.
    flat: bool
        Do not make a nested dictionary.
        Use this option also to read in very old relacs metadata with
        ragged left alignment.
    add_sections: bool
        If `True`, prepend keys with sections names separated by
        '.' to make them unique.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
        Empty if no info.dat file is found.
    """
    relacs_dir = file_path
    if not os.path.isdir(file_path):
        relacs_dir = os.path.dirname(file_path)
    info_path = os.path.join(relacs_dir, 'info.dat')
    if not os.path.exists(info_path):
        # BUGFIX: previously returned a tuple `dict(), []`, inconsistent
        # with the documented dict return of the normal path below:
        return {}
    data = relacs_header(info_path, store_empty, first_only,
                         lower_keys, flat, add_sections)
    return data
def fishgrid_spacings(metadata, unit='m'):
    """Spacing between grid electrodes.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.
    unit: str
        Unit in which to return the spacings.

    Returns
    -------
    grid_dist: list of tuple of float
        For each grid the distances between rows and columns in `unit`.
    """
    spacings = []
    # fishgrid supports up to four grids:
    for grid in range(1, 5):
        row_dist = get_number(metadata, unit, f'RowDistance{grid}', default=0)
        col_dist = get_number(metadata, unit, f'ColumnDistance{grid}', default=0)
        nrows = get_int(metadata, f'Rows{grid}', default=0)
        ncols = get_int(metadata, f'Columns{grid}', default=0)
        # a grid counts if it is marked used or has a non-empty layout:
        if get_bool(metadata, f'Used{grid}', default=False) or \
                (ncols > 0 and nrows > 0):
            spacings.append((row_dist, col_dist))
    return spacings
def fishgrid_grids(metadata):
    """Retrieve grid sizes from a fishgrid.cfg file.

    Parameters
    ----------
    metadata: dict
        Fishgrid metadata obtained from `metadata_fishgrid()`.

    Returns
    -------
    grids: list of tuple of int
        For each grid the number of rows and columns.
    """
    grids = []
    # fishgrid supports up to four grids:
    for grid in range(1, 5):
        nrows = get_int(metadata, f'Rows{grid}', default=0)
        ncols = get_int(metadata, f'Columns{grid}', default=0)
        # a grid counts if it is marked used or has a non-empty layout:
        if get_bool(metadata, f'Used{grid}', default=False) or \
                (ncols > 0 and nrows > 0):
            grids.append((nrows, ncols))
    return grids
def check_fishgrid(file_path):
    """Check for valid fishgrid file (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    file_path: str
        Path to a fishgrid data directory or a file in a fishgrid
        data directory.

    Returns
    -------
    is_fishgrid: bool
        `True` if `file_path` is a valid fishgrid data directory or
        a file therein.
    """
    # resolve the fishgrid data directory:
    data_dir = file_path if os.path.isdir(file_path) \
        else os.path.dirname(file_path)
    # a valid fishgrid directory has a config file ...
    if not os.path.isfile(os.path.join(data_dir, 'fishgrid.cfg')):
        return False
    # ... and at least one raw trace file:
    return (os.path.isfile(os.path.join(data_dir, 'traces-grid1.raw'))
            or os.path.isfile(os.path.join(data_dir, 'traces.raw')))
def fishgrid_trace_files(file_path):
    """Expand file paths for fishgrid data to appropriate traces*.raw file names.

    Parameters
    ----------
    file_path: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    trace_file_paths: list of str
        List of fishgrid traces*.raw files.
    """
    # resolve the fishgrid data directory:
    data_dir = file_path if os.path.isdir(file_path) \
        else os.path.dirname(file_path)
    # collect consecutively numbered per-grid trace files:
    paths = []
    for grid in range(1, 10001):
        fname = os.path.join(data_dir, f'traces-grid{grid}.raw')
        if not os.path.isfile(fname):
            break
        paths.append(fname)
    # fall back to a single traces.raw file:
    if not paths:
        fname = os.path.join(data_dir, 'traces.raw')
        if os.path.isfile(fname):
            paths.append(fname)
    return paths
def load_fishgrid(file_path):
    """Load traces that have been recorded with fishgrid (https://github.com/bendalab/fishgrid).

    Parameters
    ----------
    file_path: str
        Path to a fishgrid data directory, or a file therein.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    FileNotFoundError:
        Invalid or not existing fishgrid files.
    """
    paths = fishgrid_trace_files(file_path)
    if len(paths) == 0:
        raise FileNotFoundError(f'no fishgrid files specified')
    md = metadata_fishgrid(file_path)
    # number of channels per grid from the metadata:
    grid_sizes = [rows*cols for rows, cols in fishgrid_grids(md)]
    grid_channels = [grid_sizes[g] for g in range(len(paths))]
    nchannels = sum(grid_channels)

    # load traces-grid*.raw files:
    data = None
    nframes = 0
    offs = 0
    rate = get_number(md, 'Hz', 'AISampleRate')
    for path, nchan in zip(paths, grid_channels):
        # each file holds multiplexed float32 samples of one grid:
        traces = np.fromfile(path, np.float32).reshape((-1, nchan))
        if data is None:
            # allocate the output on the first grid:
            nframes = len(traces)
            data = np.zeros((nframes, nchannels))
        n = min(len(traces), nframes)
        data[:n, offs:offs + nchan] = traces[:n, :]
        offs += nchan
    amax, unit = get_number_unit(md, 'AIMaxVolt')
    return data, rate, unit, amax
# add fishgrid keys:
# Make audioio's metadata helpers aware of fishgrid-specific keys:
# the recording start time is stored under 'StartDate'/'StartTime',
# and the gain under 'AIMaxVolt' (inserted first so it takes
# precedence over audioio's default gain keys).
default_starttime_keys.append(['StartDate', 'StartTime'])
default_gain_keys.insert(0, 'AIMaxVolt')
def metadata_fishgrid(file_path):
    """ Read meta-data of a fishgrid data set.

    Parses the fishgrid.cfg file. Both the old '----'-section style and
    the newer '*'-section style are supported.

    Parameters
    ----------
    file_path: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    data: nested dict
        Nested dictionary with key-value pairs of the meta data.
        Empty if no fishgrid.cfg file is found.
    """
    fishgrid_dir = file_path
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(file_path)
    path = os.path.join(fishgrid_dir, 'fishgrid.cfg')
    # read in header from file:
    lines = []
    if os.path.isfile(path + '.gz'):
        path += '.gz'
    if not os.path.exists(path):
        return {}
    # BUGFIX: gzip.open() with mode 'r' opens in binary mode and rejects
    # the encoding argument - text mode 'rt' is required (and also works
    # for the plain open() used otherwise):
    open_file = gzip.open if path[-3:] == '.gz' else open
    with open_file(path, 'rt', encoding='latin-1') as sf:
        for line in sf:
            lines.append(line)
    # parse:
    data = {}
    cdatas = [data]        # stack of nested dictionaries
    ident_offs = None      # indentation of the top section level
    ident = None           # indentation increment per section level
    old_style = False      # file uses the old '----' section headers
    grid_n = False         # grid number of the current old-style section
    for line in lines:
        if len(line.strip()) == 0:
            continue
        if line[0] == '*':
            # new-style top-level section:
            key = line[1:].strip()
            data[key] = {}
            cdatas = [data, data[key]]
        elif '----' in line:
            # old-style section header:
            old_style = True
            key = line.strip().strip(' -').replace('&', '')
            if key.upper() == 'SETUP':
                key = 'Grid 1'
            grid_n = False
            if key[:4].lower() == 'grid':
                grid_n = key[5]
            cdatas = cdatas[:2]
            cdatas[1][key] = {}
            cdatas.append(cdatas[1][key])
        else:
            words = line.split(':')
            key = words[0].strip().strip('"')
            value = None
            if len(words) > 1 and (len(words[1].strip()) > 0 or old_style):
                value = ':'.join(words[1:]).strip().strip('"')
            if old_style:
                if value is None:
                    # new subsection:
                    cdatas = cdatas[:3]
                    cdatas[2][key] = {}
                    cdatas.append(cdatas[2][key])
                else:
                    # key-value pair; make keys unique across grids:
                    if grid_n and key[-1] != grid_n:
                        key = key + grid_n
                    cdatas[-1][key] = value
            else:
                # get section level from indentation:
                level = 0
                nident = len(line) - len(line.lstrip())
                if ident_offs is None:
                    ident_offs = nident
                elif ident is None:
                    if nident > ident_offs:
                        ident = nident - ident_offs
                        level = 1
                else:
                    level = (nident - ident_offs)//ident
                # close sections deeper than the current level:
                cdatas = cdatas[:2 + level]
                if value is None:
                    # new section:
                    cdatas[-1][key] = {}
                    cdatas.append(cdatas[-1][key])
                else:
                    # key-value pair:
                    cdatas[-1][key] = value.replace(r'\n', '\n')
    # remove unused grids:
    fgm = data.get('FishGrid', {})
    for i in range(4):
        gs = f'Grid {i+1}'
        if gs in fgm:
            gm = fgm[gs]
            us = f'Used{i+1}'
            if us in gm and gm[us].upper() == 'FALSE':
                del fgm[gs]
    return data
def markers_fishgrid(file_path):
    """ Read markers of a fishgrid data set.

    Markers are read from the timestamps.dat file next to the
    fishgrid.cfg file.

    Parameters
    ----------
    file_path: str
        A fishgrid data directory or a file therein.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    def add_marker():
        # Append the currently collected marker to locs/labels.
        # Sample indices in timestamps.dat count multiplexed samples,
        # so divide by the number of channels to get frame indices:
        if 'index1' in marker:
            index1 = int(marker['index1'])//nchannels
        else:
            index1 = int(marker['index'])//nchannels
        span1 = int(marker.get('span1', 0))//nchannels
        locs.append([index1, span1])
        ls = marker.get('label', 'M')
        cs = marker.get('comment', '')
        labels.append([ls, cs])

    fishgrid_dir = file_path
    if not os.path.isdir(fishgrid_dir):
        fishgrid_dir = os.path.dirname(file_path)
    path = os.path.join(fishgrid_dir, 'timestamps.dat')
    if not os.path.isfile(path):
        # no timestamps file: return empty marker arrays:
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
    # get number of channels:
    md = metadata_fishgrid(path.replace('timestamps.dat', 'fishgrid.cfg'))
    grids = fishgrid_grids(md)
    nchannels = np.prod(grids[0])
    # read timestamps:
    locs = []
    labels = []
    marker = {}
    with open(path, 'r') as sf:
        for line in sf:
            if len(line.strip()) == 0:
                # blank line terminates a marker entry:
                add_marker()
                marker = {}
            else:
                # collect "key: value" pairs of the current marker:
                words = line.split(':')
                if len(words) > 1:
                    v = words[1].strip()
                    v = v.strip('"')
                    marker[words[0].strip().lower()] = v
    # flush a final marker that was not followed by a blank line:
    if len(marker) > 0:
        add_marker()
    if len(locs) > 2:
        # drop the first and last entry - presumably the recording
        # start/stop timestamps (TODO: confirm against fishgrid docs):
        return np.array(locs[1:-1]), np.array(labels[1:-1])
    else:
        return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
def check_container(filepath):
    """Check if file is a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_container: bool
        `True`, if `filepath` is a supported container format.
    """
    # compare the file extension case-insensitively:
    _, ext = os.path.splitext(filepath)
    return ext.lower() in ('.pkl', '.npz', '.mat')
def extract_container_data(data_dict, datakey=None,
                           samplekey=['rate', 'Fs', 'fs'],
                           timekey=['time'], amplkey=['amax'], unitkey='unit',
                           amax=1.0, unit='a.u.'):
    """Extract data from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is an 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str or list of str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
    amax: None or float
        If specified and no amplitude range has been found in `data_dict`,
        then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in `data_dict`,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range in `unit`.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    def as_keys(key):
        # normalize a single key to a sequence of keys:
        return key if isinstance(key, (list, tuple, np.ndarray)) else (key,)

    samplekey = as_keys(samplekey)
    timekey = as_keys(timekey)
    amplkey = as_keys(amplkey)
    # sampling rate, directly or from sampling times:
    rate = 0.0
    for skey in samplekey:
        if skey in data_dict:
            rate = float(data_dict[skey])
            break
    if rate == 0.0:
        for tkey in timekey:
            if tkey in data_dict:
                rate = 1.0/(data_dict[tkey][1] - data_dict[tkey][0])
                break
    if rate == 0.0:
        raise ValueError(f"invalid keys {', '.join(samplekey)} and {', '.join(timekey)} for requesting sampling rate or sampling times")
    # amplitude range and unit:
    for akey in amplkey:
        if akey in data_dict:
            amax = float(data_dict[akey])
            break
    if unitkey in data_dict:
        unit = data_dict[unitkey]
    # get data array:
    raw_data = np.array([])
    if datakey:
        # try the requested data keys:
        for dkey in as_keys(datakey):
            if dkey in data_dict:
                raw_data = data_dict[dkey]
                break
        if len(raw_data) == 0:
            raise ValueError(f"invalid key(s) {', '.join(as_keys(datakey))} for requesting data")
    else:
        # take the 1D/2D array with the largest dimension:
        for dkey in data_dict:
            item = data_dict[dkey]
            if hasattr(item, 'shape') and 1 <= len(item.shape) <= 2 and \
                    np.max(item.shape) > np.max(raw_data.shape):
                raw_data = item
        if len(raw_data) == 0:
            raise ValueError('no data found')
    # make 2D with time along the first dimension:
    if len(raw_data.shape) == 1:
        raw_data = raw_data.reshape(-1, 1)
    if np.argmax(raw_data.shape) > 0:
        raw_data = raw_data.T
    # scale integer data to floats in the range -amax to +amax:
    if raw_data.dtype == np.dtype('int16'):
        data = raw_data.astype('float32')
        data *= amax/2**15
    elif raw_data.dtype == np.dtype('int32'):
        data = raw_data.astype(float)
        data *= amax/2**31
    elif raw_data.dtype == np.dtype('int64'):
        data = raw_data.astype(float)
        data *= amax/2**63
    else:
        data = raw_data
    return data, rate, unit, amax
def load_container(file_path, datakey=None,
                   samplekey=['rate', 'Fs', 'fs'],
                   timekey=['time'], amplkey=['amax'], unitkey='unit',
                   amax=1.0, unit='a.u.'):
    """Load data from a generic container file.

    Supported file formats are:

    - python pickle files (.pkl)
    - numpy files (.npz)
    - matlab files (.mat)

    Parameters
    ----------
    file_path: str
        Path of the file to load.
    datakey: None, str, or list of str
        Name of the variable holding the data. If `None` take the
        variable that is an 2D array and has the largest number of
        elements.
    samplekey: str or list of str
        Name of the variable holding the sampling rate.
    timekey: str or list of str
        Name of the variable holding sampling times.
        If no sampling rate is available, the sampling rate is retrieved
        from the sampling times.
    amplkey: str
        Name of the variable holding the amplitude range of the data.
    unitkey: str
        Name of the variable holding the unit of the data.
        If `unitkey` is not a valid key, then return `unitkey` as the `unit`.
    amax: None or float
        If specified and no amplitude range has been found in the data
        container, then this is the amplitude range of the data.
    unit: None or str
        If specified and no unit has been found in the data container,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        Invalid key requested.
    """
    # read the container into a dictionary of variables:
    ext = os.path.splitext(file_path)[1]
    if ext == '.pkl':
        import pickle
        with open(file_path, 'rb') as f:
            data_dict = pickle.load(f)
    elif ext == '.npz':
        data_dict = np.load(file_path)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(file_path, squeeze_me=True)
    else:
        data_dict = {}
    return extract_container_data(data_dict, datakey, samplekey,
                                  timekey, amplkey, unitkey, amax, unit)
def extract_container_metadata(data_dict, metadatakey=['metadata', 'info']):
    """ Extract metadata from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    if not isinstance(metadatakey, (list, tuple, np.ndarray)):
        metadatakey = (metadatakey,)
    # a single variable holding all the metadata:
    for mkey in metadatakey:
        if mkey in data_dict:
            return data_dict[mkey]
    # otherwise collect flattened keys like 'metadata__section__key':
    flat_md = {}
    for mkey in metadatakey:
        prefix = mkey + '__'
        for dkey in data_dict:
            if dkey[:len(prefix)] == prefix:
                v = data_dict[dkey]
                # unwrap 0-dim numpy scalars:
                if hasattr(v, 'size') and v.ndim == 0:
                    v = v.item()
                flat_md[dkey[len(prefix):]] = v
    if len(flat_md) > 0:
        return unflatten_metadata(flat_md, sep='__')
    return flat_md
def metadata_container(file_path, metadatakey=['metadata', 'info']):
    """ Read meta-data of a container file.

    Parameters
    ----------
    file_path: str
        A container file.
    metadatakey: str or list of str
        Name of the variable holding the metadata.

    Returns
    -------
    metadata: nested dict
        Nested dictionary with key-value pairs of the meta data.
    """
    # read the container into a dictionary of variables:
    ext = os.path.splitext(file_path)[1]
    if ext == '.pkl':
        import pickle
        with open(file_path, 'rb') as f:
            data_dict = pickle.load(f)
    elif ext == '.npz':
        data_dict = np.load(file_path)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(file_path, squeeze_me=True)
    else:
        data_dict = {}
    return extract_container_metadata(data_dict, metadatakey)
def extract_container_markers(data_dict, poskey=['positions'],
                              spanskey=['spans'], labelskey=['labels'],
                              descrkey=['descriptions']):
    """ Extract markers from dictionary loaded from a container file.

    Parameters
    ----------
    data_dict: dict
        Dictionary of the data items contained in the container.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    def as_keys(key):
        # normalize a single key to a sequence of keys:
        return key if isinstance(key, (list, tuple, np.ndarray)) else (key,)

    # positions define the number of markers:
    locs = np.zeros((0, 2), dtype=int)
    for key in as_keys(poskey):
        if key in data_dict:
            locs = np.zeros((len(data_dict[key]), 2), dtype=int)
            locs[:, 0] = data_dict[key]
            break
    for key in as_keys(spanskey):
        if key in data_dict:
            locs[:, 1] = data_dict[key]
            break
    # labels define the number of label rows:
    labels = np.zeros((0, 2), dtype=object)
    for key in as_keys(labelskey):
        if key in data_dict:
            labels = np.zeros((len(data_dict[key]), 2), dtype=object)
            labels[:, 0] = data_dict[key]
            break
    for key in as_keys(descrkey):
        if key in data_dict:
            labels[:, 1] = data_dict[key]
            break
    return locs, labels
def markers_container(file_path, poskey=['positions'],
                      spanskey=['spans'], labelskey=['labels'],
                      descrkey=['descriptions']):
    """ Read markers of a container file.

    Parameters
    ----------
    file_path: str
        A container file.
    poskey: str or list of str
        Name of the variable holding positions of markers.
    spanskey: str or list of str
        Name of the variable holding spans of markers.
    labelskey: str or list of str
        Name of the variable holding labels of markers.
    descrkey: str or list of str
        Name of the variable holding descriptions of markers.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).
    """
    # read the container into a dictionary of variables:
    ext = os.path.splitext(file_path)[1]
    if ext == '.pkl':
        import pickle
        with open(file_path, 'rb') as f:
            data_dict = pickle.load(f)
    elif ext == '.npz':
        data_dict = np.load(file_path)
    elif ext == '.mat':
        from scipy.io import loadmat
        data_dict = loadmat(file_path, squeeze_me=True)
    else:
        data_dict = {}
    return extract_container_markers(data_dict, poskey, spanskey,
                                     labelskey, descrkey)
def check_raw(filepath):
    """Check if file is a raw file.

    The following extensions are interpreted as raw files:

    - raw files (*.raw)
    - LabView scandata (*.scandat)
    - matlab files (*.mat) are also accepted here as a fallback

    Parameters
    ----------
    filepath: str
        Path of the file to check.

    Returns
    -------
    is_raw: bool
        `True`, if `filepath` is a raw format.
    """
    # case-insensitive comparison of the file extension:
    return os.path.splitext(filepath)[1].lower() in ('.raw', '.scandat', '.mat')
def load_raw(file_path, rate=44000, channels=1, dtype=np.float32,
             amax=1.0, unit='a.u.'):
    """Load data from a raw file.

    Raw files just contain the data and absolutely no metadata, not
    even the sampling rate, number of channels, etc.
    Supported file formats are:

    - raw files (*.raw)
    - LabView scandata (*.scandat)

    Parameters
    ----------
    file_path: str
        Path of the file to load.
    rate: float
        Sampling rate of the data in Hertz.
    channels: int
        Number of channels multiplexed in the data.
    dtype: str or numpy.dtype
        The data type stored in the file.
    amax: float
        The amplitude range of the data.
    unit: str
        The unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.
    """
    # normalize so that both dtype strings and dtype objects work below:
    dtype = np.dtype(dtype)
    raw_data = np.fromfile(file_path, dtype=dtype).reshape(-1, channels)
    # recode integer data to floats in the range -amax to +amax:
    if dtype == np.dtype('int16'):
        data = raw_data.astype('float32')
        data *= amax/2**15
    elif dtype == np.dtype('int32'):
        data = raw_data.astype(float)
        data *= amax/2**31
    elif dtype == np.dtype('int64'):
        data = raw_data.astype(float)
        data *= amax/2**63
    else:
        data = raw_data
    return data, rate, unit, amax
def load_audioio(file_path, verbose=0, gainkey=default_gain_keys, sep='.',
                 amax=1.0, unit='a.u.'):
    """Load data from an audio file.

    See the
    [`load_audio()`](https://bendalab.github.io/audioio/api/audioloader.html#audioio.audioloader.load_audio)
    function of the [`audioio`](https://github.com/bendalab/audioio)
    package for more infos.

    Parameters
    ----------
    file_path: str
        Path of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    gainkey: str or list of str
        Key in the file's metadata that holds some gain information.
        If found, the data will be multiplied with the gain,
        and if available, the corresponding unit is returned.
        See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
    sep: str
        String that separates section names in `gainkey`.
    amax: float
        If specified and no gain has been found in the metadata,
        then use this as the amplitude range.
    unit: str
        If specified and no gain has been found in the metadata,
        then return this as the unit of the data.

    Returns
    -------
    data: 2-D array of floats
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data if found in the metadata (see `gainkey`),
        otherwise `unit`.
    amax: float
        Maximum amplitude of data range.
    """
    # load the raw audio data:
    data, rate = load_audio(file_path, verbose)
    # retrieve gain and unit from the file's metadata:
    meta = metadata_audioio(file_path)
    amax, unit = get_gain(meta, gainkey, sep, amax, unit)
    # scale data only if a non-unity gain applies:
    if amax != 1.0:
        data *= amax
    return data, rate, unit, amax
data_loader_funcs = (
    ('relacs', check_relacs, load_relacs, metadata_relacs, None),
    ('fishgrid', check_fishgrid, load_fishgrid, metadata_fishgrid, markers_fishgrid),
    ('container', check_container, load_container, metadata_container, markers_container),
    ('raw', check_raw, load_raw, None, None),
    ('audioio', None, load_audioio, metadata_audioio, markers_audioio),
    )
"""List of implemented load functions.

Each element is a tuple with five entries: the data format's name, its
check function (or None to match any file), its load function, its
metadata function (or None if not available), and its markers function
(or None if not available). The formats are tried in this order;
'audioio' serves as the final fallback since its check function is None.
"""
def load_data(file_path, verbose=0, **kwargs):
    """Load time-series data from a file.

    Parameters
    ----------
    file_path: str
        Path and name of the file to load.
    verbose: int
        If > 0 show detailed error/warning messages.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.
        For example:
        - `amax`: the amplitude range of the data.
        - `unit`: the unit of the data.

    Returns
    -------
    data: 2-D array
        All data traces as an 2-D numpy array, even for single channel data.
        First dimension is time, second is channel.
    rate: float
        Sampling rate of the data in Hz.
    unit: str
        Unit of the data.
    amax: float
        Maximum amplitude of data range.

    Raises
    ------
    ValueError:
        `file_path` is empty string.
    """
    if len(file_path) == 0:
        raise ValueError('input argument file_path is empty string.')
    # try the formats in order; the first one that matches loads the data:
    for fmt, check_fn, load_fn, _, _ in data_loader_funcs:
        if check_fn is not None and not check_fn(file_path):
            continue
        data, rate, unit, amax = load_fn(file_path, **kwargs)
        if verbose > 0:
            print(f'loaded {fmt} data from file "{file_path}"')
            if verbose > 1:
                print(f'  sampling rate: {rate:g} Hz')
                print(f'  channels     : {data.shape[1]}')
                print(f'  frames       : {len(data)}')
                print(f'  range        : {amax:g}{unit}')
        return data, rate, unit, amax
    # unreachable in practice, since the audioio entry matches any file:
    return np.zeros((0, 1)), 0.0, '', 1.0
def metadata(file_path, **kwargs):
    """ Read meta-data from a data file.

    Parameters
    ----------
    file_path: str
        The full path and name of the file to load. For some file
        formats several files can be provided in a list.
    **kwargs: dict
        Further keyword arguments that are passed on to the
        format specific loading functions.

    Returns
    -------
    meta_data: nested dict
        Meta data contained in the file. Keys of the nested
        dictionaries are always strings. If the corresponding
        values are dictionaries, then the key is the section name
        of the metadata contained in the dictionary. All other
        types of values are values for the respective key. In
        particular they are strings, or list of strings. But other
        simple types like ints or floats are also allowed.

    Raises
    ------
    ValueError:
        `file_path` is empty string.
    """
    if len(file_path) == 0:
        raise ValueError('input argument file_path is empty string.')
    # the first matching format that provides a metadata function wins;
    # formats without a metadata function are skipped even if they match:
    for _, check_fn, _, metadata_fn, _ in data_loader_funcs:
        if metadata_fn is None:
            continue
        if check_fn is None or check_fn(file_path):
            return metadata_fn(file_path, **kwargs)
    return {}
def markers(file_path):
    """ Read markers of a data file.

    Parameters
    ----------
    file_path: str or file handle
        The data file.

    Returns
    -------
    locs: 2-D array of ints
        Marker positions (first column) and spans (second column)
        for each marker (rows).
    labels: 2-D array of string objects
        Labels (first column) and texts (second column)
        for each marker (rows).

    Raises
    ------
    ValueError:
        `file_path` is empty string.
    """
    if len(file_path) == 0:
        raise ValueError('input argument file_path is empty string.')
    # the first matching format that provides a markers function wins;
    # formats without a markers function are skipped even if they match:
    for _, check_fn, _, _, markers_fn in data_loader_funcs:
        if markers_fn is None:
            continue
        if check_fn is None or check_fn(file_path):
            return markers_fn(file_path)
    # no markers available:
    return np.zeros((0, 2), dtype=int), np.zeros((0, 2), dtype=object)
class DataLoader(AudioLoader):
    """Buffered reading of time-series data for random access of the data in the file.

    This allows for reading very large data files that do not fit into
    memory. A `DataLoader` instance can be used like a huge
    read-only numpy array, i.e.
    ```
    data = DataLoader('path/to/data/file.dat')
    x = data[10000:20000,0]
    ```
    The first index specifies the frame, the second one the channel.

    `DataLoader` first determines the format of the data file and then
    opens the file (first line). It then reads data from the file as
    necessary for the requested data (second line).

    Supported file formats are

    - audio files via `audioio` package
    - python pickle files
    - numpy .npz files
    - matlab .mat files
    - relacs trace*.raw files (www.relacs.net)
    - fishgrid traces-*.raw files

    Reading sequentially through the file is always possible. If
    previous data are requested, then the file is read from the
    beginning. This might slow down access to previous data
    considerably. Use the `backsize` argument to the open functions to
    make sure some data are loaded before the requested frame. Then a
    subsequent access to the data within `backsize` seconds before that
    frame can still be handled without the need to reread the file
    from the beginning.

    Usage:
    ------
    ```
    import thunderlab.dataloader as dl
    with dl.DataLoader(file_path, 60.0, 10.0) as data:
        # do something with the content of the file:
        x = data[0:10000,0]
        y = data[10000:20000,0]
        z = x + y
    ```

    Normal open and close:
    ```
    data = dl.DataLoader(file_path, 60.0)
    x = data[:,:] # read the whole file
    data.close()
    ```
    that is the same as:
    ```
    data = dl.DataLoader()
    data.open(file_path, 60.0)
    ```

    Parameters
    ----------
    file_path: str
        Name of the file.
    buffersize: float
        Size of internal buffer in seconds.
    backsize: float
        Part of the buffer to be loaded before the requested start index in seconds.
    verbose: int
        If larger than zero show detailed error/warning messages.
    meta_kwargs: dict
        Keyword arguments that are passed on to the _load_metadata() function.

    Attributes
    ----------
    rate: float
        The sampling rate of the data in Hertz.
    channels: int
        The number of channels that are read in.
    frames: int
        The number of frames in the file.
    format: str or None
        Format of the audio file.
    encoding: str or None
        Encoding/subtype of the audio file.
    shape: tuple
        Number of frames and channels of the data.
    ndim: int
        Number of dimensions: always 2 (frames and channels).
    unit: str
        Unit of the data.
    ampl_min: float
        Minimum amplitude the file format supports.
    ampl_max: float
        Maximum amplitude the file format supports.

    Methods
    -------

    - `len()`: the number of frames
    - `open()`: open a data file.
    - `open_*()`: open a data file of a specific format.
    - `close()`: close the file.
    - `metadata()`: metadata of the file.
    - `markers()`: markers of the file.
    - `set_unwrap()`: Set parameters for unwrapping clipped data.

    """

    def __init__(self, file_path=None, buffersize=10.0, backsize=0.0,
                 verbose=0, **meta_kwargs):
        super().__init__(None, buffersize, backsize,
                         verbose, **meta_kwargs)
        if file_path is not None:
            self.open(file_path, buffersize, backsize, verbose, **meta_kwargs)

    def __getitem__(self, key):
        # delegate buffered array access to AudioLoader:
        return super(DataLoader, self).__getitem__(key)

    def __next__(self):
        # delegate iteration to AudioLoader:
        return super(DataLoader, self).__next__()

    # relacs interface:        
    def open_relacs(self, file_path, buffersize=10.0, backsize=0.0,
                    verbose=0, amax=1.0):
        """Open relacs data files (www.relacs.net) for reading.

        Parameters
        ----------
        file_path: str
            Path to a relacs data directory or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        amax: float
            The amplitude range of the data.

        Raises
        ------
        ValueError: .gz files not supported.
        """
        self.verbose = verbose

        if self.sf is not None:
            self._close_relacs()

        trace_file_paths = relacs_trace_files(file_path)

        # open trace files:
        self.sf = []
        self.frames = None
        self.rate = None
        self.unit = ''
        self.filepath = None
        if len(trace_file_paths) > 0:
            self.filepath = os.path.dirname(trace_file_paths[0])
        for path in sorted(trace_file_paths):
            if path[-3:] == '.gz':
                raise ValueError('.gz files not supported')
            sf = open(path, 'rb')
            self.sf.append(sf)
            if verbose > 0:
                print(f'open_relacs(file_path) with file_path={path}')
            # file size in frames (4 bytes per float sample):
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                # traces may differ by a few frames; use the smallest count:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
            # retrieve sampling rate and unit:
            rate, us = relacs_samplerate_unit(path)
            if self.rate is None:
                self.rate = rate
            elif rate != self.rate:
                raise ValueError('sampling rates of traces differ')
            if len(self.unit) == 0:
                self.unit = us
            elif us != self.unit:
                raise ValueError('unit of traces differ')
        self.channels = len(self.sf)
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'RELACS'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        self.close = self._close_relacs
        self.load_audio_buffer = self._load_buffer_relacs
        self.ampl_min = -amax
        self.ampl_max = +amax
        self._load_metadata = self._metadata_relacs
        # TODO: load markers:
        self._locs = np.zeros((0, 2), dtype=int)
        self._labels = np.zeros((0, 2), dtype=object)
        self._load_markers = None
        return self

    def _close_relacs(self):
        """Close the relacs data files.
        """
        if self.sf is not None:
            for file in self.sf:
                file.close()
            self.sf = None

    def _load_buffer_relacs(self, r_offset, r_size, buffer):
        """Load new data from relacs data files.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        # one file per channel, 4 bytes per float sample:
        for i, file in enumerate(self.sf):
            file.seek(r_offset*4)
            data = file.read(r_size*4)
            buffer[:, i] = np.frombuffer(data, dtype=np.float32)

    def _metadata_relacs(self, store_empty=False, first_only=False):
        """ Load meta-data of a relacs data set.
        """
        info_path = os.path.join(self.filepath, 'info.dat')
        if not os.path.exists(info_path):
            return {}
        return relacs_header(info_path, store_empty, first_only)

    # fishgrid interface:
    def open_fishgrid(self, file_path, buffersize=10.0, backsize=0.0,
                      verbose=0):
        """Open fishgrid data files (https://github.com/bendalab/fishgrid) for reading.

        Parameters
        ----------
        file_path: str
            Path to a fishgrid data directory, or a file therein.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        """
        self.verbose = verbose

        if self.sf is not None:
            self._close_fishgrid()

        trace_file_paths = fishgrid_trace_files(file_path)
        self.filepath = None
        if len(trace_file_paths) > 0:
            self.filepath = os.path.dirname(trace_file_paths[0])
        self._load_metadata = metadata_fishgrid
        self._load_markers = markers_fishgrid

        # open grid files:
        grids = fishgrid_grids(self.metadata())
        grid_sizes = [r*c for r,c in grids]
        self.channels = 0
        for g, path in enumerate(trace_file_paths):
            self.channels += grid_sizes[g]
        self.sf = []
        self.grid_channels = []
        self.grid_offs = []
        offs = 0
        self.frames = None
        self.rate = get_number(self.metadata(), 'Hz', 'AISampleRate')
        v, self.unit = get_number_unit(self.metadata(), 'AIMaxVolt')
        if v is not None:
            self.ampl_min = -v
            self.ampl_max = +v

        for g, path in enumerate(trace_file_paths):
            sf = open(path, 'rb')
            self.sf.append(sf)
            if verbose > 0:
                print(f'open_fishgrid(file_path) with file_path={path}')
            # channels of and offset into this grid:
            self.grid_channels.append(grid_sizes[g])
            self.grid_offs.append(offs)
            offs += grid_sizes[g]
            # file size in frames (4 bytes per float sample, all grid
            # channels multiplexed):
            sf.seek(0, os.SEEK_END)
            frames = sf.tell()//4//grid_sizes[g]
            if self.frames is None:
                self.frames = frames
            elif self.frames != frames:
                # grids may differ by a few frames; use the smallest count:
                diff = self.frames - frames
                if diff > 1 or diff < -2:
                    raise ValueError('number of frames of traces differ')
                elif diff >= 0:
                    self.frames = frames
            sf.seek(0)
        self.shape = (self.frames, self.channels)
        self.size = self.frames * self.channels
        self.ndim = len(self.shape)
        self.format = 'FISHGRID'
        self.encoding = 'FLOAT'
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.offset = 0
        self.close = self._close_fishgrid
        self.load_audio_buffer = self._load_buffer_fishgrid
        return self

    def _close_fishgrid(self):
        """Close the fishgrid data files.
        """
        if self.sf is not None:
            for file in self.sf:
                file.close()
            self.sf = None

    def _load_buffer_fishgrid(self, r_offset, r_size, buffer):
        """Load new data from fishgrid data files.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        # one file per grid, grid channels multiplexed within each file:
        for file, gchannels, goffset in zip(self.sf, self.grid_channels, self.grid_offs):
            file.seek(r_offset*4*gchannels)
            data = file.read(r_size*4*gchannels)
            buffer[:, goffset:goffset+gchannels] = np.frombuffer(data, dtype=np.float32).reshape((-1, gchannels))

    # container interface:
    def open_container(self, file_path, buffersize=10.0,
                       backsize=0.0, verbose=0, datakey=None,
                       samplekey=['rate', 'Fs', 'fs'],
                       timekey=['time'], amplkey=['amax'], unitkey='unit',
                       metadatakey=['metadata', 'info'],
                       poskey=['positions'],
                       spanskey=['spans'], labelskey=['labels'],
                       descrkey=['descriptions'],
                       amax=1.0, unit='a.u.'):
        """Open generic container file.

        Supported file formats are:

        - python pickle files (.pkl)
        - numpy files (.npz)
        - matlab files (.mat)

        Parameters
        ----------
        file_path: str
            Path to a container file.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        datakey: None, str, or list of str
            Name of the variable holding the data. If `None` take the
            variable that is an 2D array and has the largest number of
            elements.
        samplekey: str or list of str
            Name of the variable holding the sampling rate.
        timekey: str or list of str
            Name of the variable holding sampling times.
            If no sampling rate is available, the sampling rate is retrieved
            from the sampling times.
        amplkey: str or list of str
            Name of the variable holding the amplitude range of the data.
        unitkey: str
            Name of the variable holding the unit of the data.
        metadatakey: str or list of str
            Name of the variable holding the metadata.
        poskey: str or list of str
            Name of the variable holding positions of markers.
        spanskey: str or list of str
            Name of the variable holding spans of markers.
        labelskey: str or list of str
            Name of the variable holding labels of markers.
        descrkey: str or list of str
            Name of the variable holding descriptions of markers.
        amax: None or float
            If specified and no amplitude range has been found in the data
            container, then this is the amplitude range of the data.
        unit: None or str
            If specified and no unit has been found in the data container,
            then return this as the unit of the data.

        Raises
        ------
        ValueError:
            Invalid key requested.
        """
        self.verbose = verbose
        data_dict = {}
        ext = os.path.splitext(file_path)[1]
        if ext == '.pkl':
            import pickle
            with open(file_path, 'rb') as f:
                data_dict = pickle.load(f)
            self.format = 'PKL'
        elif ext == '.npz':
            data_dict = np.load(file_path)
            self.format = 'NPZ'
        elif ext == '.mat':
            from scipy.io import loadmat
            data_dict = loadmat(file_path, squeeze_me=True)
            self.format = 'MAT'
        # the whole data set is loaded into the buffer at once:
        self.buffer, self.rate, self.unit, amax = \
            extract_container_data(data_dict, datakey, samplekey,
                                   timekey, amplkey, unitkey, amax, unit)
        self.filepath = file_path
        self.channels = self.buffer.shape[1]
        self.frames = self.buffer.shape[0]
        self.shape = self.buffer.shape
        self.ndim = self.buffer.ndim
        self.size = self.buffer.size
        self.encoding = self.numpy_encodings[self.buffer.dtype]
        self.ampl_min = -amax
        self.ampl_max = +amax
        self.offset = 0
        self.buffer_changed = np.zeros(self.channels, dtype=bool)
        self.bufferframes = self.frames
        self.backsize = 0
        self.close = self._close_container
        self.load_audio_buffer = self._load_buffer_container
        self._metadata = extract_container_metadata(data_dict, metadatakey)
        self._load_metadata = None
        self._locs, self._labels = extract_container_markers(data_dict,
                                                             poskey,
                                                             spanskey,
                                                             labelskey,
                                                             descrkey)
        self._load_markers = None

    def _close_container(self):
        """Close container.

        Nothing to do, the data were read in completely by
        open_container().
        """
        pass

    def _load_buffer_container(self, r_offset, r_size, buffer):
        """Load new data from the in-memory container buffer."""
        buffer[:, :] = self.buffer[r_offset:r_offset + r_size, :]

    # raw data interface:
    def open_raw(self, file_path, buffersize=10.0, backsize=0.0,
                 verbose=0, rate=44000, channels=1, dtype=np.float32,
                 amax=1.0, unit='a.u.'):
        """Load data from a raw file.

        Raw files just contain the data and absolutely no metadata, not
        even the sampling rate, number of channels, etc.
        Supported file formats are:

        - raw files (*.raw)
        - LabView scandata (*.scandat)

        Parameters
        ----------
        file_path: str
            Path of the file to load.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        rate: float
            Sampling rate of the data in Hertz.
        channels: int
            Number of channels multiplexed in the data.
        dtype: str or numpy.dtype
            The data type stored in the file.
        amax: float
            The amplitude range of the data.
        unit: str
            The unit of the data.
        """
        self.verbose = verbose
        self.filepath = file_path
        self.sf = open(file_path, 'rb')
        if verbose > 0:
            print(f'open_raw(file_path) with file_path={file_path}')
        self.dtype = np.dtype(dtype)
        self.rate = float(rate)
        self.channels = int(channels)
        # file size in frames: all channels of one time step form one frame
        # (fix: the sample count was previously not divided by channels):
        self.sf.seek(0, os.SEEK_END)
        self.frames = self.sf.tell()//self.dtype.itemsize//self.channels
        self.sf.seek(0)
        self.shape = (self.frames, self.channels)
        self.ndim = len(self.shape)
        self.size = self.frames*self.channels
        self.format = 'RAW'
        self.encoding = self.numpy_encodings.get(self.dtype, 'UNKNOWN')
        self.unit = unit
        self.ampl_max = float(amax)
        self.ampl_min = -self.ampl_max
        self.offset = 0
        self.bufferframes = int(buffersize*self.rate)
        self.backframes = int(backsize*self.rate)
        self.init_buffer()
        self.close = self._close_raw
        self.load_audio_buffer = self._load_buffer_raw
        # raw files have no metadata and no markers:
        self._metadata = None
        self._load_metadata = None
        self._locs = None
        self._labels = None
        self._load_markers = None

    def _close_raw(self):
        """Close raw file. """
        self.sf.close()
        self.sf = None

    def _load_buffer_raw(self, r_offset, r_size, buffer):
        """Load new data from the raw file.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        # channels are multiplexed, so a frame is channels samples
        # (fix: seek and read previously ignored the channel count):
        frame_size = self.channels*self.dtype.itemsize
        self.sf.seek(r_offset*frame_size)
        raw_data = self.sf.read(r_size*frame_size)
        raw_data = np.frombuffer(raw_data, dtype=self.dtype)
        raw_data = raw_data.reshape(-1, self.channels)
        # recode integer data to floats in the range -ampl_max to +ampl_max
        # (fix: this previously referenced an undefined name `amax`):
        if self.dtype == np.dtype('int16'):
            data = raw_data.astype('float32')
            data *= self.ampl_max/2**15
        elif self.dtype == np.dtype('int32'):
            data = raw_data.astype(float)
            data *= self.ampl_max/2**31
        elif self.dtype == np.dtype('int64'):
            data = raw_data.astype(float)
            data *= self.ampl_max/2**63
        else:
            data = raw_data
        buffer[:, :] = data

    # audioio interface:
    def open_audioio(self, file_path, buffersize=10.0, backsize=0.0,
                     verbose=0, gainkey=default_gain_keys, sep='.',
                     amax=None, unit='a.u.'):
        """Open an audio file.

        See the [audioio](https://github.com/bendalab/audioio) package
        for details.

        Parameters
        ----------
        file_path: str
            Path to an audio file.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index
            in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        gainkey: str or list of str
            Key in the file's metadata that holds some gain information.
            If found, the data will be multiplied with the gain,
            and if available, the corresponding unit is returned.
            See the [audioio.get_gain()](https://bendalab.github.io/audioio/api/audiometadata.html#audioio.audiometadata.get_gain) function for details.
        sep: str
            String that separates section names in `gainkey`.
        amax: None or float
            If specified and no gain has been found in the metadata,
            then use this as the amplitude range.
        unit: None or str
            If specified and no gain has been found in the metadata,
            then this is the unit of the data.
        """
        self.verbose = verbose
        super(DataLoader, self).open(file_path, buffersize, backsize, verbose)
        md = self.metadata()
        fac, unit = get_gain(md, gainkey, sep, amax, unit)
        if fac is None:
            self.gain_fac = 1.0
        else:
            self.gain_fac = fac
        # wrap the audio buffer loader so that the gain is applied:
        self._load_buffer_audio_org = self.load_audio_buffer
        self.load_audio_buffer = self._load_buffer_audioio
        self.ampl_min *= self.gain_fac
        self.ampl_max *= self.gain_fac
        self.unit = unit
        return self

    def _load_buffer_audioio(self, r_offset, r_size, buffer):
        """Load and scale new data from an audio file.

        Parameters
        ----------
        r_offset: int
            First frame to be read from file.
        r_size: int
            Number of frames to be read from file.
        buffer: ndarray
            Buffer where to store the loaded data.
        """
        self._load_buffer_audio_org(r_offset, r_size, buffer)
        buffer *= self.gain_fac

    def open(self, file_path, buffersize=10.0, backsize=0.0,
             verbose=0, **kwargs):
        """Open file with time-series data for reading.

        Parameters
        ----------
        file_path: str or list of str
            Path to a data files or directory.
        buffersize: float
            Size of internal buffer in seconds.
        backsize: float
            Part of the buffer to be loaded before the requested start index
            in seconds.
        verbose: int
            If > 0 show detailed error/warning messages.
        **kwargs: dict
            Further keyword arguments that are passed on to the
            format specific opening functions.
            For example:
            - `amax`: the amplitude range of the data.
            - `unit`: the unit of the data.

        Raises
        ------
        ValueError:
            `file_path` is empty string.
        """
        # list of implemented open functions; the last flag controls
        # whether the verbose summary is printed here (audioio prints
        # its own summary):
        data_open_funcs = (
            ('relacs', check_relacs, self.open_relacs, 1),
            ('fishgrid', check_fishgrid, self.open_fishgrid, 1),
            ('container', check_container, self.open_container, 1),
            ('raw', check_raw, self.open_raw, 1),
            ('audioio', None, self.open_audioio, 0),
            )
        if len(file_path) == 0:
            raise ValueError('input argument file_path is empty string.')
        # open data:
        for name, check_file, open_file, v in data_open_funcs:
            if check_file is None or check_file(file_path):
                open_file(file_path, buffersize, backsize, verbose, **kwargs)
                if v*verbose > 1:
                    if self.format is not None:
                        print(f'  format       : {self.format}')
                    if self.encoding is not None:
                        print(f'  encoding     : {self.encoding}')
                    print(f'  sampling rate: {self.rate} Hz')
                    print(f'  channels     : {self.channels}')
                    print(f'  frames       : {self.frames}')
                    # fix: this previously referenced undefined names
                    # `amax` and `unit` instead of the instance attributes:
                    print(f'  range        : {self.ampl_max:g}{self.unit}')
                break
        return self
def demo(file_path, plot=False):
    """Demonstrate the use of `load_data()` and `DataLoader`.

    Parameters
    ----------
    file_path: str
        Path of the data file to load.
    plot: bool
        If True, plot the data loaded by `load_data()`.
    """
    print("try load_data:")
    data, rate, unit, amax = load_data(file_path, verbose=2)
    if plot:
        fig, ax = plt.subplots()
        # convert frame indices to time in seconds:
        time = np.arange(len(data))/rate
        for c in range(data.shape[1]):
            ax.plot(time, data[:,c])
        ax.set_xlabel('Time [s]')
        ax.set_ylabel(f'[{unit}]')
        if amax is not None and np.isfinite(amax):
            ax.set_ylim(-amax, +amax)
        plt.show()
        # NOTE(review): this early return skips the DataLoader demo below
        # whenever plotting was requested, making the `if plot:` branches
        # in the loops below unreachable — confirm this is intended.
        return
    print('')
    print("try DataLoader:")
    with DataLoader(file_path, 2.0, 1.0, 1) as data:
        print('sampling rate: %g' % data.rate)
        print('frames : %d %d' % (len(data), data.shape[0]))
        # read the data in chunks of one second:
        nframes = int(1.0 * data.rate)
        # forward:
        for i in range(0, len(data), nframes):
            print('forward %d-%d' % (i, i + nframes))
            x = data[i:i + nframes, 0]
            if plot:
                fig, ax = plt.subplots()
                ax.plot((i + np.arange(len(x)))/data.rate, x)
                ax.set_xlabel('Time [s]')
                ax.set_ylabel(f'[{data.unit}]')
                plt.show()
        # and backwards:
        for i in reversed(range(0, len(data), nframes)):
            print('backward %d-%d' % (i, i + nframes))
            x = data[i:i + nframes, 0]
            if plot:
                fig, ax = plt.subplots()
                ax.plot((i + np.arange(len(x)))/data.rate, x)
                ax.set_xlabel('Time [s]')
                ax.set_ylabel(f'[{data.unit}]')
                plt.show()
def main(*cargs):
    """Call demo with command line arguments.

    Parameters
    ----------
    cargs: list of str
        Command line arguments as provided by sys.argv[1:]
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Checking thunderlab.dataloader module.')
    parser.add_argument('-p', dest='plot', action='store_true',
                        help='plot loaded data')
    parser.add_argument('file', nargs=1, default='', type=str,
                        help='name of data file')
    ns = parser.parse_args(cargs)
    demo(ns.file[0], ns.plot)
if __name__ == "__main__":
    # run the command line demo on the arguments passed to the script:
    main(*sys.argv[1:])