Module thunderfish.collectfish
Collect data generated by thunderfish in a wavefish and a pulsefish table.
Source code
"""
Collect data generated by thunderfish in a wavefish and a pulsefish table.
"""
import os
import glob
import io
import zipfile
import sys
import argparse
import numpy as np
from thunderlab.configfile import ConfigFile
from thunderlab.tabledata import TableData, add_write_table_config, write_table_args
from .version import __version__, __year__
from .harmonics import add_harmonic_groups_config
from .eodanalysis import wave_similarity, pulse_similarity
from .eodanalysis import load_species_waveforms, add_species_config
from .eodanalysis import wave_quality, wave_quality_args, add_eod_quality_config
from .eodanalysis import pulse_quality, pulse_quality_args
from .eodanalysis import adjust_eodf
from .eodanalysis import parse_filename
def collect_fish(files, simplify_file=False,
meta_data=None, meta_recordings=None, skip_recordings=False,
temp_col=None, q10=1.62, max_fish=0, harmonics=None,
peaks0=None, peaks1=None, cfg=None, verbose=0):
"""Combine all *-wavefish.* and/or *-pulsefish.* files into respective summary tables.
Data from the *-wavespectrum-*.* and the *-pulsepeaks-*.* files can be added
as specified by `harmonics`, `peaks0`, and `peaks1`.
Meta data of the recordings can also be added via `meta_data` and
`meta_recordings`. If `meta_data` contains a column with
temperature, this column can be specified by the `temp_col`
parameter. In this case, an 'T_adjust' and an 'EODf_adjust' column
are inserted into the resulting tables containing the mean
temperature and EOD frequencies adjusted to this temperature,
respectively. For the temperature adjustment of EOD frequency
a Q10 value can be supplied by the `q10` parameter.
Parameters
----------
files: list of strings
Files to be combined.
simplify_file: boolean
Remove initial common directories from input files.
meta_data: TableData or None
Table with additional data for each of the recordings.
The meta data are inserted into the summary table according to
the name of the recording as specified in `meta_recordings`.
meta_recordings: array of strings
For each row in `meta_data` the name of the recording.
        This name is matched against the basename of input `files`.
skip_recordings: bool
        If True, skip recordings that are not found in `meta_recordings`.
temp_col: string or None
A column in `meta_data` with temperatures to which EOD
        frequencies should be adjusted.
q10: float
Q10 value describing temperature dependence of EOD
frequencies. The default of 1.62 is from Dunlap, Smith, Yetka
(2000) Brain Behav Evol, measured for Apteronotus
        leptorhynchus in the lab.
max_fish: int
Maximum number of fish to be taken, if 0 take all.
harmonics: int
        Number of harmonics to be added to the wave-type fish table
(amplitude, relampl, phase). This data is read in from the
corresponding *-wavespectrum-*.* files.
peaks0: int
        Index of the first peak of an EOD pulse to be added to the
pulse-type fish table. This data is read in from the
corresponding *-pulsepeaks-*.* files.
peaks1: int
        Index of the last peak of an EOD pulse to be added to the
pulse-type fish table. This data is read in from the
corresponding *-pulsepeaks-*.* files.
cfg: ConfigFile
Configuration parameter for EOD quality assessment and species
assignment.
verbose: int
Verbose output:
1: print infos on meta data coverage.
2: print additional infos on discarded recordings.
Returns
-------
wave_table: TableData
Summary table for all wave-type fish.
pulse_table: TableData
Summary table for all pulse-type fish.
all_table: TableData
Summary table for all wave-type and pulse-type fish.
"""
def file_iter(files):
""" Iterate over analysis files.
Parameters
----------
files: list of str
Input files.
Yields
------
zf: ZipFile or None
In case an input file is a zip archive, the open archive.
file_path: str
The full path of a single file to be processed.
I.e. a '*-wavefish.*' or '*-pulsefish.*' file.
fish_type: str
Either 'wave' or 'pulse'.
"""
for file_path in files:
_, _, _, _, ftype, _, ext = parse_filename(file_path)
if ext == 'zip':
zf = zipfile.ZipFile(file_path)
file_pathes = sorted(zf.namelist())
for zfile in file_pathes:
_, _, _, _, ftype, _, _ = parse_filename(zfile)
if ftype in ['wavefish', 'pulsefish']:
yield zf, zfile, ftype[:-4]
elif ftype in ['wavefish', 'pulsefish']:
yield None, file_path, ftype[:-4]
else:
continue
def find_recording(recording, meta_recordings):
""" Find row of a recording in meta data.
Parameters
----------
recording: string
Path and base name of a recording.
meta_recordings: list of string
List of meta data recordings where to find `recording`.
"""
if meta_data is not None:
rec = os.path.splitext(os.path.basename(recording))[0]
for i in range(len(meta_recordings)):
# TODO: strip extension!
if rec == meta_recordings[i]:
return i
return -1
    # prepare meta recording names:
meta_recordings_used = None
if meta_recordings is not None:
meta_recordings_used = np.zeros(len(meta_recordings), dtype=bool)
for r in range(len(meta_recordings)):
meta_recordings[r] = os.path.splitext(os.path.basename(meta_recordings[r]))[0]
# prepare adjusted temperatures:
if meta_data is not None and temp_col is not None:
temp_idx = meta_data.index(temp_col)
temp = meta_data[:,temp_idx]
mean_tmp = np.round(np.nanmean(temp)/0.1)*0.1
meta_data.insert(temp_idx+1, 'T_adjust', 'C', '%.1f')
meta_data.append_data_column([mean_tmp]*meta_data.rows(), temp_idx+1)
# prepare species distances:
wave_names, wave_eods, pulse_names, pulse_eods = \
load_species_waveforms(cfg.value('speciesFile'))
wave_max_rms = cfg.value('maximumWaveSpeciesRMS')
pulse_max_rms = cfg.value('maximumPulseSpeciesRMS')
# load data:
wave_table = None
pulse_table = None
all_table = None
file_pathes = []
for zf, file_name, fish_type in file_iter(files):
# file name:
table = None
window_time = None
recording, base_path, channel, start_time, _, _, file_ext = \
parse_filename(file_name)
file_ext = os.extsep + file_ext
file_pathes.append(os.path.normpath(recording).split(os.path.sep))
if verbose > 2:
print('processing %s (%s):' % (file_name, recording))
# find row in meta_data:
mr = -1
if meta_data is not None:
mr = find_recording(recording, meta_recordings)
if mr < 0:
if skip_recordings:
if verbose > 0:
print('skip recording %s: no metadata found' % recording)
continue
elif verbose > 0:
print('no metadata found for recording %s' % recording)
else:
meta_recordings_used[mr] = True
# data:
if zf is not None:
file_name = io.TextIOWrapper(zf.open(file_name, 'r'))
data = TableData(file_name)
if 'twin' in data:
start_time = data[0, 'twin']
window_time = data[0, 'window']
data.remove(['twin', 'window'])
table = wave_table if fish_type == 'wave' else pulse_table
# prepare tables:
if not table:
df = TableData(data)
df.clear_data()
if meta_data is not None:
if data.nsecs > 0:
df.insert_section(0, 'metadata')
for c in range(meta_data.columns()):
df.insert(c, *meta_data.column_head(c))
df.insert(0, ['recording']*data.nsecs + ['file'], '', '%-s')
if window_time is not None:
df.insert(1, 'window', 's', '%.2f')
if start_time is not None:
df.insert(1, 'time', 's', '%.2f')
if channel >= 0:
df.insert(1, 'channel', '', '%d')
if fish_type == 'wave':
if harmonics is not None:
fn = base_path + '-wavespectrum-0' + file_ext
if zf is not None:
fn = io.TextIOWrapper(zf.open(fn, 'r'))
wave_spec = TableData(fn)
if data.nsecs > 0:
df.append_section('harmonics')
for h in range(min(harmonics, wave_spec.rows())+1):
df.append('ampl%d' % h, wave_spec.unit('amplitude'),
wave_spec.format('amplitude'))
if h > 0:
df.append('relampl%d' % h, '%', '%.2f')
df.append('relpower%d' % h, '%', '%.2f')
df.append('phase%d' % h, 'rad', '%.3f')
if len(wave_names) > 0:
if data.nsecs > 0:
df.append_section('species')
for species in wave_names:
df.append(species, '%', '%.0f')
df.append('species', '', '%-s')
else:
if peaks0 is not None:
fn = base_path + '-pulsepeaks-0' + file_ext
if zf is not None:
fn = io.TextIOWrapper(zf.open(fn, 'r'))
pulse_peaks = TableData(fn)
if data.nsecs > 0:
df.append_section('peaks')
for p in range(peaks0, peaks1+1):
if p != 1:
df.append('P%dtime' % p, 'ms', '%.3f')
df.append('P%dampl' % p, pulse_peaks.unit('amplitude'),
pulse_peaks.format('amplitude'))
if p != 1:
df.append('P%drelampl' % p, '%', '%.2f')
df.append('P%dwidth' % p, 'ms', '%.3f')
if len(pulse_names) > 0:
if data.nsecs > 0:
df.append_section('species')
for species in pulse_names:
df.append(species, '%', '%.0f')
df.append('species', '', '%-s')
if fish_type == 'wave':
wave_table = df
table = wave_table
else:
pulse_table = df
table = pulse_table
if not all_table:
df = TableData()
df.append('file', '', '%-s')
if channel >= 0:
df.append('channel', '', '%d')
if start_time is not None:
df.append('time', 's', '%.1f')
if window_time is not None:
df.append('window', 's', '%.1f')
if meta_data is not None:
for c in range(meta_data.columns()):
df.append(*meta_data.column_head(c))
df.append('index', '', '%d')
df.append('EODf', 'Hz', '%.1f')
df.append('type', '', '%-5s')
if len(wave_names) + len(pulse_names) > 0:
df.append('species', '', '%-s')
all_table = df
# fill tables:
n = data.rows() if not max_fish or max_fish > data.rows() else max_fish
for r in range(n):
# fish index:
idx = r
if 'index' in data:
idx = data[r,'index']
# check quality:
skips = ''
if fish_type == 'wave':
fn = base_path + '-wavespectrum-%d'%idx + file_ext
if zf is not None:
fn = io.TextIOWrapper(zf.open(fn, 'r'))
wave_spec = TableData(fn)
if cfg is not None:
spec_data = wave_spec.array()
props = data.row_dict(r)
if 'clipped' in props:
props['clipped'] *= 0.01
if 'noise' in props:
props['noise'] *= 0.01
if 'rmserror' in props:
props['rmserror'] *= 0.01
if 'thd' in props:
props['thd'] *= 0.01
_, skips, msg = wave_quality(props, 0.01*spec_data[1:,3],
**wave_quality_args(cfg))
else:
if cfg is not None:
props = data.row_dict(r)
if 'clipped' in props:
props['clipped'] *= 0.01
if 'noise' in props:
props['noise'] *= 0.01
skips, msg, _ = pulse_quality(props, **pulse_quality_args(cfg))
if len(skips) > 0:
if verbose > 1:
print('skip fish %2d from %s: %s' % (idx, recording, skips))
continue
# fill in data:
data_col = 0
table.append_data(recording, data_col)
all_table.append_data(recording, data_col)
data_col += 1
if channel >= 0:
table.append_data(channel, data_col)
all_table.append_data(channel, data_col)
data_col += 1
if start_time is not None:
table.append_data(start_time, data_col)
all_table.append_data(start_time, data_col)
data_col += 1
if window_time is not None:
table.append_data(window_time, data_col)
all_table.append_data(window_time, data_col)
data_col += 1
# meta data:
if mr >= 0:
for c in range(meta_data.columns()):
table.append_data(meta_data[mr,c], data_col)
all_table.append_data(meta_data[mr,c], data_col)
data_col += 1
elif meta_data is not None:
data_col += meta_data.columns()
table.append_data(data[r,:].array(), data_col)
eodf = data[r,'EODf']
all_table.append_data(data[r,'index'], data_col)
all_table.append_data(eodf)
all_table.append_data(fish_type)
species_name = 'unknown'
species_rms = 1.0e12
if fish_type == 'wave':
if harmonics is not None:
for h in range(min(harmonics, wave_spec.rows())+1):
table.append_data(wave_spec[h,'amplitude'])
if h > 0:
table.append_data(wave_spec[h,'relampl'])
table.append_data(wave_spec[h,'relpower'])
table.append_data(wave_spec[h,'phase'])
if len(wave_names) > 0:
fn = base_path + '-eodwaveform-%d'%idx + file_ext
if zf is not None:
fn = io.TextIOWrapper(zf.open(fn, 'r'))
wave_eod = TableData(fn).array()
wave_eod[:,0] *= 0.001
for species, eod in zip(wave_names, wave_eods):
rms = wave_similarity(eod, wave_eod, 1.0, eodf)
if rms < species_rms and rms < wave_max_rms:
species_name = species
species_rms = rms
table.append_data(100.0*rms)
table.append_data(species_name)
else:
if peaks0 is not None:
fn = base_path + '-pulsepeaks-%d'%idx + file_ext
if zf is not None:
fn = io.TextIOWrapper(zf.open(fn, 'r'))
pulse_peaks = TableData(fn)
for p in range(peaks0, peaks1+1):
for pr in range(pulse_peaks.rows()):
if pulse_peaks[pr,'P'] == p:
break
else:
continue
if p != 1:
table.append_data(pulse_peaks[pr,'time'], 'P%dtime' % p)
table.append_data(pulse_peaks[pr,'amplitude'], 'P%dampl' % p)
if p != 1:
table.append_data(pulse_peaks[pr,'relampl'], 'P%drelampl' % p)
table.append_data(pulse_peaks[pr,'width'], 'P%dwidth' % p)
if len(pulse_names) > 0:
fn = base_path + '-eodwaveform-%d'%idx + file_ext
if zf is not None:
fn = io.TextIOWrapper(zf.open(fn, 'r'))
pulse_eod = TableData(fn).array()
pulse_eod[:,0] *= 0.001
for species, eod in zip(pulse_names, pulse_eods):
rms = pulse_similarity(eod, pulse_eod)
if rms < species_rms and rms < pulse_max_rms:
species_name = species
species_rms = rms
table.append_data(100.0*rms)
table.append_data(species_name)
#if len(wave_names) + len(pulse_names) > 0:
# all_table.append_data(species_name)
table.fill_data()
all_table.fill_data()
# check coverage of meta data:
if meta_recordings_used is not None:
if np.all(meta_recordings_used):
if verbose > 0:
print('found recordings for all meta data')
else:
if verbose > 0:
print('no recordings found for:')
for mr in range(len(meta_recordings)):
recording = meta_recordings[mr]
if not meta_recordings_used[mr]:
if verbose > 0:
print(recording)
all_table.set_column(0)
all_table.append_data(recording)
for c in range(meta_data.columns()):
all_table.append_data(meta_data[mr,c])
all_table.append_data(np.nan) # index
all_table.append_data(np.nan) # EODf
all_table.append_data('none') # type
# adjust EODf to mean temperature:
for table in [wave_table, pulse_table, all_table]:
if table is not None and temp_col is not None:
eodf_idx = table.index('EODf')
table.insert(eodf_idx+1, 'EODf_adjust', 'Hz', '%.1f')
table.fill_data()
temp_idx = table.index(temp_col)
tadjust_idx = table.index('T_adjust')
for r in range(table.rows()):
eodf = table[r,eodf_idx]
if np.isfinite(table[r,temp_col]) and np.isfinite(table[r,tadjust_idx]):
eodf = adjust_eodf(eodf, table[r,temp_col], table[r,tadjust_idx], q10)
table[r,eodf_idx+1] = eodf
# add wavefish species (experimental):
    # simplify paths:
if simplify_file and len(file_pathes) > 1:
fp0 = file_pathes[0]
for fi in range(len(fp0)):
is_same = True
for fp in file_pathes[1:]:
if fi >= len(fp) or fp[fi] != fp0[fi]:
is_same = False
break
if not is_same:
break
for table in [wave_table, pulse_table, all_table]:
if table is not None:
for k in range(table.rows()):
idx = table.index('file')
fps = os.path.normpath(table[k,idx]).split(os.path.sep)
table[k,idx] = os.path.sep.join(fps[fi:])
return wave_table, pulse_table, all_table
def rangestr(string):
"""
    Parse a string of the form N:M.
"""
if string[0] == '=':
string = '-' + string[1:]
ss = string.split(':')
v0 = v1 = None
if len(ss) == 1:
v0 = int(string)
v1 = v0
else:
v0 = int(ss[0])
v1 = int(ss[1])
return (v0, v1)
def main(cargs=None):
# command line arguments:
if cargs is None:
cargs = sys.argv[1:]
parser = argparse.ArgumentParser(add_help=True,
description='Collect data generated by thunderfish in a wavefish and a pulsefish table.',
epilog='version %s by Benda-Lab (2019-%s)' % (__version__, __year__))
parser.add_argument('--version', action='version', version=__version__)
parser.add_argument('-v', action='count', dest='verbose', default=0,
help='verbosity level: -v for meta data coverage, -vv for additional info on discarded recordings.')
parser.add_argument('-t', dest='table_type', default=None, choices=['wave', 'pulse'],
help='wave-type or pulse-type fish')
parser.add_argument('-c', dest='simplify_file', action='store_true',
help='remove initial common directories from input files')
parser.add_argument('-m', dest='max_fish', type=int, metavar='N',
help='maximum number of fish to be taken from each recording')
parser.add_argument('-p', dest='pulse_peaks', type=rangestr,
default=(0, 1), metavar='N:M',
help='add properties of peak N to M of pulse-type EODs to the table')
parser.add_argument('-w', dest='harmonics', type=int, default=3, metavar='N',
help='add properties of first N harmonics of wave-type EODs to the table')
parser.add_argument('-r', dest='remove_cols', action='append', default=[], metavar='COLUMN',
help='columns to be removed from output table')
parser.add_argument('-s', dest='statistics', action='store_true',
help='also write table with statistics')
parser.add_argument('-i', dest='meta_file', metavar='FILE:REC:TEMP', default='', type=str,
help='insert rows from metadata table in FILE matching recording in colum REC. The optional TEMP specifies a column with temperatures to which EOD frequencies should be adjusted')
parser.add_argument('-q', dest='q10', metavar='Q10', default=1.62, type=float,
help='Q10 value for adjusting EOD frequencies to a common temperature')
parser.add_argument('-S', dest='skip', action='store_true',
help='skip recordings that are not contained in metadata table')
parser.add_argument('-n', dest='file_suffix', metavar='NAME', default='', type=str,
help='name for summary files that is appended to "wavefish" or "pulsefish"')
parser.add_argument('-o', dest='out_path', metavar='PATH', default='.', type=str,
help='path where to store summary tables')
parser.add_argument('-f', dest='format', default='auto', type=str,
choices=TableData.formats + ['same'],
help='file format used for saving summary tables ("same" uses same format as input files)')
parser.add_argument('file', nargs='+', default='', type=str,
help='a *-wavefish.* or *-pulsefish.* file as generated by thunderfish')
# fix minus sign issue:
ca = []
pa = False
for a in cargs:
if pa and a[0] == '-':
a = '=' + a[1:]
pa = False
if a == '-p':
pa = True
ca.append(a)
# read in command line arguments:
args = parser.parse_args(ca)
verbose = args.verbose
table_type = args.table_type
remove_cols = args.remove_cols
statistics = args.statistics
meta_file = args.meta_file
file_suffix = args.file_suffix
out_path = args.out_path
data_format = args.format
# expand wildcard patterns:
files = []
if os.name == 'nt':
for fn in args.file:
files.extend(glob.glob(fn))
else:
files = args.file
# read configuration:
cfgfile = __package__ + '.cfg'
cfg = ConfigFile()
add_harmonic_groups_config(cfg)
add_eod_quality_config(cfg)
add_species_config(cfg)
add_write_table_config(cfg, table_format='csv', unit_style='row',
align_columns=True, shrink_width=False)
cfg.load_files(cfgfile, files[0], 3)
# output format:
if data_format == 'same':
ext = os.path.splitext(files[0])[1][1:]
if ext in TableData.ext_formats:
data_format = TableData.ext_formats[ext]
else:
data_format = 'dat'
if data_format != 'auto':
cfg.set('fileFormat', data_format)
# create output folder:
if not os.path.exists(out_path):
os.makedirs(out_path)
# read in meta file:
md = None
rec_data = None
temp_col = None
if len(meta_file) > 0:
mds = meta_file.split(':')
meta_data = mds[0]
if not os.path.isfile(meta_data):
print('meta data file "%s" not found.' % meta_data)
exit()
md = TableData(meta_data)
if len(mds) < 2:
print('no recording column specified for the table in %s. Choose one of' % meta_data)
for k in md.keys():
print(' ', k)
exit()
rec_col = mds[1]
if rec_col not in md:
print('%s is not a valid key for the table in %s. Choose one of' % (rec_col, meta_data))
for k in md.keys():
print(' ', k)
exit()
else:
rec_data = md[:,rec_col]
del md[:,rec_col]
if len(mds) > 2:
temp_col = mds[2]
if temp_col not in md:
print('%s is not a valid key for the table in %s. Choose one of' % (temp_col, meta_data))
for k in md.keys():
print(' ', k)
exit()
# collect files:
wave_table, pulse_table, all_table = collect_fish(files, args.simplify_file,
md, rec_data, args.skip,
temp_col, args.q10,
args.max_fish, args.harmonics,
args.pulse_peaks[0], args.pulse_peaks[1],
cfg, verbose)
# write tables:
if len(file_suffix) > 0 and file_suffix[0] != '-':
file_suffix = '-' + file_suffix
tables = []
table_names = []
if pulse_table and (not table_type or table_type == 'pulse'):
tables.append(pulse_table)
table_names.append('pulse')
if wave_table and (not table_type or table_type == 'wave'):
tables.append(wave_table)
table_names.append('wave')
if all_table and not table_type:
tables.append(all_table)
table_names.append('all')
for table, name in zip(tables, table_names):
for rc in remove_cols:
if rc in table:
table.remove(rc)
table.write(os.path.join(out_path, '%sfish%s' % (name, file_suffix)),
**write_table_args(cfg))
if statistics:
s = table.statistics()
s.write(os.path.join(out_path, '%sfish%s-statistics' % (name, file_suffix)),
**write_table_args(cfg))
if __name__ == '__main__':
main()
Functions
def collect_fish(files, simplify_file=False, meta_data=None, meta_recordings=None, skip_recordings=False, temp_col=None, q10=1.62, max_fish=0, harmonics=None, peaks0=None, peaks1=None, cfg=None, verbose=0)
Combine all *-wavefish.* and/or *-pulsefish.* files into respective summary tables.

Data from the *-wavespectrum-*.* and the *-pulsepeaks-*.* files can be added as specified by `harmonics`, `peaks0`, and `peaks1`. Meta data of the recordings can also be added via `meta_data` and `meta_recordings`. If `meta_data` contains a column with temperature, this column can be specified by the `temp_col` parameter. In this case, a 'T_adjust' and an 'EODf_adjust' column are inserted into the resulting tables, containing the mean temperature and the EOD frequencies adjusted to this temperature, respectively. For the temperature adjustment of EOD frequency a Q10 value can be supplied by the `q10` parameter.

Parameters

files: list of strings
    Files to be combined.
simplify_file: boolean
    Remove initial common directories from input files.
meta_data: TableData or None
    Table with additional data for each of the recordings. The meta data are inserted into the summary table according to the name of the recording as specified in `meta_recordings`.
meta_recordings: array of strings
    For each row in `meta_data` the name of the recording. This name is matched against the basename of input `files`.
skip_recordings: bool
    If True, skip recordings that are not found in `meta_recordings`.
temp_col: string or None
    A column in `meta_data` with temperatures to which EOD frequencies should be adjusted.
q10: float
    Q10 value describing the temperature dependence of EOD frequencies. The default of 1.62 is from Dunlap, Smith, Yetka (2000) Brain Behav Evol, measured for Apteronotus leptorhynchus in the lab.
max_fish: int
    Maximum number of fish to be taken; if 0, take all.
harmonics: int
    Number of harmonics to be added to the wave-type fish table (amplitude, relampl, phase). This data is read in from the corresponding *-wavespectrum-*.* files.
peaks0: int
    Index of the first peak of an EOD pulse to be added to the pulse-type fish table. This data is read in from the corresponding *-pulsepeaks-*.* files.
peaks1: int
    Index of the last peak of an EOD pulse to be added to the pulse-type fish table. This data is read in from the corresponding *-pulsepeaks-*.* files.
cfg: ConfigFile
    Configuration parameter for EOD quality assessment and species assignment.
verbose: int
    Verbose output: 1: print info on meta data coverage. 2: print additional info on discarded recordings.

Returns

wave_table: TableData
    Summary table for all wave-type fish.
pulse_table: TableData
    Summary table for all pulse-type fish.
all_table: TableData
    Summary table for all wave-type and pulse-type fish.
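A rough usage sketch of calling collect_fish() directly follows. The file names are hypothetical, and the configuration is set up the same way main() does it, since collect_fish() reads the species file and similarity thresholds from `cfg`; the temperature adjustment mentioned above follows the usual Q10 relation as implemented by adjust_eodf().

from thunderlab.configfile import ConfigFile
from thunderfish.collectfish import collect_fish
from thunderfish.harmonics import add_harmonic_groups_config
from thunderfish.eodanalysis import add_eod_quality_config, add_species_config

# set up the configuration as main() does:
cfg = ConfigFile()
add_harmonic_groups_config(cfg)
add_eod_quality_config(cfg)
add_species_config(cfg)

# hypothetical analysis files produced by thunderfish:
files = ['data/river1-wavefish.csv', 'data/river2-pulsefish.csv']
wave_table, pulse_table, all_table = collect_fish(files, harmonics=3,
                                                  peaks0=0, peaks1=1,
                                                  cfg=cfg, verbose=1)
# write the wave-type summary table:
if wave_table is not None:
    wave_table.write('wavefish-summary.csv')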
def rangestr(string)
Parse a string of the form N:M into a tuple (N, M).
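For example (values follow directly from the implementation shown above):

rangestr('0:2')   # -> (0, 2)
rangestr('4')     # -> (4, 4)
rangestr('=1:3')  # -> (-1, 3); main() rewrites a leading '-' after the -p
                  # option to '=' so argparse does not mistake it for an option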
def main(cargs=None)
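main() implements the command-line interface. It can also be invoked programmatically by passing the argument list directly; a minimal sketch with hypothetical file names and metadata columns:

from thunderfish.collectfish import main

# add the first three harmonics and pulse peaks 0 to 2, pull metadata from
# fishmeta.csv (matching recordings in its 'recording' column and adjusting
# EODf to the mean of its 'temperature' column), and write the summary
# tables to the 'summary' directory:
main(['-w', '3', '-p', '0:2',
      '-i', 'fishmeta.csv:recording:temperature',
      '-o', 'summary',
      'data/river1-wavefish.csv', 'data/river1-pulsefish.csv'])

From the shell, the equivalent would be `python -m thunderfish.collectfish -w 3 -p 0:2 -i fishmeta.csv:recording:temperature -o summary data/*-wavefish.csv data/*-pulsefish.csv` (the collectfish console script, if installed, takes the same arguments).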