From 32fe2ba67a598c751385b96aa45a2947f8f31ced Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 29 Mar 2022 18:51:32 -0400 Subject: [PATCH 1/2] DOC: convert docs to markdown + use myst, fix #151 - add .md files generated by rst-to-myst - add dependencies to 'doc' in pyproject.toml - update extensions in conf.py, add myst_parser and a couple others - delete .rst versions of docs files - fix broken md in doc/howto-user-format.md from weird .rst that didn't parse well --- doc/{background.rst => background.md} | 69 ++-- doc/conf.py | 6 +- doc/howto-user-format.md | 471 +++++++++++++++++++++++++ doc/howto-user-format.rst | 482 -------------------------- doc/howto.md | 11 + doc/howto.rst | 12 - doc/index.md | 200 +++++++++++ doc/index.rst | 211 ----------- doc/{tutorial.rst => tutorial.md} | 324 ++++++++--------- pyproject.toml | 6 +- 10 files changed, 887 insertions(+), 905 deletions(-) rename doc/{background.rst => background.md} (50%) create mode 100644 doc/howto-user-format.md delete mode 100644 doc/howto-user-format.rst create mode 100644 doc/howto.md delete mode 100644 doc/howto.rst create mode 100644 doc/index.md delete mode 100644 doc/index.rst rename doc/{tutorial.rst => tutorial.md} (56%) diff --git a/doc/background.rst b/doc/background.md similarity index 50% rename from doc/background.rst rename to doc/background.md index ed4c1372..d9a8bf59 100644 --- a/doc/background.rst +++ b/doc/background.md @@ -1,13 +1,10 @@ -.. _background: +(background)= -============== -**Background** -============== +# **Background** -**Why is** ``crowsetta`` **needed**? -==================================== +## **Why is** `crowsetta` **needed**? 
-The target audience of ``crowsetta`` is anyone that works with birdsong +The target audience of `crowsetta` is anyone that works with birdsong or any other vocalization that is annotated in some way, meaning someone took the time to figure out where elements of the vocalizations start and stop, and has assigned labels to those elements. Maybe you are a neuroscientist trying to figure out how songbirds learn their song, @@ -19,55 +16,55 @@ Alzheimer's disease, etc., etc., ... To run a computational analysis on this kind of data, you'll need to get the annotation out of a file, which often means you'll end up writing something like this: -.. code-block:: python - - from scipy.io import loadmat # function from scipy library for loading Matlab data files - annot = loadmat('bird1_experiment1_annotation_2018-11-17_083521.mat', squeeze_me=True) - onsets = annot['onsets'] # unpack from dictionary - onsets = np.asarray(onsets) # convert to an array - onsets = onsets / 1000 # convert from milliseconds to seconds +```python +from scipy.io import loadmat # function from scipy library for loading Matlab data files +annot = loadmat('bird1_experiment1_annotation_2018-11-17_083521.mat', squeeze_me=True) +onsets = annot['onsets'] # unpack from dictionary +onsets = np.asarray(onsets) # convert to an array +onsets = onsets / 1000 # convert from milliseconds to seconds +``` This is verbose and not easy to read. You could do some of it in one line ... -.. code-block:: python - - onsets = np.asarray(annot['onsets']) / 1000 +```python +onsets = np.asarray(annot['onsets']) / 1000 +``` ... but now the next time you read that one-liner, you will have to mentally unpack it. -Such code quickly turns into `boilerplate `_ +Such code quickly turns into [boilerplate](https://en.wikipedia.org/wiki/Boilerplate_code) that you will write any time you need to work with this data. It becomes repetitive and -presents many opportunities for easy-to-miss bugs (e.g. 
a line with a variable named ``offset`` -where you meant to type ``onset`` of some syllable or phoneme or whatever, because you cut and -pasted the line above it, and forgot to change ``off`` to ``on``\ ). +presents many opportunities for easy-to-miss bugs (e.g. a line with a variable named `offset` +where you meant to type `onset` of some syllable or phoneme or whatever, because you cut and +pasted the line above it, and forgot to change `off` to `on`). And things can become even more complicated if you have to deal with annotation stored in other formats, such as a database. Here's an example of one way -.. code-block:: python - - import pymyseql +```python +import pymyseql +``` What would be nice is to have data types that represent annotation in a concise way, and that we can manipulate like we would some native Python data type like a list or a -dictionary. ``crowsetta`` provides such data types: ``Sequence``\ s and ``Segment``\ s. +dictionary. `crowsetta` provides such data types: `Sequence`s and `Segment`s. -**How** ``crowsetta`` **works** -=============================== +## **How** `crowsetta` **works** -Internally, ``crowsetta`` takes whatever format you give it for a pile of files, -and turns that into a bunch of ``Sequence``\ s made up of ``Segment``\ s. For someone working -with birdsong, the ``Sequence``\ s will be single audio files / song bouts, and the -``Segment``\ s will be syllables in those song bouts (99.9% of the time). Then, if -you need it to, ``crowsetta`` can spit out your ``Sequence``\ s of ``Segment``\ s in +Internally, `crowsetta` takes whatever format you give it for a pile of files, +and turns that into a bunch of `Sequence`s made up of `Segment`s. For someone working +with birdsong, the `Sequence`s will be single audio files / song bouts, and the +`Segment`s will be syllables in those song bouts (99.9% of the time). 
Then, if +you need it to, `crowsetta` can spit out your `Sequence`s of `Segment`s in a simple text file with a comma-separated value (csv) format. This file format was chosen because it is widely considered to be the most robust way to share data. An example csv looks like this: -.. literalinclude:: ../tests/test_data/csv/gy6or6_032312.csv - :lines: 1-5 - :language: none +```{literalinclude} ../tests/test_data/csv/gy6or6_032312.csv +:language: none +:lines: 1-5 +``` -Now that you have that, you can load it into a `pandas` dataframe or an Excel +Now that you have that, you can load it into a `pandas` dataframe or an Excel spreadsheet or a SQL database, or whatever you want. diff --git a/doc/conf.py b/doc/conf.py index 7d72a6bd..e87bebc6 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -39,12 +39,16 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ + 'myst_parser', 'sphinx.ext.autodoc', + 'sphinx.ext.coverage', + 'sphinx_copybutton', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', - 'sphinx.ext.coverage', 'sphinx.ext.mathjax', + 'sphinx.ext.napoleon', + 'sphinxext.opengraph', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode', ] diff --git a/doc/howto-user-format.md b/doc/howto-user-format.md new file mode 100644 index 00000000..94905c20 --- /dev/null +++ b/doc/howto-user-format.md @@ -0,0 +1,471 @@ +(howto-user-format)= + +# **How to use** `crowsetta` **with your own annotation format** + +This section shows you how to use `crowsetta` for working with your +own annotation format for vocalizations (or some other format not +currently built into the library). + +You can get the Jupyter notebook for this section by going to +<https://github.com/NickleDave/crowsetta> and clicking on the big green +“Clone or Download” button on the right side of the screen. You can then +find this notebook and others in the `crowsetta/notebooks/` directory. 
+ +## Steps to using `crowsetta` with your own annotation format + +Below we’ll walk through a case study for using `crowsetta` with your +annotation format. Here’s an outline of the steps we’ll go through: + +1. get your annotations into some variables in Python (maybe you already + wrote code to do this) +2. use one of the `Sequence` “factory functions” (we’ll explain what + that means) to conveniently turn your annotations into + `Sequence`s +3. turn the code you just wrote into a function that takes annotation + files as an argument, and returns `Sequence`s +4. make a `Transcriber` that knows to use this function when you tell + it you want to turn your annotation files into `Sequence`s + +## Case Study: the `BatLAB` format + +Let’s say you work in the Schumacher lab, studying bat vocalizations. +The lab research specialist, Alfred, has spent years writing an +application in Labview to capture bat calls, called `SoNAR` (“Sound +and Neural Activity Recorder”). Alfred has also written a GUI in MATLAB +called `BatLAB` that lets you interactively annotate audio files +containing the bats’ calls, and saves the annotations in `.mat` +(MATLAB data) files. + +You’ve started to work with Python to analyze your data, because you +like the data science and machine learning libraries. However, you find +yourself writing the same code over and over again to unpack the +annotations from the `.mat` files made by `BatLAB`. Every time you +use the code for a new analysis, you have to modify it slightly. The +code has some weird, hard-to-read lines to deal with the complicated +MATLAB `struct`s created by `BatLAB` and how they load into +Python. The code also has several repetitive steps to deal with the +idiosyncracies of how `SoNAR` and `BatLAB` save data: unit +conversion, data types, etcetera. You can’t change `BatLAB` or +`SoNAR` though, because that’s Alfred’s job, and everyone else’s code +that was written ten years ago (and still works!) expects those +idiosyncracies. 
+ +You know that it’s a good idea to turn the code you wrote into a +function (because you took part in a [Software +Carpentry](https://software-carpentry.org/) workshop and then you +read [this +paper](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005510).) +You figured out which bits of the code will be common to all your +projects and you make that into a function, called `parse_batlab_mat`. +At first you just copy and paste it into all your projects. Then you +decide you also want to save everyone else in your lab the effort of +writing the same code, so you put the script on your lab’s Github page. +This is a step in the right direction, although `parse_batlab_mat` +gives you back a Python `list` of `dict`s, and you end up typing a +lot of things like: + +```python +labels = annot_list[0]['seg_type'] +onsets = annot_list[0]['seg_onsets'] +offsets = annot_list[0]['seg_offsets'] +``` + +Typing all those very similar `['keys']` in particular gets kind of +annoying and makes you wonder if you should spend your vacation learning +how to use one of those hacker text editors like `vim`. + +But before you can worry about that, you get back reviews of your paper +in *PLOS Comp. Bio.* called “Pidgeon Bat: Emergence of Dialects in +Colonies of Multiple Bat Species”. Reviewer #3 doesn’t buy your +conclusions (and you are pretty sure from the way they write that it is +Oswald Cobblepot, professor emeritus of ethology at Metropolitan +University of Fruitville, Florida, and author of the seminal review from +1982, “Bat Calls: A Completely Innate Behavior Encoded Genetically”). +You want to share your data with the world, mainly to mollify reviewer +\#3. The problem is that this reviewer (if he is who you think he is) +only knows how to write Fortran code and is definitely not going to +figure out how to copy and use your function `parse_batlab_mat` so he +can run your analysis scripts and reproduce your figures for himself. 
+ +What you really want is to share your data and write your code in a way +that doesn’t depend on anyone knowing anything about `BatLAB` +or `SoNAR` and how those programs save data and annotations. This is +where `crowsetta` comes to your rescue. + +Okay, now that we’ve set up some background for our case study, let’s go +through the steps we outlined above. + +### 1. get your annotation into some variables in Python + +Let’s look at this complicated data structure that we have our +annotation in. The `BatLAB` GUI saves annotation into `annotation.mat` +files with two variables: +- `filenames`: a vector where each element is the name of an audio file +- `annotations`: a `struct` that has a record for each element in `filenames`, + and that record is the annotation corresponding + to the audio file with the same index in `filenames` + +```ipython3 +from scipy.io import loadmat +bat1_annotation = loadmat('bat1_annotation.mat') +print('variables in .mat file:', + [var for var in list(bat1_annotation.keys()) + if not var.startswith('__')] + ) +``` + +```{eval-rst} +.. parsed-literal:: + + variables in .mat file: ['filenames', 'annotations'] + +``` + +Below is the code you wrote to unpack the `.mat` files. Like we said +above, the code has some weird, hard-to-read lines to deal with the way +that the complicated MATLAB `struct`s created by `BatLAB` load +into Python, such as calling `tolist()` just to unpack an array, and +some logic to make sure the labels get loaded correctly into a numpy +array. And the code has several repetitive steps to deal with the +idiosyncracies of `SoNAR` and `BatLAB`, like converting the start +and stop times of the calls from seconds back to Hertz so you can find +those times in the raw audio files. 
+ +```ipython3 +# %load -r 7-8,14-46 parsebat.py +mat = loadmat(mat_file, squeeze_me=True) +annot_list = [] +for filename, annotation in zip(mat['filenames'], mat['annotations']): + # below, .tolist() does not actually create a list, + # instead gets ndarray out of a zero-length ndarray of dtype=object. + # This is just weirdness that results from loading complicated data + # structure in .mat file. + seg_start_times = annotation['segFileStartTimes'].tolist() + seg_end_times = annotation['segFileEndTimes'].tolist() + seg_types = annotation['segType'].tolist() + if type(seg_types) == int: + # this happens when there's only one syllable in the file + # with only one corresponding label + seg_types = np.asarray([seg_types]) # so make it a one-element list + elif type(seg_types) == np.ndarray: + # this should happen whenever there's more than one label + pass + else: + # something unexpected happened + raise ValueError("Unable to load labels from {}, because " + "the segType parsed as type {} which is " + "not recognized.".format(filename, + type(seg_types))) + samp_freq = annotation['fs'].tolist() + seg_start_times_Hz = np.round(seg_start_times * samp_freq).astype(int) + seg_end_times_Hz = np.round(seg_end_times * samp_freq).astype(int) + annot_dict = { + 'audio_file': filename, + 'seg_types': seg_types, + 'seg_start_times': seg_start_times, + 'seg_end_times': seg_end_times, + 'seg_start_times_Hz': seg_start_times_Hz, + 'seg_end_times_Hz': seg_end_times_Hz, + } + annot_list.append(annot_dict) +``` + +When it runs on a file, you end up with an `annot_list` where each +item in the list is an `annot_dict` that contains the annotations for +a file, like this: + +```python +annot_dict = { + 'seg_types': array([1, 1, 5, 2, ...]), + 'seg_start_times': array([0.00297619, 0.279125, 0.55564729,... ]), + ... 
# end times, start and end times in Hertz +} +``` + +Again, as we said above, you turned your code into a function to make it +easier to use across projects: + +```python +import numpy as np +from scipy.io import loadmat + +def parse_batlab_mat(mat_file): + """parse batlab annotation.mat file""" + # code from above + return annot_list +``` + +As we’ll see in a moment, all you need to do is take this code you +already wrote, and instead of returning your `list` of `dict`s, +you return a list of `Sequence`s. + +### 2. use one of the `Sequence` “factory functions” to conveniently turn annotations in your format into `Sequence`s + +First, to get the `Sequence`, we’ll use a “factory function”, which +just means it’s a function built into the `Sequence` class that gives +us back an instance of a `Sequence`. One such factory function is +`Sequence.from_keyword`. Here’s an example of using it: + +```ipython3 +from parsebat import parse_batlab_mat +from crowsetta.sequence import Sequence + +# you, using the function you already wrote +annot_list = parse_batlab_mat(mat_file='bat1_annotation.mat') + +# you have annotation from one file in an "annot_dict" +annot_dict = annot_list[0] + +a_sequence = Sequence.from_keyword(labels=annot_dict['seg_types'], + onsets_s=annot_dict['seg_start_times'], + offsets_s=annot_dict['seg_end_times'], + onset_inds=annot_dict['seg_start_times_Hz'], + offset_inds=annot_dict['seg_end_times_Hz'], + file=annot_dict['audio_file']) +print("a_sequence:\n", a_sequence) +``` + +```{eval-rst} +.. parsed-literal:: + + a_sequence: + + +``` + +### 3. turn the code we just wrote into a function that takes annotation files as an argument, and returns `Sequence`s + +Again, you pretty much already wrote this. Just take your +`parse_batlab_mat` function from above and change a couple lines. +First, you’re going to return a list of sequences instead of your +`annot_list` from before. You probably want to make that explicit in +your function. 
+ +```ipython3 +# %load -r 4-7,24-25 batlab2seq.py +from crowsetta.sequence import Sequence + + +def batlab2seq(mat_file): + mat = loadmat(mat_file, squeeze_me=True) + seq_list = [] +``` + +Then at the end of your main loop, instead of making your +`annot_dict`, you’ll make a new `Sequence` from each file using the +`from_keyword` factory function, append the new `Sequence` to your +`seq_list`, and then finally return that `list` of `Sequence`s. + +```ipython3 + # %load -r 56-63 batlab2seq.py + seq = Sequence.from_keyword(file=filename, + labels=seg_types, + onsets_s=seg_start_times, + offsets_s=seg_end_times, + onset_inds=seg_start_times_Hz, + offset_inds=seg_end_times_Hz) + seq_list.append(seq) + return seq_list +``` + +If this still feels too wordy and repetitive for you, you can put +`segFileStartTimes`, `segFileEndTimes`, et al., into a Python +`dict` with `keys` corresponding to the parameters for +`Sequence.from_keyword`: + + +% + +```python +annot_dict = { + 'file': filename, + 'onsets_s': annotation['segFileStartTimes'].tolist(), + 'offsets_s': annotation['segFileEndTimes'].tolist(), + 'labels': seg_types +} +``` + +Note here that you only have to specify the onsets and offsets of +segments *either* in seconds or in Hertz (but you can define +both). + + +% + +and then use another factory function, `Sequence.from_dict`, to +create the `Sequence`. +```python +seq_list.append(Sequence.from_dict(annot_dict)) +``` + +Now that you have a function that takes annotation files and returns +`Sequence`s, call it something like `batlab2seq` and put it in a +file that ends with `.py`, e.g. `batlab2seq.py`. This is also +known as a Python **module** (as you’ll need to know below). To see the +entire example, check out the [batlab2seq.py](./batlab2seq.py) file +in this folder (and compare it with [parsebat.py](./parsebat.py)). + +### 4. 
make a `Transcriber` that knows to use this function when you tell it you want to turn your annotation files into `Sequence`s + +If you have worked with `Crowsetta` already, or gone through the +tutorial, you know that we can work with a `Transcriber` that does the +work of making `Sequence`s of `Segment`s from annotation files +for us. We create a new instance of a `Transcriber` by writing +something like this: + +```python +scribe = Transcriber() +``` + +You will do the same thing here, but to tell the `Transcriber` how to +work with your format, you will pass an argument for the `user_config` +parameter when you create a new one: + +```python +scribe = Transcriber(user_config=your_config) +``` + +The argument you pass for `user_config` will be a Python dictionary +with the following structure: + +```python +your_config = { + 'batlab': { + 'module': 'batlab2seq.py', + 'to_seq': 'batlab2seq', + 'to_csv': 'None', + 'to_format': 'None', + } +} +``` + +Notice that this is a dictionary of dictionaries, where each `key` in the +top-level `dict` is the name of a user-defined format, here +`batlab`. If you had multiple formats to use, you would add more +`dict`s inside the top-level `dict`. + +The `value` for each `key` is another Python dictionary that tells +the `Transcriber` what functions to use from your module when you call +one of its methods and specify this format. In the example above, you’re +telling the `Transcriber` that when you say `file_format='batlab'`, +it should use functions from the `batlab2seq.py` module. More +specifically, when you call +`scribe.to_seq(file='annotation.mat', file_format='batlab')`, it +should use the `batlab2seq` function to convert your annotation into +`Sequence`s. Notice also that you can specify `'None'` for +`to_csv` and `to_format` (which would be a function that converts +`Sequence`s back to the `BatLAB` format). 
+ +Here’s what it looks like to do all of that in a few lines of code: + +```ipython3 +from crowsetta import Transcriber + +your_config = { + 'batlab': { + 'module': 'batlab2seq.py', + 'to_seq': 'batlab2seq', + } +} + +scribe = Transcriber(user_config=your_config) + +seq_list = scribe.to_seq(file='bat1_annotation.mat', file_format='batlab') +``` + +And now, just like you do with the built-in formats, you get back a list +of `Sequence`s from your format: + +```ipython3 +print(f'First item in seq_list: {seq_list[0]}') +print(f'First segment in first sequence:\n{seq_list[0].segments[0]}') +``` + +```{eval-rst} +.. parsed-literal:: + + First item in seq_list: + First segment in first sequence: + Segment(label='1', file='lbr3009_0005_2017_04_27_06_14_46.wav', onset_s=0.0029761904761904934, offset_s=0.14150432900432905, onset_ind=143, offset_ind=6792) + +``` + +Notice that we also get a `to_csv` function for free: + +```ipython3 +scribe.to_csv(file='bat1_annotation.mat', + csv_filename='test.csv', + file_format='batlab') + +import csv +with open('test.csv', 'r', newline='') as csv_file: + reader = csv.reader(csv_file) + for _ in range(4): + print(next(reader)) +``` + +```{eval-rst} +.. parsed-literal:: + + ['label', 'onset_s', 'offset_s', 'onset_ind', 'offset_ind', 'file'] + ['1', '0.0029761904761904934', '0.14150432900432905', '143', '6792', 'lbr3009_0005_2017_04_27_06_14_46.wav'] + ['1', '0.279125', '0.504625', '13398', '24222', 'lbr3009_0005_2017_04_27_06_14_46.wav'] + ['5', '0.5556472915365209', '0.5962916666666667', '26671', '28622', 'lbr3009_0005_2017_04_27_06_14_46.wav'] + +``` + +How does that work? Well, as long as we can convert our annotation +format to `Sequence`s, then we can pass those `Sequence`s to the +`crowsetta.seq2csv` function, which will output them as a `.csv` +file. The `Transcriber` does this by default. 
Under the hood, when you +make a new `Transcriber` with your `user_config`, it wraps your +`format2seq` function and the `seq2csv` function into one, using the +function `crowsetta.csv.toseq_func_to_csv`. + +## Summary + +Now you have seen in detail the process of working with your own +annotation format in `Crowsetta`. Here’s a review of the steps, with +some code snippets worked in to tie it all together: + +1. get your annotations into some variables in Python, perhaps using + code you already wrote +2. use one of the `Sequence` “factory functions” to conveniently turn + your annotations into `Sequence`s +3. turn all that code into a function that takes annotation files as an + argument, and returns `Sequence`s + + +Steps 1-3 will give you something like this in a file named something +like `myformat.py`: +```python +from crowsetta import Sequence +def myformat2seq(my_format_files): + seq_list = [] + for format_file in my_format_files: + # load annotation into some Python variables, e.g. a dictionary + annot_dict = magic_annotation_unpacking_function(format_file) + seq = Sequence.from_dict(annot_dict) + seq_list.append(seq) + return seq_list +``` + +4. make a `Transcriber` that knows to use this function when you tell + it you want to turn your annotation files into `Sequence`s, + and/or csv files, or to convert back to your format from + `Sequence`s (assuming you wrote a function in your module that + will do so). + +```python +from crowsetta import Transcriber +my_config = { + 'my_format': { + 'module': 'myformat.py', + 'to_seq': 'myformat2seq', + 'to_csv': 'myformat2csv', + 'to_format': 'seq2myformat', + } +} +scribe = Transcriber(user_config=my_config) +seq_list = scribe.to_seq(file='my_annotations.txt', file_format='my_format') +``` diff --git a/doc/howto-user-format.rst b/doc/howto-user-format.rst deleted file mode 100644 index 0d56ee5c..00000000 --- a/doc/howto-user-format.rst +++ /dev/null @@ -1,482 +0,0 @@ - -.. 
_howto-user-format: - -**How to use** ``crowsetta`` **with your own annotation format** -================================================================ - -This section shows you how to use ``crowsetta`` for working with your -own annotation format for vocalizations (or some other format not -currently built into the library). - -You can get the Jupyter notebook for this section by going to -https://github.com/NickleDave/crowsetta and clicking on the big green -“Clone or Download” button on the right side of the screen. You can then -find this notebook and others in the ``crowsetta/notebooks/`` directory. - -Steps to using ``crowsetta`` with your own annotation format ------------------------------------------------------------- - -Below we’ll walk through a case study for using ``crowsetta`` with your -annotation format. Here’s an outline of the steps we’ll go through: - -1. get your annotations into some variables in Python (maybe you already - wrote code to do this) -2. use one of the ``Sequence`` “factory functions” (we’ll explain what - that means) to conveniently turn your annotations into - ``Sequence``\ s -3. turn the code you just wrote into a function that takes annotation - files as an argument, and returns ``Sequence``\ s -4. make a ``Transcriber`` that knows to use this function when you tell - it you want to turn your annotation files into ``Sequence``\ s - -Case Study: the ``BatLAB`` format ---------------------------------- - -Let’s say you work in the Schumacher lab, studying bat vocalizations. -The lab research specialist, Alfred, has spent years writing an -application in Labview to capture bat calls, called ``SoNAR`` (“Sound -and Neural Activity Recorder”). Alfred has also written a GUI in MATLAB -called ``BatLAB`` that lets you interactively annotate audio files -containing the bats’ calls, and saves the annotations in ``.mat`` -(MATLAB data) files. 
- -You’ve started to work with Python to analyze your data, because you -like the data science and machine learning libraries. However, you find -yourself writing the same code over and over again to unpack the -annotations from the ``.mat`` files made by ``BatLAB``. Every time you -use the code for a new analysis, you have to modify it slightly. The -code has some weird, hard-to-read lines to deal with the complicated -MATLAB ``struct``\ s created by ``BatLAB`` and how they load into -Python. The code also has several repetitive steps to deal with the -idiosyncracies of how ``SoNAR`` and ``BatLAB`` save data: unit -conversion, data types, etcetera. You can’t change ``BatLAB`` or -``SoNAR`` though, because that’s Alfred’s job, and everyone else’s code -that was written ten years ago (and still works!) expects those -idiosyncracies. - -You know that it’s a good idea to turn the code you wrote into a -function (because you took part in a `Software -Carpentry `__ workshop and then you -read `this -paper `__.) -You figured out which bits of the code will be common to all your -projects and you make that into a function, called ``parse_batlab_mat``. -At first you just copy and paste it into all your projects. Then you -decide you also want to save everyone else in your lab the effort of -writing the same code, so you put the script on your lab’s Github page. -This is a step in the right direction, although ``parse_batlab_mat`` -gives you back a Python ``list`` of ``dict``\ s, and you end up typing a -lot of things like: - -.. code:: python - - labels = annot_list[0]['seg_type'] - onsets = annot_list[0]['seg_onsets'] - offsets = annot_list[0]['seg_offsets'] - -Typing all those very similar ``['keys']`` in particular gets kind of -annoying and makes you wonder if you should spend your vacation learning -how to use one of those hacker text editors like ``vim``. - -But before you can worry about that, you get back reviews of your paper -in *PLOS Comp. 
Bio.* called “Pidgeon Bat: Emergence of Dialects in -Colonies of Multiple Bat Species”. Reviewer #3 doesn’t buy your -conclusions (and you are pretty sure from the way they write that it is -Oswald Cobblepot, professor emeritus of ethology at Metropolitan -University of Fruitville, Florida, and author of the seminal review from -1982, “Bat Calls: A Completely Innate Behavior Encoded Genetically”). -You want to share your data with the world, mainly to mollify reviewer -#3. The problem is that this reviewer (if he is who you think he is) -only knows how to write Fortran code and is definitely not going to -figure out how to copy and use your function ``parse_batlab_mat`` so he -can run your analysis scripts and reproduce your figures for himself. - -What you really want is to share your data and write your code in a way -that doesn’t depend on anyone knowing anything about ``BatLAB`` -or\ ``SoNAR`` and how those programs save data and annotations. This is -where ``crowsetta`` comes to your rescue. - -Okay, now that we’ve set up some background for our case study, let’s go -through the steps we outlined above. - -1. get your annotation into some variables in Python -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -| Let’s look at this complicated data structure that we have our - annotation in. The ``BatLAB`` GUI saves annotation into - ``annotation.mat`` files with two variables: -| - ``filenames``: a vector where each element is the name of an audio - file - ``annotations``: a ``struct`` that has a record for each - element in ``filenames``, and that record is the annotation - corresponding to the audio file with the same index in ``filenames`` - -.. code:: ipython3 - - from scipy.io import loadmat - bat1_annotation = loadmat('bat1_annotation.mat') - print('variables in .mat file:', - [var for var in list(bat1_annotation.keys()) - if not var.startswith('__')] - ) - - -.. 
parsed-literal:: - - variables in .mat file: ['filenames', 'annotations'] - - -Below is the code you wrote to unpack the ``.mat`` files. Like we said -above, the code has some weird, hard-to-read lines to deal with the way -that the complicated MATLAB ``struct``\ s created by ``BatLAB`` load -into Python, such as calling ``tolist()`` just to unpack an array, and -some logic to make sure the labels get loaded correctly into a numpy -array. And the code has several repetitive steps to deal with the -idiosyncracies of ``SoNAR`` and ``BatLAB``, like converting the start -and stop times of the calls from seconds back to Hertz so you can find -those times in the raw audio files. - -.. code:: ipython3 - - # %load -r 7-8,14-46 parsebat.py - mat = loadmat(mat_file, squeeze_me=True) - annot_list = [] - for filename, annotation in zip(mat['filenames'], mat['annotations']): - # below, .tolist() does not actually create a list, - # instead gets ndarray out of a zero-length ndarray of dtype=object. - # This is just weirdness that results from loading complicated data - # structure in .mat file. 
- seg_start_times = annotation['segFileStartTimes'].tolist() - seg_end_times = annotation['segFileEndTimes'].tolist() - seg_types = annotation['segType'].tolist() - if type(seg_types) == int: - # this happens when there's only one syllable in the file - # with only one corresponding label - seg_types = np.asarray([seg_types]) # so make it a one-element list - elif type(seg_types) == np.ndarray: - # this should happen whenever there's more than one label - pass - else: - # something unexpected happened - raise ValueError("Unable to load labels from {}, because " - "the segType parsed as type {} which is " - "not recognized.".format(filename, - type(seg_types))) - samp_freq = annotation['fs'].tolist() - seg_start_times_Hz = np.round(seg_start_times * samp_freq).astype(int) - seg_end_times_Hz = np.round(seg_end_times * samp_freq).astype(int) - annot_dict = { - 'audio_file': filename, - 'seg_types': seg_types, - 'seg_start_times': seg_start_times, - 'seg_end_times': seg_end_times, - 'seg_start_times_Hz': seg_start_times_Hz, - 'seg_end_times_Hz': seg_end_times_Hz, - } - annot_list.append(annot_dict) - -When it runs on a file, you end up with an ``annot_list`` where each -item in the list is an ``annot_dict`` that contains the annotations for -a file, like this: - -.. code:: python - - annot_dict = { - 'seg_types': array([1, 1, 5, 2, ...]), - 'seq_start_times': array([0.00297619, 0.279125, 0.55564729,... ]), - ... # end times, start and end times in Hertz - } - -Again, as we said above, you turned your code into a function to make it -easier to use across projects: - -.. code:: python - - import numpy as np - from scipy.io import loadmat - - def parse_batlab_mat(mat_file): - """parse batlab annotation.mat file""" - # code from above - return annot_list - -As we’ll see in a moment, all you need to do is take this code you -already wrote, and instead of returning your ``list`` of ``dict``\ s, -you return a list of ``Sequence``\ s. - -2. 
use one of the ``Sequence`` “factory functions” to conveniently turn annotations in your format into ``Sequence``\ s -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -First, to get the ``Sequence``, we’ll use a “factory function”, which -just means it’s a function built into the ``Sequence`` class that gives -us back an instance of a ``Sequence``. One such factory function is -``Sequence.from_keyword``. Here’s an example of using it: - -.. code:: ipython3 - - from parsebat import parse_batlab_mat - from crowsetta.sequence import Sequence - - # you, using the function you already wrote - annot_list = parse_batlab_mat(mat_file='bat1_annotation.mat') - - # you have annotation from one file in an "annot_dict" - annot_dict = annot_list[0] - - a_sequence = Sequence.from_keyword(labels=annot_dict['seg_types'], - onsets_s=annot_dict['seg_start_times'], - offsets_s=annot_dict['seg_end_times'], - onset_inds=annot_dict['seg_start_times_Hz'], - offset_inds=annot_dict['seg_end_times_Hz'], - file=annot_dict['audio_file']) - print("a_sequence:\n", a_sequence) - - -.. parsed-literal:: - - a_sequence: - - - -3. turn the code we just wrote into a function that takes annotation files as an argument, and returns ``Sequence``\ s -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Again, you pretty much already wrote this. Just take your -``parse_batlab_mat`` function from above and change a couple lines. -First, you’re going to return a list of sequences instead of your -``annot_list`` from before. You probably want to make that explicit in -your function. - -.. 
code:: ipython3 - - # %load -r 4-7,24-25 batlab2seq.py - from crowsetta.sequence import Sequence - - - def batlab2seq(mat_file): - mat = loadmat(mat_file, squeeze_me=True) - seq_list = [] - -Then at the end of your main loop, instead of making your -``annot_dict``, you’ll make a new ``Sequence`` from each file using the -``from_keyword`` factory function, append the new ``Sequence`` to your -``seq_list``, and then finally return that ``list`` of ``Sequence``\ s. - -.. code:: ipython3 - - # %load -r 56-63 batlab2seq.py - seq = Sequence.from_keyword(file=filename, - labels=seg_types, - onsets_s=seg_start_times, - offsets_s=seg_end_times, - onset_inds=seg_start_times_Hz, - offset_inds=seg_end_times_Hz) - seq_list.append(seq) - return seq_list - - If this still feels too wordy and repetitive for you, you can put - ``segFileStartTimes``, ``segFileEndTimes``, et al., into a Python - ``dict`` with ``keys`` corresponding to the parameters for - ``Segment.from_keyword``: - -.. - - .. code:: python - - annot_dict = { - 'file': filename, - 'onsets_s': annotation['segFileStartTimes'].tolist(), - 'offsets_s': annotation['segFileEndTimes'].tolist() - 'labels': seg_types - } - - Note here that you only have to specify the onsets an offsets of - segments *either* in seconds or in Hertz (but you can define - both). - -.. - - and then use another factory function, ``Sequence.from_dict``, to - create the ``Sequence``. - - .. code:: python - - seq_list.append(Sequence.from_dict(annot_dict)) - -Now that you have a function that takes annotation files and return -``Sequence``\ s, call it something like ``batlab2seq`` and put it in a -file that ends with ``.py``, e.g. \ ``batlab2seq.py``. This is also -known as a Python **module** (as you’ll need to know below). To see the -entire example, check out the `batlab2seq.py <./batlab2seq.py>`__ file -in this folder (and compare it with `parsebat.py <./parsebat.py>`__). - -4. 
make a ``Transcriber`` that knows to use this function when you tell it you want to turn your annotation files into ``Sequence``\ s -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If you have worked with ``Crowsetta`` already, or gone through the -tutorial, you know that we can work with a ``Transcriber`` that does the -work of making ``Sequence``\ s of ``Segment``\ s from annotation files -for us. We create a new instance of a ``Transcriber`` by writing -something like this: - -.. code:: python - - scribe = Transcriber() - -You will do the same thing here, but to tell the ``Transcriber`` how to -work with your format, you will pass an argument for the ``user_config`` -parameter when you create a new one: - -.. code:: python - - scribe = Transcriber(user_config=your_config) - -The argument you pass for ``user_config`` will be a Python dictionary -with the following structure: - -.. code:: python - - your_config = { - 'batlab': { - 'module': 'batlab2seq.py', - 'to_seq': 'batlab2seq', - 'to_csv': 'None', - 'to_format': 'None', - } - } - -Notice that this a dictionary of dictionaries, where each ``key`` in the -top-level ``dict`` is the name of a user-defined format, here -``batlab``. If you had multiple formats to use, you would add more -``dict``\ s inside the top-level ``dict``. - -The ``value`` for each ``key`` is another Python dictionary that tells -the ``Transcriber`` what functions to use from your module when you call -one of its methods and specify this format. In the example above, you’re -telling the ``Transcriber`` that when you say ``file_format='batlab'``, -it should use functions from the ``batlab2seq.py`` module. More -specifically, when you call -``scribe.to_seq(file='annotation.mat', file_format='batlab')``, it -should use the ``batlab2seq`` function to convert your annotation into -``Sequence``\ s. 
Notice also that you can specify ``'None'`` for -``to_csv`` and ``to_format`` (which would be a function that converts -``Sequence``\ s back to the ``BatLAB`` format). - -Here’s what it looks like to do all of that in a few lines of code: - -.. code:: ipython3 - - from crowsetta import Transcriber - - your_config = { - 'batlab': { - 'module': 'batlab2seq.py', - 'to_seq': 'batlab2seq', - } - } - - scribe = Transcriber(user_config=your_config) - - seq_list = scribe.to_seq(file='bat1_annotation.mat', file_format='batlab') - -And now, just like you do with the built-in formats, you get back a list -of ``Sequence``\ s from your format: - -.. code:: ipython3 - - print(f'First item in seq_list: {seq_list[0]}') - print(f'First segment in first sequence:\n{seq_list[0].segments[0]}') - - -.. parsed-literal:: - - First item in seq_list: - First segment in first sequence: - Segment(label='1', file='lbr3009_0005_2017_04_27_06_14_46.wav', onset_s=0.0029761904761904934, offset_s=0.14150432900432905, onset_ind=143, offset_ind=6792) - - -Notice that we also get a ``to_csv`` function for free: - -.. code:: ipython3 - - scribe.to_csv(file='bat1_annotation.mat', - csv_filename='test.csv', - file_format='batlab') - - import csv - with open('test.csv', 'r', newline='') as csv_file: - reader = csv.reader(csv_file) - for _ in range(4): - print(next(reader)) - - -.. parsed-literal:: - - ['label', 'onset_s', 'offset_s', 'onset_ind', 'offset_ind', 'file'] - ['1', '0.0029761904761904934', '0.14150432900432905', '143', '6792', 'lbr3009_0005_2017_04_27_06_14_46.wav'] - ['1', '0.279125', '0.504625', '13398', '24222', 'lbr3009_0005_2017_04_27_06_14_46.wav'] - ['5', '0.5556472915365209', '0.5962916666666667', '26671', '28622', 'lbr3009_0005_2017_04_27_06_14_46.wav'] - - -How does that work? Well, as long as we can convert our annotation -format to ``Sequence``\ s, then we can pass those ``Sequence``\ s to the -``crowsetta.csv2seq`` function, which will output them as a ``.csv`` -file. 
The ``Transcriber`` does this by default. Under the hood, when you -make a new ``Transcriber`` with your ``user_config``, it wraps your -``format2seq`` function and the ``seq2csv`` function into one, using the -function ``crowsetta.csv.toseq_func_to_csv``. - -Summary -------- - -Now you have seen in detail the process of working with your own -annotation format in ``Crowsetta``. Here’s a review of the steps, with -some code snippets worked in to tie it all together: - -1. get your annotations into some variables in Python, perhaps using - code you already wrote -2. use one of the ``Sequence`` “factory functions” to conveniently turn - your annotations into ``Sequence``\ s -3. turn all that code into a function that takes annotation files as an - argument, and returns ``Sequence``\ s - -.. - - steps 1-3 will give you something like this in a file named something - like ``myformat.py`` - - .. code:: python - - from Crowsetta import Sequence - - - def myformat2seq(my_format_files): - seq_list = [] - for format_file in my_format_files: - # load annotation into some Python variables, e.g. a dictionary - annot_dict = magic_annotation_unpacking_function(format_file) - seq = Sequence.from_dict(annot_dict) - seq_list.append(seq) - return seq_list - -4. make a ``Transcriber`` that knows to use this function when you tell - it you want to turn your annotation files into ``Sequence``\ s, - and/or csv files, or to convert back to your format from - ``Sequence``\ s (assuming you wrote a function in your module that - will do so). - -.. - - .. 
code:: python - - from Crowsetta import Transcriber - - my_config = { - 'my_format': { - 'module': 'myformat.py', - 'to_seq': 'myformat2seq', - 'to_csv': 'myformat2csv', - 'to_format': 'seq2myformat, - } - } - scribe = Transcriber(user_config=my_config) - seq_list = scribe.to_seq(file='my_annotations.txt', file_format='my_format') diff --git a/doc/howto.md b/doc/howto.md new file mode 100644 index 00000000..bad1de3d --- /dev/null +++ b/doc/howto.md @@ -0,0 +1,11 @@ +(howto)= + +# **How-To** + +This section shows you how to use `crowsetta` for specific tasks. + +```{toctree} +:maxdepth: 2 + +howto-user-format +``` diff --git a/doc/howto.rst b/doc/howto.rst deleted file mode 100644 index 3d0d50d6..00000000 --- a/doc/howto.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. _howto: - -========== -**How-To** -========== - -This section shows you how to use `crowsetta` for specific tasks. - -.. toctree:: - :maxdepth: 2 - - howto-user-format diff --git a/doc/index.md b/doc/index.md new file mode 100644 index 00000000..4d4976d8 --- /dev/null +++ b/doc/index.md @@ -0,0 +1,200 @@ +% Crowsetta documentation master file, created by +% sphinx-quickstart on Sat Dec 22 21:16:45 2018. +% You can adapt this file completely to your liking, but it should at least +% contain the root `toctree` directive. + +# **Crowsetta** + +`crowsetta` is a tool to work with any format for annotating vocalizations, like +birdsong or human speech. **The goal of** `crowsetta` **is to make sure that your +ability to work with a dataset of vocalizations does not depend on your ability to work with +any given format for annotating that dataset.** + +## **Features** + +### **Data types that help you write clean code** + +What `crowsetta` gives you is **not** yet another format for +annotation (I promise!). Instead you get some nice data types that make it easier to +work with any format: namely, `Sequence`s made up of `Segment`s. +The code block below shows some of the features of these data types. 
+ +```python +>>> from crowsetta import Segment, Sequence +>>> a_segment = Segment.from_keyword( +... label='a', +... onset_ind=16000, +... offset_ind=32000, +... file='bird21.wav' +... ) +>>> another_segment = Segment.from_keyword( +... label='b', +... onset_ind=36000, +... offset_ind=48000, +... file='bird21.wav' +... ) +>>> list_of_segments = [a_segment, another_segment] +>>> seq = Sequence.from_segments(segments=list_of_segments) +>>> print(seq) + +>>> for segment in seq.segments: print(segment) +Segment(label='a', file='bird21.wav', onset_s=None, offset_s=None, onset_ind=16000, offset_ind=32000) +Segment(label='b', file='bird21.wav', onset_s=None, offset_s=None, onset_ind=36000, offset_ind=48000) +>>> seq.file +bird21.wav +>>> seq.onset_inds +array([16000, 36000]) +``` + +You load annotation from your format of choice into `Sequence`s of `Segment`s +(most conveniently with the `Transcriber`, as explained below) and then use the +`Sequence`s however you need to in your program. + +For example, if you want to loop through the `Segment`s of each `Sequence` to +pull syllables out of a spectrogram, you can do something like this: + +```python +>>> list_of_sequences = my_sequence_loading_function(file='annotation.txt') +>>> syllables_from_sequences = [] +>>> for a_sequence in list_of_sequences: +... # get name of the audio file associated with the Sequence +... audio_file = a_sequence.file +... # then create a spectrogram from that audio file +... spect = some_spectrogram_making_function(audio_file) +... syllables = [] +... for segment in a_sequence.segments: +... ## spectrogram is a 2d numpy array so we index into using onset and offset from segment +... syllable = spect[:, segment.onset_s:segment.offset_s] +... syllables.append(syllable) +... syllables_from_sequences.append(syllables) +``` + +This code is succinct, compared to the data munging code you usually write when dealing with +audio files and annotation formats. It reads like idiomatic Python. 
+For a deeper dive into why this is useful, see {ref}`background`. + +### **A** `Transcriber` **that makes it convenient to work with any annotation format** + +As mentioned, `crowsetta` provides you with a `Transcriber` that comes equipped +with convenience functions to do the work of loading and saving annotations for you. + +```python +>>> annotation_files = [ +... '~/Data/bird1_day1/song1_2018-12-07_072135.not.mat', +... '~/Data/bird1_day1/song2_2018-12-07_072316.not.mat', +... '~/Data/bird1_day1/song3_2018-12-07_072749.not.mat' +... ] +>>> from crowsetta import Transcriber +>>> scribe = Transcriber() +>>> seq = scribe.to_seq(file=annotation_files, file_format='notmat') +>>> len(seq) +3 +>>> print(seq[0]) + +``` + +### **Easily use the** `Transcriber` **with your own annotation format** + +You can even easily tell the `Transcriber` to use your own in-house format, like so: + +```python +>>> my_config = { +... 'myformat_name': { +... 'module': '/home/MyUserName/Documents/Python/convert_myformat.py', +... 'to_seq': 'myformat2seq', +... 'to_csv': 'myformat2csv' +... } +... } +>>> scribe = crowsetta.Transcriber(user_config=my_config) +>>> seq = scribe.to_seq(file='my_annotation.mat', file_format='myformat_name') +``` + +For more about how that works, please see {ref}`howto-user-format`. + +### **Save and load annotations in plain text files** + +If you need it to, `crowsetta` can save your `Sequence`s of `Segment`s +as a plain text file in the comma-separated values (csv) format. This file format +was chosen because it is widely considered to be a very robust way to share data. 
+ +```python +from crowsetta import Transcriber +scribe = Transcriber(user_config=your_config) +scribe.to_csv(file='your_annotation_file.mat', + csv_filename='your_annotation.csv') +``` + +An example csv looks like this: + +```{literalinclude} ../tests/test_data/csv/gy6or6_032312.csv +:language: none +:lines: 1-5 +``` + +Now that you have that, you can load it into a [pandas] dataframe or an Excel +spreadsheet or an SQL database, or whatever you want. + +You might find this useful in any situation where you want to share audio files of +song and some associated annotations, but you don't want to require the user to +install a large application in order to work with the annotation files. + +### **Getting Started** + +Install `crowsetta` by running: + +```console +$ pip install crowsetta +``` + +If you are new to the library, start with {ref}`tutorial`. + +To see an example of using `crowsetta` to work with your own annotation format, +see {ref}`howto-user-format`. + +## **Table of Contents** + +```{toctree} +:maxdepth: 2 + +tutorial +howto +background +``` + +## **Project Information** + +`crowsetta` was developed for use with the [songdeck] and +[hybrid-vocal-classifier] libraries. + +### Support + +If you are having issues, please let us know. + +- Issue Tracker: <https://github.com/NickleDave/crowsetta/issues> + +### Contribute + +- Issue Tracker: <https://github.com/NickleDave/crowsetta/issues> +- Source Code: <https://github.com/NickleDave/crowsetta> + +### License + +The project is licensed under the +[BSD license](https://github.com/NickleDave/crowsetta/blob/master/LICENSE). + +### CHANGELOG + +You can see project history and work in progress in the +[CHANGELOG](https://github.com/NickleDave/crowsetta/blob/master/doc/CHANGELOG.md). 
+ +### Citation + +If you use `crowsetta`, please cite the DOI: + +```{image} https://zenodo.org/badge/159904494.svg +:target: https://zenodo.org/badge/latestdoi/159904494 +``` + +[hybrid-vocal-classifier]: https://hybrid-vocal-classifier.readthedocs.io/en/latest/ +[pandas]: https://pandas.pydata.org/ +[songdeck]: https://github.com/NickleDave/songdeck diff --git a/doc/index.rst b/doc/index.rst deleted file mode 100644 index daeb4365..00000000 --- a/doc/index.rst +++ /dev/null @@ -1,211 +0,0 @@ -.. Crowsetta documentation master file, created by - sphinx-quickstart on Sat Dec 22 21:16:45 2018. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -============= -**Crowsetta** -============= - -``crowsetta`` is a tool to work with any format for annotating vocalizations, like -birdsong or human speech. **The goal of** ``crowsetta`` **is to make sure that your -ability to work with a dataset of vocalizations does not depend on your ability to work with -any given format for annotating that dataset.** - -**Features** -============ - -**Data types that help you write clean code** ---------------------------------------------- - -What ``crowsetta`` gives you is **not** yet another format for -annotation (I promise!). Instead you get some nice data types that make it easier to -work with any format: namely, ``Sequence``\ s made up of ``Segment``\ s. -The code block below shows some of the features of these data types. - -.. code-block:: python - - >>> from crowsetta import Segment, Sequence - >>> a_segment = Segment.from_keyword( - ... label='a', - ... onset_ind=16000, - ... offset_ind=32000, - ... file='bird21.wav' - ... ) - >>> another_segment = Segment.from_keyword( - ... label='b', - ... onset_ind=36000, - ... offset_ind=48000, - ... file='bird21.wav' - ... 
) - >>> list_of_segments = [a_segment, another_segment] - >>> seq = Sequence.from_segments(segments=list_of_segments) - >>> print(seq) - - >>> for segment in seq.segments: print(segment) - Segment(label='a', file='bird21.wav', onset_s=None, offset_s=None, onset_ind=16000, offset_ind=32000) - Segment(label='b', file='bird21.wav', onset_s=None, offset_s=None, onset_ind=36000, offset_ind=48000) - >>> seq.file - bird21.wav - >>> seq.onset_inds - array([16000, 36000]) - -You load annotation from your format of choice into ``Sequence``\ s of ``Segment``\ s -(most conveniently with the ``Transcriber``, as explained below) and then use the -``Sequence``\ s however you need to in your program. - -For example, if you want to loop through the ``Segment``\ s of each ``Sequence`` to -pull syllables out of a spectrogram, you can do something like this: - -.. code-block:: python - - >>> list_of_sequences = my_sequence_loading_function(file='annotation.txt') - >>> syllables_from_sequences = [] - >>> for a_sequence in list_of_sequences: - ... # get name of the audio file associated with the Sequence - ... audio_file = a_sequence.file - ... # then create a spectrogram from that audio file - ... spect = some_spectrogram_making_function(audio_file) - ... syllables = [] - ... for segment in a_sequence.segments: - ... ## spectrogram is a 2d numpy array so we index into using onset and offset from segment - ... syllable = spect[:, segment.onset_s:segment.offset_s] - ... syllables.append(syllable) - ... syllables_from_sequences.append(syllables) - -This code is succinct, compared to the data munging code you usually write when dealing with -audio files and annotation formats. It reads like idiomatic Python. -For a deeper dive into why this is useful, see :ref:`background`. 
- -**A** ``Transcriber`` **that makes it convenient to work with any annotation format** --------------------------------------------------------------------------------------- - -As mentioned, ``crowsetta`` provides you with a ``Transcriber`` that comes equipped -with convenience functions to do the work of loading and saving annotations for you. - -.. code-block:: python - - >>> annotation_files = [ - ... '~/Data/bird1_day1/song1_2018-12-07_072135.not.mat', - ... '~/Data/bird1_day1/song2_2018-12-07_072316.not.mat', - ... '~/Data/bird1_day1/song3_2018-12-07_072749.not.mat' - ... ] - >>> from crowsetta import Transcriber - >>> scribe = Transcriber() - >>> seq = scribe.to_seq(file=annotation_files, format='notmat') - >>> len(seq) - 3 - >>> print(seq[0]) - - -**Easily use the** ``Transcriber`` **with your own annotation format** ----------------------------------------------------------------------- -You can even easily tell the ``Transcriber`` to use your own in-house format, like so: - -.. code-block:: python - - >>> my_config = { - ... 'myformat_name': { - ... 'module': '/home/MyUserName/Documents/Python/convert_myformat.py' - ... 'to_seq': 'myformat2seq', - ... 'to_csv': 'myformat2csv'} - ... } - ... } - >>> scribe = crowsetta.Transcriber(user_config=my_config) - >>> seq = scribe.toseq(file='my_annotation.mat', file_format='myformat_name') - -For more about how that works, please see :ref:`howto-user-format`. - -**Save and load annotations in plain text files** -------------------------------------------------- -If you need it to, ``crowsetta`` can save your ``Sequence``\ s of ``Segment``\ s -as a plain text file in the comma-separated values (csv) format. This file format -was chosen because it is widely considered to be a very robust way to share data. - -.. 
code-block:: python - - from crowsetta import Transcriber - scribe = Transcriber(user_config=your_config) - scribe.to_csv(file_'your_annotation_file.mat', - csv_filename='your_annotation.csv') - - -An example csv looks like this: - -.. literalinclude:: ../tests/test_data/csv/gy6or6_032312.csv - :lines: 1-5 - :language: none - -Now that you have that, you can load it into a pandas_ dataframe or an Excel -spreadsheet or an SQL database, or whatever you want. - -.. _pandas: https://pandas.pydata.org/ - -You might find this useful in any situation where you want to share audio files of -song and some associated annotations, but you don't want to require the user to -install a large application in order to work with the annotation files. - -**Getting Started** -------------------- -Install ``crowsetta`` by running: - -.. code-block:: console - - $ pip install crowsetta - - -If you are new to the library, start with :ref:`tutorial`. - -To see an example of using ``crowsetta`` to work with your own annotation format, -see :ref:`howto-user-format`. - -**Table of Contents** -===================== - -.. toctree:: - :maxdepth: 2 - - tutorial - howto - background - -**Project Information** -======================= - -``crowsetta`` was developed for use with the songdeck_ and -hybrid-vocal-classifier_ libraries. - -.. _songdeck: https://github.com/NickleDave/songdeck - -.. _hybrid-vocal-classifier: https://hybrid-vocal-classifier.readthedocs.io/en/latest/ - -Support -------- - -If you are having issues, please let us know. - -- Issue Tracker: https://github.com/NickleDave/crowsetta/issues - -Contribute ----------- - -- Issue Tracker: https://github.com/NickleDave/crowsetta/issues -- Source Code: https://github.com/NickleDave/crowsetta - -License -------- - -The project is licensed under the -`BSD license `_. - -CHANGELOG ---------- -You can see project history and work in progress in the -`CHANGELOG `_. - -Citation --------- -If you use ``crowsetta``, please cite the DOI: - -.. 
image:: https://zenodo.org/badge/159904494.svg - :target: https://zenodo.org/badge/latestdoi/159904494 diff --git a/doc/tutorial.rst b/doc/tutorial.md similarity index 56% rename from doc/tutorial.rst rename to doc/tutorial.md index cf34b0bc..dbd1b996 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.md @@ -1,78 +1,73 @@ +(tutorial)= -.. _tutorial: +# **Tutorial** -**Tutorial** -============ - -This tutorial present beginners with an introduction to ``crowsetta``. +This tutorial presents beginners with an introduction to `crowsetta`. You can get the Jupyter notebook for this tutorial by going to -https://github.com/NickleDave/crowsetta and clicking on the big green +<https://github.com/NickleDave/crowsetta> and clicking on the big green “Clone or Download” button on the right side of the screen. You can then -find this notebook and others in the ``crowsetta/notebooks/`` directory. +find this notebook and others in the `crowsetta/notebooks/` directory. -**Finding out what annotation formats are built in to** ``crowsetta`` **and getting some example data to work with** --------------------------------------------------------------------------------------------------------------------- +## **Finding out what annotation formats are built in to** `crowsetta` **and getting some example data to work with** -Since ``crowsetta`` is a tool to working with annotations of +Since `crowsetta` is a tool for working with annotations of vocalizations, we need some audio files containing vocalizations that are annotated. In this case, birdsong. The first thing we need to do to work with any Python library is import it. -.. code:: ipython3 - - import crowsetta +```ipython3 +import crowsetta +``` -Now we can use the ``formats`` function to find out what formats are +Now we can use the `formats` function to find out what formats are built in to Crowsetta. -.. code:: ipython3 - - crowsetta.formats() - - - +```ipython3 +crowsetta.formats() +``` +```{eval-rst} .. 
parsed-literal:: 'Annotation formats built in to Crowsetta: notmat, koumura' +``` You can download small example datasets of the built-in formats with the -``fetch`` function in the ``data`` module, like so: - -.. code:: ipython3 - - crowsetta.data.fetch(format='notmat', destination_path='./data/') +`fetch` function in the `data` module, like so: +```ipython3 +crowsetta.data.fetch(format='notmat', destination_path='./data/') +``` +```{eval-rst} .. parsed-literal:: Downloading https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/13993349/cbinnotmat.tar.gz (8.6 MB) - [........................................] 100.00000 - ( 8.6 MB / 8.6 MB, 3.8 MB/s) + [........................................] 100.00000 - ( 8.6 MB / 8.6 MB, 3.8 MB/s) File saved as ./data/cbin-notmat.tar.gz. - + extracting ./data/cbin-notmat.tar.gz +``` -Here we downloaded some ``.cbin`` audio files. Each ``.cbin`` file has -an associated ``.not.mat`` file that contains the annotation. +Here we downloaded some `.cbin` audio files. Each `.cbin` file has +an associated `.not.mat` file that contains the annotation. -We use the ``glob`` function from the Python standard library to list -those files. (``glob`` gives you the full path to files that match a -string pattern; ``*`` in the string below is a wildcard that will match +We use the `glob` function from the Python standard library to list +those files. (`glob` gives you the full path to files that match a +string pattern; `*` in the string below is a wildcard that will match zero or more characters). -.. code:: ipython3 - - from glob import glob - glob('./data/cbin-notmat/032312/*.cbin*') - - - +```ipython3 +from glob import glob +glob('./data/cbin-notmat/032312/*.cbin*') +``` +```{eval-rst} .. parsed-literal:: ['./data/cbin-notmat/032312/gy6or6_baseline_230312_0819.190.cbin', @@ -97,32 +92,32 @@ zero or more characters). 
'./data/cbin-notmat/032312/gy6or6_baseline_230312_0808.138.cbin.not.mat'] +``` (It doesn’t matter much for our purposes, but … files in the -``.not.mat`` annotation format are produced by a Matlab GUI, +`.not.mat` annotation format are produced by a Matlab GUI, evsonganaly, and are used to annotate audio files produced by a Labview program for running behavioral experiments called EvTAF.) -**Using the** ``Transcriber`` **to load annotation files into a data type we can work with in Python** ------------------------------------------------------------------------------------------------------- +## **Using the** `Transcriber` **to load annotation files into a data type we can work with in Python** -Now we want to use ``crowsetta`` to load the annotations into some +Now we want to use `crowsetta` to load the annotations into some **data type** that makes it easy to get what we want out of audio files. -Python has several data types like a ``list`` or ``dict`` that make it -easy to work with data; the data types that ``crowsetta`` gives us, -``Sequence``\ s and ``Segment``\ s, specifically make it easy to write +Python has several data types like a `list` or `dict` that make it +easy to work with data; the data types that `crowsetta` gives us, +`Sequence`s and `Segment`s, specifically make it easy to write clean code for working with annotation formats for birdsong and other vocalizations. First we need to get all the annotation files in some variable. We use -``glob`` again to do so, this time just getting the ``.not.mat`` files. - -.. code:: ipython3 - - notmats = glob('./data/cbin-notmat/032312/*.not.mat') - for notmat in notmats: print(notmat) +`glob` again to do so, this time just getting the `.not.mat` files. +```ipython3 +notmats = glob('./data/cbin-notmat/032312/*.not.mat') +for notmat in notmats: print(notmat) +``` +```{eval-rst} .. 
parsed-literal:: ./data/cbin-notmat/032312/gy6or6_baseline_230312_0819.190.cbin.not.mat @@ -136,139 +131,142 @@ First we need to get all the annotation files in some variable. We use ./data/cbin-notmat/032312/gy6or6_baseline_230312_0811.159.cbin.not.mat ./data/cbin-notmat/032312/gy6or6_baseline_230312_0808.138.cbin.not.mat +``` Now that we have our annotation files in a variable, we use the -``Transcriber`` to load them. +`Transcriber` to load them. -The ``Transcriber`` is a Python ``class``, and we want to create a new -``instance`` of that class. You don’t have to understand what that +The `Transcriber` is a Python `class`, and we want to create a new +`instance` of that class. You don’t have to understand what that means, but you do have to know that before you can do anything with a -``Transcriber``, you have to call the class, as if it were a function, +`Transcriber`, you have to call the class, as if it were a function, and assign it to some variable, like this: -.. code:: ipython3 - - scribe = crowsetta.Transcriber() - print("scribe is an instance of a", type(scribe)) - +```ipython3 +scribe = crowsetta.Transcriber() +print("scribe is an instance of a", type(scribe)) +``` +```{eval-rst} .. parsed-literal:: scribe is an instance of a +``` -Now we have a ``scribe`` with ``methods`` that we can use on our +Now we have a `scribe` with `methods` that we can use on our annotation files (methods are functions that “belong” to a class). -**Using the** ``to_seq`` **method to load annotation format files into** ``Sequence``\ **s** -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +### **Using the** `to_seq` **method to load annotation format files into** `Sequence`**s** -The ``to_seq`` method loads each file into a ``Sequence``, one of the +The `to_seq` method loads each file into a `Sequence`, one of the data types that helps us work with the annotation. 
We call the method, -passing our list of files as an argument for ``file`` and telling the -``scribe`` our ``file_format``. - -.. code:: ipython3 - - seq = scribe.to_seq(file=notmats, file_format='notmat') +passing our list of files as an argument for `file` and telling the +`scribe` our `file_format`. -For each annotation file, we should have a ``Sequence``. +```ipython3 +seq = scribe.to_seq(file=notmats, file_format='notmat') +``` -.. code:: ipython3 - - print("Number of annotation files: ", len(notmats)) - print("Number of Sequences: ", len(seq)) - if len(notmats) == len(seq): - print("The number of annotation files is equal to number of sequences.") +For each annotation file, we should have a `Sequence`. +```ipython3 +print("Number of annotation files: ", len(notmats)) +print("Number of Sequences: ", len(seq)) +if len(notmats) == len(seq): + print("The number of annotation files is equal to number of sequences.") +``` +```{eval-rst} .. parsed-literal:: Number of annotation files: 10 Number of Sequences: 10 The number of annotation files is equal to number of sequences. +``` -Each ``Sequence`` consists of some number of ``Segment``\ s, i.e., a -part of the sequence defined by an ``onset`` and ``offset`` that has a -``label`` associated with it. - -.. code:: ipython3 - - print("first element of seq: ", seq[0]) - print("\nFirst two Segments of first Sequence:") - for seg in seq[0].segments[0:2]: print(seg) +Each `Sequence` consists of some number of `Segment`s, i.e., a +part of the sequence defined by an `onset` and `offset` that has a +`label` associated with it. +```ipython3 +print("first element of seq: ", seq[0]) +print("\nFirst two Segments of first Sequence:") +for seg in seq[0].segments[0:2]: print(seg) +``` +```{eval-rst} .. 
parsed-literal:: first element of seq: - + First two Segments of first Sequence: Segment(label='i', file='./data/cbin-notmat/032312/gy6or6_baseline_230312_0819.190.cbin', onset_s=0.435, offset_s=0.511, onset_ind=13924, offset_ind=16350) Segment(label='i', file='./data/cbin-notmat/032312/gy6or6_baseline_230312_0819.190.cbin', onset_s=0.583, offset_s=0.662, onset_ind=18670, offset_ind=21184) +``` -**Using** ``crowsetta`` **data types to write clean code** ----------------------------------------------------------- +## **Using** `crowsetta` **data types to write clean code** -Now that we have a ``list`` of ``Sequence``\ s, we can ``iterate`` +Now that we have a `list` of `Sequence`s, we can `iterate` (loop) through it to get at our audio data in a clean, Pythonic way. Let’s say we’re interested in the mean amplitude of each type of syllable in an individual bird’s song. How do we get that data into something in Python we can analyze? One approach would be to create a -Python ``dict`` that maps the name of each syllable type to a list of +Python `dict` that maps the name of each syllable type to a list of the mean amplitudes of every occurrence of that syllable in our dataset. Something like this: -.. code:: python - - syl_amp_dict = { - 'a': [0.01, 0.023, ..., 0.017], - 'b': [0.03, 0.032, ..., 0.291], - ..., - 'j': [0.07, 0.068, ..., 0.71], - } +```python +syl_amp_dict = { + 'a': [0.01, 0.023, ..., 0.017], + 'b': [0.03, 0.032, ..., 0.291], + ..., + 'j': [0.07, 0.068, ..., 0.71], +} +``` So to do that, we need to first figure out the unique types of syllables -that will be the ``keys`` of our dictionary, ``a``, ``b``, …, ``n``. +that will be the `keys` of our dictionary, `a`, `b`, …, `n`. 
-We’ll ``iterate`` over all the ``Sequence``\ s, and then in an inner -loop, we’ll ``iterate`` through all the ``Segment``\ s in that -``Sequence``, using the ``label`` property of the segment to figure out +We’ll `iterate` over all the `Sequence`s, and then in an inner +loop, we’ll `iterate` through all the `Segment`s in that +`Sequence`, using the `label` property of the segment to figure out which syllable type we’re looking at from this bird. -.. code:: ipython3 - - import numpy as np - - all_labels = [] - for sequence in seq: - for segment in sequence.segments: - all_labels.append(segment.label) - - unique_labels = np.unique(all_labels) - - # now we make our dict,. - # with some fancy Pythoning - syl_amp_dict = dict( - zip(unique_labels, - [[] for _ in range(len(unique_labels))]) - ) - - print("syl_amp_dict", syl_amp_dict) +```ipython3 +import numpy as np +all_labels = [] +for sequence in seq: + for segment in sequence.segments: + all_labels.append(segment.label) +unique_labels = np.unique(all_labels) + +# now we make our dict, +# with some fancy Pythoning +syl_amp_dict = dict( + zip(unique_labels, + [[] for _ in range(len(unique_labels))]) +) + +print("syl_amp_dict", syl_amp_dict) +``` + +```{eval-rst} .. parsed-literal:: syl_amp_dict {'a': [], 'b': [], 'c': [], 'd': [], 'e': [], 'f': [], 'g': [], 'h': [], 'i': [], 'j': [], 'k': []} +``` (There are more concise ways to do that, but doing it the way we did let -us clearly see iterating through the ``Segment``\ s and -``Sequence``\ s.) +us clearly see iterating through the `Segment`s and +`Sequence`s.) Now we want to get the amplitude for each syllable. We’ll take the amplitude from the audio waveform (instead of, say, making a spectrogram @@ -276,25 +274,25 @@ out of it and then getting an amplitude measure by summing power of every time bin in the spectrogram). 
Since the audio signal might be a bit noisy, we’ll use a function, -``smooth_data`` (from the -```evfuncs`` `__ library) that +`smooth_data` (from the +[`evfuncs`](https://github.com/NickleDave/evfuncs) library) that takes the raw audio from a file, applies a bandpass filter, rectifies the signal, and then smooths it with a sliding window. -.. code:: ipython3 - - import evfuncs - help(evfuncs.smooth_data) - +```ipython3 +import evfuncs +help(evfuncs.smooth_data) +``` +```{eval-rst} .. parsed-literal:: Help on function smooth_data in module evfuncs.evfuncs: - + smooth_data(rawsong, samp_freq, freq_cutoffs=(500, 10000), smooth_win=2) filter raw audio and smooth signal used to calculate amplitude. - + Parameters ---------- rawsong : ndarray @@ -307,44 +305,45 @@ the signal, and then smooths it with a sliding window. If None, in which case bandpass filter is not applied. smooth_win : integer size of smoothing window in milliseconds. Default is 2. - + Returns ------- smooth : ndarray 1-d numpy array, smoothed waveform - + Applies a bandpass filter with the frequency cutoffs in spect_params, then rectifies the signal by squaring, and lastly smooths by taking the average within a window of size sm_win. This is a very literal translation from the Matlab function SmoothData.m by Evren Tumer. Uses the Thomas-Santana algorithm. - - - -.. code:: ipython3 - - for sequence in seq: - cbin = sequence.file - raw_audio, samp_freq = evfuncs.load_cbin(cbin) - smoothed = evfuncs.smooth_data(raw_audio, samp_freq, - freq_cutoffs=(500, 10000)) - for segment in sequence.segments: - smoothed_seg = smoothed[segment.onset_ind:segment.offset_ind] - mean_seg_amp = np.mean(smoothed_seg) - syl_amp_dict[segment.label].append(mean_seg_amp) - - mean_syl_amp_dict = {} - for syl_label, mean_syl_amps_list in syl_amp_dict.items(): - # get mean of means - mean_syl_amp_dict[syl_label] = np.mean(mean_syl_amps_list) - -.. 
code:: ipython3 - - for syl_label, mean_syl_amp in mean_syl_amp_dict.items(): - print(f'mean of mean amplitude for syllable {syl_label}:', - mean_syl_amp) +``` + +```ipython3 +for sequence in seq: + cbin = sequence.file + raw_audio, samp_freq = evfuncs.load_cbin(cbin) + smoothed = evfuncs.smooth_data(raw_audio, samp_freq, + freq_cutoffs=(500, 10000)) + for segment in sequence.segments: + smoothed_seg = smoothed[segment.onset_ind:segment.offset_ind] + mean_seg_amp = np.mean(smoothed_seg) + syl_amp_dict[segment.label].append(mean_seg_amp) + +mean_syl_amp_dict = {} +for syl_label, mean_syl_amps_list in syl_amp_dict.items(): + # get mean of means + mean_syl_amp_dict[syl_label] = np.mean(mean_syl_amps_list) +``` + +```ipython3 +for syl_label, mean_syl_amp in mean_syl_amp_dict.items(): + print(f'mean of mean amplitude for syllable {syl_label}:', + mean_syl_amp) +``` + +```{eval-rst} .. parsed-literal:: mean of mean amplitude for syllable a: 208207.1240286356 @@ -359,6 +358,7 @@ the signal, and then smooths it with a sliding window. mean of mean amplitude for syllable j: 3005979.1576137305 mean of mean amplitude for syllable k: 170753.7788673711 +``` -Okay, now you’ve seen the basics of working with ``crowsetta``. Get out +Okay, now you’ve seen the basics of working with `crowsetta`. Get out there and analyze some vocalizations! 
diff --git a/pyproject.toml b/pyproject.toml index f48fd6c1..7b409e3d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,8 +38,12 @@ test = [ "pytest-cov >=2.12.0" ] doc = [ + "myst-parser >=0.17.0", "Sphinx>=3.4.3", - "jupyterlab>=3.0.3" + "jupyterlab>=3.0.3", + "sphinxext-opengraph >=0.5.1", + "sphinx-copybutton >=0.4.0", + "sphinx-autobuild >= 2021.3.14", ] dev = [ 'flit', From b8d47bb48b7a8209778650098d2932c43d395180 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 29 Mar 2022 21:00:07 -0400 Subject: [PATCH 2/2] DOC: add .readthedocs.yaml --- .readthedocs.yaml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .readthedocs.yaml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..923e931c --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,22 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +build: + os: ubuntu-20.04 + apt_packages: + - libsndfile1 + tools: + python: "3.8" + +sphinx: + configuration: doc/conf.py + +python: + install: + - method: pip + path: . + extra_requirements: + - doc