Initial commit

hitachi-speech · Sep 12, 2019 · 94b316a · 94b316a
commit 94b316a
Show file tree

Hide file tree

Showing 72 changed files with 4,911 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,115 @@
+egs/**/data/
+egs/**/exp/
+tools/kaldi
+tools/miniconda3.sh
+tools/miniconda3/
+tools/sctk
+tools/sctk-2.4.10-20151007-1312Z.tar.bz2
+tools/sctk-2.4.10/
+tools/env.sh
+.nfs*
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Hitachi, Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,81 @@
+# EEND (End-to-End Neural Diarization)
+
+EEND (End-to-End Neural Diarization) is a neural-network-based speaker diarization method.
+
+## Install tools
+### Requirements
+ - NVIDIA CUDA GPU
+ - CUDA Toolkit (8.0 <= version <= 10.1)
+
+### Install kaldi and python environment
+```bash
+cd tools
+make
+```
+- This command builds kaldi at `tools/kaldi`
+  - if you want to use pre-built kaldi
+    ```bash
+    cd tools
+    make KALDI=<existing_kaldi_root>
+    ```
+    This option makes a symlink at `tools/kaldi`
+- This command extracts miniconda3 at `tools/miniconda3`, and creates conda environment named 'eend'
+- Then, installs Chainer and cupy into 'eend' environment
+  - use CUDA in `/usr/local/cuda/`
+    - if you need to specify your CUDA path
+      ```bash
+      cd tools
+      make CUDA_PATH=/your/path/to/cuda-8.0
+      ```
+      This command installs cupy-cudaXX according to your CUDA version.
+      See https://docs-cupy.chainer.org/en/stable/install.html#install-cupy
+
+## Test recipe (mini_librispeech)
+### Configuration
+- Modify `egs/mini_librispeech/v1/cmd.sh` according to your job scheduler.
+If you use your local machine, use "run.pl".
+If you use Grid Engine, use "queue.pl".
+If you use SLURM, use "slurm.pl".
+For more information about cmd.sh see http://kaldi-asr.org/doc/queue.html.
+### Data preparation
+```bash
+cd egs/mini_librispeech/v1
+./run_prepare_shared.sh
+```
+### Run training, inference, and scoring
+```bash
+./run.sh
+```
+- See `RESULT.md` and compare with your result.
+
+## CALLHOME two-speaker experiment
+### Configuration
+- Modify `egs/callhome/v1/cmd.sh` according to your job scheduler.
+If you use your local machine, use "run.pl".
+If you use Grid Engine, use "queue.pl".
+If you use SLURM, use "slurm.pl".
+For more information about cmd.sh see http://kaldi-asr.org/doc/queue.html.
+- Modify `egs/callhome/v1/run_prepare_shared.sh` according to the storage paths of your corpora.
+
+### Data preparation
+```bash
+cd egs/callhome/v1
+./run_prepare_shared.sh
+```
+### Self-attention-based model (latest configuration)
+```bash
+./run.sh
+```
+### BLSTM-based model (old configuration)
+```bash
+local/run_blstm.sh
+```
+## Citation
+```
+@inproceedings{Fujita2019Interspeech,
+ author={Yusuke Fujita and Naoyuki Kanda and Shota Horiguchi and Kenji Nagamatsu and Shinji Watanabe},
+ title={End-to-end Neural Speaker Diarization with Permutation-free Objectives},
+ booktitle={Interspeech},
+ year=2019
+}
+```
diff --git a/eend/bin/infer.py b/eend/bin/infer.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+#
+# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
+# Licensed under the MIT license.
+# Command-line entry point for decoding: parse the options, then call the selected backend's infer().
+import yamlargparse
+from eend import system_info
+
+parser = yamlargparse.ArgumentParser(description='decoding')
+parser.add_argument('-c', '--config', help='config file path',
+                    action=yamlargparse.ActionConfigFile)  # presumably lets option values come from a config file -- confirm in yamlargparse docs
+parser.add_argument('data_dir',
+                    help='kaldi-style data dir')
+parser.add_argument('model_file',
+                    help='best.nnet')
+parser.add_argument('out_dir',
+                    help='output directory.')
+parser.add_argument('--backend', default='chainer',
+                    choices=['chainer', 'pytorch'],  # only 'chainer' is implemented by the dispatch at the bottom
+                    help='backend framework')
+parser.add_argument('--model_type', default='LSTM', type=str)  # NOTE(review): underscore spelling, unlike the hyphenated options below
+parser.add_argument('--gpu', type=int, default=-1)  # presumably a negative id means CPU -- confirm in the backend
+parser.add_argument('--num-speakers', type=int, default=4)
+parser.add_argument('--hidden-size', default=256, type=int,
+                    help='number of lstm output nodes')
+parser.add_argument('--num-lstm-layers', default=1, type=int,
+                    help='number of lstm layers')
+parser.add_argument('--input-transform', default='',
+                    choices=['', 'log', 'logmel',
+                             'logmel23', 'logmel23_swn', 'logmel23_mn'],
+                    help='input transform')
+parser.add_argument('--embedding-size', default=256, type=int)
+parser.add_argument('--embedding-layers', default=2, type=int)
+parser.add_argument('--chunk-size', default=2000, type=int,
+                    help='input is chunked with this size')
+parser.add_argument('--context-size', default=0, type=int,
+                    help='frame splicing')
+parser.add_argument('--subsampling', default=1, type=int)
+parser.add_argument('--sampling-rate', default=16000, type=int,
+                    help='sampling rate')
+parser.add_argument('--frame-size', default=1024, type=int,
+                    help='frame size')
+parser.add_argument('--frame-shift', default=256, type=int,
+                    help='frame shift')
+parser.add_argument('--transformer-encoder-n-heads', default=4, type=int)
+parser.add_argument('--transformer-encoder-n-layers', default=2, type=int)
+parser.add_argument('--save-attention-weight', default=0, type=int)  # int-valued switch; presumably non-zero enables saving -- confirm in the backend
+args = parser.parse_args()
+
+system_info.print_system_info()  # helper from eend.system_info; its output is not visible from this file
+print(args)  # echo the parsed options
+if args.backend == 'chainer':
+    from eend.chainer_backend.infer import infer  # imported inside the branch, so it runs only when this backend is selected
+    infer(args)
+elif args.backend == 'pytorch':
+    # TODO: enable the PyTorch backend using the commented-out call below
+    # from eend.pytorch_backend.infer import infer
+    # infer(args)
+    raise NotImplementedError()  # 'pytorch' is accepted by the parser but has no implementation yet
+else:
+    raise ValueError()  # defensive fallback; should be unreachable given the --backend choices