From d8927c99e758550ee53cfd807ac715d527551906 Mon Sep 17 00:00:00 2001
From: Ke Li
Date: Thu, 28 Feb 2019 12:22:43 -0500
Subject: [PATCH] add SEAME (Mandarin-English code-switched corpus) kaldi recipes

---
 egs/seame/s5/cmd.sh                            |  15 ++
 egs/seame/s5/conf                              |   1 +
 egs/seame/s5/local/chain/run_chain_common.sh   |  82 +++++++
 .../s5/local/chain/tuning/run_tdnn_1a.sh       | 203 ++++++++++++++++
 egs/seame/s5/local/rnnlm/run_tdnn_lstm_1a.sh   | 158 ++++++++++++
 egs/seame/s5/local/train_lms_srilm.sh          | 224 ++++++++++++++++++
 egs/seame/s5/path.sh                           |   6 +
 egs/seame/s5/rnnlm                             |   1 +
 egs/seame/s5/run.sh                            | 183 ++++++++++++++
 egs/seame/s5/steps                             |   1 +
 egs/seame/s5/utils                             |   1 +
 11 files changed, 875 insertions(+)
 create mode 100644 egs/seame/s5/cmd.sh
 create mode 120000 egs/seame/s5/conf
 create mode 100755 egs/seame/s5/local/chain/run_chain_common.sh
 create mode 100755 egs/seame/s5/local/chain/tuning/run_tdnn_1a.sh
 create mode 100755 egs/seame/s5/local/rnnlm/run_tdnn_lstm_1a.sh
 create mode 100755 egs/seame/s5/local/train_lms_srilm.sh
 create mode 100755 egs/seame/s5/path.sh
 create mode 120000 egs/seame/s5/rnnlm
 create mode 100755 egs/seame/s5/run.sh
 create mode 120000 egs/seame/s5/steps
 create mode 120000 egs/seame/s5/utils

diff --git a/egs/seame/s5/cmd.sh b/egs/seame/s5/cmd.sh
new file mode 100644
index 00000000000..ea341c98d4a
--- /dev/null
+++ b/egs/seame/s5/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="retry.pl queue.pl --mem 2G"
+export decode_cmd="retry.pl queue.pl --mem 4G"
+export mkgraph_cmd="retry.pl queue.pl --mem 8G"

diff --git a/egs/seame/s5/conf b/egs/seame/s5/conf
new file mode 120000
index 00000000000..9de386c9300
--- /dev/null
+++ b/egs/seame/s5/conf
@@ -0,0 +1 @@
+../../wsj/s5/conf
\ No newline at end of file
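If you have no grid, queue.pl in the cmd.sh above can simply be swapped for run.pl;
a minimal local-machine sketch (the job cap is an assumption -- tune it to your RAM):

  export train_cmd="run.pl --max-jobs-run 8"
  export decode_cmd="run.pl --max-jobs-run 8"
  export mkgraph_cmd="run.pl"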
diff --git a/egs/seame/s5/local/chain/run_chain_common.sh b/egs/seame/s5/local/chain/run_chain_common.sh
new file mode 100755
index 00000000000..da37e148441
--- /dev/null
+++ b/egs/seame/s5/local/chain/run_chain_common.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# this script has common stages shared across librispeech chain recipes.
+# It generates a new topology in a new lang directory, gets the alignments as
+# lattices, and builds a tree for the new topology.
+set -e
+
+stage=11
+
+# input directory names. These options are actually compulsory, and they have
+# been named for convenience.
+gmm_dir=
+ali_dir=
+lores_train_data_dir=
+
+num_leaves=6000
+
+# output directory names. They are also compulsory.
+lang=
+lat_dir=
+tree_dir=
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+[ -z $lang ] && echo "Set --lang; this specifies the new lang directory which will have the new topology" && exit 1;
+[ -z $lat_dir ] && echo "Set --lat-dir; this specifies the experiment directory to store lattices" && exit 1;
+[ -z $tree_dir ] && echo "Set --tree-dir; this specifies the directory to store the new tree" && exit 1;
+
+for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 11 ]; then
+  echo "$0: creating lang directory with one state per phone."
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on we may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 12 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat ${ali_dir}/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \
+    $lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz  # save space
+fi
+
+if [ $stage -le 13 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.
+  if [ -f $tree_dir/final.mdl ]; then
+    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+    exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir
+fi
+
+exit 0;
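Since all of the options above are compulsory, a typical invocation from the chain
tuning script spells each of them out; a sketch (the directory names here are
illustrative assumptions, not taken from this patch):

  local/chain/run_chain_common.sh \
    --gmm-dir exp/tri3 \
    --ali-dir exp/tri3_ali_train_sp \
    --lores-train-data-dir data/train_sp \
    --num-leaves 6000 \
    --lang data/lang_chain \
    --lat-dir exp/chain/tri3_train_sp_lats \
    --tree-dir exp/chain/tree_sp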
diff --git a/egs/seame/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/seame/s5/local/chain/tuning/run_tdnn_1a.sh
new file mode 100755
index 00000000000..42778ac679c
--- /dev/null
+++ b/egs/seame/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -0,0 +1,203 @@
+#!/bin/bash
+
+# 1a matches the model and parameters in the wsj 1g recipe.
+
+set -e -o pipefail
+
+# configs for 'chain'
+stage=0
+decode_nj=50
+train_set=train
+gmm=tri3  # this is the source gmm-dir that we'll use for alignments; it should
+          # have alignments for the specified training data.
+nnet3_affix=  # affix for exp dirs, e.g. it was _cleaned in librispeech
+
+# Options which are not passed through to run_ivector_common.sh
+affix=1a  # affix for TDNN directory, e.g. "1a" or "1b", in case we change the configuration.
+tree_affix=
+train_stage=-10
+get_egs_stage=-10
+
+# TDNN options
+frames_per_eg=140,100,160
+remove_egs=true
+srand=0
+common_egs_dir=
+xent_regularize=0.1
+dropout_schedule='0,0@0.20,0.5@0.50,0'
+num_epochs=5
+
+# decode options
+test_online_decoding=false  # if true, it will run the last decoding stage.
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+if [ $stage -le 14 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
+  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+  affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true"
+  tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.75"
+  linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.01"
+  output_opts="l2-regularize=0.005"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have the input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer, to enable
+  # the use of the short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1024
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3
+  linear-component name=prefinal-l dim=192 $linear_opts
+
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
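Each tdnnf-layer with time-stride t adds t frames of context on each side, so the
stack above sees roughly +/-28 input frames: 1 (lda splice) + 3x1 (tdnnf2-4) +
8x3 (tdnnf6-13) = 28. After training, the compiled context can be read back as a
sanity check -- a sketch, using this script's $dir:

  nnet3-am-info $dir/final.mdl | grep -E 'left-context|right-context'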
+
+if [ $stage -le 15 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+      /export/b{09,10,11,12}/$USER/kaldi-data/egs/seame-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir $train_ivector_dir \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.0 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.srand=$srand \
+    --trainer.max-param-change 2.0 \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --trainer.num-chunk-per-minibatch 128,64 \
+    --trainer.frames-per-iter 500000 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.num-jobs-initial 2 \
+    --trainer.optimization.num-jobs-final 8 \
+    --trainer.optimization.initial-effective-lrate 0.0005 \
+    --trainer.optimization.final-effective-lrate 0.00005 \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir $train_data_dir \
+    --tree-dir $tree_dir \
+    --lat-dir $lat_dir \
+    --dir $dir || exit 1;
+
+fi
+
+graph_dir=$dir/graph
+if [ $stage -le 16 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test $dir $graph_dir
+  # remove <unk> from the graph, and convert back to const-FST.
+  fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \
+    fstconvert --fst_type=const > $graph_dir/temp.fst
+  mv $graph_dir/temp.fst $graph_dir/HCLG.fst
+fi
+
+if [ $stage -le 17 ]; then
+  frames_per_chunk=$(echo $frames_per_eg | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+  for decode_set in dev_man dev_sge; do
+    (
+      nspk=$(wc -l <data/${decode_set}_hires/spk2utt)

diff --git a/egs/seame/s5/local/rnnlm/run_tdnn_lstm_1a.sh b/egs/seame/s5/local/rnnlm/run_tdnn_lstm_1a.sh
new file mode 100755
+if [ $stage -le 0 ]; then
+  mkdir -p $text_dir
+  echo -n >$text_dir/dev.txt
+  # hold out one in every 50 lines as dev data.
+  cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%50 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/seame.txt
+fi
+
+if [ $stage -le 1 ]; then
+  cp $lexicon $dir/config/
+  n=`cat $dir/config/words.txt | wc -l`
+  echo "<brk> $n" >> $dir/config/words.txt
+
+  # words that are not present in words.txt but are in the training or dev data
+  # will be mapped to <unk> during training.
+  echo "<unk>" >$dir/config/oov.txt
+
+  cat > $dir/config/data_weights.txt <<EOF
+seame  1  1.0
+EOF
+
+  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
+                             --unk-word="<unk>" \
+                             --data-weights-file=$dir/config/data_weights.txt \
+                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt
+
+  # choose features
+  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
+                           --top-word-features=3000 \
+                           --use-constant-feature=true \
+                           --special-words='<s>,</s>,<brk>,<unk>,<noise>,<v-noise>' \
+                           $dir/config/words.txt > $dir/config/features.txt
+
+  cat >$dir/config/xconfig <<EOF
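The SRILM wrapper that follows is the one run.sh invokes at stage 2, i.e.
(with the OOV symbol assumed to be <unk>, consistently with the rest of this patch):

  local/train_lms_srilm.sh --oov-symbol "<unk>" \
    --words-file data/lang_nosp/words.txt data data/lm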
" + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 +outlm=lm.gz + + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=`pwd`/../../../tools/srilm/bin/i686-m64 + else + sdir=`pwd`/../../../tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! -s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + [ -z "$train_text" ] && train_text=$datadir/train/text + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +fi + + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +cat $train_text | cut -f2- -d' ' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +cat $dev_text | cut -f2- -d' ' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo 
+
+echo "-------------------"
+echo "Good-Turing 2grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/2gram.gt01.gz -gt1min 0 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/2gram.gt02.gz -gt1min 0 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Kneser-Ney 2grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/2gram.kn01.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/2gram.kn02.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Good-Turing 3grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Kneser-Ney 3grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+
+echo "-------------------"
+echo "Good-Turing 4grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/4gram.gt0111.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0112.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0122.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0123.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0113.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0222.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.gt0223.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Kneser-Ney 4grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/4gram.kn0111.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0112.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0113.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0122.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0123.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0222.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0223.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+if [ ! -z ${LIBLBFGS} ]; then
+  # please note that if the switch -map-unk "$oov_symbol" is used with
+  # -maxent-convert-to-arpa, ngram-count will segfault. Instead of that, we
+  # simply output the model in the maxent format and convert it using "ngram".
+  echo "-------------------"
+  echo "Maxent 2grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 2 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa | \
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/2gram.me.gz || exit 1
+
+  echo "-------------------"
+  echo "Maxent 3grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa | \
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1
+
+  echo "-------------------"
+  echo "Maxent 4grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa | \
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1
+
+fi
+
+
+echo "--------------------"
+echo "Computing perplexity"
+echo "--------------------"
+(
+  for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
+  for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
+) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt
+
+echo "The perplexity scores report is stored in $tgtdir/perplexities.txt"
+
+# This will link the lowest perplexity LM as the output LM.
+# ln -sf $tgtdir/`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` $outlm
+
+# A slight modification of the previous approach:
+# We look at the two lowest-perplexity LMs and use the 3gram LM if it is one of
+# those two, even if the 4gram has lower perplexity.
+nof_trigram_lm=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | wc -l`
+if [[ $nof_trigram_lm -eq 0 ]] ; then
+  lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '`
+elif [[ $nof_trigram_lm -eq 2 ]] ; then
+  lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '`
+else  # exactly one 3gram LM
+  lmfilename=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | cut -f 1 -d ' '`
+fi
+(cd $tgtdir; ln -sf `basename $lmfilename` $outlm )
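Each line of perplexities.txt is the LM file name pasted together with SRILM's
two-line -ppl report, which puts the perplexity in field 15 -- the field the sort
key -k 15,15g orders on (with GNU sort, the key-specific g flag overrides the global
-r -n for that key, so the list is ascending and head -n 1 picks the best model).
A line looks roughly like this (numbers invented for illustration):

  data/lm/3gram.kn022.gz file data/lm/dev.txt: 1000 sentences, 12000 words, 0 OOVs 0 zeroprobs, logprob= -25000 ppl= 83.1 ppl1= 95.2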
diff --git a/egs/seame/s5/path.sh b/egs/seame/s5/path.sh
new file mode 100755
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/seame/s5/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C

diff --git a/egs/seame/s5/rnnlm b/egs/seame/s5/rnnlm
new file mode 120000
index 00000000000..e136939ba72
--- /dev/null
+++ b/egs/seame/s5/rnnlm
@@ -0,0 +1 @@
+../../../scripts/rnnlm/
\ No newline at end of file

diff --git a/egs/seame/s5/run.sh b/egs/seame/s5/run.sh
new file mode 100755
index 00000000000..935edf80a64
--- /dev/null
+++ b/egs/seame/s5/run.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+
+# Copyright 2017-2018  Johns Hopkins University (author: Daniel Povey)
+#                2018  Ke Li
+
+# Begin configuration section.
+# initialize path and commands
+. ./path.sh
+. ./cmd.sh
+
+nj=9  # number of parallel jobs
+stage=0
+
+set -e
+
+[ ! -L "steps" ] && ln -s ../../wsj/s5/steps
+[ ! -L "utils" ] && ln -s ../../wsj/s5/utils
+[ ! -L "conf" ] && ln -s ../../wsj/s5/conf
+
+. utils/parse_options.sh  # accept options
+
+AUDIO=(
+  /export/corpora5/LDC/LDC2015S04/data/conversation/audio/
+  /export/corpora5/LDC/LDC2015S04/data/interview/audio/
+)
+TEXT=(
+  /export/corpora5/LDC/LDC2015S04/data/conversation/transcript/phaseII
+  /export/corpora5/LDC/LDC2015S04/data/interview/transcript/phaseII
+)
+
+# TODO Data preparation
+if [ $stage -le 0 ]; then
+  # transcript lines look like: 01NC01FBX_0101 start_time end_time LID utt_trans,
+  # where 01NC01FBX is the speaker id and 0101 is part 01.
+  local/prepare_text_data.sh $TEXT
+  local/prepare_audio_data.sh $AUDIO
+fi
+
+if [ $stage -le 1 ]; then
+  # TODO Prepare dictionary
+  # local/prepare_dict.sh $corpus
+  utils/validate_dict_dir.pl data/local/dict_nosp
+  utils/prepare_lang.sh data/local/dict_nosp \
+    "<unk>" data/local/lang_nosp data/lang_nosp
+  utils/validate_lang.pl data/lang_nosp
+fi
+
+# Prepare language models
+if [ $stage -le 2 ]; then
+  local/train_lms_srilm.sh --oov-symbol "<unk>" --words-file \
+    data/lang_nosp/words.txt data data/lm
+  utils/format_lm.sh data/lang_nosp data/lm/lm.gz \
+    data/local/dict_nosp/lexiconp.txt data/lang_nosp_test
+  utils/validate_lang.pl data/lang_nosp_test
+fi
+
+# Feature extraction
+if [ $stage -le 3 ]; then
+  for x in train dev_man dev_sge; do
+    dir=data/$x
+    utils/fix_data_dir.sh $dir
+    steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" $dir
+    utils/fix_data_dir.sh $dir
+    steps/compute_cmvn_stats.sh $dir
+    utils/fix_data_dir.sh $dir
+    utils/validate_data_dir.sh $dir
+  done
+fi
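A quick feature sanity check at this point can save grief later; a minimal sketch
using a standard Kaldi binary (13-dimensional MFCCs are expected, since conf/ points
at the wsj configuration):

  feat-to-dim scp:data/train/feats.scp -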
+
+if [ $stage -le 4 ]; then
+  # Make some small data subsets for early system-build stages. For the
+  # monophone stages we select the shortest utterances, which should make it
+  # easier to align the data from a flat start.
+  utils/subset_data_dir.sh --shortest data/train 5000 data/train_short
+fi
+
+if [ $stage -le 5 ]; then
+  # train a monophone system
+  steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
+    data/train_short data/lang_nosp_test exp/mono
+  # decode using the monophone model
+  (
+    utils/mkgraph.sh data/lang_nosp_test exp/mono exp/mono/graph_nosp
+    for test in dev_man dev_sge; do
+      steps/decode.sh --nj $nj --cmd "$decode_cmd" exp/mono/graph_nosp \
+        data/$test exp/mono/decode_nosp_$test
+    done
+  )&
+fi
+
+if [ $stage -le 6 ]; then
+  steps/align_si.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
+    data/train data/lang_nosp exp/mono exp/mono_ali || exit 1;
+  # train a first delta + delta-delta triphone system on all utterances
+  steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
+    2000 10000 data/train data/lang_nosp_test exp/mono_ali exp/tri1
+
+  # decode using the tri1 model
+  (
+    utils/mkgraph.sh data/lang_nosp_test exp/tri1 exp/tri1/graph_nosp
+    for test in dev_man dev_sge; do
+      nspk=$(wc -l <data/$test/spk2utt)
+
+  utils/prepare_lang.sh data/local/dict \
+    "<unk>" data/local/lang data/lang
+
+  utils/format_lm.sh data/lang data/lm/lm.gz \
+    data/local/dict/lexiconp.txt data/lang_test
+fi
+
+if [ $stage -le 10 ]; then
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+    data/train data/lang_test exp/tri3 exp/tri3_ali
+
+  # decode using the tri3 model with pronunciation and silence probabilities
+  (
+    utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph
+    for test in dev_man dev_sge; do
+      steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" exp/tri3/graph \
+        data/$test exp/tri3/decode_$test
+    done
+  )&
+fi
+
+# After run.sh is finished, run the following:
+# ./local/chain/tuning/run_tdnn_1a.sh
+# ./local/rnnlm/run_tdnn_lstm_1a.sh
+exit 0;

diff --git a/egs/seame/s5/steps b/egs/seame/s5/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/seame/s5/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file

diff --git a/egs/seame/s5/utils b/egs/seame/s5/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/seame/s5/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file
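Once run.sh and the chain/RNNLM scripts have finished, results can be tabulated by
picking the best WER per decode directory -- a sketch, assuming the decode-directory
naming used above (utils/best_wer.sh is a standard Kaldi helper):

  for d in exp/*/decode* exp/chain/*/decode*; do
    [ -d $d ] && grep WER $d/wer_* 2>/dev/null | utils/best_wer.sh
  done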