#!/bin/bash
#
# Dot2BNTransDic: turn a "dot" transcription file into a sorted transcription
# file whose lines start with an id followed by the remaining text.
#
# Usage: Dot2BNTransDic <Dot> <BNTransDic> <TransRoot>
#   Dot         Input file. On each line the parenthesised group "( ... )" is
#               located; the last whitespace-separated field of the line (after
#               dropping the parentheses) becomes the id, and the line with the
#               group removed becomes the text.
#   BNTransDic  Output file. Also used as the prefix for three intermediate
#               files (<BNTransDic>.Ids, .Txt, .Trans1) that are deleted on exit.
#   TransRoot   Directory containing KaldiFun/normalize_transcript.pl.
#
# Exits non-zero if arguments are missing, the input is unreadable, or any
# stage of any pipeline fails.

# pipefail matters here: without it a failing normalize_transcript.pl would be
# hidden by the exit status of the trailing `sort`.
set -euo pipefail

if [[ $# -lt 3 || -z "$1" || -z "$2" || -z "$3" ]]; then
  printf 'Usage: %s <Dot> <BNTransDic> <TransRoot>\n' "${0##*/}" >&2
  exit 1
fi

Dot=$1        # Input
BNTransDic=$2 # Output
TransRoot=$3

if [[ ! -r "$Dot" ]]; then
  printf '%s: cannot read input file: %s\n' "${0##*/}" "$Dot" >&2
  exit 1
fi

# Intermediate files, named after the output file.
Ids=$BNTransDic.Ids
Txt=$BNTransDic.Txt
Trans1=$BNTransDic.Trans1

# Remove the intermediates on every exit path (success or failure).
cleanup() {
  rm -rf -- "$Ids" "$Txt" "$Trans1"
}
trap cleanup EXIT

# Start from a clean slate, including any stale output from a previous run.
rm -rf -- "$Ids" "$Txt" "$Trans1" "$BNTransDic"

# Make a transcription from the dot file.
# Ids: strip the parentheses, then keep the last field of each line.
# (A previous variant appended "_REAL" to each id:
#   sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}' )
sed -e 's/(\(.*\))/\1/' < "$Dot" | awk '{print $NF ""}' > "$Ids"
# Txt: the same lines with the whole parenthesised group removed.
sed -e 's/(.*)//' < "$Dot" > "$Txt"
# Join id and text line by line with a single space, then sort.
paste -d" " -- "$Ids" "$Txt" | sort -k 1 > "$Trans1"

# Do some basic normalization steps.  At this point we don't remove OOVs --
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>"

# NOTE(review): interactive pause left in by the original author as a reminder
# about the location of normalize_transcript.pl. It is kept as-is; when stdin
# is at EOF the `read` fails and `set -e` aborts the script here.
echo "ERROR in Dot2BNTransDic: Juan, normalize_transcript.pl has to be found in local of TrainGen. Press a key"
read -r _

"$TransRoot/KaldiFun/normalize_transcript.pl" "$noiseword" < "$Trans1" \
  | sort > "$BNTransDic" || exit 1

# Intermediate files are removed by the EXIT trap.
echo "Written: $BNTransDic"

