%% Synopsis %%%%%%%%%%%

  ./perma.pl -task apply|train|find -i INPUTFILE -cost FLOAT|COSTFILE \
        -vl <0> -ij <2> -ml INT -md <4> -perm <1> -ct <lev>|prod \
        -default <1>|<0> -nrm <0>|1 -op <sum>|prod -optim <min>|max \
        -zs FLOAT -max_s INT -al <w>|v|vw -displc <0>|1 \
        [-bnd boundarySymbol] [-joker JOKER] [-infind INPUTFILE2] \
        [-mc FLOAT]

  ./perma.pl -task tapply -i INPUTFILE1+INPUTFILE2 -cost FLOAT|COSTFILE \
        -nrm <0>|1


%% Application %%%%%%%%

  % apply

  Derives/uses co-occurrence based cost function c for sequence alignment
  which is understood as transforming sequence v into sequence w with
  minimum cost.
  Edit operations:
        substitution of v_i by w_j: c(v_i,w_j) = 1 - P(w_j|v_i)
        deletion of v_i:            c(v_i,_) = 1 - P(_|v_i)
        insertion of w_j:           c(_,w_j) = 1 - P(w_j|_)

  % tapply

  Aligns 2 time sequences v and w specified by 2 lists of
  <label,onset,offset>-triplets.
  Interpolation of edit and overlap costs: (editCost+overlapCost)/2
  Cost calculation for segments <'x',xons,xoff> and <'y',yons,yoff>:

           |---------x----------|
  |------y------|
  yons          yoff
           xons                 xoff

  - editCost: same calculation as for 'apply' task
  - overlapCost:

      substitution of x by y

                     min(xoff,yoff)-max(xons,yons)
                 1 - -----------------------------
                     max(xoff,yoff)-min(xons,yons)
      
      deletion of x

                     min(xoff,xons)-max(xons,xons)
                 1 - -----------------------------  =  1
                     max(xoff,xons)-min(xons,xons)

      insertion of y
    
                     min(yons,yoff)-max(yons,yons)
                 1 - -----------------------------  =  1
                     max(yons,yoff)-min(yons,yons)


%% Arguments %%%%%%%%%%

  -task
        apply: align
        tapply: temporal alignment
        train: develop cost function
        find: find sequences in INPUTFILE2 (-infind) similar to those in INPUTFILE (-i)
  -i INPUTFILE
        text file containing sequence pairs v and w, one per line,
        separated by a tabulator
        example: A m p e l              Q a m p l
  -i INPUTFILE1+INPUTFILE2
        2 text files containing rows with label timeOnset timeOffset triplets
        example: U 20000 23000
        (time values can be ms, samples, etc.)
  -cost FLOAT|COSTFILE
        FLOAT: constant cost for all operations
        COSTFILE: text file
          containing cost for symbol pairs (_ for indels)
                %% examples (v window length -vl==0):
                a u     0.3     % c(a,u)
                a _     0.2     % c(a,_)
                _ u     0.4     % c(_,u)
                default 1       % c(unseen pairings)
                %% (-vl==1):
                e a i u 0.3     % c(e a i,u), i.e. a -> u | e __ i
                e a # _ 0.2     % c(e a #,_), i.e. a -> _ | e __ #
                                % (# = out-of-word)
                a _ i u 0.4     % c(a _ i,u), i.e. a -> a u | __ i 
          if omitted: INPUTFILE.cst is generated for task=train|apply
  %% for training or application without predefined cost function
  -vl <0>
        length of v-window to be used for P(w_j|vwin)
        if set to 0, vwin is v_i. If set to 1, vwin is v_i-1,v_i,v_i+1,
        etc.; <=1 recommended
  -ij <2>
        spreading co-occurrence counts within triangular window of
        area 1 and length ij*2+1 centered on i
  -ml INT
        max length of v and w to be used for cost function development
  -md <4>
        max length difference of v and w to be used for cost function
        development
  -perm <1>
        in case of length differences between v and w, pad the
        shorter sequence with '_' and increment+normalise co-occurrence
        counts for all '_'-permutations
        --> uniform treatment of substitutions and indels, increasing
        performance
  -max_s INT <dynamic adjustment abs(|v|-|w|)>
        used if -perm is set to 1. Maximum length of consecutive
        '_' sequence in permutations (e.g. for German G2P -max_s 2
        recommended: sch --> S _ _)
  -ct <lev>|prod
        cost type
        lev: 1-P(w|v) for Levenshtein calculation
        prod: P(w|v) for multiplying probabilities
        -op and -optim have to be adjusted accordingly:
          lev -> sum, min
          prod -> sum|prod, max
  -default <1>|<0>
        default costs for unseen pairings, 1 for lev, 0 for prod
  -zs FLOAT
        cost for zero substitution. If not set 1-P(x|x)
  -bnd STRING
        defines boundary symbol BS which is intended to mark (e.g. word)
        boundaries when applying the aligner. It is removed from the
        training material and afterwards added to the cost function in
        the following way: c(BS,BS)=0, c(BS,_)=c(_,BS)=0.5,
        c(_,*)=c(*,_)=default*2. This way, BS will not take part in any
        non-zero substitution, and will be zero-substituted rather than
        deleted/inserted.
        
  %% for application
  -al <w>|v|vw
        w: align w (2nd column) to v (1st column)
        v: align v to w
        vw: align both
        Remarks: all based on cost function for v -> w
                 v,w: aligned column contains insertions '+' and
                      deletions '_'
                 vw: both columns contain deletions '_' only
  -nrm <0>|1
        normalise costs wrt length of v
  -op <sum>|prod 
        sum or multiply costs
  -optim <min>|max
        find path to minimise|maximise costs
  -zs FLOAT
        cost for zero substitution (if not specified in cost function)
        if neither set nor specified in the cost function, the default
        cost is used
  -default <1>
        cost for unseen cases
  -joker JOKER
        placeholder, cost 0
  -displc <0>|1
        display sum of edit costs
  -bnd STRING
    defines boundary symbol BS which is intended to mark (e.g. word)
    boundaries when applying the aligner. It is added to the cost
    function as described above (see training section).

  %% for finding corresponding w-sequences for given v
  -infind STRING
        file containing sequences
  -md <UNSPEC>
        max allowed length difference of v and w
  -max_s INT <UNSPEC>
         Maximum length of allowed consecutive '_' sequence in aligned
         sequence
  -nrm <0>|1
  -ct <lev>|prod
        cost type (see above)
  -default <1>
        cost for unseen cases
  -zs FLOAT
        cost for zero substitution (if not specified in cost function)
        if neither set nor specified in the cost function, the default
        cost is used
  -mc FLOAT
    cost upper bound for assigning w to v

%% Input, Output format %%%%%%%%%%%%%%%%%%%%%%%

  % Input -task train|apply: v <tab> w
        U n t           U n
  % Input -task find:
  % -i: v
        U n t
  % -infind: w
        U n
  % Output -task train|apply:
  % -al w: v <newline> w_aligned
        U n t
        U n _
  % -al v: v_aligned <newline> w
        U n+t
        U n
  % -al w -displc 1: v <newline> w_aligned <newline> sum_cost
        U n t
        U n _
        1
  % Output -task find: v <newline> wi <tab> wi|v_aligned <tab> cost <newline>
  % -al w
        U n t
            U n   U n _   1
            n     _ n _   2
  % -al v
        U n t
            U n   U n+t   1
            n     U+n+t   2


%% Available Cost Function Files %%%%%%%%%%%%%%

  to be announced

%% Some application examples %%%%%%%%%%%%%%%%%%%%%%%

  %%% -task apply

  % Grapheme-phoneme alignment (training data for G2P conversion)
  v: graphemes (unified to lowercase)
  w: phonemes
  -al w
  -zs *         % or simply omit -zs; since <x> != /x/
  -max_s 2      % for German, <s c h> /S _ _/
  -ml 18    % long sequences make cost function noisy
  -ij 1

  % Canonic-spontaneous_speech alignment CS1 (training data for
  % P2P conversion)
  v: canonic transcription
  w: spontaneous speech transcription
  -al w
  -zs 0   % since /x/ = /x/

  % Canonic-spontaneous_speech alignment CS2 (training data for
  % P2P conversion)
  v: spontaneous speech transcription
  w: canonic transcription
  -al v
  -zs 0   % since /x/ = /x/

  % MAUS-to-canonic alignment MC1 (mapping MAUS output on prondict)
  %     (same as Canonic-spontaneous_speech alignment)
  v: canonic transcription
  w: MAUS output
  -al w
  -zs 0

  % MAUS-to-canonic alignment MC2 (mapping MAUS output on prondict)
  v: MAUS output
  w: canonic transcription
  -al v
  -zs 0

  % observed alignment performances:
  CS1 > CS2
  MC1 > MC2

  %%% -task find
  % finding phonetic realisations of canonic forms
  -i: canonic form lexicon, 1 sequence per row
  -infind: connected speech transcriptions, 1 sequence per row
  -nrm 1, normalise to remove influence of sequence lengths 
  -mc 0.37, depending also on -nrm
  -cost 1, if no cost function is available
  -zs 0, zero substitution cost (defaults to 1 if not specified!)

  % finding potential canonic forms for phonetic realisation
  -i: connected speech transcriptions, 1 sequence per row
  -infind: canonic form lexicon, 1 sequence per row
  -nrm 1, normalise to remove influence of sequence lengths 
  -mc 0.37, depending also on -nrm
  -cost 1, if no cost function is available
  -zs 0, zero substitution cost (defaults to 1 if not specified!)

% v1.0
% October 1st 2013
% Uwe Reichel, IPS