#!/usr/bin/perl
#
#    pal2nal.pl  (v8)                                      Mikita Suyama
#
#    Usage:  pal2nal.pl  pep.ali  nuc.fasta  [nuc.fasta...]  [options]  >  output
#
#        pep.ali:    protein alignment either in CLUSTAL or FASTA format
#
#        nuc.fasta:  DNA sequences (single multi-fasta or separated files)
#
#        Options:  -output (clustal(default)|paml|fasta)
#                  -nogap
#                  -html
#                  -nostderr
#                  -blockonly
#                  -nomismatch
#                  -codontable (universal(default)|vmitchondria)
#
#        - IDs in pep.aln are used in the output.
#
#        - Sequence order in pep.aln and nuc.fasta should be the same.
#
#        - In-frame stop -> "*" or "_"
#
#        - Frameshift
#
#            Gene    HCDG
#            Pseudo  HC1G     2 del in pseudo
#
#            Gene    HCDG
#            Pseudo  HC2G     1 del in pseudo
#
#            Gene    ND-TY
#            Pseudo  ND1TY    1 ins in pseudo
#
#            Gene    ND-TY
#            Pseudo  ND2TY    2 ins in pseudo
#
#            Gene    EREQK
#            Pseudo  EK4QK    1 ins in pseudo
#
#
#        example:
#
#          pep.aln
#            AA1    ACDEFGARH1G-F*
#            AA2    A2DDW-A*H-G2F*
#
#          nuc.fasta
#            >NUC1
#            GCTTGTGATGAATTTGGTGCTCGTCATGGGGTTTTAA
#            >NUC2
#            GCGGGGACGACTGGGCGTAGCACGGGGGTTTTGA
#
#          output
#            NUC1   GCTTGTGATGAATTTGGTGCTCGTCATG--GGG---TTTTAA
#            NUC2   GCGGG-GACGACTGG---GCGTAGCAC---GGGGG-TTTTGA
#    
#
#       old script name: pal2nal.pl
#                                       06/09/2000    Mikita Suyama
#
#     - modified for multiple seqs.     17/10/2000
#
#     - AA-ids and NUC-ids must be the same -> TURNED OFF  05/05/2002
#       Now, only the order is the matter.
#       AA-ids are used throughout; NUC-ids are not used at all.
#
#     - old condition: AA = NUC
#       new condition: AA <= NUC   ##  NUC(cDNA) can be longer than AA  ##
#
#     - renamed from "pal2nal.pl" to codon_aln.pl          12/07/2004
#
#  v2.
#     - nuc.fasta can be either one multiple fasta file or several files.
#       The order of seqs should be the same as those in the pep.aln file.
#                                                          13/07/2004
#
#  v3.
#     - renamed back to "pal2nal.pl"
#     - subroutine (pn2codon) is modified:
#         - reverse translation table (%p2c) was replaced
#         - if there is no exact match, try fragment anchors
#         - return value is changed to hash
#     - new options:
#         -output clustal|paml|fasta
#                   clustal format (default)
#                   paml
#                   fasta
#         -nogap        ->  remove all gaped codons and stop codons
#
#                                                          22/04/2005
#  v4.
#     - new option:
#         -html         ->  for the web server (STDERR messages -> STDOUT)
#                                              (mismatch in color)
#         -nostderr     ->  for Ks,Ka calc on the web server (NO STDERR messages)
#                                                          25/04/2005
#
#  v6.
#   #  - new option:
#   #      -fasta        -> set this option if input pep file is
#   #                       not in *.aln format but in *.fasta format.
#   #                                                       16/08/2005
#
#  v7.
#     - new option:
#   #       -a  input_alignment_type
#   #         clustal  (default)    CLUSTALW format
#   #         fasta
#   #         gblocks               Gblocks txt format *-gb.txt (-p=t)
#   #
#   #           in the case of "-a gblocks", only the codons in conserved
#   #           blocks are converted into codon alignment.
#                                                          27/01/2006
#
#     - if clustal alignment contains
#
#     - new option:
#          -blockonly
#                                                          03/02/2006
#
#     - show warning message only if the position is marked '#'.
#
#     - -i, -j options are deleted.
#                                                          08/02/2006
#  v8.
#     - automatic detection of the input alignment format
#       (-a is no longer used.)
#                                                          14/02/2006
#     - new option:
#          -nomismatch  -> remove mismatched codons (mismatch between 
#                          pep and cDNA) from the output.
#
#          -codontable (universal(default)|vmitochondria)
#                                                          26/03/2006

$| = 1;

if ($#ARGV < 1) {
    & showhelp();
    exit;
} else {
    $getoutform = 0;
    $nogap = 0;
    $html = 0;
    $nostderr = 0;
    $fasta = 0;
    undef($alnfile);
    undef(@nucfiles);
    $outform = "clustal";
    $blockonly = 0;
    $nomismatch = 0;
    $getcodontable = 0;
    $codontable = "universal";
    foreach $i (0..$#ARGV) {
        if ($ARGV[$i] eq "-h") {
            & showhelp();
        } elsif ($ARGV[$i] eq "-output") {
            $getoutform = 1;
        } elsif ($getoutform) {
            $outform = $ARGV[$i];
            if ($outform ne "clustal" &&
                $outform ne "paml" &&
                $outform ne "fasta") {
                print STDERR "\nERROR:  valid output format: clustal, paml, or fasta\n\n";
                exit;
            }
            $getoutform = 0;
        } elsif ($ARGV[$i] eq "-blockonly") {
            $blockonly = 1;
        } elsif ($ARGV[$i] eq "-nogap") {
            $nogap = 1;
        } elsif ($ARGV[$i] eq "-nomismatch") {
            $nomismatch = 1;
        } elsif ($ARGV[$i] eq "-codontable") {
            $getcodontable = 1;
        } elsif ($getcodontable) {
            $codontable = $ARGV[$i];
            if ($codontable ne "universal" &&
                $codontable ne "vmitochondria") {
                print STDERR "\nERROR:  valid codon table: universal or vmitochondria\n\n";
                exit;
            }
            $getcodontable = 0;
        } elsif ($ARGV[$i] eq "-html") {
            $html = 1;
        } elsif ($ARGV[$i] eq "-nostderr") {
            $nostderr = 1;
        } elsif (!$alnfile) {
            $alnfile = $ARGV[$i];
        } else {
            push(@nucfiles, $ARGV[$i]);
        }
    }
}


#---------------------#
#  Get nuc sequences
#---------------------#

undef(@nucid);
undef(@nucseq);
$nseq = -1;
foreach $i (0..$#nucfiles) {
    open(NUCFILE, "< $nucfiles[$i]") || die "Can't open $nucfiles[$i]";
    while (<NUCFILE>) {
        if (!/^#/ && /\S+/) {
            if (/^>(\S+)/) {
                ++$nseq;
                push(@nucid, $1);
            } else {
                s/[^a-zA-Z]//g;
                $nucseq[$nseq] .= $_;
            }
        }
    }
    close(NUCFILE);
}


#-------------------#
#  Get aa alignemt
#-------------------#

undef(@aaid);
undef(%id2aaaln);
undef(%aaidcnt);
undef(@aaseq);
undef($gblockseq);

$gettype = 1;
open(ALNFILE, "< $alnfile") || die "Can't open $alnfile";
while (<ALNFILE>) {
    chomp;
    if ($gettype && !/^#/ && /\S+/) {
        if (/^CLUSTAL/) {
            $inalntype = "clustal";
        } elsif (/^>/) {
            $inalntype = "fasta";
        } elsif (/^Gblocks/) {
            $inalntype = "gblocks";
        }

        $gettype = 0;
    }
}
close(ALNFILE);

if ($inalntype eq "clustal") {
    open(ALNFILE, "< $alnfile") || die "Can't open $alnfile";
    $getblock = 0;
    while (<ALNFILE>) {
        chomp;
        if (!/^CLUSTAL/ && /^\S+/ && !/^#/) {
            s/\s+$//;
            @dat = split(/\s+/, $_);
            ++$aaidcnt{$dat[0]};
            push(@aaid, $dat[0]) if ($aaidcnt{$dat[0]} == 1);
            $dat[1] =~ tr/a-z/A-Z/;
            $id2aaaln{$dat[0]} .= $dat[1];

            $tmplen = length($_);

            /^\S+\s+/;
            $idspc = length($&);
            $subalnlen = length($dat[1]);

            $getblock = 1;
        } elsif ($getblock) {
            $_ .= ' ' x ($tmplen - length($_));
            $gblockseq .= substr($_, $idspc, $subalnlen);

            $getblock = 0;
        }
    }
    close(ALNFILE);
} elsif ($inalntype eq "fasta") {
    open(ALNFILE, "< $alnfile") || die "Can't open $alnfile";
    while (<ALNFILE>) {
        chomp;
        if (/^>(\S+)/) {
            $tmpid = $1;
            push(@aaid, $tmpid);
        } else {
            s/\s+//g;
            tr/a-z/A-Z/;
            $id2aaaln{$tmpid} .= $_;
        }
    }
    close(ALNFILE);
} elsif ($inalntype eq "gblocks") {
    open(ALNFILE, "< $alnfile") || die "Can't open $alnfile";
    $getaln = 0;
    while (<ALNFILE>) {
        chomp;
        if (/^\s+\=/) {
            $getaln = 1;
        } elsif (/^Parameters/) {
            $getaln = 0;
        } elsif ($getaln) {
            @dat = split(/\s+/, $_);
            if (/^Gblocks/) {
                $gblockseq .= $dat[1];
            } elsif (/^\S+/) {
                ++$aaidcnt{$dat[0]};
                push(@aaid, $dat[0]) if ($aaidcnt{$dat[0]} == 1);
                $dat[1] =~ tr/a-z/A-Z/;
                $id2aaaln{$dat[0]} .= $dat[1];
            }
        }
    }
    close(ALNFILE);
}

foreach $i (0..$#aaid) {
    push(@aaseq, $id2aaaln{$aaid[$i]});
}


#-------------------------------------#
#  Check the input seqs (pep <=> nuc)
#-------------------------------------#

if ($#aaid != $#nucid) {
    $naa = $#aaid + 1;
    $nnuc = $#nucid + 1;
    if ($html) {
        print "\nERROR: number of input seqs differ (aa: $naa;  nuc: $nnuc)!!\n\n";
    } else {
        print STDERR "\nERROR: number of input seqs differ (aa: $naa;  nuc: $nnuc)!!\n\n";
        print STDERR "   aa  '@aaid'\n";
        print STDERR "   nuc '@nucid'\n";
    }
    exit;
}
        

#-------------------#
#  Codon sequences
#-------------------#

undef(@codonseq);
undef(%aaidpos2mismatch);
undef(@outmessage);
foreach $i (0..$#aaid) {
    %codonout = & pn2codon($aaseq[$i], $nucseq[$i], $codontable);
    @message = @{$codonout{'message'}};

    foreach $j (0..$#message) {
        $outl = "WARNING: $aaid[$i] $message[$j]";
        push(@outmessage, $outl);
        @dat = split(/\s+/, $message[$j]);
        $dat[1] =~ s/:$//;
        $aaidpos2mismatch{"$aaid[$i] $dat[1]"} = 1;
    }

    if ($codonout{'result'} == 1 || $codonout{'result'} == 2) {
        push(@codonseq, $codonout{'codonseq'});
    } else {
        if ($html) {
            print "#---  ERROR: no match found  ---#\n";
            print "    >$aaid[$i]\n";
            print "    $aaseq[$i]\n";
            print "    >$nucid[$i]\n";
            print "    $nucseq[$i]\n";
        } else {
            print STDERR "#---  ERROR: no match found  ---#\n";
            print STDERR "    >$aaid[$i]\n";
            print STDERR "    $aaseq[$i]\n";
            print STDERR "    >$nucid[$i]\n";
            print STDERR "    $nucseq[$i]\n";
        }
        exit;
    }
}


#-------------------#
#  Warning in '#'?
#-------------------#

if ($gblockseq =~ /#/ && $blockonly) {
    undef(@newoutmessage);
    foreach $i (0..$#outmessage) {
        $mpos = (split(/\s+/, $outmessage[$i]))[3];
        $mpos =~ s/://;

        if (substr($gblockseq, $mpos - 1, 1) eq "#") {
            push(@newoutmessage, $outmessage[$i]);
        }
    }
    @outmessage = @newoutmessage;
}


#--------------------#
#  Warning messages
#--------------------#

if (!$nostderr) {
    foreach $j (0..$#outmessage) {
        if ($html) {
            if ($j == 0 && !$nomismatch) {
                print "#------------------------------------------------------------------------#\n";
            }
            if ($nomismatch) {
                # print "#  $outmessage[$j]  (excluded from the output)\n";
            } else {
                print "#  $outmessage[$j]\n";
            }
            if ($j == $#outmessage && !$nomismatch) {
                print "#------------------------------------------------------------------------#\n\n";
            }
        } else {
            if ($j == 0 && !$nomismatch) {
                print STDERR "#------------------------------------------------------------------------#\n";
                print STDERR "#  Input files:  $alnfile @nucfiles\n";
            }
            if ($nomismatch) {
                # print STDERR "#  $outmessage[$j]  (exlucded from the output)\n";
            } else {
                print STDERR "#  $outmessage[$j]\n";
            }
            if ($j == $#outmessage && !$nomismatch) {
                print STDERR "#------------------------------------------------------------------------#\n\n";
            }
        }
    }
}

undef(%errorpos);
foreach $j (0..$#outmessage) {
    @dat = split(/\s+/, $outmessage[$j]);
    $tmperrpos = $dat[3];
    $tmperrpos =~ s/:$//;
    $tmperrpos -= 1;
    $errorpos{$tmperrpos} = 1;
}


#----------------------------------#
#  Make an AA-based NUC alignment
#----------------------------------#

undef(@tmppos);
undef(@codonaln);
undef(@coloraln);
undef($maskseq);
foreach $i (0..length($aaseq[0]) - 1) {
    $tmpmax = 0;
    $apos = $i + 1;

    #-----------#
    # gblocks ?
    #-----------#

    $putcodon = 1;
    if ($gblockseq =~ /#/ && substr($gblockseq, $i, 1) ne "#" && $blockonly) {
        $putcodon = 0;
    }
    if ($nomismatch && $errorpos{$i}) {
        $putcodon = 0;
    }

    foreach $k (0..$#aaid) {
        $tmpaa = substr($aaseq[$k], $i, 1);
        if ($tmpaa !~ /\d/) {
            $tmplen = 3;
        } else {
            $tmplen = (int(($tmpaa - 1) / 3) + 1) * 3;  # 1, 2, 3 -> 3
                                                        # 4, 5, 6 -> 6
                                                        # 7, 8, 9 -> 9
        }
        $tmpmax = $tmplen if ($tmpmax < $tmplen);
    }
    foreach $k (0..$#aaid) {
        $tmpaa = substr($aaseq[$k], $i, 1);
        if ($tmpaa !~ /\d/) {
            if ($tmpaa eq '-') {
                # if ($putcodon || (!$blockonly && !$nomismatch)) {
                if ($putcodon) {
                    $codonaln[$k] .= '-' x $tmpmax;
                    if ($aaidpos2mismatch{"$aaid[$k] $apos"}) {
                        $coloraln[$k] .= 'R' x $tmpmax;
                    } else {
                        $coloraln[$k] .= '-' x $tmpmax;
                    }
                }
            } elsif ($tmpaa =~ /[A-Z]/ || $tmpaa eq '*') {
                # if ($putcodon || (!$blockonly && !$nomismatch)) {
                if ($putcodon) {
                    $codonaln[$k] .= substr($codonseq[$k], $tmppos[$k], 3);
                    if ($aaidpos2mismatch{"$aaid[$k] $apos"}) {
                        $coloraln[$k] .= 'RRR';
                    } else {
                        $coloraln[$k] .= '---';
                    }
                }
                $tmppos[$k] += 3;
                # if ($putcodon || (!$blockonly && !$nomismatch)) {
                if ($putcodon) {
                    $codonaln[$k] .= '-' x ($tmpmax - 3) if ($tmpmax - 3 > 0);
                    if ($aaidpos2mismatch{"$aaid[$k] $apos"}) {
                        $coloraln[$k] .= 'R' x ($tmpmax - 3) if ($tmpmax - 3 > 0);
                    } else {
                        $coloraln[$k] .= '-' x ($tmpmax - 3) if ($tmpmax - 3 > 0);
                    }
                }
            }
        } elsif ($tmpaa =~ /\d/) {
            # if ($putcodon || (!$blockonly && !$nomismatch)) {
            if ($putcodon) {
                $codonaln[$k] .= substr($codonseq[$k], $tmppos[$k], $tmpaa);
                if ($aaidpos2mismatch{"$aaid[$k] $apos"}) {
                    $coloraln[$k] .= 'R' x $tmpaa;
                } else {
                    $coloraln[$k] .= '-' x $tmpaa;
                }
            }
            $tmppos[$k] += $tmpaa;
            # if ($putcodon || (!$blockonly &&!$nomismatch)) {
            if ($putcodon) {
                $codonaln[$k] .= '-' x ($tmpmax - $tmpaa);
                if ($aaidpos2mismatch{"$aaid[$k] $apos"}) {
                    $coloraln[$k] .= 'R' x ($tmpmax - $tmpaa);
                } else {
                    $coloraln[$k] .= '-' x ($tmpmax - $tmpaa);
                }
            }
        }
    }
    if (!$blockonly) {
        # if ($putcodon && substr($gblockseq, $i, 1) eq "#") {
        #     $maskseq .= '#' x $tmpmax;
        # } else {
        #     $maskseq .= ' ' x $tmpmax;
        # }
        if ($putcodon) {
            if (substr($gblockseq, $i, 1) eq "#") {
                $maskseq .= "#" x $tmpmax;
            } else {
                $maskseq .= " " x $tmpmax;
            }
        }
    }
}


#-----------#
#  -nogap?
#-----------#

$alilen = length($codonaln[0]);

if ($nogap) {
    $tmppos = 0;
    undef(@nogapaln);
    undef(@nogapcoloraln);
    undef($nogapmaskseq);
    while ($tmppos < $alilen) {
        $outok = 1;
        foreach $i (0..$#codonaln) {
            $tmpcodon = substr($codonaln[$i], $tmppos, 3);
            $outok = 0 if ($tmpcodon =~ /-/);
            $outok = 0 if ($tmpcodon =~ /(((U|T)A(A|G|R))|((T|U)GA))/);
        }
        if ($outok) {
            foreach $i (0..$#codonaln) {
                $tmpcodon = substr($codonaln[$i], $tmppos, 3);
                $nogapaln[$i] .= $tmpcodon;
                $tmpcolorcodon = substr($coloraln[$i], $tmppos, 3);
                $nogapcoloraln[$i] .= $tmpcolorcodon;
            }
            $nogapmaskseq .= substr($maskseq, $tmppos, 3);
        }
        $tmppos += 3;
    }
    @codonaln = @nogapaln;
    @coloraln = @nogapcoloraln;
    $maskseq = $nogapmaskseq;
}


#----------#
#  Output
#----------#

$maxn = 0;
foreach $i (0..$#aaid) {
    $maxn = length($aaid[$i]) if ($maxn < length($aaid[$i]));
}
$maxn = 10 if ($maxn < 10);

$alilen = length($codonaln[0]);

foreach $i (0..$#aaid) {
    1 while $codonaln[$i] =~ s/(.{60})(.+)/$1\n$2/;
    1 while $coloraln[$i] =~ s/(.{60})(.+)/$1\n$2/;
}
1 while $maskseq =~ s/(.{60})(.+)/$1\n$2/;

if ($outform eq "clustal") {

    #-----------#
    #  clustal
    #-----------#

    print "CLUSTAL W multiple sequence alignment\n";
    print "\n";

    if ($html) {
        @output1 = split(/\n/, $codonaln[0]);
        foreach $i (0..$#output1) {
            foreach $k (0..$#aaid) {
                printf "%-${maxn}s    ", $aaid[$k];
                $outf = (split(/\n+/, $codonaln[$k]))[$i];
                $outr = (split(/\n+/, $coloraln[$k]))[$i];
                $rlen = length($outf);
                foreach $l (0..$rlen - 1) {
                    $tmpnuc = substr($outf, $l, 1);
                    $tmpr = substr($outr, $l, 1);
                    if ($tmpr eq "R") {
                        print "<FONT color='red'>$tmpnuc</FONT>";
                    } else {
                        print "$tmpnuc";
                    }
                }
                print "\n";
            }
            $outmask = (split(/\n+/, $maskseq))[$i];
            printf "%-${maxn}s    $outmask\n", ' ' if (!$blockonly && $gblockseq =~ /#/);
            print "\n";
        }
    } else {
        @output1 = split(/\n/, $codonaln[0]);
        foreach $i (0..$#output1) {
            foreach $k (0..$#aaid) {
                $outf = (split(/\n+/, $codonaln[$k]))[$i];
                printf "%-${maxn}s    $outf\n", $aaid[$k];
            }
            $outmask = (split(/\n+/, $maskseq))[$i];
            printf "%-${maxn}s    $outmask\n", ' ' if (!$blockonly && $gblockseq =~ /#/);
            print "\n";
        }
    }
} elsif ($outform eq "paml") {

    #--------#
    #  paml
    #--------#

    $nseq = $#codonaln + 1;
    printf " %3d %6d\n", $nseq, $alilen;
    foreach $i (0..$#codonaln) {
        print  "$aaid[$i]\n";
        if ($html) {
            @outf = split(/\n+/, $codonaln[$i]);
            @outr = split(/\n+/, $coloraln[$i]);
            foreach $k (0..$#outf) {
                $lenf = length($outf[$k]);
                foreach $l (0..$lenf - 1) {
                    $tmpnuc = substr($outf[$k], $l, 1);
                    $tmpr = substr($outr[$k], $l, 1);
                    if ($tmpr eq "R") {
                        print "<FONT color='red'>$tmpnuc</FONT>";
                    } else {
                        print "$tmpnuc";
                    }
                }
                print "\n";
            }
        } else {
            print  "$codonaln[$i]\n";
        }
    }
} elsif ($outform eq "fasta") {

    #---------#
    #  fasta
    #---------#

    foreach $i (0..$#codonaln) {
        print  ">$aaid[$i]\n";
        if ($html) {
            @outf = split(/\n+/, $codonaln[$i]);
            @outr = split(/\n+/, $coloraln[$i]);
            foreach $k (0..$#outf) {
                $lenf = length($outf[$k]);
                foreach $l (0..$lenf - 1) {
                    $tmpnuc = substr($outf[$k], $l, 1);
                    $tmpr = substr($outr[$k], $l, 1);
                    if ($tmpr eq "R") {
                        print "<FONT color='red'>$tmpnuc</FONT>";
                    } else {
                        print "$tmpnuc";
                    }
                }
                print "\n";
            }
        } else {
            print  "$codonaln[$i]\n";
        }
    }
}


#-----------------------------------------------------------------------

sub pn2codon {
    #    pn2codon v3
    #
    #    input:   $pep    protein sequence
    #                         termination -> "_" or "*";
    #                         frameshift  -> digit
    #                         "-" or "."  -> gap
    #             $nuc    DNA or RNA sequence (lower/upper case letters)
    #
    #             $codontable  'universal' or 'vmitochondria'
    #
    #    return:  hash
    #                $retval{'codonseq'}: codon seq (w/o gap)
    #                $retval{'message'}:  error/warning messame (array)
    #                $retval{'result'}:   1: OK, 2: mismatch, -1: no match found
    #
    #
    #                                      05/05/2002    Mikita Suyama
    #                                      12/07/2004
    #
    #    v2                                22/04/2005
    #    - reverse translation table (%p2c) was replaced
    #    - if there is no exact match, try fragment anchors
    #    - return value is changed to hash
    #
    #    v3                                26/03/2006
    #    - codon table for vertebrate mitochondria (vmitochondria) is added


    local($pep, $nuc, $codontable) = @_;


    local(%p2c);
    local($peplen, $qcodon, $codon);
    local($tmpaa, $message, $modpep, @qcodon, @fncodon, $wholecodon);
    local($i, $j, $anclen, @anchor, $peppos, $tmpcodon, $codonpos);

    local(%retval);


    if ($codontable eq "universal") {
        #-----------#
        # universal
        #-----------#
        %p2c = (
            "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))",
            "R" => "((CG.)|(AG(A|G|R)))",
            "S" => "(((U|T)C.)|(AG(U|T|C|Y)))",
            "A" => "(GC.)",
            "G" => "(GG.)",
            "P" => "(CC.)",
            "T" => "(AC.)",
            "V" => "(G(U|T).)",
            "I" => "(A(U|T)(U|T|C|Y|A))",
            "_" => "(((U|T)A(A|G|R))|((T|U)GA))",
            "*" => "(((U|T)A(A|G|R))|((T|U)GA))",
            "C" => "((U|T)G(U|T|C|Y))",
            "D" => "(GA(U|T|C|Y))",
            "E" => "(GA(A|G|R))",
            "F" => "((U|T)(U|T)(U|T|C|Y))",
            "H" => "(CA(U|T|C|Y))",
            "K" => "(AA(A|G|R))",
            "N" => "(AA(U|T|C|Y))",
            "Q" => "(CA(A|G|R))",
            "Y" => "((U|T)A(U|T|C|Y))",
            "M" => "(A(U|T)G)",
            "W" => "((U|T)GG)",
            "X" => "...",
        );
    } elsif ($codontable eq "vmitochondria") {
        #--------------------------#
        # vertebrate mitochondrial
        #--------------------------#
        %p2c = (
            "L" => "((C(U|T).)|((U|T)(U|T)(A|G|R)))",
            "R" => "(CG.)",
            "S" => "(((U|T)C.)|(AG(U|T|C|Y)))",
            "A" => "(GC.)",
            "G" => "(GG.)",
            "P" => "(CC.)",
            "T" => "(AC.)",
            "V" => "(G(U|T).)",
            "I" => "(A(U|T)(U|T|C|Y))",
            "_" => "(((U|T)A(A|G|R))|(AG(A|G|R)))",
            "*" => "(((U|T)A(A|G|R))|(AG(A|G|R)))",
            "C" => "((U|T)G(U|T|C|Y))",
            "D" => "(GA(U|T|C|Y))",
            "E" => "(GA(A|G|R))",
            "F" => "((U|T)(U|T)(U|T|C|Y))",
            "H" => "(CA(U|T|C|Y))",
            "K" => "(AA(A|G|R))",
            "N" => "(AA(U|T|C|Y))",
            "Q" => "(CA(A|G|R))",
            "Y" => "((U|T)A(U|T|C|Y))",
            "M" => "(A(U|T)(A|G|R))",
            "W" => "((U|T)G(A|G|R))",
            "X" => "...",
        );
    }


    #---------------------------------------------------------------#
    # make codon sequence, $qcodon,  with all possible combinations
    #---------------------------------------------------------------#

    $peplen = length($pep);
    undef($qcodon);
    foreach $i (0..$peplen - 1) {
        $peppos = $i + 1;
        $tmpaa = substr($pep, $i, 1);
        if ($tmpaa =~ /[ACDEFGHIKLMNPQRSTVWY_\*XU]/) {
            $qcodon .= $p2c{substr($pep, $i, 1)};
        } elsif ($tmpaa =~ /\d/) {
            $qcodon .= "." x $tmpaa;
        } elsif ($tmpaa =~ /[-\.]/) {
            # nothing to do
        } else {
            $message = "pepAlnPos $peppos: $tmpaa unknown AA type. Taken as 'X'";
            push(@{$retval{'message'}}, $message);
            $qcodon .= $p2c{'X'};
        }
    }
    # print "$qcodon\n";


    #-----------------------------#
    # does $nuc contain $qcodon ?
    #-----------------------------#

    if ($nuc =~ /$qcodon/i) {
        $codon = $&;

        $retval{'codonseq'} = $codon;
        $retval{'result'} = 1;

    } else {
        #-------------------#
        # make 10 aa anchor
        #-------------------#

#        undef(@{$retval{'message'}});

        $modpep = $pep;
        1 while $modpep =~ s/(.{10})(.{10,})/$1\n$2/;
        @anchor = split(/\n/, $modpep);
        undef($wholecodon);
        foreach $i (0..$#anchor) {
            # print "    $anchor[$i]\n";
            $anclen = length($anchor[$i]);
            undef(@qcodon);
            undef(@fncodon);
            foreach $j (0..$anclen - 1) {
                $peppos = $i * 10 + $j + 1;
                $tmpaa = substr($anchor[$i], $j, 1);
                if ($tmpaa =~ /[ACDEFGHIKLMNPQRSTVWY_\*XU]/) {
                    $qcodon[$i] .= $p2c{$tmpaa};
                    $fncodon[$i] .= $p2c{'X'};
                } elsif ($tmpaa =~ /\d/) {
                    $qcodon[$i] .= "." x $tmpaa;
                    $fncodon[$i] .= "." x $tmpaa;
                } elsif ($tmpaa =~ /[-\.]/) {
                    # nothing to do
                } else {
                    #del $message = "pepAlnPos $peppos: $tmpaa unknown AA type. Replaced by 'X'";
                    #del push(@{$retval{'message'}}, $message);
                    $qcodon[$i] .= $p2c{'X'};
                    $fncodon[$i] .= $p2c{'X'};
                }
            }
            if ($nuc =~ /$qcodon[$i]/i) {
                $wholecodon .= $qcodon[$i];
            } else {
                $wholecodon .= $fncodon[$i];
            }
        }

        if ($nuc =~ /$wholecodon/i) {
            $codon = $&;
            $codonpos = 0;
            foreach $i (0..$peplen - 1) {
                $peppos = $i + 1;
                $tmpaa = substr($pep, $i, 1);
                undef($tmpcodon);
                if ($tmpaa !~ /\d/ && $tmpaa !~ /-/) {
                    $tmpcodon = substr($codon, $codonpos, 3);
                    $codonpos += 3;
                    if ($tmpcodon !~ /$p2c{$tmpaa}/) {
                        $message = "pepAlnPos $peppos: $tmpaa does not correspond to $tmpcodon";
                        push(@{$retval{'message'}}, $message);
                    }
                } elsif ($tmpaa =~ /\d/i) {
                    $tmpcodon = substr($codon, $codonpos, $tmpaa);
                    $codonpos += $tmpaa;
                }
                # print "$tmpaa    $tmpcodon\n";
            }
            $codon;

            $retval{'codonseq'} = $codon;
            $retval{'result'} = 2;

        } else {

            $retval{'result'} = -1;

        }

    }

    return(%retval);
}

#---------------------------------------------------------

sub showhelp {
print STDERR<<EOF;

pal2nal.pl  (v8)

Usage:  pal2nal.pl  pep.ali  nuc.fasta  [nuc.fasta...]  [options]


    pep.ali:    protein alignment either in CLUSTAL or FASTA format

    nuc.fasta:  DNA sequences (single multi-fasta or separated files)

    Options:  -h            Show help 

              -output (clustal|paml|fasta)
                            Output format; default = clustal

              -blockonly    Show only user specified blocks
                            '#' under CLUSTAL alignment (see example)

              -nogap        remove columns with gaps and inframe stop codons

              -nomismatch   remove mismatched codons (mismatch between
                            pep and cDNA) from the output

              -codontable (universal|vmitochondria)
                            Codon table
                              - universal (default)
                              - vmitochondria -> vertebrate mitochondrial

              -html         HTML output (only for the web server)

              -nostderr     No STDERR messages (only for the web server)


    - sequence order in pep.aln and nuc.fasta should be the same.

    - IDs in pep.aln are used in the output.

EOF
}
