Throwing God’s Dice


use strict;
use warnings;

$|=1;

my @aa=('A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V');

#PAM1
#Mutation probability matrix for the evolutionary distance of 1 PAM (i.e., one Accepted Point Mutation per 100 amino acids).
#An element of this matrix, [Mij], gives the probability that the amino acid in column j will be replaced by the amino acid
#in row i after a given evolutionary interval, in this case 1 PAM. (Adapted from Figure 82. Atlas of Protein Sequence and Structure, Suppl 3, 1978, M.O. Dayhoff, ed. National Biomedical Research Foundation, 1979.)
#Thus, there is a 0.56% probability that Asp will be replaced by Glu.
#%PAM calculated from upper PAM1 table
#Hash Key states from AA1 to AA2 eg. AE means change of changing/evolving from A to E.
#Figures are in percentages (DE = 0.56)

my %PAM=("AA",98.67,"AR",0.01,"AN",0.04,"AD",0.06,"AC",0.01,"AQ",0.03,"AE",0.1,"AG",0.21,"AH",0.01,"AI",0.02,"AL",0.03,"AK",0.02,"AM",0.01,"AF",0.01,"AP",0.13,"AS",0.28,"AT",0.22,"AW",0,"AY",0.01,"AV",0.13,"RA",0.02000200020002,"RR",99.1399139913991,"RN",0.01000100010001,"RD",0,"RC",0.01000100010001,"RQ",0.09000900090009,"RE",0,"RG",0.01000100010001,"RH",0.08000800080008,"RI",0.02000200020002,"RL",0.01000100010001,"RK",0.37003700370037,"RM",0.01000100010001,"RF",0.01000100010001,"RP",0.05000500050005,"RS",0.11001100110011,"RT",0.02000200020002,"RW",0.02000200020002,"RY",0,"RV",0.02000200020002,"NA",0.09,"NR",0.01,"NN",98.22,"ND",0.42,"NC",0,"NQ",0.04,"NE",0.07,"NG",0.12,"NH",0.18,"NI",0.03,"NL",0.03,"NK",0.25,"NM",0,"NF",0.01,"NP",0.02,"NS",0.34,"NT",0.13,"NW",0,"NY",0.03,"NV",0.01,"DA",0.1,"DR",0,"DN",0.36,"DD",98.59,"DC",0,"DQ",0.05,"DE",0.56,"DG",0.11,"DH",0.03,"DI",0.01,"DL",0,"DK",0.06,"DM",0,"DF",0,"DP",0.01,"DS",0.07,"DT",0.04,"DW",0,"DY",0,"DV",0.01,"CA",0.03,"CR",0.01,"CN",0,"CD",0,"CC",99.73,"CQ",0,"CE",0,"CG",0.01,"CH",0.01,"CI",0.02,"CL",0,"CK",0,"CM",0,"CF",0,"CP",0.01,"CS",0.11,"CT",0.01,"CW",0,"CY",0.03,"CV",0.03,"QA",0.08,"QR",0.1,"QN",0.04,"QD",0.06,"QC",0,"QQ",98.76,"QE",0.35,"QG",0.03,"QH",0.2,"QI",0.01,"QL",0.06,"QK",0.12,"QM",0.02,"QF",0,"QP",0.08,"QS",0.04,"QT",0.03,"QW",0,"QY",0,"QV",0.02,"EA",0.17,"ER",0,"EN",0.06,"ED",0.53,"EC",0,"EQ",0.27,"EE",98.65,"EG",0.07,"EH",0.01,"EI",0.02,"EL",0.01,"EK",0.07,"EM",0,"EF",0,"EP",0.03,"ES",0.06,"ET",0.02,"EW",0,"EY",0.01,"EV",0.02,"GA",0.21,"GR",0,"GN",0.06,"GD",0.06,"GC",0,"GQ",0.01,"GE",0.04,"GG",99.35,"GH",0,"GI",0,"GL",0.01,"GK",0.02,"GM",0,"GF",0.01,"GP",0.02,"GS",0.16,"GT",0.02,"GW",0,"GY",0,"GV",0.03,"HA",0.02000200020002,"HR",0.1000100010001,"HN",0.21002100210021,"HD",0.04000400040004,"HC",0.01000100010001,"HQ",0.23002300230023,"HE",0.02000200020002,"HG",0.01000100010001,"HH",99.1299129912991,"HI",0,"HL",0.04000400040004,"HK",0.02000200020002,"HM",0,"HF",0.02000200020002,"HP",0.05000500050005,"HS",0.02000200020002,"HT",0.01000100010001,"HW",0,"HY",0.04000400040004,"HV",0.03000300030003,"IA",0.05999400059994,"IR",0.02999700029997,"IN",0.02999700029997,"ID",0.00999900009999,"IC",0.00999900009999,"IQ",0.00999900009999,"IE",0.02999700029997,"IG",0,"IH",0,"II",98.7101289871013,"IL",0.21997800219978,"IK",0.03999600039996,"IM",0.04999500049995,"IF",0.07999200079992,"IP",0.00999900009999,"IS",0.01999800019998,"IT",0.10998900109989,"IW",0,"IY",0.00999900009999,"IV",0.56994300569943,"LA",0.04,"LR",0.01,"LN",0.01,"LD",0,"LC",0,"LQ",0.03,"LE",0.01,"LG",0.01,"LH",0.01,"LI",0.09,"LL",99.47,"LK",0.01,"LM",0.08,"LF",0.06,"LP",0.02,"LS",0.01,"LT",0.02,"LW",0,"LY",0.01,"LV",0.11,"KA",0.01999600079984,"KR",0.18996200759848,"KN",0.12997400519896,"KD",0.02999400119976,"KC",0,"KQ",0.0599880023995201,"KE",0.0399920015996801,"KG",0.01999600079984,"KH",0.00999800039992002,"KI",0.01999600079984,"KL",0.01999600079984,"KK",99.2401519696061,"KM",0.0399920015996801,"KF",0,"KP",0.01999600079984,"KS",0.0699860027994401,"KT",0.0799840031993601,"KW",0,"KY",0,"KV",0.00999800039992002,"MA",0.06000600060006,"MR",0.04000400040004,"MN",0,"MD",0,"MC",0,"MQ",0.04000400040004,"ME",0.01000100010001,"MG",0.01000100010001,"MH",0,"MI",0.12001200120012,"ML",0.45004500450045,"MK",0.2000200020002,"MM",98.7498749874988,"MF",0.04000400040004,"MP",0.01000100010001,"MS",0.04000400040004,"MT",0.06000600060006,"MW",0,"MY",0,"MV",0.17001700170017,"FA",0.01999600079984,"FR",0.00999800039992002,"FN",0.00999800039992002,"FD",0,"FC",0,"FQ",0,"FE",0,"FG",0.00999800039992002,"FH",0.01999600079984,"FI",0.0699860027994401,"FL",0.12997400519896,"FK",0,"FM",0.00999800039992002,"FF",99.4401119776045,"FP",0.00999800039992002,"FS",0.02999400119976,"FT",0.00999800039992002,"FW",0.00999800039992002,"FY",0.20995800839832,"FV",0.00999800039992002,"PA",0.21995600879824,"PR",0.0399920015996801,"PN",0.01999600079984,"PD",0.00999800039992002,"PC",0.00999800039992002,"PQ",0.0599880023995201,"PE",0.02999400119976,"PG",0.02999400119976,"PH",0.02999400119976,"PI",0,"PL",0.02999400119976,"PK",0.02999400119976,"PM",0,"PF",0,"PP",99.2401519696061,"PS",0.16996600679864,"PT",0.0499900019996001,"PW",0,"PY",0,"PV",0.02999400119976,"SA",0.35,"SR",0.06,"SN",0.2,"SD",0.05,"SC",0.05,"SQ",0.02,"SE",0.04,"SG",0.21,"SH",0.01,"SI",0.01,"SL",0.01,"SK",0.08,"SM",0.01,"SF",0.02,"SP",0.12,"SS",98.4,"ST",0.32,"SW",0.01,"SY",0.01,"SV",0.02,"TA",0.31993601279744,"TR",0.00999800039992002,"TN",0.0899820035992801,"TD",0.02999400119976,"TC",0.00999800039992002,"TQ",0.01999600079984,"TE",0.01999600079984,"TG",0.02999400119976,"TH",0.00999800039992002,"TI",0.0699860027994401,"TL",0.02999400119976,"TK",0.10997800439912,"TM",0.01999600079984,"TF",0.00999800039992002,"TP",0.0399920015996801,"TS",0.379924015196961,"TT",98.6902619476105,"TW",0,"TY",0.00999800039992002,"TV",0.0999800039992002,"WA",0,"WR",0.08,"WN",0.01,"WD",0,"WC",0,"WQ",0,"WE",0,"WG",0,"WH",0.01,"WI",0,"WL",0.04,"WK",0,"WM",0,"WF",0.03,"WP",0,"WS",0.05,"WT",0,"WW",99.76,"WY",0.02,"WV",0,"YA",0.02000400080016,"YR",0,"YN",0.0400080016003201,"YD",0,"YC",0.03000600120024,"YQ",0,"YE",0.01000200040008,"YG",0,"YH",0.0400080016003201,"YI",0.01000200040008,"YL",0.02000400080016,"YK",0.01000200040008,"YM",0,"YF",0.28005601120224,"YP",0,"YS",0.02000400080016,"YT",0.02000400080016,"YW",0.01000200040008,"YY",99.4698939787958,"YV",0.02000400080016,"VA",0.18,"VR",0.01,"VN",0.01,"VD",0.01,"VC",0.02,"VQ",0.01,"VE",0.02,"VG",0.05,"VH",0.01,"VI",0.33,"VL",0.15,"VK",0.01,"VM",0.04,"VF",0,"VP",0.02,"VS",0.02,"VT",0.09,"VW",0,"VY",0.01,"VV",99.01);

#Generate random set of protein of defined length
my $length = 1000;
srand(time|$$);
#array to store random proteins
my @random_protein = ();

#Generate reference species
@random_protein = make_random_protein_set($length,1);

my $count = 0;

print "PAM,%identity, Sequence\n";

foreach my $eiwit (@random_protein){
$count = $count + 1;
#print ">$count,".length($eiwit)."\n";
print "0,100\t$eiwit\n";
}

#how many PAM do you have to simulate

my $maxpam = 2500;

for (my $pg=0;$pg<=$maxpam;$pg++) {

my $new_protein = "";
my $dif=0;

#simulate evolution

my $sum;
my $aa_evolution="";
my $amino;

#calculate evoluted sequence

for (my $aaa=0;$aaa<$length;$aaa++) {

$sum = 0;
$amino = substr($random_protein[$pg],$aaa,1);
#print "($aaa)".substr($random_protein[$pg],$aaa,1)."\n";

my $r = rand(1)*100;
my $found = 0;
my $key;
my %randompam;

#make random array of amino acids
foreach (@aa) {$randompam{$_}=int(rand(1)*1000);}
my @sortedaa = sort {$randompam{$a} <=> $randompam{$b} } keys %randompam;

foreach (@sortedaa) {
$key = $amino.$_;
$sum+=$PAM{$key};
#print "random number = $r\tsum = $sum\tKey =".$key."\tfound3 = $found3 \n";
if ($sum>=$r && $found==0) {
$found=1;$aa_evolution=$_;
#print "aa_evolution = $aa_evolution\n";
}
}
if ($amino ne $aa_evolution) { $dif++;};
$new_protein .= $aa_evolution;
}

$random_protein[$pg+1] = $new_protein;
#print "$pg,$dif,";
print ($pg+1);
print ",";
#print "new protein = ".$new_protein."\n";
#calculate total difference
my $tdif=0;
for (my $at=0;$at<$length;$at++) {
if (substr($random_protein[$pg+1],$at,1) ne substr($random_protein[0],$at,1)) {$tdif++;}
}
my $percent = ($length-$tdif)*100/$length;
print $percent."\t$random_protein[$pg+1]\n";
#print $percent."\n";
}

##################################################
#### subroutines for the random protein generation
##################################################

sub make_random_protein_set {

my($length,$size_of_set) = @_;
my $p;
my @set;
for (my $i = 0; $i < $size_of_set ; ++$i) {

$p = make_random_protein($length);
push(@set,$p);
}
return @set;
}

sub make_random_protein {
my($length) = @_;
my $p;
for (my $i=0 ; $i < $length ; ++$i ){
$p .= randomaminoacid();
}
return $p;
}

sub randomaminoacid {
my(@aa) = ('A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V');
return randomelement(@aa);
}

sub randomelement {
my (@array) = @_;
return $array[rand @array];
}