#!/usr/bin/perl
use strict;

my %seq_names;
my %org_seqs;

my %org_ct;

open(OUT,">all_16S_rRNA_coords.txt")||die;

while(my $gff=<gff_renamed/*.gff>){
	my $i;
	my %id_seq;my $fna=$gff; $fna=~s/\.gff/\.fna/;$fna=~s/gff_renamed/fna_renamed/;
	open(IN,$fna)||die;
	my $firstline=<IN>; $firstline=~/^>(\S+)/;my $id=$1;my $seq;
	while(my $line=<IN>){
		if($line=~/^>/){
			$id_seq{$id}=$seq;
			$line=~/^>(\S+)/;$id=$1;
			$seq="";
		}else{
			chomp $line;
			$seq.=$line;
		}
	}
	$id_seq{$id}=$seq;
	close IN;

	$gff=~/gff_renamed\/(.+)\.gff/;
	my $org=$1;
	open(GFF,$gff)||die;
	while(my $line=<GFF>){
		if($line!~/^#/){
			my @tmp=split("\t",$line);
#			print $tmp[2]."\n";
			if($tmp[2] eq "rRNA"){
				if($line=~/16S/){
				my $size=abs($tmp[3]-$tmp[4])+1;
				if($size>1000){
					my $seq=substr($id_seq{$tmp[0]},$tmp[3]-1,$size);
					$seq=($tmp[6] eq "-")? $seq : rc($seq);
					print OUT join("\t",($org,$tmp[0],$tmp[2],$tmp[3],$tmp[4],$tmp[6],$size,$seq))."\n";
					push(@{$org_seqs{$org}},$seq);
					push(@{$seq_names{$seq}},$org);					
					$org_ct{$org}++;
				}
				}
			}
		}
	}
	close GFF;
}
close OUT;


open(SUM,">all_16S_rRNA.summary")||die;
open(UNI,">all_16S_rRNA_unique.fasta")||die;
open(UN1,">all_16S_rRNA_unique_id.fasta")||die;
open(PG,">PG_16S_rRNA_unique.fasta")||die;
open(PG1,">PG_16S_rRNA_unique_id.fasta")||die;
open(NAME2,">PG_id_name.txt")||die;
open(NAME1,">all_id_name.txt")||die;
my $u;my $p;
my %org_ct1;
foreach my $seq (keys %seq_names){
	my $i;$u++;
	my $t=scalar(@{$seq_names{$seq}});
	print SUM join("\t",("UniqSeq.$u",$t,length($seq),join(";",@{$seq_names{$seq}})))."\n";
	my @tmp;
	foreach my $org(@{$seq_names{$seq}}){
		my $org1=$org;
		if($org_ct{$org}>1){
			my $s=$org_ct1{$org}+1;
			$org1=$org."($s/".$org_ct{$org}.")";
		}
			$org_ct1{$org}++;
			push(@tmp,$org1);
	}
	my $joint=join(";",sort @tmp);
	my %org_ct;
	foreach my $org(@tmp){
		$org=~s/\(.+\)//;
		$org_ct{$org}++;
	}
	my @nnn;
	foreach my $org(sort{$a cmp $b} keys %org_ct){
		my $t=scalar(@{$org_seqs{$org}});
		push(@nnn,$org."(".$org_ct{$org}."/".$t.")");
	}
	my $joint2=join(";",@nnn);

	print NAME1 "U$u\t$joint2\n";
	print UNI ">U$u.$joint\n";
	print UNI $seq."\n";
	print UN1 ">U$u\n";
	print UN1 $seq."\n";
	if($joint=~/P_gingivalis/||1){
		$joint=~s/P_gingivalis_//g;
		$p++;
		print NAME2 "U$p\t$joint\n";
		print PG ">U$p.$joint\n";
		print PG $seq."\n";
		print PG1 ">U$p\n";
		print PG1 $seq."\n";
	}
}
close SUM;
close UNI;
close PG;
close NAME1; close NAME2; close PG1; close UN1;

open(FAS,">all_16S_rRNA.fasta")||die;
open(NAM,">all_16S_rRNA_id_name.txt")||die;
my $c;
foreach my $org(sort{$a cmp $b} keys %org_seqs){
	my $t=scalar(@{$org_seqs{$org}});
	my $i;
	foreach my $seq (@{$org_seqs{$org}}){
		$i++;
		$c++;
		if($t>1){
		print FAS ">S".$c.".".$org."($i/$t)\n".$seq."\n";
		print NAM "S".$c."\t".$org."($i/$t)\n";
		}else{
		print FAS ">S".$c.".".$org."\n".$seq."\n";
		print NAM "S".$c."\t".$org."\n";
		}
	}
}

close FAS;
close NAM;


my %idname;
open(IN,"PG_id_name.txt")||die;
while(my $line=<IN>){
        chomp $line;
        my ($id, $name)=split("\t",$line);
        $idname{$id}=$name;
}
close IN;


#### MAFFT alignment

system("mafft PG_16S_rRNA_unique_id.fasta > PG_16S_rRNA_unique_id.mafft.fasta");




open(IN,"PG_16S_rRNA_unique_id.mafft.fasta")||die;
open(OUT,">PG_16S_rRNA_unique_name.mafft.fasta")||die;

while(my $line=<IN>){
        $line=~s/^>(\S+)/>$idname{$1}/;
        print OUT $line;
}
close IN;
close OUT;



#trim aligned sequences
open(IN,"PG_16S_rRNA_unique_name.mafft.fasta")||die;
open(OUT,">PG_16S_rRNA_unique_name.mafft.trimmed.fasta")||die;
my $first=<IN>; my $seq;
while(my $line=<IN>){
	if($line=~/^>/){
		substr($seq,0,101,'');
		$seq=substr($seq,0,1425);
		print OUT $first.$seq."\n";
		$first=$line;$seq="";
	}else{
		chomp $line;
		$seq.=$line;
	}
}
		substr($seq,0,101,'');
		$seq=substr($seq,0,1425);
		print OUT $first.$seq."\n";

close IN;
close OUT;





sub rc(){
    my $rc = reverse(shift);
    $rc =~ tr/ABCDGHMNRSTUVWXYabcdghmnrstuvwxy/TVGHCDKNYSAABWXRtvghcdknysaabwxr/;
    return $rc;
}
