SNPseqRetrieve.pl
From Bioinformatics Core Wiki
#!/usr/bin/perl
# AUTHOR: Joseph Fass
# LAST REVISED: September 2010
#
# The Bioinformatics Core at UC Davis Genome Center
# http://bioinformatics.ucdavis.edu
# Copyright (c) 2010 The Regents of University of California, Davis Campus.
# All rights reserved.
#####
# takes list of SNPs (e.g. lines from a variant pileup; pileup -cv)
# and retrieves flanking ~consensus~ sequence from full pileup (pileup -c).
#####
use Getopt::Std;
$usage = "\nusage: perl $0 [options] > SNPs.tsv <tab-delimited fields describing SNP sequences>\n\n".
"$0 reads in a pileup (e.g. 'samtools pileup'; from which to pull consensus sequence)\n".
"and lines from a variant pileup (consisting of filtered, selected SNPs,\n".
"and retrieves the SNP-flanking sequences (e.g. ATGC[C/T]CGTA) for those SNP positions.\n".
"\n".
"options:\n".
"-f # # of flanking bases to retrieve on each side of the SNP.\n".
" '-' indicates that no reads mapped to the position (if reference sequence not used),".
" or positions beyond reference sequence ends.\n".
"-p # use full consensus pileup #\n".
"-s # use selected variant pileup lines in file # for SNP positions / identities\n".
"-r # use reference sequence # <fasta> for positions not covered by bases from reads\n\n";
getopts('f:p:s:r:') or die $usage;
if (!defined($opt_f) or !($opt_f =~ m/^[0-9]+$/) or ($opt_f<0)) { $opt_f = 60 }
if (!defined($opt_p)) { die $usage }
if (!defined($opt_s)) { die $usage }
# read in pileup 'e.g. from samtools pileup -f ref.fasta alignment.bam'
open FID, "<$opt_p" or die $usage;
while ($line = <FID>) {
@line = split /\t/, $line;
# check if indel, and if good enough quality, replace existing base so that "+" or "-" later disqualifies SNP
if ($line[2] eq "*" and $line[5] > 5) {
$pile{$line[0]}{$line[1]} = $line[3]; # hash keyed by scaffold and position gets consensus as value
}
# otherwise, it's reference base position; assign to hash
else { $pile{$line[0]}{$line[1]} = $line[3] } # hash keyed by scaffold and position gets consensus as value
}
close FID;
print STDERR "\ndone with pileup reading\n\n";
# if desired, read in reference sequence
if (defined $opt_r) {
open FID, "<$opt_r" or die $usage;
while ($line = <FID>) {
if ($line =~ m/^>(\S+)/) { # header line; BWA will truncate at first whitespace??
$refName = $1;
}
else { # sequence line
chomp $line;
$ref{$refName} .= $line; # add sequence to hash of reference sequences
}
}
close FID;
print STDERR "\ndone with reference sequence reading\n\n";
}
# read SNPs list ... must be TSV of scaffold id, position, ref base, and SNP consensus
# for each SNP, output tab-delimited fields appropriate for Illumina Array Design Tool (plus ...)
open FID, "<$opt_s" or die $usage;
print "SNP_name\tSNP_sequence\tref_name\tposition\tSNP_quality\t#A\t#T\t#C\t#G\n";
while ($line = <FID>) {
@line = split /\t/, $line;
$tabOut = "";
# process pileup field to get allele frequencies
$SNPpile = $line[8];
$SNPpile =~ s/\^.//g; # get rid of read-start indicators, and quality character
$SNPpile =~ s/[\-\+](\d+)(??{".{$1}"})//g; # remove ins/del numbers and letters
$SNPpile =~ s/\$//g; # remove read-end indicators
$SNPpile =~ s/[\.,]/$line[2]/g; # convert "." and "," to reference base (which could be lc)
$SNPpile = uc $SNPpile; # convert to upper-case
$tabOut = "SNP_at_$line[0]:$line[1]\t"; # start header with ref sequence and position
for ($i=$line[1]-$opt_f; $i<$line[1]; $i++) {
if (defined $pile{$line[0]}{$i}) { # can pull from new consensus?
$tabOut .= $pile{$line[0]}{$i};
}
elsif (defined($nt = lc(substr($ref{$line[0]},$i-1,1)))) { # can pull from reference?
$tabOut .= $nt;
}
else { $tabOut .= "-" }
}
chomp $line[3];
if ($line[3] eq "Y") { $tabOut .= "[C/T]" }
elsif ($line[3] eq "R") { $tabOut .= "[A/G]" }
elsif ($line[3] eq "W") { $tabOut .= "[A/T]" }
elsif ($line[3] eq "S") { $tabOut .= "[C/G]" }
elsif ($line[3] eq "K") { $tabOut .= "[T/G]" }
elsif ($line[3] eq "M") { $tabOut .= "[C/A]" }
else { $tabOut .= "[problem:$line[3]]" }
for ($i=$line[1]+1; $i<=$line[1]+$opt_f; $i++) {
if (defined $pile{$line[0]}{$i}) { # can pull from new consensus?
$tabOut .= $pile{$line[0]}{$i};
}
elsif (defined($nt = lc(substr($ref{$line[0]},$i-1,1)))) { # can pull from reference?
$tabOut .= $nt;
}
else { $tabOut .= "-" }
}
$tabOut .= "\t$line[0]\t$line[1]";
$tabOut .= "\t$line[5]"; # add SNP quality
$allele{"A"}=0; $allele{"T"}=0; $allele{"C"}=0; $allele{"G"}=0; # initialize allele count hash
while (length($SNPpile)>0) {
$base = chop $SNPpile;
$allele{$base}++; # increment count of this base
}
foreach $k ("A","T","C","G") {
$tabOut .= "\t$allele{$k}"; # add base and count
}
$tabOut .= "\n";
# now, check for "-" or "+" or "*" in sequences ... disqualify if found
@tabOutFields = split(/\t/,$tabOut);
if (!($tabOutFields[1] =~ m/[\*\-\+NYRWSKM]/)) { print $tabOut }
}
close FID;