SNPseqRetrieve.pl

From Bioinformatics Core Wiki
Jump to: navigation, search


#!/usr/bin/perl

# AUTHOR: Joseph Fass
# LAST REVISED: September 2010
#
# The Bioinformatics Core at UC Davis Genome Center
# http://bioinformatics.ucdavis.edu
# Copyright (c) 2010 The Regents of University of California, Davis Campus.
# All rights reserved.

#####
# takes list of SNPs (e.g. lines from a variant pileup; pileup -cv)
# and retrieves flanking ~consensus~ sequence from full pileup (pileup -c).
#####

use Getopt::Std;
$usage = "\nusage: perl $0 [options] > SNPs.tsv <tab-delimited fields describing SNP sequences>\n\n".
         "$0 reads in a pileup (e.g. 'samtools pileup'; from which to pull consensus sequence)\n".
         "and lines from a variant pileup (consisting of filtered, selected SNPs,\n".
         "and retrieves the SNP-flanking sequences (e.g. ATGC[C/T]CGTA) for those SNP positions.\n".
         "\n".
         "options:\n".
         "-f #     # of flanking bases to retrieve on each side of the SNP.\n".
         "         '-' indicates that no reads mapped to the position (if reference sequence not used),".
         "         or positions beyond reference sequence ends.\n".
         "-p #     use full consensus pileup #\n".
         "-s #     use selected variant pileup lines in file # for SNP positions / identities\n".
         "-r #     use reference sequence # <fasta> for positions not covered by bases from reads\n\n";
getopts('f:p:s:r:') or die $usage;
if (!defined($opt_f) or !($opt_f =~ m/^[0-9]+$/) or ($opt_f<0)) { $opt_f = 60 }
if (!defined($opt_p)) { die $usage }
if (!defined($opt_s)) { die $usage }

# read in pileup 'e.g. from samtools pileup -f ref.fasta alignment.bam'
open FID, "<$opt_p" or die $usage;
while ($line = <FID>) {
	@line = split /\t/, $line;
	# check if indel, and if good enough quality, replace existing base so that "+" or "-" later disqualifies SNP
	if ($line[2] eq "*" and $line[5] > 5) {
		$pile{$line[0]}{$line[1]} = $line[3];  # hash keyed by scaffold and position gets consensus as value
	}
	# otherwise, it's reference base position; assign to hash
	else { $pile{$line[0]}{$line[1]} = $line[3] }  # hash keyed by scaffold and position gets consensus as value
}
close FID;
print STDERR "\ndone with pileup reading\n\n";

# if desired, read in reference sequence
if (defined $opt_r) {
    open FID, "<$opt_r" or die $usage;
    while ($line = <FID>) {
        if ($line =~ m/^>(\S+)/) {  # header line; BWA will truncate at first whitespace??
            $refName = $1;
        }
        else {  # sequence line
            chomp $line;
            $ref{$refName} .= $line;  # add sequence to hash of reference sequences
        }
    }
    close FID;
    print STDERR "\ndone with reference sequence reading\n\n";
}

# read SNPs list ... must be TSV of scaffold id, position, ref base, and SNP consensus
# for each SNP, output tab-delimited fields appropriate for Illumina Array Design Tool (plus ...)
open FID, "<$opt_s" or die $usage;
print "SNP_name\tSNP_sequence\tref_name\tposition\tSNP_quality\t#A\t#T\t#C\t#G\n";
while ($line = <FID>) {
	@line = split /\t/, $line;
	$tabOut = "";
	# process pileup field to get allele frequencies
    $SNPpile = $line[8];
    $SNPpile =~ s/\^.//g;  # get rid of read-start indicators, and quality character
    $SNPpile =~ s/[\-\+](\d+)(??{".{$1}"})//g;  # remove ins/del numbers and letters
    $SNPpile =~ s/\$//g;  # remove read-end indicators
    $SNPpile =~ s/[\.,]/$line[2]/g;  # convert "." and "," to reference base (which could be lc)
    $SNPpile = uc $SNPpile;  # convert to upper-case
    $tabOut = "SNP_at_$line[0]:$line[1]\t";  # start header with ref sequence and position
	for ($i=$line[1]-$opt_f; $i<$line[1]; $i++) {
		if (defined $pile{$line[0]}{$i}) {  # can pull from new consensus?
			$tabOut .= $pile{$line[0]}{$i};
		}
		elsif (defined($nt = lc(substr($ref{$line[0]},$i-1,1)))) {  # can pull from reference?
            $tabOut .= $nt;
		}
		else { $tabOut .= "-" }
	}
	chomp $line[3];
	if ($line[3] eq "Y") { $tabOut .= "[C/T]" }
	elsif ($line[3] eq "R") { $tabOut .= "[A/G]" }
	elsif ($line[3] eq "W") { $tabOut .= "[A/T]" }
	elsif ($line[3] eq "S") { $tabOut .= "[C/G]" }
	elsif ($line[3] eq "K") { $tabOut .= "[T/G]" }
	elsif ($line[3] eq "M") { $tabOut .= "[C/A]" }
	else { $tabOut .= "[problem:$line[3]]" }
	for ($i=$line[1]+1; $i<=$line[1]+$opt_f; $i++) {
		if (defined $pile{$line[0]}{$i}) {  # can pull from new consensus?
            $tabOut .= $pile{$line[0]}{$i};
        }
        elsif (defined($nt = lc(substr($ref{$line[0]},$i-1,1)))) {  # can pull from reference?
            $tabOut .= $nt;
        }
        else { $tabOut .= "-" }
	}
	$tabOut .= "\t$line[0]\t$line[1]";
    $tabOut .= "\t$line[5]";  # add SNP quality
    $allele{"A"}=0; $allele{"T"}=0; $allele{"C"}=0; $allele{"G"}=0;  # initialize allele count hash
    while (length($SNPpile)>0) {
        $base = chop $SNPpile;
        $allele{$base}++;  # increment count of this base
    }
    foreach $k ("A","T","C","G") {
        $tabOut .= "\t$allele{$k}";  # add base and count
    }
	$tabOut .= "\n";
	# now, check for "-" or "+" or "*" in sequences ... disqualify if found
	@tabOutFields = split(/\t/,$tabOut);
	if (!($tabOutFields[1] =~ m/[\*\-\+NYRWSKM]/)) { print $tabOut }
}
close FID;