Trim.pl
From Bioinformatics Core Wiki
#!/usr/bin/perl
# AUTHOR: Nik Joshi
# LAST REVISED: September 2010
#
# The Bioinformatics Core at UC Davis Genome Center
# http://bioinformatics.ucdavis.edu
# Copyright (c) 2010 The Regents of University of California, Davis Campus.
# All rights reserved.
use Getopt::Long;
use strict;
if ($#ARGV < 0) {
print STDERR <<EOF;
Options:
--type <num> 0=standard trimming, 1=adaptive trimming, 2=windowed adaptive trimming. Default 0
--qual-threshold <num> quality threshold for trimming, default 20
--length-threshold <num> length threshold for trimming, default 20
--qual-type <num> 0=sanger qualities, 1=illumina qualities pipeline>=1.3, 2=illumina qualities pipeline<1.3. Default 0.
--pair1 <paired end input filename> fastq, paired end file. Must have same number of records as pair2. Required.
--pair2 <paired end input filename> fastq, paired end file. Must have same number of records as pair1. Required.
--outpair1 <paired end output file> Required.
--outpair2 <paired end output file> Required.
--single <single end output file> Required.
EOF
exit(1);
}
my $type = 0;
my $qual_threshold = 20;
my $length_threshold = 20;
my $qual_type = 0;
my ($pair1, $pair2, $outpair1, $outpair2, $single);
GetOptions ('type=i' => \$type,
'qual-threshold=i' => \$qual_threshold,
'length-threshold=i' => \$length_threshold,
'qual-type=i' => \$qual_type,
'pair1=s' => \$pair1,
'pair2=s' => \$pair2,
'outpair1=s' => \$outpair1,
'outpair2=s' => \$outpair2,
'single=s' => \$single);
if (!defined $pair1 || !defined $pair2 || !defined $outpair1 || !defined $outpair2 || !defined $single) {
print STDERR "Error: You must define pair1, pair2, outpair1, outpair2, and single\n";
exit(1);
}
sub get_quality_num {
my ($qual_char, $qual_type) = @_;
if ($qual_type == 0) {
return (ord($qual_char) - 33);
}
elsif ($qual_type == 1) {
return (ord($qual_char) - 64);
}
elsif ($qual_type == 2) {
return (10 * log(1 + 10 ** ((ord($qual_char) - 64) / 10.0)) / log(10));
}
}
open (PAIR1, "<$pair1") or die "Cannot access file $pair1.\n";
open (PAIR2, "<$pair2") or die "Cannot access file $pair2.\n";
open (OUTPAIR1, ">$outpair1");
open (OUTPAIR2, ">$outpair2");
open (SINGLE, ">$single");
my ($count, $p1_header1, $p1_seq, $p1_header2, $p1_qual, $p2_header1, $p2_seq, $p2_header2, $p2_qual);
my ($pos, $p1_flag, $qual_char, $qualnum, $p1_cut, $p2_flag, $p2_cut);
my ($paired_cnt, $single_cnt, $discard_cnt) = (0,0,0);
my ($i, $window_total, $window_size, $window_start, @p1_qual_data, @p2_qual_data);
my ($trimmed_p1_seq, $trimmed_p1_qual, $trimmed_p2_seq, $trimmed_p2_qual);
$count = 0;
while ($p1_header1 = <PAIR1>) {
$p1_seq = <PAIR1>;
$p1_header2 = <PAIR1>;
$p1_qual = <PAIR1>;
chomp $p1_seq;
chomp $p1_qual;
$p2_header1 = <PAIR2>;
$p2_seq = <PAIR2>;
$p2_header2 = <PAIR2>;
$p2_qual = <PAIR2>;
chomp $p2_seq;
chomp $p2_qual;
if ($type == 0) {
$trimmed_p1_seq = substr ($p1_seq, 0, $length_threshold);
$trimmed_p1_qual = substr ($p1_qual, 0, $length_threshold);
print OUTPAIR1 "$p1_header1$trimmed_p1_seq\n$p1_header2$trimmed_p1_qual\n";
$trimmed_p2_seq = substr ($p2_seq, 0, $length_threshold);
$trimmed_p2_qual = substr ($p2_qual, 0, $length_threshold);
print OUTPAIR2 "$p2_header1$trimmed_p2_seq\n$p2_header2$trimmed_p2_qual\n";
$paired_cnt++;
}
elsif ($type == 1) {
$pos=0;
$p1_flag=1;
foreach $qual_char (split (//, $p1_qual)) {
$qualnum = get_quality_num ($qual_char, $qual_type);
if ($qualnum < $qual_threshold) {
$p1_cut = $pos;
if ($pos < $length_threshold) {
$p1_flag = 0;
}
last;
}
$pos++;
}
$pos=0;
$p2_flag=1;
foreach $qual_char (split (//, $p2_qual)) {
$qualnum = get_quality_num ($qual_char, $qual_type);
if ($qualnum < $qual_threshold) {
$p2_cut = $pos;
if ($pos < $length_threshold) {
$p2_flag = 0;
}
last;
}
$pos++;
}
if ($p1_flag == 1 && $p2_flag == 1) {
$trimmed_p1_seq = substr ($p1_seq, 0, $p1_cut);
$trimmed_p1_qual = substr ($p1_qual, 0, $p1_cut);
print OUTPAIR1 "$p1_header1$trimmed_p1_seq\n$p1_header2$trimmed_p1_qual\n";
$trimmed_p2_seq = substr ($p2_seq, 0, $p2_cut);
$trimmed_p2_qual = substr ($p2_qual, 0, $p2_cut);
print OUTPAIR2 "$p2_header1$trimmed_p2_seq\n$p2_header2$trimmed_p2_qual\n";
$paired_cnt++;
}
elsif ($p1_flag == 1 && $p2_flag == 0) {
$trimmed_p1_seq = substr ($p1_seq, 0, $p1_cut);
$trimmed_p1_qual = substr ($p1_qual, 0, $p1_cut);
print SINGLE "$p1_header1$trimmed_p1_seq\n$p1_header2$trimmed_p1_qual\n";
$single_cnt++;
}
elsif ($p1_flag == 0 && $p2_flag == 1) {
$trimmed_p2_seq = substr ($p2_seq, 0, $p2_cut);
$trimmed_p2_qual = substr ($p2_qual, 0, $p2_cut);
print SINGLE "$p2_header1$trimmed_p2_seq\n$p2_header2$trimmed_p2_qual\n";
$single_cnt++;
}
else {$discard_cnt++;}
}
# sliding window
elsif ($type == 2) {
$window_size = int (0.1 * length ($p1_qual));
$window_start = 0;
$window_total = 0;
$p1_flag = 1;
$p1_cut = length($p1_qual);
@p1_qual_data = split (//, $p1_qual);
for ($i=0; $i<$window_size; $i++) {
$window_total += get_quality_num ($p1_qual_data[$i], $qual_type);
}
for ($i=0; $i<length($p1_qual); $i++) {
# if the average quality in the window is less than the threshold
# or if the window is the last window in the read
if ($window_total / $window_size < $qual_threshold || $window_start+$window_size > length ($p1_qual)) {
# at what point in the window does the quality dip below the threshold?
for (my $j=$window_start; $j<$window_start+$window_size; $j++) {
if (get_quality_num ($p1_qual_data[$j], $qual_type) < $qual_threshold) {
$p1_cut = $j;
if ($p1_cut < $length_threshold) {$p1_flag = 0;}
last;
}
}
last;
}
# instead of sliding the window, subtract the first qual and add the next qual
$window_total -= get_quality_num ($p1_qual_data[$window_start], $qual_type);
$window_total += get_quality_num ($p1_qual_data[$window_start+$window_size], $qual_type);
$window_start++;
}
$window_size = int (0.1 * length ($p2_qual));
$window_start = 0;
$window_total = 0;
$p2_flag = 1;
$p2_cut = length($p2_qual);
@p2_qual_data = split (//, $p2_qual);
for ($i=0; $i<$window_size; $i++) {
$window_total += get_quality_num ($p2_qual_data[$i], $qual_type);
}
for ($i=0; $i<length($p2_qual); $i++) {
if ($window_total / $window_size < $qual_threshold || $window_start+$window_size > length ($p2_qual)) {
# at what point in the window does the quality dip below the threshold?
for (my $j=$window_start; $j<$window_start+$window_size; $j++) {
if (get_quality_num ($p2_qual_data[$j], $qual_type) < $qual_threshold) {
$p2_cut = $j;
if ($p2_cut < $length_threshold) {$p2_flag = 0;}
last;
}
}
last;
}
# instead of sliding the window, subtract the first qual and add the next qual
$window_total -= get_quality_num ($p2_qual_data[$window_start], $qual_type);
$window_total += get_quality_num ($p2_qual_data[$window_start+$window_size], $qual_type);
$window_start++;
}
my ($trimmed_p1_seq, $trimmed_p1_qual, $trimmed_p2_seq, $trimmed_p2_qual);
if ($p1_flag == 1 && $p2_flag == 1) {
$trimmed_p1_seq = substr ($p1_seq, 0, $p1_cut);
$trimmed_p1_qual = substr ($p1_qual, 0, $p1_cut);
print OUTPAIR1 "$p1_header1$trimmed_p1_seq\n$p1_header2$trimmed_p1_qual\n";
$trimmed_p2_seq = substr ($p2_seq, 0, $p2_cut);
$trimmed_p2_qual = substr ($p2_qual, 0, $p2_cut);
print OUTPAIR2 "$p2_header1$trimmed_p2_seq\n$p2_header2$trimmed_p2_qual\n";
$paired_cnt++;
}
elsif ($p1_flag == 1 && $p2_flag == 0) {
$trimmed_p1_seq = substr ($p1_seq, 0, $p1_cut);
$trimmed_p1_qual = substr ($p1_qual, 0, $p1_cut);
print SINGLE "$p1_header1$trimmed_p1_seq\n$p1_header2$trimmed_p1_qual\n";
$single_cnt++;
}
elsif ($p1_flag == 0 && $p2_flag == 1) {
$trimmed_p2_seq = substr ($p2_seq, 0, $p2_cut);
$trimmed_p2_qual = substr ($p2_qual, 0, $p2_cut);
print SINGLE "$p2_header1$trimmed_p2_seq\n$p2_header2$trimmed_p2_qual\n";
$single_cnt++;
}
else {$discard_cnt++;}
}
$count++;
if ($count % 10000 == 0) {
print STDERR "Records: $count, Paired: $paired_cnt, Single: $single_cnt, Discarded: $discard_cnt \r";
}
}
print STDERR "Records: $count, Paired: $paired_cnt, Single: $single_cnt, Discarded: $discard_cnt \n";
close (PAIR1);
close (PAIR2);
close (OUTPAIR1);
close (OUTPAIR2);
close (SINGLE);