#!/usr/bin/perl -w
use strict;
use warnings;

#-------------------Purpose of the script--------------------------
# aceChecker: checks the object names used in an .ace file against
# dictionaries of known names and reports every line whose name is not
# in the matching dictionary.  Written for Erich's gene-annotation
# .ace files; change the banner text in main() to describe your own use.

#-------------------Configuration----------------------------------
# One dictionary is built for each class of data to be checked.
# $DICT_DIR is the location of Wen's dictionary files; customize it to
# the location of your own dictionaries.
my $DICT_DIR = '/home/wen/dict';

# Each entry: [ internal key, label shown while loading, dictionary file ].
# The order here is the order in which the dictionaries are loaded.
my @DICTIONARIES = (
    [ 'locus',       'Locus',            "$DICT_DIR/l_Locus.ace" ],
    [ 'sequence',    'Sequence',         "$DICT_DIR/l_Sequence.ace" ],
    [ 'strange_seq', 'Strange Sequence', "$DICT_DIR/l_StrangeSeq.ace" ],
    [ 'gene_class',  'Gene_class',       "$DICT_DIR/l_Gene_class.ace" ],
);

# Which dictionary validates the data field that follows each tag.
# Lines starting with any other tag are not checked at all.
# Customize this table according to the data model of your .ace file.
my %DICTIONARY_FOR_TAG = (
    Locus            => 'locus',
    Other_name       => 'locus',
    Old_name         => 'locus',
    Sequence         => 'sequence',
    Genomic_sequence => 'sequence',
    Gene_Class       => 'gene_class',
    Gene_class       => 'gene_class',
);

main();

#------------------------Main program------------------------------
# Builds the dictionaries, asks for the input and output file names,
# then writes one report line to the output file for every input line
# whose data field is unknown or belongs to the "strange" sequences.
sub main {
    print "This program checks Locus and Sequecne objects.\n";
    print "This is designed for Erich's .ace files for gene annotation.\n";
    print "-*-------------*--------------*--------------*----------\n\n";

    # Build one lookup table per dictionary file.
    my %lookup_for;
    for my $dictionary (@DICTIONARIES) {
        my ($key, $label, $path) = @{$dictionary};
        print "Make $label dictionary ...";
        $lookup_for{$key} = _build_lookup(MakeDictionary($path));
        print "Done.\n";
    }

    # Dictionaries made, now ask which .ace file to check.
    my $ace_name = _prompt("What .ace file do you want to check? ");
    my $out_name = _prompt("What do you want to call the output file? ");
    print "Start checking ... \n\n";

    # Three-argument open: the user-typed names are always treated as
    # plain file names, never as open modes or pipes.
    open(my $in,  '<', $ace_name) or die "can't open $ace_name: $!";
    open(my $out, '>', $out_name) or die "can't open $out_name: $!";

    # The first line of the input is read but never checked; it is
    # still included in the line count.
    # NOTE(review): this matches the historical behaviour -- confirm
    # that the first line of the .ace file never carries data.
    my $first_line = <$in>;
    my $line_number = 1;

    while (my $line = <$in>) {
        $line_number++;    # Remember which line is being worked on.
        chomp $line;

        my $verdict = _classify_line($line, \%lookup_for);
        if ($verdict eq 'new') {
            # The data field was never found in any dictionary.
            print {$out} "Line $line_number is new: $line\n";
        }
        elsif ($verdict eq 'suspect') {
            # The data field is in the to-be-deleted dictionary;
            # this only applies to Sequence objects.
            print {$out} "Line $line_number sequence might be invalid: $line\n";
        }
    }

    close($in);
    # Buffered write errors only surface when the handle is closed.
    close($out) or die "can't close $out_name: $!";

    print "\n";
    print "Totally $line_number lines read. Thank you for using aceChecker!\n";
    return;
}

#------------------------Subroutines-------------------------------

# Prints $question, reads one line from standard input and returns it
# without the trailing newline.  Dies if standard input is exhausted.
sub _prompt {
    my ($question) = @_;
    print $question;
    my $answer = <STDIN>;
    die "no answer given (unexpected end of input)\n" unless defined $answer;
    chomp $answer;
    return $answer;
}

# Turns a list of dictionary entries into a hash reference keyed by
# entry, so membership can be tested with a single lookup instead of a
# scan over the whole list.  Undefined entries (dictionary lines that
# had no " : " separator) are ignored.
sub _build_lookup {
    my @entries = @_;
    my %is_known = map { $_ => 1 } grep { defined } @entries;
    return \%is_known;
}

# Decides what to report for one chomped input line.
#   $line       - the line to examine
#   $lookup_for - hash ref: dictionary key => lookup built by _build_lookup
# Returns 'ok' (nothing to report), 'new' (data field is in no
# dictionary) or 'suspect' (a sequence found only in the strange,
# to-be-deleted dictionary).
sub _classify_line {
    my ($line, $lookup_for) = @_;

    return 'ok' if $line eq '';

    # Cut the line at the double quotes: the text before the first quote
    # holds the tag, the text between the first two quotes is the data.
    my @parts = split /"/, $line;
    return 'ok' if @parts < 2;    # No quoted data field on this line.

    # The tag is the first whitespace-separated word before the quote.
    my ($tag) = split ' ', $parts[0];
    return 'ok' if !defined $tag;

    my $dictionary = $DICTIONARY_FOR_TAG{$tag};
    return 'ok' if !defined $dictionary;    # Tag is not one we check.

    # Dictionary entries keep their surrounding quotes, so put the
    # quotes back around the data field before looking it up.
    my $value = qq{"$parts[1]"};
    return 'ok' if exists $lookup_for->{$dictionary}{$value};

    # Special for Erich's .ace file: a sequence missing from the valid
    # dictionary may belong to a group of sequences that may be deleted
    # soon.  The valid and to-be-deleted dictionaries exclude each other.
    if ($dictionary eq 'sequence'
        && exists $lookup_for->{strange_seq}{$value}) {
        return 'suspect';
    }

    return 'new';
}

# Takes in any keyset dump and converts it to a dictionary.
#   $filename - path of the keyset dump
# The first line of the file is skipped.  Every following line is split
# on " : " and its second field becomes one dictionary entry (undef when
# the line has no such separator).
# Returns the list of entries in file order, or the single placeholder
# "null" when the file holds no lines after the first.
# Dies if the file cannot be opened.
sub MakeDictionary {
    my ($filename) = @_;
    print "file name : $filename \n";

    open(my $fh, '<', $filename) or die "can't open $filename: $!";

    my $first_line = <$fh>;    # Skipped: never becomes an entry.

    my @entries;
    while (my $line = <$fh>) {
        chomp $line;
        my @fields = split / : /, $line;
        push @entries, $fields[1];
    }
    close($fh);

    return @entries ? @entries : ('null');
}