#!/usr/bin/perl -w # # Written 2010-03-23 by Steven J. DeRose. # 2012-04-04ff sjd: Integrate TabularFormats. Better reporting. Add -invert. # 2013-02-15 sjd: sjdUtils. TF. -numbered. Fix encoding options. # # To do: # Integrate token categories. # Option to tokenize/split the key. # Loose matching options? # use strict; use Getopt::Long; use Encode; use sjdUtils; use TabularFormats; our $VERSION = "2013-03-25"; # Option defaults # my $asciiOnly = 0; my $dict = "/usr/share/dict/words"; my $field = 0; my $ignoreCase = 0; my $iencoding = ""; my $iLineends = "U"; my $invert = 0; my $justCount = 0; my $numbered = 0; my $oencoding = ""; my $olineends = "U"; my $quiet = 0; my $tag = "pre"; my $tickInterval = 100000; my $verbose = 0; my $wordOnly = 0; ############################################################################### # Parse options # my %getoptHash = ( "asciiOnly!" => \$asciiOnly, "dict=s" => \$dict, "field=i" => \$field, "h|help" => sub { system "perldoc $0"; exit; }, "ignoreCase!" => \$ignoreCase, "iencoding=s" => \$iencoding, "iLineends=s" => \$iLineends, "invert!" => \$invert, "justCount!" => \$justCount, "listEncodings" => sub { warn "\nEncodings available:\n"; my $last = ""; my $buf = ""; for my $k (Encode->encodings(":all")) { my $cur = substr($k,0,2); if ($cur ne $last) { warn "$buf\n"; $last = $cur; $buf = ""; } $buf .= "$k "; } warn "$buf\n"; exit; }, "numbered!" => \$numbered, "oencoding=s" => \$oencoding, "olineends=s" => \$olineends, "q|quiet!" => \$quiet, "tag=s" => \$tag, "tickInterval=i" => \$tickInterval, "unicode" => sub { $iencoding = "utf8"; }, "v|verbose+" => \$verbose, "version" => sub { die "Version of $VERSION, by Steven J. DeRose.\n"; }, "wordOnly!" => \$wordOnly, ); (my $tfmt = new TabularFormats()) || dieCleanly("Could not construct TabularFormats instance."); $tfmt->addOptionsToGetoptLongArg(\%getoptHash); Getopt::Long::Configure ("ignore_case"); GetOptions(%getoptHash) || dieCleanly("Bad options."); ############################################################################### # Validate options # sjdUtils::setVerbose($verbose); (-r $dict) || die "Can't read dictionary '$dict'.\n"; ($field && $numbered) && die "Can't use -field and -numbered together.\n"; $iLineends = uc(substr($iLineends."U",0,1)); if ($iLineends eq "M") { $/ = chr(13); } elsif ($iLineends eq "D") { $/ = chr(13).chr(10); } else { } $olineends = uc(substr($olineends."U",0,1)); if ($olineends eq "M") { $\ = chr(13); } elsif ($olineends eq "D") { $\ = chr(13).chr(10); } else { } if ($oencoding) { print ""; binmode(STDOUT, ":encoding($oencoding)"); } ############################################################################### ############################################################################### # Main # my $recnum = 0; my $headRec = ""; my @headNames = undef; my $kept = 0; my $foundInDict = 0; my $hasSomeNonAscii = 0; my $hasAllNonAscii = 0; my $hasSomeNonWord = 0; my $hasSpace = 0; my $missingNumber = 0; # Load the dictionary # my %dwords = (); open(D, "<$dict") || die "Can't open dictionary '$dict'.\n"; if ($iencoding) { binmode(D, ":encoding($iencoding)"); } while (my $dword = ) { $dword = normalize($dword); $dwords{$dword}++; } ($quiet) || warn scalar(keys %dwords) . " words loaded from dictionary.\n"; # Set up input # if ($field) { $tfmt->open($ARGV[0], $iencoding) || die "Couldn't open '$ARGV[0]'."; if ($tfmt->getOption("header")) { $headRec = $tfmt->readHeader(); @headNames = $tfmt->parseHeader($headRec); } } else { binmode(STDIN, ":encoding($iencoding)"); } # Go # my $rec = undef; while (1) { $rec = ($field) ? $tfmt->readRecord() : <>; (defined $rec) || last; $recnum++; if ($tickInterval!=0 && $recnum % $tickInterval == 0) { warn "At record $recnum\n"; } # Extract the right data to check against dictionary chomp $rec; my $key; if ($field != 0) { my @fields = @{$tfmt->parseRecord($rec)}; if (scalar(@fields)<=$field) { die "Not enough fields (only " . scalar(@fields) . " at record $recnum:\n $rec\n"; } $key = $fields[$field]; } else { $key = $rec; if ($numbered) { $key =~ s/^\s*(\d+)\s+//; if (!$key) { $missingNumber++; } } } # Filter out any undesired records (multiple filters may apply) $key = normalize($key); my $discard = 0; if ($key =~ m/\P{IsASCII}/) { # Non-ASCII present $hasSomeNonAscii++; if ($key !~ m/\p{IsASCII}/) { $hasAllNonAscii++; } if ($asciiOnly || $wordOnly) { $discard = 1; } } if ($key =~ m/\W/) { # Non-word-chars present $hasSomeNonWord++; if ($key =~ m/\s/) { $hasSpace++; } if ($wordOnly) { $discard = 1; } } if (defined $dwords{$key}) { # In dictionary $foundInDict++; $discard = 1; } if ($discard == $invert) { $kept++; ($justCount) || print "$rec\n"; } } # EOF ($quiet) || report(); exit; ############################################################################### ############################################################################### # sub report { warn "Numbers of records:\n"; sjdUtils::pline("total", $recnum); ($recnum) || return; sjdUtils::pline("kept", $kept); sjdUtils::pline("found in dictionary", $foundInDict); sjdUtils::pline("w/ non-ASCII characters", $hasSomeNonAscii); sjdUtils::pline("w/ only non-ASCII characters", $hasAllNonAscii); sjdUtils::pline("w/ non-\\w characters", $hasSomeNonWord); sjdUtils::pline(" w/ space(s)", $hasSpace); if ($numbered) { sjdUtils::pline("w/o leading number", $missingNumber); } warn "Done after $recnum records, $kept kept.\n"; } sub normalize { my ($s) = @_; if ($ignoreCase) { $s = lc($s); } $s =~ s/\s//g; return($s); } ############################################################################### ############################################################################### ############################################################################### # =pod =head1 Usage dropDictionaryWords [options] [file] Copy the input, retaining only records for words that are *not* in the dictionary file (see I<-dict>). With I<-justCount>, simply report how many such words there are. Optionally, filter by character repertoire and/or by checking only a certain field rather than the whole record. =head1 Options (prefix 'no' to negate where applicable): All the C options are also available (see its doc). The most important ones are I<-fieldSep>, I<-quote>, I. =over =item * B<-asciiOnly> Discard (but count) entries with non-ASCII chars. =item * B<-dict> I Dictionary to use (default C). =item * B<-field> I If specified, only operate on field I (see TabularFormats.pm). =item * B<-iencoding> I Assume s as the character encoding for the input. =item * B<-ignoreCase> Ignore upper/lower case distinctions when looking words up in the dictionary. =item * B<-ilineends> I Assume Unix, Dos, or Mac line-breaks for input. =item * B<-invert> Keep the words that I in the dictionary, rather than the ones that aren't. =item * B<-justCount> Just count how many words are not in dictionary (don't copy them) =item * B<-listEncodings> Show the encodings supported by I<-iencoding> and I<-oencoding>, and exit. =item * B<-numbered> Strip leading numbers (in the form output by *nix C) from input (not dictionary) records) before comparison. Does not work with I<-field>. =item * B<-oencoding> I Use s as the character encoding for the output. =item * B<-olineends> I Use Unix, Dos, or Mac line-breaks for output. =item * B<-quiet> Shorten up the output. =item * B<-tickInterval> I Report progress every I records. =item * B<-unicode> Synonym for I<-iEncoding utf8>. =item * B<-verbose> More detailed output. =item * B<-version> Display version information and exit. =item * B<-wordOnly> Only keep words with only [-._'a-zA-z]. =back =head1 Known bugs and limitations =head1 Related commands c -- similar. C =head1 Ownership This work by Steven J. DeRose is licensed under a Creative Commons Attribution-Share Alike 3.0 Unported License. For further information on this license, see L. The author's present email is sderose at acm.org. For the most recent version, see L. =cut