#!/usr/bin/perl -w
#
# normalizeSpace: Normalize whitespace in the input (not including CR/LF)
#
# 2007-10-??: Written by Steven J. DeRose.
# 2012-09-07: bash-->Perl. Unicode support.
#     Add -control, -dash, -privateUse, -quote, -space.
#
# To do:
#
use strict;
use warnings;
use Getopt::Long;
use Encode;

our $VERSION = "2012-09-07";

# Option settings. These are the defaults; parse_options() overwrites them
# from the command line.
my $control    = 0;      # -control:    control characters become spaces
my $dash       = 0;      # -dash:       collapse dash runs, drop soft hyphens
my $iencoding  = "";     # -iencoding:  encoding name; "" means raw bytes
my $ilineends  = "U";    # -ilineends:  U(nix), D(os) or M(ac) input line ends
my $privateUse = 0;      # -privateUse: delete private-use characters
my $quote      = 0;      # -quote:      curly quotation marks become '"'
my $space      = 1;      # -space:      collapse runs of spaces (default on)
my $verbose    = 0;      # -v:          repeatable verbosity level


###############################################################################
# parse_options()
#
# Read the command-line options from @ARGV into the option variables above,
# leaving the file names in @ARGV. Dies with "Bad options." on a bad option.
# -help, -listEncodings and -version each print their message and exit.
#
sub parse_options {
    my %getoptHash = (
        "control!"      => \$control,
        "dash!"         => \$dash,
        # List-form system() so that $0 is never interpreted by a shell.
        "h|help"        => sub { system("perldoc", $0); exit; },
        "iencoding=s"   => \$iencoding,
        "ilineends=s"   => \$ilineends,
        "listEncodings" => sub {
            warn "\nEncodings available:\n";
            for my $k (Encode->encodings(":all")) {
                warn " $k\n";
            }
            exit;
        },
        "privateUse!"   => \$privateUse,
        "quote!"        => \$quote,
        "space!"        => \$space,
        # The callback receives (option name, value); the value is 0 for
        # -nounicode, which must not switch utf8 on.
        "unicode!"      => sub {
            my (undef, $enabled) = @_;
            $iencoding = $enabled ? "utf8" : "";
        },
        "v+"            => \$verbose,
        # Getopt::Long traps die() inside a callback and reports a failed
        # parse, so report with warn() and exit explicitly instead.
        "version"       => sub {
            warn "Version of $VERSION, by Steven J. DeRose.\n";
            exit;
        },
    );

    Getopt::Long::Configure("ignore_case");
    GetOptions(%getoptHash) || die "Bad options.\n";

    # Only the first letter of -ilineends matters; anything else means Unix.
    $ilineends = uc(substr($ilineends . "U", 0, 1));
    return;
}


###############################################################################
# normalize_record($rec, %override)
#
# Return a cleaned-up copy of one input record (a line without its line end).
# The enabled steps run in a fixed order: control, privateUse, dash, quote,
# space. Each step is switched by the matching command-line option; a caller
# may override any of them for this call by passing name => boolean pairs
# (control, privateUse, dash, quote, space).
#
sub normalize_record {
    my ($rec, %override) = @_;
    my %enabled = (
        control    => $control,
        privateUse => $privateUse,
        dash       => $dash,
        quote      => $quote,
        space      => $space,
        %override,
    );

    if ($enabled{control}) {
        # Done first so the resulting spaces are collapsed by the space step.
        $rec =~ s/\p{Control}/ /g;
    }
    if ($enabled{privateUse}) {
        # The BMP private-use area plus the two supplementary planes.
        $rec =~ s/[\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}]//g;
    }
    if ($enabled{dash}) {
        $rec =~ s/\xAD//g;                      # soft hyphen
        $rec =~ s/\p{Dash_Punctuation}+/-/g;
    }
    if ($enabled{quote}) {
        $rec =~ s/[\p{Initial_Punctuation}\p{Final_Punctuation}]/"/g;
    }
    if ($enabled{space}) {
        # TAB is a control character, not a Unicode Separator, so it has to
        # be named explicitly to count as white space here.
        $rec =~ s/[\p{Separator}\t]+/ /g;
        $rec =~ s/^ //;
        $rec =~ s/ $//;
    }
    return $rec;
}


###############################################################################
# open_input($file)
#
# Open $file for reading and return the handle; "-" means standard input.
# The name is never interpreted as an open mode or a pipe. When -iencoding
# was given, the handle decodes from that encoding. Dies if the open fails.
#
sub open_input {
    my ($file) = @_;
    my $fh;
    if ($file eq "-") {
        # Duplicate STDIN so that closing $fh leaves STDIN itself alone.
        open($fh, "<&", \*STDIN)
            || die "Failed to open input file '$file': $!\n";
    }
    else {
        open($fh, "<", $file)
            || die "Failed to open input file '$file': $!\n";
    }
    if ($iencoding) {
        binmode($fh, ":encoding($iencoding)");
    }
    return $fh;
}


###############################################################################
###############################################################################
# Main
#
# Normalize every record of every file named on the command line (or of
# standard input when none is named) and print the results to STDOUT, each
# terminated by a newline.
#
sub main {
    parse_options();

    if ($iencoding) {
        Encode::find_encoding($iencoding)
            || die "Unknown encoding '$iencoding' (try -listEncodings).\n";
        # Once only: each further :encoding() layer pushed onto STDOUT would
        # encode the output another time.
        binmode(STDOUT, ":encoding($iencoding)");
    }

    # Input record separator for the requested line-end convention.
    local $/ = ($ilineends eq "M") ? chr(13)
             : ($ilineends eq "D") ? chr(13) . chr(10)
             :                       $/;

    if (scalar(@ARGV) <= 0) {
        push @ARGV, "-";
        if (-t STDIN) {
            warn "Waiting on STDIN...\n";
        }
    }

    my $total = 0;
    # A for loop rather than "while (shift)", so a file named "0" is not
    # mistaken for the end of the list.
    for my $file (@ARGV) {
        my $fh = open_input($file);
        my $recnum = 0;
        while (my $rec = <$fh>) {
            $recnum++;
            chomp $rec;
            print normalize_record($rec), "\n";
        }
        close($fh);
        $total += $recnum;
        if ($verbose > 1) {
            warn "$file: $recnum records.\n";
        }
    }
    if ($verbose) {
        warn "Done, $total records processed.\n";
    }
    return;
}

# Run only when executed as a script, so the subs can be loaded on their own.
main() unless caller;

1;


###############################################################################
###############################################################################
###############################################################################
#

=pod

=head1 Usage

normalizeSpace [options] file

Reduce runs of whitespace to a single space, and drop leading and
trailing whitespace.

If you specify I<-iencoding utf8>, all Unicode "Separator" characters count
as spaces. These include regular space, non-breaking space,
en, em, thin, and other width spaces (U+20xx), etc.
TAB is counted as a space as well.
CR and LF are control characters rather than Separators, so they are only
affected as line ends (see I<-ilineends>) or by I<-control>.

=head1 Options

(prefix 'no' to negate where applicable)

=over

=item * B<-control>

Turn each control character (U+0000 to U+001F, U+007F, and U+0080 to U+009F)
to a space. This happens before I<-space>.

B<Note>: If the "upper" control characters are used for graphical characters,
as in Windows CP1252 (but not Unicode or most other encodings), this option
will turn them to spaces. If you have this problem, either translate your
data out of CP1252, or specify I<-iencoding cp1252>.

=item * B<-dash>

Turn all dash character sequences to a single dash, except for soft hyphens
(U+00AD), which are deleted.

=item * B<-iencoding> I<name>

Specify character encoding for input.

=item * B<-ilineends> I<type>

Assume Unix, Dos, or Mac line-breaks for input.
Only the first letter of I<type> is significant: U (the default), D, or M.

=item * B<-listEncodings>

Show all available character encodings, then exit.

=item * B<-privateUse>

Delete all Unicode private-use characters.

=item * B<-quote>

Turn each Unicode initial (open) or final (close) quotation mark to '"'.
Apostrophe and backquote are not changed.

=item * B<-space>

Turn each run of one or more Unicode Separator characters to a single space.
This is the default behavior; use I<-nospace> to turn it off.

=item * B<-unicode>

Synonym for I<-iencoding utf8>.

=item * B<-v>

Add more messages (repeatable).

=item * B<-version>

Show version info and exit.

=back

=head1 Known Bugs and Limitations

Without I<-iencoding>, input is processed as raw bytes, each treated as a
single character. Multi-byte UTF-8 input can then be damaged (for example,
a byte 0xA0 inside a longer sequence is taken for a no-break space), so
specify I<-iencoding utf8> (or I<-unicode>) for UTF-8 data.

=head1 Related commands

=for comment The two command names in the next paragraph were lost from this copy of the text and need to be restored.

C provides much more extensive normalization and tokenization capabilities.
C does some similar cleanups.

=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see L<http://creativecommons.org/licenses/by-sa/3.0/>.

The author's present email is sderose at acm.org.

For the most recent version, see L<http://www.derose.net/steve/utilities/>.

=cut