#!/usr/bin/perl -w
#
# grepfields: do 'grep', but only within certain fields of each record.
#
# 2008-08-09: Written by Steven J. DeRose, sderose@acm.org.
# 2011-01-12ff: Integrate csvFormat.pm. 
# 2011-02-19 sjd: Much cleaning. Organize options.
#
# To do:
#     Lots of testing.
#     Option like -o except to get whole fields.
#     Match-on-whole-fields option.
#     Field specs like cut: -f 1-3,12-15
#     Base for line-numbers and byte offsets.
#     with --only-matching, include all field delims? delims between?

use strict;
use Getopt::Long;
use csvFormat;

my $version = "2011-02-19";

# Options like regular 'grep' (named like the long grep options)
# These are listed in the same order as in the MacOS man page for 'grep'.
# They are named for the single-char name, "_", and the long name.
# Options that just set other options, merely have a comment here.
#
# Options marked with "###" are per-expr options. They are stored in the
# current toFind object; when an actual regex is set (say, via -e), it
# is also stored there, and a new toFind object is started, inheriting the
# option values then in effect (not including -e).
#
my $A_afterContext      = 0;       # num lines of following context to show
#   a_text                       # (sets binaryFiles=text)
my $B_beforeContext     = 0;       # num lines of preceding context to show
#   C_context                    # (raises afterContext and beforeContext)
my $b_byteOffset        = 0;       # print byte offset
my $binaryFiles         = "sniff"; # binary, without-match, text, (sniff)
# Color defaults on only when USE_COLOR is set and STDOUT is a terminal.
my $color               = (defined $ENV{"USE_COLOR"} && -t STDOUT) ? 1:0;
my $c_count             = 0;       # just print count
my $D_devices           = "read";  # read or skip (FIFO and socket files)
my $d_directories       = "read";  # read or skip or recurse

my $E_extendedRegexp    = 0;       ### (### means per-expr options)
my $e_regexp            = "";      ### The actual regex to look for
my $F_fixedStrings      = 0;       ###
my $f_file              = "";      ### Take patterns from a file
my $G_basicRexexp       = 1;       ### (sic: spelled "Rexexp" wherever it is used)

my $H_withFilename      = 0;       # always show filenames
#   h_noFilename                   # (clears H_withFilename)
#   help                           # (special)
#   I                              # (sets binaryFiles=without-match)

my $i_ignoreCase        = 0;       ###
my $L_filesWithoutMatch = 0;       ### list names of files that had no hits
my $l_filesWithMatches  = 0;       ### list names of files that had hits
my $m_maxCount          = 0;       # stop after this many hits (0 = no limit)

#   mmap                           # (not supported)
my $n_lineNumber        = 0;       # show line numbers
my $o_onlyMatching      = 0;       # show only matching portions
my $label               = "";      # 'filename' to use for STDIN.
#   line-buffered                  # (not supported)
my $P_perlRegexp        = 1;       ### Perl-style regular expressions
my $q_quiet             = 0;       # suppress the closing "Done, ..." summary
#   R_r_recursive                  # (same as --directories=recurse)
my $include             = "";      # only process paths matching this expr
my $exclude             = "";      # skip paths matching this pattern
my $s_noMessages        = 0;       # suppress "Could not find file" warnings
my $U_binary            = 0;       # treat all files as binary (else sniff)
my $u_unixByteOffsets   = 0;       # report as if Unix-style text (strip CR)
#   V_version                      # (special)
my $v_invertMatch       = 0;       ###
my $w_wordRegexp        = 0;       ###
my $x_lineRegexp        = 0;       ###
#   y                              # (same as -i)
my $Z_null              = 0;       # Write NUL instead of ":" after a filename.


# This object holds a regex, and the options in effect when it was specified.
#
# @exprs collects one toFind object per completed expression; $curTF is the
# object currently being filled in by option processing.
my @exprs = ();
# Direct method call instead of the indirect-object form "new toFind()",
# which Perl has to parse heuristically.
my $curTF = toFind->new();
# Seed the first object with the default values of the per-expr options.
$curTF->setRegexpType($E_extendedRegexp,
                      $F_fixedStrings,
                      $f_file,
                      $G_basicRexexp,
                      $P_perlRegexp,
                      $w_wordRegexp,
                      $x_lineRegexp,
                      $i_ignoreCase,
                      $L_filesWithoutMatch,
                      $l_filesWithMatches,
                      $v_invertMatch
    );

# Options not from 'grep'
#
my $base            = 10;         # Base to show offsets and line numbers in
my $combine         = "all";      # Require all, or just any, exprs to match?
#my $excludeFrom     = "";         # Path to a file listing patterns for -exclude
my $fileSep         = "\n--\n";   # Put between context-groups
my $oEncoding       = "";         # Output char encoding
my $onlyFields      = 0;          # Only return fields that matched.
my $verbose         = 0;          # More messages (repeatable)

# CSV format options
#
my $basicType     = "CSV";  # CSV, XML, or ARFF (validated after option parsing)
my $delim         = ""; # aka recordSep
my $escape        = "";
my $expect        = 0;      # expected number of fields per record (0 = unchecked)
my $header        = 0;      # each file starts with a header record
my @names         = ();     # field names supplied via -names
my $msQuoting     = 0;
my $quote         = "";
my $subfield      = "";
my $iencoding     = "";
my $ilineends     = "";


#############################################################################
# Process options
#
Getopt::Long::Configure ("no_ignore_case");
#
# Per-expr options are written straight into the current toFind object
# ($curTF), using the same hash keys the toFind package and the matching
# code read. Each -e completes the current object and starts a fresh copy.
# Negatable ("!") per-expr options store the value Getopt::Long passes in
# $_[1], so the "no" form really does switch the option off again.
#
my $result = GetOptions(
    # csvFormat (input) options
    "basicType=s"             => \$basicType,
    "delim=s"                 => \$delim,
    "escape=s"                => \$escape,
    "expect=n"                => \$expect,
    "header!"                 => \$header,
    "msquoting!"              => \$msQuoting,
    "names=s"                 => \@names,
    "quote=s"                 => \$quote,
    "iencoding=s"             => \$iencoding,
    "ilineends=s"             => \$ilineends,
    "unicode!"                => sub { if ($_[1]) { $iencoding = "utf8"; } },

    # grep-like options
    "A|after-context=n"    => \$A_afterContext,
    "a|text"               => sub { $binaryFiles = "text"; },
    "B|before-context=n"   => \$B_beforeContext,
    "C|context=n"          => sub {
        if ($_[1] > $A_afterContext)  { $A_afterContext  = $_[1]; }
        if ($_[1] > $B_beforeContext) { $B_beforeContext = $_[1]; }
    },
    "b|byte-offset!"       => \$b_byteOffset,
    # Needs "=s": without it the option is a flag and stores 1.
    "binaryFiles|binary-files=s"
                           => \$binaryFiles,
    "color|colour=s"       => \$color,
    "c|count!"             => \$c_count,
    "D|devices=s"          => \$D_devices,
    "d|directories=s"      => \$d_directories,

    "E|extended-regexp!"   => sub { $curTF->{"extendedRegexp"} = $_[1]; },
    "e|expr|regexp=s"      => sub {
        ($verbose) && warn "New expr: '$_[1]'\n";
        $curTF->{"expr"} = $_[1];
        push @exprs, $curTF;
        $curTF = toFind->new($curTF); # copy the options to a new object
    },
    "F|fixed-strings!"     => sub { $curTF->{"fixedStrings"} = $_[1]; },
    "f|file=s"             => sub { $curTF->{"file"} = $_[1]; },
    # The toFind object spells this key "basicRexexp"; match it.
    "G|basic-regexp!"      => sub { $curTF->{"basicRexexp"} = $_[1]; },

    "H|with-filename"      => \$H_withFilename,
    "h|no-filename"        => sub { $H_withFilename = 0; },
    "help|?"               => sub { system("perldoc", $0); exit; },
    "I!"                   => sub {
        if ($_[1]) { $binaryFiles = "without-match"; }
    },

    "i|ignore-case|y!"     => sub { $curTF->{"ignoreCase"} = $_[1]; },
    # -L and -l are also recorded in the file-level flags, because the
    # per-file reporting code reads those rather than the toFind object.
    "L|files-without-matches|files-without-match"
                           => sub {
        $curTF->{"filesWithoutMatch"} = 1;
        $L_filesWithoutMatch = 1;
    },
    "l|files-with-matches"
                           => sub {
        $curTF->{"filesWithMatches"} = 1;
        $l_filesWithMatches = 1;
    },
    "m|max|max-count=i"    => \$m_maxCount,

    "mmap"                 => sub { die "-mmap is not supported.\n"; },

    "n|line-number!"       => \$n_lineNumber,
    "o|only-matching!"     => \$o_onlyMatching,
    "label=s"              => \$label,
    "line-buffered"        => sub { die "-line-buffered is not supported.\n"; },
    "P|perl-regexp!"       => sub { $curTF->{"perlRegexp"} = $_[1]; },
    "q|quiet|silent!"      => \$q_quiet,
    "R|r|recursive!"       => sub {
        $d_directories = ($_[1]) ? "recurse" : "read";
    },
    "include=s"            => \$include,
    "exclude=s"            => \$exclude,
    "s|no-messages!"       => \$s_noMessages,
    "U|binary"             => \$U_binary,
    "u|unix-byte-offsets"  => \$u_unixByteOffsets,
    # warn+exit rather than die: a die inside a Getopt::Long callback is
    # reported as an option error and is followed by "Bad options.".
    "V|version"            => sub {
        warn "Version of $version, by Steven J. DeRose.\n";
        exit;
    },
    "v|invert-match!"      => sub { $curTF->{"invertMatch"} = $_[1]; },
    "w|word-regexp|wordRegexp!"
                           => sub { $curTF->{"wordRegexp"} = $_[1]; },
    "x|line-regexp|lineRegexp!"
                           => sub { $curTF->{"lineRegexp"} = $_[1]; },
    # "y" (see "i")
    "Z|null"               => \$Z_null,

    # Now, our extra options
    "combine=s"            => sub {
        ($_[1] =~ m/^(all|any)$/) ||
            die "Bad value '$_[1]' for -combine (use 'all' or 'any').\n";
        $combine = $_[1];
    },
    # A field spec is a string such as "1,3-5"; the value arrives in $_[1]
    # ($_[0] is the option name).
    "fields=s"             => sub { $curTF->{"fieldsToUse"} = $_[1]; },
    "fileSep=s"            => \$fileSep,
    "fRange|frange=s"      => sub {
        (my $fRange = $_[1]) =~ s/^\s+|\s+$//g;
        # Accept both "min,max" and "min..max".
        my @tokens = split(/\s*(?:,|\.\.)\s*/, $fRange);
        (scalar @tokens == 2) || die "Bad format for -fRange $fRange.\n";
        # Perl has no sscanf(); validate the tokens and convert with +0.
        my $numberExpr = qr/^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$/;
        for my $token (@tokens) {
            ($token =~ $numberExpr) ||
                die "Non-numeric value '$token' in -fRange $fRange.\n";
        }
        my $frMin = $tokens[0] + 0;
        my $frMax = $tokens[1] + 0;
        ($frMin > $frMax) && die "min/max for -fRange $fRange out of order.\n";
        $curTF->{frMin} = $frMin;
        $curTF->{frMax} = $frMax;
        $curTF->{gotfRange} = 1;
    },
    "oencoding=s"          => \$oEncoding,
    "verbose+"             => \$verbose,
    );

($result) || die "Bad options.\n";


###############################################################################
# Check options and prep expressions to search for
#
# The trailing toFind object has not been closed off by a -e. Keep it only
# if it still carries something to search for: a leftover expression or a
# numeric range.
if ((defined $curTF->{expr} && $curTF->{expr} ne "") || $curTF->{gotfRange}) {
    push @exprs, $curTF;
}
scalar(@exprs) || die "No expressions specified.\n";

($binaryFiles =~ m/^(binary|without-match|text|sniff)$/) ||
    die "Bad value '$binaryFiles' for -binaryFiles option.\n";

# Check validity of options for each expr
#
my $recnum = 0;
my $totalExprRecs = 0;
for my $i (0..$#exprs) {          # $#exprs, not scalar(@exprs): no overrun
    my $curTF = $exprs[$i];

    # Load the -f pattern file, if any, into {fileExprs}.
    # NOTE(review): nothing visible in this file consults {fileExprs} when
    # matching records yet.
    if ($curTF->{"file"}) {
        my $path = $curTF->{"file"};
        open(my $exprFH, "<", $path) || die
            "Can't find -f file '$path'.\n";
        $recnum = 0;
        while (my $ex = <$exprFH>) {
            # ($ex =~ m/^#/) || next;
            $recnum++;
            chomp $ex;
            push @{$curTF->{fileExprs}}, $ex;
        }
        close $exprFH;
        ($verbose) && warn "Loaded $recnum exprs from '$path'.\n";
        $totalExprRecs += $recnum;
    }

    if ($curTF->{wordRegexp} && $curTF->{lineRegexp}) {
        die "Can't specify both -word-regexp and -line-regexp.\n";
    }

    # Rewrite the stored pattern in place. A range-only object has an empty
    # pattern and must keep it, or the range test would never be reached.
    if (defined $curTF->{expr} && $curTF->{expr} ne "") {
        if ($curTF->{fixedStrings}) {
            $curTF->{expr} = quotemeta($curTF->{expr});
        }
        if ($curTF->{wordRegexp}) {
            $curTF->{expr} = "\\b(?:$curTF->{expr})\\b";
        }
        elsif ($curTF->{lineRegexp}) {
            $curTF->{expr} = "^(?:$curTF->{expr})\$";
        }
    }

    # Validate the field spec before expanding it: comma-separated field
    # numbers and/or from-to ranges, e.g. "1,3-5".
    my $fieldSpec = (defined $curTF->{fieldsToUse}) ? $curTF->{fieldsToUse} : "";
    ($fieldSpec =~ m/^\d+(-\d+)?(,\d+(-\d+)?)*$/) ||
        die "Bad -fields expression '$fieldSpec'.\n";

    # The expanded list is indexed by field number, so its last index is
    # the highest field referenced.
    my @fds = @{$curTF->expandFieldsToUse()};
    if ($expect > 0 && $#fds > $expect) {
        warn "Reference to field $#fds, but -expect is $expect.\n";
    }
} # per expr objects


###############################################################################
# Other option checks
#    
# Apply the requested output encoding to STDOUT. The name is wrapped in
# :encoding(...) so any encoding known to Perl works, not just the few
# names that happen to be valid PerlIO layers on their own.
if ($oEncoding) {
    binmode(STDOUT, ":encoding($oEncoding)") ||
        die "Can't set output encoding '$oEncoding'.\n";
}

#if ($excludeFrom) {
#    open(EX, "<$excludeFrom") || die
#        "Can't find -exclude-from file '$excludeFrom'.\n";

#    while (my $ex = <EX>) {
#        chomp $ex;
#        push @exclude, $ex;
#    }
#    close EX;
#}


###############################################################################
# Set up CSV handling. Actual headers are loaded per-file.
#
# Check the option combination before configuring anything.
if (scalar(@names)>0) {
    ($header) && die "Can't use both -names and -header.\n";
}

# Direct method call instead of the indirect-object "new csvFormat()".
my $csvf = csvFormat->new();
$basicType = uc($basicType);
($basicType =~ m/^(CSV|XML|ARFF)$/) ||
    die "Unknown -basicType '$basicType'\n";
setOptions($csvf);

my @headerNames = ();        # Must update manually if you change names
my %headerNamesHash = ();    # ...


###############################################################################
#
# Run-wide counters, updated by the main loop and the subs below.
my $nFiles   = 0;
my $nFound   = 0;
my $totLines = 0;
my $nBadFieldCount = 0;
my $nBinaryFiles = 0;
my $nDirectories = 0;
my $nPlainFiles = 0;

# $f stays a file-level variable because isRecordAHit() names the current
# file in its warnings.
my $f = "";
# defined(): a file literally named "0" must not end the loop.
while (defined($f = shift(@ARGV))) {
    if (! -e $f) {                          # Doesn't exist
        ($s_noMessages) || warn "Could not find file '$f'.\n";
        next;
    }
    if (-d $f) {                            # directory (dispatched first)
        doOneDir($f);
        next;
    }
    # Binary files are searched only when -a / binaryFiles=text asks for it.
    if (-B $f && $binaryFiles ne "text") {
        ($verbose) && warn "Binary file '$f'\n";
        # String comparison: "==" on these values is always true.
        ($binaryFiles eq "without-match") && next;
        $nBinaryFiles++;
        next;
    }
                                            # (handled devices here)

    ($verbose) && warn "Normal file '$f'\n";
    $nFiles++;
    doOneFile($f);
} # while shift

($q_quiet) ||
    warn "Done, $nFound hits, $totLines records, $nFiles files.\n";

exit;


###############################################################################
# Handle a directory named on the command line or found while recursing,
# according to the -directories setting:
#     skip     ignore it
#     read     hand it to doOneFile() as if it were a file
#     recurse  process every non-hidden entry, descending into subdirectories
# Entry names are joined to the directory path before they are tested or
# processed, since readdir() returns bare names.
#
sub doOneDir {
    my ($d) = @_;

    ($verbose) && warn "Directory '$d'\n";
    $nDirectories++;

    if ($d_directories eq "skip") { return; }
    if ($d_directories eq "read") { doOneFile($d); return; }
    if ($d_directories ne "recurse") {
        ($verbose) && warn "doOneDir: Not recursing on dir '$d'.\n";
        return;
    }

    # Test only a non-empty pattern: an empty one would make m// reuse the
    # last successful pattern (or match everything).
    if ($exclude ne "" && $d =~ m/$exclude/) { return; }

    # Lexical handle: a bareword handle would be clobbered by the recursive
    # calls below while the outer listing is still being read.
    my $dh;
    if (!opendir($dh, $d)) {
        ($s_noMessages) || warn "Couldn't open '$d'\n";
        return;
    }
    my @entries = sort grep { $_ !~ m/^\./ } readdir($dh);
    closedir($dh);

    for my $entry (@entries) {
        my $path = "$d/$entry";
        if (-d $path) { doOneDir($path); }
        else { doOneFile($path); }
    }
} # doOneDir


###############################################################################
#
# Search one file record by record and print hits according to the output
# options.
#
# Argument: the path of the file to read.
# Updates the run-wide counters $nPlainFiles, $totLines and $nFound.
#
# Output modes:
#     default  each hit (with optional filename / record number / byte
#              offset prefixes), plus context records when -A/-B/-C is set
#     -c       only the number of hits in this file
#     -l / -L  only the file name, when the file did / did not have a hit
#
sub doOneFile {
    my $f = $_[0];

    ($verbose) && warn "******* Starting file '$f'\n";
    $nPlainFiles++;

    if ($exclude && $f =~ m/$exclude/) {
        ($verbose>1) && warn "Excluding file '$f'.\n";
        return;
    }
    if ($include && $f !~ m/$include/) {
        ($verbose>1) && warn "Not including file '$f'.\n";
        return;
    }

    ($verbose>1) && warn "Not excluded or un-included: $f.\n";

    my $recnum = 0;
    my @beforeContextLines = ();
    my $stillToPrint = 0;
    my $hitsInThisFile = 0;
    my $offset = 0;              # byte offset of the start of the current record
    my $crCount = 0;             # records before the current one that had a CR

    # Records are chomped unless -U is set, so only then re-add a newline.
    my $eol = ($U_binary) ? "" : "\n";
    my $listingFiles = ($L_filesWithoutMatch || $l_filesWithMatches) ? 1:0;
    my $showRecords  = (!$c_count && !$listingFiles) ? 1:0;

    my $fh;
    if (!open($fh, "<", $f)) {
        ($s_noMessages) || warn "Unable to open file '$f'.\n";
        return;
    }
    ($verbose>1) && warn "Opened '$f'\n";

    if ($header) {
        # Read exactly one line. Passing <$fh> directly as an argument would
        # evaluate it in list context and swallow the whole file.
        my $headerLine = <$fh>;
        if (defined $headerLine) {
            $offset += length($headerLine);
            if (index($headerLine, "\r")>=0) { $crCount++; }
            my @headerNames = @{$csvf->parseHeader($headerLine)};
            if ($verbose) {
                warn "Read header, labels:\n";
                for my $i (1..(scalar(@headerNames)-1)) {
                    warn sprintf("  %4d '%s'\n", $i, $headerNames[$i]);
                }
            }
        }
    }

    while (my $rec = <$fh>) {
        $recnum++;
        $totLines++;
        # Measure before chomp so the offset includes line terminators.
        my $rawLength = length($rec);
        my $hasCR = (index($rec, "\r")>=0) ? 1:0;
        if (!$U_binary) { chomp $rec; }

        ($verbose>1) && warn "Checking rec $recnum: '$rec'\n";

        # inverted exprs are handled by isRecordAHit().
        my $hitText = isRecordAHit($rec, $f, $recnum);

        # Test truth, not definedness: a returned 0 means "no hit".
        if ($hitText) {
            $nFound++;
            $hitsInThisFile++;
            if ($showRecords) {
                if ($A_afterContext || $B_beforeContext) {
                    print "$fileSep"; # Separator line
                    for my $contextRec (@beforeContextLines) {
                        print "$contextRec$eol";
                    }
                }
                if ($H_withFilename)    { print $f . ($Z_null?chr(0):":"); }
                if ($n_lineNumber)      { print "$recnum:"; }
                if ($b_byteOffset) {
                    # -u only modifies what -b prints.
                    my $shown = ($u_unixByteOffsets) ? ($offset-$crCount) : $offset;
                    print "$shown:";
                }
                if ($o_onlyMatching) {
                    print "$hitText$eol";
                }
                else {
                    print "$rec$eol";
                }
            }
            @beforeContextLines = ();
            $stillToPrint = $A_afterContext;
        } # hitText
        elsif ($stillToPrint > 0) {
            # Trailing context for the previous hit.
            $stillToPrint--;
            ($showRecords) && print "$rec$eol";
        }
        elsif ($B_beforeContext>0) {
            # Remember the last n non-hit records in case a hit follows. The
            # hit itself is never stored, so it is not printed twice.
            push @beforeContextLines, $rec;
            if (scalar @beforeContextLines > $B_beforeContext) {
                shift @beforeContextLines;
            }
        }

        $offset  += $rawLength;
        $crCount += $hasCR;
        # -m limits the hits per file, not across the whole run.
        if ($m_maxCount>0 && $hitsInThisFile >= $m_maxCount) { last; }
    } # EOF
    close($fh);

    if ($listingFiles) {
        if (($L_filesWithoutMatch && !$hitsInThisFile) ||
            ($l_filesWithMatches  && $hitsInThisFile)) {
            print "$f" . (($c_count) ? ":$hitsInThisFile":"") . "\n";
        }
    }
    elsif ($c_count) {
        # -c alone: report the count, which used to be dropped entirely.
        my $prefix = ($H_withFilename) ? "$f:" : "";
        print "$prefix$hitsInThisFile\n";
    }
} # doOneFile


###############################################################################
# Test the record against all the expressions. With -combine "all" (the
# default) the record must satisfy every expression; with "any", at least
# one. An inverted expression is satisfied when none of its fields match.
#
# Arguments:
#     $rec     the record text
#     $path    (optional) file name to use in warnings; defaults to the
#              file-level $f
#     $recNo   (optional) record number to use in warnings; defaults to the
#              file-level $recnum
#
# Returns: 1 if the record is a hit, undef otherwise.
#
sub isRecordAHit {
    my ($rec, $path, $recNo) = @_;
    (defined $path)  || ($path  = $f);
    (defined $recNo) || ($recNo = $recnum);

    # Parse the line into fields
    my @fieldContents = @{$csvf->parseRecord($rec)};
    if ($expect && scalar(@fieldContents)!=$expect) {
        warn "File $path, record $recNo: has " . scalar(@fieldContents) .
            " fields, not expected $expect.\n";
        $nBadFieldCount++;
    }

    # Strings accepted as numbers for -fRange comparisons.
    my $numberExpr = qr/^\s*[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?\s*$/;

    # See which exprs the record matches.
    my $nMatches = 0;
    for my $curTF (@exprs) {
        my $someFieldMatched = 0;
        my @fds = @{$curTF->expandFieldsToUse()};

        # Compile the pattern once per expression, honoring -i.
        my $ex = $curTF->{expr};
        my $regex;
        if (defined $ex && $ex ne "") {
            $regex = ($curTF->{ignoreCase}) ? qr/($ex)/i : qr/($ex)/;
            ($verbose>3) && warn "    vs expr /$ex/\n";
        }

        # @fds is sparse and indexed by 1-based field number.
        for my $fNum (1..$#fds) {
            (defined $fds[$fNum]) || next;
            # Skip fields this record does not have.
            ($fNum <= scalar @fieldContents) || next;
            my $curField = $fieldContents[$fNum-1];
            (defined $curField) || next;
            ($verbose>2) && warn "  checking field #$fNum: '$curField'\n";

            if (defined $regex) {
                if ($curField =~ $regex) {
                    $someFieldMatched = 1; last;
                }
            }
            elsif ($curTF->{gotfRange}) {
                # Only numeric-looking fields can fall inside a range.
                ($curField =~ $numberExpr) || next;
                my $fieldValue = $curField + 0;
                if ($fieldValue >= $curTF->{frMin} &&
                    $fieldValue <= $curTF->{frMax}) {
                    $someFieldMatched = 1; last;
                }
            }
        } # per field

        if (($someFieldMatched  && !$curTF->{"invertMatch"}) ||
            (!$someFieldMatched && $curTF->{"invertMatch"})) {
            $nMatches++;
        }
    } # per expr

    # Now combine the results of all the individual matches.
    my $isHit = 0;
    if ($combine eq "all") {
        $isHit = ($nMatches >= scalar(@exprs)) ? 1:0;
    }
    elsif ($combine eq "any") {
        $isHit = ($nMatches > 0) ? 1:0;
    }
    ($isHit) && return(1);
    return;
} # isRecordAHit


###############################################################################
#
# Copy the command-line input-format options into the csvFormat object, and
# list the resulting settings when -verbose is on.
#
# Argument: the csvFormat object to configure.
# Only options that were actually given (non-empty / non-zero) are passed on.
#
sub setOptions {
    my ($csvf) = @_;
    if ($verbose) { $csvf->setVerbose(1); }

    if ($basicType)  { $csvf->setFormatOption("basicType", $basicType); }
    if ($delim)      { $csvf->setFormatOption("fieldSep",  $delim);  }
    if ($escape)     { $csvf->setFormatOption("escape",    $escape); }
    if ($expect)     { $csvf->setFormatOption("minFields", $expect); }    # ???
    if ($header)     { $csvf->setFormatOption("header",    $header); }
    if ($quote)      { $csvf->setFormatOption("quote",     $quote);  }
    if ($msQuoting)  { $csvf->setFormatOption("msQuoting", $msQuoting); }
    if ($subfield)   { $csvf->setFormatOption("subfieldSep",$subfield); }

    # Field numbers passed to setFieldName() start at 1, while @names is
    # indexed from 0, so field $i is named by $names[$i-1].
    for my $i (1..scalar(@names)) {
        $csvf->setFieldName($i, $names[$i-1]);
    }

    ($verbose) || return;

    warn "******* Input Format Options:\n";
    showOption($csvf, "basicType");
    showOption($csvf, "delim");
    showOption($csvf, "escape");
    #showOption($csvf, "expect");
    showOption($csvf, "header");
    showOption($csvf, "quote");
    showOption($csvf, "msQuoting");
    #showOption($csvf, "subfield");
} # setOptions

# Report one input-format option on STDERR. The name "delim" is looked up
# under the format object's own name for it, "fieldSep"; the message still
# shows the name as passed in.
sub showOption {
    my ($format, $label) = @_;
    my $lookupName = $label;
    $lookupName =~ s/delim/fieldSep/;
    my $setting = $format->getFormatOption($lookupName);
    my $message = sprintf("    Option %-12s   '%s'\n",
                          $label, showControls($setting));
    warn $message;
} # showOption


###############################################################################
###############################################################################
# This represents a single regex to search for, along with all the options
# in effect when it was specified. Option processing writes the relevant
# option values directly into a "current" toFind object (and creates a new
# toFind whenever it sees -expr).
#
package toFind;

# Constructor.
#     $class     the class name
#     $copyThis  (optional) an existing toFind whose option values are
#                copied into the new object
# The expression and the numeric range are never inherited: the new object
# always starts with an empty expr and no range.
#
sub new {
    my ($class, $copyThis) = @_;
    my $self = {
        version           => "2011-02-19",
        fieldsToUse       => "",     # field spec such as "1,3-5"
        extendedRegexp    => 0,
        fixedStrings      => 0,
        file              => 0,
        basicRexexp       => 0,      # (sic) key name kept for compatibility
        perlRegexp        => 1,
        wordRegexp        => 0,
        lineRegexp        => 0,
        ignoreCase        => 0,
        filesWithoutMatch => 0,
        filesWithMatches  => 0,
        maxCount          => 0,
        invertMatch       => 0,
        gotfRange         => 0,
        frMin             => 0.0,
        frMax             => 0.0,
    };
    if (defined $copyThis) {
        # Shallow copy: reference values are shared with the original.
        for my $k (keys %$copyThis) {
            $self->{$k} = $copyThis->{$k};
        }
    }
    $self->{expr} = "";
    $self->{gotfRange} = 0;
    $self->{frMin} = $self->{frMax} = 0;   # reset both ends of the range
    bless $self, $class;
    return($self);
}

# Used to copy in the current state of expr-specific options, so they're
# associated with each expr correctly.
#
# Takes exactly 11 values after the object, in this order: extendedRegexp,
# fixedStrings, file, basicRexexp, perlRegexp, wordRegexp, lineRegexp,
# ignoreCase, filesWithoutMatch, filesWithMatches, invertMatch.
# maxCount is not among them and is left untouched.
#
sub setRegexpType {
    (scalar(@_) == 12) || die "Wrong number of args to setRegexpType().\n";
    my $self = shift @_;
    ($self->{extendedRegexp},
     $self->{fixedStrings},
     $self->{file},
     $self->{basicRexexp},
     $self->{perlRegexp},
     $self->{wordRegexp},
     $self->{lineRegexp},
     $self->{ignoreCase},
     $self->{filesWithoutMatch},
     $self->{filesWithMatches},
     $self->{invertMatch}) = @_;
}

# Returns the version string stored in the object.
#
sub getVersion {
    my ($self) = @_;
    return($self->{version});
}

# Expand the fieldsToUse spec (comma-separated field numbers and from-to
# ranges, e.g. "2,4-6") into a sparse array indexed by field number: a
# selected field has a true count, every other index is undef.
# Tokens that are neither a number nor a range are ignored.
# Dies if a range is written high-to-low.
#
# Returns: a reference to that array.
#
sub expandFieldsToUse {
    my ($self) = @_;
    my @theList = ();
    my $spec = (defined $self->{fieldsToUse}) ? $self->{fieldsToUse} : "";
    for my $t (split(/,/, $spec)) {
        $t =~ s/^\s+|\s+$//g;
        # Test for a range first: an unanchored single-number test would
        # also accept the start of "4-6" and hide the range.
        if ($t =~ m/^(\d+)-(\d+)$/) {
            my $from = $1; my $to = $2;
            ($from>$to) && die
                "Fields $from-$to out of order in '$self->{fieldsToUse}'.\n";
            for my $i ($from..$to) {
                $theList[$i]++;
            }
        }
        elsif ($t =~ m/^(\d+)$/) {
            $theList[$1]++;
        }
    } # $t
    return(\@theList);
}


###############################################################################
###############################################################################
#

=pod

=head1 Usage

grepFields [options] [files]

Essentially the same as C<grep>, except it only looks within specified fields
of CSV-style records. Different expressions can be sought in different fields.

The field(s) to be matched are specified with L<"-fields"> option, which applies
to following matches until changed by another L<"-fields"> option. Several options
can be repeated in order to change their value for following matches:

=over

I<E> or I<extendedRegexp>, 
I<F> or I<fixedStrings>, 
I<f> or I<file>, 
I<G> or I<basicRegexp>, 
I<w> or I<wordRegexp>, 
I<x> or I<lineRegexp>, 
I<i> or I<ignoreCase>, 
I<L> or I<filesWithoutMatch>, 
I<l> or I<filesWithMatches>, 
I<m> or I<maxCount>, 
I<v> or I<invertMatch>.

=back


=head1 Input formats

The delimiter and other input-format options can be set with the
options standard for my CSV-oriented scripts (which are built on C<csvFormat.pm>):

=over

=item * B<-delim> I<s>

=item * B<-escape> I<s>

=item * B<-expect> I<n>

=item * B<-msQuoting>

=item * B<-quote> I<s>

=item * B<-header>

=item * B<-iencoding> I<e>

=item * B<-ilineends>  I<os>

=back


=head1 Options I<not> like those of C<grep>

=over

=item * B<-base> I<10|16>

Whether to show line numbers and/or file offsets in decimal (default) or hex.

=item * B<--excludeFrom> I<path>

Use I<path> as a list of patterns to treat as if they had been specified
for the I<-exclude> option. (Not yet implemented: the supporting code is
currently commented out.)

=item * B<-expect> I<n>

Warn for any record that doesn't have exactly I<n> fields.

=item * B<-expr> I<regex>

B<Unlike> grep, you can specify multiple regular expressions to match for,
possibly in different fields and with different options (such as I<-i>).

=item * B<-fields> I<n>

Which fields to look in (repeatable, and applies to following matches until
replaced by another L<"-fields"> option). Fields count from 1, not 0 
(with L<"-header">, field names will eventually be supported as well).


=item * B<-fRange> I<min,max>

Search for a value of at least I<min> and at most I<max>, in any of the
currently-active fields.
For example, C<-fRange 10,57> would require a number between
10 and 57 inclusive. Numbers may be written as integers, decimals, or with
an exponent (for example 10, 10.5, or 1e3).

This is in addition to any requirements specified by -expr.

=item * B<-oencoding> I<e>

Set the output encoding to I<e>.

=item * B<-only-fields>

Similar to the L<"-only-matching"> option, but causing only matching I<fields>
to be returned, rather than only the matching I<parts> of fields.

=back


=head1 Unsupported standard C<grep> options

=over

=item I<-D> or I<--devices=action> read, skip

=item I<-E> (extended regexes; I<-P> is the default, which is pretty 'extended')

=item I<-G> (basic regexes)

=item I<-T> or  I<--initial-tab>

=item I<--line-buffered>

=item I<--mmap>

=item I<-z> or I<--null-data>

=back


=head1 Options like those of C<grep>

(from the usual C<grep> descriptions -- differences added in B<bold>).

=over

=item * B<-A ___> or B<--after-context=___>

Print ___ lines of trailing context after matching lines.
Places a line containing I<--> between contiguous groups of matches.

=item * B<-a> or B<--text>

Process a binary file as if it were text; this is equivalent
to the I<--binary-files=text> option.

=item * B<-B ___> or B<--before-context=___>

Print ___ lines of leading context before matching lines.
Places a line containing I<--> between contiguous groups of matches.

=item * B<-C ___> or B<--context=___>

Print ___ lines of output context. Places a line containing
I<--> between contiguous groups of matches.

=item * B<-b> or B<--byte-offset>

Print the byte offset within the input file before each line of output.

=item * B<--binary-files=____>

If the first few bytes of a file indicate that the file contains binary data, 
assume that the file is of type ____.  
By default, ____ is binary, and grep normally outputs either a
one-line message saying that a binary file matches, 
or no message if there is no match. If ____ is I<without-match>, grep
assumes that a binary file does not match; this is equivalent
to the -I option. 
If ____ is I<text>, grep processes a binary
file as if it were text; this is equivalent to the I<-a> option.
B<Warning:> grep I<--binary-files=text> might output binary garbage,
which can have nasty side effects if the output is a terminal
and if the terminal driver interprets some of it as commands.

=item * B<--colour[=____]> or B<--color[=____]>

Surround the matching string with the marker found in the
C<GREP_COLOR> environment variable. ____ may be I<never>, I<always>, or I<auto>.
B<Unlike> grep, this defaults to true if the environment variable C<USE_COLOR>
is set and STDOUT is going to a terminal.

=item * B<-c> or B<--count>

Suppress normal output; instead print a count of matching
lines for each input file. With the I<-v> or B<--invert-match> option
(see below), count non-matching lines.

=item * B<-D ______> or B<--devices=______>

If an input file is a device, FIFO or socket, use ______ to process it.
By default, ______ is I<read>, 
which means that devices are read just as if they were ordinary files.
If ______ is I<skip>, devices are silently skipped.

=item * B<-d ______> or B<--directories=______>

If an input file is a directory, use ______ to process it. 
By default, ______ is I<read>, which means that directories are read
just as if they were ordinary files. 
If ______ is I<skip>, directories are silently skipped.
If ______ is I<recurse>, grep reads all files under each directory, recursively; 
this is equivalent to the I<-r> option.

=item * B<-E> or B<--extended-regexp>

Interpret _______ as an extended regular expression (see below).

=item * B<-e _______> or B<--regexp=_______>

Use _______ as the pattern; useful to protect patterns beginning with I<->.

=item * B<-F> or B<--fixed-strings>

Interpret _______ as a list of fixed strings, separated by
newlines, any of which is to be matched.

=item * B<-f ____> or B<--file=____>

Obtain patterns from ____, one per line. 
The empty file contains zero patterns, and therefore matches nothing.

=item * B<-G> or B<--basic-regexp>

Interpret _______ as a basic regular expression (see below). This is the default.

=item * B<-H> or B<--with-filename>

Print the filename for each match.

=item * B<-h> or B<--no-filename>

Suppress the prefixing of filenames on output when multiple files are searched.

=item * B<--help>

Output a brief help message.

=item * B<-I>

Process a binary file as if it did not contain matching data;
this is equivalent to the I<--binary-files=without-match> option.

=item * B<-i> or B<--ignore-case>

Ignore case distinctions in both the _______ and the input files.

=item * B<-L> or B<--files-without-match>

Suppress normal output; instead print the name of each input
file from which no output would normally have been printed.
The scanning will stop on the first match.

=item * B<-l> or B<--files-with-matches>

Suppress normal output; instead print the name of each input
file from which output would normally have been printed.  The
scanning will stop on the first match.

=item * B<-m ___> or B<--max-count=___>

Stop reading a file after ___ matching lines. If the input is
standard input from a regular file, and ___ matching lines are
output, grep ensures that the standard input is positioned to
just after the last matching line before exiting, regardless
of the presence of trailing context lines. This enables a
calling process to resume a search. When grep stops after ___
matching lines, it outputs any trailing context lines. 
When the I<-c> or I<--count> option is also used, grep does not output a
count greater than ___. When the I<-v> or I<--invert-match> option
is also used, grep stops after outputting ___ non-matching lines.

=item * B<--mmap>

If possible, use the mmap(2) system call to read input,
instead of the default read(2) system call.
In some situations I<--mmap> yields better performance. However, I<--mmap> can
cause undefined behavior (including core dumps) if an input
file shrinks while C<grep> is operating, or if an I/O error occurs.

=item * B<-n> or B<--line-number>

Prefix each line of output with the line number within its input file.

=item * B<-o> or B<--only-matching>

Show only the part of a matching line that matches the pattern.

=item * B<--label=_____>

Displays input actually coming from standard input as input
coming from file _____. This is especially useful for tools
like C<zgrep>, e.g. C<gzip -cd foo.gz | grep --label=foo something>.

=item * B<--line-buffered>

Turns on line buffering. However, this can be a performance penalty.
B<Unlike> grep, this is not supported.

=item * B<-P> or B<--perl-regexp>

Interpret _______ as a Perl regular expression.
B<Unlike> grep, this is the default.

=item * B<-q> or B<--quiet> or B<--silent>

Quiet; do not write anything to standard output. 
Exit immediately with zero status if any match is found, even if an error
was detected. Also see the I<-s> or I<--no-messages> option.

=item * B<-R, -r> or B<--recursive>

Read all files under each directory, recursively; this is
equivalent to the I<-d> recurse option.

=item * B<--include> I<PATTERN>

Recurse in directories, searching only files matching I<PATTERN>
(B<unlike> grep, this is repeatable).

=item * B<--exclude> I<PATTERN>

Recurse in directories, skipping files matching I<PATTERN>
(B<unlike> grep, this is repeatable, and there is a related L<"-excludeFrom">
option to take a list of exclude-patterns from a file).

=item * B<-s> or B<--no-messages>

Suppress error messages about nonexistent or unreadable files.
Portability note: unlike GNU grep, traditional grep did not
conform to POSIX.2, because traditional grep lacked a I<-q>
option and its -s option behaved like GNU grep's I<-q> option.
Shell scripts intended to be portable to traditional grep
should avoid both I<-q> and I<-s> and should redirect output to 
C</dev/null> instead.

=item * B<-U> or B<--binary>

Treat the file(s) as binary. By default, under MS-DOS and MS-Windows,
grep guesses the file type by looking at the contents
of the first 32KB read from the file. If grep decides the
file is a text file, it strips the CR characters from the
original file contents (to make regular expressions with C<^> and
C<$> work correctly). Specifying I<-U> overrules this guesswork,
causing all files to be read and passed to the matching mechanism verbatim;
if the file is a text file with CR/LF pairs at
the end of each line, this will cause some regular expressions
to fail. This option has no effect on platforms other than MS-DOS and MS-Windows.

=item * B<-u> or B<--unix-byte-offsets>

Report Unix-style byte offsets. This switch causes grep to
report byte offsets as if the file were a Unix-style text file,
i.e. with CR characters stripped off.  This will produce
results identical to running grep on a Unix machine.  This
option has no effect unless the I<-b> option is also used; it has no
effect on platforms other than MS-DOS and MS-Windows.

=item * B<-V> or B<--version>

Print the version number of grep to standard error.
This version number should be included in all bug reports (see below).

=item * B<-v> or B<--invert-match>

Invert the sense of matching, to select non-matching lines.

=item * B<-w> or B<--word-regexp>

Select only those lines containing matches that form whole
words. The test is that the matching substring must either be
at the beginning of the line, or preceded by a non-word constituent character.
Similarly, it must be either at the end
of the line or followed by a non-word constituent character.
Word-constituent characters are letters, digits, and the underscore.

=item * B<-x> or B<--line-regexp> or B<--field-regexp>

Select only those matches that exactly match the whole sequence of fields
currently being searched.

=item * B<-y>

Obsolete synonym for -i.

=item * B<-Z> or B<--null>

Output a zero byte (the ASCII NUL character) instead of the
character that normally follows a file name. For example,
grep -lZ outputs a zero byte after each file name instead of
the usual newline. This option makes the output unambiguous,
even in the presence of file names containing unusual 
characters like newlines.  This option can be used with commands
like find C<-print0>, C<perl -0>, C<sort -z>, and C<xargs -0> to process
arbitrary file names, even those that contain newline characters.

=back


=head1 Perl regexes

(see C<man perlre> for details)


=head1 Related commands

grep, csvFormat.pm.


=head1 Known bugs and limitations

Many of the options have not had much testing (for example,
unix-byte-offsets and the exclude/include, binary, context,
and directory options).


=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons 
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see http://creativecommons.org/licenses/by-sa/3.0/.

The author's present email is sderose at acm.org.

For the most recent version, see http://www.derose.net/steve/utilities/.

=cut