#!/usr/bin/perl
#
# randomrecords
#
# Select n random records from a file (helpful for regression testing).
#
# 2006-09~18: Written by Steven J. DeRose. Eliminates many manual steps for
#     converter regression testing.
# 2009-08-05: clean up.
#
# To do:
#     Teach it to ignore blank lines, or lines with comments.

use strict;
use warnings;
use Getopt::Long;

# Forward declaration: showUsage() is defined at the end of the file with an
# empty prototype, but is called before that point. Declaring it here keeps
# "called too early to check prototype" warnings away.
sub showUsage();

my $version = "2009-08-05";

my $help     = 0;
my $num      = 0;    # How many records to select (-n).
my $lineNums = 0;    # Prefix each output record with its record number (-l).
my $quiet    = 0;    # Suppress informational messages (-q).
my $verbose  = 0;    # Accepted for command-line compatibility; not used yet.

# Process options
Getopt::Long::Configure("ignore_case");
my $result = GetOptions(
    "h|help|?"   => \$help,
    "l"          => \$lineNums,
    "n=i"        => \$num,
    "q|quiet!"   => \$quiet,
    "v|verbose+" => \$verbose,
    "version"    => sub {
        # Getopt::Long traps a die() raised inside an option handler and
        # reports it as an option error, which would be followed by a bogus
        # "Bad options." failure. Report the version and stop cleanly instead.
        warn "Version of $version, by Steven J. DeRose, sderose\@acm.org\n";
        exit 0;
    },
);

if ($help) {
    showUsage();
    exit;
}
$result or die "Bad options.\n";

# Validate and default options
($num >= 0) or die "Invalid -n value '$num'.\n";

my $file = $ARGV[0];
(defined $file && length $file) or die "No file specified.\n";
(-e $file) or die "Couldn't find file '$file'.\n";

################################################################################
# Count the records in Perl rather than through `cat | wc -l`: the file name
# never reaches a shell, and a final record that lacks a trailing newline is
# counted too (and so can be selected).
my $totalRecords = 0;
open(my $countHandle, '<', $file)
    or die "Couldn't open file '$file': $!\n";
while (defined(my $record = <$countHandle>)) {
    $totalRecords++;
}
close($countHandle);

$quiet or warn "Total records in source file: $totalRecords.\n";
($totalRecords > 0) or die "No records, sorry.\n";

# The selection below retries on collisions, so refuse requests that would
# make nearly every draw collide.
if ($num > ($totalRecords * 0.9)) {
    warn "That's over 90% of the records. Just do them all.\n";
    exit;
}

# Make a list of random numbers from 1...#records.
# Keys of %recList are the (1-based) record numbers chosen for output.
my %recList = ();
for (1 .. $num) {
    my $r;
    do {
        $r = int(rand($totalRecords)) + 1;
    } while ($recList{$r});    # Redraw until we hit an unused record number.
    $recList{$r} = 1;
}

# Percentage selected, truncated to two decimal places.
my $pct = int(10000.0 * $num / $totalRecords) / 100.0;
$quiet or warn "$num records have been selected ($pct%). Reading them...\n";

# Read the file, and print just the records whose numbers are in the list.
# Single pass over the input: number the records from 1 and echo only those
# whose number was selected into %recList.
# Three-argument open with a lexical handle, so characters in the file name
# cannot be taken as an open mode; stop with a message if the open fails.
open(my $in, '<', $file) or die "Couldn't open file '$file': $!\n";
my $recNum = 0;
while (defined(my $record = <$in>)) {
    $recNum++;
    next unless $recList{$recNum};
    if ($lineNums) {
        print "$recNum:\t";
    }
    print $record;
}
close($in);
%recList = ();

($quiet) || warn "Done.\n";

exit;

################################################################################
# showUsage: print the command-line help text to stdout.
# Takes no arguments; the text interpolates the file-level $version.
sub showUsage() {
    print "
Usage: randomrecords [options] file

Selects records randomly from the specified file, and writes them to stdout.
Useful for doing random sampling from lists, tab-delimited database tables,
etc.

Options:
    -l        Prefix each line with its line-number, a colon, and a tab.
    -n num    Set how many records to select. This must be less than 90%
              of the total records.
    -q        Suppress most messages.
    -version  Display version info and exit ($version, sjd).

Notes:
    Can accept DOS, but not Mac line-ends (output line-ends are always Unix)

";
}