#!/usr/bin/perl -w # # tab2xml: Change a tab-delimited (or similar) file to XML. # # 2006-09-07: Written by Steven J. DeRose, sderose@acm.org. # 2007-01-25 sjd: More flexible i/o options. -nonxml, -norm, -qwhere, -escape, # -outlineend, etc. # 2007-02-07 sjd: Add -comment, -ocomment. # 2007-11-27 sjd: strict. # 2008-01-18 sjd: Getopt. Lots of bug fixes. Add -overflow, -tableattrs. # 2008-03-25 sjd: Implement -quote, -escape, fix msq, linebreaking, regexes, # error-reporting, -lineends. Make it actually work. # 2008-04-14 sjd: Add -empty. # # To do: # Distinguish start/end delim for non-XML output # Allow to set up colspecs # Consider adding some layout options like normalizeXML. # Support named DTDs: cals, html, etc. # use strict; use Getopt::Long; my $version = "2008-04-14"; my $dft_delim = "\\s+"; my $dft_odelim = "\t"; my $dft_oquote = "\""; my $dft_oescape = "\\"; my $dft_empty = " "; ################################################################################ # Input options: my $delim = $dft_delim; # regex for split() to break into fields my $header = 0; # 1st records is field names? my $msq = 0; # Use funky MS quoting? my $quote = ""; # Quote character to expect? my $escape = ""; # Escape char for the quote? my $lineends = "U"; # Platform specific line-separators my $nfields = 0; # Number of fields per line (0 = don't care) my $norm = 1; # Normalize input whitespace? my $overflow = "fatal"; # What to do with "too many" fields in a record. my $comments = 0; # Treat \# lines as comments? # XML output options: my $ampEntity = "&"; # What to write for ampersand my $ltEntity = "<"; # What to write for less-than my $entityFormat = "XXXXX"; # Change to work like normalizeXML? my $colspecs = ""; # Info to put in colspec elements my $empty = $dft_empty; # What to put in empty cells my $id = ""; # Put IDs on rows? my $indent = 0; # Break and indent XML output? my $indents = " "; # String to use for indenting my $headingTag = "th"; # Tag for heading fields my $fieldTag = "td"; # Tag to use for other fields my $namestring = ""; # List of user-specified field names my $useClass = 0; # Put field-name on class attribute? my $recordTag = "tr"; # Tag to surround whole records my $rootTag = "table"; # Tag to surround entire table my $tableAttrs = ""; # Attribute string to add on table element. my $public = ""; # Public identifier to put in DOCTYPE my $system = ""; # System identifier to put in DOCTYPE my $xmldcl = 0; # Write XML declaration? # Non-XML output options: my $nonxmlOutput = 0; # Write non-XML? my $odelim = $dft_odelim; # Field separator to use my $oheader = 0; # Write first record with field names? my $oquote = $dft_oquote; # Quoting charcter to use my $oqwhere = "delim"; # Which fields to quote? none|all|num|text|delim my $oescape = $dft_oescape; # Escape char for quotes? my $olineends = "u"; # Line-end type to use my $ocomments = 1; # Copy -comments lines to output? # General options: my $help = 0; my $quiet = 0; my $verbose = 0; ############################################################################### # Process options Getopt::Long::Configure ("ignore_case"); my $result = GetOptions( # Input options "comments" => \$comments, "delim=s" => \$delim, "escape=s" => \$escape, "header" => \$header, "lineends=s" => sub { $lineends = uc(substr("$ARGV[0]"."U", 0, 1)); }, "msq" => \$msq, "nfields=i" => \$nfields, "overflow=s" => \$overflow, "quote=s" => \$quote, # XML output options "empty=s" => \$empty, "id=s" => \$id, "indent" => \$indent, "istring=s" => \$indents, "names=s" => \$namestring, "nonxml" => \$nonxmlOutput, "numerics=s" => \$entityFormat, "colspecs=s" => \$colspecs, "class" => \$useClass, "td|field=s" => \$fieldTag, "th|heading=s" => \$headingTag, "tr|record=s" => \$recordTag, "table|root=s" => \$rootTag, "tableattrs=s" => \$tableAttrs, "public=s" => \$public, "system=s" => \$system, "xmldcl" => \$xmldcl, # nonxml output options "ocomments" => \$ocomments, "odelim=s" => \$odelim, "oheader" => \$oheader, "olineends=s" => sub { $olineends = uc(substr("$ARGV[0]"."U", 0, 1)); }, "oquote" => \$oquote, "oqwhere=s" => \$oqwhere, # Other options "h|help|?" => \$help, "q|quiet!" => \$quiet, "v|verbose+" => \$verbose, "version" => sub { die "Version of $version, by Steven J. DeRose, sderose\@acm.org.\n"; } ); if ($help) { showUsage(); exit; } ($result) || die "Bad options.\n"; # Validate and default options # Set up amp/lt entities my $lt; my $amp; my $prefix; if ($entityFormat =~ /x/) { $prefix = "&#x"; $amp = "26"; $lt = "3c"; } elsif ($entityFormat =~ /X/) { $prefix = "&#X"; $amp = "26"; $lt = "3C"; } else { $prefix = "&#"; $amp = "38"; $lt = "60"; } if (length($entityFormat) > 2) { $prefix .= "0" x (length($entityFormat) - 2); } $ampEntity = "$prefix$amp;"; $ltEntity = "&prefix$lt;"; ($colspecs) && die "-colspecs is not yet supported.\n"; ($delim) || die "Bad value for -d option.\n"; ($entityFormat =~ m/^(9+|x+|X+)$/) || die "Invalid -numeric format '$entityFormat'.\n"; $olineends = lc($olineends); if ($olineends eq "u") { $olineends = chr(10); } elsif ($olineends eq "m") { $olineends = chr(13); } elsif ($olineends eq "d") { $olineends = chr(10) . chr(13); } else { die "Unknown output line-end type '$olineends'.\n"; } $oqwhere = lc($oqwhere); if ($oqwhere eq "none" || $oqwhere eq "num" || $oqwhere eq "text" || $oqwhere eq "all" || $oqwhere eq "delim") { $oqwhere = $oqwhere; } else { die "Unknown value for '-oqwhere': '$oqwhere'.\n"; } $overflow = lc($overflow); ($overflow =~ m/^(fatal|discard|class)$/) || die "Bad value '$overflow' for -overflow option.\n"; if ($msq && $delim ne "\t" && $delim ne ",") { die "Can only have comma or tab as delimiter with -msq option.\n"; } ################################################################################ # Collect field names, if available from first record or option my @names = (); if ($header) { my $headline = <>; chomp($headline); @names = split(/$delim/, $headline); ($verbose) && warn "Header names: " . join(", ",@names) . ".\n"; } elsif ($namestring ne "") { @names = split(/\s+/,$namestring); } # Ensure they're ok XML names (and we limit to ASCII for now) for (my $i=0; $i$olineends"; } if ($public) { print ""; ($indent) && print "$olineends"; } elsif ($system) { print ""; ($indent) && print "$olineends"; } ($verbose) && warn "tableattrs: '$tableAttrs'.\n"; startTag($rootTag, $tableAttrs); } # Write the header if needed if ($oheader) { if (!$nonxmlOutput) { print ""; ($indent) && print $olineends; foreach my $h (@names) { print "<$headingTag>$h"; ($indent) && print $olineends; } print ""; ($indent) && print $olineends; } else { print join($odelim,@names) . $olineends; } } # oheader ################################################################################ # Start processing the main part of the file my $recnum = 0; my $maxf = 0; my $minf = 0; my $level = 0; my @fields = (); while (<>) { # Convert the data $recnum++; my $line = $_; $line =~ s/\r\n/\n/; # Nuke DOS line-ends chomp $line; ($verbose) && warn "Read: '$line'\n"; if ($line =~ m/^\s*$/) { next; } # blank line if ($comments && $line =~ m/^\s*\#/) { # comment line if (!$ocomments) { # no-op } elsif ($nonxmlOutput) { print "$line$olineends"; } else { ($indent) && print $olineends; print ""; ($indent) && print $olineends; } next; } # Parse the line into fields @fields = (); if ($msq && index($_,"\"")>=0) { ($verbose) && warn "parsing as msq\n"; # Load more records if there's an unbalanced double-quote. while (countChar($line,"\"") % 2) { # unbalanced $line =~ s/\n/\&\#10;/g; $line .= <>; } # Now parse the Excel way: use '""' for quote in quoted field. $line =~ s/\"\"/\"/g; # What else might the quotes be quoting? just tabs or commas? while ($line) { $line =~ s/^\s+//; if (substr($line,0,1) eq "\"") { $line =~ s/\"([^\"]*)\"//; } else { $line =~ s/([^$delim]*)//; } push @fields, $1; $line =~ s/^\s*$delim//; } # while } # msq with quotes elsif ($quote ne "") { ($verbose) && warn "parsing with -quote '$quote': '$line'\n"; my $token = ""; while ($line) { $line =~ s/^\s+//; if (substr($line,0,1) ne $quote) { $line =~ s/([^$delim]*)//; $token = $1; } elsif ($escape ne "") { $line =~ s/^$quote(([^$quote]|$escape$quote)*)$quote//; $token = $1; $token =~ s/$escape$quote/$quote/g; # Ok for multiple quotes?? } else { $line =~ s/^$quote([^$quote]*)$quote//; $token = $1; } ($verbose) && warn " token: '$token', rest '$line'.\n"; push @fields, $token; $line =~ s/^\s*$delim//; } # while } else { ($verbose) && warn "parsing with no quoting.\n"; @fields = split(/$delim/, $line); } my $fieldsInThisRec = scalar @fields; if ($fieldsInThisRec > $maxf) { $maxf = $fieldsInThisRec; } if ($minf==0 || $fieldsInThisRec<$minf) { $minf = $fieldsInThisRec; } ($verbose) && warn "Parsed record into:\n " . join("\n ",@fields) . "\n"; if ($norm) { # Normalize whitespace for (my $i=0; $i 0 && scalar(@names) > 0) { if ($overflow eq "fatal") { die "Too many fields ($fieldsInThisRec) in record $recnum:\n$line\n"; } if ($overflow eq "discard") { splice(@fields, 0 - $excess); } } # Write out the record if (!$nonxmlOutput) { startTag($recordTag, (($id) ? "id=\"$id\_$recnum\"":"")); for (my $i=0; $i=0) { $name = "A.$name"; } return($name); } # Write start/end tags with optional indentation by level sub startTag { $level++; print ($olineends) if ($indent); print ($indents x $level) if ($indent); print "<$_[0]" . (($_[1]) ? " $_[1]":"") . ">"; } sub endTag { $level--; #print ($indents x $level) if ($indent); print ""; if ($_[1]) { print $olineends; } #print ($olineends) if ($indent); } # Escape special XML characters and print sub arrangeContent { (my $out = $_[0]) =~ s/&/$ampEntity/g; $out =~ s/