#!/usr/bin/perl -w
#
# normalizeXML: Lay out XML in a wide choice of ways.
#
# 2006-10-05: Written by Steven J. DeRose, sderose@acm.org.
# ...
# 2008-01-07 sjd: Resolve perl -w warnings.
# 2008-01-09 sjd: Start -archform. -ientities for drop/keep internal entities.
# 2008-02-01 sjd: Clean up options a little more. More on -noids, -norids
# 2008-02-06 sjd: Fix -xpath. Distinguish childElementCounts/childNodeCounts.
# Add -endtags. Fix -comments, mistaken force of -indent from -istring.
# Add -flagBlockEnds, @idStack.
# 2008-03-19 sjd: Make portable. Start fixing empty-element handling, adding
# preserve-space and block element classes. Revamp class and dtd handling.
# 2008-03-21 sjd: Add -nil, -etrace. Fix event tracing, add -tracewsevents.
# Work on trapping internal (especially %) entities.
# 2008-04-03 sjd: Fix uninit'd vars in DTD-handling. Don't double-space dtd.
# 2008-04-04 sjd: Improve param. ent. output, DTD layout. Add -maxnamewidth.
# 2008-05-05 sjd: Add -progress, move in event-tracing, -tracefqgi.
# 2008-08-15 sjd: Add -curlyQuotes, -attrQuoteType, -xmllang. Rename -empty....
#
# To do:
# *** Don't write end-tag with -empty E
# *** Way to drop ID/IDREF by type, not just name (partly done)
# *** Control col widths for DTD output. Correct placement of "]"
# *** Add Mulberry DTD format option
# *** -only
# *** Fix long empty comments, bars in multi-line comments, and
# bars serving to center text, to match -normalizebars width.
# Option to generate CDATA marked sections for text nodes within
# certain element types, or t.n. with too many escapes
# *** Escape as needed in -tracepentities output
# *** Implement -bmodels
# *** Way to declare certain PIs to open/close indentation
# *** Make work better with dtd file that has no DOCTYPE dcl
# *** Handle undefined entities and recover.
# *** Join successive text nodes even with -b and -i?
# Support XInclude? xi:include, xi:fallback.
# Option to n-space before comment bars when prior item is not a comment
# Suppress wide-char warning when in internal entity dcl
# Finish -entities
# In @select, @test, and @match of xsl elements, break at =,and,or,][,/
# Normalize charset of names and/or content (spaces, dashes, digits, all)?
# Option to put type-attr? + id into -flagBlockEnds
# Option to extract a single element by xpath or id, or split at certain
# elements, doing the entity/PI stuff right?
# Option to add IDs to certain element types -- gi+xptr or similar.
# Suppress replacement of parameter entities in DTD
# Pull out all content models (and names?) into % entities a la TEI.
# In entity tracing, report location in entire entity state
# In event tracing, option to skip whitespace or all text nodes.
# Option to make content of some element types CDATA marked sections.
# NOTE: To add an option, add to all of: history, variable dcl, option parse,
# option validation, printout in dumpOptions, actual use, and help text.
#use strict;
use Getopt::Long;
my $version = "2008-08-16";
my $default_catalog = "";
my $dft_fqgisep = "/";
my $dft_indentString = " " x 4;
my $dft_outLineends = "U"; # Unix-style
my $dft_quoteType = "\"";
my $dft_width = 80;
my $xinclude_ns = "http://www.w3.org/2001/XInclude";
# Options for white-space.
my $break = 0; # Break output into lines?
my $breakAtAllAttrs = 0; # Break before every attribute?
my $breakInsideAttrs = 0; # Break up long individual attributes?
my $breakInsideTags = 0; # Break up long start-tags?
my $breakInsideText = 0; # Break text nodes into lines?
my $breakModels = 0; # Wrap and indent content models (implies -dtd)?
my $breakOutside = 0; # Break around outside of (non-inline) elements?
my $breakUgly = 0; # Are we breaking before end-of-tag?
my $mulberryDtd = 0; # Lay out DTD in Mulberrytech.com style.
# New break-rules
my ($bi, $bb, $bw, $bs, $bo, $bp, $bc, $bm) = "0000";
my $cleanTextNodeEnds = 0; # Don't do this
my $indent = 0; # Are we indenting?
my $indentAttrsPastGI = 0; # When splitting tags, where to line up attrs
my $indentComments = 0; # Indent within multi-line comments?
my $indentInsideText = 0; # With breakInsideText, do we also indent?
my $indentString = $dft_indentString; # String to repeat per indent level
my $outLineends = $dft_outLineends;
my $screen = 0; # Set to screen width
my $unbreakInsideText = 0; # Unbreak multi-line text nodes into 1 line?
my $width = $dft_width; # Width to break to
my $wsHandling = "D"; # Keep, Drop(dft), ->Pis, M keep only if >1 line
# Options for entity management
my $cr_amp = "&"; # Predefined XML entities
my $cr_lt = "<";
my $cr_gt = ">";
my $cr_apos = "'";
my $cr_quot = """;
my $cr_all = 0; # ***** FIX *****
my $crDigits = 5; # Min number of digits for numeric entities
my $crBase = 16; # Hex or decimal?
my $crCase = 'L'; # Case for [a-f] in hex character references
my $ldquo = "“"; # How to write out quotes (no options yet)
my $rdquo = "”";
my $lsquo = "‘";
my $rsquo = "’";
my $entities = 0; # Preserve entity bounds as PIs
my $etrace = 0; # Report everyplace an entity is referenced.
my $maxNonEntityChar = 127; # Chars > this become numeric entities
my $parseParamEnt = 1; # Handle external parameters entities?
my $xmlEntities = "name"; # name|number
# Options for things normally kept, that you can drop:
my $keepBlankLines = 1; # Retain blank lines?
my $keepDefaultAttrs = 1; # Attributes that can be defaulted?
my $keepComments = 1; # Retain all comments?
my $keepDoc = 1; # The document itself?
my $keepDoctype = 1; # The DOCTYPE declaration?
my $keepEndTags = 1; # All end-tags?
my $keepGenIds = 1; # Attribute values from generate-id()?
my $genIdPattern = "=\\\"[TNF]0x[0-9a-f]*[N\.]0x[0-9a-f]*\\\"";
my $keepIds = 1; # ID attributes?
my $keepInternalEntDcls = 1; # (reset to shorten DTD, like from entity sets)
my $keepNsAttrs = 1; # Namespace attributes in general?
my $keepPIs = 1; # All processing instructions?
my $keepRedundantNS = 1; # Redundant ns declarations?
my $keepRids = 1; # Reference-to-IDs?
my $keepXmlDcl = 1; # The XML declaration?
my $nil = 0; # Suppress all dtd/document output.
# Options for things normally dropped, that you can keep:
my $keepDTD = 0; # Write out the whole DTD?
my $keepPentities = 0; # Parameter entity declarations?
# Options for things to normalize
my $alphabetizeModels = 0; # Alphabetize tokens in OR content models
my $barchar = "="; # For -normalizebars
my $canonical = 0;
my $maxNameWidth = 20; # For arranging DTD nicely
my $educateQuotes = 0; # Curly quotes in content (not preserveElements)
my $emptyFormat = "F"; # Full start-end pairs
my $idnames = 0; # Rename all ID attributes to 'id', etc.?
my $lineends = 0; # Normalize incoming line-ends?
my $nameForm = ""; # Can switch to Upper, Lower, Ascii, Clean
my $normalizeRuns = 0; # Normalize internal multi-space runs in text.
my $normalizeText = 0; # Do normalize-space on text nodes?
# This leads to problems with text-nodes joining up.
my $normalizeAttrs = 1; # Do normalize-space on attribute values?
my $normalizeBars = 1; # Adjust length of
my $xinclude = 0; # Interpret XInclude directives
my $xsl = 0; # Assume input is XSL
# Options for things to add
my $elementSpace = 1; # Put in a space when dropping elements?
my $flagBlockEnds = 0; # Put a comment at end of block-elements
my $fqgi = 0; # Show ancestry for non-inline elements
my $fqgiSep = $dft_fqgisep; # Delimiter between gis in fqgis
my $lnum = 0; # Are we inserting line number comments?
my $ourns = "nx"; # Namespace prefix for -fqgi, -xpath, -linenum
my $tagSpace = 1; # Put in a space when dropping tags?
my $xpath = "N"; # Add attr with xpath to element
# Other options
my @archforms = (); # Hytime-like arch. form attributes
my $attrQuoteType = "\""; # Char to use around attribute values
my $catalog = $default_catalog;
my $curlyQuotes = "O"; # (letter) O means leave quotes as in source
my $help = 0;
my $htmlparse = 0; # Try to handle ill-formed HTML
my $only = ""; # Only output for this ID value.
my $outputExtension = ""; # If non-nil, save to file(s), not stdout
my $progress = 0; # How often to report progress (0=never)
my $quiet = 0;
my $quoteType = $dft_quoteType;
my $sdOptions = 0; # Set my preferred options?
my $textForm = "F"; # Display full text/greek/rot13/none
my $traceEvents = 0; # Report parser events (except whitespace).
my $traceFQGI = 0; # and include element contexts
my $tracePentities = 0; # Report parameter entity declarations
my $traceWsEvents = 0; # Report parser whitespace events.
my $tracePrint = 0; # Detailed debug info from dumpLine()
my $verbose = 0;
my $voptions = 0;
my $waitfor = ""; # Suppress output til this string shows up
my $xmllang = "O"; # Mess with inherited xml:lang attributes?
# Elements to treat specially re. whitespace/syntax
my $setdtd = ""; # Set following as for a given DTD?
my @blockElements = (); # What elements should get extra space before?
my @emptyElements = (); # Elements declared EMPTY (for SGML output)
my @inlineElements = (); # What elements shouldn't break anyway?
my @preserveeElements = ();
my @sectionElements = ();
my @dropTags = (); # Delete these tags (not elements)?
my @dropElements = (); # Delete these whole elements
my @dropAttrNames = (); # Delete these attributes
my @dropAttrPats = (); # Delete attrs with values matching these exprs
my @dropPIs = (); # Delete PIs with these names
###############################################################################
Getopt::Long::Configure ("ignore_case");
my $result = GetOptions(
# Options for whitespace
"b|break!" => \$break,
"bmodels!" => \$breakModels,
"boutside!" => \$breakOutside,
"ballattrs!" => \$breakAtAllAttrs,
"battrs!" => \$breakInsideAttrs,
"btext!" => \$breakInsideText,
"btags!" => \$breakInsideTags,
"bugly!" => \$breakUgly,
"i|indent!" => \$indent,
"iattrspastgi!" => \$indentAttrsPastGI,
"icomments!" => \$indentComments,
"istring=s" => \$indentString,
"itext!" => \$indentInsideText,
"linends|lineends!" => \$lineends,
"mulberryDtd!" => \$mulberryDtd,
"outlinends|outlineends=s"=> \$outLineends,
"screen" => \$screen,
"width=i" => \$width,
"ws=s" => \$wsHandling,
# new break rules
"binline=s" => \$binline,
"bblock=s" => \$bblock,
"bwhitespacepreserve=s" => \$bwsp,
"bsection=s" => \$bsection,
"bother=s" => \$bother,
"bpi=s" => \$bpi,
"bcomment=s" => \$bcomment,
"bmcomment=s" => \$bmcomment,
# repeatable: whitespace treatment of specific elements
"setdtd=s" => \$setdtd,
"block=s" => \@blockElements,
"emptyElements=s" => \@emptyElements,
"inline=s" => \@inlineElements,
"preserve=s" => \@preserveeElements,
"section=s" => \@sectionElements,
# Options for entity management
"crbase=i" => \$crBase,
"crcase=s" => \$crCase,
"crdigits=i" => \$crDigits,
"entities" => \$entities,
"etrace" => \$etrace,
"maxchar=i" => \$maxNonEntityChar,
"names=s" => \$nameForm,
"param!" => \$parseParamEnt,
# Options for things to keep/drop
"dropattr=s" => \@dropAttrNames,
"dropattrpat=s" => \@dropAttrPats,
"dropelement=s" => \@dropElements,
"droppi=s" => \@dropPIs,
"droptag=s" => \@dropTags,
"blanklines!" => \$keepBlankLines,
"comments!" => \$keepComments,
"defaultattrs!" => \$keepDefaultAttrs,
"doc!" => \$keepDoc,
"doctype!" => \$keepDoctype,
"dtd!" => \$keepDTD,
"endtags!" => \$keepEndTags,
"genids!" => \$keepGenIds,
"ids!" => \$keepIds,
"ientities!" => \$keepInternalEntDcls,
"nil!" => \$nil,
"nsattrs!" => \$keepNsAttrs,
"pentities!" => \$keepPentities,
"pis!" => \$keepPIs,
"rids|idrefs!" => \$keepRids,
"text=s" => \$textForm,
"xmldcl!" => \$keepXmlDcl,
# Options for things to normalize
"amodels!" => \$alphabetizeModels,
"barchar=s" => \$barchar,
"can|canonical!" => \$canonical,
"educatequotes!" => \$educateQuotes,
"emptyFormat=s" => \$emptyFormat,
"idnames" => \$idnames,
"maxNameWidth=i" => \$maxNameWidth,
"ntext|normalizetext!" => \$normalizeText,
"nruns|normalizeruns!" => \$normalizeRuns,
"nattrs|normalizeattrs!" => \$normalizeAttrs,
"nbars|normalizebars!" => \$normalizeBars,
"xi|xinclude!" => \$xinclude,
"xsl!" => \$xsl,
# Options for things to add
"element-space|elementspace!"=> \$elementSpace,
"flagBlockEnds!" => \$flagBlockEnds,
"fqgi!" => \$fqgi,
"fqgisep=s" => \$fqgiSep,
"lnum!" => \$lnum,
"ourns=s" => \$ourns,
"tagspace!" => \$tagSpace,
"xpath=s" => \$xpath,
# Other options
"archform=s" => \@archforms,
"attrQuoteType=s" => \$attrQuoteType,
"c=s" => \$catalog,
"curlyQuotes=s" => \$curlyQuotes,
"h|help|?" => sub { showUsage(); exit; },
"help-archforms" => sub { showArchforms(); exit; },
"help-breaks" => sub { showBreakRules(); exit; },
"html!" => \$htmlparse,
"only=s" => \$only,
"out=s" => \$outputExtension,
"progress=n" => \$progress,
"q!" => \$quiet,
"sd|sjd!" => \$sdOptions,
"traceprint!" => \$tracePrint,
"traceevents!" => \$traceEvents,
"tracefqgi!" => \$traceFQGI,
"tracepentities!" => \$tracePentities,
"tracewsevents!" => \$traceWsEvents,
"v+" => \$verbose,
"version" => sub {
die "Version of $version, by Steven J. DeRose, sderose\@acm.org.\n";
},
"voptions!" => \$voptions,
"waitfor=s" => \$waitfor,
"xmllang=s" => \$xmllang
);
if ($help) { showUsage(); exit; }
($result) || die "Bad options.\n";
my $nameField = "%-" . $maxNameWidth . "s";
###############################################################################
# Set implied options, validate option values...
# Whitespace
if ($screen) {
($ENV{COLUMNS} > 0) || die "Can't get env. variable \$COLUMNS.\n";
$width = $ENV{COLUMNS};
}
if ($sdOptions) {
$indent = 1;
$indentComments = 1;
$indentInsideText = 1;
$breakInsideTags = 1;
$keepInternalEntDcls = 0;
}
if ($alphabetizeModels) { $breakModels = 1; }
if ($breakModels) { $keepDTD = 1; }
if ($keepPentities) { $keepDTD = 1; }
#if ($indentString) { $indent = $break = 1; }
if ($indentComments) { $indent = 1; }
if ($indentInsideText) { $breakInsideText = $indent = $break = 1;}
if ($indent) { $break = 1; }
if ($breakInsideAttrs) { $breakInsideTags = 1; }
if ($breakInsideTags) { $break = 1; }
if ($breakInsideText) { $break = 1; }
if ($width) {
($width > 10)
|| die "-width is too small (must be > 10).\n";
}
# Entities
($crBase==10) || ($crBase==16)
|| die "-crbase must be 10 or 16.\n";
$crCase = uc(substr($crCase."U",0,1));
($crCase =~ m/[UL]/)
|| die "-crcase must be L or U.\n";
($crDigits>0)
|| die "-crdigits must be positive.\n";
if ($maxNonEntityChar) {
($maxNonEntityChar <= 0) &&
die "-maxchar value must be > 0.\n";
($maxNonEntityChar > 0xFFFFF) &&
warn "-maxchar value > 0xFFFFF ($maxNonEntityChar) is kinda useless.\n";
}
# Drop
my %dropTagsHash = ();
for my $t (@dropTags) { $dropTagsHash{$t} = 1; }
($verbose) &&
warn "Tags to drop: " . join(" ",sort keys %dropTagsHash) . ".\n";
@dropTags = ();
my %dropElementsHash = ();
for my $t (@dropElements) { $dropElementsHash{$t} = 1; }
($verbose) &&
warn "Elements to drop: " . join(" ",sort keys %dropElementsHash) . ".\n";
@dropElements = ();
my %dropPIsHash = ();
for my $t (@dropPIs) { $dropPIsHash{$t} = 1; }
($verbose) &&
warn "PIs to drop: " . join(" ",sort keys %dropPIsHash) . ".\n";
@dropPIs = ();
my %dropAttrNamesHash = ();
if (0) { # old way, just by name; now we know from dcls
if (!$keepIds) { $dropAttrNamesHash{"rid"} = 1; }
if (!$keepRids) { $dropAttrNamesHash{"id"} = 1; }
}
for my $da (@dropAttrNames) {
if (index($da,"/") >= 0) {
(my $elem = $da) =~ s/\/.*\$//;
(my $attr = $da) =~ s/^.*\/\$//;
$dropAttrNamesHash{$attr} = $elem;
}
else { $dropAttrNamesHash{$da} = 1; }
}
($verbose) &&
warn "Attribute names to drop: "
. join(" ",sort keys %dropAttrNamesHash) . ".\n";
@dropAttrNames = ();
# Assemble a single pattern from all dropAttrPats.
my $dropAttrPatsExpr = "";
foreach my $p (@dropAttrPats) {
$dropAttrPatsExpr .= "($p)|";
}
$dropAttrPatsExpr =~ s/\|\$//;
@dropAttrPats = ();
($verbose) &&
warn "dropAttrPatsExpr = '$dropAttrPatsExpr'.\n";
($verbose) &&
warn "Architectural form attributes: " . join(", ",@archforms) . ".\n";
# Normalize
if ($canonical) { # Per http://www.w3.org/TR/xml-c14n)
$crDigits = 1; # no leading zeros in character references
$crBase = 16;
$crCase = "U";
$keepDoctype = 0;
$keepXmlDcl = 0;
$wsHandling = "D";
$lineends = 1;
}
$emptyFormat = uc(substr($emptyFormat."F",0,1));
($emptyFormat =~ m/[FEHS]/)
|| die "Unknown format '$emptyFormat' for -empty option, must be"
. " Full (), Empty (), Html (), or Sgml ()\n";
if ($nameForm) {
$nameForm = uc(substr("$nameForm ",0,1));
($nameForm =~ /[ULAC]/)
|| die "Invalid form '$nameForm' on -name option.\n";
}
if ($outLineends) {
$outLineends = uc(substr($outLineends."U",0,1));
(index("MDU",$outLineends) >= 0)
|| die "Unknown output line-end type '$outLineends'.\n";
}
if ($textForm) {
$textForm = uc(substr($textForm."F",0,1));
(index("FGNR",$textForm) >= 0)
|| die "Unknown output text output form '$textForm'.\n";
}
if ($wsHandling ) {
$wsHandling = uc(substr($wsHandling."K",0,1));
($wsHandling =~ m/[KDPM]/)
|| die "Unknown white-space only handling '$wsHandling'.\n";
}
$xpath = uc(substr($xpath."?",0,1));
($xpath =~ m/[NSLC]/) ||
die "Invalid format for -xpath option (must be N, S, L or C, not $xpath).\n";
if ($xsl) {
$break = $indent = $breakInsideText = 1;
$breakInsideTags = 1;
$emptyFormat = "E";
$wsHandling = "M";
}
# Add
$fqgiSep = substr($fqgiSep,0,1);
($fqgiSep =~ m/[-_:.\w0-9]/)
&& die "Can't use a name character ($fqgiSep) as fqgisep.\n";
if ($ourns) {
($ourns =~ m/^[\w\d]+$/)
|| die "Invalid -ourns namespace prefix '$ourns'.\n";
}
# Other
($attrQuoteType =~ m/^['"]$/)
|| die "-attrQuoteType must be ' or \".\n";
($catalog eq "") || (-f $catalog)
|| die "Can't find catalog file '$catalog'.\n";
$curlyQuotes = uc(substr($curlyQuotes . "O",0,1));
($curlyQuotes =~ m/^[AON]$/)
|| die "-curlyQuotes value must be [AON], not '$curlyQuotes'.\n";
if ($tracePentities) { $keepDTD = 1; }
$xmllang = uc(substr($xmllang . "O",0,1));
($xmllang =~ m/^[IOE]$/)
|| die "-xmllang must be Explicit, Implicit, or Original, not '$xmllang'.\n";
####### Copy repeatable options from arrays into hash tables....
if ($setdtd eq "docbook") { setupDocbookElements(); }
elsif ($setdtd eq "html") { setupHtmlElements(); }
elsif ($setdtd eq "mathml") { setupMathmlElements(); }
elsif ($setdtd eq "nlm") { setupNlmElements(); }
elsif ($setdtd eq "osis") { setupOsisElements(); }
elsif ($setdtd eq "tei") { setupTeiElements(); }
elsif ($setdtd eq "xsl") { setupXslElements(); }
elsif ($setdtd ne "") {
die "Unknown -setdtd value '$setdtd'.\n";
}
my %displayClass = ();
my $e = "";
for $e (@blockElements) { $displayClass{$e} = "BLOCK"; }
for $e (@inlineElements) { $displayClass{$e} = "INLINE"; }
for $e (@preserveElements) { $displayClass{$e} = "PRESERVE"; }
for $e (@sectionElements) { $displayClass{$e} = "SECTION"; }
my %empties = ();
for my $e (@emptyElements) { $empties{$e} = 1; }
###############################################################################
# Make up the sprintf string used for numeric character references
my $crFormatString = (($crBase == 16) ? "":"")
. "%0$crDigits" . (($crBase == 16) ? "x":"d")
. ";";
if ($crCase eq "U") { $crFormatString = uc($crFormatString); }
# Set up strings to output for built-in XML entities -- named or numeric
if ($xmlEntities eq "name") {
$cr_amp = "&";
$cr_lt = "<";
$cr_gt = ">";
$cr_apos = "'";
$cr_quot = """;
}
else {
$cr_amp = sprintf($crFormatString, 0x26);
$cr_lt = sprintf($crFormatString, 0x3c);
$cr_gt = sprintf($crFormatString, 0x3e);
$cr_apos = sprintf($crFormatString, 0x27);
$cr_quot = sprintf($crFormatString, 0x22);
}
my $newline = chr(10);
if ($outLineends eq "M") { $newline = chr(13); }
elsif ($outLineends eq "D") { $newline = chr(13).chr(10); }
($quiet) || ($catalog eq "") || doWarn( "Using catalog '$catalog'.");
my $file = $ARGV[0] ? $ARGV[0]:"";
(-f $file) || die "Can't find input file '$file'.\n";
# Try to interleave stdout/stderr as desired...
#my $ofh = select(STDOUT);
#$| = 1;
#select($ofh);
###############################################################################
# Declare parsing state information, clear it, and go.
my $repressed = 0; # Are we in an element to delete all of?
my @repressStack = ();
my $attdclbuf = ""; # Saved-up attlist declaration
my %idAttrList = (); # Record all gi/attrnames declared as type ID
my %idrefAttrList = (); # Record all gi/attrnames declared as type IDREF(S)
my %attrDefaults = (); # Record which attrs default to what
# Synchronized stacks (easier than a stack of hashes; maybe)
my @tagStack = (); # Maintained by start/end handlers
my @idStack = (); # ID attr values for open elements
my @childElementCounts = (); # (num of children so far, mostly for -xpath)
my @childNodeCounts = (); # (num of children so far, mostly for -xpath)
my @nsStack = (); # Runs in sync with @tagStack (just prefix lists)
my @nsFullStack = (); # Runs in sync with @tagStack (hash of prefix=>uri)
my @entityStack = (); # Only used with -entities option
my $pastDTD = 0; # To suppress comments from DTD
my $commentCount = 0;
my $parserCatalog;
my $docelStatus = "before"; # later "in" then "after" for Canonical rules.
my $eventCount = 0; # For reporting progress
my $elementCount = 0; # Per document
($voptions) && dumpOptions();
my $totalEventCount = 0; # grand total
my $totalElementCount = 0; # grand total
my $totalDocuments = 0;
my $fh = STDOUT;
foreach my $file (@ARGV) {
$totalDocuments++;
($verbose) && warn "Starting on file '$file'.\n";
if ($outputExtension) {
my $outfile = $file;
$outfile =~ s/\.[^.]+$/\.$outputExtension/;
open $fh, ">$outfile";
}
clearParserState();
parseDocument($file,$fh);
if ($outputExtension) {
close $fh;
}
$totalEventCount += $eventCount;
$totalElementCount += $elementCount;
}
if ($keepComments && $commentCount == 0 && !$quiet) {
doWarn( "WARNING: No comments seen. Parser may have discarded them.");
}
# This is only reporting events/elements in last document... XXX
($quiet) || doWarn( "Done, total of $totalElementCount elements ("
. "$totalEventCount events) processed,"
. " over $totalDocuments document(s).");
exit;
###############################################################################
sub clearParserState {
$repressed = 0; # Are we in an element to delete all of?
@repressStack = ();
$attdclbuf = ""; # Saved-up attlist declaration
%idAttrList = (); # Record all ID values defined
%idrefAttrList = (); # Record all ID values referenced
%attrDefaults = (); # Record which attrs default to what
@tagStack = ();
@idStack = ();
@childElementCounts = ();
@childNodeCounts = ();
@nsStack = ();
@nsFullStack = ();
@entityStack = (); # Only used with -entities option
$pastDTD = 0;
$commentCount = 0;
$eventCount = 0;
$elementCount = 0;
$docelStatus = "before";
}
###############################################################################
sub dumpOptions {
warn "\n\nOPTION SETTINGS:\n";
warn "version [$version]\n";
warn "default_catalog [$default_catalog]\n";
warn "xinclude_ns [$xinclude_ns]\n";
# Options for white-space
warn "\nOptions for white-space wrapping:\n";
warn "breakAtAllAttrs [$breakAtAllAttrs]\n";
warn "breakInsideAttrs [$breakInsideAttrs]\n";
warn "breakInsideTags [$breakInsideTags]\n";
warn "breakInsideText [$breakInsideText]\n";
warn "breakModels [$breakModels]\n";
warn "Pre-defined breaking rules:\n";
warn "break [$break]\n";
warn "breakOutside [$breakOutside]\n";
warn "breakUgly [$breakUgly]\n";
warn "Detailed breaking rules:\n";
warn " inline [$binline]\n";
warn " block [$bblock]\n";
warn " whitespacepres. [$bwsp]\n";
warn " section [$bsection]\n";
warn " other [$bother]\n";
warn " pi [$bpi]\n";
warn " comment [$bcomment]\n";
warn " multi-line-com. [$bmcomment]\n";
warn "mulberry DTD format [$mulberryDtd]\n";
warn "cleanTextNodeEnds [$cleanTextNodeEnds]\n";
warn "indentString [$indentString]\n";
warn "indent [$indent]\n";
warn "indentComments [$indentComments]\n";
warn "indentInsideText [$indentInsideText]\n";
warn "outLineends [$outLineends]\n";
warn "unbreakInsideText [$unbreakInsideText]\n";
warn "width [$width]\n";
warn "wsHandling [$wsHandling]\n";
# Options for entity management
warn "\nOptions for entity management\n";
warn "Character references:\n";
warn " For amp [$cr_amp]\n";
warn " For lt [$cr_lt]\n";
warn " For gt [$cr_gt]\n";
warn " For apos [$cr_apos]\n";
warn " For quot [$cr_quot]\n";
warn " cr_all [$cr_all]\n";
warn " Minumum Digits [$crDigits]\n";
warn " Decimal or Hex [$crBase]\n";
warn " Case for hex 'x' [$crCase]\n";
warn "entities [$entities]\n";
warn "entity ref tracing [$etrace]\n";
warn "maxNonEntityChar [$maxNonEntityChar]\n";
warn "parseParamEnt [$parseParamEnt]\n";
warn "xmlEntities [$xmlEntities]\n";
# Options for things to keep/drop
warn "\nOptions for things to keep/drop\n";
warn "Blank lines [$keepBlankLines]\n";
warn "Default Attributes [$keepDefaultAttrs]\n";
warn "The DTD [$keepDTD]\n";
warn "Comments [$keepComments]\n";
warn "The document [$keepDoc]\n";
warn "The doctype [$keepDoctype]\n";
warn "Ids [$keepIds]\n";
warn "XSLT-generated IDs [$keepGenIds]\n";
warn "genIdPattern [$genIdPattern]\n";
warn "NsAttrs [$keepNsAttrs]\n";
warn "Parameter entities [$keepPentities]\n";
warn "PIs [$keepPIs]\n";
warn "Redundant NS dcls [$keepRedundantNS]\n";
warn "IDREF(S) attributes [$keepRids]\n";
warn "The XML Declaration [$keepXmlDcl]\n";
warn "Suppress all (-nil) [$nil]\n";
# Options for things to normalize
warn "\nOptions for things to normalize\n";
warn "alphabetizeModels [$alphabetizeModels]\n";
warn "barchar [$barchar]\n";
warn "canonical [$canonical] (forces many other options)\n";
warn "educate quotes [$educateQuotes (in non-preserve content)\n";
warn "emptyFormat [$emptyFormat]\n";
warn "idnames [$idnames]\n";
warn "lineends [$lineends]\n";
warn "max name width [$maxNameWidth]\n";
warn "nameForm [$nameForm]\n";
warn "normalizeRuns [$normalizeRuns]\n";
warn "normalizeText [$normalizeText] (not recommended)\n";
# normaliText leads to problems with text-nodes joining up.
warn "normalizeAttrs [$normalizeAttrs]\n";
warn "normalizeBars [$normalizeBars]\n";
warn "xinclude [$xinclude] (not yet implemented)\n";
warn "xsl [$xsl]\n";
# Options for things to add
warn "\nOptions for things to add\n";
warn "elementSpace [$elementSpace]\n";
warn "flagBlockEnds [$flagBlockEnds]\n";
warn "fqgi [$fqgi]\n";
warn "fqgiSep [$fqgiSep]\n";
warn "lnum [$lnum]\n";
warn "ourns [$ourns]\n";
warn "tagSpace [$tagSpace]\n";
warn "xpath [$xpath]\n";
# Other options
warn "\nOther options\n";
warn "catalog [$catalog]\n";
warn "help [$help]\n";
warn "htmlparse [$htmlparse]\n";
warn "only output [id] [$only]\n";
warn "outputExtension [$outputExtension]\n";
warn "progress [$progress]\n";
warn "quiet [$quiet]\n";
warn "quoteType [$quoteType] (not yet implemented)\n";
warn "textForm [$textForm]\n";
warn "traceEvents [$traceEvents]\n";
warn "traceFQGI [$traceFQGI]\n";
warn "trace P Entities [$tracePentities]\n";
warn "traceWsEvents [$traceWsEvents]\n";
warn "tracePrint [$tracePrint]\n";
warn "verbose [$verbose]\n";
warn "voptions [$voptions]\n";
warn "waitfor [$waitfor]\n";
# Lists of XML constructs by name
warn "\nLists of XML constructs by name\n";
warn "blockElements: " . join(", ", @blockElements) . "\n";
warn "emptyElements: " . join(", ", @emptyElements) . "\n";
warn "inlineElements: " . join(", ", @inlineElements) . "\n";
warn "preserveElements:" . join(", ", @preserveElements) . "\n";
warn "sectionElements: " . join(", ", @sectionElements) . "\n";
warn "\n";
warn "dropTags: " . join(", ", @dropTags) . "\n";
warn "dropElements: " . join(", ", @dropElements) . "\n";
warn "dropAttrNames: " . join(", ", @dropAttrNames) . "\n";
warn "dropAttrPats: " . join(", ", @dropAttrPats) . "\n";
warn "dropPIs: " . join(", ", @dropPIs) . "\n";
} # dumpOptions
###############################################################################
sub parseDocument {
use lib "sw/lib/perl";
use XML::Parser;
# http://search.cpan.org/~ebohlman/XML-Catalog-0.02/Catalog.pm
use XML::Catalog;
my $parser;
if ($parseParamEnt) {
$parser = new XML::Parser(ErrorContext => 2, ParseParamEnt => 1);
}
else {
$parser = new XML::Parser(ErrorContext => 2);
}
$parser->setHandlers(
Start => \&startHandler,
End => \&endHandler,
Init => \&initHandler,
Final => \&finalHandler,
Char => \&charHandler,
Proc => \&procHandler,
Comment => \&commentHandler,
Doctype => \&doctypeHandler,
DoctypeFin => \&doctypeFinHandler,
Default => \&defaultHandler);
if ($entities) {
$parser->setHandlers(
ExternEnt => \&externEntHandler,
ExternEntFin => \&externEntFinHandler);
}
else {
if ($catalog ne "") {
$parserCatalog=XML::Catalog->new($catalog);
$parser->setHandlers(
ExternEnt => $parserCatalog->get_handler($parser));
}
}
if ($keepDTD) {
$parser->setHandlers(
Element => \&elementDclHandler,
Attlist => \&attlistDclHandler,
Entity => \&entityDclHandler,
XMLDecl => \&xmlDclHandler);
}
if ($htmlparse) {
die "-html option is not yet finished.\n";
# See perldoc XML::LibXML::Parser. And don't do catalog or dtd handling
# Maybe invoke known list of empties for output?
# $parser->parse_html_file($_[0]);
}
else {
$parser->parsefile($_[0]);
}
if ($break || $breakOutside) {
print $fh $newline;
}
} # sub parseDocument
###############################################################################
#
# Regarding the XML::Parser module:
#
# API doc is at http://search.cpan.org/~msergeant/XML-Parser/Parser.pm
# initHandler: Called at start of parsing
sub initHandler {
startEvent("init");
if (!$keepXmlDcl) {
dumpLine("", "pi");
}
}
# finalHandler: Called at end of parsing
sub finalHandler {
startEvent("final");
dumpAccumulatedAttlistDcl();
(scalar @tagStack) && doWarn("Unclosed elements on stack: " .
join("/",@tagStack));
}
sub startHandler {
my $parser = shift;
my $gi = shift;
my $depth = scalar @tagStack;
my %attlist = ();
my $myNS = (scalar(@nsStack)>0) ? $nsStack[-1]:"";
startEvent("start", "'$gi'");
if ($nameForm ne "") { $gi = fixNameForm($gi); }
if (scalar @childNodeCounts > 0) {
$childElementCounts[-1]++;
$childNodeCounts[-1]++;
}
dumpAccumulatedAttlistDcl();
# Signal that we're really in the document now (not the DTD)
$pastDTD = 1;
$docelStatus = "in";
if ($repressed or (defined $dropElementsHash{$gi})) {
#doWarn( "Repressing $gi");
$repressed = 1;
push @repressStack, $gi;
}
# Organize the attributes into a hash, dropping any we don't want,
# and renaming id-ish attributes if asked.
my $nsPrefixes = "";
my %nsDeclaredHere = ();
while (scalar @_) {
my $attr = shift;
my $value = shift;
# Do we need to save the default namespace, too?
if ($attr =~ m/^xmlns:[^=]/) { # namespace attribute
$nsPrefixes .= "$attr ";
(my $prefix = $attr) =~ s/xmlns://;
$nsDeclaredHere{$prefix} = $value;
}
if (defined $dropAttrNamesHash{$gi}) { next; }
if (!$keepIds && defined $idAttrList{"$gi/$attr"}) { next; }
if (!$keepRids && defined $idrefAttrList{"$gi/$attr"}) { next; }
if ($keepGenIds && $value =~ m/$genIdPattern/o) { next; }
if (!$keepDefaultAttrs) {
if (defined $attrDefaults{"$gi/$attr"}) {
($verbose) && warn "gi $gi, attr $attr, value $value,"
. " vs. dft " . $attrDefaults{"$gi/$attr"} . ".\n";
if ($attrDefaults{"$gi/$attr"} eq $value) { next; }
}
}
# Replace this with single /o 'or' pattern from $dropAttrPatsExpr
#foreach my $p (keys %dropAttrPats) {
# if ($attlist{$k} =~ m/$p/) { next; }
#}
if ($value =~ /$dropAttrPatsExpr/o) { next; }
if ($idnames) {
if ($idAttrList{"$gi/$attr"}) { $attr = "id"; }
elsif ($idrefAttrList{"$gi/$attr"}) { $attr = "rid"; }
}
if ($nameForm ne "") {
$attr = fixNameForm($attr);
}
$attlist{$attr} = $value;
} # while
# Interpet XInclude if needed
if ($xinclude && $myNS eq $xinclude_ns) {
doXInclude();
}
# Add any requested specialty attributes
if ($xpath ne "N") {
$attlist{"$ourns:xpath"} =
makeXPath(\@tagStack,\@childElementCounts,$gi, $xpath);
}
if ($fqgi) {
$attlist{"$ourns:fqgi"} = join($fqgiSep,@tagStack) . "$fqgiSep$gi";
}
if ($lnum) {
$attlist{"$ourns:lnum"} = $parser->current_line;
}
if ($xmllang eq "E" && !defined $attlist{"xml:lang"}) { # Make explicit
if (defined $langStack[-1]) {
$attlist{"xml:lang"} = $langStack[-1];
# warn "Inheriting xml:lang\n";
}
else {
$attlist{"xml:lang"} = "EN";
# warn "Forcing xml:lang\n";
}
}
elsif ($xmllang eq "I" && defined $attlist{"xml:lang"} &&
$attlist{"xml:lang"} eq $langStack[-1]) { # Make implicit
delete $attlist{"xml:lang"};
}
# else leave as in source
# Generate alphabetized/normalized attribute list. Do this even if
# dropping the tag or element, so we track ns attributes right.
# Canonical XML (Clause 2.2) says:
# namespace nodes are sorted lexicographically by local name
# attribute nodes are sorted lexicographically with namespace URI
# as the primary key and local name as the secondary key
my $ats = "";
foreach my $k (sort keys %attlist) {
# Skip this one if it's a redundant namespace attr
if ($k =~ m/^xmlns:[^=]/ &&
!$keepRedundantNS &&
index(" SnsPrefixes ", " $k ") >= 0) {
next;
}
my $cur = $attlist{$k};
(!defined $cur) && warn "Attlist entry for '$k' not defined\n";
# Escape attr per clause 2.3 of http://www.w3.org/TR/xml-c14n
$cur =~ s/\&/$cr_amp/g;
$cur =~ s/\$cr_lt/g;
if ($attrQuoteType eq "\"") {
$cur =~ s/\"/$cr_quot/g;
}
else {
$cur =~ s/\'/$cr_apos/g;
}
$cur =~ s/\t/&\#x9;/g;
$cur =~ s/\n/&\#xA;/g;
$cur =~ s/\r/&\#xD/g;
if ($normalizeAttrs) {
$cur =~ s/\s\s+/ /g;
}
$cur = fixEntities($cur); # Fix to handler $quoteType....
$ats .= " $k=$quoteType$cur$quoteType";
} # foreach
if (defined $dropTagsHash{$gi}) {
if ($tagSpace) { dumpLine(" ", "text"); }
}
elsif ($repressed) {
if ($elementSpace) { dumpLine(" ", "text"); }
}
else {
for ($f=0; $f";
#if ($lnum && $thisline!=$curline) {
# $startTag = "" . $startTag;
#}
dumpLine($startTag, "start", $gi) unless($repressed);
}
push(@tagStack,$gi);
push(@idStack, $attlist{"id"}); # for now; should go by type, not name
push(@childElementCounts,0);
push(@childNodeCounts,0);
push(@nsStack,$nsPrefixes);
push(@nsFullStack, \%nsDeclaredHere);
$elementCount++;
} # startHandler
sub endHandler {
my $gi = $_[1];
startEvent("end", "'$gi'");
if ($nameForm ne "") {
$gi = fixNameForm($gi);
}
dumpAccumulatedAttlistDcl();
my $old = $tagStack[-1];
if ($old ne $gi && scalar(@archforms)>0) {
doWarn( "Got end for '$gi' when expecting '$old'.");
}
else {
pop(@tagStack);
pop(@idStack);
pop(@childElementCounts);
pop(@childNodeCounts);
pop(@nsStack);
pop(@nsFullStack);
}
if (!defined $dropTagsHash{$gi}) {
for ($f=0; $f", "end", $gi);
if ($flagBlockEnds && $displayClass{$gi} eq "BLOCK") {
dumpline("\n", "comment");
}
}
}
# BUG? Only pop if we're ending the thing at stack-top?
if ($repressed) {
pop @repressStack;
if (scalar @repressStack <= 0) { $repressed = 0; }
}
if (scalar @tagStack <= 0) {
$docelStatus = "after";
}
} # endHandler
# May need to drop first newline from nodes in dtd?
sub charHandler {
if (scalar @childNodeCounts > 0) { $childNodeCounts[-1]++; }
my $text = $_[1];
startEvent("char", "'$text'");
dumpAccumulatedAttlistDcl();
if ($repressed) { return; }
if ($text !~ m/\S/) { # white-space-only text node
($verbose>1) && warn "charHandler: white-space-only node.\n";
if ($wsHandling eq "D") {
return;
} # drop
if ($wsHandling eq "P") { # turn into PIs
$text =~ s/\r/\&\#13;/g;
$text =~ s/\n/\&\#10;/g;
$text =~ s/\t/\&\#9;/g;
$text =~ s/\'/\&\#39;/g;
$text =~ s/\?>/\&\#63;>/g;
dumpLine("", "text") unless($repressed);
return;
}
if ($wsHandling eq "M") { # drop unless multi-line
if (!($text =~ m/\n.*\n/)) {
return;
}
$text =~ s/\n//; # just drop one
}
if ($wsHandling ne "K") {
doWarn("Bad white space handling type '$wsHandling'.");
}
} # white-space-only
if ($textForm eq "N") { return; }
$text =~ s/&/$cr_amp/g;
$text =~ s/$cr_lt/g;
if ($canonical) {
$text =~ s/>/$cr_gt/g;
$text =~ s/\r/&\#xD;/g;
}
if ($cr_all) {
$text =~ s/\'/$cr_apos/g;
$text =~ s/\"/$cr_quot/g;
}
if ($unbreakInsideText) {
($verbose>1) && warn "charHandler: unbreaking.\n";
$text =~ s/\012/ /g;
}
if ($normalizeText) {
($verbose>1) && warn "charHandler: normalizing space (excessive).\n";
$text =~ s/\s+/ /g;
$text =~ s/^\s+//;
$text =~ s/\s+$//;
}
if ($normalizeRuns) {
($verbose>1) && warn "charHandler: normalizing space runs.\n";
$text =~ s/\s+/ /g;
}
if ($lineends) {
($verbose>1) && warn "charHandler: normalizing lineends.\n";
$text =~ s/\015\012/\012/g; # normalize line-ends
$text =~ s/\015/\012/g;
}
if ($keepBlankLines) {
($verbose>1) && warn "charHandler: degeminating tabs.\n";
$text =~ s/\012\012/\012/g;
}
my $fixed = "";
if ($textForm eq "G") {
$fixed = greekText($text);
}
elsif ($textForm eq "R") {
$fixed = rot13Text($text);
}
else {
$fixed = fixEntities($text);
}
# ($verbose > 1) && warn "Dumping text: '$fixed'\n";
dumpLine("$fixed", "text") unless($repressed);
} # charHandler
sub procHandler {
my ($p, $name, $data) = @_;
startEvent("proc", "'$name'");
dumpAccumulatedAttlistDcl();
($keepPIs) || return;
#$indentSpaces = curIndent();
if ($repressed or (defined $dropPIsHash{$name})) {
#doWarn( "Repressing PI $name");
}
elsif ($pastDTD && !$repressed) {
if ($canonical && $docelStatus eq "after") { print "\n"; }
dumpLine("$name" .
(($data ne "") ? " $data":"") .
"?>", "pi");
if ($canonical && $docelStatus eq "before") { print "\n"; }
}
} # procHandler
# Discard DTD comments unless asked for DTD.
sub commentHandler {
my $text = $_[1];
dumpAccumulatedAttlistDcl();
$commentCount++; # XXX Perhaps count DTD comments separately?
return unless ($keepComments && !$repressed);
return unless ($pastDTD || $keepDTD);
startEvent("comment", "'$text'");
my $theIndent = curIndent(1);
if ($normalizeBars && ($text =~ m/^\s*[-_*~=.]+\s*$/)) { # bar line
my $len = $width - 9;
$len -= length(curIndent());
$text = ($barchar x $len);
}
elsif ($normalizeBars && ($text =~ m/^\s*$/)) { # blank line
my $len = $width - 9;
$len -= length(curIndent());
$text = (' ' x $len);
}
elsif ($indentComments) {
$text =~ s/^\s+//;
$text =~ s/\s+$//;
$text =~ s/\n\s+/\n$theIndent/g; # Indent to match "", "comment");
if ($canonical && $docelStatus eq "before") { print "\n"; }
} # commentHandler
sub doctypeHandler {
my ($p, $name, $sysid, $pubid, $internal) = @_;
startEvent("doctype", "'$name'");
dumpAccumulatedAttlistDcl();
($keepDoctype) || return;
if ($nameForm ne "") { $name = fixNameForm($name); }
if ($pubid) {
dumpLine("", "dcl");
}
elsif ($sysid) {
dumpLine("", "dcl");
}
else {
dumpLine("", "dcl");
}
} # doctypeHandler
sub doctypeFinHandler {
startEvent("doctypeFin", ("*" x 30));
dumpAccumulatedAttlistDcl();
$pastDTD = 1;
if ($verbose) {
warn "doctypeFin: ID attributes declared: "
. scalar(keys %idAttrList) . ": "
. join(", ", keys %idAttrList) . ".\n";
warn "doctypeFin: IDREF(S) attributes declared: "
. scalar(keys %idrefAttrList) . ": "
. join(", ", keys %idrefAttrList) . ".\n";
}
($keepDoc) || exit;
} # doctypeFinHandler
sub cdataStartHandler {
startEvent("cdata");
# ignored
}
sub cdataEndHandler {
startEvent("cdataEnd");
# ignored
}
# The external entity handlers are only enabled with the -entities option.
sub externEntHandler {
(my $p, my $base, my $sysid, my $pubid) = @_;
startEvent("externEnt", "'$sysid'");
my $entityInfo =
"name='???' base='$base' public='$pubid' system='$sysid' file='???'";
push@entityStack, $entityInfo;
($etrace || $entities) &&
dumpLine("", "pi");
# ($parserCatalog) || warn "\n*** parserCatalog is nil***\n";
return($parserCatalog->get_handler($p)); # not right
}
sub externEntFinHandler {
startEvent("externEntFin");
my $entityInfo = pop @entityStack;
($etrace || $entities) &&
dumpLine("", "pi");
}
###############################################################################
# DTD stuff
sub entityDclHandler {
(my $p, my $name, my $value, my $sysid,
my $pubid, my $ndata, my $isParam) = @_;
my $pflag = ($isParam) ? "%":"";
if (!defined $pubid) { $pubid = ""; }
if (!defined $sysid) { $sysid = ""; }
# Integrate $tracePentities into startEvent() XXX
startEvent("entityDcl", "$pflag$name");
dumpAccumulatedAttlistDcl();
($keepDTD) || return;
if ($isParam) {
if (!$keepPentities) { return; }
}
if (!$keepInternalEntDcls && $value ne "" &&
$sysid eq "" && $pubid eq "" && $ndata eq "" && !$isParam) {
return; # (typically, a character entity dcl)
}
my $buf = "";
dumpLine($buf,"dcl");
} # entityDclHandler
sub elementDclHandler {
(my $p, my $name, my $model) = @_;
startEvent("elementDcl", "'$name'");
dumpAccumulatedAttlistDcl();
($keepDTD) || return;
if ($alphabetizeModels) { # only does if it's all one big model
if ($verbose) {
print "alphabetizing ||$model||\n";
(my $z = $model) =~ s/\w//g;
}
(my $wmodel = $model) =~ s/([?+*])\s*$//;
my $rep = $1;
$wmodel =~ s/^\(\s*//;
$wmodel =~ s/\)\s*$//;
if ($wmodel =~ m/^([\w\#-_.:]+\|)+[\w\#-_.:]+$/) { # a|b|c...
my @items = split(/\|/,$wmodel);
$wmodel = join("|",sort @items);
if (substr($wmodel,0,1) eq "|") {
$wmodel = substr($wmodel,1);
}
$wmodel = "(" . $wmodel . ")" . $rep;
($verbose) && print "***** OR model, " . (scalar @items)
. " tokens, rep '$rep'. Sorted: $wmodel\n";
$model = $wmodel;
}
} # ab
if ($breakModels) { $model = breakModel($model,"MODEL"); }
$model =~ s/\s*$//;
if ($breakModels) {
$model = breakModel($model, "XSL"); # experimental
}
dumpLine(sprintf("",$name,$model), "dcl");
} # sub elementDclHandler
# Multiple attributes are returned as separate events, so we have to
# accumulate them to print a correct single ATTLIST.
sub attlistDclHandler {
(my $p, my $ename, my $aname, my $avalue, my $adft, my $afix) = @_;
startEvent("attlistDcl", "'$ename\@$aname'");
dumpAccumulatedAttlistDcl();
# Track which are real ID/IDREF(S), so we know which to normalize/del.
if ($avalue eq "ID") { $idAttrList{"$ename/$aname"} = 1; }
if ($avalue eq "IDREF" || $avalue eq "IDREFS") {
$idrefAttrList{"$ename/$aname"} = 1;
}
if (!$keepDefaultAttrs) { # Memorize who they are
if ($adft ne "") {
$attrDefaults{"$ename/$aname"} = $adft;
}
}
($keepDTD) || return;
my $abuf;
if ($afix) {
$abuf = sprintf("$nameField $nameField #FIXED %s\n",$aname,$avalue,$adft);
}
else {
$abuf = sprintf("$nameField $nameField %s\n",$aname,$avalue,$adft);
}
if ($attdclbuf eq "") {
$attdclbuf = "";
dumpLine($buf, "pi");
} # sub xmlDclHandler
sub defaultHandler {
my $text = (defined $_[1]) ? $_[1]:"";
# If not -dtd, markup dcls come here -- don't count?
if ($pastDTD || $keepDTD) {
startEvent("default", "'$text'");
}
dumpAccumulatedAttlistDcl();
}
###############################################################################
# startEvent: Called at the beginning of every event.
# Params:
# 0: event type (name of handler subroutine minue 'Handler')
# 1: anything to print following trace message
# XXX Perhaps move dumpAccumulatedAttlist call in here?
#
sub startEvent {
my $etype = $_[0];
my $msg = (defined $_[1]) ? $_[1]:"";
my $leader = "***SAX***";
$eventCount++;
if ($progress > 0 &&
($eventCount % $progress) == 0) {
warn "$leader events handled: $eventCount\n";
}
# Perhaps report FQGI instead of just element on start and end?
my $context = ($traceFQGI) ? join($fqgiSep,@tagStack) : "";
# Print an event tracing message?
if ($traceEvents) {
if ($etype ne "char" ||
$traceWsEvents ||
$msg =~ m/[^\s]/) {
$msg =~ s/\n/\\n/g;
if ($traceFQGI) {
warn "$leader (FQGI $context) " . uc($etype) . " $msg\n";
}
else {
warn "$leader " . uc($etype) . " $msg\n";
}
}
}
elsif ($tracePentities &&
$etype eq "entityDcl" && $msg =~ m/^%/) {
$msg =~ s/\n/\\n/g;
warn "$leader " . uc($etype) . " $msg\n";
}
# If verbose mode, otherwise just once at the very end, check integrity
if ($verbose || $etype eq "final") {
checkStacks();
}
} # startEvent
###############################################################################
# Check integrity of open element, their child-counts, and their namespaces.
sub checkStacks {
if (scalar @tagStack == scalar @childElementCounts &&
scalar @tagStack == scalar @idStack &&
scalar @tagStack == scalar @childNodeCounts &&
scalar @tagStack == scalar @nsStack &&
scalar @tagStack == scalar @nsFullStack) { return; }
warn "ERROR: Stacks are out of sync:\n";
my $i = 0;
while ($i1) && warn "\n******* breakStartTag (ballattrs $breakAtAllAttrs)\n";
# First, normalize space.
(my $s1 = $s) =~ s/\s+/ /g;
my $s1len = length($s1);
# Parse off the element type and following space
$s =~ s/\s*<([-_.:\w]+)\s+//;
my $gi = $1;
my $buf = "<$gi"; # no space since attrs each add one
my $thisAttr = "";
while ($s =~ m/[^\s>]/) { # per attribute
if ($s =~ m/^([-_.:\w]+)=\'/) { # single-quoted
$s =~ s/^([-_.:\w]+=\'[^\']*\')\s*//;
$thisAttr = $1;
}
elsif ($s =~ m/^([-_.:\w]+)=\"/) { # double-quoted
$s =~ s/^([-_.:\w]+=\"[^\"]*\")\s*//;
$thisAttr = $1;
}
else { # shouldn't happen
doWarn( "breakStartTag: Found strange data in start-tag: '$s'.");
doWarn( " Original was: $_[0]");
$rc .= "$buf$newline$ind$s";
$buf = "";
last;
}
# For now, this requires that breakAttrs entail breakAtAllAttrs.
if ($breakAtAllAttrs ||
(length($buf) + length($thisAttr) + 1 > $width-$ind)) {
if ($verbose>2) {
warn "\n******* overflow thisAttr '$thisAttr'\n";
warn "******* dumping buf '$buf'\n";
}
$rc .= $buf;
if ($indentAttrsPastGI) {
$buf = "$newline$ind " . (" " x length($gi));
}
else {
$buf = "$newline$ind";
}
if ($breakInsideAttrs && length($thisAttr) > $width-$ind) {
($verbose>2) && warn "\n******* breaking thisAttr '$thisAttr'\n";
if ($xsl) {
$thisAttr = breakXslExpr($thisAttr);
}
else {
$thisAttr = breakModel($thisAttr, ($gi=~m/^xsl:/)?"XSL":"ATTR");
}
}
}
$buf .= " $thisAttr";
} # while
$rc = "$rc$buf>";
# Check that we didn't toast any data (maybe test one level down?)
my $rclen = length($rc);
if ($rclen < $s1len || $verbose>1) {
($rclen < $s1len) && warn "ERROR: breakStartTag lost data:\n";
warn "\nOriginal ($s1len): '$s1'\n\nResult($rclen): '$rc'\n";
}
return($rc);
} # breakStartTag
# Break up a parenthesized string such as a content model or XSL attr.
# Probably should really build a parse tree and go bottom-up.
# Should also alphabetize OR-groups?
# Make an option to keep preceding token sticky, as in f(x).
# This won't work with quoted parens.
sub breakModel {
my $m = $_[0]; # Content model, attr, or other to break
my $whoseRules = $_[1]; # What is it, anyway?
my $buf = ""; # Results
my $baseIndent = ""; # Initial indent
my $modelIndentString = " "; # Incremental indent
my $lvl = 0; # Current nesting level
# Break it up at (), pretend whitespace doesn't matter
my @parts = ();
if ($whoseRules eq "MODEL") { # (...)*
$m =~ s/\s//g;
@parts = split(/([()][*+?]?)/, $m);
$baseIndent = "" x 29;
}
elsif ($whoseRules eq "XSL") { # foo(...) bar[...]
# Lose some whitespace?
@parts = split(/\w+\(|\)/, $m);
$baseIndent = "";
}
elsif ($whoseRules eq "ATTR") { # ???
@parts = split(/\s+/, $m);
$baseIndent = curIndent(1);
}
else {
die "breakModel: Unknown whoseRules '$whoseRules'.\n";
}
for (my $i=0; $i $width) {
my $part = substr($parts[$i],0,$width);
$part =~ s/[|,\s&][^|,\s&]*$//;
$innerbuf .= "$newline$baseIndent$ind$part";
$parts[$i] = substr($parts[$i],length($part));
}
if ($innerbuf ne "") { $buf .= "$innerbuf$newline"; }
$buf .= "$baseIndent$ind$parts[$i]";
}
}
} # for
$buf =~ s/^\s*//;
($lvl!=0) && ($verbose) &&
warn "Unbalanced () in: '$m'.\n";
return $buf;
} # end breakModel
BEGIN {
my $rc = ""; # Results
my $buf = ""; # Line being built
my $baseIndent = 0; # Initial indent
my $incrIndent = 4; # Incremental indent amount
my $lvl = 0; # Current nesting level
my @margins = (); # Distance over for each lvl.
# Need to pass in the indent to the attribute...
sub breakXslExpr() {
my $m = $_[0]; # Content model, attr, or other to break
$buf = $rc = "";
$baseIndent = ($_[1] > 0) ? $_[1]:curIndent(2);
$modelIndentString = " ";
$lvl = 0;
# Break it up at: and, or, |, &,
my @parts = ();
@parts = split(/( and | or |, |[()\[\]])/, $m);
($verbose) &&
warn "Tokenizes attr '$m' to:\n " . join("\n ",@parts) . "\n";
for (my $i=0; $i $width) {
breakHere();
}
$buf .= "$parts[$i]";
}
} # for
#$buf =~ s/^\s*//;
($lvl!=0) && ($verbose) &&
warn "Unbalanced () in: '$m'.\n";
return ("$rc$buf");
}
sub breakHere() {
$rc .= "$buf$newline";
$buf = "$baseIndent" . ($modelIndentString x $lvl);
}
} # END
###############################################################################
# This buffer is filled in by the attribute declaration handler, and dumped
# when we get any other event. That's because we get a separate event for each
# attribute declared, but have to issue just one combined declaration.
sub dumpAccumulatedAttlistDcl {
if ($keepDTD && ($attdclbuf ne "")) {
dumpLine("$attdclbuf>","dcl");
}
$attdclbuf = "";
}
BEGIN {
# Variables needed for greeking text, so we don't re-assign many times.
my $ucase = "ABCDEFGHIJKLMNOPQRSTUVWXYZAASSTTEEEEIIOOU";
# Include an occasional extra space so word lengths change.
my $lcase = "abcdefghijklmnopqrstuvwxyz aasstteeeeiioou";
my $digit = "1234567890120";
my $finalPunct = "..,,,?!\'";
my $midWordPunct = "-\'";
my $ulen = length($ucase);
my $llen = length($lcase);
my $dlen = length($digit);
my $flen = length($finalPunct);
my $mlen = length($midWordPunct);
sub greekText {
my $n = length($_[0]);
my $rc = "";
for (my $i=0; $i<$n; $i++) {
my $c = substr($_[0],$i,1);
if ($c =~ /[[:upper:]]/) {
$rc .= substr($ucase, int(rand($ulen)),1);
}
elsif ($c =~ /[[:lower:]]/) {
$rc .= substr($lcase, int(rand($llen)),1);
}
elsif ($c =~ /[[:digit:]]/) {
if (rand(1.0) > 0.1) { # delete a few
$rc .= substr($digit, int(rand($dlen)),1);
}
}
elsif ($c =~ /\s/) {
$rc .= " ";
}
elsif (substr("$_[0] ",$i+1,1) =~ /\s/) {
$rc .= substr($finalPunct, int(rand($flen)),1);
}
else { # assume it's mid-word punctuation
$rc .= substr($midWordPunct, int(rand($mlen)),1);
}
}
($verbose>1) && doWarn( "Greeked [$_[0]] to [$rc].");
return($rc);
} # greekText
}
sub rot13Text {
my $n = length($_[0]);
my $ucase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" x 2;
my $lcase = "abcdefghijklmnopqrstuvwxyz" x 2;
my $rc = "";
for (my $i=0; $i<$n; $i++) {
my $c = substr($_[0],$i,1);
if ($c =~ /[A-Z]/) {
$rc .= substr($ucase, index($ucase,$c)+13,1);
}
elsif ($c =~ /[a-z]/) {
$rc .= substr($lcase, index($lcase,$c)+13,1);
}
else {
$rc .= $c;
}
}
return($rc);
} # rot13Text
# Turn non-ASCII characters (any > $maxNonEntityChar) into entities.
# amp, lt, whitespace, etc. are handled elsewhere. Watch performance!
sub fixEntities {
if ($_[0] =~ /^\p{IsASCII}*$/) { # Got only ASCII?
($verbose > 2) && warn "fixEntities-pl($_[0]).\n";
return($_[0]);
}
else {
my $rc = "";
my $tlen = length($_[0]);
for (my $i=0; $i<$tlen; $i++) {
my $o = ord(substr($_[0],$i,1));
if ($o>$maxNonEntityChar) {
$rc .= sprintf($crFormatString,$o);
}
else {
$rc .= substr($_[0],$i,1);
}
} #for
# Normalize was done in charhandler
($verbose > 2) && warn "fixEntities-rc($_[0]).\n";
return($rc);
}
} # fixEntities
# Force an element or attribute name, etc., to desired char set.
# Does not currently get rid of ASCII punctuation.
sub fixNameForm {
my $rc;
if ($nameForm eq "L") { $rc = lc($_[0]); }
elsif ($nameForm eq "U") { $rc = uc($_[0]); }
elsif ($nameForm eq "A") { ($rc = $_[0]) =~ s/\P{IsASCII}/_/g; }
elsif ($nameForm eq "C") { ($rc = $_[0]) =~ s/[^-.:a-zA-Z0-9]/_/g; }
return($rc);
}
###############################################################################
# dumpLine: Write out one tag/text/whatever event, line-breaking correctly.
# Handle when to insert newlines.
# Params: 0: text of line; 1: kind of event; 2: gi if element.
# Notes: char handler may not even pass us white-space-only nodes....
#
BEGIN {
# Track kinds of events happening, for better control of whitespace placement.
# *** Expand 'dcl' event to qw/ dcl pi comment mcomment /
my $lastEvent = "dcl"; # One of qw/ dcl start text end /
my $lastElementClass = ""; # qw/ inline block section preserve /
my $thisEvent;
my $pendingLine = ""; # Hold start-tags til we see if element is empty
my $l;
my $gi;
my $buf;
my $reIndent;
# With -breaks: nl before every line
# With -breaksOutside: nl before anything except end-tag.
# Params:
# 0: text of line to dump
# 1: 'text', 'start', 'end', 'dcl' (helps decide where to break lines)
# 2: element type when applicable, so we can look up for inlines
#
sub dumpLine {
$l = $_[0];
$thisEvent = $_[1];
$gi = $_[2];
$reIndent = ($indent) ? curIndent():"";
my $buf = "";
if ($pendingLine) { # Do we have a start-tag waiting?
$buf = handlePendingLine();
} # $pendingLine
# We should not be called at all if $repressed is true.
if ($thisEvent eq "text") {
$buf = handleTextLine($l);
# ($verbose > 1) && warn "back from handle: '$buf'\n";
}
elsif ($thisEvent eq "start") {
$buf = handleStartTagLine($l,$gi);
}
elsif ($thisEvent eq "end") {
$buf = handleEndTagLine($l,$gi);
}
elsif ($thisEvent =~ m/^(dcl|pi|m?comment)$/) {
$buf = handleDclLine($l);
}
else {
doWarn( "WARNING: Invalid dumpLine event: '$thisEvent'.");
}
# ($verbose > 1) && warn "dump to reallypr: '$buf'\n";
($nil) || reallyPrint($buf, "dumpLine");
$lastEvent = $thisEvent;
} # dumpLine
# What's pending, should be a start-tag.
sub handlePendingLine() {
if ($verbose && !breakAtAllAttrs && !$breakInsideAttrs &&
$pendingLine =~ m/\n.*\n/) {
doWarn( "Got more than 1 newline with pending start-tag.\n");
}
if ($thisEvent ne "end") { # With all but "end", just issue it
reallyPrint($pendingLine, "handlePendingLine-non-end");
$pendingLine = "";
}
# Is $pendingLine hasn't been cleared, then we're now at an end-tag
# immediately after a start-tag, so this must be an empty element.
else {
# Modify the start-tag in $pendingLine as appropriate
if ($emptyFormat eq "F") { # full start/end pair
$pendingLine =~ s/[\r\n ]+$//;
}
elsif ($emptyFormat eq "E") { $pendingLine =~ s|>\s*$|/>|; }
elsif ($emptyFormat eq "H") { $pendingLine =~ s|>\s*$| />|; }
elsif ($emptyFormat eq "S") {
# Don't change the start-tag at all
}
else {
doWarn("Unknown empty element format '$emptyFormat'.");
exit;
}
reallyPrint($pendingLine, "handlePendingLine-empty");
$pendingLine = "";
$lastEvent = $thisEvent;
if ($emptyFormat eq "E" ||
$emptyFormat eq "H" ||
($emptyFormat eq "S" && $displayClass{$gi} eq "EMPTY")
) { # Suppress current line (end-tag)
#doWarn ("Dropping end-tag '$l'\n");
return;
}
}
return("");
} # handlePendingLine
sub handleTextLine() {
my $l = $_[0];
my $buf = "";
($verbose>2) &&
warn "handleTextLine got '$l' (breakInsideText $breakInsideText,"
. " cleanTextNodeEnds $cleanTextNodeEnds)\n";
if ($l =~ m/[^\s]/ && ($breakInsideText || $cleanTextNodeEnds)) {
$l =~ s/^\s\s*//;
$l =~ s/\s\s*$//;
}
if ($breakInsideText) {
if ($l =~ /^\s*$/) { $buf = ""; }
else {
my $ind = ($indentInsideText) ? curIndent():"";
my @tokens = split(/\s+/, $l);
#print scalar @tokens . " tokens: " . join("|",@tokens) . "\n";
$buf .= "\n";
my $lbuf = $ind;
for my $t (@tokens) {
if (length($lbuf) + length($t) > $width) {
$buf .= "$lbuf\n";
$lbuf = "$ind $t";
}
else {
$lbuf .= " $t";
}
}
$buf .= $lbuf;
}
}
elsif ($unbreakInsideText) {
($buf = $l) =~ s/[\n\r]+/ /g;
}
else {
$buf = $l;
}
if ($educateQuotes) {
$buf =~ s/(\s)\"/$1$ldquo/g;
$buf =~ s/\"(\s)/$rdquo/g;
$buf =~ s/(\s)\'/$1$lsquo/g;
$buf =~ s/\'(\s)/$rsquo/g;
}
($verbose>2) && warn "handleTextLine ret '$buf'\n";
return($buf);
} # handleTextLine
sub handleStartTagLine() {
my $l = $_[0];
if ($breakInsideTags) {
$l = breakStartTag($l);
}
if (defined $displayClass{$_[1]} && $displayClass{$_[1]} eq "INLINE") {
$buf = $l;
}
elsif (defined $displayClass{$_[1]} &&
$displayClass{$_[1]} eq "BLOCK" &&
$lastEvent !~ m/^(dcl|pi|m?comment)$/) {
$buf = "$newline$newline$reIndent$l";
}
elsif ($breakUgly) {
($verbose) && doWarn("***");
my $reIndent = curIndent();
($buf = $l) =~ s|>$|$newline$reIndent>|;
}
elsif ($break || $breakOutside) {
if ($lastEvent ne "end") {
$buf = "$newline$reIndent$l";
}
else {
$buf = "$newline$reIndent$l";
}
}
else {
$buf = $l;
}
#warn "Deferring a start-tag for '$_[1]'.\n";
$pendingLine = $buf;
$buf = "";
return($buf);
} # handleStartTagLine
sub handleEndTagLine() {
my $l = $_[0];
my $dclass = (defined $displayClass{$_[1]}) ? $displayClass{$_[1]} : "";
if ($dclass eq "INLINE" ||
$lastEvent eq "start" ||
$lastEvent eq "text") {
$buf = $l;
}
elsif ($dclass eq "BLOCK") {
$buf = "$newline$reIndent$l$newline$newline";
}
elsif ($dclass eq "SECTION") {
$buf = "$newline$reIndent$l$newline$newline$newline";
}
elsif ($break) {
$buf = "$newline$reIndent$l";
}
elsif ($breakOutside) {
$buf = "$l$newline$reIndent";
}
elsif ($breakUgly) {
my $reIndent = curIndent();
($buf = $l) =~ s|>$|$newline$reIndent>|;
}
else {
$buf = $l;
}
return($buf);
} # handleEndTagLine
sub handleDclLine() {
my $l = $_[0];
if ($break || $breakOutside) {
$buf = (lastEvent =~ m/^(dcl|pi|m?comment)$/) ? "":$newline;
$buf .= "$reIndent$l$newline";
}
elsif ($breakUgly) {
my $reIndent = curIndent();
($buf = $l) =~ s|(--n)?>$|$newline$reIndent$1>|;
}
else {
$buf = $l;
}
return($buf);
} # handleDclLine
} # the BEGIN
# Run all printing through here, so we can implement -waitfor and -traceprint.
sub reallyPrint {
if ($waitfor && index($_[0],$waitfor) >= 0) {
$waitfor = "";
}
if ($waitfor) { return; }
my $p = $_[0];
if ($tracePrint) {
my $l = $p;
#$l =~ s/\n/\\n/g;
#$l =~ s/\r/\\r/g;
#$l =~ s/\t/\\t/g;
#$l = "[$_[1]]===>$l<===\n";
warn "reallyPrint: '$l'\n";
}
if ($p =~ /\P{ASCII}/) {
for (my $i=0; $i 0) { $offset = $_[0]; }
return($indentString x ($offset + scalar @tagStack));
}
###############################################################################
sub doWarn {
use IO::Handle;
my $ofh = select(STDOUT);
$ofh->flush();
warn "$_[0]\n";
}
###############################################################################
sub showArchforms {
warn"
The -archform [name] option provides some small support for HyTime architectural
form processing, and xlink processing which is quite similar. In short, using
this option amounts to declaring that the [name] attribute specifies the element
type for the element (generally in a namespace other than the one applicable to
the present explicit element type). With this option, you can cause such values
to replace the explicit element type.
For example, if your DTD contains:
and your document contains:
then specifying the option '-archform xsl-form' will transform that element to:
Options that delete particular tags or element types operate on the original
element type, not on any replacement element type specified via this option.
The -archform option is repeatable. If more than one of the named attributes is
present on an element instance, the one whose -archform option is first applies.";
}
sub showBreakRules {
warn "
There are several pre-defined patterns of line-breaking available:
The default rule is to insert no breaks.
-b breaks before start-tags and after end-tags, except for any elements
known to be inlines (see -inline and -setdtd). There will also be breaks
before and after comments, but not PIs.
-bugly is special: it forces line-breaks before the '>' of start and end tags.
This puts the newlines in markup instead of content, so they will never
produce white-space-only text-nodes that could get in the way.
There are also special options for breaking within start-tags, within individual
attributes, within text content, and within comment text.
If you want finer control, breaks can be set for several classes of events:
Elements known as inline, block, whitespace-preserving (wsp), and section
(elements can be categorized individually, or by -setdtd [dtdname]).
Elements not in any of those 4 categories ('other').
PIs, comments (single-line), and multi-line-comments.
These event classes are referred to by first letter: [ibwsopcm].
For each class, you can choose whether to have a newline in each of 4 places:
Before the start, after the start, before the end, and after the end.
These places are marked (respectively) with 1, 2, 3, and 4 below:
12Hello, world.3
4 (same for inline, block, wsp, and section)
14
14 (same for multi-line comments)
The options are named simply 'b' plus the name of the event class. To determine
the option value, put together four digits: how many newlines for each place.
For example, to have both tags for section-class elements ('s') on their own
separate lines and to double-space before the start-tag, use the option:
-bsection 2111
In some cases, you may want to have a break in place number 1 only if the last
(preceding) event was a certain class. To express such conditions, you can
replace the first integer with a bracketed list of the preceding event classes
with which you *do* want a newline. Only in the case of the categories for
*elements* could the preceding event be either a start or and end (tag).
To distinguish these, use the class letter (from [ibwso]) to indicate the
start-tag, and '/' plus the start letter to indicate the end-tag. For example:
-bb [s/s/b]000
means: set the breaking rule for block elements so there is a
break before the start-tag if the preceding event was: a start-tag for a
section element (s), an end-tag for a section element (/s), or an end-tag
for a block element (/b); and so there are no breaks in the other 3 places.
All the pre-defined breaking patterns (and many more) can be built this way
(except -bugly). For example, -b is the same as:
-bi 0000 -bb 1001 -bp 1001 -bs 1001 -bo 1001 -bp 0000 -bc 1001 -bm 1001
\n";
}
sub showUsage {
warn "
Usage: normalizeXML [options] file
Write out a normalized form of an XML file. For Canonical XML say -can.
However, also lets you chose a wide variety of layouts, and uniformly
add/delete various items.
Details:
Empty elements become start/end pairs, or per -empty option
Whitespace in markup is normalized
Whitespace-only text nodes are dropped
Attributes are alphabetized, always quoted with '\"', and
have internal '\"' replaced by &quo;
Defaulted attributes become explicit
CDATA marked sections go away in favor of just escaping
Characters > 255 become 5-digit '#x' entities (does not do Unicode normzn)
External parsed entities become inline data.
DOCTYPE declaration is deleted.
CRLF, CR, and LF are normalized to just LF (d10), or per -outlineends.
-sd Set my preferrred layout. Merely shorthand for:
-i -ic -it -btags -noientities
Whitespace options (for more details, use '-help-breaks'):
NOTE: Negate a flag option by prefixing 'no', e.g. -nob to negate -b.
-b Break lines around each tag or PI, and at EOF.
(see -outlineends, below, to control line-end type).
-ballattrs Break before every attributes (implies -btags).
-battrs Break up long attributes (implies -btags) (experimental).
-btags Break and indent within long start-tags (implies -i).
-btext Break and indent text nodes (implies -i). See also -it.
WARNING: Parser may separate text nodes.
(may get break/indent around character refs, \\n).
-bmodels Break/indent content models (implies -dtd). (unfinished).
-boutside Break lines 'outside': before start-tags, after ends.
-bugly Break in 'ugly' but safe places: inside end of tags.
-i Indent the output.
-iattrspastgi With -btags, line up attribute names past element name.
-ic Indent lines of multi-line comments (implies -i).
-istring 's' String to repeat for each indent level (default ' ').
('| ' makes it easy to see element scopes).
-it Indent text broken by -btext (implies -btext, -i, -b).
-lineends Normalize incoming lineends to Unix-style.
-mulberryDtd Lay out DTD in as for mulberrytech.com (unfinished).
-outlineends x Use M(ac), D(OS), or U(nix: default) style line ends.
-width n Set width for breaking lines (default $dft_width, min 10)
(\$COLUMNS has real terminal width, if exported).
-ws type How to handle white-space-only text nodes (default = D)
K(eep), D(rop), P(turn into PIs), M (keep multi-line)
Entity options:
-crbase 10|16 Write numeric character references in decimal or hex.
-crcase u|l Write hex numeric char refs in u(pper) or l(ower) case
-crdigits n Minimum number of digits to write for character refs.
-crxml form Output amp, lt, etc. as 'name' (amp, lt) or 'number'.
-entities Preserve external entities by adding start/end PIs
(for now, entities are not expanded, you just get PIs).
-etrace Report when entities are referenced.
Things normally kept, that you can drop:
-nil Suppress everything (except tracing and errors).
-noattr 'name' Delete XML attribute 'name' (repeatable).
-noattrpat 'pat' Delete XML attributes matching 'pat' (experimental).
-noblanklines Drop blank lines.
-nocomments Delete XML comments.
-nodefaultattrs Delete attributes that can be defaulted (experimental).
-nodoc Delete the document (mainly useful with -dtd).
-nodoctype Delete XML DOCTYPE declaration.
-noelement 'name' Delete XML tags and content for 'name' (repeatable).
-noendtags Delete all end-tags.
-nogenids Delete XSLT generate-id() values.
(Xalan/Saxon: =\"[TNF]0x[0-9a-f]*[N\.]0x[0-9a-f]*\").
-noids Delete XML attributes named 'id' (see also -norid).
-noientities Delete internal entities when displaying the DTD.
-nonsattrs Delete XML namespace attributes.
-nopi 'name' Delete XML processing instrs of 'name' (repeatable).
-nopis Delete processing instructions.
-norids Delete XML attributes named 'rid' (by name, not type).
-notag 'name' Delete XML start and end tags for 'name' (repeatable).
-noxmldcl Delete XML declaration
-text 'form' Print text as F(ull), N(one), G(reeked), or R(ot13).
Things normally dropped, that you can keep:
-dtd Include the DTD (with parameter entities expanded!)
(for now, excludes entity declarations).
-pentities Include parameter entity declarations (implies -dtd).
(not the same as -noparam!).
Things to normalize:
-amodels Alphabetize names in OR models (implies -bmodels)
-can Canonical: -noxmldcl -nodoctype -ws D -lineends
-crdigits 1 -cdbase 16 -crcase U, and
also invokes special escaping and whitespace handling
to conform to Canonical XML (www.w3.org/TR/xml-c14n).
-educateQuotes Make straight into curly quotes in non-preserve content.
-idnames Normalize names of ID-related attributes.
-maxnamewidth n Size column to allow for XML names in DTD (if there are names
over this width, they are *not* truncated).
-normalizetext Normalize whitespace in text nodes (not recommended).
-normalizeruns Normalize *runs* of whitespace in text nodes (safer).
-normalizeattrs Normalize whitespace inside attribute values
(this could change the meaning, for example in XSLT).
-normalizebars Normalize length of comments containing only '='.
-xsl Format XSL code nicely.
Things to add:
-elementSpace Insert a space for -noelement elements (see also -tagspace).
-flagBlockEnds Add an 'end' comment at the end of block elements.
-fqgi Add nx:fqgi attribute with list of ancestor element types
(you can use this and then 'grep' in the result...).
-fqgisep char Use between -fqgi element names (default '$dft_fqgisep').
-lnum Add nx:lnum giving the source line-number of each element.
-ourns [prefix] Namespace prefix for -xpath, -fqgi, -lnum (default 'nx').
-tagspace Insert space in place of -notag tags (see -elementspace).
-xpath [form] Add attribute nx:xpath with an XPath to the element.
Form = None, Short, Long, or Cseq.
Element layout class options:
-block 'gi...' Element types to put extra space before (repeatable).
-emptyElements 'gi...'
Element types to treat as EMPTY when they are empty
(only important with '-emptyFormat S' and no DTD).
-inline 'gi...' Element types to treat as inline (repeatable).
-preserve 'gi...' Element types to treat as space-preserving (repeatable).
Use this for code-blocks, examples, etc.
-section 'gi...' Element types to treat as big blocks (repeatable).
These get some extra whitespace above/below.
-setdtd name Set block, empty, inline, preserve, and section as
for the named DTD (repeatable). Known DTDs are:
tei, docbook, html, osis.
Other options:
-archform name Output the value of the 'name' attribute as the element
type for elements it appears on (see -help-archform)
-attrquotetype 'c' Specify single or double quote to go around attributes.
-c name Use XML catalog 'name' (default = $default_catalog).
-curlyQuotes opt A: Turn plain quotes to curly in content. O: leave as
in source (default). N: never have curly quotes.
-emptyFormat opt Write empty elements as Full: tart and end (default),
Empty, HTML: empty with space like
,
or SGML: start-tag only (see -emptyElements).
-html Use libxml's HTML parsing option.
-maxchar n Maximum character value to *not* make into an entity.
Warning: Latin-1 differs from UTF-8, so if this is >127,
display on XTerms, Browsers, etc. may be wrong.
-names form Force element/attribute names to Upper, Lower, Ascii,
or Clean (clean means s/[^-.:a-zA-Z0-9]/_/g).
Does not affect DTD or ID values yet.
-noparam Don't parse parameter entities or DTD.
-only [id] Only output the element with id 'id' (not yet).
-out ext Save each file's result in [filename].ext (vs. stdout)
-progress n Report progress every n parser events (counting element
starts/ends, text chunks, declarations if -dtd,...
-q Suppress most messages.
-traceevents Report every parser event (except whitespace ones).
-tracefqgi Report element context (implies -traceevents).
-tracepentities Report parameter entity declaration events, and normalizes
space in internal values (implies -dtd).
-tracewsevents Report parser whitespace events.
-v Add more messages, and check integrity frequently.
-voptions Show all option settings before running.
-version Display version info and exit ($version, sjd).
-waitfor string Don't output anything until the string occurs.
-xmllang [opt] Explicit: Add xml:lang attribute on all elements.
Implicit: Remove xml:lang where it can inherit.
Original: Leave as in source.
Known Bugs/Limitations (please report any other bugs to sderose\@acm.org):
There appears to be a bug with -empty not actually suppressing the end-tag.
Use -noparam if you're getting undefined-entity errors.
Redundant-namespace declaration removal is not thoroughly tested.
-bmodels indents at (), but not within a single ()-group.
ID/IDREF dropping by declared value (instead of name) is not finished.
IDs are only output for -flagblockends if they are *named* 'id'.
Ordering of namespaces/attributes may not be perfect for Canonical XML.
Requires DOCTYPE in order to parse a DTD file.
Escaping quotes and ampersands in entity replacement text not fully tested.
Related commands:
postdiff - Runs any command (like this one) on two files, then diffs them.
nameEntities - Changes numeric XML character references to named.
splitat - Just break file at tags; faster but less flexible.
hilite - Can colorize our output with its '-xml' option.
";
}