#!/usr/bin/perl -w # # normalizeXML: Lay out XML in a wide choice of ways. # # 2006-10-05: Written by Steven J. DeRose, sderose@acm.org. # ... # 2008-01-07 sjd: Resolve perl -w warnings. # 2008-01-09 sjd: Start -archform. -ientities for drop/keep internal entities. # 2008-02-01 sjd: Clean up options a little more. More on -noids, -norids # 2008-02-06 sjd: Fix -xpath. Distinguish childElementCounts/childNodeCounts. # Add -endtags. Fix -comments, mistaken force of -indent from -istring. # Add -flagBlockEnds, @idStack. # 2008-03-19 sjd: Make portable. Start fixing empty-element handling, adding # preserve-space and block element classes. Revamp class and dtd handling. # 2008-03-21 sjd: Add -nil, -etrace. Fix event tracing, add -tracewsevents. # Work on trapping internal (especially %) entities. # 2008-04-03 sjd: Fix uninit'd vars in DTD-handling. Don't double-space dtd. # 2008-04-04 sjd: Improve param. ent. output, DTD layout. Add -maxnamewidth. # 2008-05-05 sjd: Add -progress, move in event-tracing, -tracefqgi. # 2008-08-15 sjd: Add -curlyQuotes, -attrQuoteType, -xmllang. Rename -empty.... # # To do: # *** Don't write end-tag with -empty E # *** Way to drop ID/IDREF by type, not just name (partly done) # *** Control col widths for DTD output. Correct placement of "]" # *** Add Mulberry DTD format option # *** -only # *** Fix long empty comments, bars in multi-line comments, and # bars serving to center text, to match -normalizebars width. # Option to generate CDATA marked sections for text nodes within # certain element types, or t.n. with too many escapes # *** Escape as needed in -tracepentities output # *** Implement -bmodels # *** Way to declare certain PIs to open/close indentation # *** Make work better with dtd file that has no DOCTYPE dcl # *** Handle undefined entities and recover. # *** Join successive text nodes even with -b and -i? # Support XInclude? xi:include, xi:fallback. # Option to n-space before comment bars when prior item is not a comment # Suppress wide-char warning when in internal entity dcl # Finish -entities # In @select, @test, and @match of xsl elements, break at =,and,or,][,/ # Normalize charset of names and/or content (spaces, dashes, digits, all)? # Option to put type-attr? + id into -flagBlockEnds # Option to extract a single element by xpath or id, or split at certain # elements, doing the entity/PI stuff right? # Option to add IDs to certain element types -- gi+xptr or similar. # Suppress replacement of parameter entities in DTD # Pull out all content models (and names?) into % entities a la TEI. # In entity tracing, report location in entire entity state # In event tracing, option to skip whitespace or all text nodes. # Option to make content of some element types CDATA marked sections. # NOTE: To add an option, add to all of: history, variable dcl, option parse, # option validation, printout in dumpOptions, actual use, and help text. #use strict; use Getopt::Long; my $version = "2008-08-16"; my $default_catalog = ""; my $dft_fqgisep = "/"; my $dft_indentString = " " x 4; my $dft_outLineends = "U"; # Unix-style my $dft_quoteType = "\""; my $dft_width = 80; my $xinclude_ns = "http://www.w3.org/2001/XInclude"; # Options for white-space. my $break = 0; # Break output into lines? my $breakAtAllAttrs = 0; # Break before every attribute? my $breakInsideAttrs = 0; # Break up long individual attributes? my $breakInsideTags = 0; # Break up long start-tags? my $breakInsideText = 0; # Break text nodes into lines? my $breakModels = 0; # Wrap and indent content models (implies -dtd)? my $breakOutside = 0; # Break around outside of (non-inline) elements? my $breakUgly = 0; # Are we breaking before end-of-tag? my $mulberryDtd = 0; # Lay out DTD in Mulberrytech.com style. # New break-rules my ($bi, $bb, $bw, $bs, $bo, $bp, $bc, $bm) = "0000"; my $cleanTextNodeEnds = 0; # Don't do this my $indent = 0; # Are we indenting? my $indentAttrsPastGI = 0; # When splitting tags, where to line up attrs my $indentComments = 0; # Indent within multi-line comments? my $indentInsideText = 0; # With breakInsideText, do we also indent? my $indentString = $dft_indentString; # String to repeat per indent level my $outLineends = $dft_outLineends; my $screen = 0; # Set to screen width my $unbreakInsideText = 0; # Unbreak multi-line text nodes into 1 line? my $width = $dft_width; # Width to break to my $wsHandling = "D"; # Keep, Drop(dft), ->Pis, M keep only if >1 line # Options for entity management my $cr_amp = "&"; # Predefined XML entities my $cr_lt = "<"; my $cr_gt = ">"; my $cr_apos = "'"; my $cr_quot = """; my $cr_all = 0; # ***** FIX ***** my $crDigits = 5; # Min number of digits for numeric entities my $crBase = 16; # Hex or decimal? my $crCase = 'L'; # Case for [a-f] in hex character references my $ldquo = "“"; # How to write out quotes (no options yet) my $rdquo = "”"; my $lsquo = "‘"; my $rsquo = "’"; my $entities = 0; # Preserve entity bounds as PIs my $etrace = 0; # Report everyplace an entity is referenced. my $maxNonEntityChar = 127; # Chars > this become numeric entities my $parseParamEnt = 1; # Handle external parameters entities? my $xmlEntities = "name"; # name|number # Options for things normally kept, that you can drop: my $keepBlankLines = 1; # Retain blank lines? my $keepDefaultAttrs = 1; # Attributes that can be defaulted? my $keepComments = 1; # Retain all comments? my $keepDoc = 1; # The document itself? my $keepDoctype = 1; # The DOCTYPE declaration? my $keepEndTags = 1; # All end-tags? my $keepGenIds = 1; # Attribute values from generate-id()? my $genIdPattern = "=\\\"[TNF]0x[0-9a-f]*[N\.]0x[0-9a-f]*\\\""; my $keepIds = 1; # ID attributes? my $keepInternalEntDcls = 1; # (reset to shorten DTD, like from entity sets) my $keepNsAttrs = 1; # Namespace attributes in general? my $keepPIs = 1; # All processing instructions? my $keepRedundantNS = 1; # Redundant ns declarations? my $keepRids = 1; # Reference-to-IDs? my $keepXmlDcl = 1; # The XML declaration? my $nil = 0; # Suppress all dtd/document output. # Options for things normally dropped, that you can keep: my $keepDTD = 0; # Write out the whole DTD? my $keepPentities = 0; # Parameter entity declarations? # Options for things to normalize my $alphabetizeModels = 0; # Alphabetize tokens in OR content models my $barchar = "="; # For -normalizebars my $canonical = 0; my $maxNameWidth = 20; # For arranging DTD nicely my $educateQuotes = 0; # Curly quotes in content (not preserveElements) my $emptyFormat = "F"; # Full start-end pairs my $idnames = 0; # Rename all ID attributes to 'id', etc.? my $lineends = 0; # Normalize incoming line-ends? my $nameForm = ""; # Can switch to Upper, Lower, Ascii, Clean my $normalizeRuns = 0; # Normalize internal multi-space runs in text. my $normalizeText = 0; # Do normalize-space on text nodes? # This leads to problems with text-nodes joining up. my $normalizeAttrs = 1; # Do normalize-space on attribute values? my $normalizeBars = 1; # Adjust length of my $xinclude = 0; # Interpret XInclude directives my $xsl = 0; # Assume input is XSL # Options for things to add my $elementSpace = 1; # Put in a space when dropping elements? my $flagBlockEnds = 0; # Put a comment at end of block-elements my $fqgi = 0; # Show ancestry for non-inline elements my $fqgiSep = $dft_fqgisep; # Delimiter between gis in fqgis my $lnum = 0; # Are we inserting line number comments? my $ourns = "nx"; # Namespace prefix for -fqgi, -xpath, -linenum my $tagSpace = 1; # Put in a space when dropping tags? my $xpath = "N"; # Add attr with xpath to element # Other options my @archforms = (); # Hytime-like arch. form attributes my $attrQuoteType = "\""; # Char to use around attribute values my $catalog = $default_catalog; my $curlyQuotes = "O"; # (letter) O means leave quotes as in source my $help = 0; my $htmlparse = 0; # Try to handle ill-formed HTML my $only = ""; # Only output for this ID value. my $outputExtension = ""; # If non-nil, save to file(s), not stdout my $progress = 0; # How often to report progress (0=never) my $quiet = 0; my $quoteType = $dft_quoteType; my $sdOptions = 0; # Set my preferred options? my $textForm = "F"; # Display full text/greek/rot13/none my $traceEvents = 0; # Report parser events (except whitespace). my $traceFQGI = 0; # and include element contexts my $tracePentities = 0; # Report parameter entity declarations my $traceWsEvents = 0; # Report parser whitespace events. my $tracePrint = 0; # Detailed debug info from dumpLine() my $verbose = 0; my $voptions = 0; my $waitfor = ""; # Suppress output til this string shows up my $xmllang = "O"; # Mess with inherited xml:lang attributes? # Elements to treat specially re. whitespace/syntax my $setdtd = ""; # Set following as for a given DTD? my @blockElements = (); # What elements should get extra space before? my @emptyElements = (); # Elements declared EMPTY (for SGML output) my @inlineElements = (); # What elements shouldn't break anyway? my @preserveeElements = (); my @sectionElements = (); my @dropTags = (); # Delete these tags (not elements)? my @dropElements = (); # Delete these whole elements my @dropAttrNames = (); # Delete these attributes my @dropAttrPats = (); # Delete attrs with values matching these exprs my @dropPIs = (); # Delete PIs with these names ############################################################################### Getopt::Long::Configure ("ignore_case"); my $result = GetOptions( # Options for whitespace "b|break!" => \$break, "bmodels!" => \$breakModels, "boutside!" => \$breakOutside, "ballattrs!" => \$breakAtAllAttrs, "battrs!" => \$breakInsideAttrs, "btext!" => \$breakInsideText, "btags!" => \$breakInsideTags, "bugly!" => \$breakUgly, "i|indent!" => \$indent, "iattrspastgi!" => \$indentAttrsPastGI, "icomments!" => \$indentComments, "istring=s" => \$indentString, "itext!" => \$indentInsideText, "linends|lineends!" => \$lineends, "mulberryDtd!" => \$mulberryDtd, "outlinends|outlineends=s"=> \$outLineends, "screen" => \$screen, "width=i" => \$width, "ws=s" => \$wsHandling, # new break rules "binline=s" => \$binline, "bblock=s" => \$bblock, "bwhitespacepreserve=s" => \$bwsp, "bsection=s" => \$bsection, "bother=s" => \$bother, "bpi=s" => \$bpi, "bcomment=s" => \$bcomment, "bmcomment=s" => \$bmcomment, # repeatable: whitespace treatment of specific elements "setdtd=s" => \$setdtd, "block=s" => \@blockElements, "emptyElements=s" => \@emptyElements, "inline=s" => \@inlineElements, "preserve=s" => \@preserveeElements, "section=s" => \@sectionElements, # Options for entity management "crbase=i" => \$crBase, "crcase=s" => \$crCase, "crdigits=i" => \$crDigits, "entities" => \$entities, "etrace" => \$etrace, "maxchar=i" => \$maxNonEntityChar, "names=s" => \$nameForm, "param!" => \$parseParamEnt, # Options for things to keep/drop "dropattr=s" => \@dropAttrNames, "dropattrpat=s" => \@dropAttrPats, "dropelement=s" => \@dropElements, "droppi=s" => \@dropPIs, "droptag=s" => \@dropTags, "blanklines!" => \$keepBlankLines, "comments!" => \$keepComments, "defaultattrs!" => \$keepDefaultAttrs, "doc!" => \$keepDoc, "doctype!" => \$keepDoctype, "dtd!" => \$keepDTD, "endtags!" => \$keepEndTags, "genids!" => \$keepGenIds, "ids!" => \$keepIds, "ientities!" => \$keepInternalEntDcls, "nil!" => \$nil, "nsattrs!" => \$keepNsAttrs, "pentities!" => \$keepPentities, "pis!" => \$keepPIs, "rids|idrefs!" => \$keepRids, "text=s" => \$textForm, "xmldcl!" => \$keepXmlDcl, # Options for things to normalize "amodels!" => \$alphabetizeModels, "barchar=s" => \$barchar, "can|canonical!" => \$canonical, "educatequotes!" => \$educateQuotes, "emptyFormat=s" => \$emptyFormat, "idnames" => \$idnames, "maxNameWidth=i" => \$maxNameWidth, "ntext|normalizetext!" => \$normalizeText, "nruns|normalizeruns!" => \$normalizeRuns, "nattrs|normalizeattrs!" => \$normalizeAttrs, "nbars|normalizebars!" => \$normalizeBars, "xi|xinclude!" => \$xinclude, "xsl!" => \$xsl, # Options for things to add "element-space|elementspace!"=> \$elementSpace, "flagBlockEnds!" => \$flagBlockEnds, "fqgi!" => \$fqgi, "fqgisep=s" => \$fqgiSep, "lnum!" => \$lnum, "ourns=s" => \$ourns, "tagspace!" => \$tagSpace, "xpath=s" => \$xpath, # Other options "archform=s" => \@archforms, "attrQuoteType=s" => \$attrQuoteType, "c=s" => \$catalog, "curlyQuotes=s" => \$curlyQuotes, "h|help|?" => sub { showUsage(); exit; }, "help-archforms" => sub { showArchforms(); exit; }, "help-breaks" => sub { showBreakRules(); exit; }, "html!" => \$htmlparse, "only=s" => \$only, "out=s" => \$outputExtension, "progress=n" => \$progress, "q!" => \$quiet, "sd|sjd!" => \$sdOptions, "traceprint!" => \$tracePrint, "traceevents!" => \$traceEvents, "tracefqgi!" => \$traceFQGI, "tracepentities!" => \$tracePentities, "tracewsevents!" => \$traceWsEvents, "v+" => \$verbose, "version" => sub { die "Version of $version, by Steven J. DeRose, sderose\@acm.org.\n"; }, "voptions!" => \$voptions, "waitfor=s" => \$waitfor, "xmllang=s" => \$xmllang ); if ($help) { showUsage(); exit; } ($result) || die "Bad options.\n"; my $nameField = "%-" . $maxNameWidth . "s"; ############################################################################### # Set implied options, validate option values... # Whitespace if ($screen) { ($ENV{COLUMNS} > 0) || die "Can't get env. variable \$COLUMNS.\n"; $width = $ENV{COLUMNS}; } if ($sdOptions) { $indent = 1; $indentComments = 1; $indentInsideText = 1; $breakInsideTags = 1; $keepInternalEntDcls = 0; } if ($alphabetizeModels) { $breakModels = 1; } if ($breakModels) { $keepDTD = 1; } if ($keepPentities) { $keepDTD = 1; } #if ($indentString) { $indent = $break = 1; } if ($indentComments) { $indent = 1; } if ($indentInsideText) { $breakInsideText = $indent = $break = 1;} if ($indent) { $break = 1; } if ($breakInsideAttrs) { $breakInsideTags = 1; } if ($breakInsideTags) { $break = 1; } if ($breakInsideText) { $break = 1; } if ($width) { ($width > 10) || die "-width is too small (must be > 10).\n"; } # Entities ($crBase==10) || ($crBase==16) || die "-crbase must be 10 or 16.\n"; $crCase = uc(substr($crCase."U",0,1)); ($crCase =~ m/[UL]/) || die "-crcase must be L or U.\n"; ($crDigits>0) || die "-crdigits must be positive.\n"; if ($maxNonEntityChar) { ($maxNonEntityChar <= 0) && die "-maxchar value must be > 0.\n"; ($maxNonEntityChar > 0xFFFFF) && warn "-maxchar value > 0xFFFFF ($maxNonEntityChar) is kinda useless.\n"; } # Drop my %dropTagsHash = (); for my $t (@dropTags) { $dropTagsHash{$t} = 1; } ($verbose) && warn "Tags to drop: " . join(" ",sort keys %dropTagsHash) . ".\n"; @dropTags = (); my %dropElementsHash = (); for my $t (@dropElements) { $dropElementsHash{$t} = 1; } ($verbose) && warn "Elements to drop: " . join(" ",sort keys %dropElementsHash) . ".\n"; @dropElements = (); my %dropPIsHash = (); for my $t (@dropPIs) { $dropPIsHash{$t} = 1; } ($verbose) && warn "PIs to drop: " . join(" ",sort keys %dropPIsHash) . ".\n"; @dropPIs = (); my %dropAttrNamesHash = (); if (0) { # old way, just by name; now we know from dcls if (!$keepIds) { $dropAttrNamesHash{"rid"} = 1; } if (!$keepRids) { $dropAttrNamesHash{"id"} = 1; } } for my $da (@dropAttrNames) { if (index($da,"/") >= 0) { (my $elem = $da) =~ s/\/.*\$//; (my $attr = $da) =~ s/^.*\/\$//; $dropAttrNamesHash{$attr} = $elem; } else { $dropAttrNamesHash{$da} = 1; } } ($verbose) && warn "Attribute names to drop: " . join(" ",sort keys %dropAttrNamesHash) . ".\n"; @dropAttrNames = (); # Assemble a single pattern from all dropAttrPats. my $dropAttrPatsExpr = ""; foreach my $p (@dropAttrPats) { $dropAttrPatsExpr .= "($p)|"; } $dropAttrPatsExpr =~ s/\|\$//; @dropAttrPats = (); ($verbose) && warn "dropAttrPatsExpr = '$dropAttrPatsExpr'.\n"; ($verbose) && warn "Architectural form attributes: " . join(", ",@archforms) . ".\n"; # Normalize if ($canonical) { # Per http://www.w3.org/TR/xml-c14n) $crDigits = 1; # no leading zeros in character references $crBase = 16; $crCase = "U"; $keepDoctype = 0; $keepXmlDcl = 0; $wsHandling = "D"; $lineends = 1; } $emptyFormat = uc(substr($emptyFormat."F",0,1)); ($emptyFormat =~ m/[FEHS]/) || die "Unknown format '$emptyFormat' for -empty option, must be" . " Full (), Empty (), Html (), or Sgml ()\n"; if ($nameForm) { $nameForm = uc(substr("$nameForm ",0,1)); ($nameForm =~ /[ULAC]/) || die "Invalid form '$nameForm' on -name option.\n"; } if ($outLineends) { $outLineends = uc(substr($outLineends."U",0,1)); (index("MDU",$outLineends) >= 0) || die "Unknown output line-end type '$outLineends'.\n"; } if ($textForm) { $textForm = uc(substr($textForm."F",0,1)); (index("FGNR",$textForm) >= 0) || die "Unknown output text output form '$textForm'.\n"; } if ($wsHandling ) { $wsHandling = uc(substr($wsHandling."K",0,1)); ($wsHandling =~ m/[KDPM]/) || die "Unknown white-space only handling '$wsHandling'.\n"; } $xpath = uc(substr($xpath."?",0,1)); ($xpath =~ m/[NSLC]/) || die "Invalid format for -xpath option (must be N, S, L or C, not $xpath).\n"; if ($xsl) { $break = $indent = $breakInsideText = 1; $breakInsideTags = 1; $emptyFormat = "E"; $wsHandling = "M"; } # Add $fqgiSep = substr($fqgiSep,0,1); ($fqgiSep =~ m/[-_:.\w0-9]/) && die "Can't use a name character ($fqgiSep) as fqgisep.\n"; if ($ourns) { ($ourns =~ m/^[\w\d]+$/) || die "Invalid -ourns namespace prefix '$ourns'.\n"; } # Other ($attrQuoteType =~ m/^['"]$/) || die "-attrQuoteType must be ' or \".\n"; ($catalog eq "") || (-f $catalog) || die "Can't find catalog file '$catalog'.\n"; $curlyQuotes = uc(substr($curlyQuotes . "O",0,1)); ($curlyQuotes =~ m/^[AON]$/) || die "-curlyQuotes value must be [AON], not '$curlyQuotes'.\n"; if ($tracePentities) { $keepDTD = 1; } $xmllang = uc(substr($xmllang . "O",0,1)); ($xmllang =~ m/^[IOE]$/) || die "-xmllang must be Explicit, Implicit, or Original, not '$xmllang'.\n"; ####### Copy repeatable options from arrays into hash tables.... if ($setdtd eq "docbook") { setupDocbookElements(); } elsif ($setdtd eq "html") { setupHtmlElements(); } elsif ($setdtd eq "mathml") { setupMathmlElements(); } elsif ($setdtd eq "nlm") { setupNlmElements(); } elsif ($setdtd eq "osis") { setupOsisElements(); } elsif ($setdtd eq "tei") { setupTeiElements(); } elsif ($setdtd eq "xsl") { setupXslElements(); } elsif ($setdtd ne "") { die "Unknown -setdtd value '$setdtd'.\n"; } my %displayClass = (); my $e = ""; for $e (@blockElements) { $displayClass{$e} = "BLOCK"; } for $e (@inlineElements) { $displayClass{$e} = "INLINE"; } for $e (@preserveElements) { $displayClass{$e} = "PRESERVE"; } for $e (@sectionElements) { $displayClass{$e} = "SECTION"; } my %empties = (); for my $e (@emptyElements) { $empties{$e} = 1; } ############################################################################### # Make up the sprintf string used for numeric character references my $crFormatString = (($crBase == 16) ? "&#x":"&#") . "%0$crDigits" . (($crBase == 16) ? "x":"d") . ";"; if ($crCase eq "U") { $crFormatString = uc($crFormatString); } # Set up strings to output for built-in XML entities -- named or numeric if ($xmlEntities eq "name") { $cr_amp = "&"; $cr_lt = "<"; $cr_gt = ">"; $cr_apos = "'"; $cr_quot = """; } else { $cr_amp = sprintf($crFormatString, 0x26); $cr_lt = sprintf($crFormatString, 0x3c); $cr_gt = sprintf($crFormatString, 0x3e); $cr_apos = sprintf($crFormatString, 0x27); $cr_quot = sprintf($crFormatString, 0x22); } my $newline = chr(10); if ($outLineends eq "M") { $newline = chr(13); } elsif ($outLineends eq "D") { $newline = chr(13).chr(10); } ($quiet) || ($catalog eq "") || doWarn( "Using catalog '$catalog'."); my $file = $ARGV[0] ? $ARGV[0]:""; (-f $file) || die "Can't find input file '$file'.\n"; # Try to interleave stdout/stderr as desired... #my $ofh = select(STDOUT); #$| = 1; #select($ofh); ############################################################################### # Declare parsing state information, clear it, and go. my $repressed = 0; # Are we in an element to delete all of? my @repressStack = (); my $attdclbuf = ""; # Saved-up attlist declaration my %idAttrList = (); # Record all gi/attrnames declared as type ID my %idrefAttrList = (); # Record all gi/attrnames declared as type IDREF(S) my %attrDefaults = (); # Record which attrs default to what # Synchronized stacks (easier than a stack of hashes; maybe) my @tagStack = (); # Maintained by start/end handlers my @idStack = (); # ID attr values for open elements my @childElementCounts = (); # (num of children so far, mostly for -xpath) my @childNodeCounts = (); # (num of children so far, mostly for -xpath) my @nsStack = (); # Runs in sync with @tagStack (just prefix lists) my @nsFullStack = (); # Runs in sync with @tagStack (hash of prefix=>uri) my @entityStack = (); # Only used with -entities option my $pastDTD = 0; # To suppress comments from DTD my $commentCount = 0; my $parserCatalog; my $docelStatus = "before"; # later "in" then "after" for Canonical rules. my $eventCount = 0; # For reporting progress my $elementCount = 0; # Per document ($voptions) && dumpOptions(); my $totalEventCount = 0; # grand total my $totalElementCount = 0; # grand total my $totalDocuments = 0; my $fh = STDOUT; foreach my $file (@ARGV) { $totalDocuments++; ($verbose) && warn "Starting on file '$file'.\n"; if ($outputExtension) { my $outfile = $file; $outfile =~ s/\.[^.]+$/\.$outputExtension/; open $fh, ">$outfile"; } clearParserState(); parseDocument($file,$fh); if ($outputExtension) { close $fh; } $totalEventCount += $eventCount; $totalElementCount += $elementCount; } if ($keepComments && $commentCount == 0 && !$quiet) { doWarn( "WARNING: No comments seen. Parser may have discarded them."); } # This is only reporting events/elements in last document... XXX ($quiet) || doWarn( "Done, total of $totalElementCount elements (" . "$totalEventCount events) processed," . " over $totalDocuments document(s)."); exit; ############################################################################### sub clearParserState { $repressed = 0; # Are we in an element to delete all of? @repressStack = (); $attdclbuf = ""; # Saved-up attlist declaration %idAttrList = (); # Record all ID values defined %idrefAttrList = (); # Record all ID values referenced %attrDefaults = (); # Record which attrs default to what @tagStack = (); @idStack = (); @childElementCounts = (); @childNodeCounts = (); @nsStack = (); @nsFullStack = (); @entityStack = (); # Only used with -entities option $pastDTD = 0; $commentCount = 0; $eventCount = 0; $elementCount = 0; $docelStatus = "before"; } ############################################################################### sub dumpOptions { warn "\n\nOPTION SETTINGS:\n"; warn "version [$version]\n"; warn "default_catalog [$default_catalog]\n"; warn "xinclude_ns [$xinclude_ns]\n"; # Options for white-space warn "\nOptions for white-space wrapping:\n"; warn "breakAtAllAttrs [$breakAtAllAttrs]\n"; warn "breakInsideAttrs [$breakInsideAttrs]\n"; warn "breakInsideTags [$breakInsideTags]\n"; warn "breakInsideText [$breakInsideText]\n"; warn "breakModels [$breakModels]\n"; warn "Pre-defined breaking rules:\n"; warn "break [$break]\n"; warn "breakOutside [$breakOutside]\n"; warn "breakUgly [$breakUgly]\n"; warn "Detailed breaking rules:\n"; warn " inline [$binline]\n"; warn " block [$bblock]\n"; warn " whitespacepres. [$bwsp]\n"; warn " section [$bsection]\n"; warn " other [$bother]\n"; warn " pi [$bpi]\n"; warn " comment [$bcomment]\n"; warn " multi-line-com. [$bmcomment]\n"; warn "mulberry DTD format [$mulberryDtd]\n"; warn "cleanTextNodeEnds [$cleanTextNodeEnds]\n"; warn "indentString [$indentString]\n"; warn "indent [$indent]\n"; warn "indentComments [$indentComments]\n"; warn "indentInsideText [$indentInsideText]\n"; warn "outLineends [$outLineends]\n"; warn "unbreakInsideText [$unbreakInsideText]\n"; warn "width [$width]\n"; warn "wsHandling [$wsHandling]\n"; # Options for entity management warn "\nOptions for entity management\n"; warn "Character references:\n"; warn " For amp [$cr_amp]\n"; warn " For lt [$cr_lt]\n"; warn " For gt [$cr_gt]\n"; warn " For apos [$cr_apos]\n"; warn " For quot [$cr_quot]\n"; warn " cr_all [$cr_all]\n"; warn " Minumum Digits [$crDigits]\n"; warn " Decimal or Hex [$crBase]\n"; warn " Case for hex 'x' [$crCase]\n"; warn "entities [$entities]\n"; warn "entity ref tracing [$etrace]\n"; warn "maxNonEntityChar [$maxNonEntityChar]\n"; warn "parseParamEnt [$parseParamEnt]\n"; warn "xmlEntities [$xmlEntities]\n"; # Options for things to keep/drop warn "\nOptions for things to keep/drop\n"; warn "Blank lines [$keepBlankLines]\n"; warn "Default Attributes [$keepDefaultAttrs]\n"; warn "The DTD [$keepDTD]\n"; warn "Comments [$keepComments]\n"; warn "The document [$keepDoc]\n"; warn "The doctype [$keepDoctype]\n"; warn "Ids [$keepIds]\n"; warn "XSLT-generated IDs [$keepGenIds]\n"; warn "genIdPattern [$genIdPattern]\n"; warn "NsAttrs [$keepNsAttrs]\n"; warn "Parameter entities [$keepPentities]\n"; warn "PIs [$keepPIs]\n"; warn "Redundant NS dcls [$keepRedundantNS]\n"; warn "IDREF(S) attributes [$keepRids]\n"; warn "The XML Declaration [$keepXmlDcl]\n"; warn "Suppress all (-nil) [$nil]\n"; # Options for things to normalize warn "\nOptions for things to normalize\n"; warn "alphabetizeModels [$alphabetizeModels]\n"; warn "barchar [$barchar]\n"; warn "canonical [$canonical] (forces many other options)\n"; warn "educate quotes [$educateQuotes (in non-preserve content)\n"; warn "emptyFormat [$emptyFormat]\n"; warn "idnames [$idnames]\n"; warn "lineends [$lineends]\n"; warn "max name width [$maxNameWidth]\n"; warn "nameForm [$nameForm]\n"; warn "normalizeRuns [$normalizeRuns]\n"; warn "normalizeText [$normalizeText] (not recommended)\n"; # normaliText leads to problems with text-nodes joining up. warn "normalizeAttrs [$normalizeAttrs]\n"; warn "normalizeBars [$normalizeBars]\n"; warn "xinclude [$xinclude] (not yet implemented)\n"; warn "xsl [$xsl]\n"; # Options for things to add warn "\nOptions for things to add\n"; warn "elementSpace [$elementSpace]\n"; warn "flagBlockEnds [$flagBlockEnds]\n"; warn "fqgi [$fqgi]\n"; warn "fqgiSep [$fqgiSep]\n"; warn "lnum [$lnum]\n"; warn "ourns [$ourns]\n"; warn "tagSpace [$tagSpace]\n"; warn "xpath [$xpath]\n"; # Other options warn "\nOther options\n"; warn "catalog [$catalog]\n"; warn "help [$help]\n"; warn "htmlparse [$htmlparse]\n"; warn "only output [id] [$only]\n"; warn "outputExtension [$outputExtension]\n"; warn "progress [$progress]\n"; warn "quiet [$quiet]\n"; warn "quoteType [$quoteType] (not yet implemented)\n"; warn "textForm [$textForm]\n"; warn "traceEvents [$traceEvents]\n"; warn "traceFQGI [$traceFQGI]\n"; warn "trace P Entities [$tracePentities]\n"; warn "traceWsEvents [$traceWsEvents]\n"; warn "tracePrint [$tracePrint]\n"; warn "verbose [$verbose]\n"; warn "voptions [$voptions]\n"; warn "waitfor [$waitfor]\n"; # Lists of XML constructs by name warn "\nLists of XML constructs by name\n"; warn "blockElements: " . join(", ", @blockElements) . "\n"; warn "emptyElements: " . join(", ", @emptyElements) . "\n"; warn "inlineElements: " . join(", ", @inlineElements) . "\n"; warn "preserveElements:" . join(", ", @preserveElements) . "\n"; warn "sectionElements: " . join(", ", @sectionElements) . "\n"; warn "\n"; warn "dropTags: " . join(", ", @dropTags) . "\n"; warn "dropElements: " . join(", ", @dropElements) . "\n"; warn "dropAttrNames: " . join(", ", @dropAttrNames) . "\n"; warn "dropAttrPats: " . join(", ", @dropAttrPats) . "\n"; warn "dropPIs: " . join(", ", @dropPIs) . "\n"; } # dumpOptions ############################################################################### sub parseDocument { use lib "sw/lib/perl"; use XML::Parser; # http://search.cpan.org/~ebohlman/XML-Catalog-0.02/Catalog.pm use XML::Catalog; my $parser; if ($parseParamEnt) { $parser = new XML::Parser(ErrorContext => 2, ParseParamEnt => 1); } else { $parser = new XML::Parser(ErrorContext => 2); } $parser->setHandlers( Start => \&startHandler, End => \&endHandler, Init => \&initHandler, Final => \&finalHandler, Char => \&charHandler, Proc => \&procHandler, Comment => \&commentHandler, Doctype => \&doctypeHandler, DoctypeFin => \&doctypeFinHandler, Default => \&defaultHandler); if ($entities) { $parser->setHandlers( ExternEnt => \&externEntHandler, ExternEntFin => \&externEntFinHandler); } else { if ($catalog ne "") { $parserCatalog=XML::Catalog->new($catalog); $parser->setHandlers( ExternEnt => $parserCatalog->get_handler($parser)); } } if ($keepDTD) { $parser->setHandlers( Element => \&elementDclHandler, Attlist => \&attlistDclHandler, Entity => \&entityDclHandler, XMLDecl => \&xmlDclHandler); } if ($htmlparse) { die "-html option is not yet finished.\n"; # See perldoc XML::LibXML::Parser. And don't do catalog or dtd handling # Maybe invoke known list of empties for output? # $parser->parse_html_file($_[0]); } else { $parser->parsefile($_[0]); } if ($break || $breakOutside) { print $fh $newline; } } # sub parseDocument ############################################################################### # # Regarding the XML::Parser module: # # API doc is at http://search.cpan.org/~msergeant/XML-Parser/Parser.pm # initHandler: Called at start of parsing sub initHandler { startEvent("init"); if (!$keepXmlDcl) { dumpLine("", "pi"); } } # finalHandler: Called at end of parsing sub finalHandler { startEvent("final"); dumpAccumulatedAttlistDcl(); (scalar @tagStack) && doWarn("Unclosed elements on stack: " . join("/",@tagStack)); } sub startHandler { my $parser = shift; my $gi = shift; my $depth = scalar @tagStack; my %attlist = (); my $myNS = (scalar(@nsStack)>0) ? $nsStack[-1]:""; startEvent("start", "'$gi'"); if ($nameForm ne "") { $gi = fixNameForm($gi); } if (scalar @childNodeCounts > 0) { $childElementCounts[-1]++; $childNodeCounts[-1]++; } dumpAccumulatedAttlistDcl(); # Signal that we're really in the document now (not the DTD) $pastDTD = 1; $docelStatus = "in"; if ($repressed or (defined $dropElementsHash{$gi})) { #doWarn( "Repressing $gi"); $repressed = 1; push @repressStack, $gi; } # Organize the attributes into a hash, dropping any we don't want, # and renaming id-ish attributes if asked. my $nsPrefixes = ""; my %nsDeclaredHere = (); while (scalar @_) { my $attr = shift; my $value = shift; # Do we need to save the default namespace, too? if ($attr =~ m/^xmlns:[^=]/) { # namespace attribute $nsPrefixes .= "$attr "; (my $prefix = $attr) =~ s/xmlns://; $nsDeclaredHere{$prefix} = $value; } if (defined $dropAttrNamesHash{$gi}) { next; } if (!$keepIds && defined $idAttrList{"$gi/$attr"}) { next; } if (!$keepRids && defined $idrefAttrList{"$gi/$attr"}) { next; } if ($keepGenIds && $value =~ m/$genIdPattern/o) { next; } if (!$keepDefaultAttrs) { if (defined $attrDefaults{"$gi/$attr"}) { ($verbose) && warn "gi $gi, attr $attr, value $value," . " vs. dft " . $attrDefaults{"$gi/$attr"} . ".\n"; if ($attrDefaults{"$gi/$attr"} eq $value) { next; } } } # Replace this with single /o 'or' pattern from $dropAttrPatsExpr #foreach my $p (keys %dropAttrPats) { # if ($attlist{$k} =~ m/$p/) { next; } #} if ($value =~ /$dropAttrPatsExpr/o) { next; } if ($idnames) { if ($idAttrList{"$gi/$attr"}) { $attr = "id"; } elsif ($idrefAttrList{"$gi/$attr"}) { $attr = "rid"; } } if ($nameForm ne "") { $attr = fixNameForm($attr); } $attlist{$attr} = $value; } # while # Interpet XInclude if needed if ($xinclude && $myNS eq $xinclude_ns) { doXInclude(); } # Add any requested specialty attributes if ($xpath ne "N") { $attlist{"$ourns:xpath"} = makeXPath(\@tagStack,\@childElementCounts,$gi, $xpath); } if ($fqgi) { $attlist{"$ourns:fqgi"} = join($fqgiSep,@tagStack) . "$fqgiSep$gi"; } if ($lnum) { $attlist{"$ourns:lnum"} = $parser->current_line; } if ($xmllang eq "E" && !defined $attlist{"xml:lang"}) { # Make explicit if (defined $langStack[-1]) { $attlist{"xml:lang"} = $langStack[-1]; # warn "Inheriting xml:lang\n"; } else { $attlist{"xml:lang"} = "EN"; # warn "Forcing xml:lang\n"; } } elsif ($xmllang eq "I" && defined $attlist{"xml:lang"} && $attlist{"xml:lang"} eq $langStack[-1]) { # Make implicit delete $attlist{"xml:lang"}; } # else leave as in source # Generate alphabetized/normalized attribute list. Do this even if # dropping the tag or element, so we track ns attributes right. # Canonical XML (Clause 2.2) says: # namespace nodes are sorted lexicographically by local name # attribute nodes are sorted lexicographically with namespace URI # as the primary key and local name as the secondary key my $ats = ""; foreach my $k (sort keys %attlist) { # Skip this one if it's a redundant namespace attr if ($k =~ m/^xmlns:[^=]/ && !$keepRedundantNS && index(" SnsPrefixes ", " $k ") >= 0) { next; } my $cur = $attlist{$k}; (!defined $cur) && warn "Attlist entry for '$k' not defined\n"; # Escape attr per clause 2.3 of http://www.w3.org/TR/xml-c14n $cur =~ s/\&/$cr_amp/g; $cur =~ s/\"; #if ($lnum && $thisline!=$curline) { # $startTag = "" . $startTag; #} dumpLine($startTag, "start", $gi) unless($repressed); } push(@tagStack,$gi); push(@idStack, $attlist{"id"}); # for now; should go by type, not name push(@childElementCounts,0); push(@childNodeCounts,0); push(@nsStack,$nsPrefixes); push(@nsFullStack, \%nsDeclaredHere); $elementCount++; } # startHandler sub endHandler { my $gi = $_[1]; startEvent("end", "'$gi'"); if ($nameForm ne "") { $gi = fixNameForm($gi); } dumpAccumulatedAttlistDcl(); my $old = $tagStack[-1]; if ($old ne $gi && scalar(@archforms)>0) { doWarn( "Got end for '$gi' when expecting '$old'."); } else { pop(@tagStack); pop(@idStack); pop(@childElementCounts); pop(@childNodeCounts); pop(@nsStack); pop(@nsFullStack); } if (!defined $dropTagsHash{$gi}) { for ($f=0; $f", "end", $gi); if ($flagBlockEnds && $displayClass{$gi} eq "BLOCK") { dumpline("\n", "comment"); } } } # BUG? Only pop if we're ending the thing at stack-top? if ($repressed) { pop @repressStack; if (scalar @repressStack <= 0) { $repressed = 0; } } if (scalar @tagStack <= 0) { $docelStatus = "after"; } } # endHandler # May need to drop first newline from nodes in dtd? sub charHandler { if (scalar @childNodeCounts > 0) { $childNodeCounts[-1]++; } my $text = $_[1]; startEvent("char", "'$text'"); dumpAccumulatedAttlistDcl(); if ($repressed) { return; } if ($text !~ m/\S/) { # white-space-only text node ($verbose>1) && warn "charHandler: white-space-only node.\n"; if ($wsHandling eq "D") { return; } # drop if ($wsHandling eq "P") { # turn into PIs $text =~ s/\r/\&\#13;/g; $text =~ s/\n/\&\#10;/g; $text =~ s/\t/\&\#9;/g; $text =~ s/\'/\&\#39;/g; $text =~ s/\?>/\&\#63;>/g; dumpLine("", "text") unless($repressed); return; } if ($wsHandling eq "M") { # drop unless multi-line if (!($text =~ m/\n.*\n/)) { return; } $text =~ s/\n//; # just drop one } if ($wsHandling ne "K") { doWarn("Bad white space handling type '$wsHandling'."); } } # white-space-only if ($textForm eq "N") { return; } $text =~ s/&/$cr_amp/g; $text =~ s//$cr_gt/g; $text =~ s/\r/&\#xD;/g; } if ($cr_all) { $text =~ s/\'/$cr_apos/g; $text =~ s/\"/$cr_quot/g; } if ($unbreakInsideText) { ($verbose>1) && warn "charHandler: unbreaking.\n"; $text =~ s/\012/ /g; } if ($normalizeText) { ($verbose>1) && warn "charHandler: normalizing space (excessive).\n"; $text =~ s/\s+/ /g; $text =~ s/^\s+//; $text =~ s/\s+$//; } if ($normalizeRuns) { ($verbose>1) && warn "charHandler: normalizing space runs.\n"; $text =~ s/\s+/ /g; } if ($lineends) { ($verbose>1) && warn "charHandler: normalizing lineends.\n"; $text =~ s/\015\012/\012/g; # normalize line-ends $text =~ s/\015/\012/g; } if ($keepBlankLines) { ($verbose>1) && warn "charHandler: degeminating tabs.\n"; $text =~ s/\012\012/\012/g; } my $fixed = ""; if ($textForm eq "G") { $fixed = greekText($text); } elsif ($textForm eq "R") { $fixed = rot13Text($text); } else { $fixed = fixEntities($text); } # ($verbose > 1) && warn "Dumping text: '$fixed'\n"; dumpLine("$fixed", "text") unless($repressed); } # charHandler sub procHandler { my ($p, $name, $data) = @_; startEvent("proc", "'$name'"); dumpAccumulatedAttlistDcl(); ($keepPIs) || return; #$indentSpaces = curIndent(); if ($repressed or (defined $dropPIsHash{$name})) { #doWarn( "Repressing PI $name"); } elsif ($pastDTD && !$repressed) { if ($canonical && $docelStatus eq "after") { print "\n"; } dumpLine("", "pi"); if ($canonical && $docelStatus eq "before") { print "\n"; } } } # procHandler # Discard DTD comments unless asked for DTD. sub commentHandler { my $text = $_[1]; dumpAccumulatedAttlistDcl(); $commentCount++; # XXX Perhaps count DTD comments separately? return unless ($keepComments && !$repressed); return unless ($pastDTD || $keepDTD); startEvent("comment", "'$text'"); my $theIndent = curIndent(1); if ($normalizeBars && ($text =~ m/^\s*[-_*~=.]+\s*$/)) { # bar line my $len = $width - 9; $len -= length(curIndent()); $text = ($barchar x $len); } elsif ($normalizeBars && ($text =~ m/^\s*$/)) { # blank line my $len = $width - 9; $len -= length(curIndent()); $text = (' ' x $len); } elsif ($indentComments) { $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\n\s+/\n$theIndent/g; # Indent to match "", "comment"); if ($canonical && $docelStatus eq "before") { print "\n"; } } # commentHandler sub doctypeHandler { my ($p, $name, $sysid, $pubid, $internal) = @_; startEvent("doctype", "'$name'"); dumpAccumulatedAttlistDcl(); ($keepDoctype) || return; if ($nameForm ne "") { $name = fixNameForm($name); } if ($pubid) { dumpLine("", "dcl"); } elsif ($sysid) { dumpLine("", "dcl"); } else { dumpLine("", "dcl"); } } # doctypeHandler sub doctypeFinHandler { startEvent("doctypeFin", ("*" x 30)); dumpAccumulatedAttlistDcl(); $pastDTD = 1; if ($verbose) { warn "doctypeFin: ID attributes declared: " . scalar(keys %idAttrList) . ": " . join(", ", keys %idAttrList) . ".\n"; warn "doctypeFin: IDREF(S) attributes declared: " . scalar(keys %idrefAttrList) . ": " . join(", ", keys %idrefAttrList) . ".\n"; } ($keepDoc) || exit; } # doctypeFinHandler sub cdataStartHandler { startEvent("cdata"); # ignored } sub cdataEndHandler { startEvent("cdataEnd"); # ignored } # The external entity handlers are only enabled with the -entities option. sub externEntHandler { (my $p, my $base, my $sysid, my $pubid) = @_; startEvent("externEnt", "'$sysid'"); my $entityInfo = "name='???' base='$base' public='$pubid' system='$sysid' file='???'"; push@entityStack, $entityInfo; ($etrace || $entities) && dumpLine("", "pi"); # ($parserCatalog) || warn "\n*** parserCatalog is nil***\n"; return($parserCatalog->get_handler($p)); # not right } sub externEntFinHandler { startEvent("externEntFin"); my $entityInfo = pop @entityStack; ($etrace || $entities) && dumpLine("", "pi"); } ############################################################################### # DTD stuff sub entityDclHandler { (my $p, my $name, my $value, my $sysid, my $pubid, my $ndata, my $isParam) = @_; my $pflag = ($isParam) ? "%":""; if (!defined $pubid) { $pubid = ""; } if (!defined $sysid) { $sysid = ""; } # Integrate $tracePentities into startEvent() XXX startEvent("entityDcl", "$pflag$name"); dumpAccumulatedAttlistDcl(); ($keepDTD) || return; if ($isParam) { if (!$keepPentities) { return; } } if (!$keepInternalEntDcls && $value ne "" && $sysid eq "" && $pubid eq "" && $ndata eq "" && !$isParam) { return; # (typically, a character entity dcl) } my $buf = ""; dumpLine($buf,"dcl"); } # entityDclHandler sub elementDclHandler { (my $p, my $name, my $model) = @_; startEvent("elementDcl", "'$name'"); dumpAccumulatedAttlistDcl(); ($keepDTD) || return; if ($alphabetizeModels) { # only does if it's all one big model if ($verbose) { print "alphabetizing ||$model||\n"; (my $z = $model) =~ s/\w//g; } (my $wmodel = $model) =~ s/([?+*])\s*$//; my $rep = $1; $wmodel =~ s/^\(\s*//; $wmodel =~ s/\)\s*$//; if ($wmodel =~ m/^([\w\#-_.:]+\|)+[\w\#-_.:]+$/) { # a|b|c... my @items = split(/\|/,$wmodel); $wmodel = join("|",sort @items); if (substr($wmodel,0,1) eq "|") { $wmodel = substr($wmodel,1); } $wmodel = "(" . $wmodel . ")" . $rep; ($verbose) && print "***** OR model, " . (scalar @items) . " tokens, rep '$rep'. Sorted: $wmodel\n"; $model = $wmodel; } } # ab if ($breakModels) { $model = breakModel($model,"MODEL"); } $model =~ s/\s*$//; if ($breakModels) { $model = breakModel($model, "XSL"); # experimental } dumpLine(sprintf("",$name,$model), "dcl"); } # sub elementDclHandler # Multiple attributes are returned as separate events, so we have to # accumulate them to print a correct single ATTLIST. sub attlistDclHandler { (my $p, my $ename, my $aname, my $avalue, my $adft, my $afix) = @_; startEvent("attlistDcl", "'$ename\@$aname'"); dumpAccumulatedAttlistDcl(); # Track which are real ID/IDREF(S), so we know which to normalize/del. if ($avalue eq "ID") { $idAttrList{"$ename/$aname"} = 1; } if ($avalue eq "IDREF" || $avalue eq "IDREFS") { $idrefAttrList{"$ename/$aname"} = 1; } if (!$keepDefaultAttrs) { # Memorize who they are if ($adft ne "") { $attrDefaults{"$ename/$aname"} = $adft; } } ($keepDTD) || return; my $abuf; if ($afix) { $abuf = sprintf("$nameField $nameField #FIXED %s\n",$aname,$avalue,$adft); } else { $abuf = sprintf("$nameField $nameField %s\n",$aname,$avalue,$adft); } if ($attdclbuf eq "") { $attdclbuf = ""; dumpLine($buf, "pi"); } # sub xmlDclHandler sub defaultHandler { my $text = (defined $_[1]) ? $_[1]:""; # If not -dtd, markup dcls come here -- don't count? if ($pastDTD || $keepDTD) { startEvent("default", "'$text'"); } dumpAccumulatedAttlistDcl(); } ############################################################################### # startEvent: Called at the beginning of every event. # Params: # 0: event type (name of handler subroutine minue 'Handler') # 1: anything to print following trace message # XXX Perhaps move dumpAccumulatedAttlist call in here? # sub startEvent { my $etype = $_[0]; my $msg = (defined $_[1]) ? $_[1]:""; my $leader = "***SAX***"; $eventCount++; if ($progress > 0 && ($eventCount % $progress) == 0) { warn "$leader events handled: $eventCount\n"; } # Perhaps report FQGI instead of just element on start and end? my $context = ($traceFQGI) ? join($fqgiSep,@tagStack) : ""; # Print an event tracing message? if ($traceEvents) { if ($etype ne "char" || $traceWsEvents || $msg =~ m/[^\s]/) { $msg =~ s/\n/\\n/g; if ($traceFQGI) { warn "$leader (FQGI $context) " . uc($etype) . " $msg\n"; } else { warn "$leader " . uc($etype) . " $msg\n"; } } } elsif ($tracePentities && $etype eq "entityDcl" && $msg =~ m/^%/) { $msg =~ s/\n/\\n/g; warn "$leader " . uc($etype) . " $msg\n"; } # If verbose mode, otherwise just once at the very end, check integrity if ($verbose || $etype eq "final") { checkStacks(); } } # startEvent ############################################################################### # Check integrity of open element, their child-counts, and their namespaces. sub checkStacks { if (scalar @tagStack == scalar @childElementCounts && scalar @tagStack == scalar @idStack && scalar @tagStack == scalar @childNodeCounts && scalar @tagStack == scalar @nsStack && scalar @tagStack == scalar @nsFullStack) { return; } warn "ERROR: Stacks are out of sync:\n"; my $i = 0; while ($i1) && warn "\n******* breakStartTag (ballattrs $breakAtAllAttrs)\n"; # First, normalize space. (my $s1 = $s) =~ s/\s+/ /g; my $s1len = length($s1); # Parse off the element type and following space $s =~ s/\s*<([-_.:\w]+)\s+//; my $gi = $1; my $buf = "<$gi"; # no space since attrs each add one my $thisAttr = ""; while ($s =~ m/[^\s>]/) { # per attribute if ($s =~ m/^([-_.:\w]+)=\'/) { # single-quoted $s =~ s/^([-_.:\w]+=\'[^\']*\')\s*//; $thisAttr = $1; } elsif ($s =~ m/^([-_.:\w]+)=\"/) { # double-quoted $s =~ s/^([-_.:\w]+=\"[^\"]*\")\s*//; $thisAttr = $1; } else { # shouldn't happen doWarn( "breakStartTag: Found strange data in start-tag: '$s'."); doWarn( " Original was: $_[0]"); $rc .= "$buf$newline$ind$s"; $buf = ""; last; } # For now, this requires that breakAttrs entail breakAtAllAttrs. if ($breakAtAllAttrs || (length($buf) + length($thisAttr) + 1 > $width-$ind)) { if ($verbose>2) { warn "\n******* overflow thisAttr '$thisAttr'\n"; warn "******* dumping buf '$buf'\n"; } $rc .= $buf; if ($indentAttrsPastGI) { $buf = "$newline$ind " . (" " x length($gi)); } else { $buf = "$newline$ind"; } if ($breakInsideAttrs && length($thisAttr) > $width-$ind) { ($verbose>2) && warn "\n******* breaking thisAttr '$thisAttr'\n"; if ($xsl) { $thisAttr = breakXslExpr($thisAttr); } else { $thisAttr = breakModel($thisAttr, ($gi=~m/^xsl:/)?"XSL":"ATTR"); } } } $buf .= " $thisAttr"; } # while $rc = "$rc$buf>"; # Check that we didn't toast any data (maybe test one level down?) my $rclen = length($rc); if ($rclen < $s1len || $verbose>1) { ($rclen < $s1len) && warn "ERROR: breakStartTag lost data:\n"; warn "\nOriginal ($s1len): '$s1'\n\nResult($rclen): '$rc'\n"; } return($rc); } # breakStartTag # Break up a parenthesized string such as a content model or XSL attr. # Probably should really build a parse tree and go bottom-up. # Should also alphabetize OR-groups? # Make an option to keep preceding token sticky, as in f(x). # This won't work with quoted parens. sub breakModel { my $m = $_[0]; # Content model, attr, or other to break my $whoseRules = $_[1]; # What is it, anyway? my $buf = ""; # Results my $baseIndent = ""; # Initial indent my $modelIndentString = " "; # Incremental indent my $lvl = 0; # Current nesting level # Break it up at (), pretend whitespace doesn't matter my @parts = (); if ($whoseRules eq "MODEL") { # (...)* $m =~ s/\s//g; @parts = split(/([()][*+?]?)/, $m); $baseIndent = "" x 29; } elsif ($whoseRules eq "XSL") { # foo(...) bar[...] # Lose some whitespace? @parts = split(/\w+\(|\)/, $m); $baseIndent = ""; } elsif ($whoseRules eq "ATTR") { # ??? @parts = split(/\s+/, $m); $baseIndent = curIndent(1); } else { die "breakModel: Unknown whoseRules '$whoseRules'.\n"; } for (my $i=0; $i $width) { my $part = substr($parts[$i],0,$width); $part =~ s/[|,\s&][^|,\s&]*$//; $innerbuf .= "$newline$baseIndent$ind$part"; $parts[$i] = substr($parts[$i],length($part)); } if ($innerbuf ne "") { $buf .= "$innerbuf$newline"; } $buf .= "$baseIndent$ind$parts[$i]"; } } } # for $buf =~ s/^\s*//; ($lvl!=0) && ($verbose) && warn "Unbalanced () in: '$m'.\n"; return $buf; } # end breakModel BEGIN { my $rc = ""; # Results my $buf = ""; # Line being built my $baseIndent = 0; # Initial indent my $incrIndent = 4; # Incremental indent amount my $lvl = 0; # Current nesting level my @margins = (); # Distance over for each lvl. # Need to pass in the indent to the attribute... sub breakXslExpr() { my $m = $_[0]; # Content model, attr, or other to break $buf = $rc = ""; $baseIndent = ($_[1] > 0) ? $_[1]:curIndent(2); $modelIndentString = " "; $lvl = 0; # Break it up at: and, or, |, &, my @parts = (); @parts = split(/( and | or |, |[()\[\]])/, $m); ($verbose) && warn "Tokenizes attr '$m' to:\n " . join("\n ",@parts) . "\n"; for (my $i=0; $i $width) { breakHere(); } $buf .= "$parts[$i]"; } } # for #$buf =~ s/^\s*//; ($lvl!=0) && ($verbose) && warn "Unbalanced () in: '$m'.\n"; return ("$rc$buf"); } sub breakHere() { $rc .= "$buf$newline"; $buf = "$baseIndent" . ($modelIndentString x $lvl); } } # END ############################################################################### # This buffer is filled in by the attribute declaration handler, and dumped # when we get any other event. That's because we get a separate event for each # attribute declared, but have to issue just one combined declaration. sub dumpAccumulatedAttlistDcl { if ($keepDTD && ($attdclbuf ne "")) { dumpLine("$attdclbuf>","dcl"); } $attdclbuf = ""; } BEGIN { # Variables needed for greeking text, so we don't re-assign many times. my $ucase = "ABCDEFGHIJKLMNOPQRSTUVWXYZAASSTTEEEEIIOOU"; # Include an occasional extra space so word lengths change. my $lcase = "abcdefghijklmnopqrstuvwxyz aasstteeeeiioou"; my $digit = "1234567890120"; my $finalPunct = "..,,,?!\'"; my $midWordPunct = "-\'"; my $ulen = length($ucase); my $llen = length($lcase); my $dlen = length($digit); my $flen = length($finalPunct); my $mlen = length($midWordPunct); sub greekText { my $n = length($_[0]); my $rc = ""; for (my $i=0; $i<$n; $i++) { my $c = substr($_[0],$i,1); if ($c =~ /[[:upper:]]/) { $rc .= substr($ucase, int(rand($ulen)),1); } elsif ($c =~ /[[:lower:]]/) { $rc .= substr($lcase, int(rand($llen)),1); } elsif ($c =~ /[[:digit:]]/) { if (rand(1.0) > 0.1) { # delete a few $rc .= substr($digit, int(rand($dlen)),1); } } elsif ($c =~ /\s/) { $rc .= " "; } elsif (substr("$_[0] ",$i+1,1) =~ /\s/) { $rc .= substr($finalPunct, int(rand($flen)),1); } else { # assume it's mid-word punctuation $rc .= substr($midWordPunct, int(rand($mlen)),1); } } ($verbose>1) && doWarn( "Greeked [$_[0]] to [$rc]."); return($rc); } # greekText } sub rot13Text { my $n = length($_[0]); my $ucase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" x 2; my $lcase = "abcdefghijklmnopqrstuvwxyz" x 2; my $rc = ""; for (my $i=0; $i<$n; $i++) { my $c = substr($_[0],$i,1); if ($c =~ /[A-Z]/) { $rc .= substr($ucase, index($ucase,$c)+13,1); } elsif ($c =~ /[a-z]/) { $rc .= substr($lcase, index($lcase,$c)+13,1); } else { $rc .= $c; } } return($rc); } # rot13Text # Turn non-ASCII characters (any > $maxNonEntityChar) into entities. # amp, lt, whitespace, etc. are handled elsewhere. Watch performance! sub fixEntities { if ($_[0] =~ /^\p{IsASCII}*$/) { # Got only ASCII? ($verbose > 2) && warn "fixEntities-pl($_[0]).\n"; return($_[0]); } else { my $rc = ""; my $tlen = length($_[0]); for (my $i=0; $i<$tlen; $i++) { my $o = ord(substr($_[0],$i,1)); if ($o>$maxNonEntityChar) { $rc .= sprintf($crFormatString,$o); } else { $rc .= substr($_[0],$i,1); } } #for # Normalize was done in charhandler ($verbose > 2) && warn "fixEntities-rc($_[0]).\n"; return($rc); } } # fixEntities # Force an element or attribute name, etc., to desired char set. # Does not currently get rid of ASCII punctuation. sub fixNameForm { my $rc; if ($nameForm eq "L") { $rc = lc($_[0]); } elsif ($nameForm eq "U") { $rc = uc($_[0]); } elsif ($nameForm eq "A") { ($rc = $_[0]) =~ s/\P{IsASCII}/_/g; } elsif ($nameForm eq "C") { ($rc = $_[0]) =~ s/[^-.:a-zA-Z0-9]/_/g; } return($rc); } ############################################################################### # dumpLine: Write out one tag/text/whatever event, line-breaking correctly. # Handle when to insert newlines. # Params: 0: text of line; 1: kind of event; 2: gi if element. # Notes: char handler may not even pass us white-space-only nodes.... # BEGIN { # Track kinds of events happening, for better control of whitespace placement. # *** Expand 'dcl' event to qw/ dcl pi comment mcomment / my $lastEvent = "dcl"; # One of qw/ dcl start text end / my $lastElementClass = ""; # qw/ inline block section preserve / my $thisEvent; my $pendingLine = ""; # Hold start-tags til we see if element is empty my $l; my $gi; my $buf; my $reIndent; # With -breaks: nl before every line # With -breaksOutside: nl before anything except end-tag. # Params: # 0: text of line to dump # 1: 'text', 'start', 'end', 'dcl' (helps decide where to break lines) # 2: element type when applicable, so we can look up for inlines # sub dumpLine { $l = $_[0]; $thisEvent = $_[1]; $gi = $_[2]; $reIndent = ($indent) ? curIndent():""; my $buf = ""; if ($pendingLine) { # Do we have a start-tag waiting? $buf = handlePendingLine(); } # $pendingLine # We should not be called at all if $repressed is true. if ($thisEvent eq "text") { $buf = handleTextLine($l); # ($verbose > 1) && warn "back from handle: '$buf'\n"; } elsif ($thisEvent eq "start") { $buf = handleStartTagLine($l,$gi); } elsif ($thisEvent eq "end") { $buf = handleEndTagLine($l,$gi); } elsif ($thisEvent =~ m/^(dcl|pi|m?comment)$/) { $buf = handleDclLine($l); } else { doWarn( "WARNING: Invalid dumpLine event: '$thisEvent'."); } # ($verbose > 1) && warn "dump to reallypr: '$buf'\n"; ($nil) || reallyPrint($buf, "dumpLine"); $lastEvent = $thisEvent; } # dumpLine # What's pending, should be a start-tag. sub handlePendingLine() { if ($verbose && !breakAtAllAttrs && !$breakInsideAttrs && $pendingLine =~ m/\n.*\n/) { doWarn( "Got more than 1 newline with pending start-tag.\n"); } if ($thisEvent ne "end") { # With all but "end", just issue it reallyPrint($pendingLine, "handlePendingLine-non-end"); $pendingLine = ""; } # Is $pendingLine hasn't been cleared, then we're now at an end-tag # immediately after a start-tag, so this must be an empty element. else { # Modify the start-tag in $pendingLine as appropriate if ($emptyFormat eq "F") { # full start/end pair $pendingLine =~ s/[\r\n ]+$//; } elsif ($emptyFormat eq "E") { $pendingLine =~ s|>\s*$|/>|; } elsif ($emptyFormat eq "H") { $pendingLine =~ s|>\s*$| />|; } elsif ($emptyFormat eq "S") { # Don't change the start-tag at all } else { doWarn("Unknown empty element format '$emptyFormat'."); exit; } reallyPrint($pendingLine, "handlePendingLine-empty"); $pendingLine = ""; $lastEvent = $thisEvent; if ($emptyFormat eq "E" || $emptyFormat eq "H" || ($emptyFormat eq "S" && $displayClass{$gi} eq "EMPTY") ) { # Suppress current line (end-tag) #doWarn ("Dropping end-tag '$l'\n"); return; } } return(""); } # handlePendingLine sub handleTextLine() { my $l = $_[0]; my $buf = ""; ($verbose>2) && warn "handleTextLine got '$l' (breakInsideText $breakInsideText," . " cleanTextNodeEnds $cleanTextNodeEnds)\n"; if ($l =~ m/[^\s]/ && ($breakInsideText || $cleanTextNodeEnds)) { $l =~ s/^\s\s*//; $l =~ s/\s\s*$//; } if ($breakInsideText) { if ($l =~ /^\s*$/) { $buf = ""; } else { my $ind = ($indentInsideText) ? curIndent():""; my @tokens = split(/\s+/, $l); #print scalar @tokens . " tokens: " . join("|",@tokens) . "\n"; $buf .= "\n"; my $lbuf = $ind; for my $t (@tokens) { if (length($lbuf) + length($t) > $width) { $buf .= "$lbuf\n"; $lbuf = "$ind $t"; } else { $lbuf .= " $t"; } } $buf .= $lbuf; } } elsif ($unbreakInsideText) { ($buf = $l) =~ s/[\n\r]+/ /g; } else { $buf = $l; } if ($educateQuotes) { $buf =~ s/(\s)\"/$1$ldquo/g; $buf =~ s/\"(\s)/$rdquo/g; $buf =~ s/(\s)\'/$1$lsquo/g; $buf =~ s/\'(\s)/$rsquo/g; } ($verbose>2) && warn "handleTextLine ret '$buf'\n"; return($buf); } # handleTextLine sub handleStartTagLine() { my $l = $_[0]; if ($breakInsideTags) { $l = breakStartTag($l); } if (defined $displayClass{$_[1]} && $displayClass{$_[1]} eq "INLINE") { $buf = $l; } elsif (defined $displayClass{$_[1]} && $displayClass{$_[1]} eq "BLOCK" && $lastEvent !~ m/^(dcl|pi|m?comment)$/) { $buf = "$newline$newline$reIndent$l"; } elsif ($breakUgly) { ($verbose) && doWarn("***"); my $reIndent = curIndent(); ($buf = $l) =~ s|>$|$newline$reIndent>|; } elsif ($break || $breakOutside) { if ($lastEvent ne "end") { $buf = "$newline$reIndent$l"; } else { $buf = "$newline$reIndent$l"; } } else { $buf = $l; } #warn "Deferring a start-tag for '$_[1]'.\n"; $pendingLine = $buf; $buf = ""; return($buf); } # handleStartTagLine sub handleEndTagLine() { my $l = $_[0]; my $dclass = (defined $displayClass{$_[1]}) ? $displayClass{$_[1]} : ""; if ($dclass eq "INLINE" || $lastEvent eq "start" || $lastEvent eq "text") { $buf = $l; } elsif ($dclass eq "BLOCK") { $buf = "$newline$reIndent$l$newline$newline"; } elsif ($dclass eq "SECTION") { $buf = "$newline$reIndent$l$newline$newline$newline"; } elsif ($break) { $buf = "$newline$reIndent$l"; } elsif ($breakOutside) { $buf = "$l$newline$reIndent"; } elsif ($breakUgly) { my $reIndent = curIndent(); ($buf = $l) =~ s|>$|$newline$reIndent>|; } else { $buf = $l; } return($buf); } # handleEndTagLine sub handleDclLine() { my $l = $_[0]; if ($break || $breakOutside) { $buf = (lastEvent =~ m/^(dcl|pi|m?comment)$/) ? "":$newline; $buf .= "$reIndent$l$newline"; } elsif ($breakUgly) { my $reIndent = curIndent(); ($buf = $l) =~ s|(--n)?>$|$newline$reIndent$1>|; } else { $buf = $l; } return($buf); } # handleDclLine } # the BEGIN # Run all printing through here, so we can implement -waitfor and -traceprint. sub reallyPrint { if ($waitfor && index($_[0],$waitfor) >= 0) { $waitfor = ""; } if ($waitfor) { return; } my $p = $_[0]; if ($tracePrint) { my $l = $p; #$l =~ s/\n/\\n/g; #$l =~ s/\r/\\r/g; #$l =~ s/\t/\\t/g; #$l = "[$_[1]]===>$l<===\n"; warn "reallyPrint: '$l'\n"; } if ($p =~ /\P{ASCII}/) { for (my $i=0; $i 0) { $offset = $_[0]; } return($indentString x ($offset + scalar @tagStack)); } ############################################################################### sub doWarn { use IO::Handle; my $ofh = select(STDOUT); $ofh->flush(); warn "$_[0]\n"; } ############################################################################### sub showArchforms { warn" The -archform [name] option provides some small support for HyTime architectural form processing, and xlink processing which is quite similar. In short, using this option amounts to declaring that the [name] attribute specifies the element type for the element (generally in a namespace other than the one applicable to the present explicit element type). With this option, you can cause such values to replace the explicit element type. For example, if your DTD contains: and your document contains: then specifying the option '-archform xsl-form' will transform that element to: Options that delete particular tags or element types operate on the original element type, not on any replacement element type specified via this option. The -archform option is repeatable. If more than one of the named attributes is present on an element instance, the one whose -archform option is first applies."; } sub showBreakRules { warn " There are several pre-defined patterns of line-breaking available: The default rule is to insert no breaks. -b breaks before start-tags and after end-tags, except for any elements known to be inlines (see -inline and -setdtd). There will also be breaks before and after comments, but not PIs. -bugly is special: it forces line-breaks before the '>' of start and end tags. This puts the newlines in markup instead of content, so they will never produce white-space-only text-nodes that could get in the way. There are also special options for breaking within start-tags, within individual attributes, within text content, and within comment text. If you want finer control, breaks can be set for several classes of events: Elements known as inline, block, whitespace-preserving (wsp), and section (elements can be categorized individually, or by -setdtd [dtdname]). Elements not in any of those 4 categories ('other'). PIs, comments (single-line), and multi-line-comments. These event classes are referred to by first letter: [ibwsopcm]. For each class, you can choose whether to have a newline in each of 4 places: Before the start, after the start, before the end, and after the end. These places are marked (respectively) with 1, 2, 3, and 4 below: 1

2Hello, world.3

4 (same for inline, block, wsp, and section) 14 14 (same for multi-line comments) The options are named simply 'b' plus the name of the event class. To determine the option value, put together four digits: how many newlines for each place. For example, to have both tags for section-class elements ('s') on their own separate lines and to double-space before the start-tag, use the option: -bsection 2111 In some cases, you may want to have a break in place number 1 only if the last (preceding) event was a certain class. To express such conditions, you can replace the first integer with a bracketed list of the preceding event classes with which you *do* want a newline. Only in the case of the categories for *elements* could the preceding event be either a start or and end (tag). To distinguish these, use the class letter (from [ibwso]) to indicate the start-tag, and '/' plus the start letter to indicate the end-tag. For example: -bb [s/s/b]000 means: set the breaking rule for block elements so there is a break before the start-tag if the preceding event was: a start-tag for a section element (s), an end-tag for a section element (/s), or an end-tag for a block element (/b); and so there are no breaks in the other 3 places. All the pre-defined breaking patterns (and many more) can be built this way (except -bugly). For example, -b is the same as: -bi 0000 -bb 1001 -bp 1001 -bs 1001 -bo 1001 -bp 0000 -bc 1001 -bm 1001 \n"; } sub showUsage { warn " Usage: normalizeXML [options] file Write out a normalized form of an XML file. For Canonical XML say -can. However, also lets you chose a wide variety of layouts, and uniformly add/delete various items. Details: Empty elements become start/end pairs, or per -empty option Whitespace in markup is normalized Whitespace-only text nodes are dropped Attributes are alphabetized, always quoted with '\"', and have internal '\"' replaced by &quo; Defaulted attributes become explicit CDATA marked sections go away in favor of just escaping Characters > 255 become 5-digit '#x' entities (does not do Unicode normzn) External parsed entities become inline data. DOCTYPE declaration is deleted. CRLF, CR, and LF are normalized to just LF (d10), or per -outlineends. -sd Set my preferrred layout. Merely shorthand for: -i -ic -it -btags -noientities Whitespace options (for more details, use '-help-breaks'): NOTE: Negate a flag option by prefixing 'no', e.g. -nob to negate -b. -b Break lines around each tag or PI, and at EOF. (see -outlineends, below, to control line-end type). -ballattrs Break before every attributes (implies -btags). -battrs Break up long attributes (implies -btags) (experimental). -btags Break and indent within long start-tags (implies -i). -btext Break and indent text nodes (implies -i). See also -it. WARNING: Parser may separate text nodes. (may get break/indent around character refs, \\n). -bmodels Break/indent content models (implies -dtd). (unfinished). -boutside Break lines 'outside': before start-tags, after ends. -bugly Break in 'ugly' but safe places: inside end of tags. -i Indent the output. -iattrspastgi With -btags, line up attribute names past element name. -ic Indent lines of multi-line comments (implies -i). -istring 's' String to repeat for each indent level (default ' '). ('| ' makes it easy to see element scopes). -it Indent text broken by -btext (implies -btext, -i, -b). -lineends Normalize incoming lineends to Unix-style. -mulberryDtd Lay out DTD in as for mulberrytech.com (unfinished). -outlineends x Use M(ac), D(OS), or U(nix: default) style line ends. -width n Set width for breaking lines (default $dft_width, min 10) (\$COLUMNS has real terminal width, if exported). -ws type How to handle white-space-only text nodes (default = D) K(eep), D(rop), P(turn into PIs), M (keep multi-line) Entity options: -crbase 10|16 Write numeric character references in decimal or hex. -crcase u|l Write hex numeric char refs in u(pper) or l(ower) case -crdigits n Minimum number of digits to write for character refs. -crxml form Output amp, lt, etc. as 'name' (amp, lt) or 'number'. -entities Preserve external entities by adding start/end PIs (for now, entities are not expanded, you just get PIs). -etrace Report when entities are referenced. Things normally kept, that you can drop: -nil Suppress everything (except tracing and errors). -noattr 'name' Delete XML attribute 'name' (repeatable). -noattrpat 'pat' Delete XML attributes matching 'pat' (experimental). -noblanklines Drop blank lines. -nocomments Delete XML comments. -nodefaultattrs Delete attributes that can be defaulted (experimental). -nodoc Delete the document (mainly useful with -dtd). -nodoctype Delete XML DOCTYPE declaration. -noelement 'name' Delete XML tags and content for 'name' (repeatable). -noendtags Delete all end-tags. -nogenids Delete XSLT generate-id() values. (Xalan/Saxon: =\"[TNF]0x[0-9a-f]*[N\.]0x[0-9a-f]*\"). -noids Delete XML attributes named 'id' (see also -norid). -noientities Delete internal entities when displaying the DTD. -nonsattrs Delete XML namespace attributes. -nopi 'name' Delete XML processing instrs of 'name' (repeatable). -nopis Delete processing instructions. -norids Delete XML attributes named 'rid' (by name, not type). -notag 'name' Delete XML start and end tags for 'name' (repeatable). -noxmldcl Delete XML declaration -text 'form' Print text as F(ull), N(one), G(reeked), or R(ot13). Things normally dropped, that you can keep: -dtd Include the DTD (with parameter entities expanded!) (for now, excludes entity declarations). -pentities Include parameter entity declarations (implies -dtd). (not the same as -noparam!). Things to normalize: -amodels Alphabetize names in OR models (implies -bmodels) -can Canonical: -noxmldcl -nodoctype -ws D -lineends -crdigits 1 -cdbase 16 -crcase U, and also invokes special escaping and whitespace handling to conform to Canonical XML (www.w3.org/TR/xml-c14n). -educateQuotes Make straight into curly quotes in non-preserve content. -idnames Normalize names of ID-related attributes. -maxnamewidth n Size column to allow for XML names in DTD (if there are names over this width, they are *not* truncated). -normalizetext Normalize whitespace in text nodes (not recommended). -normalizeruns Normalize *runs* of whitespace in text nodes (safer). -normalizeattrs Normalize whitespace inside attribute values (this could change the meaning, for example in XSLT). -normalizebars Normalize length of comments containing only '='. -xsl Format XSL code nicely. Things to add: -elementSpace Insert a space for -noelement elements (see also -tagspace). -flagBlockEnds Add an 'end' comment at the end of block elements. -fqgi Add nx:fqgi attribute with list of ancestor element types (you can use this and then 'grep' in the result...). -fqgisep char Use between -fqgi element names (default '$dft_fqgisep'). -lnum Add nx:lnum giving the source line-number of each element. -ourns [prefix] Namespace prefix for -xpath, -fqgi, -lnum (default 'nx'). -tagspace Insert space in place of -notag tags (see -elementspace). -xpath [form] Add attribute nx:xpath with an XPath to the element. Form = None, Short, Long, or Cseq. Element layout class options: -block 'gi...' Element types to put extra space before (repeatable). -emptyElements 'gi...' Element types to treat as EMPTY when they are empty (only important with '-emptyFormat S' and no DTD). -inline 'gi...' Element types to treat as inline (repeatable). -preserve 'gi...' Element types to treat as space-preserving (repeatable). Use this for code-blocks, examples, etc. -section 'gi...' Element types to treat as big blocks (repeatable). These get some extra whitespace above/below. -setdtd name Set block, empty, inline, preserve, and section as for the named DTD (repeatable). Known DTDs are: tei, docbook, html, osis. Other options: -archform name Output the value of the 'name' attribute as the element type for elements it appears on (see -help-archform) -attrquotetype 'c' Specify single or double quote to go around attributes. -c name Use XML catalog 'name' (default = $default_catalog). -curlyQuotes opt A: Turn plain quotes to curly in content. O: leave as in source (default). N: never have curly quotes. -emptyFormat opt Write empty elements as Full: tart and end (default), Empty, HTML: empty with space like
, or SGML: start-tag only (see -emptyElements). -html Use libxml's HTML parsing option. -maxchar n Maximum character value to *not* make into an entity. Warning: Latin-1 differs from UTF-8, so if this is >127, display on XTerms, Browsers, etc. may be wrong. -names form Force element/attribute names to Upper, Lower, Ascii, or Clean (clean means s/[^-.:a-zA-Z0-9]/_/g). Does not affect DTD or ID values yet. -noparam Don't parse parameter entities or DTD. -only [id] Only output the element with id 'id' (not yet). -out ext Save each file's result in [filename].ext (vs. stdout) -progress n Report progress every n parser events (counting element starts/ends, text chunks, declarations if -dtd,... -q Suppress most messages. -traceevents Report every parser event (except whitespace ones). -tracefqgi Report element context (implies -traceevents). -tracepentities Report parameter entity declaration events, and normalizes space in internal values (implies -dtd). -tracewsevents Report parser whitespace events. -v Add more messages, and check integrity frequently. -voptions Show all option settings before running. -version Display version info and exit ($version, sjd). -waitfor string Don't output anything until the string occurs. -xmllang [opt] Explicit: Add xml:lang attribute on all elements. Implicit: Remove xml:lang where it can inherit. Original: Leave as in source. Known Bugs/Limitations (please report any other bugs to sderose\@acm.org): There appears to be a bug with -empty not actually suppressing the end-tag. Use -noparam if you're getting undefined-entity errors. Redundant-namespace declaration removal is not thoroughly tested. -bmodels indents at (), but not within a single ()-group. ID/IDREF dropping by declared value (instead of name) is not finished. IDs are only output for -flagblockends if they are *named* 'id'. Ordering of namespaces/attributes may not be perfect for Canonical XML. Requires DOCTYPE in order to parse a DTD file. Escaping quotes and ampersands in entity replacement text not fully tested. Related commands: postdiff - Runs any command (like this one) on two files, then diffs them. nameEntities - Changes numeric XML character references to named. splitat - Just break file at tags; faster but less flexible. hilite - Can colorize our output with its '-xml' option. "; }