#!/usr/bin/perl -w # # xmlstats: Parse some XML and gather lots of statistics. # # API doc at http://search.cpan.org/~msergeant/XML-Parser/Parser.pm # Also see http://search.cpan.org/~msergeant/XML-Parser-2.34/Expat/Expat.pm # # History: # 2006 sometime: Written by Steven J. DeRose, sderose@acm.org, based on old C. # 2008-03-23 sjd: Rewrite, clean up; fix langStack. # # Todo: # Count #fixed attr occurrences, redundant #implied/ns/lang? # Fix max children per element # Identify all id and idref attributes declared # Track longest attribute *list*. # Handle multiple documents # Fanout/Fanin stats (max children by element type # Count namespace nodes, elements per namespace # use strict; use Getopt::Long; my $version = "2008-03-23"; my $default_catalog = ""; # Option values my $all = 0; my $catalog = $default_catalog; my $help = 0; my $quiet = 0; my $verbose = 0; my $vv = 0; my $orphans = 0; my $chars = 0; my $charPairs = 0; my $comma = 1; my $dtd = my $stats = my $trackIds = 0; my $dot = "."; # Char to show indents in verbose mode my $ids = 0; my $minFqgiFreq = 1; # Min frequency to show up in report my $maxFqgiFreq = 0; # Min frequency to report (0=unlimited) my $nameWidth = 45; # Space to allow in reports by element type my $rank = 0; # List objects by frequency of occurrence my $reportParents = 0; my $reportFqgis = 0; # Report fqgis at all? my $trackMilestones = 0; my $trojans = 0; my $usage = 0; # Report occurrences of various things # Process options Getopt::Long::Configure ("ignore_case"); my $result = GetOptions( "c=s" => \$catalog, "chars!" => \$chars, "charpairs!" => \$charPairs, "comma!" => \$comma, "dot=s" => \$dot, "h|help|?" => \$help, "ids!" => \$trackIds, "dtd!" => \$dtd, "fqgis!" => \$reportFqgis, "maxfqgi=i" => \$maxFqgiFreq, "minfqgi=i" => \$minFqgiFreq, "orphans!" => \$orphans, "parents!" => \$reportParents, "q|quiet!" => \$quiet, "rank!" => \$rank, "stats|all!" => \$all, "trojans!" => \$trackMilestones, "usage!" => \$usage, "v|verbose+" => \$verbose, "vv+" => \$vv, "version" => sub { die "Version of $version, by Steven J. DeRose, sderose\@acm.org.\n"; } ); if ($help) { showUsage(); exit; } ($result) || die "Bad options.\n"; # Validate and default options if ($reportFqgis) { $usage = 1; } if ($reportParents) { $usage = 1; } if ($vv > 0) { $verbose = $dtd = $usage = 1; } if ($all) { $usage = $dtd = $orphans = $ids = $trojans = $chars = $reportParents = $reportFqgis = 1; $quiet = 0; } ($trackMilestones) && print "Trojan milestone tracking is not fully implemented.\n"; ($catalog eq "") || (-f $catalog) || die "Cannot find catalog file '$catalog'.\n"; ($maxFqgiFreq >= 0) || die "-maxfqgi value invalid ($maxFqgiFreq).\n"; ($minFqgiFreq > 0) || die "-minfqgi value too small ($minFqgiFreq).\n"; my $file = $ARGV[0]; if ($file eq "") { showUsage(); exit; } (-e $file) || die "File '$file' does not exist.\n"; ($ARGV[1]) && warn "Extra arguments -- ignored.\n"; ($quiet) || warn "Using catalog '$catalog'.\n"; my $cStart = my $cEnd = ""; if ($ENV{USE_COLOR} ne "") { $cStart = `colorstring bold`; $cEnd = `colorstring default`; } ################################################################################ my @tagStack = (); # Current stack of element types my @langStack = (); # Stack of xml:lang values my $lastEvent = ""; my $pastDTD = 0; # DTD stats my ($docel, $docsys, $docpub); # doctype dcl info my $totalElementTypes; my $totalDeclaredContent; my $totalDeclaredContentEMPTY; my $totalDeclaredContentCDATA; my $totalDeclaredContentRCDATA; my $totalAttrsDeclared; my %totalAttrTypesByDclValue; my %totalAttrTypesByDefault; # Event statistics my $totalStart, my $totalEnd; # Tag counts my $totalEmpty; my $totalWithAttr; # Element with any attributes at all my $totalModelTokens; # Total tokens in all content models my $totalPcdataModels; my $totalAttr; # All attribute instances my $totalId; # And a few special attributes my $totalRid; my $totalXmlLang; # Instance totals my $totalChar; # How many PCDATA text nodes my $totalCharLength; # How much PCDATA length my $totalWSN; # Number of white-space-only nodes my $totalWSNlength; my $totalPi; # Number of Processing instructions my $totalPiLength; # my $totalComment; # Count comments my $totalCommentInDoc; my $totalCommentInDTD; my $totalCommentLength; my $totalCommentLengthInDTD; my $totalCommentLengthInDoc; my $totalMarkupLength; # (normalized; may not exactly match source) my $totalContentLength; my $totalCdataStart; # my $totalCdataEnd; # my $totalCdataNodesAdjacent; # How often are text nodes not coalesced? my $totalCharsInCdata; # my $totalEERef; # External entities referenced my $totalDefault; # # Counts events/totals by name/type my %piCounts; # Occurrences of PIs, by name my %elementCounts; # Occurrences of element types, by name my %pairCounts; # Tag+parent occurrences my %fqgiCounts; # Tag context occurrences my %attrCounts; # Counted only by attr name, not per element my %elemattrCounts; # Counted per element my %langCounts; # Use of xml:lang attribute my %lineBreaks; # As Mac/PC/*nix my @characterCounts; # By individual character code my %characterPairCounts; # By sequences of character codes my $previousChar = " "; my %totalCdataByElementType = (); # How much CDATA, by gi my %totalCdataByLang = (); # How much CDATA, by xml:lang my %totalTextNodesByElementType = (); # How many text nodes (incl whitespace) my %totalWSNByElementType = (); # How many whitespace-only, by gi # Trojan milestone statistics my $nTrojanTags; my $nTrojanErrors; my %trojanStarts; my %trojanUsed; # Declaration statistics my %internalGeneralEntities; my %externalGeneralEntities; my %internalUnparsedEntities; my %externalUnparsedEntities; my %internalParameterEntities; my %externalParameterEntities; my %notations; # Schema information my %models; # Context information my $inCdata; # Are we in a CDATA section? my $maxLevel; # Deepest nesting level seen my $maxFqgi; # Deepest string of element types my $maxFqgiLoc; my %lastLine; # Line when we last saw each element type start. # ID/validation information (only does *names* id and rid for now). my %ids; my %rids; # occurrences of id/idref values. # Track the maximum values for some things, where they first happened, # and save that example for the report. my $maxNumAttrs; my $maxNumAttrsExample; my $maxNumAttrsLoc; my $maxAttrLength; my $maxAttr; my $maxAttrLoc; my $maxModelLength; my $maxModelTokens; my $maxModelDepth; my $maxModelDepthExample; my $maxModelDepthLoc; my $maxTextNodeLength; my $maxTextNodeLoc; my $maxTextNode; my $maxPiLength; my $maxPiLoc; my $maxPiName; my $maxPi; my $maxCommentLength; my $maxCommentLoc; my $maxComment; my $maxChildrenPerElement; my $maxChildrenPerElementType; my $maxChildrenPerElementLoc; my @nChildren = (); my $gaveXslWarning = 0; resetStats(); my $startTime; my $endTime; ################################################################################ # Set up the parser and callbacks use XML::Parser; use XML::Catalog; #my $parser = new XML::Parser(ErrorContext => 2); my $parser = new XML::Parser(ErrorContext => 0); if ($catalog ne "") { my $catalogObject=XML::Catalog->new($catalog); $parser->setHandlers(ExternEnt => $catalogObject->get_handler($parser)); } $parser->setHandlers( Init => \&initHandler, Final => \&finalHandler, Start => \&startTagHandler, End => \&endTagHandler, Char => \&charHandler, Proc => \&piHandler, Comment => \&commentHandler, CdataStart => \&cdataStartHandler, CdataEnd => \&cdataEndHandler, Doctype => \&doctypeHandler, DoctypeFin => \&doctypeFinHandler, Default => \&defaultHandler, Entity => \&entityDclHandler, Element => \&elementDclHandler ); # Process the XML (can only do one document per parser instance) $parser->parsefile($file); # $parser->parse("

Hello

"); # Issue reports ($quiet) || reportDtdStats(); ($quiet) || reportStats(); ($trackIds) && reportIds(); ($trackMilestones) && reportTrojans(); if ($usage) { if ($rank) { reportUsageByRanks(); } else { reportUsageByNames(); } } reportCharUsage(); ($dtd) && reportDtd(); ($quiet) || print "\n*** Done (xmlstats version of $version, by Steven J. DeRose) ***\n\n"; exit; ############################################################################## sub resetStats { @tagStack = @langStack = (); $lastEvent = ""; $docel = $docsys = $docpub = ""; $totalElementTypes = 0; $totalDeclaredContent = 0; $totalDeclaredContentEMPTY = 0; $totalDeclaredContentCDATA = 0; $totalDeclaredContentRCDATA = 0; $totalAttrsDeclared = 0; %totalAttrTypesByDclValue = ("ID" => 0, "IDREFS" => 0, "RID" => 0, "CDATA" => 0, "NMTOKEN" => 0, "NMTOKENS" => 0); %totalAttrTypesByDefault = ("#IMPLIED" => 0, "#REQUIRED" => 0); $pastDTD = 0; # Event statistics $totalStart =0; # Tag counts $totalEnd = 0; $totalEmpty = 0; $totalWithAttr = 0; # Element with any attributes at all $totalModelTokens = 0; # Total tokens in all content models $totalPcdataModels = 0; $totalAttr = 0; # All attribute instances $totalId = 0; # And a few special attributes $totalRid = 0; $totalXmlLang = 0; # Instance totals $totalChar = 0; # How many PCDATA text nodes $totalCharLength = 0; # How much PCDATA length $totalWSN = 0; # Number of white-space-only nodes $totalWSNlength = 0; $totalPi = 0; # Number of Processing instructions $totalPiLength = 0; # $totalComment = 0; # $totalCommentInDoc = 0; $totalCommentInDTD = 0; $totalCommentLength = 0; $totalCommentLengthInDTD = 0; $totalCommentLengthInDoc = 0; $totalCdataStart = 0; # $totalCdataEnd = 0; # $totalCdataNodesAdjacent = 0; $totalCharsInCdata = 0; # $totalEERef = 0; # External entities referenced $totalDefault = 0; # # Counts of events by name/type %piCounts = (); # Occurrences of PIs, by name %elementCounts = (); # Occurrences of element types, by name %pairCounts = (); # Tag+parent occurrences %fqgiCounts = (); # Tag context occurrences %attrCounts = (); # Counted only by attr name, not per element %elemattrCounts = (); # Counted only by attr name, not per element %langCounts = (); # Use of xml:lang attribute %lineBreaks = (); # As Mac/PC/*nix @characterCounts = (); %totalCdataByElementType = ();# How much CDATA anywhere under each gi %totalCdataByLang = (); %totalTextNodesByElementType = (); %totalWSNByElementType = (); # Trojan milestone statistics $nTrojanTags = 0; $nTrojanErrors = 0; %trojanStarts = (); %trojanUsed = (); # Declaration statistics %internalGeneralEntities = (); %externalGeneralEntities = (); %internalUnparsedEntities = (); %externalUnparsedEntities = (); %internalParameterEntities = (); %externalParameterEntities = (); %notations = (); # Schema information %models = (); # Context information $inCdata = 0; # Are we in a CDATA section? $maxLevel = 0; # Deepest nesting level seen $maxFqgi = ""; # Deepest string of element types $maxFqgiLoc = "-1"; %lastLine = (); # Line when we last saw each element type start. # ID/validation information (only does *names* id and rid for now). %ids = (); %rids = (); # occurrences of id/idref values. # Track the maximum values for some things, and where they first happened $maxNumAttrs = 0; $maxNumAttrsExample = ""; $maxNumAttrsLoc = "-1"; $maxAttrLength = 0; $maxAttr = ""; $maxAttrLoc = "-1"; $maxModelLength = 0; $maxModelTokens = 0; $maxModelDepth = 0; $maxModelDepthExample = ""; $maxModelDepthLoc = -1; $maxTextNodeLength = 0; $maxTextNodeLoc = "-1"; $maxTextNode = ""; $maxPiLength = 0; $maxPiLoc = "-1"; $maxPiName = ""; $maxPi = ""; $maxCommentLength = 0; $maxCommentLoc = "-1"; $maxComment = ""; $maxChildrenPerElement = 0; $maxChildrenPerElementType = ""; $maxChildrenPerElementLoc = -1; @nChildren = (); $gaveXslWarning = 0; } # resetStats ############################################################################## sub initHandler { ($vv) && verbose($_[0],"Init",$_[1]); $startTime = time(); $lastEvent = "INIT"; } sub finalHandler { ($vv) && verbose($_[0],"Final",$_[1]); $endTime = time(); $lastEvent = "FINAL"; } sub startTagHandler { ($verbose) && verbose($_[0],"Start-tag",$_[1]); $totalStart++; my $parentDepth = (scalar @tagStack) - 1; if ($parentDepth > 0) { $nChildren[$parentDepth]++; } push(@tagStack,$_[1]); $nChildren[$parentDepth+1] = 0; my $e = "?ent?"; # $_[0]->current_entity; my $l = $_[0]->current_line; $lastLine{$_[1]} = "$e: $l"; $elementCounts{$_[1]}++; my $fqgi = join("/",@tagStack); $fqgiCounts{$fqgi}++; (my $pair = $fqgi) =~ s|^.*/([^/]*/[^/]*)$|$1|; $pairCounts{$pair}++; # print "gi: $_[1], fqgi: $fqgi, pair: $pair\n"; if ($_[0]->depth > $maxLevel) { $maxLevel = $_[0]->depth; $maxFqgi = $fqgi; $maxFqgiLoc = $_[0]->current_line; } # See if there are any attributes if ($_[2]) { $totalWithAttr++; } # Keep track of number of attrs, and a few specific attributes. my $attlist = ""; my $trojanType = ""; my $trojanKey = ""; my $newLang = ""; my $numAttrs = 0; for (my $i=2; $i $maxAttrLength) { $maxAttrLength = $alen; $maxAttr = $cvalue; $maxAttrLoc = $_[0]->current_line; } if ($cname eq "id") { $totalId++; if (exists $ids{$cvalue}) { if (!$gaveXslWarning) { print "Duplicate ID value '$cvalue' at line " . $_[0]->current_line . " (prior occurrence was at line $ids{$cvalue}).\n"; if ($cvalue =~ m/\{.*\}/) { print "(this looks like an xsl file, not reporting again)\n"; $gaveXslWarning = 1; } } } else { $ids{$cvalue} = $_[0]->current_line; } } elsif ($cname eq "rid") { $totalRid++; $rids{$cvalue}++; } elsif ($cname eq "xml:lang") { $totalXmlLang++; $newLang = $cvalue; $langCounts{$newLang}++; } elsif ($cname eq "sID") { # start Trojan $trojanType = "s"; $trojanKey = $cvalue; if ($trojanStarts{$trojanKey} or $trojanUsed{$trojanKey}) { $nTrojanErrors++; ($trackMilestones) && print "Invalid: Trojan milestone key '$trojanKey' reused.\n"; } else { $trojanStarts{$trojanKey} = 1; } } # sID elsif ($cname eq "eID") { # end Trojan $trojanType = "e"; $trojanKey = $cvalue; if ($trojanUsed{$trojanKey}) { ($trackMilestones) && print "Invalid: Trojan milestone key '$trojanKey' reused.\n"; $nTrojanErrors++; } elsif ($trojanStarts{$trojanKey}) { delete $trojanStarts{$trojanKey}; $trojanUsed{$trojanKey} = 1; } else { ($trackMilestones) && print "Invalid: Trojan milestone key '$trojanKey' appears" . " on eID before sID.\n"; $nTrojanErrors++; } } # eID } # for each attribute $totalAttr += $numAttrs; if ($numAttrs > $maxNumAttrs) { $maxNumAttrs = $numAttrs; $maxNumAttrsExample = "<$_[1]$attlist>"; $maxNumAttrsLoc = $_[0]->current_line; } $totalMarkupLength += length(join("",@_)."<>") + $numAttrs*4; # Keep track of milestone markup if ($trojanType) { $nTrojanTags++; if ($trojanType eq "e" and $numAttrs > 1) { ($trackMilestones) && print "Invalid: Trojan milestone with eID='$trojanKey' has other attributes.\n"; $nTrojanErrors++; } } if ($newLang eq "") { $newLang = $langStack[scalar(@langStack)-1]; # inherit } if (!defined($newLang) || $newLang eq "") { $newLang = "???"; } push @langStack, $newLang; $lastEvent = "STARTTAG"; } # sub startTagHandler sub endTagHandler { ($verbose) && verbose($_[0],"End-tag",$_[1]); if ($nChildren[scalar @tagStack - 1] > $maxChildrenPerElement) { $maxChildrenPerElement = $nChildren[scalar @tagStack - 1]; $maxChildrenPerElementType = $tagStack[scalar @tagStack - 1]; $maxChildrenPerElementLoc = $_[0]->current_line; } $totalEnd++; my $expectedTag = pop(@tagStack); if ($expectedTag ne $_[1]) { my $l = $_[0]->current_line; print "Well-formedness error: Found end of '$_[1]' at line $l when " . " expecting '$expectedTag'.\n."; print "Last '$expectedTag' was started at $lastLine{$expectedTag}.\n"; } $totalMarkupLength += length(""); pop @langStack; if ($lastEvent eq "STARTTAG") { $totalEmpty++; } $lastEvent = "ENDTAG"; } # endTagHandler sub charHandler { ($verbose) && verbose($_[0],"Text node",$_[1]); my $textNodeLength = length($_[1]); $totalChar++; $totalCharLength += $textNodeLength; if ($inCdata) { $totalCharsInCdata += $textNodeLength; } if ($textNodeLength > $maxTextNodeLength) { $maxTextNodeLength = $textNodeLength; $maxTextNode = $_[1]; $maxTextNodeLoc = $_[0]->current_line; } my $c = $_[1]; if ($c =~ m/^\s*$/) { $totalWSN++; $totalWSNlength += length($c); } # Count line-breaks in there, by types (my $tbuf = $_[1]) =~ s/\r\n//g; my $tbuflen = length($tbuf); $lineBreaks{"PC"} += (($textNodeLength - $tbuflen) / 2.0); $tbuf =~ s/\r//g; $lineBreaks{"Mac"} += $tbuflen - length($tbuf); $tbuf =~ s/\n//g; $lineBreaks{"*nix"} += $tbuflen - length($tbuf); # Count occurrences of characters if ($chars) { for (my $i=0; $i"); (my $piKey = $_[1]); $piCounts{$piKey}++; if (length($_[2]) > $maxPiLength) { $maxPiLength = length($_[2]); $maxPiName = $_[1]; $maxPi = $_[2]; $maxPiLoc = $_[0]->current_line; } $totalMarkupLength += length(""); $lastEvent = "PI"; } sub commentHandler { ($verbose) && verbose($_[0],"Comment",$_[1]); $totalComment++; $totalCommentLength += length(""); if ($pastDTD) { $totalCommentInDoc++; $totalCommentLengthInDoc += length($_[1]); } else { $totalCommentInDTD++; $totalCommentLengthInDTD += length($_[1]); } if (length($_[1]) > $maxCommentLength) { $maxCommentLength = length($_[1]); $maxComment = $_[1]; $maxCommentLoc = $_[0]->current_line; } $totalMarkupLength += length(""); $lastEvent = "COMMENT"; } sub cdataStartHandler { ($verbose) && verbose($_[0],"CDATA start",$_[1]); $totalCdataStart++; $inCdata = 1; $totalMarkupLength += length(""); $lastEvent = "CDATAEND"; } sub doctypeHandler { (my $p, $docel, $docsys, $docpub) = @_; } sub doctypeFinHandler { $pastDTD = 1; $lastEvent = "DOCTYPEFIN"; } # This won't be called if there's a catalog in use.... sub externEntHandler { ($verbose) && verbose($_[0],"External Entity Ref",$_[1]); $totalEERef++; $lastEvent = "EXTERNENT"; } sub externEntEndHandler { ($verbose) && verbose($_[0],"External Entity End",$_[1]); $lastEvent = "EXTERNENTEND"; } sub defaultHandler { ($vv) && scalar($_)>1 && verbose($_[0],"Default",$_[1]); $totalDefault++; my ($p, $data) = @_; if ($data =~ /^"); } msg1( "External entity refs (unless catalog in use): ", $totalEERef); } # sub reportStats ################################################################################ sub reportDtdStats { head( " DTD STATISTICS "); if (!$dtd) { msg1( " (use -dtd option to get this information)"); return; } head( "Element types declared: ", $totalElementTypes); msgn( "Element types with #PCDATA: ", $totalPcdataModels); msgn( "Maximum content model length: ", $maxModelLength); msgn( "Maximum content model tokens: ", $maxModelTokens); if ($totalElementTypes != 0) { msg1( "Average content model tokens: " . ($maxModelTokens / $totalElementTypes)); } msgn( "Maximum content model () depth: ", $maxModelDepth); msg1( " A model of that depth (at line $maxModelDepthLoc):" . " $maxModelDepthExample"); head("Attributes declared:", $totalAttrsDeclared); msg1( " For each declared value:"); foreach my $x (sort keys %totalAttrTypesByDclValue) { msgn( " $x", $totalAttrTypesByDclValue{$x}); } msg1( " For each default value:"); foreach my $x (sort keys %totalAttrTypesByDefault) { msgn( " $x", $totalAttrTypesByDefault{$x}); } msg1( "Comments in DTD ", $totalCommentInDTD); } # sub reportDtdStats ################################################################################ sub reportUsageByNames { head("Specific element type usage:"); head( "Element occurrences by type name:"); # Elements foreach my $e (sort keys %elementCounts) { msgn( " $e", $elementCounts{$e}); } if ($reportParents) { head( "Element occurrences by parent:"); # Parents foreach my $p (sort keys %pairCounts) { my $freq = $pairCounts{$p}; msgn( " $p", $freq); } } if ($reportFqgis) { head( "Element context occurrences by context:"); # FQGIs foreach my $e (sort keys %fqgiCounts) { my $freq = $fqgiCounts{$e}; if (($freq>$minFqgiFreq) && ($freq<$maxFqgiFreq || $maxFqgiFreq==0)) { print lpadc($freq) . " $e\n"; } } } if (scalar keys %attrCounts <= 0) { # Attributes head( "No attribute occurrences."); } else { head( "Attribute occurrences by attribute name (not per element):"); foreach my $a (sort keys %attrCounts) { msgn( " $a", $attrCounts{$a}); } head( "Attribute occurrences by element/attribute name:"); foreach my $a (sort keys %elemattrCounts) { msgn( " $a", $elemattrCounts{$a}); } } if (scalar keys %langCounts > 0) { # xml:lang head( "xml:lang attribute occurrences by language value:"); foreach my $l (sort keys %langCounts) { msgn( " $l", $langCounts{$l}); } } else { head( "No xml:lang attribute occurrences."); } if (scalar keys %piCounts > 0) { # PIs head( "PI occurrences by name:"); foreach my $p (sort keys %piCounts) { msgn( " $p", $piCounts{$p}); } } else { head( "No PI occurrences."); } } # sub reportUsageByNames # Comparison routine so we can sort hash keys by value, not name. sub elementsByValue { $elementCounts{$a} <=> $elementCounts{$b}; } sub parentsByValue { $pairCounts{$a} <=> $pairCounts{$b}; } sub fqgisByValue { $fqgiCounts{$a} <=> $fqgiCounts{$b}; } sub attributesByValue { $attrCounts{$a} <=> $attrCounts{$b}; } sub langByValue { $langCounts{$a} <=> $langCounts{$b}; } sub pisByValue { $piCounts{$a} <=> $piCounts{$b}; } sub reportUsageByRanks { head("Specific element type usage:"); head( "Element occurrences by frequency:"); # Elements foreach my $e (sort elementsByValue keys %elementCounts) { msgn( " $e", $elementCounts{$e}); } if ($reportParents) { head( "Element occurrences by parent:"); # Parents foreach my $p (sort parentsByValue keys %pairCounts) { my $freq = $pairCounts{$p}; msgn( " $p", $freq); } } if ($reportFqgis) { head( "Element context occurrences by context:"); # FQGIs foreach my $e (sort fqgisByValue keys %fqgiCounts) { my $freq = $fqgiCounts{$e}; if (($freq>$minFqgiFreq) && ($freq<$maxFqgiFreq || $maxFqgiFreq==0)) { print lpad($freq) . " $e\n"; } } } if (scalar keys %attrCounts > 0) { # Attributes head( "Attribute occurrences by frequency:"); foreach my $a (sort attributesByValue keys %attrCounts) { msgn( " $a", $attrCounts{$a}); } } else { head( "No attribute occurrences."); } if (scalar keys %langCounts > 0) { # xml:lang head( "xml:lang attribute occurrences by frequency:"); foreach my $l (sort langByValue keys %langCounts) { msgn( " $l", $langCounts{$l}); } } else { head( "No xml:lang attribute occurrences."); } if (scalar keys %piCounts > 0) { # PIs head( "PI occurrences by frequency:"); foreach my $p (sort pisByValue keys %piCounts) { msgn( " $p", $piCounts{$p}); } } else { head( "No PI occurrences."); } } # sub reportUsageByRanks ############################################################################### sub reportCharUsage { if ($chars) { head( "\nCharacter occurrences in content:"); my $ascii=0, my $latin1=0; my $rest=0; for (my $i=1; $i=32 && $i<256) ? chr($i):" "); print " ", lpad($characterCounts[$i]) . "\n"; } } # for msg1( " Ascii: $ascii, Latin-1: $latin1, other: $rest."); } head( "Total (direct) cdata content by element type:"); foreach my $l (keys %totalCdataByElementType) { msgn( " $l", $totalCdataByElementType{$l}); } head( "Total (direct) cdata content by language:"); foreach my $l (keys %totalCdataByLang) { msgn( " $l", $totalCdataByLang{$l}); } head( "Total (direct) text nodes by element type:"); foreach my $l (keys %totalTextNodesByElementType) { msgn( " $l", $totalTextNodesByElementType{$l}); } head( "Total (direct) whitespace-only text nodes by element type:"); foreach my $l (keys %totalWSNByElementType) { msgn( " $l", $totalWSNByElementType{$l}); } head( "Total (direct) non-whitespace-only text nodes by element type:"); foreach my $l (keys %totalTextNodesByElementType) { my $wsn = (defined $totalWSNByElementType{$l}) ? $totalWSNByElementType{$l}:0; msgn( " $l", $totalTextNodesByElementType{$l} - $wsn); } if ($charPairs) { head( "Character pair frequencies:"); for my $pair (sort keys %characterPairCounts) { printf "%06x %06x $06d '%s'\n", ord(substr($pair,0,1)), ord(substr($pair,1,1)), $characterPairCounts{$pair}, $pair; } } } # reportCharUsage ############################################################################### sub reportIds { my $numids = scalar keys %ids; head( "Total distinct IDs: ", $numids); my $numrids = scalar keys %rids; msgn( "Total IDREFs: ", $numrids); # Check for missing IDs my $ilist = ""; foreach my $r (keys %rids) { if (!exists $ids{$r}) { $ilist .= "$r "; } } ($ilist) && msg1( "ERROR: IDREFs exist for these missing IDs:\n$ilist"); # Check for orphaned IDs (no IDREFs to them) my $rlist = ""; foreach my $i (sort keys %ids) { if (!exists $rids{$i}) { $rlist .= "$i \t"; } } ($orphans) && ($rlist) && msg2br( "Orphan IDs (never referenced) are:", $rlist); } # sub reportIds ############################################################################### sub reportTrojans { head( "Trojan Milestone Information:"); my $nTrojanErrors = 0; if (scalar keys %trojanStarts) { my $unclosed = join(", ", keys %trojanStarts); msg1( "Invalid: Trojan milestones started but never ended: ", $unclosed); $nTrojanErrors += scalar keys %trojanStarts; } msgn( " Total Trojan milestone tags: ", $nTrojanTags); msgn( " Total Trojan milestone errors: ", $nTrojanErrors); } # reportTrojans ############################################################################### sub reportDtd { head("DTD Information:"); my $nelements = (scalar keys %models); msg1( "Element types declared: ", $nelements); if ($nelements) { # Calculate (type) fanout my @withNchildren = (); my $totalChildTypes = 0; my @modelKeys = keys %models; foreach my $x (@modelKeys) { my $model = $models{$x}; $model =~ s/[?*+(),|&]/ /g; my @tokens = split(" *",$model); my $n = scalar(@tokens); if ($tokens[0] eq "") { $n--; } $withNchildren[$n]++; $totalChildTypes += $n; } msg1( " # child types","# element types"); for (my $j=0; $j 0) { $s .= (" " x $needed); } return($s); } sub lpad { my $s = (defined $_[0]) ? $_[0]:""; my $len = (defined $_[1]) ? $_[1]:0; if ($len <= 2) { $len = 10; } my $needed = $len - length($s); if ($needed > 0) { $s = (" " x $needed) . $s; } return($s); } sub lpadc { # pad, and also insert commas every three digits. my $s = (defined $_[0]) ? $_[0]:""; my $len = (defined $_[1]) ? $_[1]:0; my $buf = ""; if ($comma) { while (length($s) > 3) { $buf = "," . substr($s,length($s)-3) . $buf; $s = substr($s,0,length($s)-3); } } $buf = "$s$buf"; if ($len <= 2) { $len = 10; } my $needed = $len - length($buf); if ($needed > 0) { $buf = (" " x $needed) . $buf; } return($buf); } ################################################################################ sub showUsage { print " Usage: xmlstats [options] [file.xml] Parses an XML file and reports lots of statistics. Options to request specific statistics: -stats Show all available statistical information. -chars Count occurrences of individual content characters -charpairs Count occurrences of sequences of two characters, and calculate entropy. -dtd Show additional DTD information. -fqgis Report element contexts with frequencies. -ids Track ID uniqueness, orphans, etc. (for duplicate IDs, their line numbers will be given). -maxfqgi n Report only element contexts with n occurrences. -orphans Report IDs that are never referenced by IDREFs. -parents Report how often each element type occurs with each parent type. -rank Sort elements/attributes by frequency, not name. -trojans Track and report Trojan milestone usage and errors. -usage Show frequency for each element type, attribute name, PI name. Other options: -c name Use XML catalog 'name' (default = $default_catalog). -comma Put commas into big numbers (default, use -nocomma to turn off). -dot char Replace the '.' character shown for indents with -v and -vv. -h Display this help. -q Suppress most messages. -v Show document instance events. -version Display version information ($version, sjd). -vv Be even more verbose (track all DTD events, too). Related commands: taglist: Quick & dirty count of start-tags in file(s). normalizeXML: Some statistics are reported with line numbers where things happened, so normalizing your file first can give better reporting. "; }