#!/usr/bin/perl -w
#
# xmlstats: Parse some XML and gather lots of statistics.
#
# API doc at http://search.cpan.org/~msergeant/XML-Parser/Parser.pm
# Also see http://search.cpan.org/~msergeant/XML-Parser-2.34/Expat/Expat.pm
#
# History:
# 2006 sometime: Written by Steven J. DeRose, sderose@acm.org, based on old C.
# 2008-03-23 sjd: Rewrite, clean up; fix langStack.
#
# Todo:
# Count #fixed attr occurrences, redundant #implied/ns/lang?
# Fix max children per element
# Identify all id and idref attributes declared
# Track longest attribute *list*.
# Handle multiple documents
# Fanout/Fanin stats (max children by element type
# Count namespace nodes, elements per namespace
#
use strict;
use Getopt::Long;
my $version = "2008-03-23";
my $default_catalog = "";
# Option values
my $all = 0;
my $catalog = $default_catalog;
my $help = 0;
my $quiet = 0;
my $verbose = 0;
my $vv = 0;
my $orphans = 0;
my $chars = 0;
my $charPairs = 0;
my $comma = 1;
my $dtd = my $stats = my $trackIds = 0;
my $dot = "."; # Char to show indents in verbose mode
my $ids = 0;
my $minFqgiFreq = 1; # Min frequency to show up in report
my $maxFqgiFreq = 0; # Min frequency to report (0=unlimited)
my $nameWidth = 45; # Space to allow in reports by element type
my $rank = 0; # List objects by frequency of occurrence
my $reportParents = 0;
my $reportFqgis = 0; # Report fqgis at all?
my $trackMilestones = 0;
my $trojans = 0;
my $usage = 0; # Report occurrences of various things
# Process options
Getopt::Long::Configure ("ignore_case");
my $result = GetOptions(
"c=s" => \$catalog,
"chars!" => \$chars,
"charpairs!" => \$charPairs,
"comma!" => \$comma,
"dot=s" => \$dot,
"h|help|?" => \$help,
"ids!" => \$trackIds,
"dtd!" => \$dtd,
"fqgis!" => \$reportFqgis,
"maxfqgi=i" => \$maxFqgiFreq,
"minfqgi=i" => \$minFqgiFreq,
"orphans!" => \$orphans,
"parents!" => \$reportParents,
"q|quiet!" => \$quiet,
"rank!" => \$rank,
"stats|all!" => \$all,
"trojans!" => \$trackMilestones,
"usage!" => \$usage,
"v|verbose+" => \$verbose,
"vv+" => \$vv,
"version" => sub {
die "Version of $version, by Steven J. DeRose, sderose\@acm.org.\n";
}
);
if ($help) { showUsage(); exit; }
($result) || die "Bad options.\n";
# Validate and default options
if ($reportFqgis) { $usage = 1; }
if ($reportParents) { $usage = 1; }
if ($vv > 0) { $verbose = $dtd = $usage = 1; }
if ($all) {
$usage = $dtd = $orphans = $ids = $trojans = $chars =
$reportParents = $reportFqgis = 1;
$quiet = 0;
}
($trackMilestones) &&
print "Trojan milestone tracking is not fully implemented.\n";
($catalog eq "") || (-f $catalog) ||
die "Cannot find catalog file '$catalog'.\n";
($maxFqgiFreq >= 0) || die "-maxfqgi value invalid ($maxFqgiFreq).\n";
($minFqgiFreq > 0) || die "-minfqgi value too small ($minFqgiFreq).\n";
my $file = $ARGV[0];
if ($file eq "") {
showUsage();
exit;
}
(-e $file) || die "File '$file' does not exist.\n";
($ARGV[1]) && warn "Extra arguments -- ignored.\n";
($quiet) || warn "Using catalog '$catalog'.\n";
my $cStart = my $cEnd = "";
if ($ENV{USE_COLOR} ne "") {
$cStart = `colorstring bold`;
$cEnd = `colorstring default`;
}
################################################################################
my @tagStack = (); # Current stack of element types
my @langStack = (); # Stack of xml:lang values
my $lastEvent = "";
my $pastDTD = 0;
# DTD stats
my ($docel, $docsys, $docpub); # doctype dcl info
my $totalElementTypes;
my $totalDeclaredContent;
my $totalDeclaredContentEMPTY;
my $totalDeclaredContentCDATA;
my $totalDeclaredContentRCDATA;
my $totalAttrsDeclared;
my %totalAttrTypesByDclValue;
my %totalAttrTypesByDefault;
# Event statistics
my $totalStart, my $totalEnd; # Tag counts
my $totalEmpty;
my $totalWithAttr; # Element with any attributes at all
my $totalModelTokens; # Total tokens in all content models
my $totalPcdataModels;
my $totalAttr; # All attribute instances
my $totalId; # And a few special attributes
my $totalRid;
my $totalXmlLang;
# Instance totals
my $totalChar; # How many PCDATA text nodes
my $totalCharLength; # How much PCDATA length
my $totalWSN; # Number of white-space-only nodes
my $totalWSNlength;
my $totalPi; # Number of Processing instructions
my $totalPiLength; #
my $totalComment; # Count comments
my $totalCommentInDoc;
my $totalCommentInDTD;
my $totalCommentLength;
my $totalCommentLengthInDTD;
my $totalCommentLengthInDoc;
my $totalMarkupLength; # (normalized; may not exactly match source)
my $totalContentLength;
my $totalCdataStart; #
my $totalCdataEnd; #
my $totalCdataNodesAdjacent; # How often are text nodes not coalesced?
my $totalCharsInCdata; #
my $totalEERef; # External entities referenced
my $totalDefault; #
# Counts events/totals by name/type
my %piCounts; # Occurrences of PIs, by name
my %elementCounts; # Occurrences of element types, by name
my %pairCounts; # Tag+parent occurrences
my %fqgiCounts; # Tag context occurrences
my %attrCounts; # Counted only by attr name, not per element
my %elemattrCounts; # Counted per element
my %langCounts; # Use of xml:lang attribute
my %lineBreaks; # As Mac/PC/*nix
my @characterCounts; # By individual character code
my %characterPairCounts; # By sequences of character codes
my $previousChar = " ";
my %totalCdataByElementType = (); # How much CDATA, by gi
my %totalCdataByLang = (); # How much CDATA, by xml:lang
my %totalTextNodesByElementType = (); # How many text nodes (incl whitespace)
my %totalWSNByElementType = (); # How many whitespace-only, by gi
# Trojan milestone statistics
my $nTrojanTags;
my $nTrojanErrors;
my %trojanStarts;
my %trojanUsed;
# Declaration statistics
my %internalGeneralEntities;
my %externalGeneralEntities;
my %internalUnparsedEntities;
my %externalUnparsedEntities;
my %internalParameterEntities;
my %externalParameterEntities;
my %notations;
# Schema information
my %models;
# Context information
my $inCdata; # Are we in a CDATA section?
my $maxLevel; # Deepest nesting level seen
my $maxFqgi; # Deepest string of element types
my $maxFqgiLoc;
my %lastLine; # Line when we last saw each element type start.
# ID/validation information (only does *names* id and rid for now).
my %ids;
my %rids; # occurrences of id/idref values.
# Track the maximum values for some things, where they first happened,
# and save that example for the report.
my $maxNumAttrs;
my $maxNumAttrsExample;
my $maxNumAttrsLoc;
my $maxAttrLength;
my $maxAttr;
my $maxAttrLoc;
my $maxModelLength;
my $maxModelTokens;
my $maxModelDepth;
my $maxModelDepthExample;
my $maxModelDepthLoc;
my $maxTextNodeLength;
my $maxTextNodeLoc;
my $maxTextNode;
my $maxPiLength;
my $maxPiLoc;
my $maxPiName;
my $maxPi;
my $maxCommentLength;
my $maxCommentLoc;
my $maxComment;
my $maxChildrenPerElement;
my $maxChildrenPerElementType;
my $maxChildrenPerElementLoc;
my @nChildren = ();
my $gaveXslWarning = 0;
resetStats();
my $startTime;
my $endTime;
################################################################################
# Set up the parser and callbacks
use XML::Parser;
use XML::Catalog;
#my $parser = new XML::Parser(ErrorContext => 2);
my $parser = new XML::Parser(ErrorContext => 0);
if ($catalog ne "") {
my $catalogObject=XML::Catalog->new($catalog);
$parser->setHandlers(ExternEnt => $catalogObject->get_handler($parser));
}
$parser->setHandlers(
Init => \&initHandler,
Final => \&finalHandler,
Start => \&startTagHandler,
End => \&endTagHandler,
Char => \&charHandler,
Proc => \&piHandler,
Comment => \&commentHandler,
CdataStart => \&cdataStartHandler,
CdataEnd => \&cdataEndHandler,
Doctype => \&doctypeHandler,
DoctypeFin => \&doctypeFinHandler,
Default => \&defaultHandler,
Entity => \&entityDclHandler,
Element => \&elementDclHandler
);
# Process the XML (can only do one document per parser instance)
$parser->parsefile($file);
# $parser->parse("
Hello
");
# Issue reports
($quiet) || reportDtdStats();
($quiet) || reportStats();
($trackIds) && reportIds();
($trackMilestones) && reportTrojans();
if ($usage) {
if ($rank) { reportUsageByRanks(); }
else { reportUsageByNames(); }
}
reportCharUsage();
($dtd) && reportDtd();
($quiet) ||
print "\n*** Done (xmlstats version of $version, by Steven J. DeRose) ***\n\n";
exit;
##############################################################################
sub resetStats {
@tagStack = @langStack = ();
$lastEvent = "";
$docel = $docsys = $docpub = "";
$totalElementTypes = 0;
$totalDeclaredContent = 0;
$totalDeclaredContentEMPTY = 0;
$totalDeclaredContentCDATA = 0;
$totalDeclaredContentRCDATA = 0;
$totalAttrsDeclared = 0;
%totalAttrTypesByDclValue =
("ID" => 0, "IDREFS" => 0, "RID" => 0,
"CDATA" => 0, "NMTOKEN" => 0, "NMTOKENS" => 0);
%totalAttrTypesByDefault =
("#IMPLIED" => 0, "#REQUIRED" => 0);
$pastDTD = 0;
# Event statistics
$totalStart =0; # Tag counts
$totalEnd = 0;
$totalEmpty = 0;
$totalWithAttr = 0; # Element with any attributes at all
$totalModelTokens = 0; # Total tokens in all content models
$totalPcdataModels = 0;
$totalAttr = 0; # All attribute instances
$totalId = 0; # And a few special attributes
$totalRid = 0;
$totalXmlLang = 0;
# Instance totals
$totalChar = 0; # How many PCDATA text nodes
$totalCharLength = 0; # How much PCDATA length
$totalWSN = 0; # Number of white-space-only nodes
$totalWSNlength = 0;
$totalPi = 0; # Number of Processing instructions
$totalPiLength = 0; #
$totalComment = 0; #
$totalCommentInDoc = 0;
$totalCommentInDTD = 0;
$totalCommentLength = 0;
$totalCommentLengthInDTD = 0;
$totalCommentLengthInDoc = 0;
$totalCdataStart = 0; #
$totalCdataEnd = 0; #
$totalCdataNodesAdjacent = 0;
$totalCharsInCdata = 0; #
$totalEERef = 0; # External entities referenced
$totalDefault = 0; #
# Counts of events by name/type
%piCounts = (); # Occurrences of PIs, by name
%elementCounts = (); # Occurrences of element types, by name
%pairCounts = (); # Tag+parent occurrences
%fqgiCounts = (); # Tag context occurrences
%attrCounts = (); # Counted only by attr name, not per element
%elemattrCounts = (); # Counted only by attr name, not per element
%langCounts = (); # Use of xml:lang attribute
%lineBreaks = (); # As Mac/PC/*nix
@characterCounts = ();
%totalCdataByElementType = ();# How much CDATA anywhere under each gi
%totalCdataByLang = ();
%totalTextNodesByElementType = ();
%totalWSNByElementType = ();
# Trojan milestone statistics
$nTrojanTags = 0;
$nTrojanErrors = 0;
%trojanStarts = ();
%trojanUsed = ();
# Declaration statistics
%internalGeneralEntities = ();
%externalGeneralEntities = ();
%internalUnparsedEntities = ();
%externalUnparsedEntities = ();
%internalParameterEntities = ();
%externalParameterEntities = ();
%notations = ();
# Schema information
%models = ();
# Context information
$inCdata = 0; # Are we in a CDATA section?
$maxLevel = 0; # Deepest nesting level seen
$maxFqgi = ""; # Deepest string of element types
$maxFqgiLoc = "-1";
%lastLine = (); # Line when we last saw each element type start.
# ID/validation information (only does *names* id and rid for now).
%ids = ();
%rids = (); # occurrences of id/idref values.
# Track the maximum values for some things, and where they first happened
$maxNumAttrs = 0;
$maxNumAttrsExample = "";
$maxNumAttrsLoc = "-1";
$maxAttrLength = 0;
$maxAttr = "";
$maxAttrLoc = "-1";
$maxModelLength = 0;
$maxModelTokens = 0;
$maxModelDepth = 0;
$maxModelDepthExample = "";
$maxModelDepthLoc = -1;
$maxTextNodeLength = 0;
$maxTextNodeLoc = "-1";
$maxTextNode = "";
$maxPiLength = 0;
$maxPiLoc = "-1";
$maxPiName = "";
$maxPi = "";
$maxCommentLength = 0;
$maxCommentLoc = "-1";
$maxComment = "";
$maxChildrenPerElement = 0;
$maxChildrenPerElementType = "";
$maxChildrenPerElementLoc = -1;
@nChildren = ();
$gaveXslWarning = 0;
} # resetStats
##############################################################################
sub initHandler {
($vv) && verbose($_[0],"Init",$_[1]);
$startTime = time();
$lastEvent = "INIT";
}
sub finalHandler {
($vv) && verbose($_[0],"Final",$_[1]);
$endTime = time();
$lastEvent = "FINAL";
}
sub startTagHandler {
($verbose) && verbose($_[0],"Start-tag",$_[1]);
$totalStart++;
my $parentDepth = (scalar @tagStack) - 1;
if ($parentDepth > 0) {
$nChildren[$parentDepth]++;
}
push(@tagStack,$_[1]);
$nChildren[$parentDepth+1] = 0;
my $e = "?ent?"; # $_[0]->current_entity;
my $l = $_[0]->current_line;
$lastLine{$_[1]} = "$e: $l";
$elementCounts{$_[1]}++;
my $fqgi = join("/",@tagStack);
$fqgiCounts{$fqgi}++;
(my $pair = $fqgi) =~ s|^.*/([^/]*/[^/]*)$|$1|;
$pairCounts{$pair}++;
# print "gi: $_[1], fqgi: $fqgi, pair: $pair\n";
if ($_[0]->depth > $maxLevel) {
$maxLevel = $_[0]->depth;
$maxFqgi = $fqgi;
$maxFqgiLoc = $_[0]->current_line;
}
# See if there are any attributes
if ($_[2]) {
$totalWithAttr++;
}
# Keep track of number of attrs, and a few specific attributes.
my $attlist = "";
my $trojanType = ""; my $trojanKey = "";
my $newLang = "";
my $numAttrs = 0;
for (my $i=2; $i $maxAttrLength) {
$maxAttrLength = $alen;
$maxAttr = $cvalue;
$maxAttrLoc = $_[0]->current_line;
}
if ($cname eq "id") {
$totalId++;
if (exists $ids{$cvalue}) {
if (!$gaveXslWarning) {
print "Duplicate ID value '$cvalue' at line "
. $_[0]->current_line
. " (prior occurrence was at line $ids{$cvalue}).\n";
if ($cvalue =~ m/\{.*\}/) {
print "(this looks like an xsl file, not reporting again)\n";
$gaveXslWarning = 1;
}
}
}
else {
$ids{$cvalue} = $_[0]->current_line;
}
}
elsif ($cname eq "rid") {
$totalRid++;
$rids{$cvalue}++;
}
elsif ($cname eq "xml:lang") {
$totalXmlLang++;
$newLang = $cvalue;
$langCounts{$newLang}++;
}
elsif ($cname eq "sID") { # start Trojan
$trojanType = "s";
$trojanKey = $cvalue;
if ($trojanStarts{$trojanKey} or
$trojanUsed{$trojanKey}) {
$nTrojanErrors++;
($trackMilestones) && print
"Invalid: Trojan milestone key '$trojanKey' reused.\n";
}
else {
$trojanStarts{$trojanKey} = 1;
}
} # sID
elsif ($cname eq "eID") { # end Trojan
$trojanType = "e";
$trojanKey = $cvalue;
if ($trojanUsed{$trojanKey}) {
($trackMilestones) && print
"Invalid: Trojan milestone key '$trojanKey' reused.\n";
$nTrojanErrors++;
}
elsif ($trojanStarts{$trojanKey}) {
delete $trojanStarts{$trojanKey};
$trojanUsed{$trojanKey} = 1;
}
else {
($trackMilestones) && print
"Invalid: Trojan milestone key '$trojanKey' appears"
. " on eID before sID.\n";
$nTrojanErrors++;
}
} # eID
} # for each attribute
$totalAttr += $numAttrs;
if ($numAttrs > $maxNumAttrs) {
$maxNumAttrs = $numAttrs;
$maxNumAttrsExample = "<$_[1]$attlist>";
$maxNumAttrsLoc = $_[0]->current_line;
}
$totalMarkupLength += length(join("",@_)."<>") + $numAttrs*4;
# Keep track of milestone markup
if ($trojanType) {
$nTrojanTags++;
if ($trojanType eq "e" and $numAttrs > 1) {
($trackMilestones) && print
"Invalid: Trojan milestone with eID='$trojanKey' has other attributes.\n";
$nTrojanErrors++;
}
}
if ($newLang eq "") {
$newLang = $langStack[scalar(@langStack)-1]; # inherit
}
if (!defined($newLang) || $newLang eq "") {
$newLang = "???";
}
push @langStack, $newLang;
$lastEvent = "STARTTAG";
} # sub startTagHandler
sub endTagHandler {
($verbose) && verbose($_[0],"End-tag",$_[1]);
if ($nChildren[scalar @tagStack - 1] > $maxChildrenPerElement) {
$maxChildrenPerElement = $nChildren[scalar @tagStack - 1];
$maxChildrenPerElementType = $tagStack[scalar @tagStack - 1];
$maxChildrenPerElementLoc = $_[0]->current_line;
}
$totalEnd++;
my $expectedTag = pop(@tagStack);
if ($expectedTag ne $_[1]) {
my $l = $_[0]->current_line;
print "Well-formedness error: Found end of '$_[1]' at line $l when "
. " expecting '$expectedTag'.\n.";
print "Last '$expectedTag' was started at $lastLine{$expectedTag}.\n";
}
$totalMarkupLength += length("$_[1]>");
pop @langStack;
if ($lastEvent eq "STARTTAG") {
$totalEmpty++;
}
$lastEvent = "ENDTAG";
} # endTagHandler
sub charHandler {
($verbose) && verbose($_[0],"Text node",$_[1]);
my $textNodeLength = length($_[1]);
$totalChar++;
$totalCharLength += $textNodeLength;
if ($inCdata) {
$totalCharsInCdata += $textNodeLength;
}
if ($textNodeLength > $maxTextNodeLength) {
$maxTextNodeLength = $textNodeLength;
$maxTextNode = $_[1];
$maxTextNodeLoc = $_[0]->current_line;
}
my $c = $_[1];
if ($c =~ m/^\s*$/) {
$totalWSN++;
$totalWSNlength += length($c);
}
# Count line-breaks in there, by types
(my $tbuf = $_[1]) =~ s/\r\n//g;
my $tbuflen = length($tbuf);
$lineBreaks{"PC"} += (($textNodeLength - $tbuflen) / 2.0);
$tbuf =~ s/\r//g;
$lineBreaks{"Mac"} += $tbuflen - length($tbuf);
$tbuf =~ s/\n//g;
$lineBreaks{"*nix"} += $tbuflen - length($tbuf);
# Count occurrences of characters
if ($chars) {
for (my $i=0; $i");
(my $piKey = $_[1]);
$piCounts{$piKey}++;
if (length($_[2]) > $maxPiLength) {
$maxPiLength = length($_[2]);
$maxPiName = $_[1];
$maxPi = $_[2];
$maxPiLoc = $_[0]->current_line;
}
$totalMarkupLength += length("$_[1] $_[2]?>");
$lastEvent = "PI";
}
sub commentHandler {
($verbose) && verbose($_[0],"Comment",$_[1]);
$totalComment++;
$totalCommentLength += length("");
if ($pastDTD) {
$totalCommentInDoc++;
$totalCommentLengthInDoc += length($_[1]);
}
else {
$totalCommentInDTD++;
$totalCommentLengthInDTD += length($_[1]);
}
if (length($_[1]) > $maxCommentLength) {
$maxCommentLength = length($_[1]);
$maxComment = $_[1];
$maxCommentLoc = $_[0]->current_line;
}
$totalMarkupLength += length("");
$lastEvent = "COMMENT";
}
sub cdataStartHandler {
($verbose) && verbose($_[0],"CDATA start",$_[1]);
$totalCdataStart++;
$inCdata = 1;
$totalMarkupLength += length("");
$lastEvent = "CDATAEND";
}
sub doctypeHandler {
(my $p, $docel, $docsys, $docpub) = @_;
}
sub doctypeFinHandler {
$pastDTD = 1;
$lastEvent = "DOCTYPEFIN";
}
# This won't be called if there's a catalog in use....
sub externEntHandler {
($verbose) && verbose($_[0],"External Entity Ref",$_[1]);
$totalEERef++;
$lastEvent = "EXTERNENT";
}
sub externEntEndHandler {
($verbose) && verbose($_[0],"External Entity End",$_[1]);
$lastEvent = "EXTERNENTEND";
}
sub defaultHandler {
($vv) && scalar($_)>1 && verbose($_[0],"Default",$_[1]);
$totalDefault++;
my ($p, $data) = @_;
if ($data =~ /^");
}
msg1( "External entity refs (unless catalog in use): ", $totalEERef);
} # sub reportStats
################################################################################
sub reportDtdStats {
head( "
DTD STATISTICS
");
if (!$dtd) {
msg1( " (use -dtd option to get this information)");
return;
}
head( "Element types declared: ", $totalElementTypes);
msgn( "Element types with #PCDATA: ", $totalPcdataModels);
msgn( "Maximum content model length: ", $maxModelLength);
msgn( "Maximum content model tokens: ", $maxModelTokens);
if ($totalElementTypes != 0) {
msg1( "Average content model tokens: " .
($maxModelTokens / $totalElementTypes));
}
msgn( "Maximum content model () depth: ", $maxModelDepth);
msg1( " A model of that depth (at line $maxModelDepthLoc):"
. " $maxModelDepthExample");
head("Attributes declared:", $totalAttrsDeclared);
msg1( " For each declared value:");
foreach my $x (sort keys %totalAttrTypesByDclValue) {
msgn( " $x", $totalAttrTypesByDclValue{$x});
}
msg1( " For each default value:");
foreach my $x (sort keys %totalAttrTypesByDefault) {
msgn( " $x", $totalAttrTypesByDefault{$x});
}
msg1( "Comments in DTD ", $totalCommentInDTD);
} # sub reportDtdStats
################################################################################
sub reportUsageByNames {
head("Specific element type usage:");
head( "Element occurrences by type name:"); # Elements
foreach my $e (sort keys %elementCounts) {
msgn( " $e", $elementCounts{$e});
}
if ($reportParents) {
head( "Element occurrences by parent:"); # Parents
foreach my $p (sort keys %pairCounts) {
my $freq = $pairCounts{$p};
msgn( " $p", $freq);
}
}
if ($reportFqgis) {
head( "Element context occurrences by context:"); # FQGIs
foreach my $e (sort keys %fqgiCounts) {
my $freq = $fqgiCounts{$e};
if (($freq>$minFqgiFreq) &&
($freq<$maxFqgiFreq || $maxFqgiFreq==0)) {
print lpadc($freq) . " $e\n";
}
}
}
if (scalar keys %attrCounts <= 0) { # Attributes
head( "No attribute occurrences.");
}
else {
head( "Attribute occurrences by attribute name (not per element):");
foreach my $a (sort keys %attrCounts) {
msgn( " $a", $attrCounts{$a});
}
head( "Attribute occurrences by element/attribute name:");
foreach my $a (sort keys %elemattrCounts) {
msgn( " $a", $elemattrCounts{$a});
}
}
if (scalar keys %langCounts > 0) { # xml:lang
head( "xml:lang attribute occurrences by language value:");
foreach my $l (sort keys %langCounts) {
msgn( " $l", $langCounts{$l});
}
}
else {
head( "No xml:lang attribute occurrences.");
}
if (scalar keys %piCounts > 0) { # PIs
head( "PI occurrences by name:");
foreach my $p (sort keys %piCounts) {
msgn( " $p", $piCounts{$p});
}
}
else {
head( "No PI occurrences.");
}
} # sub reportUsageByNames
# Comparison routine so we can sort hash keys by value, not name.
sub elementsByValue { $elementCounts{$a} <=> $elementCounts{$b}; }
sub parentsByValue { $pairCounts{$a} <=> $pairCounts{$b}; }
sub fqgisByValue { $fqgiCounts{$a} <=> $fqgiCounts{$b}; }
sub attributesByValue { $attrCounts{$a} <=> $attrCounts{$b}; }
sub langByValue { $langCounts{$a} <=> $langCounts{$b}; }
sub pisByValue { $piCounts{$a} <=> $piCounts{$b}; }
sub reportUsageByRanks {
head("Specific element type usage:");
head( "Element occurrences by frequency:"); # Elements
foreach my $e (sort elementsByValue keys %elementCounts) {
msgn( " $e", $elementCounts{$e});
}
if ($reportParents) {
head( "Element occurrences by parent:"); # Parents
foreach my $p (sort parentsByValue keys %pairCounts) {
my $freq = $pairCounts{$p};
msgn( " $p", $freq);
}
}
if ($reportFqgis) {
head( "Element context occurrences by context:"); # FQGIs
foreach my $e (sort fqgisByValue keys %fqgiCounts) {
my $freq = $fqgiCounts{$e};
if (($freq>$minFqgiFreq) &&
($freq<$maxFqgiFreq || $maxFqgiFreq==0)) {
print lpad($freq) . " $e\n";
}
}
}
if (scalar keys %attrCounts > 0) { # Attributes
head( "Attribute occurrences by frequency:");
foreach my $a (sort attributesByValue keys %attrCounts) {
msgn( " $a", $attrCounts{$a});
}
}
else {
head( "No attribute occurrences.");
}
if (scalar keys %langCounts > 0) { # xml:lang
head( "xml:lang attribute occurrences by frequency:");
foreach my $l (sort langByValue keys %langCounts) {
msgn( " $l", $langCounts{$l});
}
}
else {
head( "No xml:lang attribute occurrences.");
}
if (scalar keys %piCounts > 0) { # PIs
head( "PI occurrences by frequency:");
foreach my $p (sort pisByValue keys %piCounts) {
msgn( " $p", $piCounts{$p});
}
}
else {
head( "No PI occurrences.");
}
} # sub reportUsageByRanks
###############################################################################
sub reportCharUsage {
if ($chars) {
head( "\nCharacter occurrences in content:");
my $ascii=0, my $latin1=0; my $rest=0;
for (my $i=1; $i=32 && $i<256) ? chr($i):" ");
print " ", lpad($characterCounts[$i]) . "\n";
}
} # for
msg1( " Ascii: $ascii, Latin-1: $latin1, other: $rest.");
}
head( "Total (direct) cdata content by element type:");
foreach my $l (keys %totalCdataByElementType) {
msgn( " $l", $totalCdataByElementType{$l});
}
head( "Total (direct) cdata content by language:");
foreach my $l (keys %totalCdataByLang) {
msgn( " $l", $totalCdataByLang{$l});
}
head( "Total (direct) text nodes by element type:");
foreach my $l (keys %totalTextNodesByElementType) {
msgn( " $l", $totalTextNodesByElementType{$l});
}
head( "Total (direct) whitespace-only text nodes by element type:");
foreach my $l (keys %totalWSNByElementType) {
msgn( " $l", $totalWSNByElementType{$l});
}
head( "Total (direct) non-whitespace-only text nodes by element type:");
foreach my $l (keys %totalTextNodesByElementType) {
my $wsn = (defined $totalWSNByElementType{$l}) ?
$totalWSNByElementType{$l}:0;
msgn( " $l",
$totalTextNodesByElementType{$l} - $wsn);
}
if ($charPairs) {
head( "Character pair frequencies:");
for my $pair (sort keys %characterPairCounts) {
printf "%06x %06x $06d '%s'\n",
ord(substr($pair,0,1)), ord(substr($pair,1,1)),
$characterPairCounts{$pair}, $pair;
}
}
} # reportCharUsage
###############################################################################
sub reportIds {
my $numids = scalar keys %ids;
head( "Total distinct IDs: ", $numids);
my $numrids = scalar keys %rids;
msgn( "Total IDREFs: ", $numrids);
# Check for missing IDs
my $ilist = "";
foreach my $r (keys %rids) {
if (!exists $ids{$r}) { $ilist .= "$r "; }
}
($ilist) &&
msg1( "ERROR: IDREFs exist for these missing IDs:\n$ilist");
# Check for orphaned IDs (no IDREFs to them)
my $rlist = "";
foreach my $i (sort keys %ids) {
if (!exists $rids{$i}) { $rlist .= "$i \t"; }
}
($orphans) && ($rlist) &&
msg2br( "Orphan IDs (never referenced) are:",
$rlist);
} # sub reportIds
###############################################################################
sub reportTrojans {
head( "Trojan Milestone Information:");
my $nTrojanErrors = 0;
if (scalar keys %trojanStarts) {
my $unclosed = join(", ", keys %trojanStarts);
msg1( "Invalid: Trojan milestones started but never ended: ", $unclosed);
$nTrojanErrors += scalar keys %trojanStarts;
}
msgn( " Total Trojan milestone tags: ", $nTrojanTags);
msgn( " Total Trojan milestone errors: ", $nTrojanErrors);
} # reportTrojans
###############################################################################
sub reportDtd {
head("DTD Information:");
my $nelements = (scalar keys %models);
msg1( "Element types declared: ", $nelements);
if ($nelements) {
# Calculate (type) fanout
my @withNchildren = ();
my $totalChildTypes = 0;
my @modelKeys = keys %models;
foreach my $x (@modelKeys) {
my $model = $models{$x};
$model =~ s/[?*+(),|&]/ /g;
my @tokens = split(" *",$model);
my $n = scalar(@tokens);
if ($tokens[0] eq "") { $n--; }
$withNchildren[$n]++;
$totalChildTypes += $n;
}
msg1( " # child types","# element types");
for (my $j=0; $j 0) {
$s .= (" " x $needed);
}
return($s);
}
sub lpad {
my $s = (defined $_[0]) ? $_[0]:"";
my $len = (defined $_[1]) ? $_[1]:0;
if ($len <= 2) { $len = 10; }
my $needed = $len - length($s);
if ($needed > 0) {
$s = (" " x $needed) . $s;
}
return($s);
}
sub lpadc { # pad, and also insert commas every three digits.
my $s = (defined $_[0]) ? $_[0]:"";
my $len = (defined $_[1]) ? $_[1]:0;
my $buf = "";
if ($comma) {
while (length($s) > 3) {
$buf = "," . substr($s,length($s)-3) . $buf;
$s = substr($s,0,length($s)-3);
}
}
$buf = "$s$buf";
if ($len <= 2) { $len = 10; }
my $needed = $len - length($buf);
if ($needed > 0) {
$buf = (" " x $needed) . $buf;
}
return($buf);
}
################################################################################
sub showUsage {
print "
Usage:
xmlstats [options] [file.xml]
Parses an XML file and reports lots of statistics.
Options to request specific statistics:
-stats Show all available statistical information.
-chars Count occurrences of individual content characters
-charpairs Count occurrences of sequences of two characters, and
calculate entropy.
-dtd Show additional DTD information.
-fqgis Report element contexts with frequencies.
-ids Track ID uniqueness, orphans, etc.
(for duplicate IDs, their line numbers will be given).
-maxfqgi n Report only element contexts with n occurrences.
-orphans Report IDs that are never referenced by IDREFs.
-parents Report how often each element type occurs with each parent type.
-rank Sort elements/attributes by frequency, not name.
-trojans Track and report Trojan milestone usage and errors.
-usage Show frequency for each element type, attribute name, PI name.
Other options:
-c name Use XML catalog 'name' (default = $default_catalog).
-comma Put commas into big numbers (default, use -nocomma to turn off).
-dot char Replace the '.' character shown for indents with -v and -vv.
-h Display this help.
-q Suppress most messages.
-v Show document instance events.
-version Display version information ($version, sjd).
-vv Be even more verbose (track all DTD events, too).
Related commands:
taglist: Quick & dirty count of start-tags in file(s).
normalizeXML: Some statistics are reported with line numbers where things
happened, so normalizing your file first can give better reporting.
";
}