#!/usr/bin/perl -w # # Written by Steven J. DeRose, sderose@acm.org. # # A basic XML parser, started 2:13 2008-08-14. Working in 3 hours 50 minutes, except # for recursive and external entities, and entities inside attribute values. # 2008-08-18 sjd: Add entity manager for external entities (~1hr more). use strict; use Getopt::Long; my $version = "2008-08-18"; my @dirs = (); # Places to look for external entity files my $dtd = 1; # Show DTD events? my $help = 0; my $quiet = 0; my $verbose = 0; my $ws = 0; # Keep whitespace-only nodes? ################################################################################ Getopt::Long::Configure ("ignore_case"); my $result = GetOptions( "dir=s" => \@dirs, "dtd!" => \$dtd, "h|help" => \$help, "q!" => \$quiet, "v+" => \$verbose, "version" => sub { die "Version of $version by Steven J. DeRose, sderose\@acm.org.\n"; }, "ws!" => \$ws ); if ($help) { showUsage(); exit; } ($result) || die "Bad options.\n"; ############################################################################### # List of defined constructs (markup declarations will add to this) my %elements = (); # Defined elements (later) my %attlists = (); # Defined attlists, by element (later) my %ents = ( "lt" => "<", # Defined internal entities "gt" => ">", "amp" => "&", # "lsqb" => "[", # "rsqb" => "]", "apos" => "'", "quo" => "\"" ); my %systemEnts = (); # General external entities my %pents = (); # Defined parameter entities (later) my %systemPents = (); # Parameter external entities # Parse state my @nameSpaceStack = (); # Active namespace lists, in sync with tagStack my @tagStack = (); # open element types my @tagStartLocs = (); # entity/loc where each was opened my $seenXmlDcl = 0; # Have we seen it yet? my $inDoctype = 0; # In middle of parsing Doctype declaration? my $seenDoctype = 0; # Past start of Doctype declaration? my $pastDoctype = 0; # Past end of Doctype declaration? my $textBuf = 0; # Accumulated text node contents (-f $ARGV[0]) || die "Cannot find main document file at '$ARGV[0]'.\n"; dumpEvent("INIT",""); defineEntity("-document-", $ARGV[0], "SYSTEM"); openExternalEntity("-document-", $ARGV[0]); (defined fillBuf()) || err("Unable to load beginning of main document."); skipWhiteSpace(); my $c = ""; while ($c = curChar()) { vwarn2("Main loop, c is '$c'"); if ($c eq '<') { doTag(); } elsif ($c eq '&') { doEnt(); } elsif ($inDoctype && $c eq '%') { doParameterEntityRef(); } elsif ($inDoctype && $c eq ']' && nextChar() eq '>') { nextChar(); dumpEvent("DOCTYPEFIN",""); $inDoctype = 0; $pastDoctype = 1; } # Is ']]>' outside a marked section, a WF error? I forget.... else { doChar(); } } if (scalar(@tagStack)>0) { for (my $i=scalar(@tagStack); $i>=0; $i--) { print "Open element '$tagStack[$i]', started at $tagStartLocs[$i].\n"; } err("Unclosed element(s) at EOF."); } dumpEvent("FINAL",""); ($quiet) || vwarn("Done."); exit; ############################################################################### sub doTag { (curChar() eq "<") || err("doTag called while not at '<', but " . curChar()); nextChar(); vwarn2("doTag: char after '<' is '" . curChar() . "'"); if (curChar() eq '!') { doDcl(); } elsif (curChar() eq '?') { doPI(); } elsif (curChar() eq '/') { doEnd(); } else { doStart(); } } sub doDcl { (curChar() eq "!") || err("doDcl called while not at '!', but " . curChar()); nextChar(); # Discard the bang vwarn2("In doDcl, buf is: " . restOfBuf()); if (curChar() eq "-") { (nextChar() eq "-") || err("Comment(?) starts with '")); } elsif (curChar() eq "[" && restOfBuf() =~ /^\[CDATA\[/) { # Marked section for (my $i=0; $i")); } else { my $dclType = readName(); my $paramEntity = 0; vwarn2("Dcl type '$dclType', buf: " . restOfBuf()); skipWhiteSpace(); if (curChar() eq "%") { ($dclType eq "ENTITY") || err("Invalid '%'"); $paramEntity = 1; nextChar(); skipWhiteSpace(); } my $name = readName(); vwarn2("Declared item name '$name'"); skipWhiteSpace(); if ($dclType eq "ELEMENT") { dumpEvent("ELEMENT", $name); } elsif ($dclType eq "ENTITY") { my $idType = ""; skipWhiteSpace(); if (curChar() !~ /['"]/) { $idType= readName(); ($idType eq "PUBLIC" || $idType eq "SYSTEM") || err("Keyword not PUBLIC or SYSTEM for entity '$name'."); skipWhiteSpace(); } if ($idType eq "PUBLIC") { my $publicId = readAttrValue(); skipWhiteSpace(); } my $replacementText = readAttrValue(); my $info = "$name=$replacementText"; if ($paramEntity ne 0) { dumpEvent("ENTITY_PAR", $info); defineParameterEntity($name, $replacementText); } else { defineEntity($name, $replacementText, $idType); if ($idType ne "") { dumpEvent("ENTITY_EXT", $info); } else { dumpEvent("ENTITY_INT", $info); } } } elsif ($dclType eq "ATTLIST") { dumpEvent("ATTLIST", $name); } elsif ($dclType eq "NOTATION") { dumpEvent("NOTATION", $name); } elsif ($dclType eq "DOCTYPE") { if ($seenDoctype) { err("Duplicate DOCTYPE, first was at line $seenDoctype"); } $seenDoctype = 1; $inDoctype = 1; dumpEvent("DOCTYPE", $name); readToAndDiscard("["); return; # Don't scan for '>', must parse subset.... } else { err("Uknown markup declaration type: '$dclType'."); } readToAndDiscard(">"); # fails on embedded quoted gt } } # doDcl sub doParameterEntityRef { (curChar() eq "%") || err("doParameterEntityRef called when not at '%'."); nextChar(); my $name = readName(); (curChar() eq ";") || err("Parameter entity reference not ended with ';', but '" . curChar() . "'."); nextChar(); # Nuke the semicolon err("Parameter entities references are not yet supported. Skipping '$name'"); } sub doPI { (curChar() eq "?") || err("doPI called while not at '?', but " . curChar()); nextChar(); my $target = readName(); if ($target ne "xml") { vwarn2("In doPI, target is '$target'"); dumpEvent("PITARGET", $target); dumpEvent("PIVALUE", readToAndDiscard("?>")); } else { vwarn2("In doPI for XML dcl."); ($seenXmlDcl) && err("Duplicate XML dcl, previous was at line $seenXmlDcl"); dumpEvent("XMLDCL", readToAndDiscard("?>")); $seenXmlDcl = 1; } } # doPI sub doEnd { (curChar() eq "/") || err("doTag called while not at '/', but " . curChar()); nextChar(); my $name = readName(); skipWhiteSpace(); if (curChar() ne ">") { err("Can't find '>' in end-tag for '$name'."); } nextChar(); ($name eq $tagStack[-1]) || err("Close-tag '$name' does not match expected '$tagStack[-1]'"); pop @tagStack; pop @tagStartLocs; dumpEvent("END", $name); } # doEnd sub doStart { my $empty = 0; my $name = readName(); push @tagStack, $name; push @tagStartLocs, currentEntityLoc(); dumpEvent("START", $name); my $done = 0; while (!$done) { skipWhiteSpace(); if (curChar() eq "/") { $empty = 1; nextChar(); } if (curChar() eq ">") { $done = 1; nextChar(); } else { # better be an attribute vwarn2("Trying for aname, buf: " . restOfBuf()); my $aname = readName(); vwarn2(" Got aname '$aname'."); skipWhiteSpace(); (curChar() eq "=") || err("Missing '=' after attr name '$name'."); nextChar(); skipWhiteSpace(); my $avalue = readAttrValue(); dumpEvent("ATTR", "$aname=$avalue"); } } # while if ($empty) { pop @tagStack; dumpEvent("END", $name); # Empty element tag, so issue end-tag event too. } } # doStart sub doEnt { # Deal with an entity or numeric character reference. my $numeric = my $hex = 0; if (curChar() eq "#") { # numeric $numeric = 1; nextChar(); if (curChar() =~ /[xX]/) { $hex = 1; nextChar(); } } my $gotSemi = 0; my $name = ""; while (my $c = nextChar()) { if ($c eq ";") { $gotSemi = 1; last; } $name .= $c; } $gotSemi || die "Unterminated entity/char reference '$name'"; if ($numeric) { my $n = 0; if ($hex) { ($name =~ /[0-9a-f]+/i) || err("Invalid char in hex char ref '$name'"); $n = hex($name); } else { ($name =~ /[0-9]+/i) || err("Invalid char in dec char ref '$name'"); $n = $name - 0; } if ($n <= 0 || $n > 65535) { err("Char ref out of numeric range: $n"); } doChar(chr($n)); } else { # named my $e; if ($e = $ents{$name}) { openInternalEntity($name,$e); } elsif ($e = $systemEnts{$name}) { openExternalEntity($name,$e); } else { err("Undefined entity name '$name'.\n"); } } } # doEnt sub doChar { my $c = curChar(); if ($verbose > 1) { if (ord($c) < 33) { dumpEvent("CHAR", "#" . ord($c)); } else { dumpEvent("CHAR", "'$c'"); } } else { $textBuf .= $c; } nextChar(); } sub dumpTextBuf { if ($textBuf eq "") { return; } if (!$dtd && !$pastDoctype) { $textBuf=""; return; } if ($textBuf =~ /^\s*$/ && ($inDoctype || !$ws)) { $textBuf=""; return; } $textBuf =~ s/\n/\\n/g; print "TEXT $textBuf\n"; $textBuf = ""; } sub dumpEvent { if (!$dtd && !$pastDoctype) { return; } if ($textBuf ne "") { dumpTextBuf(); } print sprintf("%-10s %s\n", $_[0], $_[1]); } ############################################################################### ################## ENTITY / INPUT BUFFER MGMT sub defineEntity { vwarn("Defining entity $_[0], $_[1], $_[2].\n"); my $ename = $_[0]; ($ename =~ /^(lt|gt|amp|quo|apos)$/) && err("Cannot redefine pre-defined entity $ename."); (defined $ents{$ename}) && warn "Warning: More than one dcl for entity $ename.\n"; if (defined $_[2] && $_[2] ne "") { # SYSTEM $systemEnts{$ename} = $_[1]; } else { $ents{$ename} = $_[1]; } } sub defineParameterEntity { my $ename = $_[0]; (defined $ents{$ename}) && warn "Warning: More than one dcl for parameter entity $ename.\n"; if (defined $_[2] && $_[2] ne "") { # SYSTEM $systemPents{$ename} = $_[1]; } else { $pents{$ename} = $_[1]; } } BEGIN { # Stuff needed per open entity: my @oeHandle = (); # A stack of open entity file handles my @oeName = (); # Name of the entity my @oeFilename = (); # Corresponding file name my @oeLinenum = (); # Current line-number in the entity my @l = (); # Current record of input file my @cursor = (); # Current loc in the input record sub curChar { # Return the character the cursor is on, without consuming it if ($cursor[-1] >= length($l[-1])) { fillBuf() || return(undef); } return(substr($l[-1],$cursor[-1],1)); } sub consumeChar { # Return the character the cursor is on, AND consume it my $c = curChar(); nextChar(); return($c); } sub nextChar { # Move to and return, the next character. $cursor[-1]++; return(curChar()); } sub pushBackChar { # not used (--$cursor[-1] >= 0) || err("Cannot push back another character, buf is: '" . restOfBuf() . "'"); } sub readName { # Read an XML NAME vwarn2("readName entered."); my $buf = ""; if (curChar() !~ /\w/) { err("Invalid name-start character '" . curChar() . "'"); } while (curChar() =~ /[-.:_\w\d]/) { $buf .= consumeChar(); } return($buf); } sub skipWhiteSpace { my $buf = ""; while (curChar() =~ /\s/) { $buf .= consumeChar(); } vwarn2("After skipWhiteSpace, curChar is '" . curChar() . "'"); return($buf); } # This needs to add handling for entity refs inside! sub readAttrValue { # Quoted either way vwarn2("In readAttrValue, buf: " . restOfBuf()); my $origline = $oeLinenum[-1]; my $buf = ""; skipWhiteSpace(); if (curChar() !~ /["']/) { err("Invalid attribute-value start character '" . curChar() . "'"); } my $qtype = curChar(); while (nextChar() ne $qtype) { $buf .= curChar(); } nextChar() || err("EOF in middle of attribute value started on line $origline"); return($buf); } sub readToAndDiscard { # End-string cannot be broken across line boundary vwarn2("In readToAndDiscard for '$_[0]'"); my $origline = $oeLinenum[-1]; my $buf = ""; my $hit = ""; $l[-1] = substr($l[-1],$cursor[-1]); $cursor[-1] = 0; while (($hit = index($l[-1],$_[0])) < 0) { $buf .= $l[-1]; fillBuf(); } $buf .= substr($l[-1],0,$hit); $cursor[-1] = $hit + length($_[0]); curChar() || err("EOF in middle of scan for '$_[0]', started on line $origline"); return($buf); } sub restOfBuf { return(substr($l[-1],$cursor[-1])); } sub fillBuf { vwarn2("In fillBuf, entity stack depth " . scalar(@oeFilename)); while (scalar @oeFilename > 0) { local *CURFILE = $oeHandle[-1]; if (my $rec = ) { vwarn2("Read line: $rec"); $l[-1] = $rec; $oeLinenum[-1]++; $cursor[-1] = 0; return $l[-1]; } closeEntity(); } return(undef); # EOF } # Insert contents into current input buffer. # NOTE: This throws off offset-in-line (which we don't presently report anyway). sub openInternalEntity { vwarn("Opening internal entity '$_[0]'.\n"); my $rest = restOfBuf(); $l[-1] = $_[1] . $rest; $cursor[-1] = 0; } # Only SYSTEM identifiers are presently used, and there's no catalog support. # Could add an ENTITY_PATH env. variable or option. sub openExternalEntity { vwarn("Opening external entity '$_[0]', system id '$_[1]'.\n"); my $ename = $_[0]; my $sysid = $_[1]; if (!-f $sysid) { warn "External entity $ename not found at '$sysid'.\n"; (scalar(@oeLinenum)>0) && warn " Referenced at line $oeLinenum[-1] of entity $oeName[-1].\n"; return; } local *FILE; open(FILE, "<$sysid") || err("Failed to open file '$sysid' for entity '$ename'."); push @oeHandle,*FILE; push @oeName, $ename; push @oeFilename, $sysid; push @l, ""; push @cursor, -1; push @oeLinenum, 0; } sub closeEntity { vwarn("Closing entity '$oeName[-1]'.\n"); close $oeHandle[-1]; pop @oeName; pop @oeHandle; pop @oeFilename; pop @l; pop @cursor; pop @oeLinenum; } sub currentEntityLoc { return($oeName[-1] . " + " . $oeLinenum[-1]); } sub dumpEntityStack { for (my $i=scalar @oeFilename; $i<=0; $i--) { print "$i: Entity $oeName[$i], file '$oeFilename[$i]', line $oeLinenum[$i]\n"; } } } # END ############################################################################### sub vwarn { # Basic tracing, invoke with -v ($verbose) || return; print "Warning: $_[0]\n"; } sub vwarn2 { # Tons of detail, invoke with -v -v ($verbose > 1) || return; print "Warning: $_[0]\n"; } sub err { die "FATAL ERROR: $_[0]\n"; dumpEntityStack(); } ############################################################################### sub showUsage { warn " Usage: xmlparser.pl [options] xml-file Parses an XML file. Written in 2 hours after a talk whose speaker said that writing a 'basic XML parser' is hard. Options: -dir path Append a directory to be seached for external entities (not yet). -dtd Include DTD events (default, use -nodtd to suppress). -q Suppress most messages. -v Add more messages (repeatable)_. -version Show version info and exit (sjd $version, version 0.3). -ws Keep whitespace-only text nodes (normally dropped). Notes: Each SAX-like event is reported on a single line, with 10 columns for the event type, then a space, then the event information. Attributes are given as separate events immediately following their start tag's event. Empty element tags are reported as a start and end (as in SAX). Because each event is on a single line, newline characters in event information (say, TEXT events), are displayed as '\\n'. Known Bugs/Limitations (please report any other bugs to sderose\@acm.org): Doesn't handle entity references inside attribute values. Doesn't yet report full data from inside markup declarations. Quoted '>' inside markup dcls will confuse the parser. Reads a line at a time, so a big enough file with no breaks could blow out memory. Well-Formedness checks not done: The inside of the XML declaration is not checked at all. Does not report late/extra XML declaration(s). "; }