#!/usr/bin/perl -w
#
# osisCheck: Check structure of an OSIS file.
#
# 2008-08-26: Written by Steven J. DeRose, sderose@acm.org.
# 2008-09-12 sjd: Add osisID syntax/range checks, -mixed, continue debugging.
# 2008-09-14 sjd: Finish osisID checking, add knowledge of TR/Hebrew variants.
#
# To do:
#     osisRefs, incl. ranges
#     osisIDs with multiple ids in a single attribute.
#     Ending of Mark
#     Charsets of content per xml:lang
#     Locations of red-text, divineName, sigblocks, etc.
#         (better with schematron?)
#     Check work and scope declarations and references

use strict;
use Getopt::Long;

my $version = "2008-09-14";

# Option values (see -help at end)
my $break         = 0;
my $canon         = "NT";
my $catalog       = "";
my $echo          = 0;
my $elements      = 0;
my $hebrew        = 0;
my $help          = 0;
my $maxerrors     = 0;     # unlimited
my $newLineString = "\n";
my $quiet         = 0;
my $refs          = 0;
my $tr            = 1;
my $verbose       = 0;
my $vfragments    = 0;
my $vv            = 0;
my $xmllang       = "";


################################################################################
# Parse and validate the command line.
#
Getopt::Long::Configure ("ignore_case");
my $result = GetOptions(
    "break!"      => \$break,
    "canon=s"     => \$canon,
    "catalog=s"   => \$catalog,
    "echo!"       => \$echo,
    "elements!"   => \$elements,
    "hebrew!"     => \$hebrew,
    "h|help"      => \$help,
    "maxerrors=n" => \$maxerrors,
    "q!"          => \$quiet,
    "refs!"       => \$refs,
    "tr!"         => \$tr,
    "v+"          => \$verbose,
    "version"     => sub { showLicense(); exit; },
    "vfragments!" => \$vfragments,
    "xmllang=s"   => \$xmllang,
    );

if ($help) {
    showUsage();
    exit;
}

($result) || die "Bad options.\n";

# Accept the short names the script has always taken, plus the long names
# that the -help text documents (RomanCatholic, Protestant) and the correctly
# spelled RahlfsLXX.  Every name accepted here must be a key of %canonByName
# further down.
($canon =~ m/^(OT|NT|Apoc|RC|RomanCatholic|Prot|Protestant|RahlsLXX|RahlfsLXX|Vulgate|Ethiopian|Coptic|Armenian|Peshitta|Sinaiticus)$/)
    || die "Unknown canon '$canon'.\n";

if ($catalog ne "" && !-f $catalog) {
    die "Can't find specified catalog '$catalog'.\n";
}

(scalar @ARGV > 0) || die "No input file specified.\n";
my $file = shift;
(-f $file) || die "Can't find XML file at '$file'.\n";

$vv = ($verbose > 1);


###############################################################################
# Parser state shared by the event handlers.
#
my $startTime    = 0;
my $endTime      = 0;
my @tagStack;               # Current stack of element types
my %lastLine;               # Where we last saw element of each type
my @langStack;              # Stack of xml:lang values
my $lastEvent    = "";
my $pastDTD      = 0;
my $nodesChanged = 0;

# These hashes hold the OSIS book-name abbreviations and expansions.
my %bookNamesOT                = ();
my %bookNamesNT                = ();
my %bookNamesApoc              = ();
my %bookNamesRahlfsLXX         = ();
my %bookNamesVulgate           = ();
my %bookNamesEthiopianOrthodox = ();
my %bookNamesCopticOrthodox    = ();
my %bookNamesArmenianOrthodox  = ();
my %bookNamesPeshitta          = ();
my %bookNamesCodexSinaiticus   = ();
my %bookNamesAll               = ();   # Union of the above
setupBookAbbrevs();

# These arrays hold book-name abbreviations in the right order for each canon
my @canonOT                = ();
my @canonApoc              = ();
my @canonNT                = ();
my @canonRomanCatholic     = ();
my @canonProtestant        = ();
my @canonRahlfsLXX         = ();
my @canonVulgate           = ();
my @canonEthiopianOrthodox = ();
my @canonCopticOrthodox    = ();
my @canonArmenianOrthodox  = ();
my @canonPeshitta          = ();
my @canonCodexSinaiticus   = ();
setupCanons();

my %otcounts        = ();
my $totalOTVerses   = 23145;
my %apoccounts      = ();
my $totalApocVerses = -1;
my %ntcounts        = ();
my $totalNTVerses   = 7957;
setupVerseCounts();

my $bookCount    = 0;
my $chapterCount = 0;
my $verseCount   = 0;
my $errorCount   = 0;

# Map each accepted -canon name to its ordered book list.  Previously the
# option was validated and then ignored (the NT order was always used); the
# default "NT" still selects exactly the same list as before.
my %canonByName = (
    "OT"            => \@canonOT,
    "NT"            => \@canonNT,
    "Apoc"          => \@canonApoc,
    "RC"            => \@canonRomanCatholic,
    "RomanCatholic" => \@canonRomanCatholic,
    "Prot"          => \@canonProtestant,
    "Protestant"    => \@canonProtestant,
    "RahlsLXX"      => \@canonRahlfsLXX,
    "RahlfsLXX"     => \@canonRahlfsLXX,
    "Vulgate"       => \@canonVulgate,
    "Ethiopian"     => \@canonEthiopianOrthodox,
    "Coptic"        => \@canonCopticOrthodox,
    "Armenian"      => \@canonArmenianOrthodox,
    "Peshitta"      => \@canonPeshitta,
    "Sinaiticus"    => \@canonCodexSinaiticus,
    );
my @canonOfChoice = @{$canonByName{$canon}};

my $currentBookNum       = -1;
my $currentBookName      = "";
my @currentBookIdList    = ();
my $currentPlaceInIdList = 0;
my $currentOsisId        = "";
my $curline              = 0;   # global for error messages.

my %languagesSeen = ();
my %elementCounts = ();
my $totalErrors   = 0;

loadNextBookIdList();


###############################################################################
# Set up the parser and callbacks
use XML::Parser;
use XML::Catalog;

#my $parser = new XML::Parser(ErrorContext => 2);
# Direct method call instead of indirect-object syntax ("new XML::Parser").
my $parser = XML::Parser->new(ErrorContext => 0);
if ($catalog ne "") {
    my $catalogObject = XML::Catalog->new($catalog);
    $parser->setHandlers(ExternEnt => $catalogObject->get_handler($parser));
}

$parser->setHandlers(
    Init       => \&initHandler,
    Final      => \&finalHandler,
    Start      => \&startTagHandler,
    End        => \&endTagHandler,
    Char       => \&charHandler,
    Proc       => \&piHandler,
    Comment    => \&commentHandler,
    CdataStart => \&cdataStartHandler,
    CdataEnd   => \&cdataEndHandler,
    Doctype    => \&doctypeHandler,
    DoctypeFin => \&doctypeFinHandler,
    Default    => \&defaultHandler,
    Entity     => \&entityDclHandler,
    Element    => \&elementDclHandler
    );

# Process the XML (can only do one document per parser instance)
$parser->parsefile($file);
# $parser->parse("

Hello

"); if ($elements) { for my $e (sort keys %elementCounts) { print sprintf("%-20s %6d\n", $e, $elementCounts{$e}); } } warn "Done. $bookCount books, $chapterCount chapters, $verseCount verses.\n"; exit; ############################################################################## sub initHandler { my ($p, $name) = @_; ($vv) && warnEvent("Init",$_[1]); $startTime = time(); $lastEvent = "INIT"; xmlEcho("\n"); } sub finalHandler { my ($p, $name) = @_; ($vv) && warnEvent("Final",$_[1]); $endTime = time(); $lastEvent = "FINAL"; } sub startTagHandler { my ($p, $name) = @_; $curline = $p->current_line; ($verbose) && warnEvent("Start-tag",$name); push(@tagStack,$name); my $e = "?ent?"; # $p->current_entity; my $l = $p->current_line; $lastLine{$name} = "Entity '$e', line $l"; # for error reporting # Keep track of number of attrs, and a few specific attributes. my $numAttrs = 0; my %attlist = (); for (my $i=2; $i 0) { push @langStack, $langStack[-1]; } else { push @langStack, "EN"; } $lastEvent = "STARTTAG"; $elementCounts{$name}++; if (($name eq "div") && (defined $attlist{"class"}) && ($attlist{"class"} eq "book")) { $bookCount++; } elsif ($name eq "chapter") { $chapterCount++; } elsif ($name eq "verse") { $verseCount++; } # Reconstruct a start-tag to echo my $buf = ""; if ($break) { $buf .= $newLineString; } $buf .= "<$name"; for my $aname (sort keys %attlist) { my $avalue = $attlist{$aname}; $avalue =~ s/\"/&quo;/g; $buf .= " $aname=\"$avalue\""; } $buf .= ">"; xmlEcho($buf); # See if any osisID we get, is the expected one. If not, report then # diff-like search? # reset to where it says we are? # keep going? (extra/missing ID leads to all errors) # check off on global list, and work from there? # Shift name and canon sequence generation to separate process? 
# if (my $newOsisId=$attlist{"osisID"}) { my $theOsisId = $attlist{"osisID"}; if (checkOsisIdSyntax($theOsisId, $name)) { checkSequence($theOsisId); } } if ($maxerrors && $totalErrors > $maxerrors) { warnEvent("LIMIT of $maxerrors ERRORS REACHED. STOPPING."); die "Terminated.\n"; } } # sub startTagHandler sub endTagHandler { my ($p, $name) = @_; $curline = $p->current_line; ($verbose) && warnEvent("End-tag",$name); my $expectedTag = pop(@tagStack); if ($expectedTag ne $name) { my $l = $_[0]->current_line; print "Well-formedness error: Found end of '$name' at line $l when " . " expecting '$expectedTag'.\n."; print "Last '$expectedTag' was started at $lastLine{$expectedTag}.\n"; } pop @langStack; $lastEvent = "ENDTAG"; xmlEcho(""); } # endTagHandler sub charHandler { my ($p, $data) = @_; $curline = $p->current_line; ($verbose) && warnEvent("Text node",$_[1]); $lastEvent = "CHAR"; xmlEcho($_[1]); } # charHandler sub piHandler { my ($p, $data) = @_; $curline = $p->current_line; ($verbose) && warnEvent("Pi",$_[1]); $lastEvent = "PI"; $_[2] =~ s/\?>/?>/g; xmlEcho(""); } sub commentHandler { my ($p, $data) = @_; $curline = $p->current_line; ($verbose) && warnEvent("Comment",$_[1]); if ($pastDTD) { } else { } $lastEvent = "COMMENT"; xmlEcho(""); } sub cdataStartHandler { my ($p, $data) = @_; $curline = $p->current_line; ($verbose) && warnEvent("CDATA start",$data); $lastEvent = "CDATASTART"; } sub cdataEndHandler { my ($p, $data) = @_; $curline = $p->current_line; ($verbose) && warnEvent("CDATA end",$data); $lastEvent = "CDATAEND"; } sub doctypeHandler { my ($p, $docel, $docsys, $docpub) = @_; $curline = $p->current_line; if (!defined $docsys) { $docsys = ""; } if (!defined $docpub) { $docpub = ""; } xmlEcho("\n"); } else { xmlEcho(" SYSTEM \"$docsys\" []>\n"); } } sub doctypeFinHandler { my ($p, $data) = @_; $curline = $p->current_line; $pastDTD = 1; $lastEvent = "DOCTYPEFIN"; } sub defaultHandler { my ($p, $data) = @_; $curline = $p->current_line; ($vv) && scalar($_)>1 
&& warnEvent("Default",$_[1]);
    $lastEvent = "DEFAULT";
}


###############################################################################
###############################################################################
# Make hashes of all known book names, keyed by the normative OSIS abbreviation,
# and with longer book name(s) as value.
# See http://catholic-resources.org/Bible/OT-Statistics-NAB.htm
#
# Fills the file-level %bookNames* hashes and then rebuilds %bookNamesAll as
# their union.  Takes no arguments.
#
# Footnote digits that had been fused onto several names ("Esther3",
# "1 Esdras4", "...Prov.31)6") have been removed from the values.
#
sub setupBookAbbrevs {
    %bookNamesOT = (
        "Gen"   => "Genesis",
        "Exod"  => "Exodus",
        "Lev"   => "Leviticus",
        "Num"   => "Numbers",
        "Deut"  => "Deuteronomy",
        "Josh"  => "Joshua",
        "Judg"  => "Judges",
        "Ruth"  => "Ruth",
        "1Sam"  => "1 Samuel",
        "2Sam"  => "2 Samuel",
        "1Kgs"  => "1 Kings",
        "2Kgs"  => "2 Kings",
        "1Chr"  => "1 Chronicles",
        "2Chr"  => "2 Chronicles",
        "Ezra"  => "Ezra",
        "Neh"   => "Nehemiah",
        "Esth"  => "Esther",
        "Job"   => "Job",
        "Ps"    => "Psalms",
        "Prov"  => "Proverbs",
        "Eccl"  => "Ecclesiastes | Qohelet",
        "Song"  => "Song of Solomon | Canticle of Canticles",
        "Isa"   => "Isaiah",
        "Jer"   => "Jeremiah",
        "Lam"   => "Lamentations",
        "Ezek"  => "Ezekiel",
        "Dan"   => "Daniel",
        "Hos"   => "Hosea",
        "Joel"  => "Joel",
        "Amos"  => "Amos",
        "Obad"  => "Obadiah",
        "Jonah" => "Jonah",
        "Mic"   => "Micah",
        "Nah"   => "Nahum",
        "Hab"   => "Habakkuk",
        "Zeph"  => "Zephaniah",
        "Hag"   => "Haggai",
        "Zech"  => "Zechariah",
        "Mal"   => "Malachi"
        );

    %bookNamesNT = (
        "Matt"   => "Matthew",
        "Mark"   => "Mark",
        "Luke"   => "Luke",
        "John"   => "John",
        "Acts"   => "Acts",
        "Rom"    => "Romans",
        "1Cor"   => "1 Corinthians",
        "2Cor"   => "2 Corinthians",
        "Gal"    => "Galatians",
        "Eph"    => "Ephesians",
        "Phil"   => "Philippians",
        "Col"    => "Colossians",
        "1Thess" => "1 Thessalonians",
        "2Thess" => "2 Thessalonians",
        "1Tim"   => "1 Timothy",
        "2Tim"   => "2 Timothy",
        "Titus"  => "Titus",
        "Phlm"   => "Philemon",
        "Heb"    => "Hebrews",
        "Jas"    => "James",
        "1Pet"   => "1 Peter",
        "2Pet"   => "2 Peter",
        "1John"  => "1 John",
        "2John"  => "2 John",
        "3John"  => "3 John",
        "Jude"   => "Jude",
        "Rev"    => "Revelation"
        );

    %bookNamesApoc = (
        "Tob"     => "Tobit",
        "Jdt"     => "Judith",
        "AddEsth" => "Additions to Esther",
        "Wis"     => "Wisdom | Wisdom of Solomon",
        "Sir"     => "Sirach | Ecclesiasticus",
        "Bar"     => "Baruch",
        "EpJer"   => "Letter of Jeremiah (= Bar.6)",
        "PrAzar"  => "Prayer of Azariah | Song of the Three Children",
        "Sus"     => "Susanna",
        "Bel"     => "Bel and the Dragon",
        "1Macc"   => "1 Maccabees",
        "2Macc"   => "2 Maccabees",
        "3Macc"   => "3 Maccabees",
        "4Macc"   => "4 Maccabees",
        "PrMan"   => "Prayer of Manasseh (= Odes.14)",
        "1Esd"    => "1 Esdras",
        "2Esd"    => "2 Esdras | 5 Ezra (= Bible.NRSVA:2Esd.1-2Esd.2)",
        "Ps151"   => "Psalm 151"
        );

    %bookNamesRahlfsLXX = (
        "Odes"   => "Odes",
        "PssSol" => "Psalms of Solomon"
        );

    %bookNamesVulgate = (  # & other later Latin mss* (4)
        "EpLao" => "Epistle to the Laodiceans",
        "3Esd"  => "3 Esdras",
        "4Esd"  => "4 Esdras | 4 Ezra (= Bible.NRSVA:2Esd.3-2Esd.14)",
        "5Esd"  => "5 Esdras | 6 Ezra (= Bible.NRSVA:2Esd.15-2Esd.16)"
        );

    %bookNamesEthiopianOrthodox = (  # Canon/Ge'ez Translation Additions (6)
        "1En"      => "1 Enoch | Ethiopic (Apocalypse of) Enoch",
        "Jub"      => "Jubilees",
        "4Bar"     => "4 Baruch | Paraleipomena Jeremiou",
        "AscenIsa" => "Ascension/Vision of Isaiah (=MartAscenIsa.6 - .11)",
        "Teg"      => "Tegsas (= Prov.25-Prov.31)",
        "PsJos"    => "Pseudo-Josephus | Jossipon; Josephu ben Gorion Medieval History of the Jews"
        );

    %bookNamesCopticOrthodox = (
        "AposCon" => "Apostolic Constitutions and Canons",
        "1Clem"   => "1 Clement",
        "2Clem"   => "2 Clement"
        );

    %bookNamesArmenianOrthodox = (
        "3Cor"          => "3 Corinthians",
        "EpCorPaul"     => "Epistle of the Corinthians to Paul and His Response",
        "JosAsen"       => "Joseph and Asenath",
        "T12Patr"       => "Testaments of the Twelve Patriarchs (12 parts)",
        "T12Patr.TAsh"  => "Testaments of Asher",
        "T12Patr.TBenj" => "Testaments of Benjamin",
        "T12Patr.TDan"  => "Testaments of Dan",
        "T12Patr.TGad"  => "Testaments of Gad",
        "T12Patr.TIss"  => "Testaments of Issachar",
        "T12Patr.TJos"  => "Testaments of Joseph",
        "T12Patr.TJud"  => "Testaments of Judah",
        "T12Patr.TLevi" => "Testaments of Levi",
        "T12Patr.TNaph" => "Testaments of Naphtali",
        "T12Patr.TReu"  => "Testaments of Reuben",
        "T12Patr.TSim"  => "Testaments of Simeon",
        "T12Patr.TZeb"  => "Testaments of Zebulun"
        );

    %bookNamesPeshitta = (
        "2Bar"  => "2 Baruch | (Syriac) Apocalypse of Baruch",
        "EpBar" => "Letter of Baruch (= 2Bar.78-2Bar.86)"
        );

    %bookNamesCodexSinaiticus = (
        "Barn"      => "Barnabas",
        "Herm"      => "Shepherd of Hermas (= Mandates, Similitudes, Visions)",
        "Herm.Mand" => "Shepherd of Hermas, Mandates",
        "Herm.Sim"  => "Shepherd of Hermas, Similitudes",
        "Herm.Vis"  => "Shepherd of Hermas, Visions"
        );

    %bookNamesAll = ();
    addtoUnionBooksList(\%bookNamesOT);
    addtoUnionBooksList(\%bookNamesNT);
    addtoUnionBooksList(\%bookNamesApoc);
    addtoUnionBooksList(\%bookNamesRahlfsLXX);
    addtoUnionBooksList(\%bookNamesVulgate);
    addtoUnionBooksList(\%bookNamesEthiopianOrthodox);
    addtoUnionBooksList(\%bookNamesCopticOrthodox);
    addtoUnionBooksList(\%bookNamesArmenianOrthodox);
    addtoUnionBooksList(\%bookNamesPeshitta);
    addtoUnionBooksList(\%bookNamesCodexSinaiticus);
} # setupBookAbbrevs

# Copy every key/value pair of the hash referenced by the first argument into
# %bookNamesAll, overwriting any entry that has the same key.
#
sub addtoUnionBooksList {
    my ($toAdd) = @_;
    for my $abbrev (keys %$toAdd) {
        $bookNamesAll{$abbrev} = $toAdd->{$abbrev};
    }
} # addtoUnionBooksList

# Guess the normative OSIS abbreviation for a (New Testament) book name or
# abbreviation that was not recognized.
#
# Argument: the unrecognized book token.
# Returns:  the suggested OSIS abbreviation, or "" when there is no guess
#           (including when the argument is undefined).
#
# Each rule is [ prefix pattern, exact alias or undef, OSIS abbreviation ];
# the first rule that matches wins, so more specific prefixes must precede
# more general ones (Phile before Phil, 1Th before 1T).
#
# Fixes relative to the earlier version:
#   * the alias tests looked at $1, which none of these patterns set, so
#     aliases such as "Mk" or "Jn" could never match; they now test the
#     argument itself;
#   * "Philemon" was caught by the Philippians prefix before the Philemon
#     rule was reached.
#
sub correctBookAbbrev {
    my $abbrev = $_[0];
    (defined $abbrev) || return("");

    my @rules = (
        [ qr/^Mat/,   undef,  "Matt"   ],
        [ qr/^Mark/,  "Mk",   "Mark"   ],
        [ qr/^Luk/,   "Lk",   "Luke"   ],
        [ qr/^John/,  "Jn",   "John"   ],
        [ qr/^Ac/,    undef,  "Acts"   ],
        [ qr/^Rom/,   "Rm",   "Rom"    ],
        [ qr/^1C/,    undef,  "1Cor"   ],
        [ qr/^2C/,    undef,  "2Cor"   ],
        [ qr/^Gal/,   undef,  "Gal"    ],
        [ qr/^Eph/,   undef,  "Eph"    ],
        [ qr/^Phile/, undef,  "Phlm"   ],   # must precede the Phil rule
        [ qr/^Phil/,  "Phlp", "Phil"   ],
        [ qr/^Col/,   undef,  "Col"    ],
        [ qr/^1Th/,   undef,  "1Thess" ],   # must precede the 1T rule
        [ qr/^2Th/,   undef,  "2Thess" ],   # must precede the 2T rule
        [ qr/^1T/,    undef,  "1Tim"   ],
        [ qr/^2T/,    undef,  "2Tim"   ],
        [ qr/^Tit/,   "Tt",   "Titus"  ],
        [ qr/^Heb/,   "Hb",   "Heb"    ],
        [ qr/^Ja/,    "Jm",   "Jas"    ],
        [ qr/^1P/,    undef,  "1Pet"   ],
        [ qr/^2P/,    undef,  "2Pet"   ],
        [ qr/^1J/,    undef,  "1John"  ],
        [ qr/^2J/,    undef,  "2John"  ],
        [ qr/^3J/,    undef,  "3John"  ],
        [ qr/^Jude/,  "Jd",   "Jude"   ],
        [ qr/^Rev/,   undef,  "Rev"    ],
        [ qr/^Apoc/,  undef,  "Rev"    ],   # "Apocalypse"
        );

    for my $rule (@rules) {
        my ($prefix, $alias, $osisName) = @$rule;
        if ($abbrev =~ $prefix || (defined $alias && $abbrev eq $alias)) {
            return($osisName);
        }
    }
    return("");
}
#
###############################################################################
# The order of books in various canons; the names used must be valid keys to
# the %bookNamesAll hash, as set up by setupBookAbbrevs().
#
sub setupCanons {
    @canonOT = (
        "Gen", "Exod", "Lev", "Num", "Deut", "Josh", "Judg", "Ruth", "1Sam", "2Sam", "1Kgs", "2Kgs", "1Chr", "2Chr", "Ezra", "Neh", "Esth", "Job", "Ps", "Prov", "Eccl", "Song", "Isa", "Jer", "Lam", "Ezek", "Dan", "Hos", "Joel", "Amos", "Obad", "Jonah", "Mic", "Nah", "Hab", "Zeph", "Hag", "Zech", "Mal");
    checkBookNames(@canonOT);
    @canonApoc = (
        "Tob", "Jdt", "AddEsth", "Wis", "Sir", "Bar", "EpJer", "PrAzar", "Sus", "Bel", "1Macc", "2Macc", "3Macc", "4Macc", "PrMan", "1Esd", "2Esd", "Ps151");
    checkBookNames(@canonApoc);
    @canonNT = (
        "Matt", "Mark", "Luke", "John", "Acts", "Rom", "1Cor", "2Cor", "Gal", "Eph", "Phil", "Col", "1Thess", "2Thess", "1Tim", "2Tim", "Titus", "Phlm", "Heb", "Jas", "1Pet", "2Pet", "1John", "2John", "3John", "Jude", "Rev");
    checkBookNames(@canonNT);
    @canonRomanCatholic = ();
    push @canonRomanCatholic, @canonOT;
    push @canonRomanCatholic, @canonApoc;
    push @canonRomanCatholic, @canonNT;
    @canonProtestant = ();
    push @canonProtestant, @canonOT;
    push @canonProtestant, @canonNT;
    @canonRahlfsLXX = (
        "Odes", "PssSol");
    checkBookNames(@canonRahlfsLXX);
    @canonVulgate = (
        "EpLao", "3Esd", "4Esd", "5Esd");
    checkBookNames(@canonVulgate);
    @canonEthiopianOrthodox = (  # Canon/Ge'ez Translation Additions5 (6)
        "1En", "Jub", "4Bar", "AscenIsa", "Teg", "PsJos");
    checkBookNames(@canonEthiopianOrthodox);
    @canonCopticOrthodox = (
        "AposCon", "1Clem", "2Clem");
checkBookNames(@canonCopticOrthodox);
    @canonArmenianOrthodox = (
        "3Cor", "EpCorPaul", "JosAsen", "T12Patr", "T12Patr.TAsh", "T12Patr.TBenj", "T12Patr.TDan", "T12Patr.TGad", "T12Patr.TIss", "T12Patr.TJos", "T12Patr.TJud", "T12Patr.TLevi", "T12Patr.TNaph","T12Patr.TReu", "T12Patr.TSim", "T12Patr.TZeb");
    checkBookNames(@canonArmenianOrthodox);
    @canonPeshitta = (
        "2Bar", "EpBar");
    checkBookNames(@canonPeshitta);
    @canonCodexSinaiticus = (
        "Barn", "Herm", "Herm.Mand", "Herm.Sim", "Herm.Vis");
    checkBookNames(@canonCodexSinaiticus);
} # setupCanons


###############################################################################
# A hash keyed by book abbreviation. Each entry's value is an array.
# In each array, [0] is number of chapters, [1] is number of verses total,
# then [2..n] are verses in each chapter.
#
# Fills the file-level %otcounts, %apoccounts and %ntcounts tables.  After
# each table is filled, its keys are passed to checkBookNames() and the table
# to checkVerseCountHash(); under -v the per-book totals are also summed and
# compared with the expected grand total.  Takes no arguments.
#
sub setupVerseCounts {
    # NOTE(review): this table also holds Tob, Jdt, 1Macc, 2Macc, Wis, Sir
    # and Bar, which are not in @canonOT -- presumably why the sum need not
    # equal $totalOTVerses; confirm which total is intended.
    %otcounts = (
        "Gen" => [ 50, 1533,
            31, 25, 24, 26, 32, 22, 24, 22, 29, 32, 32, 20, 18, 24, 21, 16, 27, 33, 38, 18, 34, 24, 20, 67, 34, 35, 46, 22, 35, 43, 54, 33, 20, 31, 29, 43, 36, 30, 23, 23, 57, 38, 34, 34, 28, 34, 31, 22, 33, 26 ],
        "Exod" => [ 40, 1213,
            22, 25, 22, 31, 23, 30, 29, 28, 35, 29, 10, 51, 22, 31, 27, 36, 16, 27, 25, 26, 37, 30, 33, 18, 40, 37, 21, 43, 46, 38, 18, 35, 23, 35, 35, 38, 29, 31, 43, 38 ],
        "Lev" => [ 27, 859,
            17, 16, 17, 35, 26, 23, 38, 36, 24, 20, 47, 8, 59, 57, 33, 34, 16, 30, 37, 27, 24, 33, 44, 23, 55, 46, 34 ],
        "Num" => [ 36, 1289,
            54, 34, 51, 49, 31, 27, 89, 26, 23, 36, 35, 16, 33, 45, 41, 35, 28, 32, 22, 29, 35, 41, 30, 25, 19,65, 23, 31, 39, 17, 54, 42, 56, 29, 34, 13 ],
        "Deut" => [ 34, 959,
            46, 37, 29, 49, 33, 25, 26, 20, 29, 22, 32, 31, 19, 29, 23, 22, 20, 22, 21, 20, 23, 29, 26, 22, 19, 19, 26, 69, 28, 20, 30, 52, 29, 12 ],
        "Josh" => [ 24, 658,
            18, 24, 17, 24, 15, 27, 26, 35, 27, 43, 23, 24, 33, 15, 63, 10, 18, 28, 51, 9, 45, 34, 16, 33 ],
        "Judg" => [ 21, 618,
            36, 23, 31, 24, 31, 40, 25, 35, 57, 18, 40, 15, 25, 20, 20, 31, 13, 31, 30, 48, 25 ],
        "Ruth" => [ 4, 85,
            22, 23, 18, 22 ],
        "1Sam" => [ 31, 810,
            28, 36, 21, 22, 12, 21, 17, 22, 27, 27, 15, 25, 23, 52, 35, 23, 58, 30, 24, 42, 16, 23, 28, 23, 43, 25, 12, 25, 11, 31, 13 ],
        "2Sam" => [ 24, 695,
            27, 32, 39, 12, 25, 23, 29, 18, 13, 19, 27, 31, 39, 33, 37, 23, 29, 32, 44, 26, 22, 51, 39, 25 ],
        "1Kgs" => [ 22, 817,
            53, 46, 28, 20, 32, 38, 51, 66, 28, 29, 43, 33, 34, 31, 34, 34, 24, 46, 21, 43, 29, 54 ],
        "2Kgs" => [ 25, 719,
            18, 25, 27, 44, 27, 33, 20, 29, 37, 36, 20, 22, 25, 29, 38, 20, 41, 37, 37, 21, 26, 20, 37, 20, 30 ],
        "1Chr" => [ 29, 943,
            54, 55, 24, 43, 41, 66, 40, 40, 44, 14, 47, 41, 14, 17, 29, 43, 27, 17, 19, 8, 30, 19, 32, 31, 31, 32, 34, 21, 30 ],
        "2Chr" => [ 36, 821,
            18, 17, 17, 22, 14, 42, 22, 18, 31, 19, 23, 16, 23, 14, 19, 14, 19, 34, 11, 37, 20, 12, 21, 27, 28, 23, 9, 27, 36, 27, 21, 33, 25, 33, 26, 23 ],
        "Ezra" => [ 10, 280,
            11, 70, 13, 24, 17, 22, 28, 36, 15, 44 ],
        "Neh" => [ 13, 405,
            11, 20, 38, 17, 19, 19, 72, 18, 37, 40, 36, 47, 31 ],
        "Tob" => [ 14, 245,
            22, 14, 17, 21, 22, 18, 17, 21, 6, 14, 18, 22, 18, 15 ],
        "Jdt" => [ 16, 340,
            16, 28, 10, 15, 24, 21, 32, 36, 14, 23, 23, 20, 20, 19, 14, 25 ],
        "Esth" => [ 16, 272,
            22, 23, 15, 17, 14, 14, 10, 17, 32, 3, 17, 8, 30, 16, 24, 10 ],
        "1Macc" => [ 16, 922,
            63, 70, 59, 61, 68, 63, 50, 32, 73, 89, 74, 53, 53, 49, 41, 24 ],
        "2Macc" => [ 15, 556,
            36, 32, 40, 50, 27, 31, 42, 36, 29, 38, 38, 46, 26, 46, 39 ],
        "Job" => [ 42, 1068,
            22, 13, 26, 21, 27, 30, 21, 22, 35, 22, 20, 25, 28, 22, 35, 22, 16, 21, 29, 29, 34, 30, 17, 25, 6, 14, 21, 28, 25, 31, 40, 22, 33, 37, 16, 33, 24, 41, 30, 32, 26, 17 ],
        # A Catholic list at catholic-resources.org says 2526.
        "Ps" => [ 150, 2461,
            6, 12, 8, 8, 12, 10, 17, 9, 20, 18, 7, 8, 6, 7, 5, 11, 15, 50, 14, 9, 13, 31, 6, 10, 22, 12, 14, 9, 11, 12, 24, 11, 22, 22, 28, 12, 40, 22, 13, 17, 13, 11, 5, 26, 17, 11, 9, 14, 20, 23, 19, 9, 6, 7, 23, 13, 11, 11, 17, 12, 8, 12, 11, 10, 13, 20, 7, 35, 36, 5, 24, 20, 28, 23, 10, 12, 20, 72, 13, 19, 16, 8, 18, 12, 13, 17, 7, 18, 52, 17, 16, 15, 5, 23, 11, 13, 12, 9, 9, 5, 8, 28, 22, 35, 45, 48, 43, 13, 31, 7, 10, 10, 9, 8, 18, 19, 2, 29, 176, 7, 8, 9, 4, 8, 5, 6, 5, 6, 8, 8, 3, 18, 3, 3, 21, 26, 9, 8, 24, 13, 10, 7, 12, 15, 21, 10, 20, 14, 9, 6 ],
        "Prov" => [ 31, 915,
            33, 22, 35, 27, 23, 35, 27, 36, 18, 32, 31, 28, 25, 35, 33, 33, 28, 24, 29, 30, 31, 29, 35, 34, 28, 28, 27, 28, 27, 33, 31 ],
        "Eccl" => [ 12, 222,
            18, 26, 22, 17, 19, 12, 29, 17, 18, 20, 10, 14 ],
        "Song" => [ 8, 117,
            17, 17, 11, 16, 16, 12, 14, 14 ],
        "Wis" => [ 19, 436,
            16, 24, 19, 20, 23, 25, 30, 21, 18, 21, 26, 27, 19, 31, 19, 29, 21, 25, 22 ],
        "Sir" => [ 51, 1372,
            29, 18, 30, 31, 17, 37, 36, 19, 18, 30, 34, 18, 25, 27, 20, 28, 27, 33, 26, 30, 28, 27, 27, 31, 25, 20, 30, 26, 28, 25, 31, 24, 33, 26, 24, 27, 30, 34, 35, 30, 24, 25, 35, 23, 26, 20, 25, 25, 16, 29, 30 ],
        "Isa" => [ 66, 1291,
            31, 22, 26, 6, 30, 13, 25, 23, 20, 34, 16, 6, 22, 32, 9, 14, 14, 7, 25, 6, 17, 25, 18, 23, 12, 21, 13, 29, 24, 33, 9, 20, 24, 17, 10, 22, 38, 22, 8, 31, 29, 25, 28, 28, 25, 13, 15, 22, 26, 11, 23, 15, 12, 17, 13, 12, 21, 14, 21, 22, 11, 12, 19, 11, 25, 24 ],
        "Jer" => [ 52, 1364,
            19, 37, 25, 31, 31, 30, 34, 23, 25, 25, 23, 17, 27, 22, 21, 21, 27, 23, 15, 18, 14, 30, 40, 10, 38, 24, 22, 17, 32, 24, 40, 44, 26, 22, 19, 32, 21, 28, 18, 16, 18, 22, 13, 30, 5, 28, 7, 47, 39, 46, 64, 34 ],
        "Lam" => [ 5, 154,
            22, 22, 66, 22, 22 ],
        "Bar" => [ 6, 213,
            22, 35, 38, 37, 9, 72 ],
        "Ezek" => [ 48, 1273, # corrected total from 1271
            28, 10, 27, 17, 17, 14, 27, 18, 11, 22, 25, 28, 23, 23, 8, 63, 24, 32, 14, 44, 37, 31, 49, 27, 17, 21, 36, 26, 21, 26, # 49 32 in NEB
            18, 32, 33, 31, 15, 38, 28, 23, 29, 49, 26, 20, 27, 31, 25, 24, 23, 35 ],
        "Dan" => [ 14, 530,
            21, 49, 100,34, 30, 29, 28, 27, 27, 21, 45, 13, 64, 42 ],
        "Hos" => [ 14, 197,
            9, 25, 5, 19, 15, 11, 16, 14, 17, 15, 11, 15, 15, 10 ],
        "Joel" => [ 4, 73,
            20, 27, 5, 21 ],
        "Amos" => [ 9, 146,
            15, 16, 15, 13, 27, 14, 17, 14, 15 ],
        "Obad" => [ 1, 21,
            21 ],
        "Jonah" => [ 4, 48,
            16, 11, 10, 11 ],
        "Mic" => [ 7, 105,
            16, 13, 12, 14, 14, 16, 20 ],
        "Nah" => [ 3, 47,
            14, 14, 19 ],
        "Hab" => [ 3, 56,
            17, 20, 19 ],
        "Zeph" => [ 3, 53,
            18, 15, 20 ],
        "Hag" => [ 2, 38,
            15, 23 ],
        "Zech" => [ 14, 211,
            17, 17, 10, 14, 11, 15, 14, 23, 17, 12, 17, 14, 9, 21 ],
        "Mal" => [ 3, 55,
            14, 17, 24 ]
        );
    my $tot = 0;
    for my $book (keys %otcounts) {
        $tot += $otcounts{$book}[1];
    }
    # Numeric comparison: both sides are verse counts, not strings.
    if ($verbose && $tot != $totalOTVerses) {
        warn "Books added up to $tot for OT, not $totalOTVerses.\n";
    }
    checkBookNames(keys %otcounts);
    checkVerseCountHash(\%otcounts);

    # -------------------------------------------------------------------------
    # figures from http://catholic-resources.org/Bible/OT-Statistics-NAB.htm
    #
    # "1Esd" and "2Esd" used to appear twice in this list, first as
    # [ 9, 0, ] and [ 16, 0, ] and again as [ 0, 0, ].  In a hash the later
    # duplicate wins, so the table only ever held [ 0, 0, ]; the dead first
    # entries are dropped and their chapter counts (9 and 16) recorded here.
    %apoccounts = (
        "Tob" => [ 14, 245,
            22, 14, 17, 21, 22, 18, 17, 21, 6, 14, 18, 22, 18, 15],
        "Jdt" => [ 16, 340,
            16, 28, 10, 15, 24, 21, 32, 36, 14, 23, 23, 20, 20, 19, 14, 25],
        # Note: AddEsth starts at chapter 11?
        "AddEsth" => [0, 0, ],
        "Wis" => [ 19, 436,
            16, 24, 19, 20, 23, 25, 30, 21, 18, 21, 26, 27, 19, 31, 19, 29, 21, 25, 22],
        "Sir" => [ 51, 1372, # Also known as Ecclesiasticus
            29, 18, 30, 31, 17, 37, 36, 19, 18, 30, 34, 18, 25, 27, 20, 28, 27, 33, 26, 30, 28, 27, 27, 31, 25, 20, 30, 26, 28, 25, 31, 24, 33, 26, 24, 27, 30, 34, 35, 30, 24, 25, 35, 23, 26, 20, 25, 25, 16, 29, 30, ],
        "Bar" => [ 6, 213,
            22, 35, 38, 37, 9, 72],
        # Seems to start at chapter 6, and have 73 verses?
        "EpJer" => [ 0, 0, ],
        # Song of the Three (goes between Daniel 3.23 and 3.24)
        "PrAzar" => [ 0, 0, ],
        "Sus" => [ 1, 64, 64 ],
        "Bel" => [ 1, 42, 42 ],
        "1Macc" => [ 16, 922,
            63, 70, 59, 61, 68, 63, 50, 32, 73, 89, 74, 53, 53, 49, 41, 24],
        "2Macc" => [ 15, 556,
            36, 32, 40, 50, 27, 31, 42, 36, 29, 38, 38, 46, 26, 46, 39 ],
        # Catholic Apocrypha end here.
        "3Macc" => [ 0, 0, ],
        "4Macc" => [ 0, 0, ],
        "PrMan" => [ 0, 0, ],
        "1Esd" => [ 0, 0, ],
        "2Esd" => [ 0, 0, ],
        "Ps151" => [ 0, 0, ],
        );
    $tot = 0;
    for my $book (keys %apoccounts) {
        $tot += $apoccounts{$book}[1];
    }
    # A negative $totalApocVerses means no expected total is known, so there
    # is nothing to compare against (it used to warn on every -v run).
    if ($verbose && $totalApocVerses >= 0 && $tot != $totalApocVerses) {
        warn "Books added to $tot total verses for Apocrypha, not $totalApocVerses.\n";
    }
    checkBookNames(keys %apoccounts);
    checkVerseCountHash(\%apoccounts);

    # -------------------------------------------------------------------------
    %ntcounts = (
        "Matt" => [ 28, 1071,
            25, 23, 17, 25, 48, 34, 29, 34, 38, 42, 30, 50, 58, 36, 39, 28, 27, 35, 30, 34, 46, 46, 39, 51, 46, 75, 66, 20 ],
        "Mark" => [ 16, 678,
            45, 28, 35, 41, 43, 56, 37, 38, 50, 52, 33, 44, 37, 72, 47, 20 ],
        "Luke" => [ 24, 1151,
            80, 52, 38, 44, 39, 49, 50, 56, 62, 42, 54, 59, 35, 35, 32, 31, 37, 43, 48, 47, 38, 71, 56, 53 ],
        "John" => [ 21, 879,
            51, 25, 36, 54, 47, 71, 53, 59, 41, 42, 57, 50, 38, 31, 27, 33, 26, 40, 42, 31, 25 ],
        "Acts" => [ 28, 1006,
            26, 47, 26, 37, 42, 15, 60, 40, 43, 48, 30, 25, 52, 28, 41, 40, 34, 28, 40, 38, 40, 30, 35, 27, 27, 32, 44, 31 ],
        "Rom" => [ 16, 433,
            32, 29, 31, 25, 21, 23, 25, 39, 33, 21, 36, 21, 14, 23, 33, 27 ],
        "1Cor" => [ 16, 437,
            31, 16, 23, 21, 13, 20, 40, 13, 27, 33, 34, 31, 13, 40, 58, 24 ],
        # Added 1 verse to 2Cor 13.
        "2Cor" => [ 13, 257,
            24, 17, 18, 18, 21, 18, 16, 24, 15, 18, 33, 21, 14 ],
        "Gal" => [ 6, 149,
            24, 21, 29, 31, 26, 18 ],
        "Eph" => [ 6, 155,
            23, 22, 21, 32, 33, 24 ],
        "Phil" => [ 4, 104,
            30, 30, 21, 23 ],
        "Col" => [ 4, 95,
            29, 23, 25, 18 ],
        "1Thess" => [ 5, 89,
            10, 20, 13, 18, 28 ],
        "2Thess" => [ 3, 47,
            12, 17, 18 ],
        "1Tim" => [ 6, 113,
            20, 15, 16, 16, 25, 21 ],
        "2Tim" => [ 4, 83,
            18, 26, 17, 22 ],
        "Titus" => [ 3, 46,
            16, 15, 15 ],
        "Phlm" => [ 1, 25,
            25 ],
        "Heb" => [ 13, 303,
            14, 18, 19, 16, 14, 20, 28, 13, 28, 39, 40, 29, 25 ],
        "Jas" => [ 5, 108,
            27, 26, 18, 17, 20 ],
        "1Pet" => [ 5, 105,
            25, 25, 22, 19, 14 ],
        "2Pet" => [ 3, 61,
            21, 22, 18 ],
        "1John" => [ 5, 105,
            10, 29, 24, 21, 21 ],
        "2John" => [ 1, 13,
            13 ],
        "3John" => [ 1, 14, # Based on NEB; check others?
            14 ],
        "Jude" => [ 1, 25,
            25 ],
        # UBS 3 has Rev.12.18; TR and KJV lack it; others have as part of v. 17.
        "Rev" => [ 22, 404,
            20, 29, 22, 11, 14, 17, 17, 13, 21, 11, 19, 17, 18, 20, 8, 21, 18, 24, 21, 15, 27, 21 ],
        );
    $tot = 0;
    for my $book (keys %ntcounts) {
        $tot += $ntcounts{$book}[1];
    }
    if ($verbose && $tot != $totalNTVerses) {
        warn "Books added to $tot total verses for NT, not $totalNTVerses.\n";
    }
    checkBookNames(keys %ntcounts);
    checkVerseCountHash(\%ntcounts);
} # setupVerseCounts

# Add up the array of verseCounts per chapter, and see if it matches the
# explicit total (which is in [1]).
sub checkVerseCountHash {
    my %h = %{$_[0]};
    for my $b (keys %h) {
        my @nums = @{$h{$b}};
        my $nchapters = $nums[0];
        my $nverses = $nums[1];
        (scalar @nums == $nchapters+2) ||
            warn "ERROR: Chapter count $nchapters, but " . scalar(@nums) .
" chapters in verse-count array for book '$b'.\n"; my $vct = 0; for (my $i=2; $i3 && !$vfragments) { # vfragments for sub-verse ids warn "Too many tokens in osisId '$anId'.\n"; } elsif ($ntokens<3) { warn "Need at least 3 tokens in osisID '$anId' on a verse.\n"; } elsif ($tokens[0] !~ m/^[A-Z1-9][a-zA-Z0-9]+$/) { warnEvent("Book token '$tokens[0]' in '$anId' is not valid.\n"); } elsif ($tokens[1] !~ m/^[1-9][0-9]{0,2}$/) { warnEvent("Chapter token '$tokens[1]' in '$anId' is not valid.\n"); } elsif ($tokens[2] !~ m/^[1-9][0-9]{0,2}$/) { warnEvent("Verse token '$tokens[2]' in '$anId' is not valid.\n"); } else { $rc = 1; } } elsif ($elementType eq "chapter") { if ($ntokens != 2) { warn "Need exactly 2 tokens in osisID '$anId' on a chapter.\n"; } elsif ($tokens[0] !~ m/^[A-Z1-9][a-zA-Z0-9]+$/) { warnEvent("Book token '$tokens[0]' in '$anId' is not valid.\n"); } elsif ($tokens[1] !~ m/^[1-9][0-9]{0,2}$/) { warnEvent("Chapter token '$tokens[1]' in '$anId' is not valid.\n"); } else { $rc = 1; } } elsif ($elementType eq "div") { if ($ntokens != 1) { warn "Need exactly 1 token in osisID '$anId' on a div.\n"; } elsif ($tokens[0] !~ m/^[A-Z1-9][a-zA-Z0-9]+$/) { warnEvent("Book token '$tokens[0]' in '$anId' is not valid.\n"); } else { $rc = 1; } } else { warn "Unexpected osisID attribute '$anId' on '$elementType' element.\n"; if ($anId !~ m/^[A-Z1-9][a-zA-Z0-9]+\.[1-9][0-9]{0,2}\.[1-9][0-9]{0,2}/) { warn "Bad syntax in osisID '$anId'.\n"; } } ($rc == 0) && return(0); # FAIL # Values for context: book for canon, chapter for book, verse for chapter. my @chapterInfo = (); if (defined $otcounts{$tokens[0]}) { @chapterInfo = @{$otcounts{$tokens[0]}}; } elsif (defined $ntcounts{$tokens[0]}) { @chapterInfo = @{$ntcounts{$tokens[0]}}; } else { warnEvent("Can't find book '$tokens[0]' in chapter-counts lists " . 
"(only OT and NT have counts so far).\n"); my $c = correctBookAbbrev($tokens[0]); ($c) && warn " Did you mean '$c'?\n"; return(0); } if (scalar @chapterInfo < 1) { warnEvent("Can't find book '$tokens[0]' for '$anId'.\n"); return(0); } if (scalar @tokens > 1) { my $chapternum = $tokens[1]; if ($chapternum > $chapterInfo[0]) { warnEvent("Chapter number '$tokens[1]' too high " . "(book '$tokens[0]' limit is $chapterInfo[0]).\n"); return(0); } if (scalar @tokens > 2) { if ($tokens[2] > $chapterInfo[$chapternum+1]) { warnEvent("Verse number '$tokens[2]' too high " . "(book '$tokens[0]' chapter '$chapternum' limit is " . $chapterInfo[$chapternum+1] . ").\n"); return(0); } } } return(1); } # checkOsisIdSyntax ############################################################################### # See if what we got, points to where we expect. If not, we should try harder to # recover (eventually). sub checkSequence { my $theOsisId = $_[0]; if ($theOsisId eq $currentBookIdList[$currentPlaceInIdList]) { if (++$currentPlaceInIdList >= scalar(@currentBookIdList)) { # end book loadNextBookIdList(); } return(1); } warn "osisID out of order. Expected '" . $currentBookIdList[$currentPlaceInIdList] . "', but found '$theOsisId'.\n"; my $foundAt = findOsisIdInBookIdList($theOsisId); if ($foundAt == -1) { warn " Can't find '$theOsisId' among osisIds for current book.\n"; } elsif ($foundAt < $currentPlaceInIdList) { my $d = $currentPlaceInIdList-$foundAt; warn " Found '$theOsisId' too early by $d. " . "Verse(s) out of order or duplicated?\n"; } else { warn " Found '$theOsisId' too late by " . ($foundAt - $currentPlaceInIdList) . ". 
Unit(s) missing?\n"; } $currentPlaceInIdList = $foundAt+1; return(0); } sub loadNextBookIdList { $currentBookNum++; if ($currentBookNum >= scalar @canonOfChoice) { return; } $currentBookName = $canonOfChoice[$currentBookNum]; my $ref = getIdListForBook($currentBookName); @currentBookIdList = @$ref; $currentPlaceInIdList = 0; } sub findOsisIdInBookIdList { my $osisId = $_[0]; for (my $i=0; $i "TR", "Mark.7.16" => "TR", "Mark.11.26" => "TR", "Mark.16.9" => "TR", "Mark.16.10" => "TR", "Mark.16.11" => "TR", "Mark.16.12" => "TR", "Mark.16.13" => "TR", "Mark.16.14" => "TR", "Mark.16.15" => "TR", "Mark.16.16" => "TR", "Mark.16.17" => "TR", "Mark.16.18" => "TR", "Mark.16.19" => "TR", "Mark.16.20" => "TR", "John.7.53" => "TR", "John.8.1" => "TR", "John.8.2" => "TR", "John.8.3" => "TR", "John.8.4" => "TR", "John.8.5" => "TR", "John.8.6" => "TR", "John.8.7" => "TR", "John.8.8" => "TR", "John.8.9" => "TR", "John.8.10" => "TR", "John.8.11" => "TR", "Acts.8.37" => "TR", "Acts.9.6" => "TR", "1John.5.7" => "TR", # The "Johannine comma" "1John.5.8" => "TR", # The "Johannine comma" "Rev.12.18" => "UBS3", # TR lacks, others put into v. 17 ); # Missing or just shorter? for (@TrOnly) { if ($_ == $_[0]) { return(1); } } return(0); } # isIdTrOnly ############################################################################### # List verses where the Tetragrammaton occurs sub setupTetraOccurrences { #%tetras = (); } sub setupSalutationOccurrences { #%salutations = (); } ############################################################################### # See if an xml:lang value is one we recognize. # From http://www.loc.gov/standards/iso639-2/php/code_list.php # See files lang2letter and lang3letter, etc. sub setupKnownLangs { return; } sub checkLang { if (length($_[0]) > 1 && length($_[0]) < 4) { return(1); } } # Set (global) min/max values for characters in a given range sub setRangeForLang { } # Is the given Unicode char ok for a given language? 
sub charOkForLang {
    # Stub: every character is currently accepted for every language.
    return(1);
}


###############################################################################
# Events always update $curline, so we don't have to pass pointer to parser all
# the way down.
#
# Report one problem on STDERR, prefixed with the current input line number
# ($curline), and add one to $totalErrors.
#
# Arguments:
#   $_[0]  the message; one trailing newline, if present, is removed.
#   $_[1]  optional detail, appended after ": ".
#
# The detail is appended whenever it is defined and non-empty.  It used to be
# tested for truth, which silently dropped a detail of "0" (for example a
# text node whose whole content is "0").
#
sub warnEvent {
    my $message = $_[0];
    chomp $message;
    my $detail = (defined $_[1] && $_[1] ne "") ? ": $_[1]" : "";
    warn "******* (line $curline): $message$detail\n";
    $totalErrors++;
}

# Print the given text on STDOUT, but only when -echo is in effect.
sub xmlEcho {
    ($echo) && print $_[0];
}


###############################################################################
# Write the help text to STDERR.
#
# The text is laid out one option per line.  The canon names listed are the
# ones the command-line check has always accepted (RC, Prot), and the two
# options that the option parser does not define (-apoc, -strongs) are listed
# separately as planned, so that the help no longer advertises switches that
# make the program stop with "Bad options."
#
sub showUsage {
    warn "
Usage: osisCheck [options] file

Checks an OSIS XML Bible for correct IDs, book/chapter/verse numbering, etc.
Does *not* do full XML validation.

Options:
    -break         With -echo, break before each tag.
    -canon name    Which canon to check (default = NT), from:
                   OT, NT, Apoc, RC, Prot, RahlsLXX, Vulgate, Ethiopian,
                   Coptic, Armenian, Peshitta, Sinaiticus.
    -catalog path  Use specified XML catalog?
    -echo          Copy out the XML as it is checked.
    -elements      Report how many instances of each element type occurred.
    -h, -help      Show this message.
    -hebrew        Use Hebrew numbering for Psalm headings (not finished yet),
                   instead of Greek numbering.
    -maxerrors n   Set to stop after finding n errors.
    -q             Suppress most messages.
    -tr            Expect more verses as in KJV/TR.
    -v             Add more messages, and check integrity frequently.
    -vfragments    Allow osisIDs to include sub-verse identifiers.
    -xmllang name  Allow this language (repeatable). If this option is never
                   specified, all languages are allowed. (not yet supported).

Planned options (not yet implemented, and not yet accepted):
    -apoc          Expect the OT Apocryphal books.
    -strongs       Validate that the right Strong's numbers are attached
                   to each verse.

Version/license information:
    Last modified $version.
    Use osisCheck -version for further information.

Known Bugs/Limitations (please report any other bugs to sderose\@acm.org):
    Canons other than Roman Catholic and Protestant are not finished yet.
    Discontiguous verses (repeated osisIDs) and combined verses
    (multiple osisIDs on a single verse) are not yet supported.
    Does not yet check whether various things are tagged at all
    (divineName, etc).
";
}

# Print the license and version notice on STDOUT.
sub showLicense {
    print "
This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see http://creativecommons.org/licenses/by-sa/3.0/.

The author's present email is sderose at acm.org.

This software was last updated on $version.
For the most recent version, see http://www.derose.net/steve/utilities/.
";
}