#!/usr/bin/perl -w
#
# osisCheck: Check structure of an OSIS file.
#
# 2008-08-26: Written by Steven J. DeRose, sderose@acm.org.
# 2008-09-12 sjd: Add osisID syntax/range checks, -mixed, continue debugging.
# 2008-09-14 sjd: Finish osisID checking, add knowledge of TR/Hebrew variants.
#
# To do:
# osisRefs, incl. ranges
# osisIDs with multiple ids in a single attribute.
# Ending of Mark
# Charsets of content per xml:lang
# Locations of red-text, divineName, sigblocks, etc.
# (better with schematron?)
# Check work and scope declarations and references
use strict;
use Getopt::Long;
# Date of the last revision; interpolated into the usage and license text.
my $version = "2008-09-14";
# Option values (see -help at end)
# Each variable below is the target of one GetOptions entry further down,
# except $newLineString (used with -break when echoing start-tags) and $vv
# (derived from $verbose once the command line has been parsed).
my $break = 0;
my $canon = "NT";
my $catalog = "";
my $echo = 0;
my $elements = 0;
my $hebrew = 0;
my $help = 0;
my $maxerrors = 0; # unlimited
my $newLineString = "\n";
my $quiet = 0;
my $refs = 0;
my $tr = 1;
my $verbose = 0;
my $vfragments = 0;
my $vv = 0;
my $xmllang = "";
################################################################################
# Parse the command line. Option names are matched case-insensitively.
# "name!" entries are negatable flags, "name=s"/"name=n" take a value, and
# "v+" counts how many times -v was given.
Getopt::Long::Configure ("ignore_case");
my $result = GetOptions(
"break!" => \$break,
"canon=s" => \$canon,
"catalog=s" => \$catalog,
"echo!" => \$echo,
"elements!" => \$elements,
"hebrew!" => \$hebrew,
"h|help" => \$help,
"maxerrors=n" => \$maxerrors,
"q!" => \$quiet,
"refs!" => \$refs,
"tr!" => \$tr,
"v+" => \$verbose,
# -version prints the license text and exits at once, before any of the
# validation below runs.
"version" => sub {
showLicense();
exit;
},
"vfragments!" => \$vfragments,
"xmllang=s" => \$xmllang,
);
# -help wins over a bad command line: usage is shown even if GetOptions failed.
if ($help) { showUsage(); exit; }
($result) || die "Bad options.\n";
# Only these exact canon names are accepted.
# NOTE(review): the usage text spells two of them "RomanCatholic" and
# "Protestant", which this pattern rejects -- confirm which is intended.
($canon =~
m/^(OT|NT|Apoc|RC|Prot|RahlsLXX|Vulgate|Ethiopian|Coptic|Armenian|Peshitta|Sinaiticus)$/)
|| die "Unknown canon '$canon'.\n";
# A catalog is optional, but if one is named it must be an existing plain file.
if ($catalog ne "" && !-f $catalog) {
die "Can't find specified catalog '$catalog'.\n";
}
# Exactly one input file is consumed; any further arguments are left in @ARGV.
(scalar @ARGV > 0) ||
die "No input file specified.\n";
my $file = shift;
(-f $file) ||
die "Can't find XML file at '$file'.\n";
# $vv ("very verbose") is true once -v has been given at least twice.
$vv = ($verbose > 1);
###############################################################################
# ---- State shared by the Expat callbacks -----------------------------------
my $startTime = 0; # set by initHandler
my $endTime = 0; # set by finalHandler
my @tagStack; # Current stack of element types
my %lastLine; # Where we last saw element of each type
my @langStack; # Stack of xml:lang values
my $lastEvent = ""; # name of the most recent callback
my $pastDTD = 0; # set to 1 by doctypeFinHandler
my $nodesChanged = 0;
# These hashes hold the OSIS book-name abbreviations and expansions.
# They must be declared before setupBookAbbrevs() runs, because that sub
# fills them in.
my %bookNamesOT = ();
my %bookNamesNT = ();
my %bookNamesApoc = ();
my %bookNamesRahlfsLXX = ();
my %bookNamesVulgate = ();
my %bookNamesEthiopianOrthodox = ();
my %bookNamesCopticOrthodox = ();
my %bookNamesArmenianOrthodox = ();
my %bookNamesPeshitta = ();
my %bookNamesCodexSinaiticus = ();
my %bookNamesAll = (); # Union of the above
setupBookAbbrevs();
# These arrays hold book-name abbreviations in the right order for each canon
# (filled in by setupCanons()).
my @canonOT = ();
my @canonApoc = ();
my @canonNT = ();
my @canonRomanCatholic = ();
my @canonProtestant = ();
my @canonRahlfsLXX = ();
my @canonVulgate = ();
my @canonEthiopianOrthodox = ();
my @canonCopticOrthodox = ();
my @canonArmenianOrthodox = ();
my @canonPeshitta = ();
my @canonCodexSinaiticus = ();
setupCanons();
# Per-book chapter/verse counts (filled in by setupVerseCounts()). A total of
# -1 means "not known".
my %otcounts = ();
my $totalOTVerses = 23145;
my %apoccounts = ();
my $totalApocVerses = -1;
my %ntcounts = ();
my $totalNTVerses = 7957;
setupVerseCounts();
# ---- Running totals and current position in the expected sequence ----------
my $bookCount = 0;
my $chapterCount = 0;
my $verseCount = 0;
my $errorCount = 0;
# Select the book sequence requested with -canon. Previously this was
# hard-wired to @canonNT, so the option was validated but silently ignored.
# The keys are exactly the names the start-up validation of $canon accepts;
# the NT fallback only guards against the two lists drifting apart.
my %canonByName = (
    "OT"         => \@canonOT,
    "NT"         => \@canonNT,
    "Apoc"       => \@canonApoc,
    "RC"         => \@canonRomanCatholic,
    "Prot"       => \@canonProtestant,
    "RahlsLXX"   => \@canonRahlfsLXX,
    "Vulgate"    => \@canonVulgate,
    "Ethiopian"  => \@canonEthiopianOrthodox,
    "Coptic"     => \@canonCopticOrthodox,
    "Armenian"   => \@canonArmenianOrthodox,
    "Peshitta"   => \@canonPeshitta,
    "Sinaiticus" => \@canonCodexSinaiticus,
);
my @canonOfChoice = @{ $canonByName{$canon} || \@canonNT };
my $currentBookNum = -1; # index into @canonOfChoice; -1 = not started
my $currentBookName = "";
my @currentBookIdList = (); # expected osisIDs for the current book
my $currentPlaceInIdList = 0; # index of the next expected osisID
my $currentOsisId = "";
my $curline = 0; # global for error messages.
my %languagesSeen = ();
my %elementCounts = (); # element type => number of start-tags seen
my $totalErrors = 0; # incremented by warnEvent()
# Prime the expected-ID list with the first book of the chosen canon.
loadNextBookIdList();
###############################################################################
# Set up the parser and callbacks
use XML::Parser;
use XML::Catalog;
# Build the Expat-based parser. Direct method-call syntax is used rather than
# the indirect-object form ("new XML::Parser"), which Perl can mis-parse.
#my $parser = XML::Parser->new(ErrorContext => 2);
my $parser = XML::Parser->new(ErrorContext => 0);
# With -catalog, external entities are resolved through the XML catalog.
if ($catalog ne "") {
    my $catalogObject = XML::Catalog->new($catalog);
    $parser->setHandlers(ExternEnt => $catalogObject->get_handler($parser));
}
$parser->setHandlers(
    Init       => \&initHandler,
    Final      => \&finalHandler,
    Start      => \&startTagHandler,
    End        => \&endTagHandler,
    Char       => \&charHandler,
    Proc       => \&piHandler,
    Comment    => \&commentHandler,
    CdataStart => \&cdataStartHandler,
    CdataEnd   => \&cdataEndHandler,
    Doctype    => \&doctypeHandler,
    DoctypeFin => \&doctypeFinHandler,
    Default    => \&defaultHandler,
);
# The DTD-declaration handlers are attached only if this file really defines
# them. Taking a reference to an undefined sub compiles fine but dies as soon
# as the parser meets an <!ENTITY ...> or <!ELEMENT ...> declaration.
if (defined &entityDclHandler) {
    $parser->setHandlers(Entity => \&entityDclHandler);
}
if (defined &elementDclHandler) {
    $parser->setHandlers(Element => \&elementDclHandler);
}
# Process the XML (can only do one document per parser instance)
$parser->parsefile($file);
# Alternative for debugging: parse an in-memory string instead of a file, e.g.
# $parser->parse("<foo>Hello</foo>");
# With -elements, list every element type seen (sorted by name) with its count.
if ($elements) {
    foreach my $elementName (sort keys %elementCounts) {
        my $line = sprintf("%-20s %6d\n", $elementName, $elementCounts{$elementName});
        print $line;
    }
}
# Summary goes to STDERR so that it never mixes with -echo output on STDOUT.
warn "Done. $bookCount books, $chapterCount chapters, $verseCount verses.\n";
# Stop here: everything below this point is subroutine definitions.
exit;
##############################################################################
# Expat "Init" callback, invoked once before parsing begins. Records the start
# time, notes the event, and echoes a newline (when -echo is on).
# NOTE(review): the echoed string may once have been an XML declaration that
# was lost from this file -- confirm against an upstream copy.
sub initHandler {
    if ($vv) {
        warnEvent("Init", $_[1]);
    }
    $startTime = time();
    $lastEvent = "INIT";
    xmlEcho("\n");
}
# Expat "Final" callback, invoked once after the document has been parsed.
# Records the end time and notes the event.
sub finalHandler {
    if ($vv) {
        warnEvent("Final", $_[1]);
    }
    $endTime = time();
    $lastEvent = "FINAL";
}
# Expat "Start" callback: called for every start-tag with the element name
# followed by alternating attribute names and values.
#
# Maintains the element and xml:lang stacks, counts elements, books, chapters
# and verses, echoes a reconstructed start-tag (with -echo), validates any
# osisID attribute, and aborts once the -maxerrors limit has been reached.
#
# NOTE(review): the attribute-gathering loop and the first branch of the
# xml:lang logic were destroyed in this copy of the file (text between a "<"
# and the next ">" was lost). They are reconstructed here from the surviving
# "elsif/else" tail and from how %attlist is used below -- confirm against an
# upstream copy.
sub startTagHandler {
    my ($p, $name, @attrPairs) = @_;
    $curline = $p->current_line;
    ($verbose) && warnEvent("Start-tag",$name);
    push(@tagStack,$name);
    my $e = "?ent?"; # $p->current_entity;
    $lastLine{$name} = "Entity '$e', line $curline"; # for error reporting

    # Expat passes attributes as a flat (name, value, name, value...) list.
    my %attlist = @attrPairs;

    # Each element pushes exactly one language: its own xml:lang if it has
    # one, else the inherited value, else "EN" at the top of the document.
    # endTagHandler pops it again.
    if (defined $attlist{"xml:lang"}) { push @langStack, $attlist{"xml:lang"}; }
    elsif (scalar(@langStack) > 0) { push @langStack, $langStack[-1]; }
    else { push @langStack, "EN"; }

    $lastEvent = "STARTTAG";
    $elementCounts{$name}++;
    if (($name eq "div") &&
        (defined $attlist{"class"}) &&
        ($attlist{"class"} eq "book")) { $bookCount++; }
    elsif ($name eq "chapter") { $chapterCount++; }
    elsif ($name eq "verse") { $verseCount++; }

    # Reconstruct a start-tag to echo. Attribute values arrive from Expat
    # already decoded, so the three characters that cannot appear literally
    # inside a double-quoted attribute value are re-escaped ("&" first, so the
    # escapes themselves are not escaped again).
    my $buf = "";
    if ($break) { $buf .= $newLineString; }
    $buf .= "<$name";
    for my $aname (sort keys %attlist) {
        my $avalue = $attlist{$aname};
        $avalue =~ s/&/&amp;/g;
        $avalue =~ s/</&lt;/g;
        $avalue =~ s/\"/&quot;/g;
        $buf .= " $aname=\"$avalue\"";
    }
    $buf .= ">";
    xmlEcho($buf);

    # See if any osisID we get, is the expected one. If not, report then
    # diff-like search?
    # reset to where it says we are?
    # keep going? (extra/missing ID leads to all errors)
    # check off on global list, and work from there?
    # Shift name and canon sequence generation to separate process?
    #
    my $theOsisId = $attlist{"osisID"};
    if (defined $theOsisId && $theOsisId ne "") {
        # Sequence is only checked for IDs that are syntactically valid.
        if (checkOsisIdSyntax($theOsisId, $name)) {
            checkSequence($theOsisId);
        }
    }

    # -maxerrors n means "stop after finding n errors", hence >= (not >).
    if ($maxerrors && $totalErrors >= $maxerrors) {
        warnEvent("LIMIT of $maxerrors ERRORS REACHED. STOPPING.");
        die "Terminated.\n";
    }
} # sub startTagHandler
# Expat "End" callback: called for every end-tag with the element name.
# Pops the element and xml:lang stacks, reports a mismatch between the closing
# element and the one on top of the stack, and echoes the end-tag (with -echo).
sub endTagHandler {
    my ($p, $name) = @_;
    $curline = $p->current_line;
    ($verbose) && warnEvent("End-tag",$name);
    my $expectedTag = pop(@tagStack);
    if (!defined $expectedTag) {
        # Nothing was open: avoid comparing against undef below.
        print "Well-formedness error: Found end of '$name' at line $curline "
            . "when no element was open.\n";
    }
    elsif ($expectedTag ne $name) {
        print "Well-formedness error: Found end of '$name' at line $curline "
            . "when expecting '$expectedTag'.\n";
        print "Last '$expectedTag' was started at $lastLine{$expectedTag}.\n";
    }
    pop @langStack; # undo the push made by startTagHandler
    $lastEvent = "ENDTAG";
    # The echoed tag needs its leading "</" to pair with the "<name ...>"
    # that startTagHandler echoes.
    xmlEcho("</$name>");
} # endTagHandler
# Expat "Char" callback: called with a run of character data. Traces it under
# -v and echoes it unchanged (with -echo).
sub charHandler {
    my ($expat, $text) = @_;
    $curline = $expat->current_line;
    if ($verbose) {
        warnEvent("Text node", $text);
    }
    $lastEvent = "CHAR";
    xmlEcho($text);
} # charHandler
# Expat "Proc" callback: called for a processing instruction with its target
# and its data. Traces the target under -v and echoes the PI (with -echo).
sub piHandler {
    my ($p, $target, $data) = @_;
    $curline = $p->current_line;
    ($verbose) && warnEvent("Pi",$target);
    $lastEvent = "PI";
    if (!defined $data) { $data = ""; }
    # The old in-place substitution on $_[2] replaced "?>" with itself (a
    # no-op that also wrote through the caller's alias), so it is dropped.
    # The echoed PI needs its opening "<?" to be a processing instruction.
    xmlEcho("<?$target $data?>");
}
# Expat "Comment" callback: called with the text of a comment (without its
# delimiters). Traces it under -v and echoes the comment (with -echo).
sub commentHandler {
    my ($p, $data) = @_;
    $curline = $p->current_line;
    ($verbose) && warnEvent("Comment",$data);
    # The former empty if/else on $pastDTD did nothing and has been removed;
    # comments inside and after the DTD are treated alike.
    $lastEvent = "COMMENT";
    # Echo the comment itself rather than an empty string, restoring the
    # delimiters that Expat strips.
    xmlEcho("<!--$data-->");
}
# Expat "CdataStart" callback: marks the beginning of a CDATA section. Only
# traces the event (under -v) and records it; nothing is echoed.
sub cdataStartHandler {
    my ($expat, $detail) = @_;
    $curline = $expat->current_line;
    if ($verbose) {
        warnEvent("CDATA start", $detail);
    }
    $lastEvent = "CDATASTART";
}
# Expat "CdataEnd" callback: marks the end of a CDATA section. Only traces the
# event (under -v) and records it; nothing is echoed.
sub cdataEndHandler {
    my ($expat, $detail) = @_;
    $curline = $expat->current_line;
    if ($verbose) {
        warnEvent("CDATA end", $detail);
    }
    $lastEvent = "CDATAEND";
}
# Expat "Doctype" callback: called at the DOCTYPE declaration with the document
# element name, the system identifier and the public identifier (either
# identifier may be undefined). Echoes an equivalent declaration with an empty
# internal subset (with -echo).
#
# NOTE(review): this sub was partly destroyed in this copy of the file (the
# "<!DOCTYPE ..." echo and the opening of the if/else were lost, leaving
# unbalanced braces). Only the SYSTEM branch survived verbatim; the PUBLIC
# branch and the bare form are reconstructed -- confirm against an upstream
# copy.
sub doctypeHandler {
    my ($p, $docel, $docsys, $docpub) = @_;
    $curline = $p->current_line;
    if (!defined $docsys) { $docsys = ""; }
    if (!defined $docpub) { $docpub = ""; }
    $lastEvent = "DOCTYPE"; # every other handler records its event
    if ($docpub ne "") {
        xmlEcho("<!DOCTYPE $docel PUBLIC \"$docpub\" \"$docsys\" []>\n");
    }
    elsif ($docsys ne "") {
        xmlEcho("<!DOCTYPE $docel SYSTEM \"$docsys\" []>\n");
    }
    else {
        # No external identifier at all: SYSTEM "" would be misleading.
        xmlEcho("<!DOCTYPE $docel []>\n");
    }
}
# Expat "DoctypeFin" callback: the DOCTYPE declaration is complete. Raises the
# $pastDTD flag and records the event.
sub doctypeFinHandler {
    my ($expat) = @_;
    $curline = $expat->current_line;
    $pastDTD = 1;
    $lastEvent = "DOCTYPEFIN";
}
# Expat "Default" callback: receives whatever no other handler claims. Traces
# the data when -v was given at least twice, and records the event.
sub defaultHandler {
    my ($p, $data) = @_;
    $curline = $p->current_line;
    # The argument count lives in @_, not in $_ -- the old test on scalar($_)
    # was never true, so this trace could never fire.
    ($vv) && scalar(@_)>1 && warnEvent("Default",$data);
    $lastEvent = "DEFAULT";
}
###############################################################################
###############################################################################
# Make hashes of all known book names, keyed by the normative OSIS abbreviation,
# and with longer book name(s) as value.
# See http://catholic-resources.org/Bible/OT-Statistics-NAB.htm
# Fill the per-collection %bookNames... hashes (normative OSIS abbreviation =>
# longer name, with alternative names separated by " | "), then merge them all
# into %bookNamesAll.
#
# Several names used to end in a stray digit ("Esther3", "1 Esdras4",
# "...Prov.31)6"). These look like footnote markers carried over from the
# table the list was copied from and have been removed. Only the display
# names changed; every key is as before.
sub setupBookAbbrevs {
    %bookNamesOT = (
        "Gen" => "Genesis",
        "Exod" => "Exodus",
        "Lev" => "Leviticus",
        "Num" => "Numbers",
        "Deut" => "Deuteronomy",
        "Josh" => "Joshua",
        "Judg" => "Judges",
        "Ruth" => "Ruth",
        "1Sam" => "1 Samuel",
        "2Sam" => "2 Samuel",
        "1Kgs" => "1 Kings",
        "2Kgs" => "2 Kings",
        "1Chr" => "1 Chronicles",
        "2Chr" => "2 Chronicles",
        "Ezra" => "Ezra",
        "Neh" => "Nehemiah",
        "Esth" => "Esther",
        "Job" => "Job",
        "Ps" => "Psalms",
        "Prov" => "Proverbs",
        "Eccl" => "Ecclesiastes | Qohelet",
        "Song" => "Song of Solomon | Canticle of Canticles",
        "Isa" => "Isaiah",
        "Jer" => "Jeremiah",
        "Lam" => "Lamentations",
        "Ezek" => "Ezekiel",
        "Dan" => "Daniel",
        "Hos" => "Hosea",
        "Joel" => "Joel",
        "Amos" => "Amos",
        "Obad" => "Obadiah",
        "Jonah" => "Jonah",
        "Mic" => "Micah",
        "Nah" => "Nahum",
        "Hab" => "Habakkuk",
        "Zeph" => "Zephaniah",
        "Hag" => "Haggai",
        "Zech" => "Zechariah",
        "Mal" => "Malachi"
    );
    %bookNamesNT = (
        "Matt" => "Matthew",
        "Mark" => "Mark",
        "Luke" => "Luke",
        "John" => "John",
        "Acts" => "Acts",
        "Rom" => "Romans",
        "1Cor" => "1 Corinthians",
        "2Cor" => "2 Corinthians",
        "Gal" => "Galatians",
        "Eph" => "Ephesians",
        "Phil" => "Philippians",
        "Col" => "Colossians",
        "1Thess" => "1 Thessalonians",
        "2Thess" => "2 Thessalonians",
        "1Tim" => "1 Timothy",
        "2Tim" => "2 Timothy",
        "Titus" => "Titus",
        "Phlm" => "Philemon",
        "Heb" => "Hebrews",
        "Jas" => "James",
        "1Pet" => "1 Peter",
        "2Pet" => "2 Peter",
        "1John" => "1 John",
        "2John" => "2 John",
        "3John" => "3 John",
        "Jude" => "Jude",
        "Rev" => "Revelation"
    );
    %bookNamesApoc = (
        "Tob" => "Tobit",
        "Jdt" => "Judith",
        "AddEsth" => "Additions to Esther",
        "Wis" => "Wisdom | Wisdom of Solomon",
        "Sir" => "Sirach | Ecclesiasticus",
        "Bar" => "Baruch",
        "EpJer" => "Letter of Jeremiah (= Bar.6)",
        "PrAzar" => "Prayer of Azariah | Song of the Three Children",
        "Sus" => "Susanna",
        "Bel" => "Bel and the Dragon",
        "1Macc" => "1 Maccabees",
        "2Macc" => "2 Maccabees",
        "3Macc" => "3 Maccabees",
        "4Macc" => "4 Maccabees",
        "PrMan" => "Prayer of Manasseh (= Odes.14)",
        "1Esd" => "1 Esdras",
        "2Esd" => "2 Esdras | 5 Ezra (= Bible.NRSVA:2Esd.1-2Esd.2)",
        "Ps151" => "Psalm 151"
    );
    %bookNamesRahlfsLXX = (
        "Odes" => "Odes",
        "PssSol" => "Psalms of Solomon"
    );
    %bookNamesVulgate = ( # and other later Latin manuscripts
        "EpLao" => "Epistle to the Laodiceans",
        "3Esd" => "3 Esdras",
        "4Esd" => "4 Esdras | 4 Ezra (= Bible.NRSVA:2Esd.3-2Esd.14)",
        "5Esd" => "5 Esdras | 6 Ezra (= Bible.NRSVA:2Esd.15-2Esd.16)"
    );
    %bookNamesEthiopianOrthodox = ( # Canon/Ge'ez translation additions
        "1En" => "1 Enoch | Ethiopic (Apocalypse of) Enoch",
        "Jub" => "Jubilees",
        "4Bar" => "4 Baruch | Paraleipomena Jeremiou",
        "AscenIsa" => "Ascension/Vision of Isaiah (=MartAscenIsa.6 - .11)",
        "Teg" => "Tegsas (= Prov.25-Prov.31)",
        "PsJos" =>
            "Pseudo-Josephus | Jossipon; Josephu ben Gorion Medieval History of the Jews"
    );
    %bookNamesCopticOrthodox = (
        "AposCon" => "Apostolic Constitutions and Canons",
        "1Clem" => "1 Clement",
        "2Clem" => "2 Clement"
    );
    %bookNamesArmenianOrthodox = (
        "3Cor" => "3 Corinthians",
        "EpCorPaul" => "Epistle of the Corinthians to Paul and His Response",
        "JosAsen" => "Joseph and Asenath",
        "T12Patr" => "Testaments of the Twelve Patriarchs (12 parts)",
        "T12Patr.TAsh" => "Testaments of Asher",
        "T12Patr.TBenj" => "Testaments of Benjamin",
        "T12Patr.TDan" => "Testaments of Dan",
        "T12Patr.TGad" => "Testaments of Gad",
        "T12Patr.TIss" => "Testaments of Issachar",
        "T12Patr.TJos" => "Testaments of Joseph",
        "T12Patr.TJud" => "Testaments of Judah",
        "T12Patr.TLevi" => "Testaments of Levi",
        "T12Patr.TNaph" => "Testaments of Naphtali",
        "T12Patr.TReu" => "Testaments of Reuben",
        "T12Patr.TSim" => "Testaments of Simeon",
        "T12Patr.TZeb" => "Testaments of Zebulun"
    );
    %bookNamesPeshitta = (
        "2Bar" => "2 Baruch | (Syriac) Apocalypse of Baruch",
        "EpBar" => "Letter of Baruch (= 2Bar.78-2Bar.86)"
    );
    %bookNamesCodexSinaiticus = (
        "Barn" => "Barnabas",
        "Herm" => "Shepherd of Hermas (= Mandates, Similitudes, Visions)",
        "Herm.Mand" => "Shepherd of Hermas, Mandates",
        "Herm.Sim" => "Shepherd of Hermas, Similitudes",
        "Herm.Vis" => "Shepherd of Hermas, Visions"
    );

    # Build the union in the same order as before, so that if an abbreviation
    # ever appears in two collections the later collection still wins.
    %bookNamesAll = ();
    for my $collection (
        \%bookNamesOT,
        \%bookNamesNT,
        \%bookNamesApoc,
        \%bookNamesRahlfsLXX,
        \%bookNamesVulgate,
        \%bookNamesEthiopianOrthodox,
        \%bookNamesCopticOrthodox,
        \%bookNamesArmenianOrthodox,
        \%bookNamesPeshitta,
        \%bookNamesCodexSinaiticus,
    ) {
        addtoUnionBooksList($collection);
    }
} # setupBookAbbrevs
# Merge one book-name hash (passed by reference) into %bookNamesAll. Entries
# already present under the same abbreviation are overwritten.
sub addtoUnionBooksList {
    my ($collection) = @_;
    # keys and values walk an unmodified hash in the same order, so the hash
    # slice pairs every abbreviation with its own name.
    @bookNamesAll{ keys %{$collection} } = values %{$collection};
    return;
} # addtoUnionBooksList
# Suggest the normative OSIS abbreviation for a New Testament book, given a
# non-standard name or abbreviation.
#
#   $_[0]    the name as found (e.g. "Matthew", "Mk", "1Thes")
#   returns  the OSIS abbreviation (e.g. "Matt", "Mark", "1Thess"), or ""
#            when no rule matches (or the argument is undefined)
#
# Rules are tried in order and the first match wins, so more specific
# patterns must precede more general ones.
sub correctBookAbbrev {
    my ($name) = @_;
    return("") unless defined $name;

    # The short aliases (Mk, Lk, Jn, ...) must match the whole string. They
    # were formerly compared against $1, which no pattern here ever set, so
    # none of them could match.
    my @rules = (
        [ qr/^Mat/,             "Matt"   ],
        [ qr/^(?:Mark|Mk\z)/,   "Mark"   ],
        [ qr/^(?:Luk|Lk\z)/,    "Luke"   ],
        [ qr/^(?:John|Jn\z)/,   "John"   ],
        [ qr/^Ac/,              "Acts"   ],
        [ qr/^(?:Rom|Rm\z)/,    "Rom"    ],
        [ qr/^1C/,              "1Cor"   ],
        [ qr/^2C/,              "2Cor"   ],
        [ qr/^Gal/,             "Gal"    ],
        [ qr/^Eph/,             "Eph"    ],
        # "Phile..." has to be tested before "Phil...", which would otherwise
        # turn Philemon into Philippians.
        [ qr/^Phile/,           "Phlm"   ],
        [ qr/^(?:Phil|Phlp\z)/, "Phil"   ],
        [ qr/^Col/,             "Col"    ],
        # Likewise "1Th"/"2Th" before the broader "1T"/"2T".
        [ qr/^1Th/,             "1Thess" ],
        [ qr/^2Th/,             "2Thess" ],
        [ qr/^1T/,              "1Tim"   ],
        [ qr/^2T/,              "2Tim"   ],
        [ qr/^(?:Tit|Tt\z)/,    "Titus"  ],
        [ qr/^(?:Heb|Hb\z)/,    "Heb"    ],
        [ qr/^(?:Ja|Jm\z)/,     "Jas"    ],
        [ qr/^1P/,              "1Pet"   ],
        [ qr/^2P/,              "2Pet"   ],
        [ qr/^1J/,              "1John"  ],
        [ qr/^2J/,              "2John"  ],
        [ qr/^3J/,              "3John"  ],
        [ qr/^(?:Jude|Jd\z)/,   "Jude"   ],
        # "Apoc" may appear anywhere in the name, as in the original test.
        [ qr/^Rev|Apoc/,        "Rev"    ],
    );
    for my $rule (@rules) {
        my ($pattern, $osisAbbrev) = @{$rule};
        if ($name =~ $pattern) { return($osisAbbrev); }
    }
    return("");
} #
###############################################################################
# The order of books in various canons; the names used must be valid keys to
# the %bookNamesAll hash, as set up by setupBookAbbrevs().
#
# Fill the @canon... arrays with book abbreviations in canonical order, and
# pass each primary list to checkBookNames(). The Roman Catholic and
# Protestant canons are assembled from the OT, Apocrypha and NT lists and are
# not checked again.
sub setupCanons {
    @canonOT = qw(
        Gen Exod Lev Num Deut Josh
        Judg Ruth 1Sam 2Sam 1Kgs 2Kgs
        1Chr 2Chr Ezra Neh Esth Job
        Ps Prov Eccl Song Isa Jer
        Lam Ezek Dan Hos Joel Amos
        Obad Jonah Mic Nah Hab Zeph
        Hag Zech Mal
    );
    checkBookNames(@canonOT);

    @canonApoc = qw(
        Tob Jdt AddEsth Wis Sir Bar
        EpJer PrAzar Sus Bel 1Macc 2Macc
        3Macc 4Macc PrMan 1Esd 2Esd Ps151
    );
    checkBookNames(@canonApoc);

    @canonNT = qw(
        Matt Mark Luke John Acts Rom
        1Cor 2Cor Gal Eph Phil Col
        1Thess 2Thess 1Tim 2Tim Titus Phlm
        Heb Jas 1Pet 2Pet 1John 2John
        3John Jude Rev
    );
    checkBookNames(@canonNT);

    # Composite canons: plain concatenations of the lists above.
    @canonRomanCatholic = (@canonOT, @canonApoc, @canonNT);
    @canonProtestant = (@canonOT, @canonNT);

    @canonRahlfsLXX = qw(Odes PssSol);
    checkBookNames(@canonRahlfsLXX);

    @canonVulgate = qw(EpLao 3Esd 4Esd 5Esd);
    checkBookNames(@canonVulgate);

    # Canon/Ge'ez translation additions
    @canonEthiopianOrthodox = qw(1En Jub 4Bar AscenIsa Teg PsJos);
    checkBookNames(@canonEthiopianOrthodox);

    @canonCopticOrthodox = qw(AposCon 1Clem 2Clem);
    checkBookNames(@canonCopticOrthodox);

    @canonArmenianOrthodox = qw(
        3Cor EpCorPaul JosAsen T12Patr
        T12Patr.TAsh T12Patr.TBenj T12Patr.TDan T12Patr.TGad
        T12Patr.TIss T12Patr.TJos T12Patr.TJud T12Patr.TLevi
        T12Patr.TNaph T12Patr.TReu T12Patr.TSim T12Patr.TZeb
    );
    checkBookNames(@canonArmenianOrthodox);

    @canonPeshitta = qw(2Bar EpBar);
    checkBookNames(@canonPeshitta);

    @canonCodexSinaiticus = qw(Barn Herm Herm.Mand Herm.Sim Herm.Vis);
    checkBookNames(@canonCodexSinaiticus);
} # setupCanons
###############################################################################
# A hash keyed by book abbreviation. Each entry's value is an array.
# In each array, [0] is number of chapters, [1] is number of verses total,
# then [2..n] are verses in each chapter.
#
# Private helper for setupVerseCounts(): add up the declared per-book verse
# totals (element [1] of each entry) of a verse-count hash passed by reference.
sub sumDeclaredVerses {
    my ($counts) = @_;
    my $sum = 0;
    for my $book (keys %{$counts}) {
        $sum += $counts->{$book}[1];
    }
    return($sum);
}

# Fill %otcounts, %apoccounts and %ntcounts. Each is keyed by book
# abbreviation; each value is an array reference in which [0] is the number of
# chapters, [1] the total number of verses, and [2..n] the number of verses in
# each chapter. An entry of [0, 0] is a placeholder for a book whose counts
# have not been entered yet.
#
# After each table is built, its keys are checked with checkBookNames() and
# its arithmetic with checkVerseCountHash(); under -v the grand total is also
# compared with the expected total for that collection.
#
# Changes from the previous version: totals are compared numerically (!=)
# rather than as strings; a negative expected total means "unknown" and is no
# longer reported as a mismatch; and the duplicate "1Esd"/"2Esd" keys in
# %apoccounts are gone (the later [0, 0] entries always won, so the effective
# data is unchanged).
sub setupVerseCounts {
    %otcounts = (
        "Gen" => [ 50, 1533,
            31, 25, 24, 26, 32, 22, 24, 22, 29, 32, 32, 20, 18, 24, 21,
            16, 27, 33, 38, 18, 34, 24, 20, 67, 34, 35, 46, 22, 35, 43,
            54, 33, 20, 31, 29, 43, 36, 30, 23, 23, 57, 38, 34, 34, 28,
            34, 31, 22, 33, 26 ],
        "Exod" => [ 40, 1213,
            22, 25, 22, 31, 23, 30, 29, 28, 35, 29, 10, 51, 22, 31, 27,
            36, 16, 27, 25, 26, 37, 30, 33, 18, 40, 37, 21, 43, 46, 38,
            18, 35, 23, 35, 35, 38, 29, 31, 43, 38 ],
        "Lev" => [ 27, 859,
            17, 16, 17, 35, 26, 23, 38, 36, 24, 20, 47, 8, 59, 57, 33,
            34, 16, 30, 37, 27, 24, 33, 44, 23, 55, 46, 34 ],
        "Num" => [ 36, 1289,
            54, 34, 51, 49, 31, 27, 89, 26, 23, 36, 35, 16, 33, 45, 41,
            35, 28, 32, 22, 29, 35, 41, 30, 25, 19, 65, 23, 31, 39, 17,
            54, 42, 56, 29, 34, 13 ],
        "Deut" => [ 34, 959,
            46, 37, 29, 49, 33, 25, 26, 20, 29, 22, 32, 31, 19, 29, 23,
            22, 20, 22, 21, 20, 23, 29, 26, 22, 19, 19, 26, 69, 28, 20,
            30, 52, 29, 12 ],
        "Josh" => [ 24, 658,
            18, 24, 17, 24, 15, 27, 26, 35, 27, 43, 23, 24, 33, 15, 63,
            10, 18, 28, 51, 9, 45, 34, 16, 33 ],
        "Judg" => [ 21, 618,
            36, 23, 31, 24, 31, 40, 25, 35, 57, 18, 40, 15, 25, 20, 20,
            31, 13, 31, 30, 48, 25 ],
        "Ruth" => [ 4, 85,
            22, 23, 18, 22 ],
        "1Sam" => [ 31, 810,
            28, 36, 21, 22, 12, 21, 17, 22, 27, 27, 15, 25, 23, 52, 35,
            23, 58, 30, 24, 42, 16, 23, 28, 23, 43, 25, 12, 25, 11, 31,
            13 ],
        "2Sam" => [ 24, 695,
            27, 32, 39, 12, 25, 23, 29, 18, 13, 19, 27, 31, 39, 33, 37,
            23, 29, 32, 44, 26, 22, 51, 39, 25 ],
        "1Kgs" => [ 22, 817,
            53, 46, 28, 20, 32, 38, 51, 66, 28, 29, 43, 33, 34, 31, 34,
            34, 24, 46, 21, 43, 29, 54 ],
        "2Kgs" => [ 25, 719,
            18, 25, 27, 44, 27, 33, 20, 29, 37, 36, 20, 22, 25, 29, 38,
            20, 41, 37, 37, 21, 26, 20, 37, 20, 30 ],
        "1Chr" => [ 29, 943,
            54, 55, 24, 43, 41, 66, 40, 40, 44, 14, 47, 41, 14, 17, 29,
            43, 27, 17, 19, 8, 30, 19, 32, 31, 31, 32, 34, 21, 30 ],
        "2Chr" => [ 36, 821,
            18, 17, 17, 22, 14, 42, 22, 18, 31, 19, 23, 16, 23, 14, 19,
            14, 19, 34, 11, 37, 20, 12, 21, 27, 28, 23, 9, 27, 36, 27,
            21, 33, 25, 33, 26, 23 ],
        "Ezra" => [ 10, 280,
            11, 70, 13, 24, 17, 22, 28, 36, 15, 44 ],
        "Neh" => [ 13, 405,
            11, 20, 38, 17, 19, 19, 72, 18, 37, 40, 36, 47, 31 ],
        "Tob" => [ 14, 245,
            22, 14, 17, 21, 22, 18, 17, 21, 6, 14, 18, 22, 18, 15 ],
        "Jdt" => [ 16, 340,
            16, 28, 10, 15, 24, 21, 32, 36, 14, 23, 23, 20, 20, 19, 14,
            25 ],
        "Esth" => [ 16, 272,
            22, 23, 15, 17, 14, 14, 10, 17, 32, 3, 17, 8, 30, 16, 24,
            10 ],
        "1Macc" => [ 16, 922,
            63, 70, 59, 61, 68, 63, 50, 32, 73, 89, 74, 53, 53, 49, 41,
            24 ],
        "2Macc" => [ 15, 556,
            36, 32, 40, 50, 27, 31, 42, 36, 29, 38, 38, 46, 26, 46, 39 ],
        "Job" => [ 42, 1068,
            22, 13, 26, 21, 27, 30, 21, 22, 35, 22, 20, 25, 28, 22, 35,
            22, 16, 21, 29, 29, 34, 30, 17, 25, 6, 14, 21, 28, 25, 31,
            40, 22, 33, 37, 16, 33, 24, 41, 30, 32, 26, 17 ],
        # A Catholic list at catholic-resources.org says 2526.
        "Ps" => [ 150, 2461,
            6, 12, 8, 8, 12, 10, 17, 9, 20, 18, 7, 8, 6, 7, 5,
            11, 15, 50, 14, 9, 13, 31, 6, 10, 22, 12, 14, 9, 11, 12,
            24, 11, 22, 22, 28, 12, 40, 22, 13, 17, 13, 11, 5, 26, 17,
            11, 9, 14, 20, 23, 19, 9, 6, 7, 23, 13, 11, 11, 17, 12,
            8, 12, 11, 10, 13, 20, 7, 35, 36, 5, 24, 20, 28, 23, 10,
            12, 20, 72, 13, 19, 16, 8, 18, 12, 13, 17, 7, 18, 52, 17,
            16, 15, 5, 23, 11, 13, 12, 9, 9, 5, 8, 28, 22, 35, 45,
            48, 43, 13, 31, 7, 10, 10, 9, 8, 18, 19, 2, 29, 176, 7,
            8, 9, 4, 8, 5, 6, 5, 6, 8, 8, 3, 18, 3, 3, 21,
            26, 9, 8, 24, 13, 10, 7, 12, 15, 21, 10, 20, 14, 9, 6 ],
        "Prov" => [ 31, 915,
            33, 22, 35, 27, 23, 35, 27, 36, 18, 32, 31, 28, 25, 35, 33,
            33, 28, 24, 29, 30, 31, 29, 35, 34, 28, 28, 27, 28, 27, 33,
            31 ],
        "Eccl" => [ 12, 222,
            18, 26, 22, 17, 19, 12, 29, 17, 18, 20, 10, 14 ],
        "Song" => [ 8, 117,
            17, 17, 11, 16, 16, 12, 14, 14 ],
        "Wis" => [ 19, 436,
            16, 24, 19, 20, 23, 25, 30, 21, 18, 21, 26, 27, 19, 31, 19,
            29, 21, 25, 22 ],
        "Sir" => [ 51, 1372,
            29, 18, 30, 31, 17, 37, 36, 19, 18, 30, 34, 18, 25, 27, 20,
            28, 27, 33, 26, 30, 28, 27, 27, 31, 25, 20, 30, 26, 28, 25,
            31, 24, 33, 26, 24, 27, 30, 34, 35, 30, 24, 25, 35, 23, 26,
            20, 25, 25, 16, 29, 30 ],
        "Isa" => [ 66, 1291,
            31, 22, 26, 6, 30, 13, 25, 23, 20, 34, 16, 6, 22, 32, 9,
            14, 14, 7, 25, 6, 17, 25, 18, 23, 12, 21, 13, 29, 24, 33,
            9, 20, 24, 17, 10, 22, 38, 22, 8, 31, 29, 25, 28, 28, 25,
            13, 15, 22, 26, 11, 23, 15, 12, 17, 13, 12, 21, 14, 21, 22,
            11, 12, 19, 11, 25, 24 ],
        "Jer" => [ 52, 1364,
            19, 37, 25, 31, 31, 30, 34, 23, 25, 25, 23, 17, 27, 22, 21,
            21, 27, 23, 15, 18, 14, 30, 40, 10, 38, 24, 22, 17, 32, 24,
            40, 44, 26, 22, 19, 32, 21, 28, 18, 16, 18, 22, 13, 30, 5,
            28, 7, 47, 39, 46, 64, 34 ],
        "Lam" => [ 5, 154,
            22, 22, 66, 22, 22 ],
        "Bar" => [ 6, 213,
            22, 35, 38, 37, 9, 72 ],
        "Ezek" => [ 48, 1273, # corrected total from 1271
            28, 10, 27, 17, 17, 14, 27, 18, 11, 22, 25, 28, 23, 23, 8,
            63, 24, 32, 14, 44, 37, 31, 49, 27, 17, 21, 36, 26, 21, 26,
            # 49 32 in NEB
            18, 32, 33, 31, 15, 38, 28, 23, 29, 49, 26, 20, 27, 31, 25,
            24, 23, 35 ],
        "Dan" => [ 14, 530,
            21, 49, 100, 34, 30, 29, 28, 27, 27, 21, 45, 13, 64, 42 ],
        "Hos" => [ 14, 197,
            9, 25, 5, 19, 15, 11, 16, 14, 17, 15, 11, 15, 15, 10 ],
        "Joel" => [ 4, 73,
            20, 27, 5, 21 ],
        "Amos" => [ 9, 146,
            15, 16, 15, 13, 27, 14, 17, 14, 15 ],
        "Obad" => [ 1, 21,
            21 ],
        "Jonah" => [ 4, 48,
            16, 11, 10, 11 ],
        "Mic" => [ 7, 105,
            16, 13, 12, 14, 14, 16, 20 ],
        "Nah" => [ 3, 47,
            14, 14, 19 ],
        "Hab" => [ 3, 56,
            17, 20, 19 ],
        "Zeph" => [ 3, 53,
            18, 15, 20 ],
        "Hag" => [ 2, 38,
            15, 23 ],
        "Zech" => [ 14, 211,
            17, 17, 10, 14, 11, 15, 14, 23, 17, 12, 17, 14, 9, 21 ],
        "Mal" => [ 3, 55,
            14, 17, 24 ]
    );
    my $tot = sumDeclaredVerses(\%otcounts);
    if ($verbose && $totalOTVerses >= 0 && $tot != $totalOTVerses) {
        warn "Books added up to $tot for OT, not $totalOTVerses.\n";
    }
    checkBookNames(keys %otcounts);
    checkVerseCountHash(\%otcounts);

    # -------------------------------------------------------------------------
    # figures from http://catholic-resources.org/Bible/OT-Statistics-NAB.htm
    # ("1Esd" and "2Esd" were formerly also listed at the top of this table
    # with chapter counts 9 and 16 but no verse data; the placeholder entries
    # near the end overrode them, and only those are kept.)
    %apoccounts = (
        "Tob" => [ 14, 245,
            22, 14, 17, 21, 22, 18, 17, 21, 6, 14, 18, 22, 18, 15 ],
        "Jdt" => [ 16, 340,
            16, 28, 10, 15, 24, 21, 32, 36, 14, 23, 23, 20, 20, 19, 14,
            25 ],
        # Note: AddEsth starts at chapter 11?
        "AddEsth" => [ 0, 0,
            ],
        "Wis" => [ 19, 436,
            16, 24, 19, 20, 23, 25, 30, 21, 18, 21, 26, 27, 19, 31, 19,
            29, 21, 25, 22 ],
        "Sir" => [ 51, 1372, # Also known as Ecclesiasticus
            29, 18, 30, 31, 17, 37, 36, 19, 18, 30, 34, 18, 25, 27, 20,
            28, 27, 33, 26, 30, 28, 27, 27, 31, 25, 20, 30, 26, 28, 25,
            31, 24, 33, 26, 24, 27, 30, 34, 35, 30, 24, 25, 35, 23, 26,
            20, 25, 25, 16, 29, 30, ],
        "Bar" => [ 6, 213,
            22, 35, 38, 37, 9, 72 ],
        # Seems to start at chapter 6, and have 73 verses?
        "EpJer" => [ 0, 0,
            ],
        # Song of the Three (goes between Daniel 3.23 and 3.24)
        "PrAzar" => [ 0, 0,
            ],
        "Sus" => [ 1, 64,
            64 ],
        "Bel" => [ 1, 42,
            42 ],
        "1Macc" => [ 16, 922,
            63, 70, 59, 61, 68, 63, 50, 32, 73, 89, 74, 53, 53, 49, 41,
            24 ],
        "2Macc" => [ 15, 556,
            36, 32, 40, 50, 27, 31, 42, 36, 29, 38, 38, 46, 26, 46, 39 ],
        # Catholic Apocrypha end here.
        "3Macc" => [ 0, 0,
            ],
        "4Macc" => [ 0, 0,
            ],
        "PrMan" => [ 0, 0,
            ],
        "1Esd" => [ 0, 0, # 9 chapters; verse counts not entered yet
            ],
        "2Esd" => [ 0, 0, # 16 chapters; verse counts not entered yet
            ],
        "Ps151" => [ 0, 0,
            ],
    );
    $tot = sumDeclaredVerses(\%apoccounts);
    if ($verbose && $totalApocVerses >= 0 && $tot != $totalApocVerses) {
        warn "Books added to $tot total verses for Apocrypha, not $totalApocVerses.\n";
    }
    checkBookNames(keys %apoccounts);
    checkVerseCountHash(\%apoccounts);

    # -------------------------------------------------------------------------
    %ntcounts = (
        "Matt" => [ 28, 1071,
            25, 23, 17, 25, 48, 34, 29, 34, 38, 42, 30, 50, 58, 36, 39,
            28, 27, 35, 30, 34, 46, 46, 39, 51, 46, 75, 66, 20 ],
        "Mark" => [ 16, 678,
            45, 28, 35, 41, 43, 56, 37, 38, 50, 52, 33, 44, 37, 72, 47,
            20 ],
        "Luke" => [ 24, 1151,
            80, 52, 38, 44, 39, 49, 50, 56, 62, 42, 54, 59, 35, 35, 32,
            31, 37, 43, 48, 47, 38, 71, 56, 53 ],
        "John" => [ 21, 879,
            51, 25, 36, 54, 47, 71, 53, 59, 41, 42, 57, 50, 38, 31, 27,
            33, 26, 40, 42, 31, 25 ],
        "Acts" => [ 28, 1006,
            26, 47, 26, 37, 42, 15, 60, 40, 43, 48, 30, 25, 52, 28, 41,
            40, 34, 28, 40, 38, 40, 30, 35, 27, 27, 32, 44, 31 ],
        "Rom" => [ 16, 433,
            32, 29, 31, 25, 21, 23, 25, 39, 33, 21, 36, 21, 14, 23, 33,
            27 ],
        "1Cor" => [ 16, 437,
            31, 16, 23, 21, 13, 20, 40, 13, 27, 33, 34, 31, 13, 40, 58,
            24 ],
        # Added 1 verse to 2Cor 13.
        "2Cor" => [ 13, 257,
            24, 17, 18, 18, 21, 18, 16, 24, 15, 18, 33, 21, 14 ],
        "Gal" => [ 6, 149,
            24, 21, 29, 31, 26, 18 ],
        "Eph" => [ 6, 155,
            23, 22, 21, 32, 33, 24 ],
        "Phil" => [ 4, 104,
            30, 30, 21, 23 ],
        "Col" => [ 4, 95,
            29, 23, 25, 18 ],
        "1Thess" => [ 5, 89,
            10, 20, 13, 18, 28 ],
        "2Thess" => [ 3, 47,
            12, 17, 18 ],
        "1Tim" => [ 6, 113,
            20, 15, 16, 16, 25, 21 ],
        "2Tim" => [ 4, 83,
            18, 26, 17, 22 ],
        "Titus" => [ 3, 46,
            16, 15, 15 ],
        "Phlm" => [ 1, 25,
            25 ],
        "Heb" => [ 13, 303,
            14, 18, 19, 16, 14, 20, 28, 13, 28, 39, 40, 29, 25 ],
        "Jas" => [ 5, 108,
            27, 26, 18, 17, 20 ],
        "1Pet" => [ 5, 105,
            25, 25, 22, 19, 14 ],
        "2Pet" => [ 3, 61,
            21, 22, 18 ],
        "1John" => [ 5, 105,
            10, 29, 24, 21, 21 ],
        "2John" => [ 1, 13,
            13 ],
        "3John" => [ 1, 14, # Based on NEB; check others?
            14 ],
        "Jude" => [ 1, 25,
            25 ],
        # UBS 3 has Rev.12.18; TR and KJV lack it; others have as part of v. 17.
        "Rev" => [ 22, 404,
            20, 29, 22, 11, 14, 17, 17, 13, 21, 11, 19, 17, 18, 20, 8,
            21, 18, 24, 21, 15, 27, 21 ],
    );
    $tot = sumDeclaredVerses(\%ntcounts);
    if ($verbose && $totalNTVerses >= 0 && $tot != $totalNTVerses) {
        warn "Books added to $tot total verses for NT, not $totalNTVerses.\n";
    }
    checkBookNames(keys %ntcounts);
    checkVerseCountHash(\%ntcounts);
} # setupVerseCounts
# Add up the array of verseCounts per chapter, and see if it matches the
# explicit total (which is in [1]).
# NOTE(review): in this copy of the file everything between the "<" of the
# summing loop in checkVerseCountHash and the ">" of "$ntokens>3" in
# checkOsisIdSyntax was lost. That span held the rest of checkVerseCountHash,
# all of checkBookNames, and the opening lines of checkOsisIdSyntax. Those
# parts are reconstructed below from the surviving code and the call sites;
# the wording of the reconstructed messages is new. Confirm against an
# upstream copy.

# Sanity-check one verse-count hash (passed by reference): for each book, the
# array must hold exactly [0] per-chapter entries after the two header slots,
# and those entries must add up to the declared total in [1].
sub checkVerseCountHash {
    my %h = %{$_[0]};
    for my $book (keys %h) {
        my @nums = @{$h{$book}};
        my $nchapters = $nums[0];
        my $nverses = $nums[1];
        # The first two slots are the header, so the number of chapters
        # actually listed is scalar(@nums) - 2 (the old message reported
        # scalar(@nums) itself).
        (scalar @nums == $nchapters+2) ||
            warn "ERROR: Chapter count $nchapters, but " . (scalar(@nums) - 2)
                . " chapters in verse-count array for book '$book'.\n";
        my $vct = 0;
        for (my $i=2; $i<scalar(@nums); $i++) {
            $vct += $nums[$i];
        }
        ($vct == $nverses) ||
            warn "ERROR: Verse count $nverses, but chapters add up to $vct"
                . " for book '$book'.\n";
    }
} # checkVerseCountHash

# Warn about any of the given book abbreviations that is not a key of
# %bookNamesAll. Returns 1 if all are known, else 0.
sub checkBookNames {
    my $allKnown = 1;
    for my $abbrev (@_) {
        if (!defined $bookNamesAll{$abbrev}) {
            warn "ERROR: Unknown book abbreviation '$abbrev'.\n";
            $allKnown = 0;
        }
    }
    return($allKnown);
} # checkBookNames

# Validate one osisID found on an element of the given type.
#
#   $_[0]    the osisID value, e.g. "Matt.5.3"
#   $_[1]    the element type it was found on ("verse", "chapter", "div", ...)
#   returns  1 if the ID is well-formed for that element type AND names a
#            book, chapter and verse within the known OT/NT counts; else 0
#
# A verse needs Book.Chapter.Verse (more tokens only with -vfragments), a
# chapter needs Book.Chapter, and a div needs just Book. An osisID on any
# other element type is reported as unexpected and fails.
sub checkOsisIdSyntax {
    my ($anId, $elementType) = @_;
    my $rc = 0;
    my @tokens = split(/\./, $anId);
    my $ntokens = scalar @tokens;
    if ($elementType eq "verse") {
        if ($ntokens>3 && !$vfragments) { # vfragments for sub-verse ids
            warn "Too many tokens in osisId '$anId'.\n";
        }
        elsif ($ntokens<3) {
            warn "Need at least 3 tokens in osisID '$anId' on a verse.\n";
        }
        elsif ($tokens[0] !~ m/^[A-Z1-9][a-zA-Z0-9]+$/) {
            warnEvent("Book token '$tokens[0]' in '$anId' is not valid.\n");
        }
        elsif ($tokens[1] !~ m/^[1-9][0-9]{0,2}$/) {
            warnEvent("Chapter token '$tokens[1]' in '$anId' is not valid.\n");
        }
        elsif ($tokens[2] !~ m/^[1-9][0-9]{0,2}$/) {
            warnEvent("Verse token '$tokens[2]' in '$anId' is not valid.\n");
        }
        else { $rc = 1; }
    }
    elsif ($elementType eq "chapter") {
        if ($ntokens != 2) {
            warn "Need exactly 2 tokens in osisID '$anId' on a chapter.\n";
        }
        elsif ($tokens[0] !~ m/^[A-Z1-9][a-zA-Z0-9]+$/) {
            warnEvent("Book token '$tokens[0]' in '$anId' is not valid.\n");
        }
        elsif ($tokens[1] !~ m/^[1-9][0-9]{0,2}$/) {
            warnEvent("Chapter token '$tokens[1]' in '$anId' is not valid.\n");
        }
        else { $rc = 1; }
    }
    elsif ($elementType eq "div") {
        if ($ntokens != 1) {
            warn "Need exactly 1 token in osisID '$anId' on a div.\n";
        }
        elsif ($tokens[0] !~ m/^[A-Z1-9][a-zA-Z0-9]+$/) {
            warnEvent("Book token '$tokens[0]' in '$anId' is not valid.\n");
        }
        else { $rc = 1; }
    }
    else {
        # $rc stays 0 here, so an osisID on any other element always fails.
        warn "Unexpected osisID attribute '$anId' on '$elementType' element.\n";
        if ($anId !~ m/^[A-Z1-9][a-zA-Z0-9]+\.[1-9][0-9]{0,2}\.[1-9][0-9]{0,2}/) {
            warn "Bad syntax in osisID '$anId'.\n";
        }
    }
    ($rc == 0) && return(0); # FAIL

    # Values for context: book for canon, chapter for book, verse for chapter.
    my @chapterInfo = ();
    if (defined $otcounts{$tokens[0]}) {
        @chapterInfo = @{$otcounts{$tokens[0]}};
    }
    elsif (defined $ntcounts{$tokens[0]}) {
        @chapterInfo = @{$ntcounts{$tokens[0]}};
    }
    else {
        warnEvent("Can't find book '$tokens[0]' in chapter-counts lists "
            . "(only OT and NT have counts so far).\n");
        my $c = correctBookAbbrev($tokens[0]);
        ($c) && warn " Did you mean '$c'?\n";
        return(0);
    }
    if (scalar @chapterInfo < 1) {
        warnEvent("Can't find book '$tokens[0]' for '$anId'.\n");
        return(0);
    }
    if (scalar @tokens > 1) {
        my $chapternum = $tokens[1];
        if ($chapternum > $chapterInfo[0]) {
            warnEvent("Chapter number '$tokens[1]' too high "
                . "(book '$tokens[0]' limit is $chapterInfo[0]).\n");
            return(0);
        }
        if (scalar @tokens > 2) {
            # Chapter n's verse count sits at index n+1, because slots [0]
            # and [1] hold the chapter count and the verse total.
            if ($tokens[2] > $chapterInfo[$chapternum+1]) {
                warnEvent("Verse number '$tokens[2]' too high "
                    . "(book '$tokens[0]' chapter '$chapternum' limit is "
                    . $chapterInfo[$chapternum+1] . ").\n");
                return(0);
            }
        }
    }
    return(1);
} # checkOsisIdSyntax
###############################################################################
# See if what we got, points to where we expect. If not, we should try harder to
# recover (eventually).
# Check that an osisID is the next one expected in the current book's ID list,
# and advance the expected position.
#
#   $_[0]    the osisID just found (already syntax-checked by the caller)
#   returns  1 if it was the expected ID, else 0 (after reporting how far off
#            it was, and resynchronizing just past it when it can be found)
sub checkSequence {
    my $theOsisId = $_[0];
    my $expected = $currentBookIdList[$currentPlaceInIdList];

    # Once the last book of the canon is finished there is no expected ID any
    # more; report that plainly instead of comparing against undef.
    if (!defined $expected) {
        warn "osisID out of order. Found '$theOsisId', but no further "
            . "osisIDs are expected.\n";
        return(0);
    }

    if ($theOsisId eq $expected) {
        if (++$currentPlaceInIdList >= scalar(@currentBookIdList)) { # end book
            loadNextBookIdList();
        }
        return(1);
    }

    warn "osisID out of order. Expected '"
        . $expected
        . "', but found '$theOsisId'.\n";
    my $foundAt = findOsisIdInBookIdList($theOsisId);
    if ($foundAt == -1) {
        warn " Can't find '$theOsisId' among osisIds for current book.\n";
        # Keep our place. Resynchronizing to $foundAt+1 here would reset the
        # position to the start of the book and make every later ID look
        # wrong.
        return(0);
    }
    if ($foundAt < $currentPlaceInIdList) {
        my $d = $currentPlaceInIdList-$foundAt;
        warn " Found '$theOsisId' too early by $d. "
            . "Verse(s) out of order or duplicated?\n";
    }
    else {
        warn " Found '$theOsisId' too late by "
            . ($foundAt - $currentPlaceInIdList) . ". Unit(s) missing?\n";
    }
    # Resynchronize just past the ID we actually saw; if that was the last ID
    # of the book, move on to the next book as the normal path does.
    $currentPlaceInIdList = $foundAt+1;
    if ($currentPlaceInIdList >= scalar(@currentBookIdList)) {
        loadNextBookIdList();
    }
    return(0);
}
# Advance to the next book of the chosen canon and load its expected osisID
# list, resetting the position within it. Does nothing further once the canon
# is exhausted (the previous list and position are left as they were).
sub loadNextBookIdList {
    $currentBookNum += 1;
    my $booksInCanon = scalar @canonOfChoice;
    return if ($currentBookNum >= $booksInCanon);

    $currentBookName = $canonOfChoice[$currentBookNum];
    my $idListRef = getIdListForBook($currentBookName);
    @currentBookIdList = @{$idListRef};
    $currentPlaceInIdList = 0;
}
# NOTE(review): in this copy of the file everything between the "<" of the
# search loop in findOsisIdInBookIdList and the first "=>" of the table in
# isIdTrOnly was lost. That span held the rest of findOsisIdInBookIdList, the
# opening of isIdTrOnly including its first table key, and presumably
# getIdListForBook (which is called elsewhere in this file but is defined
# nowhere else in it). The three subs below are reconstructed from their call
# sites; getIdListForBook in particular is a best guess. Confirm against an
# upstream copy.

# Return the index of an osisID within @currentBookIdList, or -1 if absent.
sub findOsisIdInBookIdList {
    my $osisId = $_[0];
    for (my $i=0; $i<scalar(@currentBookIdList); $i++) {
        if ($currentBookIdList[$i] eq $osisId) { return($i); }
    }
    return(-1);
} # findOsisIdInBookIdList

# Build the expected sequence of osisIDs for one book and return it as an
# array reference: the book ID itself, then for each chapter the chapter ID
# followed by its verse IDs. Books with no verse counts yield an empty list.
# NOTE(review): presumably verses found only in the Textus Receptus are left
# out when -notr is given, since nothing else in the file reads $tr or calls
# isIdTrOnly -- confirm.
sub getIdListForBook {
    my ($book) = @_;
    my @ids = ();
    my $info = $otcounts{$book} || $ntcounts{$book};
    if (!defined $info) { return(\@ids); }
    my $nchapters = $info->[0];
    push @ids, $book;
    for my $c (1 .. $nchapters) {
        push @ids, "$book.$c";
        my $nverses = $info->[$c+1]; # [0],[1] are header slots
        next unless defined $nverses;
        for my $v (1 .. $nverses) {
            my $id = "$book.$c.$v";
            next if (!$tr && isIdTrOnly($id));
            push @ids, $id;
        }
    }
    return(\@ids);
} # getIdListForBook

# Is the given osisID a verse that is present in the Textus Receptus but
# absent from other editions? Returns 1 or 0.
#
# The old lookup compared osisIDs with the numeric "==" operator. Both sides
# are non-numeric strings, so every comparison was 0 == 0 and the function
# returned 1 for any input at all. It is now a hash lookup, and only entries
# whose edition is "TR" count ("Rev.12.18" is listed for the opposite reason:
# UBS3 has it and TR does not).
sub isIdTrOnly {
    my ($osisId) = @_;
    return(0) unless defined $osisId;
    my %editionOf = (
        # NOTE(review): one entry that preceded "Mark.7.16" (value "TR") was
        # lost together with its key and could not be restored here.
        "Mark.7.16" => "TR",
        "Mark.11.26" => "TR",
        "Mark.16.9" => "TR",
        "Mark.16.10" => "TR",
        "Mark.16.11" => "TR",
        "Mark.16.12" => "TR",
        "Mark.16.13" => "TR",
        "Mark.16.14" => "TR",
        "Mark.16.15" => "TR",
        "Mark.16.16" => "TR",
        "Mark.16.17" => "TR",
        "Mark.16.18" => "TR",
        "Mark.16.19" => "TR",
        "Mark.16.20" => "TR",
        "John.7.53" => "TR",
        "John.8.1" => "TR",
        "John.8.2" => "TR",
        "John.8.3" => "TR",
        "John.8.4" => "TR",
        "John.8.5" => "TR",
        "John.8.6" => "TR",
        "John.8.7" => "TR",
        "John.8.8" => "TR",
        "John.8.9" => "TR",
        "John.8.10" => "TR",
        "John.8.11" => "TR",
        "Acts.8.37" => "TR",
        "Acts.9.6" => "TR",
        "1John.5.7" => "TR", # The "Johannine comma"
        "1John.5.8" => "TR", # The "Johannine comma"
        "Rev.12.18" => "UBS3", # TR lacks, others put into v. 17
    ); # Missing or just shorter?
    my $edition = $editionOf{$osisId};
    if (defined $edition && $edition eq "TR") { return(1); }
    return(0);
} # isIdTrOnly
###############################################################################
# List verses where the Tetragrammaton occurs
# Placeholder: no Tetragrammaton occurrence table is built yet.
sub setupTetraOccurrences {
    #%tetras = ();
    return;
}
# Placeholder: no salutation occurrence table is built yet.
sub setupSalutationOccurrences {
    #%salutations = ();
    return;
}
###############################################################################
# See if an xml:lang value is one we recognize.
# From http://www.loc.gov/standards/iso639-2/php/code_list.php
# See files lang2letter and lang3letter, etc.
# Placeholder: no table of known language codes is built yet.
sub setupKnownLangs {
    return ();
}
# Rough plausibility check for a language code: accepts any string of two or
# three characters (the lengths of ISO 639 two- and three-letter codes).
#
#   $_[0]    the language code to check
#   returns  1 if the code is 2 or 3 characters long, else 0 (also for undef)
#
# Previously a failed check fell off the end of the sub, returning whatever
# the last evaluated condition happened to yield, and an undefined argument
# triggered an "uninitialized value" warning.
sub checkLang {
    my ($code) = @_;
    return(0) unless defined $code;
    my $len = length($code);
    if ($len > 1 && $len < 4) {
        return(1);
    }
    return(0);
}
# Set (global) min/max values for characters in a given range
# Placeholder: no per-language character ranges are set yet.
sub setRangeForLang {
    return;
}
# Is the given Unicode char ok for a given language?
# Placeholder: every character is currently accepted for every language.
sub charOkForLang {
    my $accepted = 1;
    return($accepted);
}
###############################################################################
# Events always update $curline, so we don't have to pass pointer to parser all
# the way down.
# Report one event or error on STDERR, tagged with the current input line
# (from the global $curline), and count it in $totalErrors.
#
#   $_[0]    the message; one trailing newline, if any, is removed
#   $_[1]    optional detail, appended after ": "
#
# The detail is now tested for being defined and non-empty rather than for
# truth, so a detail of "0" (for instance a text node consisting of "0") is
# no longer dropped.
sub warnEvent {
    my ($message, $detail) = @_;
    if (!defined $message) { $message = ""; }
    chomp $message;
    my $suffix = (defined $detail && $detail ne "") ? ": $detail" : "";
    warn "******* (line $curline): $message$suffix\n";
    $totalErrors++;
}
# Print a piece of reconstructed XML on STDOUT, but only when -echo is on.
sub xmlEcho {
    my ($text) = @_;
    if ($echo) {
        print $text;
    }
}
###############################################################################
# Print the usage message on STDERR.
#
# The text is kept in step with what the command-line parser accepts: the
# canon names are the ones the -canon validation allows (RC and Prot, not the
# long forms), -h/-help and -version are listed, and the two options that are
# documented but not yet accepted (-apoc, -strongs) are listed separately
# instead of among the usable options.
sub showUsage {
    warn "
Usage: osisCheck [options] file
Checks an OSIS XML Bible for correct IDs, book/chapter/verse numbering, etc.
Does *not* do full XML validation.
Options:
-break With -echo, break before each tag.
-canon name Which canon to check (default = NT), from:
OT, NT, Apoc, RC (Roman Catholic), Prot (Protestant), RahlsLXX,
Vulgate, Ethiopian, Coptic, Armenian, Peshitta, Sinaiticus.
-catalog path Use specified XML catalog?
-echo Copy out the XML as it is checked.
-elements Report how many instances of each element type occurred.
-h, -help Show this message.
-hebrew Use Hebrew numbering for Psalm headings (not finished yet),
instead of Greek numbering.
-maxerrors n Set to stop after finding n errors.
-q Suppress most messages.
-tr Expect more verses as in KJV/TR.
-v Add more messages, and check integrity frequently.
-version Show license information.
-vfragments Allow osisIDs to include sub-verse identifiers.
-xmllang name Allow this language (repeatable). If this option is never
specified, all languages are allowed. (not yet supported).
Planned options (not yet accepted on the command line):
-apoc Expect the OT Apocryphal books.
-strongs Validate that the right Strong's numbers are attached
to each verse.
Version/license information:
Last modified $version. Use osisCheck -version for further information.
Known Bugs/Limitations (please report any other bugs to sderose\@acm.org):
Canons other than Roman Catholic and Protestant are not finished yet.
Discontiguous verses (repeated osisIDs) and combined verses (multiple
osisIDs on a single verse) are not yet supported.
Does not yet check whether various things are tagged at all (divineName, etc).
";
}
# Print the license and version notice on STDOUT.
sub showLicense {
    # The here-document starts with an empty line, so the notice is preceded
    # by a blank line, exactly as before.
    my $notice = <<"END_OF_NOTICE";

This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see http://creativecommons.org/licenses/by-sa/3.0/.
The author's present email is sderose at acm.org.
This software was last updated on $version. For the most recent version,
see http://www.derose.net/steve/utilities/.
END_OF_NOTICE
    print $notice;
}