#!/usr/bin/perl -w # # unicode2names: Find all the Unicode characters and turn them into entities. # # 2006-12-18: Written by Steven J. DeRose. use strict; our $VERSION = "2012-09-12"; my $quiet = 0; my $ordLimit = 127; my $entDir = "/pmc/load/converter3/dtd/nlm/books/ncbi-book/entities-pi"; my @entityNameList = (); my $entFormat = "%05x"; # Process options while ($ARGV[0]) { if (index($ARGV[0],"--")==0) { $ARGV[0] = substr($ARGV[0],1); } if ($ARGV[0] eq "-q") { $quiet = 1; } elsif ($ARGV[0] eq "-ascii") { $ordLimit = 127; } elsif ($ARGV[0] eq "-latin1") { $ordLimit = 255; } elsif ($ARGV[0] eq "-entformat") { die "-entformat option not yet supported.\n"; shift; } elsif ($ARGV[0] eq "-version") { warn "Version of $VERSION, by Steven J. DeRose.\n"; exit; } elsif (substr($ARGV[0],0,1) eq "-") { ($ARGV[0] eq '-h' or $ARGV[0] eq '-help') || print "Unknown option '$ARGV[0]'.\n"; system "perldoc $0"; exit; } else { last; } shift; } # options my $file = $ARGV[0]; (-e $file) || die "Couldn't find file '$file'.\n"; ############################################################################### # Set up mapping from Unicode code points to XML entity names makeEntityNameList(); if ($verbose) { print "\nTable of entity names:\n"; for (my $i=0; $i) { while ($rec =~ m/&\#[xX]?[0-9a-fA-F];/) { # Handle numeric entity references here } for (my $i=0; $i&#x$chex;"; } } } if ($errCount > 0) { warn "\nTotal errors: $errCount.\n"; } exit; ############################################################################### # Extract a list of entity names and numeric values, to make an array where # we'll look up names by value. 
#
# NOTE(review), main script (everything from the shebang through 'exit;'):
#   * Options handled: -q (sets $quiet), -ascii ($ordLimit = 127),
#     -latin1 ($ordLimit = 255), -entformat (dies as unsupported),
#     -version (prints $VERSION and exits), -h / -help (runs perldoc on $0);
#     any other '-' option prints "Unknown option" before running perldoc.
#     A leading "--" is reduced to "-" before matching. The first argument
#     that is not an option is taken as the input file.
#   * The whole script shares one physical line with the shebang, so Perl
#     would read it as a comment -- presumably the line breaks were lost;
#     TODO restore them from the original file.
#   * 'for (my $i=0; $i) {' and '$i&#x$chex;"' are not well-formed Perl;
#     text (loop bounds, the input-file read loop, the character conversion)
#     appears to be missing here -- TODO recover before relying on this code.
#   * $verbose, $rec, $chex and $errCount are used but no declaration is
#     visible on this line, although 'use strict' is in effect
#     (the 'my $errCount' inside makeEntityNameList is local to that sub).
#   * In the -entformat branch the 'shift' after 'die' cannot be reached.
#   * 'while ($ARGV[0])' stops at the first false argument (e.g. "0"), and
#     '-e $file' is evaluated on an undefined value when no file is given.
#   * The 'while ($rec =~ m/.../)' loop has an empty body as shown, so it
#     would never terminate once the pattern matched -- confirm whether its
#     body was lost.
#
sub makeEntityNameList { ($quiet) || warn "Looking for entity declarations in '$entDir'.\n"; (-d $entDir) || die "Can't find entity declarations at $entDir.\n"; system "ls $entDir"; my $loadCount = 0; my $errCount = 0; my $dupCount = 0; $cmd = "ls $entDir\/\*\.ent"; #warn "running: '$cmd'.\n"; my @entFiles = `$cmd`; if (scalar @entFiles <=0) { die "No .ent files found in $entDir.\n"; } foreach my $f (@entFiles) { chomp $f; open IN, "<$f"; $inComment = 0; while ($entdcl = ) { chomp $entdcl; # Handle full and partial comments $entDcl =~ s///; if ($entDcl =~ s///) { $inComment = 0; } else { next; } } ($entName = $entDcl) =~ s/(\s*