#!/usr/bin/perl -w
#
# getCharsByScript
#
# Written by Steven J. DeRose, 2013-01-15. sderose@acm.org.
#
# To do:
# Option to treat script as contiguous even if unknowns intervene?
# Save the long stretch of unknowns:
# From="U+02fa1e" To="U+0e0000" N="722403"
# File unknowns under block name (e.g. for math)
# Option to display block name containing each range (ever >1?)
# Extend to do by block, by class (LC_LETTER, etc),....
#
use strict;
use Getopt::Long;
use Encode;
use Unicode::UCD 'charscript'; # Unicode property access
use Unicode::UCD 'charblock'; # Unicode property access
our $VERSION = "2013-01-15";
# General options
#
my $color = ($ENV{"USE_COLOR"} && -t STDERR) ? 1:0;
my $max = 0x10FFFF;
my $min = 0;
my $nil = 0;
my $quiet = 0;
my $script = "";
my $verbose = 0;
###############################################################################
# Process options
#
my %getoptHash = (
"color!" => \$color,
"max=o" => \$max,
"min=o" => \$min,
"nil!" => \$nil,
"q|quiet!" => \$quiet,
"script=s" => \$script,
"v|verbose+" => \$verbose,
"version" => sub {
die "Version of $VERSION, by Steven J. DeRose.";
},
);
Getopt::Long::Configure ("ignore_case");
GetOptions(%getoptHash) || die("Bad options.");
###############################################################################
###############################################################################
# Main
#
no warnings "utf8";
my %byScript = (); # Who's from each Unicode Script?
my %byBlock = (); # Who's from each Unicode Block?
my $nonUnicode = 0;
my $badScript = 0;
my $unkScript = 0;
my $nRanges = 0;
my $maxRangeSize = 0;
my %rangesByScript = ();
my %charsByScript = ();
print "\n\n";
# Gather the raw data
#
my $lastScript = "";
my $curScript = "";
my $start = -1;
my $run = 0;
for (my $i=$min; $i<=$max; $i++) {
my $c = chr($i);
if (!defined $c) {
$nonUnicode++;
next;
}
my $curScript = charscript(sprintf("U+%04x", $i));
if (!$curScript) {
$badScript++;
$curScript = "???";
}
elsif ($curScript eq "Unknown") {
$unkScript++;
}
if ($curScript eq $lastScript) {
$run++;
}
else {
$nRanges++;
my $rangeSize = $i-$start;
$rangesByScript{$lastScript}++;
$charsByScript{$lastScript} += $rangeSize;
my $printIt = 1;
if ($lastScript eq "") { $printIt = 0; }
if ($lastScript eq "Unknown" && !$nil) { $printIt = 0; }
if ($lastScript eq "???" && !$nil) { $printIt = 0; }
if ($script && $lastScript !~ m/$script/i) { $printIt = 0; }
if ($printIt) {
print sprintf(
"\n",
'"'.$lastScript.'"', $start, $i-1, $rangeSize);
if ($rangeSize > $maxRangeSize) { $maxRangeSize = $rangeSize; }
}
$start = $i;
$run = 1;
$lastScript = $curScript;
}
} # for
print "\n\n";
msgLine("Summary");
msgLine("Total code points checked", $max-$min+1);
msgLine("Non-Unicode characters", $nonUnicode);
msgLine("Chars in no script", $badScript);
msgLine("Chars in unknown script", $unkScript);
msgLine("Number of ranges", $nRanges);
msgLine("Longest range", $maxRangeSize);
msgLine("Ranges per script");
msgLine(" Script NRanges NChars");
for my $s (sort keys %rangesByScript) {
warn sprintf(" %-24s %4d %6d\n",
$s, $rangesByScript{$s}, $charsByScript{$s});
}
exit;
###############################################################################
###############################################################################
#
sub msgLine {
my ($label, $num) = @_;
if (not defined $num) {
warn "\n$label:\n";
}
else {
warn sprintf("%-30s %8d\n", $label, $num);
}
}
###############################################################################
###############################################################################
###############################################################################
#
=pod
=head1 Usage
getCharsByScript
Run through a range of Unicode code points,
and gather them into groups by script.
Then provides a list of the range(s) covered by each script, in XSV (qv).
See I<-script> to select only a certain script(s).
Not all legit characters are in a script -- consider math symbols.
B: There's not much after U+2FA1D:
Script="Han" From="U+02f800" To="U+02fa1d" N=" 542"
Script="???" From="U+02fa1e" To="U+0e0000" N="722403"
Script="Common" From="U+0e0001" To="U+0e0001" N=" 1"
Script="???" From="U+0e0002" To="U+0e001f" N=" 30"
Script="Common" From="U+0e0020" To="U+0e007f" N=" 96"
Script="???" From="U+0e0080" To="U+0e00ff" N=" 128"
Script="Inherited" From="U+0e0100" To="U+0e01ef" N=" 240"
You can C the output naively to get each script's ranges together.
=head1 Options
=over
=item * B<-max> I
End with code point I (can be decimal, octal, or hex).
=item * B<-min> I
Start with code point I (can be decimal, octal, or hex).
=item * B<-nil>
Include entries for characters in I script.
=item * B<-q>
Suppress most messages.
=item * B<-script> I
Only report the ranges for scripts that match I
(ignoring case).
It's a regex mainly so you can avoid spelling details....
=item * B<-v>
Add more detailed messages (repeatable).
=item * B<-version>
Show version information and exit.
=back
=head1 Related commands
=head1 Known bugs and limitations
=head1 Ownership
This script was formerly known as 'nonascii'.
This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see L.
The author's present email is sderose at acm.org.
For the most recent version, see L.
=cut