#!/usr/bin/perl -w # # chr: Show char for a given code point number. # # 2007-10-29: Written by Steven J. DeRose. # 2008-01-02 sjd: Add -n. Fix name access for C1 range. # 2008-09-03 sjd: Move to BSD. # 2010-01-06 sjd: Add charnames for Unicode. # 2010-05-03 sjd: perldoc. Start fixing base recognition. # 2011-06-22 sjd: Fix bug handling unknown Unicode char names. # 2011-06-29 sjd: Eliminate -multiple option. Fix decimal input. # 2011-08-04 sjd: Support -cp1252. Fix oct() usage. Check Unicode max. # 2012-01-27 sjd: Keep shifting data to use XmlTuples. # 2012-02-28 sjd: Last of XmlTuples integration. Recognize C0 abbrs. # 2012-07-27 sjd: Trap bad unicode char in isURIchar(). # 2012-08-13f sjd: Better message if arg isn't numeric. Do URI form. # Add HTML::Entities. Clean up display. # 2013-01-14 sjd: Add Unicode script and block. # 2013-06-19: Add Unicode equivalents for CP1252 chars. # 2013-08-19: Add utf-8 input. # # To do: # Add -iencoding, -mac. Combine, and with -cp1252. # Support input of UTF-8 byte sequences? # Make XmlTuples optional. # Option to display all Unicode properties? # use strict; use Getopt::Long; use Encode; use charnames ':full'; use Unicode::UCD 'charscript'; use Unicode::UCD 'charblock'; use HTML::Entities; #use Encode::Escape; #::Unicode; use sjdUtils; our $VERSION = "2013-06-19"; ############################################################################### # cf sjdUtils # sub XisUnicodeCodePoint { my ($n) = @_; ($n < 0 || $n > 0x10FFFF || $n == 0x00FFFE || $n == 0x00FFFF || ($n >= 0x80 && $n < 0xa0) ) && return(0); return(1); } sub Xtry_module { # Also available in sjdUtils.pm my ($mod, $quiet) = @_; eval("use $mod"); if ($@) { ($quiet) || warn "try_module: Couldn't find Perl module '$mod'\n"; return(0); } return(1); } sub pline { my ($label, $data) = @_; printf(" %-16s %s\n", $label, $data || ""); } ############################################################################### # Options # my $cp1252 = 0; my $iencoding = ""; my $long = 0; my $quiet = 0; my $utf8 = 0; my $verbose = 0; my %getoptHash = ( "cp1252!" => \$cp1252, "h|help|?" => sub { system "perldoc $0"; exit; }, "iencoding=s" => \$iencoding, "listEncodings" => sub { warn "\nEncodings available:\n"; my $last = ""; my $buf = ""; for my $k (Encode->encodings(":all")) { my $cur = substr($k,0,2); if ($cur ne $last) { warn "$buf\n"; $last = $cur; $buf = ""; } $buf .= "$k "; } warn "$buf\n"; exit; }, "long!" => \$long, "q|quiet!" => \$quiet, "utf8!" => \$utf8, "v|verbose+" => \$verbose, "version" => sub { die "Version of $VERSION, by Steven J. DeRose.\n"; } ); Getopt::Long::Configure ("ignore_case"); GetOptions(%getoptHash) || die("Bad options.\n"); ($ARGV[0]) || die "Must have a non-negative numeric argument.\n"; ############################################################################### # Process options # my $gotXSV = try_module("XmlTuples"); if (!$gotXSV) { warn "Can't find Perl module 'XmlTuples'. Some options won't work."; } my @C0names = @{getC0Names()}; my @C0longNames = @{getC0LongNames()}; my $c0HashRef = my $c1HashRef = my $macHashRef = undef; if ($gotXSV) { my $xt = new XmlTuples(getC0Data()); $c0HashRef = $xt->getAllAsHash("Hex"); ($verbose) && warn "c0hashRef: $c0HashRef.\n"; $xt = new XmlTuples(getC1Data()); $c1HashRef = $xt->getAllAsHash("Hex"); $xt = new XmlTuples(getMacRomanData()); $macHashRef = $xt->getAllAsHash("Hex"); } ############################################################################### ############################################################################### # MAIN # while (my $arg = shift) { if ($utf8) { if ($arg !~ m/^0?x([0-9a-f][0-9a-f])+$/) { warn "Bad UTF8 value '$arg'. Must be given as 0x....\n"; next; } (my $hex = $arg) =~ s/^0?x//i; $hex =~ s/(..)/\\x$1/g; my $utfString = eval("\"$hex\""); my $str = decode("utf8", "$utfString"); printf("utf-8 %s => %d Unicode character(s).\n", $arg, length($str)); for (my $i=0; $i{$hex}}; if (%c0info) { if ($long) { pline("C0 control:", $c0info{"Descr"}); } else { pline("C0 mnemonic:", $c0info{"Short"}); } } else { pline("WARNING:", "Can't find info on character $n (0x$hex)."); } } elsif ($n == 32) { if ($long) { pline("SPACE:", $C0longNames[$n]); } else { pline("SPACE:", $C0names[$n]); } } elsif ($n < 128) { ($verbose) && pline("G0 graphic character"); pline("G0 literal:", $u); } elsif ($n < 161) { ($verbose) && pline("C1 control character"); my %c1info = %{$c1HashRef->{sprintf("%02X",$n)}}; if ($cp1252 || !$quiet) { pline("CP1252 DANGER:", $c1info{"cp1252Short"}); pline(" Unicode equiv:", "U+" . $c1info{"cp1252Equiv"}); my $uname = charnames::viacode(hex("0x".$c1info{"cp1252Equiv"})); pline(" Unicode name:", $uname || "-NONE-"); } pline("Latin-1 control:", $c1info{"Short"}); ($long) && pline(" Description:", $c1info{"Descr"}); } elsif ($n < 256) { ($verbose) && pline("G1 graphic character"); pline(" G1 literal:", $u); } showUnicodeInfo($n); # URI escaping pline("URI form:", sjdUtils::getUTF8($n)); my $entName = HTML::Entities::encode_entities($u); if ($entName =~ m/^&#/) { $entName = "-NO HTML NAMED ENTITY-"; } my $xform = (sjdUtils::isXmlChar($u)) ? sprintf("&#%d; &#x%x; %s\n", $n, $n, $entName) : "Not an XML character"; pline("XML forms:", $xform); } # doOneChar sub showUnicodeInfo { my ($n) = @_; if (!isUnicodeCodePoint($n)) { pline("WARNING:", "Not a Unicode code point"); } else { pline("Unicode Name:", charnames::viacode($n) || "-NOT FOUND-"); pline("Unicode Script: ", charscript(sprintf("U+%04x", $n))); pline("Unicode Block: ", charblock(sprintf("U+%04x", $n))); } my $pnum = $n >> 16; my $pname = ""; if ($pnum == 16) { $pname = "Supplementary Private Use Area B"; } elsif ($pnum == 15) { $pname = "Supplementary Private Use Area A"; } elsif ($pnum == 14) { $pname = "Supplementary Special-purpose"; } elsif ($pnum >= 3) { $pname = "Unassigned"; } elsif ($pnum == 2) { $pname = "Supplementary Ideographic"; } elsif ($pnum == 1) { $pname = "Supplementary Multilingual"; } elsif ($pnum == 0) { $pname = "Basic Multilingual"; } else { $pname = "-UNKNOWN-"; } pline("Unicode Plane:" , $pnum . ": " . $pname); if ($n == 0xEFBFBD) { pline("WARNING:", "UTF8 of U+FFFD (Replacement Character)?"); } } # showUnicodeInfo # Find a code point given its short name; mainly for control characters. # For example, "DC1" => x11. sub lookupAbbr { my ($ab) = @_; for my $k (keys %{$c0HashRef}) { my $dataRef = $c0HashRef->{$k}; if ($dataRef->{Short} eq $ab) { return(hex("0x" . $dataRef->{"Hex"})); } } return(undef); } sub isURIchar { my ($c) = @_; my $rc = 0; my $expr = "\$c =~ m/[-+A-Z_a-z0-9!\\\$&\'()*.\\\/:;=?\\\@]/"; $rc = eval($expr); if ($@ ne "") { warn "isURIchar: Bad Unicode character '$c'?\n" . " Eval('$expr') said:\n $@\n"; return(0); } return($rc); } sub getBases { my ($n) = @_; return(sprintf("o%04o d%04d x%04x; U+%04x, utf-8 %s", $n, $n, $n, $n, sjdUtils::getUTF8($n, "\\x"))); } ############################################################################### ############################################################################### ############################################################################### # Create arrays of names for the C0 and C1 control characters # (See chart at bottom) # sub getC0Names { return([ "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL", "BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI", "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB", "CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US", "SPACE"]); } sub getC0LongNames { return([ "Null", "Start Of Heading", "Start Of Text", "End Of Text", "End Of Transmission", "Enquiry", "Acknowledge", "Bell", "Backspace", "Horizontal Tab", "Newline", "Vertical Tab", "Form Feed", "Carriage Return", "Shift Out", "Shift In", "Data Link Escape", "Device Control 1", "Device Control 2", "Device Control 3", "Device Control 4", "Negative Acknowledge", "Synchronous Idle", "End Of Transmission Block", "Cancel", "End Of Medium", "Substitute", "Escape", "Field Separator", "Group Separator", "Record Separator", "Unit Separator", "Space" ]); } # See http://en.wikipedia.org/wiki/Mac_OS_Roman # sub getMacRomanData { return(qq{ }); } # macRomanData sub getC0Data { return(qq@ @); } # getC0Data ############################################################################### # PAD, HOP, and SGCI are listed as "XXX" in Unicode (acc. Wikipedia). # sub getC1Data { return(qq{ }); } # getC1Data ############################################################################### ############################################################################### ############################################################################### # =pod =head1 Usage chr [options] [nums] Display information about the character(s) corresponding to the code point(s) number(s) in I. For example, "chr 0x2203" produces: 0x2203: Bases: o21003 d8707 x2203; U+2203, utf-8 \xe2\x88\x83 Unicode Name: THERE EXISTS Unicode Script: Common Unicode Block: Mathematical Operators Unicode Plane: 0: Basic Multilingual URI form: %e2%88%83 XML forms: ∃ ∃ ∃ I may be in hex (0x...), octal (0...), binary (0b...), or decimal. With the I<-utf8> option, you can give I as hex UTF8. Control characters and spaces will be displayed as mnemonics. Tries to get the full Unicode char name for chars >255. Display of other characters depends on your terminal program. =head1 Options (prefix 'no' to invert when applicable) =over =item B<-cp1252> Show Windows Code Page 1252 meanings of characters d128-d159 (the rest of CP1252 matches Latin-1). =item * B<-iencoding> I Assume the output is in character set I. Not yet supported. See also I<-listEncodings>, I<-cp1252>. =item * B<-listEncodings> Show all the encodings supported by I<-iencoding>, and exit. =item B<-long> Give long names for control characters, instead of mnemonics. =item B<-q> Suppress most messages. =item B<-utf8> Interpret the command-line numbers as hexadecimal representations of UTF8 (the most common representation for Unicode). For example, the C character is Unicode code point U+201C (or o20034, d8220, x201c). In UTF8, all characters > 127 are encoded as multiple bytes, in this case the 3 bytes sequences \xe2, \x80, \x9c. To use C to identify this sequence, do: chr -utf8 0xe2809c You can enter a UTF hex sequence that represents more than one character. C will find the boundaries, and describe each character in turn. =item B<-v> Add more detailed messages. =item B<-version> Display version info and exit. =back =head1 Known bugs and limitations Most terminal programs assume Latin-1 or even CP1252, while Perl most readily writes out Unicode (as utf8). So displayed output may be wrong. =head1 Related commands C -- Do the reverse. C -- Find and/or count characters in particular ranges, including XML character references, URI escapes, etc. C -- CPAN package to deal with Unicode properties and names. C -- takes numbers in any of several forms, and shows them in multiple forms, much like this script also does with I<-nums>. C -- Parse various internal data about characters (optional). =head1 Ownership This work by Steven J. DeRose is licensed under a Creative Commons Attribution-Share Alike 3.0 Unported License. For further information on this license, see L. The author's present email is sderose at acm.org. For the most recent version, see L. =cut