#!/usr/bin/perl -w
#
# ord: By Steven J. DeRose, 2007-10, sderose@acm.org.
#
# 2007-11-22 sjd: Accept control-char mnemonics as input. Getopt.
#     Add binary and long-name output.
# 2008-02-14 sjd: Multiple input chars. setupCharacterNames(). Unify $fmt.
#     Add longNames for G0 and G1. Add -go, -g1. perl -w.
# 2008-09-03 sjd: Move to BSD. Improve doc.
# 2008-09-16 sjd: Better handling of Unicode input.
# 2010-01-06 sjd: Use 'charnames' to know Unicode names. Add -binary.
#     Make print utf-8 and actual Unicode character. Format binary better.
# 2010-05-03 sjd: perldoc. Unify formatting. Add Unix Jargon names, rest of
#     short names. Make user use "_" in.
# 2011-08-23 sjd: Add options to control each display form separately.
#     Start -cp1252.
# 2011-12-11 sjd: Add utf-8 output. Opt out of longNames (lists of
#     char names -- now using viacode instead).
# 2012-01-10 sjd: Cleanup. Lose internal 'longNames' lists.
# 2012-08-15 sjd: sjdUtils, and use getUTF8().
# 2013-06-17ff sjd: Add -entities, esp. HTML named ones. Add showUnicodeInfo().
#
# To do:
#     Way to print remaining Unicode char properties
#     Maybe integrate w/ chr?
#     Recognize partial char names? Search for all matches?
#     Switch remaining data into xmlTuples.
#
# NOTE(review): this copy of the script had lost its line breaks and a span
# of code (the body of the main loop, showChart(), and the head of
# getBinary()). Those parts are reconstructed below from the surviving code
# and the POD, and are marked "NOTE(review)"; compare with a pristine copy
# if one is available.
#
use strict;
use Getopt::Long;
use charnames ':full';
use Unicode::UCD 'charscript';
use Unicode::UCD 'charblock';
use HTML::Entities;
use Encode;
use sjdUtils;

our $VERSION = "2013-06-19";

# NOTE(review): showUnicodeInfo() calls isUnicodeCodePoint(), which is not
# defined in the surviving text. It may be exported by sjdUtils; if nothing
# has defined it by now, install a minimal fallback so the script still runs.
BEGIN {
    if (!defined &isUnicodeCodePoint) {
        *isUnicodeCodePoint = sub {
            my ($n) = @_;
            return(defined $n && $n =~ m/^\d+$/ && $n <= 0x10FFFF);
        };
    }
}

# Short-name tables, indexed by offset from the start of each range.
my @C0names = ();
my @G0names = ();
my @C1names = ();
my @G1names = ();   # Never filled in; getShortName() falls back to viacode.
my %uj      = ();   # Literal character => *nix jargon names.

setupShortCharacterNames();
setupUnixJargon();

# Option defaults (see the POD for meanings).
my $binary   = 0;
my $chart    = 0;
my $cp1252   = 0;
my $decimal  = 1;
my $entities = 1;
my $C0       = 0;
my $C1       = 0;
my $G0       = 0;
my $G1       = 0;
my $hex      = 1;
my $jargon   = 1;
my $literal  = 1;
my $long     = 1;
my $octal    = 1;
my $quiet    = 0;
my $short    = 0;
my $utf8     = 1;
my $verbose  = 0;


###############################################################################
# Process options
#
Getopt::Long::Configure("ignore_case");
my $result = GetOptions(
    "binary!"    => \$binary,
    "c|chart"    => \$chart,
    "cp1252!"    => \$cp1252,
    "c0"         => \$C0,
    "c1"         => \$C1,
    "decimal!"   => \$decimal,
    "entities!"  => \$entities,
    "g0"         => \$G0,
    "g1"         => \$G1,
    # List-form system() keeps $0 away from the shell; exit after showing help.
    "h|help|?"   => sub { system("perldoc", $0); exit; },
    "hex!"       => \$hex,
    "jargon!"    => \$jargon,
    "literal!"   => \$literal,
    "long!"      => \$long,
    "octal!"     => \$octal,
    "q|quiet!"   => \$quiet,
    "short!"     => \$short,
    "utf8!"      => \$utf8,
    "v|verbose+" => \$verbose,
    "version"    => sub {
        warn "Version of $VERSION, by Steven J. DeRose.\n";
        exit;
    },
);

($result) || die "Bad options.\n";

# Set the output encoding before anything (charts included) prints literals.
binmode(STDOUT, ":utf8");


###############################################################################
# Chart requests: print the chart(s) and stop.
#
if ($C0) { showChart("c0"); exit; }
if ($G0) { showChart("g0"); exit; }
if ($C1) { showChart("c1"); exit; }
if ($G1) { showChart("g1"); exit; }
if ($chart) {
    showChart("c0");
    showChart("g0");
    showChart("c1");
    showChart("g1");
    exit;
}


###############################################################################
# (Data also available in tupleSets/cp1252.xsv)
# Maps cp1252 bytes 0x80-0x9F to Unicode code points. Bytes with no entry
# here (0x81, 0x8D, 0x8F, 0x90, 0x9D) have no mapping in this table.
#
my %cp1252 = (
    0x80 => 0x20AC, # EURO SIGN
    0x82 => 0x201A, # SINGLE LOW-9 QUOTATION MARK
    0x83 => 0x0192, # LATIN SMALL LETTER F WITH HOOK
    0x84 => 0x201E, # DOUBLE LOW-9 QUOTATION MARK
    0x85 => 0x2026, # HORIZONTAL ELLIPSIS
    0x86 => 0x2020, # DAGGER
    0x87 => 0x2021, # DOUBLE DAGGER
    0x88 => 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89 => 0x2030, # PER MILLE SIGN
    0x8A => 0x0160, # LATIN CAPITAL LETTER S WITH CARON
    0x8B => 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8C => 0x0152, # LATIN CAPITAL LIGATURE OE
    0x8E => 0x017D, # LATIN CAPITAL LETTER Z WITH CARON
    0x91 => 0x2018, # LEFT SINGLE QUOTATION MARK
    0x92 => 0x2019, # RIGHT SINGLE QUOTATION MARK
    0x93 => 0x201C, # LEFT DOUBLE QUOTATION MARK
    0x94 => 0x201D, # RIGHT DOUBLE QUOTATION MARK
    0x95 => 0x2022, # BULLET
    0x96 => 0x2013, # EN DASH
    0x97 => 0x2014, # EM DASH
    0x98 => 0x02DC, # SMALL TILDE
    0x99 => 0x2122, # TRADE MARK SIGN
    0x9A => 0x0161, # LATIN SMALL LETTER S WITH CARON
    0x9B => 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9C => 0x0153, # LATIN SMALL LIGATURE OE
    0x9E => 0x017E, # LATIN SMALL LETTER Z WITH CARON
    0x9F => 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
);


###############################################################################
# Main: identify and display each argument in turn.
#
# NOTE(review): only the first few statements of this loop survived; the
# rest is reconstructed from the POD (code points, control mnemonics, full
# Unicode names with "_" for space, jargon names, and "print the value
# anyway" for multi-byte characters).
#
my $nShown = 0;
# Test definedness, not truth, so that the argument "0" is not mistaken for
# the end of the argument list.
while (defined(my $name = shift)) {
    ($name ne "") || die
        "No character or control-character mnemonic found (see -h for" .
        " information on hard-to-type characters).\n";

    my $n = findCodePoint($name);
    if (!defined $n) {
        warn "ord: Cannot identify character '$name'.\n";
        next;
    }
    # Blank line between successive characters.
    if ($nShown > 0 && !$quiet) { print "\n"; }
    $nShown++;
    showOne($n);
}

exit;


###############################################################################
# Turn one command-line argument into a code point number.
#
# NOTE(review): reconstructed (see the note on the main loop).
#
# Tried in order: a single literal character; a numeric form (\xHH, 0xHH,
# U+HHHH); a C0 mnemonic; a C1 mnemonic; a local G0 name; a full Unicode
# name; and finally the argument taken as the UTF-8 bytes of one character.
# Names are matched ignoring case, with "_" standing for space.
#
# Returns the code point, or nothing if the argument cannot be identified.
#
sub findCodePoint {
    my ($arg) = @_;

    if (length($arg) == 1) { return(ord($arg)); }

    if ($arg =~ m/^(?:\\x|0x|U\+)([0-9a-f]{1,6})$/i) { return(hex($1)); }

    my $name = uc($arg);
    $name =~ tr/_/ /;

    # C0 name?
    for (my $i=0; $i<scalar(@C0names); $i++) {
        if ($name eq $C0names[$i]) { return($i); }
    }
    # C1 name?
    for (my $i=0; $i<scalar(@C1names); $i++) {
        if ($name eq $C1names[$i]) { return(128 + $i); }
    }
    # Local (jargon-flavored) G0 name?
    for (my $i=0; $i<scalar(@G0names); $i++) {
        if ($name eq $G0names[$i]) { return(32 + $i); }
    }

    # Full Unicode name?
    my $cp = charnames::vianame($name);
    if (defined $cp) { return($cp); }

    # A character above 127 arrives from the shell as several UTF-8 bytes,
    # so it looks like a (not found) name; decode it and use the character.
    my $decoded = eval {
        Encode::decode("UTF-8", $arg, Encode::FB_CROAK | Encode::LEAVE_SRC);
    };
    if (defined $decoded && length($decoded) == 1) {
        ($verbose) && warn "ord: Treating '$arg' as one UTF-8 character.\n";
        return(ord($decoded));
    }
    return;
}


###############################################################################
# Print a chart of one range: "c0" (d0-d31), "g0" (d32-d127),
# "c1" (d128-d159), or "g1" (d160-d255).
#
# NOTE(review): reconstructed; the original definition was lost, so the
# exact layout of the original chart is unknown.
#
sub showChart {
    my ($which) = @_;
    my %rangeOf = (
        "c0" => [   0,  31 ],
        "g0" => [  32, 127 ],
        "c1" => [ 128, 159 ],
        "g1" => [ 160, 255 ],
    );
    my $range = $rangeOf{lc($which)};
    (defined $range) || die "ord: Unknown chart '$which'.\n";

    printf("%s (d%d-d%d):\n", uc($which), $range->[0], $range->[1]);
    for my $n ($range->[0] .. $range->[1]) {
        # Only graphic characters are shown literally (DEL is a control).
        my $isGraphic = (lc($which) =~ m/^g/ && $n != 127);
        printf("    d%03d  x%02x  o%03o  %-2s %s\n",
            $n, $n, $n, ($isGraphic ? chr($n) : ""), getShortName($n));
    }
}


###############################################################################
# Format a number in binary, as space-separated bytes, each written as two
# nibbles joined by "_" (for example, 0x2022 => "0010_0000 0010_0010").
#
# NOTE(review): the head of this sub was lost; only the loop body and the
# tail survived. Zero is rendered as "0000_0000" rather than as an empty
# string.
#
sub getBinary {
    my ($n) = @_;
    my $rc = "";
    do {
        my $n0 = $n & 0x0F;
        $n = $n>>4;
        my $n1 = $n & 0x0F;
        $n = $n>>4;
        $rc = sprintf("%04b_%04b %s", $n1, $n0, $rc);
    } while ($n > 0);
    $rc =~ s/\s+$//;
    return($rc);
}


###############################################################################
# Display just one character, with unified output formatting.
#
# $n is the code as given by the user. With -cp1252, a value in 128..159 that
# has a cp1252 mapping is reported (names, U+, UTF-8, entities, literal) as
# the Unicode character it maps to; the "Bases" line always shows $n itself.
#
# Note: The widths aren't right for big unicode stuff.
#
sub showOne {
    my ($n) = @_;

    # $cp is the Unicode code point to describe.
    my $cp = $n;
    if ($cp1252 && $n>=128 && $n<160) {
        my $mapped = cp1252ToUnicode($n);
        if (defined $mapped) { $cp = $mapped; }
    }

    if ($short) {
        pline(sprintf(" %6s", "'" . getShortName($cp) . "'"));
    }

    my $bases = "";
    if ($binary)  { $bases .= getBinary($n) . " "; }
    if ($octal)   { $bases .= sprintf("o%04o ",$n); }
    if ($decimal) { $bases .= sprintf("d%04d ",$n); }
    if ($hex)     { $bases .= sprintf("x%04x ",$n); }
    if ($bases)   { pline("Bases:", $bases); }

    if ($utf8) {
        my $utf = sjdUtils::getUTF8($cp, "\\x");
        # U+ and the UTF-8 bytes must describe the same character.
        pline("Unicode:", sprintf("U+%04x, utf8 %s", $cp, $utf));
    }
    if ($entities) {
        my $entName = HTML::Entities::encode_entities(chr($cp));
        # Anything other than "&name;" (a numeric reference, or a plain
        # character returned unchanged) means there is no named entity.
        if ($entName !~ m/^&[A-Za-z][A-Za-z0-9]*;$/) {
            $entName = "-NO HTML NAMED ENTITY-";
        }
        pline("Entities:", sprintf("&#%d; &#x%x; %s", $cp, $cp, $entName));
    }
    if ($long) {
        showUnicodeInfo($cp);
    }
    if ($literal) {
        pline("Literal:", chr($cp));
    }
    if ($jargon && defined $uj{chr($cp)}) {
        pline("Unix jargon:", $uj{chr($cp)});
    }
} # showOne


###############################################################################
# Print one output line: a label padded to 16 columns, then the data.
# Undefined data prints as empty; "0" is printed as-is.
#
sub pline {
    my ($label, $data) = @_;
    printf(" %-16s %s\n", $label, (defined $data) ? $data : "");
}


###############################################################################
# Print the Unicode name, script, block, and plane for code point $n.
#
sub showUnicodeInfo {
    my ($n) = @_;
    if (!isUnicodeCodePoint($n)) {
        pline("WARNING:", "Not a Unicode code point");
    }
    else {
        pline("Unicode Name:", charnames::viacode($n) || "-NOT FOUND-");
        pline("Unicode Script: ", charscript(sprintf("U+%04x", $n)));
        pline("Unicode Block: ", charblock(sprintf("U+%04x", $n)));
    }

    # Each plane holds 0x10000 code points.
    my $pnum = $n >> 16;
    my $pname = "";
    if    ($pnum == 16) { $pname = "Supplementary Private Use Area B"; }
    elsif ($pnum == 15) { $pname = "Supplementary Private Use Area A"; }
    elsif ($pnum == 14) { $pname = "Supplementary Special-purpose"; }
    elsif ($pnum >= 4)  { $pname = "Unassigned"; }
    elsif ($pnum == 3)  { $pname = "Tertiary Ideographic"; }
    elsif ($pnum == 2)  { $pname = "Supplementary Ideographic"; }
    elsif ($pnum == 1)  { $pname = "Supplementary Multilingual"; }
    elsif ($pnum == 0)  { $pname = "Basic Multilingual"; }
    else                { $pname = "-UNKNOWN-"; }
    pline("Unicode Plane:" , $pnum . ": " . $pname);

    # 0xEFBFBD is the three UTF-8 bytes of U+FFFD read as a single number.
    if ($n == 0xEFBFBD) {
        pline("WARNING:", "UTF8 of U+FFFD (Replacement Character)?");
    }
} # showUnicodeInfo


###############################################################################
# Return a short name for code point $n: the local table entry where there
# is one, otherwise the Unicode name from charnames, otherwise "???".
#
sub getShortName {
    my ($n) = @_;
    my $name;
    if    ($n <  32) { $name = $C0names[$n]; }
    elsif ($n < 128) { $name = $G0names[$n-32]; }
    elsif ($n < 160) { $name = $C1names[$n-128]; }
    elsif ($n < 256) { $name = $G1names[$n-160]; }

    # @G1names is never filled in, and nothing above 255 has a local table.
    if (!defined $name || $name eq "") {
        $name = charnames::viacode($n) || "???";
    }
    return($name);
}


###############################################################################
# Define local names, in case we want shorter forms than viacode gives.
#
sub setupShortCharacterNames {
    @C0names = (
        "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
        "BS",  "HT",  "LF",  "VT",  "FF",  "CR",  "SO",  "SI",
        "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
        "CAN", "EM",  "SUB", "ESC", "FS",  "GS",  "RS",  "US",
        "SP");

    @C1names = (
        "PAD", "HOP",  "BPH", "NBH", "IND", "NEL", "SSA", "ESA",
        "HTS", "HTJ",  "VTS", "PLD", "PLU", "RI",  "SS2", "SS3",
        "DCS", "PU1",  "PU2", "STS", "CCH", "MW",  "SPA", "EPA",
        "SOS", "SGCI", "SCI", "CSI", "ST",  "OSC", "PM",  "APC",
        "NBS");

    @G0names = ( # Generally from *nix jargon file
        # 0x20
        "SPACE",
        "BANG",
        "QUOTATION MARK",
        "HASH",
        "DOLLAR",
        "GRAPES",
        "AMP",
        "POP",
        "LEFT PARENTHESIS",
        "RIGHT PARENTHESIS",
        "ASTERISK",
        "PLUS SIGN",
        "COMMA",
        "HYPHEN-MINUS",
        "FULL STOP",
        "SOLIDUS",
        # 0x30
        "DIGIT ZERO",
        "DIGIT ONE",
        "DIGIT TWO",
        "DIGIT THREE",
        "DIGIT FOUR",
        "DIGIT FIVE",
        "DIGIT SIX",
        "DIGIT SEVEN",
        "DIGIT EIGHT",
        "DIGIT NINE",
        "COLON",
        "SEMICOLON",
        "LESS-THAN SIGN",
        "EQUALS SIGN",
        "GREATER-THAN SIGN",
        "QUESTION MARK",
        # 0x40
        "COMMERCIAL AT",
        "LATIN CAPITAL LETTER A",
        "LATIN CAPITAL LETTER B",
        "LATIN CAPITAL LETTER C",
        "LATIN CAPITAL LETTER D",
        "LATIN CAPITAL LETTER E",
        "LATIN CAPITAL LETTER F",
        "LATIN CAPITAL LETTER G",
        "LATIN CAPITAL LETTER H",
        "LATIN CAPITAL LETTER I",
        "LATIN CAPITAL LETTER J",
        "LATIN CAPITAL LETTER K",
        "LATIN CAPITAL LETTER L",
        "LATIN CAPITAL LETTER M",
        "LATIN CAPITAL LETTER N",
        "LATIN CAPITAL LETTER O",
        # 0x50
        "LATIN CAPITAL LETTER P",
        "LATIN CAPITAL LETTER Q",
        "LATIN CAPITAL LETTER R",
        "LATIN CAPITAL LETTER S",
        "LATIN CAPITAL LETTER T",
        "LATIN CAPITAL LETTER U",
        "LATIN CAPITAL LETTER V",
        "LATIN CAPITAL LETTER W",
        "LATIN CAPITAL LETTER X",
        "LATIN CAPITAL LETTER Y",
        "LATIN CAPITAL LETTER Z",
        "LEFT SQUARE BRACKET",
        "REVERSE SOLIDUS",
        "RIGHT SQUARE BRACKET",
        "CIRCUMFLEX ACCENT",
        "LOW LINE",
        # 0x60
        "GRAVE ACCENT",
        "LATIN SMALL LETTER A",
        "LATIN SMALL LETTER B",
        "LATIN SMALL LETTER C",
        "LATIN SMALL LETTER D",
        "LATIN SMALL LETTER E",
        "LATIN SMALL LETTER F",
        "LATIN SMALL LETTER G",
        "LATIN SMALL LETTER H",
        "LATIN SMALL LETTER I",
        "LATIN SMALL LETTER J",
        "LATIN SMALL LETTER K",
        "LATIN SMALL LETTER L",
        "LATIN SMALL LETTER M",
        "LATIN SMALL LETTER N",
        "LATIN SMALL LETTER O",
        # 0x70
        "LATIN SMALL LETTER P",
        "LATIN SMALL LETTER Q",
        "LATIN SMALL LETTER R",
        "LATIN SMALL LETTER S",
        "LATIN SMALL LETTER T",
        "LATIN SMALL LETTER U",
        "LATIN SMALL LETTER V",
        "LATIN SMALL LETTER W",
        "LATIN SMALL LETTER X",
        "LATIN SMALL LETTER Y",
        "LATIN SMALL LETTER Z",
        "LEFT CURLY BRACKET",
        "VERTICAL LINE",
        "RIGHT CURLY BRACKET",
        "TILDE",
        " DEL DELETE"
    );

    # Sanity check that the table has not slipped by an entry.
    ($G0names[126-32] eq "TILDE") ||
        die "ord: Internal G0 name table screwed up.\n";
} # setupCharacterNames


###############################################################################
# Fill %uj: literal character => names listed for it in the *nix jargon file.
#
sub setupUnixJargon {
    %uj = (
    "!" => "Common: bang; pling; excl; not; shriek; ball-bat. " .
        "Rare: factorial; exclam; smash; cuss; boing; yell; wow; hey; " .
        "wham; eureka; spark-spot; soldier, control",
    "\"" => "Common: double quote; quote. " .
        "Rare: literal mark; double-glitch; snakebite; dirk; " .
        "rabbit-ears; double prime",
    "#" => "Common: number sign; pound; pound sign; hash; " .
        "sharp; crunch; hex; mesh. " .
        "Rare: grid; cross-hatch; octothorpe; flash; pig-pen; " .
        "tic-tac-toe; scratchmark; thud; thump; splat",
    "\$" => "Common: dollar. " .
        "Rare: currency symbol; buck; cash; bling; string (from BASIC); " .
        "escape (when used as the echo of ASCII ESC); ding; cache; big money",
    "%" => "Common: percent; mod; grapes. " .
        "Rare: double-oh-seven",
    "&" => "Common: amp; amper; and, and sign. " .
        "Rare: address (from C); reference (from C++); andpersand; " .
        "bitand; background (from sh(1) ); pretzel",
    "'" => "Common: single quote; quote. " .
        "Rare: prime; glitch; tick; irk; pop; spark;",
    "(" => "Common: l paren; l parenthesis; leftight; open; paren; " .
        "o paren; o parenthesis; l parenthesis; l banana. " .
        "Rare: so; lparen; o round bracket, l round bracket, wax; " .
        "parenthisey; l ear",
    ")" => " Common: r paren; r parenthesis; right; close; the-sis; " .
        "c paren; c parenthesis; r parenthesis; r banana. " .
        "Rare: al-ready; rparen; c round bracket, r round bracket, " .
        "wane; unparenthisey; r ear",
    "*" => "Common: star; splat. " .
        "Rare: wildcard; gear; dingle; mult; spider; aster; " .
        "times; twinkle; glob; Nathan Hale",
    "+" => "Common: add. " .
        "Rare: cross; intersection",
    "," => "" .
        "Rare: tail",
    "-" => "Common: dash. " .
        "Rare: worm; option; dak; bithorpe",
    "." => "Common: dot; point. " .
        "Rare: radix point; full stop; spot",
    "/" => "Common: slash; stroke; forward slash. " .
        "Rare: diagonal; solidus; over; slak; virgule; slat",
    ":" => "Common: . " .
        "Rare: dots; two-spot",
    ";" => "Common: semi. " .
        "Rare: weenie; hybrid, pit-thwong",
    "<" => "Common: bra; l angle; l angle bracket; l broket. " .
        "Rare: from; read from; comes-from; in; crunch; tic; angle",
    ">" => "Common: ket; r angle; r angle bracket; r broket. " .
        "Rare: into, towards; write to; gozinta; out; zap; tac; right angle",
    "=" => "Common: gets; takes. " .
        "Rare: quadrathorpe; half-mesh",
    "?" => "Common: query; ques . " .
        "Rare: quiz; whatmark; what; wildchar; huh; hook; " .
        "buttonhook; hunchback",
    "@" => "Common: at sign; at; strudel. " .
        "Rare: each; vortex; whorl; whirlpool; cyclone; snail; " .
        "ape; cat; rose; cabbage;",
    "V" => "" .
        "Rare: book",
    "[" => "Common: l square bracket; l bracket; bracket. " .
        "Rare: square; U turn",
    "]" => "Common: r square bracket; r bracket; unbracket. " .
        "Rare: un-square; U turn back",
    "\\" => "Common: backslash, hack, whack; escape; reverse slash; " .
        "slosh; backslant; backwhack. " .
        "Rare: bash; reversed virgule; reverse solidus; rsol; backslat",
    "^" => "Common: hat; control; uparrow; caret. " .
        "Rare: xor sign, chevron; shark; shark-fin; to the; " .
        "to the power of; fang; pointer",
    "_" => "Common: underscore; underbar; under. " .
        "Rare: score; backarrow; skid; flatworm",
    "`" => "Common: backquote; left quote; left single quote; " .
        "open quote; grave. " .
        "Rare: backprime; backspark; unapostrophe; birk; blugle; " .
        "back tick; back glitch; push; quasiquote",
    "{" => "Common: o brace; l brace; l squiggly; l squiggly bracket, " .
        "l squiggly brace; l curly bracket, l curly brace. " .
        "Rare: brace; curly-curly; l squirrelly; embrace",
    "}" => "Common: c brace; r brace; r squiggly; r squiggly bracket, " .
        "r squiggly brace; r curly bracket; r curly brace. " .
        "Rare: unbrace; un-curly; r squirrelly; bracelet",
    "|" => "Common: bar; or; or-bar; v-bar; pipe; vertical bar. " .
        "Rare: gozinta; thru; pipesinta; spike",
    "~" => "Common: squiggle; twiddle; not. " .
        "Rare: approx; wiggle; swung dash; enyay"
    );
} # setupUnixJargon


###############################################################################
# See http://www.microsoft.com/typography/unicode/1252.htm
#
# Returns the Unicode code point for cp1252 byte value $char, or undef if
# the table above has no entry for it.
#
sub cp1252ToUnicode {
    my ($char) = @_;
    return($cp1252{$char});
}


###############################################################################
#

=pod

=encoding utf8

=head1 Usage

    ord [options] [chars|mnemonics]

Displays Unicode character code point numbers and other information
about a character(s).
For example, "ord BULLET" produces:

     Bases:           o20042 d8226 x2022
     Unicode:         U+2022, utf8 \xe2\x80\xa2
     Entities:        &#8226; &#x2022; &bull;
     Unicode Name:    BULLET
     Unicode Script:  Common
     Unicode Block:   General Punctuation
     Unicode Plane:   0: Basic Multilingual
     Literal:         •

You can specify the character in several ways:

=over

=item * Via a code point, e.g. '\xb9'.

=item * For control characters, their short mnemonics such as C<NUL> or C<ESC>.

=item * Full Unicode names like "APL FUNCTIONAL SYMBOL LEFTWARDS VANE",
ignoring case. But if there are spaces in the name, you need to quote it
or use "_" instead of each space.
Unfortunately, approximate names are not (yet) supported.

=item * A fairly large selection of *nix jargon names (likewise, quote them
if they contain spaces).

=back

Default output includes the character's code point number
(in octal, decimal, and hexadecimal, plus binary with I<-binary>),
utf-8 coding in hex, numeric and named entity references,
Unicode long name, Unix Jargon file names for the character,
and the literal character itself.
Use I<-short> to also show the character's short name.


=head1 Options

(prefix 'no' to option name to negate where applicable)

=over

=item * B<-binary>

Show code points in binary.

=item * B<-chart>

Show a chart of character values and mnemonics.
See also I<-c0>, I<-c1>, I<-g0>, and I<-g1> for partial charts.

=item * B<-cp1252>

Assume char set is cp1252.

=item * B<-c0>

Same as I<-chart>, but only for C0 range (d0-d31).

=item * B<-c1>

Same chart, but only for C1 range (d128-d159).

=item * B<-decimal>

Display code points in decimal (default).

=item * B<-entities>

Display the named HTML special-character entity (if any), and the
SGML/HTML/XML numeric character references (decimal and hexadecimal).

=item * B<-g0>

Same chart, but only for G0 range (d32-d127).

=item * B<-g1>

Same chart, but only for G1 range (d160-d255).

=item * B<-hex>

Display code points in hexadecimal (default).

=item * B<-jargon>

Display applicable *nix jargon names (default).

=item * B<-literal>

Include literal character in output (default).

=item * B<-long>

Show long names for characters (default).

=item * B<-octal>

Display code points in octal (default).

=item * B<-short>

Show short names for characters.

=item * B<-utf8>

Show UTF-8 byte sequence for the character (default).

=item * B<-version>

Show version/license info and exit.

=back


=head2 Note

You need to backslash and/or quote some characters to use them as arguments:

    sp  (x20, d32,  o40)
    \"  (x22, d34,  o42)
    \#  (x23, d35,  o43)
    \&  (x26, d38,  o46)
    \'  (x27, d39,  o47)
    \(  (x28, d40,  o50)
    \)  (x29, d41,  o51)
    \+  (x2b, d43,  o53) (or, you can precede this with '--' (end-of-options))
    \;  (x3b, d59,  o73)
    \<  (x3c, d60,  o74)
    \>  (x3e, d62,  o76)
    \\  (x5c, d92,  o134)
    \`  (x60, d96,  o140)
    \|  (x7c, d124, o174)

And some you can't escape in some shells, including:

    \\t (x09, d09, o11) HT
    \\n (x0a, d10, o12) LF (you can put the newline in double-quotes)
    \\r (x0d, d13, o15) CR (you can put the return in double-quotes)


=head1 Known bugs and limitations

Even with a Unicode-enabled terminal, a character > 255 may appear
to be length > 1, and so will be taken as a name. But when the name is
not found, we print out the value anyway.


=head1 Related commands

C<chr> -- Does the reverse.

(command name missing from this copy) -- Converts a number to multiple bases.


=head1 Ownership

This work by Steven J. DeRose is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see L<http://creativecommons.org/licenses/by-sa/3.0/>.

The author's present email is sderose at acm.org.

For the most recent version, see L<http://www.derose.net/steve/utilities/>.

=cut