#!/usr/bin/perl -w # # XsdDatatypes.pm # # Written 2010-03-23 as part of csvSupport by Steven J. DeRose, sderose@acm.org. # 2011-01-05 sjd: Make into separate package(s). # 2011-01-06 sjd: Compiles. # 2011-01-09 sjd: fileFormat package. Cleaning. Start re-ordering. # 2011-01-13 sjd: Split out from csvFormat. # # To do: # use strict; return(1); ############################################################################### ############################################################################### # This package is just to help organize per-field type/layout information. # No real methods, just access it directly. # package fieldInfo; sub new { my ($class) = @_; my $self = { # Field order is part of record, not field. fName => "", fWidth => 0, fAlign => "L", fNilValue => "", fNormalize => 0, fDatatype => "", fBase => 10, fMin => undef, fMax => undef, fExpr => "", }; bless $self, $class; return $self; } ############################################################################### # Check data for conformance to basic XSD types. # package XsdDatatypes; sub new { my ($class) = @_; my $self = { }; bless $self, $class; $self->setupXsdTypes(); return $self; } ############################################################################### # sub checkOneField { my ($self, $fieldNum, $fieldText, $dt) = @_; ($self->{verbose}) && warn "Checking field $fieldNum type '$dt" . # $self->{fieldInfo}->[$fieldNum]->{fDatatype} . "': '$fieldText'\n"; (my $type = $fieldText) =~ s/(.*\/)\s*(.*)$/$1/; (defined $type) || return(0); my $minmax = $2; my $reg = ""; if (substr($type,0,1) eq "/") { ($reg = $type) =~ s|/||g; } else { $reg = $dt; } if ($fieldText !~ m/^$reg$/) { return("Datatype error: field $fieldNum (/$reg/ vs. '$fieldText')"); } my @limits = split(/\s+/,$minmax); if (defined $limits[0]) { my $fieldValue = $fieldText + 0; if ($fieldValue < $limits[0]) { return("Datatype error: field $fieldNum " . "(value less than $limits[0]): '$fieldText'"); } if ($fieldValue > $limits[1]) { return("Datatype error: field $fieldNum " . "(value greater than $limits[1]): '$fieldText'"); } } return(""); # ok } # checkDatatype ############################################################################### # Define all the xsd build-in datatypes. See http://www.w3.org/TR/xmlschema-2 # Don't include the ^ and $ to tie the ends down, since they apply to all. # Use "NaN" for min/max values on non-numeric types. # WARNING: # "/" inside regex will croak checkDatatype even if escaped. # sub setupXsdTypes { my ($self) = @_; my $Name = "\\w[-_.\\w\\d]*"; my $QName = "\\w[-_.:\\w\\d]*"; my $decimal = "[-+]?\\d+(\\.\\d+)?"; my $float = "INF|-INF||NaN|[-+]?\\d+(\\.\\d+)?(+[Ee][+-]?\\d{1,3})?"; my $double = "INF|-INF||NaN|[-+]?\\d+(\\.\\d+)?(+[Ee][+-]?\\d{1,4})?"; my $time = "\\d+:\\d+:\\d+(\.\\d+)"; my $date = "-?\\d{4,}-\\d\\d-\\d\\d"; my $zone = "(Z|([-+]?\\d\\d:\\d\\d))"; my $duration = "-?P\\d+Y\\d+M\\d+DT\\d+H\\d+M\\d+(\\.\\d+)S"; $self->{typeList} => { # Typename Regex Min Max # -------- ----- --- --- # xsdPrimitiveTypes duration => "/$duration/ NaN NaN", datetime => "/$date"."T$time$zone/ NaN NaN", time => "/$time$zone/ NaN NaN", date => "/$date$zone/ NaN NaN", gYearMonth => "/\\d{4,}-\\d\\d$zone/ NaN NaN", gYear => "/\\d{4,}$zone/ NaN NaN", gMonthDay => "/--\\d\\d-\\d\\d$zone/ NaN NaN", gDay => "/---\\d\\d$zone/ 0 31", gMonth => "/---\\d\\d$zone/ 0 12", boolean => "/0|1|false|true/ NaN NaN", # base64binary must have 4n characters not including \s. base64binary => "/[\\s+\/a-zA-Z0-9]+=*/ NaN NaN", hexBinary => "/([0-9a-fA-F][0-9a-fA-F])+/ NaN NaN", float => "/$float/ -Inf Inf", double => "/$double/ -Inf Inf", decimal => "/$float/ -Inf Inf", # anyURI as defined here doesn't check that %xx escapes are correct. FIX anyURI => "/[-a-zA-Z0-9!\$'*+,.\/:=?%]+/ NaN NaN", QName => "/$QName/ NaN NaN", NOTATION => "/$QName/ NaN NaN", string => "/.*/ NaN NaN", # xsdDecimalTypes integer => "/[-+]?\\d+/ -Inf Inf", nonPositiveInteger=> "/0|(/-\\d+)/ -Inf 0", nonNegativeInteger=> "/0|(\+?/\\d+)/ 0 Inf", negativeInteger => "/-\\d+/ -Inf -1", positiveInteger => "/\+?\\d+/ 1 Inf", long => "/[-+]?\\d+/ " . -(1<<63) ." ". ((1<<63)-1), # Can't just calculate 1<<64 because it overflows. unsignedLong => "/\\d+/ " . 0 ." ". -((-1<<64)-1), int => "/[-+]?\\d+/ " . -(1<<31) ." ". ((1<<31)-1), unsignedInt => "/\\d+/ " . 0 ." ". ((1<<32)-1), short => "/[-+]?\\d+/ " . -(1<<15) ." ". ((1<<15)-1), unsignedShort => "/\\d+/ " . 0 ." ". ((1<<16)-1), byte => "/[-+]?\\d+/ " . -(1<<7) ." ". ((1<<7)-1), unsignedByte => "/\\d+/ " . 0 ." ". ((1<<8)-1), # xsdStringTypes # next two are funny about space-sequences... FIX normalizedString=> "/\\S[^\\r\\n\\t]*\\S/ NaN NaN", token => "/\\S[^\\r\\n\\t]*\\S/ NaN NaN", language => "/[a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})*/ NaN NaN", Name => "/$Name/ NaN NaN", NCName => "/$Name/ NaN NaN", ID => "/$Name/ NaN NaN", IDREF => "/$Name/ NaN NaN", IDREFS => "/$Name(\\s+$Name)*/ NaN NaN", ENTITY => "/$Name/ NaN NaN", ENTITIES => "/$Name(\\s+$Name)*/ NaN NaN", NMTOKEN => "/$Name/ NaN NaN", NMTOKENS => "/$Name(\\s+$Name)*/ NaN NaN" }; } # setupXsdTypes ############################################################################### ############################################################################### # =head1 Usage use XsdDatatypes =head1 Methods and Options =head2 Methods for basic setup =over =item * B =back =head1 Known bugs and limitations =head1 Ownership This work by Steven J. DeRose is licensed under a Creative Commons Attribution-Share Alike 3.0 Unported License. For further information on this license, see http://creativecommons.org/licenses/by-sa/3.0/. The author's present email is sderose at acm.org. For the most recent version, see http://www.derose.net/steve/utilities/. =cut