#txtnrm.pl use strict; use Getopt::Std; #These modules are in the standard installation use Text::ParseWords; sub UsageAndExit() { print <file] Options: -1 First line of input contains the field names. -f file : File specifies a text file containing a list of field names. One line per field; field name must be the first item on the line (excluding white space or quote marks). -F file : As for -f but for fixed-width import. Field width must be second item on line. -s sep : Input field separator (character or regexp. Default is \\t (tab). -S sep : Output field separator (string). Default is \\t (tab). -p Use input line number as the primary key. Default is first field. -q Interpret text fields delimited (qualified) with quote marks. -c list : list of special characters that may appear in field names in addition to alphanumeric characters and underscore (e.g. " $%"). -r nn : Set number of records to examine when generating field names automatically (default is to examine only the first record). -E Create output records for empty input fields. -U Strip Unix-style escapes in data (e.g. \\" becomes "). Default is to ignore Unix escapes but handle Windows-style doubled quotes (e.g. "" becomes "). -v Version information. -V Verbose. -h This screen. --help : More detailed help. Can be redirected to text file using >file (or to printer using >PRN: if available). ENDUSAGE exit 0; } sub VersionAndExit() { print < file] See the end of this help text for a complete list of options. INPUT Usually the input data will be in a text file. Specify one or more files on the command line and they will be read and processed in order. txtnrm can however also read data from standard input (e.g. piped from another Windows command), although in this situation you must use -f or -F to specify the field names. FIELD NAMES Field names can be specified as follows: - In the first row of input (with the -1 switch). - In a list in a text file specified with the -f or -F switch (-f for field- separated (delimited) files, -F for fixed width: all these switches are case sensitive). The file must have one line per field. The field names must be the first item on each line (not counting white space) and may optionally be enclosed in quotes. - If none of these switches is used, field names will be generated auto- matically (Field1, Field2...). By default, only the first input row is examined to count the fields. Use -r if you want more rows to be read (e.g. -r100 will examine the first 100 rows and warn you if the number of fields varies (see below). INPUT FIELD SEPARATORS Input fields may be fixed width or else separated (delimited) by one or more characters. Default assumption is tab-separated. Other separators can be specified with the -s switch. A separator can be a single character or a string and is interpreted as a regular expression, e.g. -s, comma -s\\. period -s\\t tab -s"|" pipe (quotes prevent Windows from interpreting it as a file redirection command) -s\\174 pipe (octal) -s\\s white space -sXXX the 3 characters "XXX" -s\\\$\\\$\\\$ the 3 characters "\$\$\$" -s\\W+ 1 or more "non-word" characters Use the -q switch where text fields may contain field separators and are delimited (qualified) with quote marks (i.e. the standard CSV file). Without -q, quote marks are preserved and all separators are treated as such. For fixed width input, use the -F switch to specify a textfile containing a list of field names and widths. The field name must be the first item on the line and the field width in characters the second, separated by white space, comma or semicolon, e.g. "ID",6 "FirstName",30 "LastName",30 ... By default, field names can only contain "word" characters (alphanumeric plus underscore). Other characters in names will be replaced with underscores. If any field names are duplicates, numerals will be appended to make them unique. E.g. Person, Bad Name, Bad+Name, Silly??Name, Person will become Person, Bad_Name, Bad_Name1, Silly__Name, Person1 To allow other characters in names, use the -c switch, e.g. -c" -\$\#" to allow space, hyphen, dollar and hash. VARYING NUMBERS OF FIELDS Sometimes not all lines of (field-separated) input have the same number of fields. If a line has more fields than expected, the extra fields are ignored. If a line has fewer than expected, the missing fields are ignored and no corresponding output lines are written. When fieldnames are generated automatically, the expected number of fields is the number found in the first line of input (or, if the -r switch is used, the largest number of fields found in any of the sampled input lines). OUTPUT Normalized records are written to standard output. Usually you will want to use a Windows redirection command such as \>D:\\folder\\filename.txt to write to a new file or \>\>D:\\folder\\filename.txt to append to an existing file. By default the output is tab-separated. Other separators (single character or string) can be specified with the -S switch, in a variety of ways as for the -s switch. E.g.: -S, comma -S"|" pipe (use quote marks to prevent the Windows command processor interpreting the \| as a redirection symbol) -S\\x7c pipe (using \\xnn hex notation to specify a character) -S" " space (use quote marks to delimit a separator string that contains a space) -S\\040 space (using \\nnn octal notation) -S\\\\ backslash (the -S and -s arguments are parsed as double- quoted Perl strings in which a single backslash always escapes the following character; so \\\\ gives you one backslash. -S~#~ three characters "~#~" Use the -v switch to display the switches, how they are being interpreted, and other debugging information. This is shown on screen but not output with the data. By default, no record is output for an empty field in the input. Use the -E switch to override this. ( This is the end of the help text. For convenience, the usage summary is repeated here: ENDHELPTEXT UsageAndExit; #display usage screen too } #declarations my %options; #command line options my $verbose; #1 to print debug messages, 0 not to. my $fldsrc; #where to find field names my @fnames; #list for field names my @fields; #list for field values my @fwidths; #field widths for fixed width data my $recstoscan = 1; # for counting number of fields my ($fname, $fcount, $ffewest, $fwtemplate); my ($i, $j, $temp); my $fs = "\t"; #input field separator (regular expression) my $ofs = "\t"; #output field separator (string) my $schars = ''; #special characters allowed in field names sub AdjustFieldNames(@) { #argument required: list of field names #replace illegal characters and add numbers to duplicate field names #(using uc() to make it case-insensitive but case-preserving) my %fhash; foreach (@_) { s/[^\w$schars]/_/g }; #replace characters not in $schars foreach (@_) { $fhash{uc $_}++ }; #count instances of each name foreach (reverse @_) { #iterate backwards through list $fhash{uc $_}--; #decrement number of instances $_ .= $fhash{uc $_} if $fhash{uc $_}; #append number to name if > 0 } } #main code starts here #process command line options foreach (@ARGV) { HelpAndExit if m/--help/ }; # --help foreach (@ARGV) { UsageAndExit if m/^\/\?$/ }; # /? getopts('1c:EF:f:hpqS:s:Vvr:', \%options); #parse other options UsageAndExit if $options{'h'} || ! defined @ARGV; #help if -h or no arguments VersionAndExit if $options{'v'}; $verbose = $options{'V'}; $recstoscan = $options{'r'} if $options{'r'}; #override default value @fnames = ('IDNUM') if $options{'p'}; #field name for autonum PK #if field separators specified, use eval() to interpret \t and other escapes if ($options{'s'}) { $fs = '"' . $options{'s'} . '"'; $fs = eval $fs; } if ($options{'S'}) { $ofs = '"' . $options{'S'} . '"'; $ofs = eval $ofs; } if ($options{'c'}) { $schars = '"' . $options{'c'} . '"'; $schars = eval $schars; } #where to get the fieldnames { $fldsrc = 'list', last if $options{'f'}; $fldsrc = 'fixed', last if $options{'F'}; $fldsrc = 'header', last if $options{'1'}; $fldsrc = 'create'; } #options not processed above are got from %options as and when needed if ($verbose) { print STDERR "\nSwitches detected:\n"; foreach $i (sort keys %options) { print STDERR "($i)\t($options{$i})\n"; } print STDERR "\n(Input file separator) = ($fs)\n" ; print STDERR "(Output file separator) = ($ofs)\n" ; print STDERR "(Special characters in file names) = ($schars)\n" ; print STDERR "Field names in $fldsrc\n" ; } #make sure a data file has been specified if one is needed unless (scalar @ARGV) { #no command line after processing options die "\nSorry, I can\'t read or generate field names unless you specify.\n" . "one or more input files on the command line, or else provide a\n" . "list of field names with -f or -F. Use -h option for help. \n" if $fldsrc =~ m/header|create/; } if ($fldsrc eq 'list') { #get field names into @fnames from list in file open FL, $options{'f'} or die "Sorry, can't open field list $options{'f'}\n"; while () { m/^\W*?([\w$schars]+)/ ; $fname = $1; $fname =~ s/\s+$// ; $fcount = push @fnames, $fname; } $fcount-- if $options{'p'}; # if so, first item in list is IDNUM for line number print STDERR "$fcount fieldnames found in $options{'f'}\n" if $verbose; close FL; AdjustFieldNames @fnames ; } if ($fldsrc eq 'fixed') {#get names and widths from file open FL, $options{'F'} or die "Sorry, can't open field list $options{'F'}\n"; while () { m/^\W*?([\w#]+)[,;]?\s*(\d+)/ ; $fcount = push @fnames, $1; push @fwidths, $2; $fwtemplate .= "A$2"; # build template for unpack() } $fcount-- if $options{'p'}; if ($verbose) { print STDERR "Field widths: $fwtemplate\n"; print STDERR "$fcount fieldnames found in $options{'F'}\n" ; } close FL; AdjustFieldNames @fnames; } if ($fldsrc eq 'header') { #get names from first line of input open FL, $ARGV[0] or die "Sorry, can't open data file $ARGV[0]\n"; $temp = ; chomp $temp; @fnames = parse_line($fs, 0, $temp); $fcount = scalar @fnames; print STDERR "$fcount fieldnames found in first line\n" if $verbose; close FL; AdjustFieldNames @fnames; } if ($fldsrc eq 'create') { print "Examining $recstoscan records\n" if $verbose; #open input file and count the separators $fcount = 0; $ffewest = 1_000_000_000; open FL, $ARGV[0] or die "Sorry, can't open data file $ARGV[0]\n"; for ($j = 1 ; $j <= $recstoscan; $j++) { last unless $_ = ; #read line; exit if eof before $recstoscan reached @fnames = parse_line(qr($fs), 0, $_); $i = scalar @fnames; $fcount = $i if $i > $fcount; $ffewest = $i if $i < $ffewest; } print STDERR "$fcount fields identified from data file $ARGV[0]\n" if $verbose; print STDERR "WARNING: number of fields per record varies" . "between $ffewest and $fcount in records examined.\n" if $ffewest != $fcount; #We now know how many fields; next, we generate a list of names: for ($j = 1; $j <= $fcount; $j++) {push @fnames, "Field$j"} ; } print STDERR "\nFieldnames: ", (join "$ofs", @fnames), "\n\n" if $verbose; #Now we're ready to start work print STDERR "\nProcessing line $."; #progress message while (<>) { #read input line from file(s) on command line, or from STDIN chomp; next if ($fldsrc eq 'header') && ($. == 1); #skip header if ($fldsrc eq 'fixed') { #fixed width @fields = unpack $fwtemplate, $_; } else { #field-separated @fields = parse_line($fs, ! $options{'q'}, $_); } unshift @fields, $. if $options{'p'}; #prepend line number as PK for ($j = 1; $j < $fcount ; $j++) { print STDOUT "$fields[0]$ofs$fnames[$j]$ofs$fields[$j]\n" if ((defined $fields[$j]) && (length($fields[$j]) > 0)) || $options{'E'}; } print STDERR "\rProcessing line $." unless $. % 10; #update every 10 lines } print STDERR "\rFinished. $. lines processed.\n"; __END__ History: 1-3 November 2003: rough code to import 1730-field CSV file. 8 November 2003 version 0.1: first version made available to other people. C. 115 lines of code, of which c. 20 are use or my declarations. 9 December 2003: version 0.2: some corrections to help text 21 December 2005: further corrections to help text 24 December 2005: version 0.21: bug in output of "null" (zls) fields fixed. -V used for Verbose, -v for version information. 28 December 2005: version 1.00 uploaded to web site.