#!/usr/bin/perl # DBF2CSV.PL -- a program to convert .DBF files to .CSV or .json format # (Written in Perl 4.036, and compatible with Perl 5.) # # This program is uncopyrighted, so do with it whatever you wish. # By Dave Burton, Burton Systems Software, Cary, NC USA # Email & phone: https://burtonsys.com/email/ # The latest version of DBF2CSV can always be found on the Burton # Systems Software web site, in the "downloads" area: # http://www.burtonsys.com/ # =>keyflag<=-- "%v, %d" $ver_ordinal_and_date = "44, 01-Nov-22"; # the TLIB Version Control version number and last-modified date ######## HISTORY ######## # # Dbf2csv v1, 12/22/2000 - Convert .DBF files to .CSV format. # The "-p" option selects interactive mode, for use with Perl4w.exe, # to make a Windows program. # # Dbf2csv v2, 4/14/2003 - handles some ideosyncratic .dbf files with # non-standard headers, which confused v1. # # Dbf2csv v3 (unreleased) - added support for fields > 255 characters # thanks to Jeff Price, and for big-endian machines (for John McVeagh). # # v1 was modified (commented, corrected & enhanced to handle accented # characters) by Jacky Brunocollège de Villeneuve # sur Yonne 89500, in april 2003. His version was included in some of # the dbf2csv.zip distributions as dbf2csv_accentued_characters.pl. # # Dbf2csv v4, 6/27/2003 - the result of merging most of Jacky's changes # into the standard version of DBF2CSV.PL, plus a few more improvements. # Also, changes '~' characters to blanks. # [Shameless advertisement: of course I used the TLIB Version Control # "migrate" command to do the merge and create v4. See our web site, # http://www.burtonsys.com/ -DAB] # # Dbf2csv v5, 1/24/2004 - adds the ability to properly handle FoxPro's # 'I' fields, which are 4-byte little-endian binary integers (signed, # I hope!). Also, tries to tell you what program created the database, # based on the version number. Also, no longer changes '~'s to blanks. # # Dbf2csv v6, 5/7/2004 - by David Marín # Handle almost all PC850 accented characters, not just French ones # # Dbf2csv v7, 7/19/2005 - 'F' fields now fully supported, and '+' # fields now might work (untested, supported with a warning). -DAB # # Dbf2csv v7.1-beta 10/9/2010 - Unsupported fields converted to hex. -DAB # # Dbf2csv v8, 2/20/2011 - 'B' fields (FoxPro 8-byte double-precsion) # now work and negative values in 'I" fields now work correctly on # small-endian machines (but still not on big-endian machines, sorry!). # Also, added the "-j" option, to output a .json file instead of a .csv # file (EXPERIMENTAL / lightly tested). -DAB # # Dbf2csv v9, 5/22/2012 - 'T' fields (FoxPro 8-byte date+time) are now # supported. -DAB # # Dbf2csv v10, 11/15/2012 - Fixed detection of (unsupported) dBase 7 # files. -DAB # # Dbf2csv v11, 12/3/2016 - Enabled escape-coding of special characters. # -DAB # # Dbf2csv v12 - No functional change. # # Dbf2csv v13, 11/1/2022 - Experimental support for Visual Foxpro 8-byte # currency ('Y') fields. (Thanks to Phil Wildcroft.) -DAB # # ######################### $version = "v13"; # This program is uncopyrighted, so it can be modified by anyone who # wants to. But, out of courtesy, please add your own name and what # you did to the history, and do not remove the previous history. # -DAB # Here are some descriptions of the .dbf file format: # http://www.clicketyclick.dk/databases/xbase/format/ # http://www.clicketyclick.dk/databases/xbase/format/data_types.html # http://web.archive.org/web/20080501103856/http://www.klaban.torun.pl/prog/pg2xbase/contrib/dbf-format.html # http://support.microsoft.com/kb/98743 or http://www.webcitation.org/67szzmtLQ (was http://support.microsoft.com/support/kb/articles/q98/7/43.asp) # http://web.archive.org/web/20060821235500/http://community.borland.com/article/0,1410,15838,00.html # http://www.dbase.com/KnowledgeBase/int/db7_file_fmt.htm or http://www.webcitation.org/67t06OStF # Jacky Bruno's additional comments... (but I updated the line numbers -DAB) # The script structure is this one: # lines 1581 to the end : main program : # - it reads command line: if no parameters, then it shows help # - treat flags if there are some (d, p) # - reads the first file name given and give it to the do_a_file function # - do the same with other command line given files # lines 1401 to 1432 : do_a_file function (the name is clear) : # - verify the validity of the file (ending with .dbf) # - buids the .csv file name from the .dbf file name given # - calls the cvt1file function by giving her the 2 file names # - get the records's number and shows informations: # input file output file treated records number # lines 651 to 1394 : cvt1file function : # - as his name tells, do the conversion job of input file # - write to screen informations of file beginning and field names # - save output file : # first field name, then each record # In the output file, fields will be separated by $separe # $separe=";"; $separe=","; $| = 1; # predefined variable. If <> 0 then each print to the console # will immediatly be displayed, instead of buffered. $debugmode = 0; # set to 1 via '-d' for debug prints $prompt = 0; # set to 1 via '-p' for special interactive mode, for use with perl4w.exe $jsonmode = 0; # set to 1 via '-j' to output a JSON file instead of CSV $escape_codes = -1; # set to 1 via '-e' to translate CR, LF, etc. to "\r", "\n", etc. respectively # or set to 2 via '-e2' to translate " to "" (not for json!) # default is '-e0' (dumb/simple mode) for CSV, or '-e1' for json $progversion_shown = 0; # display program name and version (but only once!) sub show_progversion { if (!$progversion_shown) { print "DBF2CSV $version -- Convert .DBF file to .CSV (comma-separated) format\n"; $progversion_shown = 1; } } # display dbf2csv.pl version number (you might want to comment this out) &show_progversion; # Which version of Perl are they using? $perlver = "3 or earlier"; if ($] =~ /\$\$Revision\:\s*([0-9.]+)\s/) { $perlver = $1; # probably 4.something } elsif ($] =~ /([0-9][0-9.]*)/) { $perlver = $1; # probably 5.something or 6.something } print "You are using Perl version $perlver "; # is this a big-endian machine? $big_endian = 0; $tst = pack("S",513); $tst_big_endian = unpack("n",$tst); $tst_little_endian = unpack("v",$tst); if ((513==$tst_big_endian) && (258==$tst_little_endian)) { # this is a big endian machine $big_endian = 1; print "(big-endian) "; } # Does this version of Perl support IEEE-754 8-byte double-precision (Foxpro 'B')? $perl_supports_IEEE754_doubles = 0; $tstIEEE = "\x00\x00\x00\x00\x80\x05\xD1\x40"; $tst2 = unpack("d",$tstIEEE); $tst3 = sprintf("%10.4f", $tst2); if ($tst3 eq '17430.0000') { $perl_supports_IEEE754_doubles = 1; } $warned_about_B_field = 0; # set to 1 when we issue a warning about inability to handle Foxpro 'B' fields (so we won't repeat the warning) # Does this version of Perl support 8-byte integers? # if ($perlver < 5) { # $ivsize = 4; # if (0x8000 < 0) { # $ivsize = 2; # good grief, who's still using 16-bit Perl! # } # } else { # eval('use Config;'); # avoids Perl 4 errlr # $ivsize = 0 + $Config{ivsize}; # } $tmp = 9223372036854775806; $tmp--; if (($tmp < 9223372036854775806) && ($tmp > 9223372036854775804)) { $ivsize = 8; } else { $tmp = 2147483646; $tmp--; if (($tmp < 2147483646) && ($tmp > 2147483644)) { $ivsize = 4; } else { $ivsize = 2; } } # print "ivsize = $ivsize\n"; # number of bytes in an integer if ($ivsize >= 8) { print "(64-bit ints)\n"; } elsif ($ivsize >=4) { print "(32-bit ints)\n"; } else { print "(16-bit ints -- UNSUPPORTED!)"; } # Perl pack/unpack format representing the structure of the first # 32 bytes in a .DBF file: $DBF_header_fmt = "C" . # version number at offset "CCC" . # YY, MM, DD (one byte each) "L" . # Number of records in file "S" . # Length of header structure "S" . # Length of each record "a20"; # 20 bytes that we don't care about # Perl pack/unpack format representing the structure of each field descriptor # (the 2nd-Nth 32-byte chunk): $DBF_field_desc_fmt = "A11" . # Field name in 0-terminated ASCII "a" . # Field type in ASCII "L" . # Field address in memory (unused) "C" . # Field length (binary) \___/ these 2 bytes can also be a 2-byte field length, "C" . # Decimal count (binary) / \ 1-65535, for field type 'C' in Foxbase & Clipper. "C" . # Field flags (FoxPro/FoxBase only) "a1" . # reserved "C" . # Work area ID "a2" . # reserved "C" . # Flag for SET FIELDS "a7" . # reserved "A"; # Index field flag # For the meanings of the template letters, see Perl documentation # (e.g., on Linux do 'man perlfunc' then read 'pack') # Unfortunately, the "v" & "V" template characters (Vax-byte-order integers) # are poorly documented: the Perl docs suggest that they are signed, but they # seem to be unsigned on my x86 machines. The "S" & "L" template characters # (machine-order unsigned integers) won't work right on big-endian machines. # So we do the best we can: on big endian machines we change the 'S' template # characters to 'v' and the 'L' template character to 'V'. That'll work 99.9% # of the time -- i.e., as long as the record length doesn't exceed 32K, or # the v and V templates are unsigned. # Thanks to John McVeagh (who uses AIX) for inspiring this. if ($big_endian) { $DBF_header_fmt =~ s/LSS/Vvv/; print "Note: This is a big-endian machine. Adjusting template.\n"; } # $cvt_failed is a side-effect result of &cvt1file. $cvt_failed = 0; # will be set to 1 iff cvt1file failed, or 0 for success # handy constant $zerobyte = "\0"; # same as pack("c",0), or in Perl 5 it could also be chr(0) # use the '-ta' or '-tu' option to adjust $translate and $garde_accent # By Jacky Bruno... # The file can have accentued characters coded in DOS pc style (where, for example "é" # is coded "82h") or coded in ANSI style (linux or Windows) ( "é" is coded "E9h") # If codage is already ANSI, no need to re-code it: put $translate to 0 # If codage is pc, you can choose to re-code or let codage the way it is. # Accent codage change? codage pc --> codage this way: # 1 -> yes, let's change codage (another number is possible) # 0 -> no, let's keep the codage the way it is in the original file $translate=0; # If codage is changed ($translate=1), do we keep accentued characters? # Translation keeping or not accentued characters # 1 -> keeping accentued characters (or another number) # 0 -> don't keep accentued characters $garde_accent=1; #In french: keep = garde # Conversion tables codage pc <--> codage ansi for accentued characters # They can be completed regarding the "correspondances" # (every character must respect the same order in every codage) $code_pc="\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a". "\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94". "\x95\x96\x97\x98\x99\x9a\xa0\xa1\xa2\xa3". "\xa4\xa5\xb5\xb6\xb7\xc6\xc7\xd2\xd3\xd4". "\xd5\xd6\xd7\xd8\xe0\xe1\xe2\xe3\xe4\xe5". "\xe9\xea\xeb\xec\xed"; $code_ansi="üéâäàåçêëè". "ïîìÄÅÉæÆôö". "òûùÿÖÜáíóú". "ñÑÁÂÀãÃÊËÈ". "iÍÎÏÓßÔÒõÕ". "ÚÛÙýÝ"; $code_brut="ueaaaaceee". "iiiAAEeEoo". "ouuyOUaiou". "nNAAAaAEEE". "iIIIOSOOoO". "UUUyY"; $silent = 0; # a flag we sometimes set to avoid redundant warning messages #################################### ### end of global variables #################################### # Given the version number found in the header of a .dbf file, return our # best guess for the name of the product which created the .dbf file sub productName { local($product); if (2 == $verNum) { $product = "(FoxBase)"; } elsif (3 == $verNum) { $product = "(File without DBT)"; } elsif (4 == $verNum) { $product = "(dBASE IV w/o memo file)"; } elsif (5 == $verNum) { $product = "(dBASE V w/o memo file)"; } elsif (0x30 == $verNum) { # 48 $product = "(Visual FoxPro)"; } elsif (0x31 == $verNum) { # 49 $product = "(Visual FoxPro w/ AutoIncrement field)"; } elsif (0x7B == $verNum) { # 123 $product = "(dBASE IV w/ memo file)"; } elsif (0x83 == $verNum) { # 131 $product = "(File w/ DBT, or dBASE III+ w/ memo file)"; } elsif (0x8B == $verNum) { # 139 $product = "(dBASE III or IV w/ memo file)"; # microsoft says it is dBase III, but http://www.e-bachmann.dk/docs/xbase.htm says it is dBase IV } elsif (0x8E == $verNum) { # 142 $product = "(dBASE IV w/ SQL table )"; } elsif (0xE5 == $verNum) { # 229 $product = "(Clipper SIX w/ SMT memo file)"; } elsif (0xE6 == $verNum) { # 230 $product = "\nWarning: Encrypted Clipper SIX database, probably can't be handled\n"; } elsif (0xF5 == $verNum) { # 245 $product = "(FoxPro w/ memo file)"; } elsif (0xFB == $verNum) { # 251 $product = "(FoxPro ???)"; } else { $product = "(unknown xBase database program [perhaps Alpha4/Alpha5, dBFast, DataBoss, CodeBase, or Lotus Approach], please contact Dave Burton at http:\\\\www.burtonsys.com\\email\\)"; } return $product; } #productName # globals used by &fetch_memo $current_dbf_file = ''; $current_memo_file = ''; $current_memo_file_is_open = 0; $current_memo_file_size = 0; $current_memo_file_blocksize = 0; # will be changed to 512 or 64 or whatever $null_memo_count = 0; $real_memo_count = 0; # Retrieve a memo from a FoxPro .FPT memo file. Input is the block number. # MEMOFILE must already be open, and globals $current_memo_file* must already # be set. sub fetch_fpt_memo { local($blocknum) = shift; local($fbuf) = ''; local($fileofs) = $blocknum * $current_memo_file_blocksize; if (($fileofs+8) >= $current_memo_file_size) { printf "ERROR: block $blocknum (offset $fileofs) of '$current_memo_file' is past EOF.\n"; return ''; } seek( MEMOFILE, $fileofs, 0 ); if (!sysread(MEMOFILE, $fbuf, 8)) { print "ERROR: Could not read '$current_memo_file' at ofs=$fileofs, $!\n"; return ''; } local($memotype, $memolen) = unpack( "NN", $fbuf ); # 'N' is 4-byte big-endian integer if (($memotype < 0) || ($memotype > 2)) { print "Warning: unrecognized Memo type $memotype for Memo block $blocknum\n" . "(offset $fileofs) in '$current_memo_file'.\n"; } if ($memolen < 0) { print "Warning: bad Memo file '$current_memo_file', memo block $blocknum has\n" . "negative size.\n"; } if ($memolen <= 0) { return ''; } if (($memolen + $fileofs + 8) > $current_memo_file_size) { local($newlen) = $current_memo_file_size - ($fileofs + 8); print "Warning: bad Memo file '$current_memo_file', memo block $blocknum is\n" . "incomplete (truncated from $memolen to $newlen bytes)\n"; $memolen = $newlen; } $fbuf = ''; if (!sysread(MEMOFILE, $fbuf, $memolen)) { print "ERROR: could not read $memolen-byte memo from '$current_memo_file'\n" . "at block $blocknum (offset $fileofs)\n"; } return $fbuf; } #fetch_fpt_memo # read MEMOFILE (assumed to be an open .DBT file), and determine the BLOCKSIZE sub fetch_dbt_blocksize { local($mbuf, $blksiz); $blksiz = 0; seek( MEMOFILE, 16, 0 ); $mbuf = "\1"; if (!sysread(MEMOFILE, $mbuf, 1)) { print "ERROR: Could not read '$current_memo_file' at ofs=16, $!\n"; } else { if ($mbuf eq $zerobyte) { # dBase IV, supports variable BLOCKSIZE seek( MEMOFILE, 20, 0 ); $mbuf = "\0\0"; sysread(MEMOFILE, $mbuf, 1); $blksiz = unpack( "v", $mbuf ); # "v" means 2-byte little-endian integer if (1 == $current_memo_file_blocksize) { # dBase III format memo file created by dBase IV ? $blksiz = 512; } elsif ($current_memo_file_blocksize < 0) { # fix overflow for blocksize 32768 (maybe not possible anyhow) $blksiz += 65536; } if (! $blksiz) { print "ERROR: '$current_memo_file' is not a valid dBase IV memo file (no blocksize\n" . "found at file offset 20).\n"; } } elsif ($mbuf eq "\3") { # dBase III+, always uses 512-byte BLOCKSIZE $blksiz = 512; } else { # unrecognized file format print "ERROR: '$current_memo_file' is an unrecognized xBase memo file format\n"; } } return $blksiz; } #fetch_dbt_blocksize # read MEMOFILE (assumed to be an open .FPT file), and determine the BLOCKSIZE sub fetch_fpt_blocksize { local($mbuf, $blksiz); $blksiz = 0; seek( MEMOFILE, 6, 0 ); $mbuf = "\0\0"; if (!sysread(MEMOFILE, $mbuf, 2)) { print "ERROR: Could not read '$current_memo_file' at ofs=6, $!\n"; return 0; } else { $blksiz = unpack( "n", $mbuf ); # "n" means 2-byte big-endian integer if (! $blksiz) { print "ERROR: '$current_memo_file' is not a valid FoxPro memo file (no blocksize\n" . "found at file offset 6).\n"; } } return $blksiz; } #fetch_fpt_blocksize # Input is name of .dbf file (not the memo file!), and the memo block number. # Returns the memo in a string. # The memo file can be a .dbt file or a .fpt file. This subroutine would look # for both (except that .dbt file support isn't yet implemented). # If the .dbf file name is '', then current memo file is closed, '' is # returned, and nothing else is done. # If the block number is zero, then '' is always returned. # If .dbf file name has changed since last time, the old memo file is closed, # and the new one is opened. # If the memo file can't be read, then a string is returned containing the # block number preceeded by '#'. sub fetch_memo { local( $dbf_file, $blocknum ) = @_; local( $result ) = ''; local( $mbuf ); if (($dbf_file ne $current_dbf_file) && $current_memo_file_is_open) { if ($debugmode) { print "dbg: Closing '$current_memo_file' (memo file for '$current_dbf_file').\n"; } close MEMOFILE; $current_memo_file_is_open = 0; $current_memo_file_size = 0; $current_memo_file_blocksize = 0; $null_memo_count = 0; $real_memo_count = 0; } if (('' eq $dbf_file) || (0 == $blocknum)) { return ''; } if (($dbf_file ne $current_dbf_file)) { $current_dbf_file = $dbf_file; $current_memo_file = ''; # figure out name of memo file $fpt_file = $dbf_file; $fpt_file =~ s/\.DBF$/.FPT/; $fpt_file =~ s/\.dbf$/.fpt/i; $dbt_file = $dbf_file; $dbt_file =~ s/\.DBF$/.DBT/; $dbt_file =~ s/\.dbf$/.dbt/i; if ($fpt_file eq $dbt_file) { printf "ERR: '$dbf_file' is not the name of a .DBF file\n"; return '?'; } if (-f $fpt_file) { $current_memo_file = $fpt_file; } elsif (-f $dbt_file) { $current_memo_file = $dbt_file; } # open memo file if ('' ne $current_memo_file) { if (!open( MEMOFILE, $current_memo_file )) { print "ERROR: Could not open '$current_memo_file', $!\n"; } else { $current_memo_file_is_open = 1; } } else { print "ERROR: Memo file $dbt_file or $fpt_file not found.\n"; } # next figure out the SET BLOCKSIZE setting: if ($current_memo_file_is_open) { # We must read the memo file in binary mode (avoid translations of end-lines) binmode MEMOFILE; # get size of memo file $current_memo_file_size = -s MEMOFILE; if ($current_memo_file eq $dbt_file) { # .DBT file $current_memo_file_blocksize = &fetch_dbt_blocksize; } else { # .FPT file $current_memo_file_blocksize = &fetch_fpt_blocksize; } if ($current_memo_file_blocksize < 0) { # fix overflow for blocksize 32768 (maybe not possible anyhow) $current_memo_file_blocksize += 65536; } if (0 == $current_memo_file_blocksize) { close MEMOFILE; $current_memo_file_is_open = 0; # error message was displayed by fetch_dbt_blocksize or fetch_fpt_blocksize } if ($current_memo_file_is_open) { print "Opening memo file '$current_memo_file' for '$dbf_file', block size = $current_memo_file_blocksize.\n"; } } if ($current_memo_file =~ /\.dbt$/) { print "ERROR: This program doesn't yet understand .DBT (memo) files. If you can\n" . "furnish me with some test files, I might be able to add support for you.\n" . "Email me via http\://www.burtonsys.com/email/\n"; if ($current_memo_file_is_open) { close MEMOFILE; $current_memo_file_is_open = 0; } } if (! $current_memo_file_is_open) { print "Warning: memo fields will be converted as block numbers, rather than the\n" . "actual memo text.\n"; } } if (! $current_memo_file_is_open) { return '#' . $blocknum; } # Fetch the memo from the memo file if ($current_memo_file =~ /\.dbt$/) { # not currently implemented $result = &fetch_dbt_memo( $blocknum ); } else { $result = &fetch_fpt_memo( $blocknum ); } if ('' eq $result) { $null_memo_count++; } else { $real_memo_count++; } return $result; } #fetch_memo # Jdate2Cal converts a 4-byte integer Julian Date to YYYY-MM-DD string. # The code was cribbed (with minor modifications) from sub CalDate in CalDate.pm from # http://sourceforge.net/projects/jday/files/jday/2.4/jday-2.4.tar.gz (which is copyrighted code), and # used in this uncopyrighted program by the gracious permission of author Hiram Clawson (jday at hiram.ws): # "Good Afternoon David: Please feel free to use the code snippet as you desire. --Hiram" 5/23/2012 sub Jdate2Cal { local($jd, $ka, $ialp, $kb, $kc, $kd, $ke, $day, $month, $year, $result); $jd = shift; # integer julian date $jd = int($jd); $ka = $jd; if ( $jd >= 2299161 ) { $ialp = int(( $jd - 1867216.25 ) / ( 36524.25 )); $ka = int($jd + 1 + $ialp - ( $ialp >> 2 )); } $kb = int($ka + 1524); $kc = int(( $kb - 122.1 ) / 365.25); $kd = int($kc * 365.25); $ke = int(( $kb - $kd ) / 30.6001); $day = $kb - $kd - int( $ke * 30.6001 ); if ($ke > 13) { $month = int($ke - 13); } else { $month = int($ke - 1); } if ( ($month == 2) && ($day > 28) ) { $day = 29; } if (($month == 2) && ($day == 29) && ($ke == 3)) { $year = int($kc - 4716); } elsif ($month > 2) { $year = int($kc - 4716); } else { $year = int($kc - 4715); } # local($dayofweek) = int(($jd + 1) % 7); # local($dayofyear) # if ( $year == (($year >> 2) << 2) ) { # $dayofyear = # int( ( 275 * $month ) / 9) # - int(($month + 9) / 12) # + $day - 30; # } else { # $dayofyear = # int( ( 275 * $month) / 9) # - ((($month + 9) / 12) << 1) # + $day - 30; # } # $result = $year . '-' . $month . '-' . $day; $result = sprintf( "%04d-%02d-%02d", $year, $month, $day ); return $result; } # Jdate2Cal # sanity tests $tmp1 = &Jdate2Cal(2455958); if ($tmp1 ne "2012-01-31") { print "ERROR: Jdate2Cal(2455958) = '" . $tmp1 . "' but it should be '2012-01-31'\n"; } $tmp1 = &Jdate2Cal(2299161); if ($tmp1 ne "1582-10-15") { print "ERROR: Jdate2Cal(2455958) = '" . $tmp1 . "' but it should be '1582-10-15'\n"; } # Read infile (name of a .DBF file) and create outfile (name of a .CSV file). sub cvt1file { local( $infile, $outfile ) = @_; local( $recnum ) = 0; local( $skipped ) = 0; local( $offset_of_0 ); local( $buf ) = ""; local( $reclens_look_right ) = 0; local( $offsets_match_fieldlens ); local( $i ); local( @tmp, $tmp, $tmp1 ); local( $has_memo_fields ) = 0; # are there any 'M' fields (fields that use the memo file)? local( $date, $time, $hr, $min, $sec ); $cvt_failed = 0; # side-effect result, 0 or 1 # open dbf file if (!open( DBF, $infile )) { &show_progversion; print "ERROR: Could not open '$infile', $!\n"; $cvt_failed = 1; return 0; # no records converted } # We must read the input .dbf file in binary mode (avoid translations of end-lines) binmode DBF; # get size of .dbf input file $DBF_file_size = -s DBF; # Remove old output file if it exists unlink $outfile; # mostly in case outfile is on a buggy MARS_NWE volume, so we don't get trailing junk in the output file if it already existed and was bigger than the new output file if (!sysread(DBF, $buf, 32)) { &show_progversion; print "ERROR: Could not read first 32 bytes from '$infile', $!\n"; $cvt_failed = 1; return 0; # no records converted } # Unpack the file header fields from the first 32 bytes of the .dbf file # The $DBF_header_fmt template is defined above ( $verNum, $yy, $mm, $dd, $numrecs, $hdrLen, $recLen ) = unpack( $DBF_header_fmt, $buf ); $might_be_dBase7 = (4 == (7 & $verNum)); if ($debugmode) { print "dbg[1]: might_be_dBase7 = $might_be_dBase7\n"; } if (($hdrLen % 48) != 21) { $might_be_dBase7 = 0; } if ($debugmode) { print "dbg[2]: might_be_dBase7 = $might_be_dBase7, (hdrLen%48)=" . ($hdrLen % 48) . "\n"; } print "version=$verNum "; $product = &productName( $verNum ); if ($might_be_dBase7) { $product = "(dBASE IV w/o memo file, or perhaps dBase 7)" } if ("" ne $product) { print "$product "; } if ($yy < 78) { # The first .dbf file was created 1/29/1978, so it can't be older than that # Microsoft leaves off the 100, so fix it: $yy += 100; } printf " yyyy/mm/dd=%04d/%02d/%02d\n", ($yy+1900),$mm,$dd; # printf " jj/mm/aa=%02d/%02d/%02d\n", $dd,$mm,$yy; # -- French (by Jacky Bruno) -- print "numrecs=$numrecs"; # print "nombreEnregValides=$numrecs"; # -- French (by Jacky Bruno) -- $calculated_numrecs = int(($DBF_file_size - $hdrLen) / $recLen); print ", calculated numrecs=$calculated_numrecs.\n"; if ($numrecs != $calculated_numrecs) { print "ERROR: numcres from header unequal to calculated number of records.\n"; if ($calculated_numrecs < $numrecs) { print "$infile might be incomplete.\n"; } else { printf "Final %d records are suspect.\n", $calculated_numrecs-$numrecs; } } print "hdrLen=$hdrLen "; # print "hdrLong=$hdrLen "; # -- French (by Jacky Bruno) -- print "recLen=$recLen "; # print "enregLong=$recLen "; # -- French (by Jacky Bruno) -- $numfields = int(($hdrLen - 1) / 32) - 1; if ($might_be_dBase7) { $db7numfields = int(($hdrLen - 69) / 48); print " numfields = $numfields per record (or $db7numfields per record if this is a dBase 7 file).\n"; } else { print " numfields = $numfields per record.\n"; } # print "nombreChamps=$numfields (by record)\n"; # -- French (by Jacky Bruno) -- $extra_hdr_bytes = ($hdrLen - (1+(($numfields+1)*32))); if ($extra_hdr_bytes != 0) { if ((48 == $verNum) && (7 == $extra_hdr_bytes)) { # Visual FoxPro idiosyncracy print "Visual FoxPro idiosyncracy: 7 extra header bytes (ignored)\n"; } else { print "Warning: non-standard .dbf file format, header contains $extra_hdr_bytes extra byte(s).\n"; } } if ($might_be_dBase7) { # Looks like dBase 7 # dBase 7 is a very different format, with 48-byte (instead of 32-byte) field # definition headers; see http://www.dbase.com/KnowledgeBase/int/db7_file_fmt.htm # The header of a dBase 7 file has 36 extra header bytes (mostly the Language # driver name), plus 16 extra bytes per field descriptor, plus the Field # Properties Structure (which I think is at least 16 bytes) print "Warning: $infile might be a dBase 7 file! This tool does not support dBase 7.\n"; } $extra_file_bytes = ($DBF_file_size - ($hdrLen + ($calculated_numrecs * $recLen))); if ($extra_file_bytes > 0) { print "Warning: $infile contains $extra_file_bytes extra byte(s) at the end (ignored).\n"; } # $recfmt will be the unpacking template for each record. # This template will be build by reading field's definitions # (32 bytes per field starting at the 33rd byte of the file) $recfmt = "A"; # first byte of each record is the "deleted" indicator byte (normally blank) # We will build arrays containing fields caracteristics # (name, type, width, offset). # The [0] array entries are for the "deleted" indicator byte: $fld_nam[0] = ''; $fld_ofs[0] = 0; $fld_len[0] = 1; $fld_typ[0] = 'C'; $fld_flg[0] = 0; $fld_supported[0] = 1; $running_offset = 1; # 1, not 0, because the "deleted" indicator is 1 byte # read all the field definition headers (32 bytes each): for ($i=1; $i <= $numfields; $i++) { if (!sysread(DBF, $buf, 32)) { print "ERROR: Could not read field definition header for field $i from '$infile', $!\n"; $cvt_failed = 1; return 0; # exit with error } # Unpack field definition using $DBF_field_desc_fmt template (we keep # only the first 5 fields): ( $fldName, $fldType, $fldOffset, $fldLen, $decCnt, $fldFlags ) = unpack( $DBF_field_desc_fmt, $buf ); # I don't know why the dumb A11 format doesn't strip the garbage after # the 0-byte, but it doesn't. The Perl documentation says, "When # unpacking, 'A' strips trailing spaces and nulls," but that apparently # doesn't mean that it truncates at the first null byte. We could use # "Z11" instead of "A11" if we didn't care about Perl 4 compatibility. # Most .dbf files don't have trailing garbage after the 0-byte, anyhow, # but some do. This is for those .dbf files. $offset_of_0 = index($fldName, $zerobyte); if (-1 != $offset_of_0) { $fldName = substr( $fldName, 0, $offset_of_0 ); } # Some xBase variants (Clipper, Foxbase, perhaps others) permit # character data fields larger than 255 characters, using the # "Decimal Count" field as a high length byte. (Thanks to Jeff # Price for telling me this.) if (($decCnt > 0) && ('C' eq $fldType) && ($recLen >= (256 * $decCnt))) { $fldLen += (256 * $decCnt); } $fldName =~ s/\r/ /g; # change "\rtastrade.d" to " tastrade.d" to avoid messing up display if ($debugmode) { printf "%3d: %-10s type='%s' offset=%d fldLen=%d, fldFlags=0x%02x\n", $i, $fldName, $fldType, $fldOffset, $fldLen, $fldFlags; } $fld_nam[$i] = $fldName; $fld_ofs[$i] = $fldOffset; $fld_len[$i] = $fldLen; $fld_typ[$i] = $fldType; $fld_flg[$i] = $fldFlags; # Add another field to the template, type 'A' (text completed by spaces) with $fldLen width if ((('I' eq $fldType) || ('+' eq $fldType)) && (4 == $fldLen)) { # Two special cases: # 'I' is FoxPro or dBase 7 binary 4-byte integer. For FoxPro, it is known to be little-endian; I dunno about dBase 7. For dBase, it is known to be signed; I presume that is also true for FoxPro. # '+' is a dBase 7 "Autoincrement" field, stored the same as a long (see http://www.dbase.com/KnowledgeBase/int/db7_file_fmt.htm) if ($big_endian) { $recfmt .= "V"; # probably unsigned, unfortunately } else { $recfmt .= "l"; # signed } } elsif (('M' eq $fldType) && (4 == $fldLen)) { # Another special case: # 'M' is a Memo field -- the data is in another file, this is a block # number referencing the other file. Some databases use a 10-byte # string to store the block number, others use 4 binary bytes. So # if the length is 4, we decode it as unsigned binary. if ($big_endian) { $recfmt .= "V"; # probably unsigned } else { $recfmt .= "L"; # definitely unsigned } } elsif (('B' eq $fldType) && (8 == $fldLen)) { # 8-byte Foxbase/Foxpro double-precision binary (IEEE 64-bit floating pt). # Stored 52-bit mantissa LSB first, then 11-bit exponent & 1-bit sign. # Thanks to Roland Baranyai for the sample data! if ($perl_supports_IEEE754_doubles) { $recfmt .= "d"; } else { $recfmt .= "a8"; if (!$warned_about_B_field) { print "Warning: Foxpro 'B' (double-precision) fields won't be converted correctly\n" . "because this Perl doesn't support FoxPro's little-endian IEEE-754 64-bit\n" . "data format. For help, contact Dave Burton at http:\\\\www.burtonsys.com\\email\\\n"; $warned_about_B_field = 1; # It is quite possible that on big-endian computers this fails # but could be made to work by simply reversing the byte order. # But I don't have the ability to test it. } } } elsif (('T' eq $fldType) && (8 == $fldLen)) { # 8-byte Foxpro date/time field $recfmt .= "a8"; # Note: a8 preserves zero bytes, A8 truncates them, apparently } elsif (('Y' eq $fldType) && (8 == $fldLen)) { # 8-byte Visual Foxpro currency field, in 100-ths of cent (per Phil W. 9/21/2022) $recfmt .= "a8"; } else { # Normal fields: $recfmt .= "A$fldLen"; } $running_offset += $fldLen; } #for # The $recfmt unpacking template is complete # This is a hack for Visual Foxpro. For some reason, Visual Foxpro # often (always?) puts 8 junk field definition headers after the real # ones. All 8 always have zero length. The last 7 always have null names, # types & offsets, too; the first of the eight sometimes has a null name, # type & offset, but sometimes has the name "\rtastrade.d" with type='b' # offset=99 (where 99 is character code 'c'). That seems to have something # to do with a Microsoft test file called "tastrade.dbc". Anyhow, when # we encounter this, we just ignore the 8 bogus field definitions: if ($numfields > 8) { if ( (0==$fld_len[$numfields]) && (0==$fld_len[$numfields-1]) && (0==$fld_len[$numfields-2]) && (0==$fld_len[$numfields-3]) && (0==$fld_len[$numfields-4]) && (0==$fld_len[$numfields-5]) && (0==$fld_len[$numfields-6]) && (0==$fld_len[$numfields-7]) && (0 != $fld_len[$numfields-8])) { if ( ('' eq $fld_nam[$numfields]) && ('' eq $fld_nam[$numfields-1]) && ('' eq $fld_nam[$numfields-2]) && ('' eq $fld_nam[$numfields-3]) && ('' eq $fld_nam[$numfields-4]) && ('' eq $fld_nam[$numfields-5]) && ('' eq $fld_nam[$numfields-6]) && ('' ne $fld_nam[$numfields-8])) { print "Visual FoxPro idiosyncracy: 8 bogus 0-length fields (ignored)\n"; $numfields -= 8; splice( @fld_nam, $numfields+1, 8 ); # discard last 8 array elements splice( @fld_ofs, $numfields+1, 8 ); splice( @fld_len, $numfields+1, 8 ); splice( @fld_typ, $numfields+1, 8 ); splice( @fld_flg, $numfields+1, 8 ); $recfmt = substr( $recfmt, 0, length($recfmt)-16 ); # remove the "A0A0A0A0A0A0A0A0" from the end } } } if ($debugmode) { printf "recfmt='%s'\n", $recfmt; } # Classify each field as supported or unsupported for ($i=1; $i <= $numfields; $i++) { $fldType = $fld_typ[$i]; $fld_supported[$i] = 0; if (index("CDLNIMFBTY",$fldType) >= 0) { $fld_supported[$i] = 1; if ((8 != $fld_len[$i]) && (('B' eq $fldType) || ('T' eq $fldType) || ('Y' eq $fldType))) { # 'B', 'T' and 'Y' are only supported if they are 8 bytes long $fld_supported[$i] = 0; } } if ('M' eq $fldType) { $has_memo_fields = 1; # optimization } } #for # Lukasz Matusiak (lukaszzp at gmail.com) provided me 21-May-2012 with the # sample data I needed to support FoxPro's 'T' filed (DateTime). So I # changed "CDLNIMFB" to "CDLNIMFBT" above. Thanks, Lukasz! # "Dan" (mail_lodge at yahoo.com.au) emailed me 19-Jul-2005 to say # that type 'F' fields work fine. They are just text representations of # floating point numbers, such as "1.97056329250e+000" (that example is # from Dan's email, and it has a field length = 19). According to # http://www.dbase.com/KnowledgeBase/int/db7_file_fmt.htm # type 'F' fields contain "Number stored as a string, right justified, and # padded with blanks to the width of the field" in dBase 7, which is # consistent with Dan's report. So I changed "CDLNIM" to "CDLNIMF" above. # type 'B' is IEEE-754 double-precision (8-byte) floating point, in FoxPro. # Thanks to Roland Baranyai for the sample data from which I determined this! # If this Perl supports that data type via the 'd' unpack template, then # we handle this well. Otherwise, we just represent it as hexidecimal # in the output file (and display a warning). # Jacky Bruno comments... # Definition of output format # Here will be used the field separator # (defined by $separe variable modifiable at the beginning of the file) # You can change quotes used at the left and the right of fields too # using another character (is it really useful?) by changing " in the # next variable to the wished character (you can even set a beginning # character and a ending character) example : # $csvoutfmt = 'Y%sZ' . ("${separe}Y%sZ" x ($numfields-1)) . "\n"; # field names will have Y before and Z after : Yname_of_fieldZ # Attention to escape special characters if used # The () tells that ${separe}"%s" will be repeated ($numfields-1) times # If there were no questionable data fields, this would suffice: # $csvoutfmt = '"%s"' . ("${separe}\"%s\"" x ($numfields-1)) . "\n"; # This does the same thing, except that it adds '?' to questionable data fields: $csvoutfmt = ''; for ($i=1; $i <= $numfields; $i++) { if ($i > 1) { $csvoutfmt .= $separe; } if ($fld_supported[$i]) { $csvoutfmt .= '"%s"'; } else { $csvoutfmt .= '"%s?"'; } } $csvoutfmt .= "\n"; # note: we started counting with $i=1 instead of 0 because the DelFlg field # won't be output if ($jsonmode) { $outfmt = '{'; for ($i=1; $i <= $numfields; $i++) { if ($i > 1) { $outfmt .= ", "; } $outfmt .= ('"' . $fld_nam[$i] . '":"%s"'); } $outfmt .= "}"; } else { $outfmt = $csvoutfmt; } if ($running_offset != $recLen) { print "Warning: Summed field lengths (+1 byte for DEL flag) = $running_offset, which is unequal to recLen.\n"; $reclens_look_right = 0; } else { print "summed field lengths + 1 = $running_offset = recLen (as expected).\n"; $reclens_look_right = 1; } ### Begin code to fix field offsets for .dbf files in which the field ### offsets are incorrect or missing altogether # Are two or more fields at the same field offset? If so then the .dbf # file definitely doesn't have correct field offsets in the header. $prev_fldOffset = $fld_ofs[1]; $cnt_idential_fldOffsets = 0; for ($i=2; $i <= $numfields; $i++) { $fldOffset = $fld_ofs[$i]; if ($fldOffset == $prev_fldOffset) { $cnt_idential_fldOffsets++; } $prev_fldOffset = $fldOffset; } #for # Tell the user about the identical field offsets if ($cnt_idential_fldOffsets > 0) { $cnt_idential_fldOffsets++; print "Warning: "; if ($cnt_idential_fldOffsets == $numfields) { print "All "; # say "All nn fields have identical offsets." } print "$cnt_idential_fldOffsets fields have identical offsets.\n"; $silent = 0; if ($cnt_idential_fldOffsets == $numfields) { print "Note: $infile is in a non-standard .dbf format (such as Alpha-4's),\n" . "in which the field offsets are missing from the header.\n" . "The offsets will be recalculated from the summed field lengths.\n"; # Mark Godhelf reported that for his Alpha-4's .dbf files $fldOffset is always zero. 12/13/2002 # Stephane Boireau had a .dbf file in which all the $fldOffsets were 383. 4/13/2003 $silent = 1; } } # Check whether or not the field offsets are consistent with the field lengths, # and if they are not then tell the user about the problem (unless we already # told him that all the field offsets are identical). $running_offset = 1; $offsets_match_fieldlens = 1; for ($i=1; $i <= $numfields; $i++) { $fldLen = $fld_len[$i]; $fldOffset = $fld_ofs[$i]; if (($running_offset != $fldOffset) && !$silent) { print "ERROR: field $i (len=$fldLen): running calculated offset, $running_offset, does not match field offset from header, $fldOffset.\n"; $offsets_match_fieldlens = 0; } $running_offset += $fldLen; } #for if ((!$offsets_match_fieldlens) && ($reclens_look_right || ($cnt_idential_fldOffsets > 0))) { # fix the field offsets by calculating them from the summed field lengths if (!$silent) { print "The offset(s) will be recalculated from the summed field lengths.\n"; } $running_offset = 1; for ($i=1; $i <= $numfields; $i++) { $fldLen = $fld_len[$i]; $fld_ofs[$i] = $running_offset; $running_offset += $fldLen; } #for } ### End of code to fix field offsets # Read last byte of header (at the end of fields definitions), which should be CR (0x0D) if (!sysread(DBF, $buf, 1)) { print "ERROR: Could not read terminator byte from '$infile', $!\n"; $cvt_failed = 1; return 0; # no records converted } if ("\r" ne $buf) { if ((48 == $verNum) && ($zerobyte eq $buf)) { # Visual FoxPro generates databases with this minor flaw printf "Visual FoxPro idiosyncracy:"; } else { print "ERROR:"; } printf " Header-terminator byte at offset %d is 0x%02x (it should be 0x0D)\n", $hdrLen-1, ord($buf); } elsif ($extra_hdr_bytes) { printf "Header-terminator byte is 0x%02x (as expected).\n", ord($buf); } if ($extra_hdr_bytes) { # Usually the header is correctly followed by data records. # But sometimes it isn't right. Sometimes there is a 0 byte, # sometimes there is a field that tells the link to a file. # Stephane Boireau had a .dbf file in which a zero byte followed the normal 0x0D terminator byte. 4/13/2003 if (!sysread(DBF, $buf, $extra_hdr_bytes)) { print "ERROR: Could not read the $extra_hdr_bytes extra header bytes from '$infile', $!\n"; $cvt_failed = 1; return 0; # no records converted } # Visual Foxpro (verNum 48) always generates 7 extra zero header bytes; # for any other circumstance, print them out: if (($verNum != 48) || ($buf ne "\0\0\0\0\0\0\0")) { print "The $extra_hdr_bytes extra header byte(s) are:"; for ($i=0; $i<$extra_hdr_bytes; $i++) { $tmp = substr($buf, $i, 1); printf " 0x%02x", ord($tmp); } print "\n"; } } # Warn about field types that we don't know how to handle for ($i=1; $i <= $numfields; $i++) { $fldLen = $fld_len[$i]; $fldOffset = $fld_ofs[$i]; $fldType = $fld_typ[$i]; if ((!$fld_supported[$i]) || ((('B' eq $fldType)||('T' eq $fldType)||('Y' eq $fldType)) && (8 != $fldLen))) { if ('Y' eq $fldType) { # don't know how this is represented print "Warning: field $i ($fld_nam[$i]) is type 'Y' (len=$fldLen)"; print ", which is unsupported.\n" . "If you can tell me how $fldLen-byte 'Y' fields are stored, or if you're willing to help\n" . "test support for them, then please email Dave via https\://www.burtonsys.com/email/\n"; # Note: type 'F' fields are now known to work fine. Search this # program for references to 'F' for more comments. (Thanks, Dan!) } else { print "Warning: field $i ($fld_nam[$i]) is type '$fldType' (len=$fldLen)"; if (! defined $fld_typ_seen{$fldType}) { print ", which is unsupported.\n" . "If you can tell me how type $fldType fields are stored, or if you are willing to help\n" . "test support for '$fldType' fields, then please email Dave via https\://www.burtonsys.com/email/\n"; } } print "\n"; } elsif (('Y' eq $fldType) && (8 == $fldLen) && !defined $fld_typ_seen{$fldType}) { print "Warning: 'Y' (Visual Foxpro currency fields) is experimental. Please tell Dave\n" . "whether it works correctly! https\://www.burtonsys.com/email/\n"; } $fld_typ_seen{$fldType} = 1; } #for if (!open( OUTP, ">$outfile" )) { print "ERROR: Could not create '$outfile', $!\n"; $cvt_failed = 1; return 0; # no records converted } # echo field names to console: @tmp = @fld_nam; shift @tmp; # don't output field #0 (the 'deleted' flag) foreach $fld (@tmp) { # remove leading and trailing whitespace and any embedded quote marks $fld =~ s/\s*(\S((.*\S)|)|)\s*/$1/; # The aim of this expression before is to take care of fields that have only one letter # It can be changed to another expression $fld =~ s/\s*(\S.*\S)\s*/$1/; # but it won't care about one letter long fields $fld =~ s/\"/\`/g; # field names can't contain quote marks } print "The $numfields fields in each record are named:\n"; printf $csvoutfmt,@tmp; if (('DBWIN' eq substr($tmp[0],0,5)) && $might_be_dBase7) { print "ERROR: $infile appears to be a dBase 7 file. This tool does not support dBase 7 file.\n"; } if ($jsonmode) { print OUTP "[\n"; } else { # output field names as first line in output file: # Write name fields first in output file printf OUTP $csvoutfmt,@tmp; } $recnum = 0; # Then read & convert each record while (sysread(DBF, $buf, $recLen)) { if ((1 == $extra_file_bytes) && (1 == length($buf))) { # For some reason, some .dbf files seem to have an extra ctrl-Z at the end if (26 == ord($buf)) { print "Trailing ctrl-Z ignored.\n"; } else { printf "Warning: ignored final (extra) character at end of %s: 0x%02x\n", $infile, ord($buf); } last; } # in JSON mode, add commas & line-breaks between records if ($recnum && $jsonmode) { print OUTP ",\n"; } # count records $recnum++; if ($recnum <= 0) { $recnum--; print "Warning: more than $recnum records, Perl integer overflow.\n"; $recnum = 1; } # if ($recnum > 5000) { # last; # for debugging # } # Write a dot every 2000 records if (0 == ($recnum % 2000)) { print '.'; } # if ($buf =~ /\~/) { # print "Warning: changed '~' to space in record $recnum\n"; # $buf =~ tr/\~/ /; # if ($buf =~ /\~/) { # print "ERR: failed to change '~' to space in record $recnum\n"; # exit 1; # } # } # Translation of pc codes to ansi (or untranslated) if asked if ($translate) { if ($garde_accent) { eval "\$buf =~ tr /$code_pc/$code_ansi/;"; # Let's keep accentued characters } else { eval "\$buf =~ tr /$code_pc/$code_brut/;"; # Let's remove accentued characters } } # Unpack record according to $recfmt template @fields = unpack( $recfmt, $buf ); # the fields of @fields are record fields # fetch 'M' fields from memo file if ($has_memo_fields) { for ($i=1; $i <= $numfields; $i++) { if ('M' eq $fld_typ[$i]) { $fields[$i] = &fetch_memo( $infile, $fields[$i] ); } } } # handle FoxBase's 'B' (8-byte IEEE double-precision float) and 'T' (8-byte Date/Time) fields, # and Visual Foxpro's 'Y' (8-byte currency) fields for ($i=1; $i <= $numfields; $i++) { if (('T' eq $fld_typ[$i]) && (8 == $fld_len[$i])) { # 8-byte Foxpro Date/Time field. # First 4 bytes is the Julian Date, where Oct. 15, 1582 = 2299161 just like http://www.nr.com/julian.html # Last 4 bytes are the time in milliseconds since midnight. $recfmt = "a8" $tmp = "LL"; if ($big_endian) { $tmp = "VV"; # requires Perl 5 } ( $date, $time ) = unpack( $tmp, $fields[$i] ); # $tmp = length($fields[$i]); # print "dbg: field='" . $fields[$i] . "', len=$tmp, jdate=$date, time=$time, "; $date = &Jdate2Cal($date); # time is in mSec; convert to sec $tmp = int(($time + 500) / 1000); # sec $sec = $tmp % 60; $tmp -= $sec; $tmp = int(($tmp+30) / 60); # min $min = $tmp % 60; $tmp -= $min; $hr = int(($tmp+30) / 60); $time = sprintf( "%02d:%02d:%02d", $hr, $min, $sec ); $fields[$i] = $date . ' ' . $time; # print "date/time= '" . $date . ' ' . $time . "'\n"; } elsif (('B' eq $fld_typ[$i]) && (8 == $fld_len[$i])) { if ($perl_supports_IEEE754_doubles) { $tmp = $fields[$i]; if ($tmp < 0) { $tmp = -$tmp; } # Or, in Perl 5, $tmp = abs($fields[$i]); if (($tmp > 9007199254740991) || (($tmp != 0) && ($tmp < 0.001))) { # If number isn't represented with at least integer precision, # or if it is a tiny fractional amount, then we might need to # use exponential notation. $tmp = sprintf("%g",$fields[$i]); } else { # Otherwise, use regular floating point notation $tmp = sprintf("%f",$fields[$i]); } # shorten floating point numbers by deleting trailing zeros after # the first one (i.e., change "1.00000" to "1.0"). while ($tmp =~ /^[0-9]+\.[0-9]+0+$/) { chop($tmp); } $fields[$i] = $tmp; } else { # if this Perl doesn't support IEEE-754 double-precision, use hexadecimal (ugh!) $tmp1 = $fields[$i]; $tmp = '0x'; for ($j=7; $j>=0; $j--) { $tmp .= sprintf('%02X',ord(substr($tmp1,$j,1))); } $fields[$i] = $tmp; } } elsif (('Y' eq $fld_typ[$i]) && (8 == $fld_len[$i])) { # Visual Foxpro 8-byte integer currency, in 100ths of a cent if ($ivsize < 8) { print "Warning: Visual Foxpro 'Y' field (currency) support is incomplete without\n" . "64-bit integer support. I recommend using a Perl with 64-bit integers.\n"; } $tmp = "ll"; if ($big_endian) { $tmp = "VV"; # requires Perl 5 print "Warning: this is a big-endian computer, which will probably give wrong results\n" . "for the Visual Foxpro 'Y' field (currency) for negative values, and for\n" . "positive currency values > $214,748.36\n"; } ( $lowpart, $highpart ) = unpack( $tmp, $fields[$i] ); if ((($lowpart >= 0) && ($highpart != 0)) || (($lowpart < 0) && ($highpart != -1))) { # with 32-bit Perls, maximum value is $214,748.3647 # $lowpart += (4294967296 * $highpart); $lowpart = unpack( "q", $fields[$i] ); # signed 8-byte value, 'q' will throw an error on 32-bit Perls! } $tmp = sprintf("%0.4f",$lowpart/10000); $fields[$i] = $tmp; } } # debug code -- convert unsupported fields to series of hex bytes from '\x00' to '\xFF' for ($i=1; $i <= $numfields; $i++) { if (! $fld_supported[$i]) { # convert to hexidecimal $tmp_unpack_fmt = 'C' . $fld_len[$i]; @tmp = unpack( $tmp_unpack_fmt, $fields[$i] ); foreach $ch (@tmp) { $ch = '\x' . sprintf( '%02x', $ch ); } $fields[$i] = join( '', @tmp ); $fields[$i] =~ tr/a-f/A-F/; } } # remove leading and trailing whitespace and any embedded quote marks and newlines and ctrl-Zs foreach $fld (@fields) { if ($fld =~ /[\s\"]/) { $fld =~ s/^\s*//; # delete leading whitespace $fld =~ s/\s*$//; # delete trailing whitespace $fld =~ s/\032/ /g; # \032 is ctrl-Z; translate it to a space if (1==$escape_codes) { $fld =~ s/\\/\\\\/g; # single backslash gets doubled $fld =~ s/\r/\\r/g; # CR turns into \r $fld =~ s/\n/\\n/g; # LF turns into \n $fld =~ s/\"/\\"/g; # quote mark turns into \" $fld =~ s/\t/\\t/g; # tab turns into \t $fld =~ s/\f/\\f/g; # formfeed turns into \f } else { if (2==$escape_codes) { $fld =~ s/\"/\"\"/g; # for CSV translate " to "", but I'm unsure every program can handle it } else { $fld =~ s/\"/\`/g; # the old, dumb way: translate " to ` } $fld =~ s/\r\n/ /g; # translate CR and/or LF to a space $fld =~ s/[\r\n]/ /g; } } } # Remove the first field (1-byte flag) that tells if the record is # deleted (value 2Ah= "*") or valid (value 20h = space, but the space was # eliminated above, when we removed leading and trailing whitespace). $deleted_flag = shift @fields; # If you want to include "deleted" records in the output .csv file, then # comment-out the next five lines: if ($deleted_flag ne '') { $skipped++; print "Warning: record $recnum is marked for delete; $skipped records skipped.\n"; next; } # write the converted record, using the format built above printf OUTP $outfmt,@fields; } #while if ($jsonmode) { print OUTP "\n]\n"; } close OUTP; # close output file close DBF; # close input file if ($recnum >= 2000) { print "\n"; # because progress-indicator dots were printed } # $recnum -= $skipped; # account for deleted records # Hmmmm... Bruno thinks that is needed here, but I don't think so. # The question to ask is: does the 'numrecs' field in the .dbf file # header include records marked as 'deleted' or not? I think it should # include them, so $recnum shouldn't be decremented by $skipped. if ($recnum != $numrecs) { print "Warning: file should have had $numrecs records, but actually had $recnum records.\n" . "Calculated numrecs=$calculated_numrecs.\n"; # Since I might be wrong, and Bruno might be right (at least for some # database systems), I added the following message: if (($recnum-$skipped) == $numrecs) { print "Note: The disparity seems to be accounted for by $skipped deleted records.\n" . "Please tell Dave Burton via http\://www.burtonsys.com/email/\n"; } } return $recnum; } #cvt1file $errlevel = 0; $num_files_processed = 0; sub do_a_file { local( $inFile ) = @_; local( $numRecords, $outFile ); if ($num_files_processed) { print "\n"; # separate messages for each file with a blank line } if ($inFile !~ /\.dbf$/i) { printf "ERROR: input file name '$inFile' does not end in '.dbf'\n"; $errlevel = 2; } else { # if input file name was upper-case, make output file name upper-case, too # (the same in french) si fichier d'entrée est en majuscules, fichier de sortie en majuscules aussi $outFile = $inFile; if ($jsonmode) { $outFile =~ s/\.dbf$/\.json/i; } else { $outFile =~ s/\.DBF$/\.CSV/; $outFile =~ s/\.dbf$/\.csv/i; } # Display on the console the file names and number of records $num_files_processed++; print "Input='$inFile' Output='$outFile'\n"; $numRecords = &cvt1file( $inFile, $outFile ); if ($cvt_failed) { $errlevel = 2; } else { $numRecords++; # add one for the first record, with the field names in it print "Created $outFile from $inFile, $numRecords records.\n"; } } } #do_a_file # display a help message; pass 1 as parameter for 'interactive' (shorter) version sub do_help { local( $interactive ) = shift; if (!defined $interactive) { $interactive = 0; } &show_progversion; if (! $interactive) { print "\n" . "Usage:\n" . " perl4w32 dbf2csv.pl file.dbf ...\n" . "or (under MS-DOS):\n" . " perl4s dbf2csv.pl file.dbf ...\n" . "or (if you have a modern Perl installed):\n" . " perl dbf2csv.pl file.dbf ...\n" . "or (to run interactively under Windows and prompt for the .dbf file):\n" . " perl4w dbf2csv.pl -p\n" . "\n" . "For each input file.dbf, an output file.csv will be created.\n" . "There will be one more record in file.csv than file.dbf, because the\n" . "field names are written to file.csv as the first record.\n" . "A dot will printed for every 2000 records, as a progress indicator.\n" . "\n" . "Options: (e.g., \"perl4w32 dbf2csv.pl -d infile.dbf\".)\n" . " -j to output a JSON (.json) file instead of CSV (experimental!)\n" . " -d to enable debug prints\n" . " -p special intaractive mode for use with perl4w.exe\n" . " -ta to translate DOS PC-style accented characters to ANSI\n" . " -tu to translate DOS PC-style accented characters to unaccented\n" . " -e0 translate CR, LF etc. to blanks (this is default for CSV)\n" . " -e1 translate CR, LF etc. to \\r \\n etc. (this is default for JSON)\n" . " -e2 like -e0 except double \"-marks (for CSV only)\n" . "\n" . "Limitations: dBase (.DBT) Memo fields are unsupported, though FoxPro (.FPT)\n" . "Memo fields work. Also, dBase 7 files are completely unsupported.\n" . "Also, if you use a 16-bit version of Perl, such as perl4s\n" . "or perl4w, then you must use \"8.3\" (short) file names, and '-j' (JSON)\n" . "mode creates .JSO files (instead of .json).\n" . "Also, operation on big-endian machines is not fully tested.\n" . "Also, support for Visual Foxpro 'Y' currency fields is experimental, and\n" . "for dollar values > 214,748.36 you need to use a 64-bit Perl.\n"; ; } else { # interactive mode print "\n" . "For each .dbf input file that you specify, an output file will be\n" . "created with the same name but a .csv extension.\n" . "\n" . "For big files, a dot will printed for every 2000 records, as a\n" . "progress indicator. There is no limit on the size of the files,\n" . "but big files may take a long time to convert.\n" . "\n" . "There will be one more record in the .csv output file than in the .dbf\n" . "file, because the field names are written to the .csv file as the\n" . "first record.\n" . "\n" . "Options (you may enter an option instead of a file name):\n" . " -j to output a JSON (.json) file instead of CSV (experimental!)\n" . " -d to enable debug prints\n" . " -ta to translate DOS-style accented characters to ANSI\n" . " -tu to translate DOS-style accented characters to unaccented\n" . " -e0 translate CR, LF etc. to blanks (this is default for CSV)\n" . " -e1 translate CR, LF etc. to \\r \\n etc. (this is default for JSON)\n" . " -e2 like -e0 except double \"-marks (for CSV only)\n" . "\n" . "Note that you might have to specify the full path of each .dbf file.\n"; } } #do_help # From the Perl documentation, it isn't clear whether the 'v' and 'V' template # characters do signed or unsigned conversions. I thought it was signed, but # it seems to be unsigned. This tests it, and prints the result. This is only # called when the user specifieds the "-d" (debug) command-line parameter. sub tst_vV { local($tst_v_fmt, $tst2, $tst_V_fmt); if ($perl_supports_IEEE754_doubles) { print "This Perl supports IEEE-754 double-precision numbers.\n"; } else { print "This Perl does NOT support IEEE-754 double-precision numbers.\n"; } $tst_v_fmt = "\xFF\xFF\xFF\xFF"; $tst2 = unpack("v",$tst_v_fmt); if (-1 == $tst2) { print "'v' format is signed.\n"; } elsif (65535 == $tst2) { print "'v' format is unsigned.\n"; } else { print "ERROR: 'v' format is mishandled!! 0xFFFF -> $tst2\n"; } $tst_V_fmt = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"; $tst2 = unpack("V",$tst_V_fmt); if (-1 == $tst2) { print "'V' format is signed.\n"; } elsif (4294967295 == $tst2) { print "'V' format is unsigned.\n"; } else { print "'V' format is unsigned! 0xFFFFFFFF -> $tst2\n"; } } sub process_a_file_or_option { local( $inFile ) = shift; if ("-d" eq $inFile) { $debugmode = 1; print "TLIB revision no. $ver_ordinal_and_date\n"; &tst_vV; } elsif ("-d0" eq $inFile) { $debugmode = 0; } elsif ("-p" eq $inFile) { # Special interactive mode for use with Perl4w.exe $prompt = 1; } elsif ("-j" eq $inFile) { # JSON mode (instead of CSV) $jsonmode = 1; } elsif ("-j0" eq $inFile) { $jsonmode = 0; } elsif ("-ta" eq $inFile) { # translate DOS PC-style accented characters to to ANSI-style accented characters $translate = 1; $garde_accent = 1; } elsif ("-tu" eq $inFile) { # translate DOS PC-style accented characters to to unaccented characters $translate = 1; $garde_accent = 0; } elsif (("-e" eq $inFile) || ("-e1" eq $inFile)) { $escape_codes = 1; } elsif ("-e2" eq $inFile) { $escape_codes = 2; } elsif ("-e0" eq $inFile) { $escape_codes = 0; } elsif ("-tn" eq $inFile) { # no translations $translate = 0; } elsif (("-?" eq $inFile) || ("-h" eq $inFile) || ("--help" eq $inFile)) { &do_help; } else { &do_a_file( $inFile ); } } #process_a_file_or_option # Main program # Test if there are parameters if (($#ARGV+1) < 1) { # no, show help &do_help; exit 1; } else { while ($#ARGV >= 0) { $inFile = $ARGV[0]; shift @ARGV; &process_a_file_or_option( $inFile ); } #while # default is backslash-escape sequences for JSON, or dumb/simple for CSV if (-1 == $escape_codes) { $escape_codes = $jsonmode; } if ($prompt) { &do_help( 1 ); # interactive version of help message do { print "\n"; if ($debugmode) { print "[debugmode enabled]\n"; } print "Convert what .DBF file? (or press Enter alone to quit) "; $inFile = ; # remove leading and trailing whitespace (especially the CR at the end) $inFile =~ s/\s*(\S((.*\S)|)|)\s*/$1/; if ('' ne $inFile) { &process_a_file_or_option( $inFile ); } } until ('' eq $inFile); } if (! $prompt) { exit $errlevel; } # An idiosyncracy of Perl4w.exe is that if you exit by dropping off # the end of the program it closes the Window, but if you exit by # calling 'exit' or 'die' then it leaves the window open. Since we # want the window to close, we don't call 'exit'. } __END__