#!/usr/bin/perl # # tfmt - a formatting tool for tables # # $Id: tfmt,v 1.12 2001/03/22 09:37:19 dominik Exp $ # # the global variables are my(@lines); # holds the lines of the file my(@start); # line numbers of lines with &tfmtstart my(@stop); # line numbers of lines with &tfmtstop my($joinstring); # the string that is used to join the individual fields my($minindent); # minimum indentation my(@fmt); # formats for numbers my(@sfmt); # formats for strings my(%force_fmt); # format dictate from options etc. my($underline_re); # re regular expression matching a line only made of = or - my($total_length); # length length of a formatted line my($j); # a counter #option_variables #local ($opt_help,$opt_plus,$opt_minus,$opt_i,$opt_s,$opt_S,$opt_j, # $opt_w,$opt_b,$opt_e,%opt_force_fmt); # the following regexp matches lines made of 15 or more '-' or '=' $underline_re = "^\\s*([=-])\\1{5,}\\s*\$"; &yago( 10, qw( -help +1:SKIPLINES -1:SKIPLINES -i -s:SEPSTRING%s -S:SEPSTRING%s -j:JOINSTRING%s -w:SPACES%d -b:BLOCKSEPLINES%d -e =(%opt_force_fmt)) ) or &print_usage,exit(1); if ($opt_help) { &print_help; exit; } # -w actually should set $opt_j and $opt_s &use_minus_w if $opt_w; # we read entire files undef $/; FILE: while (<>) { # make a backup if editing in place if ($opt_i) { rename($ARGV,"$ARGV.bak") or die "Can't make backup of file $ARGV\n"; } @lines = split(/^/m,$_); # Compute the range over which formatting is desired @start = (); @stop = (); if (/\&tfmt(start|stop)/) { # internal directives present &make_start_stop_arrays || next FILE; } elsif ($opt_e || defined($opt_plus) || defined($opt_minus) ) { # range given on command line push(@start,$opt_plus-1); push(@stop,$#lines-$opt_minus+1); } else { # find the largest block (block defined by $opt_b) ($start[0],$stop[0]) = &find_largest_block( $opt_b || 2) ; $start[0]--; $stop[0]++; } for $j (0..$#start) { # initialize a few arrays and variables &clean; if ( $start[$j] == -1 ) { make_settings($opt_s,$opt_S,$opt_j,%opt_force_fmt); } else { make_settings( &parse_tfmtstart($lines[$start[$j]]) ); } &analyse_table ( $start[$j]+1 , $stop[$j]-1 ); &format_lines ( $start[$j]+1 , $stop[$j]-1 ); } # write if ($opt_i) { open(OUT,">$ARGV") || die "Can't open $ARGV for writing\n"; print OUT @lines; close OUT; } else { print STDOUT @lines; } } # -------------------------------------------------------------------- # -------------------------------------------------------------------- # Subroutine definitions follow below # -------------------------------------------------------------------- # -------------------------------------------------------------------- sub print_usage { print < $stop[0]; push(@stop,$#lines+1) if $start[$#start] > $stop[$#stop]; # sanity check for $i (0..$#start) { if ($stop[$i] <= $start[$i]) { warn "$0: start/stop lines for tfmt confused in file $ARGV. Skipping file.\n"; return 0; } } return 1; } sub find_largest_block { # find the table by looking for the largest block containing # at most N empty lines in a row. my ($N) = @_; my ($togo,$i,$max,$start,$stop,$length,$blockstart); for ($i=0; $i<=$#lines;$i++) { next unless $lines[$i] =~ /\S/; $start = $i++; $togo = $N; $i++ while ( $i <= $#lines && ( ($lines[$i] =~ /\S/) ? ($togo = $N) : --$togo ) ); $stop = $i; $length = $stop-$start; if ($length > $max) { $max = $length; $blockstart = $start; } } if (defined($blockstart)) { return ($blockstart,$blockstart+$max-1); } else { return (0,0); } } sub make_settings { # compute and export splitpattern, joinstring and force_fmt my ($split,$Split,$join,$pat,$sub); ($split,$Split,$join,%force_fmt) = @_; $split = $Split if $Split; $pat = $split ? "\\s*\Q$split\E\\s*(?!\Q$split\E)" : "\\s+"; # FIXME this breaks tfmt -s\& for thing like &&. Why did I do this? # Create the split function which also applys any evaluate items $sub = <= $_[1] ? $_[0] : $_[1]); } sub min { ($_[0] >= $_[1] ? $_[1] : $_[0]); } sub clean { # reset formatting information @fmt = (); @sfmt = (); undef $minindent; } sub get_my_opts { # My favorite way of parsing arguments: # Long options, argument type checking, bundling, other variable names, # defaults, automatic usage message, name=value pair interpretaion # I would use one of the modules, but I want both bundling and long names. # # usage: get_my_opts( NMIX , USAGEREF, # '-NAME:ARGTYPE ($VARIABLE) [DEFAULT]', ... ) # # NMIX: Optional number of non-options allowed mixed with options # -1 means unlimited. 0 is default # USAGEREF: Optional reference to a print_usage function # If not given, one is created on the fly. # # NAME: name of option, can be more than one letter # special: - looks for a bare '-' # -1 looks for -NN where NN is a number # +1 looks for +NN where NN is a number # = looks for VAR=VALUE and sets $VAR to VALUE # or $opt_equal{VAR} to VALUE # ARGTYPE: i (integer) f (float) s (string) # VARIABLE: Name of variable to be set. Default is $opt_NAME # - when given as @VARIABLE, args will be pushed on that array # - when given as %VARIABLE, VAR=VALUE pairs will go into hash # DEFAULT: Default value for option. # # Example: get_my_opts( 3 , undef, # qw( -a -x -v -help -n:i[40] =(%named) # -f($force) -d:s(@dirs) # ) ) or die "Error in options\n"; # # ... will - allow for up to three non-option mixed in # - print an automatic usage message on error # - look for -a -x -v -help and set $opt_... # - look for -n INT with a default of 40 # - put all name=val pairs into hash %named # - look for -f and set $force # - push all -d args onto @dirs my ($nmix) = ($_[0] =~ /^-?\d+$/) ? shift(@_) : 0; my ($uref); my ($opt,$opt1,$arg,$var,$prefix,$def,$first,$rest,$err,$sign); my (%var,%arg,%prefix,@keep,@optlist); my ($check); local($_); # what kind of args do we expect? foreach (@_) { if ( ($opt,$arg,$prefix,$var,$def) = /^ ( -\w+ | \+\d | \- | \= ) \s* # option name ([=:][ifs])? \s* # arg? and arg type (?: \( ([\$\@\%])(\w*) \) )? \s* # varaible name and prefix (?: \[ (\S*?) \] )? \s* # default $/x ) { ($opt1 = $opt) =~ s/^-//; $var = "opt_$opt1" unless $var; $var = 'opt_minus' if ($var eq 'opt_-1'); $var = 'opt_plus' if ($var eq 'opt_+1'); $var = 'opt_equal' if ($var eq 'opt_='); $prefix = "\$" unless $prefix; $arg = ':s' if $arg eq ':'; $arg{$opt} = $arg; $prefix{$opt} = $prefix; eval "\$var{\$opt} = \\$prefix$var"; $ {$var{$opt}} = $def if $def; $arg = 'STRING' if $arg eq ':s'; $arg = 'INTEGER' if $arg eq ':i'; $arg = 'FLOAT' if $arg eq ':f'; $opt1 = "[ $opt" . ($arg?" $arg":"") . " ]"; $opt1 = "[ NAME=VALUE ... ]" if $opt eq '='; $opt1 = "[ -INTEGER ]" if $opt eq '-1'; $opt1 = "[ +INTEGER ]" if $opt eq '+1'; push(@optlist,$opt1); } elsif (ref($_) || $_ == 1) { $uref = $_; } else { die "Illegal option specifier $_\n"; } } # What is there on the command line? ARG: while ($opt = shift(@ARGV)) { last ARG if $opt eq '--'; if ( defined($var{'-'}) && $opt eq '-') { # single dash $ {$var{'-'}} = 1; } elsif ( defined($var{'='}) && ( ($var,$arg) = ($opt =~ /^(\w+)=(.*)$/) ) ) { # var=arg syntax if ($prefix{'='} eq '%') { # put into hash $ {$var{'='}}{$var} = $arg; } else { # set variable directly; eval "\$$var = \$arg"; } next ARG; } elsif ( ($sign,$arg) = ($opt =~ /^([-+])(-?\d+)$/ ) ) { # +NN or -NN option $ {$var{$sign.'1'}} = $arg; next ARG; } elsif ( ($first,$rest) = ($opt =~ /^(-.)(.*)$/) ) { if (defined $var{"$first$rest"}) { $opt = $first.$rest; $rest = ''; } elsif (defined $var{$first}) { $opt = $first; } else { warn "$0: Unknown option $opt\n"; $err++; next; } } else { # not an argument push(@keep,$opt); $nmix-- ? next : last; } # no argument? unless ($arg{$opt}) { $ {$var{$opt}} = 1; unshift(@ARGV,"-$rest") if $rest; next ARG; } # argument! $arg = ($rest ne "") ? $rest : shift(@ARGV); unless ($arg ne "") { warn "Option -$opt requires an argument\n"; $err++; last; } # argument type if ($arg{$opt} eq ":i") { $check = ($arg =~ /^[+-]?\d+$/); } elsif ($arg{$opt} eq ":f") { $check = ($arg =~ /(\+|-)?(\d+\.?\d*|\.\d+)([eE](\+|-)?\d+)?/); } else { $check = 1; } if ($check) { if ($prefix{$opt} eq '@') { push (@{$var{$opt}},$arg); } else { $ {$var{$opt}} = $arg; } next ARG; } else { warn "argument $arg is wrong type for option -$opt\n"; $err++; } } # put the kept stuff back on @ARGV unshift(@ARGV,@keep); if ($err) { # we had some errors if (ref($uref) eq "CODE") { &$uref; } elsif ($uref) { # create usage message on the fly push(@optlist, ref($uref) eq "SCALAR" ? $$uref : "OTHER ARGS ..."); my($line) = "usage: $0"; my($len) = length($line); foreach (@optlist) { if (length($line)+2+length($_) <= 78) { $line .= " $_"; } else { print STDERR $line,"\n"; $line = ' ' x $len . " $_"; } } print STDERR $line,"\n"; } } return $err == 0; } sub use_minus_w { if ($opt_w) { $opt_s = ' ' x $opt_w unless $opt_s || $opt_S; $opt_j = ' ' x $opt_w unless $opt_j; } } my @optlist; sub yago { # Defaults my ($usage) = 0; # Don't print usage message my ($nmix) = 0; # No mixing of options with other args # Local variables my (@args) = @_; my ($opt,$opt1); my ($arg,$argname,$argtype,%arg); my ($var,%var); my ($prefix,%prefix); my ($first,$rest,$err,$sign,$check,$checkkey); my (@keep); @optlist = (); local($_); # Parse option specifiers foreach (@args) { print "working on $_\n"; s/\s+// unless /^\d+$/; # If it is a number it might be a constant.. if ( ($opt,$arg,$argname,$argtype,$prefix,$var) = /^ ( -\w+ | \+\d | \- | \= ) # option name (?: ([=:]) (\w+)? (%[idfs])? )? # arg?, name and type (?: \( ([\$\@\%]) (\w*) \) )? # varaible name and prefix $/x ) { $opt1 = substr($opt,1); unless ($var) { $var = "opt_$opt1"; $var = 'opt_minus' if ($opt eq '-1'); $var = 'opt_plus' if ($opt eq '+1'); $var = 'opt_equal' if ($opt eq '='); } $prefix = "\$" unless $prefix; $prefix = '%' if $opt eq '='; $argtype = '%s' if $arg && $argtype eq ''; $arg{$opt} = $argtype; $prefix{$opt} = $prefix; # create reference to variable eval "\$var{\$opt} = \\$prefix$var"; if ($arg && ! $argname) { $argname = 'STRING' if $argtype eq '%s'; $argname = 'INTEGER' if $argtype eq '%d' || $argtype eq '%i'; $argname = 'FLOAT' if $argtype eq '%f'; } $opt1 = "[$opt" . ($arg?" $argname":"") . "]"; $opt1 = "[NAME=VALUE ...]" if $opt eq '='; $opt1 = "[-INTEGER]" if $opt eq '-1'; $opt1 = "[+INTEGER]" if $opt eq '+1'; push(@optlist,$opt1); } elsif (/^\d+$/) { $nmix = $_; } elsif ($_ eq 'usage') { $usage = 1; } else { die "Illegal option specifier $_\n"; } } # Do we need to check the keys in NAME=VALUE pairs? $checkkey = scalar(keys %{$var{'='}}) if (exists($prefix{'='})); # What is there on the command line? ARG: while (scalar(@ARGV)) { $opt = shift(@ARGV); last ARG if $opt eq '--'; if ( defined($var{'-'}) && $opt eq '-') { # single dash $ {$var{'-'}} = 1; } elsif ( defined($var{'='}) && ( ($var,$arg) = ($opt =~ /^(\w+)=(.*)$/) ) ) { # var=arg syntax if ( !$checkkey || exists($ {$var{'='}}{$var})) { $ {$var{'='}}{$var} = $arg; } else { push (@keep,$opt); } next ARG; } elsif ( ($sign,$arg) = ($opt =~ /^([-+])(-?\d+)$/ ) ) { # +NN or -NN option $ {$var{$sign.'1'}} = $arg; next ARG; } elsif ( ($first,$rest) = ($opt =~ /^(-.)(.*)$/) ) { if (defined $var{"$first$rest"}) { $opt = $first.$rest; $rest = ''; } elsif (defined $var{$first}) { $opt = $first; } else { warn "$0: Unknown option $opt\n"; $err++; next ARG; } } else { # not an argument push(@keep,$opt); $nmix-- ? next : last; } # no argument? unless ($arg{$opt}) { $ {$var{$opt}} = 1; unshift(@ARGV,"-$rest") if $rest; next ARG; } # argument! $arg = ($rest ne "") ? $rest : shift(@ARGV); unless ($arg ne "") { warn "Option $opt requires an argument.\n"; $err++; last; } # argument type if ($arg{$opt} eq "%d") { $check = ($arg =~ /^[+-]?\d+$/); } elsif ($arg{$opt} eq "%f") { $check = ($arg =~ /(\+|-)?(\d+\.?\d*|\.\d+)([eE](\+|-)?\d+)?/); } else { $check = 1; } if ($check) { if ($prefix{$opt} eq '@') { push (@{$var{$opt}},$arg); } else { $ {$var{$opt}} = $arg; } next ARG; } else { warn "argument \'$arg\' for option $opt is not type $arg{$opt}\n"; $err++; } } # Put non-options back on @ARGV unshift(@ARGV,@keep); # Usage message if required &yago_usage if ($err && $usage); return $err == 0; } sub yago_usage { # Create usage message my $HANDLE = 'STDERR'; my $width = 80; my $line = "usage: $0"; my $len = length($line); push(@optlist, "OTHER ARGS ...") unless $optlist[$#optlist] eq "OTHER ARGS ..."; print $HANDLE "\n"; foreach (@optlist) { if (length($line)+2+length($_) <= $width) { $line .= " $_"; } else { print $HANDLE $line,"\n"; $line = ' ' x $len . " $_"; } } print $HANDLE $line,"\n\n"; } __END__ =head1 NAME B - simple formatting for tables in ASCII files =head1 SYNOPSIS tfmt [ -s SEPSTRING ] [ -j JOINSTRING ] [ N=%format ..] [ N=EXPR ..] [ -e ] [ +NN ] [ -NN ] [ -b NN ] [ -i ] [ filename ... ] but usually simply tfmt [ filename... ] =head1 DESCRIPTION B is a formatter for simple tables. It reads text arranged in columns from files or from STDIN and prints the table nicely formatted. The basic idea was to have something as simple as B for tables. Like B, B is not intended to typeset books but to keep hand-maintained tables in files in readable shape. It can also be used as in-line text filter for vi or Emacs. B splits a line into columns on whitespace. After formatting, the columns are joint back together with two space characters as separator. Both the string used for line splitting and for field joining can be changed (see B<-s> and B<-j> options). To determine a format automatically, B looks at all numbers in a column. If all are integers, they are printed as integers. If at least one is float or exponential, the number format for the entire column is chosen accordingly. The format is picked in a way that no significant digits are lost. Strings are printed as they are. Columns that contain any numbers are printed right justified, string-only columns left justified. It is also possible to force a certain format on a column with a C command line option. In order to protect the non-tabular parts of a file, B formats only the largest I in a file. A block is defined as a sequence of lines that contain no more than one (but see B<-b> switch) consecutive empty lines. Thus, as long as the table is the biggest thing in the file, it is sufficient to separate it by two empty lines from any other text. If that does not work, the region to be formatted may also be specified with the B<+NN> and B<-NN> switches, or with directives directly inserted into the file. See below. =head1 OPTIONS =over 5 =item B<-s SEPSTRING> =item B<-S SEPSTRING> Use SEPSTRING rather than whitespace to separate columns. SEPSTRING is also used as the default for JOINSTRING (see B<-j> option). Normally, JOINSTRING will be chosen so that it contains at least one leading and trailing SPACE chracter (i.e., C<-s '|'> implies C<-j ' | '>). However, when specified with the large B<-S>, JOINSTRING will be exactly equal to SEPSTRING. =item B<-j JOINSTRING> Use JOINSTRING rather than the default (two SPACE characters, or equal to SEPSTRING, see option B<-s>) to join the columns together. =item B<-w NN> Use NN space characters for both SEPSTRING and JOINSTRING. This is just a short cut for setting B<-s> and B<-j> explicitly. =item B Force formatting of column C with the format C<%format>. You may specify any parts of the format (e.g. 1=%e or 2=%13.2 or 3=%+), and only those parts mentioned will be forced on the column. =item B Replace column N by the value of EXPRESSION evaluated as a perl block. During the evaluation, the current value of column N is aliased to $_. The values from all columns can also be accessed in the expression as $F[1], $F[2],... This allows you to execute some minimal spreadsheet-like calculations. If the expression starts with a '#', it will be only applied to numeric fields, not to strings. If it starts with !#, it will only be applied to strings. =item B<-i> Format all files mentioned on the command line in place. Backup copies of the old version are kept with an extension ".bak". =item B<-e> Format the entire file, not just the largest block. See B<-b>. =item B<-b> NN Number of consequtive empty lines that separate blocks. The default for this is 2. Thus paragraphs separated by single empty lines are still considered the same block. =item B<+NN> Skip the first B lines in each file as those lines are not part of the table. The skipped lines are printed unchanged. =item B<-NN> Skip the last B lines in each file as those lines are not part of the table. The skipped lines are printed unchanged. =back =head1 DIRECTIVES The region to be formatted by B can be restricted in the file itself with directives. These directives can be put into a comment line. Table formatting is switched on with a directive &tfmtstart or &tfmtstart() or &tfmtstart( OPTIONS ) where C are command line-like options that will be used for the local formatting operation. B<-s>, B<-S>, B<-j>, B<-w>, and C are allowed here. An empty pair of parentheses will invoke defaults. Table formatting is switched of with a directive &tfmtstop Several pairs of start/stop directives can be in one file. If the first directive in a file is &tfmtstop, formatting starts from the beginning of the file. Likewise, if the last directive is a &tfmtstart, formatting continues to the end of the file. =head1 EXAMPLES The first example formats a table in a file F and prints it to the terminal. tfmt tab.dat If the comments in the file are bigger than the table itself, you will have to specify the location of the table. This may be done with directives in the file (see further down) or directly on the command line. Say, the first 62 lines are comment. tfmt +62 tab.dat The next example first makes a backup copy with the name F and then replaces F with the formatted version. tfmt -i tab.dat Format tab.dat to STDOUT, force exponential format on column 3. tfmt 3=%e tab.dat Same as above, but add 1000 to the number and do the formatting with a call to sprintf. tfmt 3='sprintf("%13.5e",$_+1000)' Here is how you could use an C<&tfmtstart> directive in a LaTeX file comment to format your table. Note that the C<&tfmtstart(...)> directive is also used to set the separator string to '&' and the join string to S<' & '>. \begin{table} % % &tfmtstart( -s& -j ' & ') % {\bf atom} & {\bf weight} \\ \hline H & 1.004 \\ He & 4.000 \\ \hline % % &tfmtstop % \end{table} Finally, here is an emacs lisp function which will pipe your current region through tfmt, prompting for command line arguments first. (defun tfmt-region (start end) "Filter region through tfmt." (interactive "r") (let ((cmd (read-input "filter region: " "tfmt"))) (call-process-region start end shell-file-name t t nil "-c" cmd))) =head1 BUGS Whitespace separated columns can never have an empty field. You always have to put in a place holder like C<---> or so. Also, you cannot have more than one word in a column with the default settings. The best workaround is to use explicitly two spaces (B<-w2>) as column separator. =head1 AUTHOR Carsten Dominik