#!/usr/bin/perl -w
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#NAME
#  duplink - Find duplicate files and link them to a single file.
#
#SYNOPSIS
#  duplink [file|dir]..
#
#REQUIRES
#	$HOME = $ENV{'HOME'} || '.';
#	$libdir1 = "lib";			# Where Backup.pm and Vopt.pm might be
#	$libdir2 = "$HOME/lib";		# Where Backup.pm and Vopt.pm might be
#	push @INC, $libdir1 if -d $libdir1;
#	push @INC, $libdir2 if -d $libdir2;
#	require "Vopt.pm";			# Verbosity routine
#	require "Backup.pm";		# Renames files to backup
#
#DESCRIPTION
# Given a list of files and/or directories, this program compares them all,
# looking for files with identical contents.  When a pair is found, one is
# unlinked and replaced with a link to the other, giving a multiply-linked
# file with several names, and saving some disk space.
#
# By default, the oldest file is the one kept, and newer identical files are
# linked to it.  There's an option to change this.
#
# There's also an option to rename files rather than unlinking them.  This
# doesn't save any disk space, but the backups can be easily deleted.
#
#OPTIONS
# Options start with '-' or '+', followed by an option letter and possibly
# a parameter.  Generally, '-' means "disable" or "don't" or "off, while '+'
# means "enable" or "do" or "on".   The case of the option letter doesn't matter.
#
# +B   Backup files by renaming them rather than unlinking, by using Backup.pm
# -B   Don't do backup; unlink files before linking them to an identical file (default)
#
# +C   Clean up by removing temp files
# -C   Don't clean up (default during debugging)
#
# +I   Inhibit; don't do any backups or linking, just show what would happen (default)
# -I   Don't inhibit; do the backups and linking
#
# +N   Use the newer file
# -N   Use the older file, not the newer file (default)
#
# -S<n> Sleep <n> sec before going on to a new size. (default: 1 sec)
#
# +S   Show each file size with a separate line of output.
# -S   Don't show the file sizes on separate lines (default).
#
# +V<N>
# -V<N>
#      Set verbose level to <N>.  If <N> is missing, we increment or decrement
#      the verbose level by one.
#
#EXAMPLES
#
#FILES
#
#BUGS
#
#SEE ALSO
#
#AUTHOR
#  John Chambers <jc@trillian.mit.edu>
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

$| = 1;
$exitstat = 0;
($P = $0) =~ s".*/"";
$ENV{"V_$P"} = '1' unless defined $ENV{"V_$P"};
&Vopt($ENV{"V_$P"}); 	# Verbose level . logfile name
print "$P: Started.\n" if $V>0;

$backup  = 0;	# Whether to rename files to backup rather than unlinking
$bdepth  = 0;	# Current backup depth
$buplim  = 8;	# Max backup depth
$cleanup = 1;	# Whether to clean up by removing our temp files
$cursiz  = 0;	# Size of files we're currently comparing
$dircnt  = 0;	# Directory counter
$filcnt  = 0;	# File counter
$forking = 1;	# Whether we're forking subprocesses to do the work
$forreal = 1;	# True if we do the links; false for informative run
$sleep   = 1;	# Sleep time between sizes
$sizes   = 0;	# Write each new size to stdout
$minsize = 2;	# Don't bother with files smaller than this
$newer   = 0;	# If true, use newer of two files rather than older

$bufthresh = 100;	# buffer data for files less than this size
$bufsiz   = 1000;	# Block size for sysread calls

$Prefix = '/tmp/P';	# Prefix for temp files

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Process the command-line arguments.  As an aside, we create the first temp
# file which contains the list of files found.

$filelist = sprintf "%s%dlist.txt",$Prefix,$$;
print "$P: Write file list to '$filelist'\n" if $V>0;
unless (open(FILELIST,">$filelist")) {
	print STDERR "$P: ### Can't write \"$filelist\" [$!]\n" if $V>0;
	exit $!;
}
print "$P: Opened '$filelist' for writing.\n" if $V>0;
for $arg (@ARGV) {
	print "ARG: '$arg'\n";
}
print FILELIST "99999999999 ---------------\n";
arg:
for $arg (@ARGV) {
	print "$P: ARG: '$arg'\n";
	next if &drop($arg);
	if (($flg,$opt) = ($arg =~ /^([-+])(.*)$/)) {
		print "$P: Option flg='$flg' opt='$opt'\n" if $V>3;
		while ($opt =~ s/^(.)(.*)$/$2/) {
			$ochr = uc($1);
			if ($ochr eq 'B') {
				$backup = ($flg eq '+') ?  1 : 0;
				print "$P: backup=$backup.\n" if $V>1;
			} elsif ($ochr eq 'C') {
				$cleanup = ($flg eq '+')  ? 1 : 0;
				print "$P: cleanup=$cleanup.\n" if $V>1;
			} elsif ($ochr eq 'I') {
				$forreal = ($flg eq '+')  ? 0 : 1;
				print "$P: forreal=$forreal.\n" if $V>1;
			} elsif ($ochr eq 'N') {
				$newer = ($flg eq '+')  ? 1 : 0;
				print "$P: newer=$newer.\n" if $V>1;
			} elsif ($ochr eq 'S') {
				if ($opt =~ s/^(\d+)//) {	# Sleep time between sizes
					$sleep = int($1);
					print "$P: sleep=$sleep.\n" if $V>1;
				} else {
					$sizes = ($flg eq '+')  ? 1 : 0;
					print "$P: sizes=$sizes.\n" if $V>1;
				}
			} elsif ($ochr eq 'V') {
				if ($opt =~ s/^(\d+)//) {
					$V = int($1);
				} else {
					$V += ($flg eq '+')  ? 1 : -1;
				}
				print "$P: V=$V.\n" if $V>1;
			} else {
				print STDERR "$P: Unknown opt \"$1\" ignored.\n" if $V>0;
			}
		}
	} elsif (-d $arg) {
		print "$P: Dir: '$arg'\n" if $V>1;
		++$dircnt;
		&onedir($arg);
	} elsif (-f $arg) {
		print "$P: FILE '$arg'\n" if $V>1;
		++$filcnt;
		($dev,$ino,$mode,$nlink,$uid,undef,undef,$size) = stat($arg);
		print FILELIST &lpad($size,11,'0') .  "\td=$dev/ino\tu=$uid\t$arg\n";
	} else {
		print STDERR "$P: Unknown arg \"$arg\" ignored.\n" if $V>0;
	}
}
unless ($dircnt || $filcnt) {	# If no directories or files on command line
	print "$P: Dir: '.' by default.\n" if $V>1;
	++$dircnt;
	&onedir('.');				# Scan the current directory
}
close FILELIST;
unless ($filcnt) {		# No files found
	print "$P: No files found.\n" if $V>0;
#	exit 0;				# Not actually an error
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Sort the file list:

$filesort = sprintf "%s%dsort.txt",$Prefix,$$;
print "$P: Write file sort to '$filesort'\n" if $V>3;
$cmd = "sort -n -r $filelist >$filesort";
print "$P: Sort list to '$filesort'\n" if $V>3;
system $cmd;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Read back the sorted list, and do the linking:

unless (open(FILESORT,$filesort)) {
	print STDERR "$P: ### Can't read '$filesort' [$!]\n" if $V>0;
	exit $!;
}
$cursiz = 0;		# The size that we're working on at the moment
$sizecnt = 0;		# How many files of this size we have
$totalfreed = 0;	# Bytes freed if we're unlinking
while ($line = <FILESORT>) {
	$line =~ s/[\r\s]+$//;
	if ($line =~ m"^\s*(\d+)\s+d=(\d+)/(\d+)\s+u=(\d+)\s+(.*)$") {
		$size = int($1);
		$dev  = int($2);
		$ino  = int($3);
		$uid  = int($4);
		$path = $5;
		&onefile($size,$dev,$ino,$uid,$path);
	} else {
		print STDERR "$P: Can't parse line \"$line\"\n" if $V>3;
	}
}
if ($sizecnt > 0) {
	print "$P: Final $sizecnt files of size $cursiz.\n" if $V>3;
	&onesize();		# Handle the final set of files
} else {
	print "$P: No files of size $cursiz.\n" if $V>3;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Clean up our temp files:

if ($cleanup) {
	if (-f $filelist) {
		print "$P: Unlink $filelist\n" if $V>3;
		unlink $filelist;
	}
	if (-f $filesort) {
		print "$P: Unlink $filesort\n" if $V>3;
		unlink $filesort;
	}
}

print "$totalfreed bytes freed.\n" if $V>0 && $totalfreed>0;
print "$P: Exit with status $exitstat.\n" if $V>3;
exit $exitstat;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

sub Backup { my($F) = "$0/Backup";
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#NAME
#  Backup - move file to backup
#
#SYNOPSIS
#  &Backup(filename);
#
#DESCRIPTION
#  We "back up" a file by appending a hyphen to its name. If that file exists,
#  we back it up recursively, to at most $buplim levels.
#
#  We use the global value $bdepth to track depth of recursion.   Perhaps  we
#  should make this a proper module, and localize $bdepth.  Naaah...
#
#  We return 0 if we succeed, or the failure code ($!) if we  fail.   If  the
#  original file doesn't exist, that is considered success, since the primary
#  use of this routine is to ensure that a specified file doesn't exist.  The
#  caller will usually proceed to create it and write data to it.  The caller
#  thus wants to know whether it's now safe to create the file.
#
#AUTHOR
#  John Chambers <jc@trillian.mit.edu>
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my($fil) = @_;
	my($bup) = "$fil-";
	my($s)   = 0;	# Return status.
	my($x);
	unless (-f $fil) {			# Can't back it up if it doesn't exist.
		print STDERR "$F: File '$fil' doesn't exist!!!\n" if $V>0;
		return 0;				# This isn't really an error, since it's what we want.
	}
	if (++ $bdepth <= $buplim) {		# Keep track of depth of recursive calls
		&Backup($bup) if (-f $bup);		# Recursive backup.
		if (link($fil,$bup)) {			# Link current name to backup name.
			print "$F: Linked \"$fil\" -> \"$bup\"\n" if $V>5;
			unless (-e $bup) {
				print STDERR "$F: link('$fil','$bup') reported success but $bup doesn't exist!!!\n" if $V>0;
				$s = -1;
			}
			if ($x = unlink($fil)) {	# get rid of current name.
				print "$F: unlink(\"$fil\") returned $x.\n" if $V>1;
				if (-e $fil) {
					print STDERR "$F: unlink('$fil') reported success but the file still exists!!!\n" if $V>0;
					$s = -2
				}
			} else {
				print STDERR "$F: unlink(\"$fil\") failed ($!)\n" if $V>0;
				$s = int($!);	# Unlink failed.
			}
		} else {
			print STDERR "$F: Can't link \"$fil\" -> \"$bup\" ($!)\n" if $V>1;
			if (rename($fil,$bup)) {	# Try the rename call.
				print "$F: Renamed \"$fil\" -> \"$bup\"\n" if $V>5;
				if (-e $fil) {
					print STDERR "$F: rename('$fil','$bup') reported success but the $fil still exists!!!\n" if $V>0;
					$s = -3;
				}
			} else {
				print STDERR "$F: rename(\"$fil\",\"$bup\") failed ($!)\n" if $V>0;
				$s = int($!);	# Link to backup failed.
			}
		}
	} else {
		 print "$F: \"$fil\" at depth $bdepth ignored.\n" if $V>1;
	}
	$exitstat = $s if $s;
	$bdepth --;
	return $s;
}

sub drop {my $F='drop'; local($arg) = @_;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This routine tests its arg to see if it is a file that  we  should  ignore. #
# The  return value is 1 if it is; 0 if we should look at the arg.  Note that #
# the arg need not be a file name at all, in which case we return 0.          #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	unless (defined $arg) {
		print "$P/$F: Called with undefined arg.\n" if $V>0;
		return 0;
	}
	if (-l $arg) {
		print "$P/$F: Drop '$arg' [symlink]\n" if $V>0;
		return 1;
	} elsif (-f $arg) {
		print "$P/$F: File '$arg'\n" if $V>2;
	} elsif (-d $arg) {
		print "$P/$F: Dir: '$arg'\n" if $V>2;
	} elsif (-p $arg) {
		print "$P/$F: Pipe '$arg'\n" if $V>0;
		return 1;
	} elsif (-S $arg) {
		print "$P/$F: Sock '$arg'\n" if $V>0;
		return 1;
	} elsif (-b $arg) {
		print "$P/$F: Blck '$arg'\n" if $V>0;
		return 1;
	} elsif (-c $arg) {
		print "$P/$F: Char '$arg'\n" if $V>0;
		return 1;
	} elsif (-t $arg) {
		print "$P/$F: Tty: '$arg'\n" if $V>0;
		return 1;
	} else {
		print "$P/$F: ???: '$arg'\n" if $V>0;
	}
	return 0;
}

sub lpad {my $F='lpad'; local($s,$l,$c) = @_;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	$c = ' ' unless length($c) eq 1;
	$l -= length($s);		# Number of pad chars needed
	return ($l > 0) ? (($c x $l) . $s) : $s;
}

sub onedir {my $F='onedir';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($dir) = @_;
	local($fil,@files,$pth);
	unless (opendir(DIR, $dir)) {
		print STDERR "$P: ### Can't opendir \"$dir\" [$!]";
		return;
	}
	@files = readdir DIR;	# Slurp up the directory's contents
	closedir DIR;
file:
	for $fil (@files) {
		next if &drop("$dir/$fil");
		next file unless $fil;		# Does this still happen?
		next file if $fil eq '.';	# Avoid infinite recursion
		next file if $fil eq '..';
		$fil =~ s"/+"/"g;	# Reduce multiple slashes
		($pth = "$dir/$fil") =~ s"^\.\/+"";	# Strip off initial "./"
		next file unless $pth && -e $pth;
		if (-d $pth) {
			print "$P: DIR: '$pth'\n" if $V>5;
			++$dircnt;
			&onedir($pth);
		} elsif (-f $pth) {
			print "$P: FILE '$pth'\n" if $V>5;
			++$filcnt;
			($dev,$ino,$mode,$nlink,$uid,undef,undef,$size) = stat($pth);
			print FILELIST &lpad($size,11,'0') .  "	d=$dev/$ino	u=$uid	$pth\n" if $size > $minsize;
		} else {
			print STDERR "$P: Unknown arg \"$arg\" ignored.\n" if $V>0;
		}
	}
}

sub onefile {my $F='onefile';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Deal with the next file in the sorted list. If it's a new size, we have all #
# the  files in the current size, so we call onesize() to compare them and do #
# the linking. If it's the current size, we just add its info to our lists of #
# info  about  the files, and return to get the next file.  Then we clear out #
# the lists and add this file as the first file of the new size.              #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($size,$dev,$ino,$uid,$path) = @_;
	print "$F: size=$size dev=$dev ino=$ino uid=$uid '$path'\n" if $V>3;
	if ($size != $cursiz) {
		if ($sizecnt > 0) {
			print "$F: New size $size not $cursiz.\n" if $V>3;
			&onesize();
		} else {
			print "$F: No files of size $sizecnt.\n" if $V>3;
		}
		$cursiz = int($size);	# Note new current size
		$sizecnt = 0;			# Reset the count of files of this size
#		@fsiz = ();				# File sizes
#		@fuid = ();				# File user id numbers
#		@fdev = ();				# Device numbers
#		@fino = ();				# Inode numbers
		@fpth = ();				# File pathnames
		@fbuf = ();				# Buffers for files' contents
	}
	++ $sizecnt;				# Increment the file counter for this size
#	$fdev[$sizecnt] = $dev;		# Filesystem device number
#	$fino[$sizecnt] = $ino;		# Filesystem inode number
	$fpth[$sizecnt] = $path;	# File's pathname
#	$fsiz[$sizecnt] = $size;	# Is this used?
#	$fuid[$sizecnt] = $uid;		# User ID, which we may or may not need
	printf("$F: Size %d File %d d=%d i=$ino u=%d %s\n",
		$cursiz,$sizecnt,$dev,$ino,$uid,$path) if $V>3;
}

sub dblchk {my $F='dblchk';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Double-check a pair of files.  This is to catch a bug in early versions  of #
# this  program,  in  which non-identical files are treated as identical, and #
# linked incorrectly.  Here we make a second pass through  the  files,  using #
# different code to compare their content, and return 0 if they're different, #
# 1 if they're the same.                                                      #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($ndx1,$ndx2) = @_;	# Index number of files being compared
	local($pth1) = $fpth[$ndx1];
	local($pth2) = $fpth[$ndx2];
	local($data1,$data2,@fbuf);
	local($dev1,$ino1,$nlnk1,$uid1,undef,$siz1);
	local($dev2,$ino2,$nlnk2,$uid2,undef,$siz2);
	print "$F: $ndx1:'$pth1' $ndx2:'$pth2'\n" if $V>1;
	($dev1,$ino1,undef,$nlnk1,$uid1,undef,undef,$siz1) = stat($pth1);
	print "$F: 1 ndx=$ndx1 dev=$dev1 ino=$ino1 nlnk=$nlnk1 uid=$uid1 siz=$siz1 '$pth1'\n" if $V>1;
	($dev2,$ino2,undef,$nlnk2,$uid2,undef,undef,$siz2) = stat($pth2);
	print "$F: 2 ndx=$ndx2 dev=$dev2 ino=$ino2 nlnk=$nlnk2 uid=$uid2 siz=$siz2 '$pth2'\n" if $V>1;
	if ($ino1 == $ino2 && $dev1 == $dev2) {
		print "$F: ### $ndx1:'$pth1' $ndx2:'$pth2' are already linked.\n" if $V>1;
		return 0;
	}
	if ($siz1 != $siz2) {
		print "$F: ### $ndx1:'$pth1' $ndx2:'$pth2' are different sizes: $siz1, $siz2.\n" if $V>0;
		return 0;
	}
	unless (defined($data1 = &getdata($ndx1,$pth1))) {
		print "$F: Can't read $ndx1:'$pth1' [$!]\n" if $V>0;
		return 0;
	}
	unless (defined($data2 = &getdata($ndx2,$pth2))) {
		print "$F: Can't read $ndx2:'$pth2' [$!]\n" if $V>0;
		return 0;
	}
	if ($data1 eq $data2) {
		print "$F: Data in $ndx1:'$pth1' $ndx2:'$pth2' seems identical.\n" if $V>1;
		return 1;
	}
	print "$F: ### $ndx1:'$pth1' $ndx2:'$pth2' contain different data.\n" if $V>1;
	return 0;
}

sub onepair {my $F='onepair';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Compare two files' contents.  If the contents are the same, we either  back #
# up  or  unlink the second file, and make it a link to the first file.  Note #
# that the order of the args is the same as for the ln command and  the  link #
# system call.  The return value is 1 if we do the link, 0 if not.            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($fi1,$fi2) = @_;
	local($fp1) = $fpth[$fi1];
	local($fp2) = $fpth[$fi2];
	local($nlnk2,$size2,$st);
	print "\n$F: Compare $fi1:'$fp1' $fi2:'$fp2' <<<<====\n" if $V>2;
	unless (&samedata($fi1,$fi2)) {
		print "$F: The files have different content.\n" if $V>2;
		return 0;
	}
	print "$F: $fi1:'$fp1' $fi2:'$fp2' have the same content <<<<====\n" if $V>1;
	unless (&dblchk($fi1,$fi2)) {
		print "$F: $fi1:'$fp1' $fi2:'$fp2' FAILED THE DOUBLE CHECK\n" if $V>0; 
		return 0;
	}
	print "$F: $fi1:'$fp1' $fi2:'$fp2' passed the checks and are identical.\n" if $V>1; 
	if ($backup) {
		print "$F: Backup('$fp2')\n" if $V>1;
		if ($forreal) {
			if ($st = Backup($fp2)) {
				print "$F: ### Backup('$fp2') failed, status $st.\n" if $V>0;
			} else {
				print "$F: Backup('$fp2') succeeded.\n" if $V>1;
				if (-e $fp2) {	# Double check that the file no longer exists
					print "$F: ### Backup('$fp2') reported success but the file still exists!!!\n" if $V>0;
				}
			}
		}
	} elsif ($forreal) {
		(undef,undef,undef,$nlnk2,undef,undef,undef,$size2) = stat($fp2);
		print "$F: unlink('$fp2')\n" if $V>1;
		if (unlink($fp2)) {
			if (!$backup && $nlnk2<2) {
				$totalfreed += $size2;
			}
		} else {
			print STDERR "$P/$F: ### Can't unlink $fp2 [$!]\n" if $V>0;
		}
	}
	if ($forreal) {
		print "$F: link('$fp1','$fp2')\n" if $V>1;
		unless (link($fp1,$fp2)) {
			print STDERR "$P/$F: ### Can't link $fp1 -> $fp2 [$!]\n" if $V>0;
			return 0;
		}
	}
	print "$cursiz	$fp1 -> $fp2\n" if $V>0;
#	$fino[$fndx2] = $fino[$fndx1];	# Copy the info that has changed for fil2
#	$fuid[$fndx2] = $fuid[$fndx1];
	return 1;
}

sub onesize {my $F='onesize';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Run thru the list of files of one size, comparing each with all  the  files #
# that  are  later in the list.  We're looking for files that aren't the same #
# file but have the same content and are thus candidates to being linked to a #
# single  file.  Here, we decide which of two files should be the "real" file #
# and linked to the other name.                                               #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($fndx1,$dev1,$ino1,$nlnk1,$uid1,$pth1,$size1,$mtime1);
	local($fndx2,$dev2,$ino2,$nlnk2,$uid2,$pth2,$size2,$mtime2);
	local($chld,$stat);
	print "Compare $sizecnt files of size $cursiz.\n" if $V>3;
	printf "$cursiz ...\n" if $V>1 || $sizes;
	if ($cursiz < $minsize || $sizecnt < 2) {
		print "$F: Ignore $sizecnt files of size $cursiz [minsize=$minsize]\n" if $V>1;
		return;
	}
	&dt();	# Call once to initialize globals to current time
	$chld = 0;				# Default to no child.
	if ($forking) {
		$V = $forking;		# Kludge to adjust debugging when forking
		print "$F: Forking.\n" if $V>1;
		if (($chld = fork) > 0) {	# Parent process
			print "$F: Wait for child $chld to handle size $cursiz ...\n" if $V>1;
			if ($stat = wait) {
				if ($stat == $chld) {
					print "$F: Child $chld done.\n" if $V>2;
				} elsif (defined $stat) {
					print "$F: Got $stat from wait for child $chld.\n" if $V>0;
				} else {
					print "$F: ### wait failed [$!]\n" if $V>0;
				}
				return
			}
			print "$F: ### wait call returned '$stat' [$!]\n" if $V>0;
			return
		} elsif (defined $chld && $chld == 0) {
			sleep $sleep if $sleep > 0;	# Don't hog the cpu
			print "$F: Child process $$ handles size $cursiz ...\n" if $V>2;
		} else {	# Child process
			sleep $sleep if $sleep > 0;	# Don't hog the cpu
			print "$F: ### fork failed [$!]" if $V>0;
		#	return
		}
		dt();
		print "$cymd_hms\tProcess $sizecnt size $cursiz files in child process $$ ...\n" if $V>0;
	}
F1:	for ($fndx1 = 1; $fndx1 < $sizecnt; ++ $fndx1) {		# Run thru files of this size
		$pth1 = $fpth[$fndx1];
F2:		for ($fndx2 = $fndx1+1; $fndx2 <= $sizecnt; ++$fndx2) {	# Compare with other files of this size
			$pth2 = $fpth[$fndx2];
			if (($dev1,$ino1,undef,$nlnk1,$uid1,undef,undef,$size1,undef,$mtime1) = stat($pth1)) {
				print "\n$F: Comp d=$dev1 i=$ino1 u=$uid1 s=$size1 '$pth1' <=======\n" if $V>1;
			} else {
				print "$F: Can't stat '$pth1' [$!]\n" if $V>1;
				next F1
			}
			if (($dev2,$ino2,undef,$nlnk2,$uid2,undef,undef,$size2,undef,$mtime2) = stat($pth2)) {
				print "$F: with d=$dev2 i=$ino2 u=$uid2 s=$size2 $fndx2:'$pth2'\n" if $V>1;
			} else {
				print "$F: Can't stat '$pth2' [$!]\n" if $V>1;
				next F2;
			}
			print "$F: Compare device numbers $dev1 $dev2 ...\n" if $V>1;
			if ($dev1 != $dev2) {	# Different filesystems?
				print "$F: Device numbers differ; can't link.\n" if $V>1;
				next F2;
			}
			print "$F: Compare inode numbers $ino1 $ino2 ...\n" if $V>1;
			if ($ino1 == $ino2) {	# Same device and inode number?
				print "$cursiz	'$pth1' '$pth2' already linked.\n" if $V>1;
				next F2;
			}
			print "$F: Compare user id numbers $uid1 $uid2 ...\n" if $V>1;
			if ($uid1 != $uid2) {
				print "$F: User ID numbers differ; files have different owners.\n" if $V>1;
				next F2;
			}
			print "$F: Compare sizes $size1 $size2 ...\n" if $V>2;
			if ($size1 != $cursiz) {
				print "$cursiz	### '$pth1' changed from $cursiz to $size1 bytes!\n" if $V>1;
				next F1;
			}
			if ($size2 != $cursiz) {
				print "$cursiz	### '$pth2' changed from $cursiz to $size2 bytes!\n" if $V>1;
				next F2;
			}
			if ($newer) {	# Use newer file
				print "$F: Link to newer file.\n" if $V>3;
				if ($mtime1 < $mtime2) {
					print "$F: Link to newer file '$pth2'\n" if $V>3;
					&onepair($fndx2,$fndx1);
				} elsif ($mtime1 > $mtime2) {
					print "$F: Link to newer file '$pth1'\n" if $V>3;
					&onepair($fndx1,$fndx2);
				} elsif ($nlnk1 < $nlnk2) {
					print "$F: Link to more-linked file '$pth2'\n" if $V>3;
					&onepair($fndx2,$fndx1);
				} else {
					print "$F: Link to first file '$pth1'\n" if $V>3;
					&onepair($fndx1,$fndx2);
				}
			} else {	# Use older file
				print "$F: Link to older file.\n" if $V>3;
				if ($mtime1 > $mtime2) {
					print "$F: Link to older file '$pth2'\n" if $V>1;
					&onepair($fndx2,$fndx1);
				} elsif ($mtime1 < $mtime2) {
					print "$F: Link to older file '$pth1'\n" if $V>1;
					&onepair($fndx1,$fndx2);
				} elsif ($nlnk1 < $nlnk2) {
					print "$F: Link to more-linked file '$pth2'\n" if $V>1;
					&onepair($fndx2,$fndx1);
				} else {
					print "$F: Link to first file '$pth1'\n" if $V>1;
					&onepair($fndx1,$fndx2);
				}
			}
		}
	}
	if ($forking) {				# Are we in a subprocess?
		exit 0 if $chld == 0;	# Exit if we're in child
	}
}

sub getdata {my $F='getdata';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Get the data for one file, and save it in  $fbuf[$fndx].   This  overwrites #
# whatever  data was in $fbuf[$fndx].  The return value is the data, or undef #
# if we can't read the file.                                                  #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($fndx,$path) = @_;
	local($data);
	print "$F: Get data for file $fndx:'$path'\n" if $V>2;
	if (open(F,$path)) {	# Try to read the file
		local($sm) = $/;	# Save the line mode
		$/ = undef;			# Enable slurp mode
		$data = <F>;		# Slurp up the file's data
		print "$F: Got " . length($data) . " bytes from '$path'\n" if $V>3;
		$/ = $sm;			# Restore the line mode
		close F;
	} else {
		$data = undef;
		print STDERR "$F: Can't read file $fndx:'$path' [$!]\n" if $V>1;
	}
	$fbuf[$fndx] = $data;	# Remember the data for this file.
	print "$F: FILE $fndx:'$path' CONTAINS {\n$data}\n" if $V>2;
	return $data;
}

sub samedata {my $F='samedata';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Test whether two files have the same contents.  The return value  is  1  if #
# they  are  identical,  0  if  not.  By the time we get here, we should have #
# determined that they are the same size, and are candidates for linking. But #
# we're  a  bit  paranoid,  so  we do a bit of error checking here and return #
# false if they end up to not be the same length after all.                   #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($fx1,$fx2) = @_;
	local($fp1) = $fpth[$fx1];
	local($fp2) = $fpth[$fx2];
	local($fl1,$fl2);
	local($fd1,$fd2);
	local($fe1,$fe2);
	local($l);
	print "$F: cursiz=$cursiz bufthresh=$bufthresh.\n" if $V>1;
	if ($cursiz < $bufthresh) {		# Buffering data for this size?
		print "$F: Buffering data for $cursiz bytes.\n" if $V>1;
		if (defined($fd1 = $fbuf[$fx1])) {
			$l = length($fd1);
			print "$P: $l bytes saved for file $fx1='$fp1'\n" if $V>2;
		} else {
			print "$P: No data buffered for file $fx1='$fp1'\n" if $V>2;
			$fd1 = &getdata($fx1,$fp1);
			$l = length($fd1);
			print "$P: $l bytes read for file $fx1='$fp1'\n" if $V>2;
		}
		if (defined($fd2 = $fbuf[$fx2])) {
			$l = length($fd2);
			print "$P: $l bytes saved for file $fx2='$fp2'\n" if $V>2;
		} else {
			print "$P: No data buffered for file $fx2='$fp2'\n" if $V>2;
			$fd2 = &getdata($fx2,$fp2);
			$l = length($fd2);
			print "$P: $l bytes read for file $fx2='$fp2'\n" if $V>2;
		}
		if (!defined($fd1)) {
			print "$F: No data for $fx1:'$fp1'\n" if $V>1;
		} elsif (!defined($fd2)) {
			print "$F: No data for $fx1:'$fp2'\n" if $V>1;
		} elsif ($fd1 eq $fd2) {
			print "$F: Files $fx1:'$fp1' $fx2:'$fp2' equal.\n" if $V>1;
			return 1;
		} else {
			print "$F: Files $fx1:'$fp1' $fx2:'$fp2' differ.\n" if $V>1;
			return 0;
		}
	}
	print "$F: Not buffering data for $cursiz bytes.\n" if $V>1;
	unless (open(F1,$fp1)) {
		print STDERR "$P/$F: ### Can't read $fx1:'$fp1' [$!]\n" if $V>0;
		return 0;
	}
	unless (open(F2,$fp2)) {
		print STDERR "$P/$F: ### Can't read $fx2:'$fp2' [$!]\n" if $V>0;
		return 0;
	}
	while (!$fe1 && !$fe2) { # EOF on either?
		$fd1 = sysread(F1,$fl1,$bufsiz);
		$fd2 = sysread(F2,$fl2,$bufsiz);
		if (!defined $fd1) {
			print STDERR "$P/$F: ### Error [$!] reading $fx1:'$fp1'\n" if $V>0;
			return 0;		# Treat them as different
		}
		if (!defined $fd2) {
			print STDERR "$P/$F: ### Error [$!] reading $fx2:'$fp2'\n" if $V>0;
			return 0;		# Treat them as different
		}
		if ($fd1 == 0 && $fd1 == 0) {	# EOF test
			print "$F: EOF on both files.\n" if $V>3;
			print "$F: Files $fx1:'$fp1' $fx2:'$fp2' are equal.\n" if $V>2;
			return 1;			# They're the same
		}
		$fe1 = 1 if $fd1 == 0;	# EOF on file 1?
		$fe2 = 1 if $fd2 == 0;	# EOF on file 2?
		if ($fd1 != $fd2) {		# Did we get the same length from both?
			print "$F: Files $fx1:'$fp1' $fx2:'$fp2' different sizes.\n" if $V>2;
			return 0;
		}
		if (!$fl1 && !$fl2) {	# Can this happen?
			print "$F: Files $fx1:'$fp1' $fx2:'$fp2' both empty.\n" if $V>1;
			return 1;			# Treat them as the same if this happens
		}
		if ($fl1 ne $fl2) {
			print "$F: Files fx1:'$fp1' $fx2:'$fp2' different.\n" if $V>1;
			return 0;
		}
		# Loop and get another chunk from each file
	}
	# Is this possible?  Maybe, if either file is changing.
	print "$P/$F: ### Got EOF on only $fx1:'$fp1'\n" if $fe1 && $V>0;
	print "$P/$F: ### Got EOF on only $fx2:'$fp2'\n" if $fe2 && $V>0;
	return 0;	# Treat them as different
}

sub Vopt {my $F = 'Vopt';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#   &Vopt("5myname.log");                                                     #
# Set  the  verbosity from various environment variables.  The value may be a #
# verbose level (1 digit), plus an optional output file name.  The file V  is #
# opened to the file, if any, or STDERR by default. The default value for the #
# verbosity level is 1, which generally means to produce only  serious  error #
# messages.                                                                   #
#                                                                             #
# Here's how this routine is typically called:                                #
#   ($P = $0) =~ s'.*/''; # Program name without directories                  #
#   &Vopt($ENV{"V_$P"} || $ENV{"D_$P"} || $ENV{"T_$P"} || '1');               #
# That's for when you want to call this a "verbose" and "debug"  and  "trace" #
# facility.  I mostly just use the V_$P environment variable.                 #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	$Vopt = shift || '1';			# Make sure we have a verbosity string
	$V = 1 unless defined $V;
	($P = $0) =~ s'.*/'' unless defined($P);
	print "<br>$P/$F: Vopt=\"$Vopt\"<br>\n" if $V>2;
	if ($Vopt =~ /^\s*(\d+)(.+)$/) {
		$V = int($1);				# Verbose level
		$Vfil = $2;					# Verbose output file
		if ($Vfil ne '') {
			if (open(V,">>$Vfil")) {	# Try to append to the file
				open(STDERR,">>&V");	# Switch STDERR over to V
			} else {
				print STDERR "$P/$F: Can't write Vfil=\"$Vfil\" ($!) [V='$1']\n" if $V>0;
				print STDERR "$P/$F: Vopt=\"$Vopt\" ($!) [V='$1']\n" if $V>0;
				open(V,">>&STDERR");	# STDERR is the default 
			}
		}
	} elsif ($Vopt =~ /^\s*(\d+)$/) {	# erbose level only
		$V = int($1);			# Verbose level
		print "$P/$F: Writing to STDERR.\n" if $V>0;
		open(V,">>&STDERR");	# Set V to STDERR
	} elsif ($Vopt) {				# File name only?
		if (open(V,">>$Vopt")) {	# Try to append to the file
			open(STDERR,">>&V");	# Switch STDERR over to V
		} else {
			print V "$P/$F: Can't write Vopt=\"$Vopt\" ($!)\n" if $V>0;
			open(V,">>&STDERR");	# STDERR is the default 
		}
	} else {
		$V ++;					# Null arg, just increment the verbose level
		open(V,">>&STDERR");	# And write to STDERR
	}
	select V; $| = 1;			# Make V unbuffered
	select STDOUT; $| = 1;		# Make sure STDERR is unbuffered
	$esep = '=' x 70;
	print V "\n$P $esep\n" if $V>1;	# Tell the world if we're verbose
	print V "$P started with V=$V [pid=$$] ", `date` if $V>1;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Get the current data and time.  We leave the integer  timestamp  in #
# $now,  and  the OSI date/time in $cymdhms.  We also leave a shorter #
# date/time string without the century and year in $mdhms. Our return #
# value is $now, the Unix integer timestamp.                          #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

sub dt {
	local($ss,$mm,$hh,$DD,$MM,$CY) = gmtime($now = time);
	$CY += 1900;
	$MM += 1;
	$hms = sprintf('%02d%02d%02d',$hh,$mm,$ss);
	$cymd = sprintf('%02d%02d%02d',$CY,$MM,$DD);
#	$mdhms = sprintf('%02d%02d$hms',$MM,$DD);
#	$cymdhms = "$cymd$hms";
	$cymd_hms = "$cymd $hms";
	return $now;
}

sub hms {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Convert a second counter into hh:mm:ss format.  We trim  away  any  initial #
# zeroes and commas                                                           #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($i) = @_;
	local($v) = sprintf("%02d",$i % 60);
	$i /= 60;
	if ($i) {
		$v = sprintf("%02d",$i % 60) . ':' . $v;
		if ($i) {
			$i /= 60;
			$v = sprintf("%d",$i % 60) . ':' . $v;
		}
	}
	$v =~ s/^[0:]*//;	# Delete extra initial zeroes
	return $v;
}
