#!/usr/bin/perl
use Getopt::Std ;  

# ---------------------------------------------------------------------------
# Main script.
#
# Builds two lookup tables keyed on a whitespace-separated column:
#   %line1 - keys wanted: first column of File1, or the single name given
#            with -n (in which case the only file argument is File2)
#   %line2 - lines of File2 keyed on their first column (second with -s)
# and then prints, for every wanted key in sorted order, whatever the
# selected output options ask for.
#
# Option flags are the $opt_X package variables set by getopts().  The
# letters d and M are accepted on the command line, but nothing in this
# section reads them.
# ---------------------------------------------------------------------------
getopts("fhbmdizasn:FM") ;
usage() if $opt_h;

# Test -n for definedness rather than truth, so that a name such as "0"
# is still treated as a name.
my $use_name = defined $opt_n;

# Show the help text instead of failing on an empty file name.
usage() if @ARGV < ( $use_name ? 1 : 2 );

my %line1;
my %line2;

if ($use_name) {
  # Single-key mode: the name is both the key and the "line" of File1.
  $line1{$opt_n} = $opt_n;
}
else {
  my $file1 = $ARGV[0];
  # Three-argument open: the file name can never be read as a mode or pipe.
  open( my $fh1, '<', $file1 ) or die "Could not open $file1: $!\n";
  while ( my $row = <$fh1> ) {
    my ($key) = split ' ', $row;    # first whitespace-separated column
    chomp $row;
    $line1{$key} = $row;            # a later duplicate key overwrites an earlier one
  }
  close $fh1;
}

my $file2 = $use_name ? $ARGV[0] : $ARGV[1];
open( my $fh2, '<', $file2 ) or die "Could not open $file2: $!\n";

if ($opt_i) {
  # -i: pass the first line of File2 straight through and keep it out of
  # the lookup table.
  my $header = <$fh2>;
  print $header if defined $header;
}

while ( my $row = <$fh2> ) {
  my @cols = split ' ', $row;
  chomp $row;

  my $key = $opt_s ? $cols[1] : $cols[0];

  # -F: a key of exactly 12 characters loses its 11th (penultimate) one.
  $key =~ s/^(..........).(.)$/$1$2/ if $opt_F;

  $line2{$key} = $row;
}
close $fh2;

# The plain "print matching File2 lines" behaviour applies only when none
# of the explicit output options was requested.
my $default_output = !( $opt_m || $opt_f || $opt_b || $opt_z || $opt_a );

foreach my $key ( sort mysort keys %line1 ) {
  my $found = exists $line2{$key};

  # -a: File1 line and File2 line side by side (File2 part empty if absent).
  print "$line1{$key}\t $line2{$key}\n" if $opt_a;

  # -m: only the File1 lines that have no partner in File2.
  print "$line1{$key} \n" if $opt_m && !$found;

  # -f / -b: one line per wanted key; File2's line when present, otherwise
  # File1's line (tagged MISSING under -b).
  if ( $opt_f || $opt_b ) {
    if ($found) {
      print "$line2{$key}\n";
    }
    elsif ($opt_b) {
      print "$line1{$key} MISSING \n";
    }
    else {
      print "$line1{$key}\n";
    }
  }

  # -z: matching File2 lines; also collected in the package array
  # @genotypes, which nothing in this section reads back.
  if ( $opt_z && $found ) {
    print "$line2{$key}\n";
    push( @genotypes, $line2{$key} );
  }

  print "$line2{$key}\n" if $default_output && $found;
}



# Sort comparator (reads the package variables $a and $b set by sort).
# Two keys are compared numerically only when neither contains a
# non-digit character; otherwise they are compared as strings.
sub mysort {
    my $digits_only = ( $a !~ /\D/ ) && ( $b !~ /\D/ );
    return $digits_only ? ( $a <=> $b ) : ( $a cmp $b );
}
  # usage()
  #
  # Aborts the script by die-ing with the multi-line help text below.
  # Takes no arguments and never returns.  The text ends in a newline,
  # so Perl does not append a file/line suffix to it.
  #
  # NOTE(review): the help text describes -d, but no code visible in this
  # file acts on that flag, and -M is accepted by the getopts() option
  # string without being described here - confirm whether either is
  # still intended.
  sub usage {

die "      
Usage:	find_match.pl OPTIONS <File1> <File2>  
         
 Finds matches between first column of File1 and first column of File2

	-f		outputs all of file 1  (first column only)
	-b		output all of file 1 and print MISSING if absent in file 2
	-m		print missing lines only
	-i		include the first line of File 2 (eg for headers)
	-d              search for duplicates (for genotype files only)
			(use this when combining genotype files)
			Found duplicates are output in a file called Duplicates
	-a		print all columns of file 1
        -z              print matching columns of file 2
        -s              match to the second column in file 2
	-n		give a name to look for in the first column rather than a file
        -F              reduce search item to 11 digits (remove penultimate digit)
";
 
}

# match_ids(@lines)
#
# Merges lines of the form "name g1 g2 ..." into the package hash %match
# and keeps a record of duplicates.
#
# Each argument is parsed into its first three whitespace-separated
# tokens.  Only names that already have an entry in %match are processed;
# this routine never creates new %match keys.  For such a name, with
# "g1<TAB>g2" as the new value:
#
#   * stored value is "ND\tND"   -> it is replaced by the new value;
#   * stored value equals new    -> $MatchedDuplicates{name}[0] is
#                                   incremented and "\t<new>" is appended
#                                   to $MatchedDuplicates{name}[1];
#   * stored value differs       -> $MisMatchedDuplicates{name}[0] is
#                                   incremented and "\t<new>" is appended
#                                   to $MisMatchedDuplicates{name}[1]
#                                   (seeded on first use with
#                                   "\t<stored> Duplicates - ").  If the
#                                   stored pair has two equal values and
#                                   the new pair has two different ones,
#                                   the new pair replaces the stored one,
#                                   so that a heterozygote is kept when
#                                   there is one.
#
# Arguments that do not contain three tokens are skipped.  The caller's
# arguments are never modified.  Returns nothing useful.
sub match_ids {

    foreach my $entry (@_) {

        my ( $name, $g1, $g2 ) = $entry =~ /(\S+)\s+(\S+)\s+(\S+).*/;

        next unless defined $name;           # line did not parse
        next unless exists $match{$name};    # only known names are updated

        my $string = $g1 . "\t" . $g2;

        if ( $match{$name} eq "ND\tND" ) {
            $match{$name} = $string;
            next;
        }

        if ( $match{$name} eq $string ) {
            # Identical duplicate: only count and log it.  The stored
            # value already equals the new one, so there is nothing to
            # replace.
            ++$MatchedDuplicates{$name}[0];
            $MatchedDuplicates{$name}[1] .= "\t" . $string;
            next;
        }

        # Conflicting duplicate.  Start the log entry with the value that
        # was stored when the first conflict was seen.
        unless ( $MisMatchedDuplicates{$name}[0] > 0 ) {
            $MisMatchedDuplicates{$name}[1] =
                "\t" . $match{$name} . " Duplicates - ";
        }
        ++$MisMatchedDuplicates{$name}[0];
        $MisMatchedDuplicates{$name}[1] .= "\t" . $string;

        # Match against the stored value directly.  Assigning it to $_
        # here would overwrite the caller's argument, because the loop
        # variable of a foreach over @_ aliases it.
        my ( $compg1, $compg2 ) = $match{$name} =~ /(\S+)\s+(\S+)/;

        # NOTE(review): these are numeric comparisons, as in the original;
        # non-numeric values all compare as 0 - confirm the values are
        # always numbers.
        if ( $compg1 == $compg2 && $g1 != $g2 ) {
            $match{$name} = $string;
        }
    }

    return;
}
