#!/imsa/bin/perl -w

# Lexical Steganography Encoder v1.0
# Copyright 1998 Keith Winstein <keithw@imsa.edu>

$oversample = 4;
$max_mode = 4;

open DATA, 'tlex.data';;
while (<DATA>) {
  my ($word, @synset) = split;
  $wordcache{ $word } = [ $word, @synset ];
}
close DATA;

%huffman = ( ' ' => "111",
	     E => "000",
	     T => "1101",
	     A => "1011",
	     I => "1001",
	     O => "1000",
	     R => "0111",
	     S => "0110",
	     N => "0100",
	     H => "11001",
	     C => "10101",
	     L => "10100",
	     D => "01011",
	     M => "00111",
	     U => "00110",
	     P => "00100",
	     F => "110001",
	     G => "110000",
	     B => "010100",
	     W => "001011",
	     Y => "001010",
	     V => "0101010",
	     K => "01010110",
	     X => "010101110",
	     Q => "0101011110",
	     J => "01010111110",
	     Z => "01010111111"
	   );

my $bitstring;

for (split '', shift) { $bitstring .= $huffman{ uc $_ } }

sub nextbit {
  return(undef) if ($bitstring eq '');
  my $x = substr($bitstring, 0, 1);
  unless (length $bitstring == 0) {
    $bitstring = substr($bitstring, 1);
  } else {
    $bitstring = '';
  }
  return $x;
}

my $line_num = -1;
my @nonwordrec;
my %synsetable;
my %ordinal;
my $s = 0;

while (defined(my $line = <>)) {
  $line_num++;

  chomp $line;
  $line = 'a ' . $line;

  my @words = split /[^A-Za-z\-]+/, $line;
  my @nonwords = split /[A-Za-z\-]+/, $line;

  $wordrec[ $line_num ] = \@words;
  $nonwordrec[ $line_num ] = \@nonwords;

  my $word_num = -1;
 WORD: for my $word (@words) {
    $word_num++;
    next WORD unless (defined($wordcache{ $word }));
    $synsetable{ $line_num . ' ' . $word_num } = $wordcache{ $word };
    $ordinal{ $line_num . ' ' . $word_num } = $s++;

    print STDERR "Looking at $word (", (join ", ", @{$wordcache{ $word }})
      , ") \n";
  }
}

my $variance = 0;
for (keys %synsetable) {
  print STDERR "Eyeing $_\n";
  $variance += (log (scalar @{$synsetable{ $_ }}))/(log 2);
}

print STDERR "Length of message: ", length $bitstring, "\n";
print STDERR "Variance bits in text: $variance\n";

my $mode = int($variance / length $bitstring);

if ($mode > $max_mode) {
  $mode = $max_mode;
}

unless ($mode >= 1) {
  print STDERR "Sorry, there is not enough entropy to encode the message.\n";
  exit 5;
}

print STDERR "Using mode $mode encoding frequency.\n";

my @zones = sort { $ordinal{ $a } <=> $ordinal{ $b } } keys %synsetable;

while (length $bitstring > 0) {
  print STDERR "String: $bitstring, Length: ", length $bitstring, "\n";
  my @bit_array;
  for (1 .. $oversample) {
    my $bit = nextbit;
    last unless (defined $bit);
    push @bit_array, $bit;
    print STDERR "pushed: $bit_array[ $_ - 1 ] ", scalar(@bit_array), "\n";
  }
  my $bits_assigned = 0;
  my @these_zones;
  while ($bits_assigned < $mode * scalar(@bit_array)) {
    push @these_zones, (my $z = shift @zones);
    $bits_assigned += (log(scalar @{$synsetable{ $z }}))/(log 2);
  }
  encode(\@bit_array, \@these_zones);
}

sub encode {
  my ($r_bits, $r_zones) = @_;

  my (@bits) = @{$r_bits};
  my (@zones) = @{$r_zones};

  my $num_bits = scalar @bits;

  my @choices;
  my @word;

  print STDERR "Encoding @bits into ", (join ", ", @zones), "\n";

  for my $i (0 .. $#zones) {
    my ($l, $w) = split /\s+/, $zones[$i];
    $word[$i] = $wordrec[ $l ][ $w ];
    @{$choices[$i]} = sort @{$wordcache{ $word[$i] }};
  }

  print STDERR "word: @word\n";

  print STDERR "*\n";
  my @vector;
  WORD: for my $i (0 .. $#word) {
    for my $j (0 .. (scalar @{$choices[$i]} - 1)) {
      print STDERR "-";
      if ($choices[$i][$j] eq $word[$i]) {
	push @vector, $j;
	next WORD;
      }
    }
  }

  my @iterarray = (0) x (scalar @word);
  my @codearray = (0) x $num_bits;
  my @codes;

  my $iter = 0;

  print STDERR "^\n";
  while (scalar @iterarray == scalar @word) {
    $iter++;
    print STDERR "." unless ($iter % 1000);
    
    my $equal = 1;
    for (my $q = 0; $q <= $#bits; $q++) {
      $equal = 0 unless ($codearray[$q] eq $bits[$q]);
    }

    if ($equal) {
      push @codes, (join '', @iterarray);
    }

    $iterarray[ 0 ]++;
    $codearray[ 0 ]++;
    for my $i (0 .. $#iterarray) {
      if ($iterarray[ $i ] >= scalar @{$choices[$i]}) {
	$iterarray[ $i ] = 0;
	$iterarray[ $i + 1 ]++;
      }
    }
    
    if (scalar @codearray > $num_bits) {
      pop @codearray;
      $codearray[ 0 ]++;
      for my $i (0 .. $#codearray) {
	if ($codearray[ $i ] >= 2) {
	  $codearray[ $i ] = 0;
	  $codearray[ $i + 1 ]++;
	}
      }
    }
    for my $i (0 .. $#codearray) {
      if ($codearray[ $i ] >= 2) {
	$codearray[ $i ] = 0;
	$codearray[ $i + 1 ]++;
      }
    }
  }
  
  print STDERR "bits: {", join '', @bits, "} \n";

  print STDERR "finding best in ", scalar @codes, " encodings ...";
  
  my $best = v_dist(\@vector, $codes[0]);
  my $best_num = 0;
  my $itera = 0;
  for (@codes) {
    if (v_dist(\@vector, $_) < $best) {
      $best = v_dist(\@vector, $_);
      $best_num = $itera;
    }
    $itera++;
  }
  print STDERR "best_num: $best_num\n";
  my @enc_choices = split '', $codes[$best_num];
  print STDERR "done\n";
  print STDERR "choices: @enc_choices (",
  v_dist(\@vector, $codes[$best_num]), ")\n";
 
  for my $i (0 .. $#enc_choices) {
    my ($l, $w) = split /\s+/, $zones[$i];
    $wordrec[ $l ][ $w ] = $choices[$i][ $enc_choices[$i] ];
  }
}

sub v_dist {
  my ($vec_ref, $p) = @_;
  my ($dist) = (0, 0);

  for $i (0 .. (scalar @{$vec_ref} - 1)) {
    unless ( (@{$vec_ref})[$i] eq substr($p, $i, 1) ) {
      $dist++;
    }
  }
  return ($dist);
}

for my $i (0 .. $line_num ) {
  my $line = '';
  *words = $wordrec[ $i ];
  *nonwords = $nonwordrec[ $i ];
  for (my $i = 0; $i < scalar @words + scalar @nonwords; $i++) {
    $line .= ($i % 2 ? $words[($i - 1) / 2] : $nonwords[$i / 2]);
  }
  $line = substr ($line, 2);
  print $line, "\n";
}
