package ngrams; import java.io.BufferedReader; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.Scanner; public class Ngrams { // Main program public static void main(String[] args) throws IOException { String file1 = "", file2 = ""; float diff = 0; // Total frequency difference between two files char[] alpha = {'a','à','â','ä','b','c','ç','d','e','è','é','ê','ë', 'f','g','h','i','î','ï','j','k','l','m','n','o','ô','œ','p','q', 'r','s','t','u','ù','û','ü','v','w','x','y','ÿ','z'}; // Valid characters System.out.print("Input First File Name: "); file1 = read(file1); // Reads text from file System.out.print("Input Second File Name: "); file2 = read(file2); float bi1[][] = count(file1, alpha); // Creates frequency table for file float bi2[][] = count(file2, alpha); // Calculates total frequency difference for (int i = 0; i < alpha.length; i++) for (int j = 0; j < alpha.length; j++) diff += Math.abs(bi1[i][j] - bi2[i][j]); System.out.println(diff); if (diff < 55) // Experimentally determined threshold System.out.println("Same Language"); else System.out.println("Different Language"); } // Create frequency tables public static float[][] count(String file,char[] alpha) throws IOException { // 2-d frequency table of all possible bigrams float bigram[][] = new float[alpha.length][alpha.length]; for (int i = 0; i < alpha.length; i++) for (int j = 0; j < alpha.length; j++) bigram[i][j] = 0; // Initialize with frequency=0 int a = alpha.length+1, b = alpha.length+1; float total = 0; // Total number of bigrams Scanner in = new Scanner(new File(file+".txt")); while(in.hasNext("\\S+")) { // Read file word by word String word = in.next("\\S+"); word.toLowerCase(); for (int k = 0; k < word.length()-1; k++) { // For each pair of letters a = alpha.length+1; b = alpha.length+1; for (int m = 0; m < alpha.length; m++) { // Locates each letter within alpha list if (word.charAt(k) == alpha[m]) a = m; if (word.charAt(k+1) == alpha[m]) b = m; } if (a < alpha.length && b < alpha.length) { // Adds valid bigrams to frequency list bigram[a][b]++; total++; } } } // Create new file with frequency table PrintWriter out = new PrintWriter(new FileWriter(file+"_tab.csv")); for (int p = 0; p < alpha.length; p++) { for (int q = 0; q < alpha.length; q++) { bigram[p][q] = (bigram[p][q] / total) * 100; // Convert to decimal frequency out.print(bigram[p][q]); out.print(","); } out.print("\n"); } out.close(); return bigram; } // Read text from file public static String read(String str) { String str2 = ""; BufferedReader read = new BufferedReader(new InputStreamReader(System.in)); try { str2 = read.readLine(); } catch (IOException ioe) { System.out.println("Error: Cannot Read Input\n"); read(str); } return str2; } }