{-# LANGUAGE LambdaCase, ScopedTypeVariables, GeneralizedNewtypeDeriving #-} {- Copyright 2014 Ken Takusagawa This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . -} module Main where { import qualified Data.Map as Map; import qualified Data.Set as Set; import Data.Set(Set); import Data.List(inits, tails, sort, intersperse,foldl',foldl1'); -- foldl' does not actually help import Data.Maybe; import Data.Tuple(swap); import Control.Monad; import System.IO; import System.Environment; import Text.Printf; import Control.Parallel.Strategies(withStrategy,NFData,parListChunk,rdeepseq); import Control.DeepSeq(rnf); newtype Count = Count Integer deriving (Ord,Eq,Show); instance NFData Count where { rnf (Count i) = rnf i; }; newtype Length = Length Int deriving (Show, Enum, Eq, Ord); the_corpus :: [(String,Count)]; the_corpus = [("hello",Count 2),("the",Count 10)]; type F a = Map.Map [a] Count; type Entry a = (Count, [a]); shorts :: Length -> [a] -> [[a]]; shorts (Length n) = takeWhile (\(s :: [a]) -> n == length s) . map (take n) . tails; in_previous_acceptable :: (Ord a) => Set [a] -> [a] -> Bool; in_previous_acceptable ss x = Set.member (init x) ss; type Acceptables a = (Length, Set [a]); new_acceptables :: (Ord a) => Acceptables a -> [a] -> [[a]]; new_acceptables (n,prev) x = filter (in_previous_acceptable prev) $ shorts (succ n) x; null_acceptables :: (Ord a) => Acceptables a; null_acceptables = (Length 0, Set.fromList [[]]); count_plus :: Count -> Count -> Count; count_plus (Count x) (Count y) = Count $ x+y; add_single :: (Ord a) => Count -> F a -> [a] -> F a; add_single c f s = Map.insertWith' count_plus s c f; add_word :: (Ord a) => Acceptables a -> F a -> Entry a -> F a; add_word aa f (c, s) = foldl' (add_single c) f $ new_acceptables aa s; process_corpus :: (Ord a) => [Entry a] -> Acceptables a -> F a; process_corpus corpus aa = foldl' (add_word aa) Map.empty corpus; thresh_f :: Count -> F a -> Set [a]; thresh_f n = Map.keysSet . Map.filter (\(x :: Count) -> n <= x); -- output format of "sort | uniq -c" parse :: String -> Entry Char; parse s = case words s of { [n,w] -> (Count $ read n, w); _ -> error $ "unable to parse " ++ s; }; get_assocs :: Count -> F a -> [Entry a]; get_assocs thresh = map swap . Map.assocs . Map.filter (\(v :: Count) -> thresh <= v); mprint :: Count -> (Length, F Char) -> IO (); mprint thresh (n,f) = do { putStr $ show n ++ " "; print $ reverse $ sort $ get_assocs thresh f; }; type Mstring = [Maybe Char]; imatch :: (Eq a, MonadPlus m) => [a] -> [a] -> m [a]; imatch [] long = return long; imatch (s:srest) (l:lrest) = if (s==l) then imatch srest lrest else mzero; imatch _ [] = mzero; splits :: [a] -> [([a],[a])]; splits l = zip (inits l) (tails l); trymatch :: (Eq a, MonadPlus m) => [a] -> (b, [a]) -> m (b, [a]); trymatch short (i,long) = liftM ( (,) i ) $ imatch short long; tm2 :: String -> (String, String) -> [(String, String)]; tm2 = trymatch; -- returns long with all the shorts cut out of it. mmatch :: (Eq a) => [a] -> [a] -> [[a]]; mmatch short long = case (msum $ map (trymatch short) $ splits long) of { -- concatMap = msum . map Nothing -> return long; Just (i,rest) -> i:(mmatch short rest); }; substitute_entry :: (Eq a) => [a] -> Entry a -> [Entry a]; substitute_entry short (i,x) = liftM ( (,) i) $ filter (not.null) $ mmatch short x; one_search :: forall a . (Ord a) => Count -> [Entry a] -> [(Length,[Entry a])]; one_search thresh corpus = let { do_corpus :: Acceptables a -> F a; do_corpus = process_corpus corpus; one :: F a; one = do_corpus null_acceptables; do_more :: (Length, F a) -> (Length, F a); do_more (n, f) = (succ n, do_corpus (n, thresh_f thresh f)); } in takeWhile (not . null . snd) $ map (apSnd $ get_assocs thresh ) $ iterate do_more (Length 1,one); find_substitution :: (Ord a) => Count -> [Entry a] -> [a]; find_substitution thresh = snd . maximum . snd . last . one_search thresh; shrink_corpus :: (Ord a, MonadPlus m) => Count -> m ([a], [Entry a]) -> m ([a], [Entry a]); shrink_corpus thresh mc = do { c :: [Entry a] <- mc >>= return . snd; case find_substitution thresh c of { [_] -> mzero; -- singleton means nothing got cut out s -> return (s, concatMap (substitute_entry s) c); }}; analyze_best :: Count -> (String, [Entry Char]) -> IO (); analyze_best thresh (s,c) = do { let { one :: [[Entry Char]]; one = map snd $ one_search thresh c; }; putStrLn $ ':':s; putStrLn $ unwords $ zipWith (++) (map show $ enumFrom (0::Integer)) $ map snd $ reverse $ sort $ concat one; -- print $ reverse $ sort $ last one; }; logCount :: Count -> Double; logCount (Count c) = log $ fromInteger c; apFst :: (a->b) -> (a,x) -> (b,x); apFst f (p,q) = (f p,q); apSnd :: (a->b) -> (x,a) -> (x,b); apSnd f (p,q) = (p, f q); newtype LogProbability = LogProbability Double; instance Show LogProbability where { show (LogProbability p) = printf "%.3f" p; }; ------------------------------------------------------ data Composite_char = Leaf Char | Tree Cstring deriving (Eq, Ord); instance Show Composite_char where { showsPrec _ (Leaf c) = (c:); showsPrec _ (Tree cs) = ('(':) . showList cs . (')':); showList [] = id; showList (h:t) = shows h . showList t ; }; instance NFData Composite_char where { rnf (Leaf c) = rnf c; rnf (Tree cs) = rnf cs; }; type Cstring = [Composite_char]; newtype Score = Score Integer deriving (Show, Ord, Eq); make_composite :: Cstring -> Cstring -> Cstring; make_composite short = concat . intersperse [Tree short] . mmatch short; best_composite :: (Ord a) => [(Score,Entry a)] -> [a]; best_composite = snd . snd . head ; annotate_scores:: (Ord a) => Count -> [Entry a] -> [(Score,Entry a)]; annotate_scores thresh corpus = reverse $ sort $ do { (n :: Length, ee :: [Entry a]) <- one_search thresh corpus; e :: Entry a <- ee; return (score_after_composite n e,e); }; composite_corpus :: [Entry Composite_char] -> Cstring -> [Entry Composite_char]; -- this map could be parallelized composite_corpus corpus short = withStrategy (parListChunk 1000 rdeepseq) $ -- but this tends to make things worse the more cores used map (apSnd $ make_composite short) corpus; -- chunk 10000 is worse do_composite :: Score -> [Entry Composite_char] -> IO(); do_composite score_thresh corpus = do { let { with_single :: [(Score,Entry Composite_char)]; with_single = takeWhile (\(s :: Score,_) -> s >= score_thresh ) $ annotate_scores (Count 0) corpus; -- avoid single letter polygrams no_single :: [(Score, Entry Composite_char)]; no_single = filter (\(_,(_,s :: Cstring)) -> length s > 1) with_single; chunk :: [Cstring]; chunk = many_independent_words $ map (snd .snd) no_single; new_corpus :: [Entry Composite_char]; new_corpus = foldl' composite_corpus corpus chunk; }; -- print the full list with singletons just to see how they compare print with_single; if null no_single then return () else do { putStrLn $ many_cstring chunk; -- (benchmark) if length chunk==3 then return () else do_composite score_thresh new_corpus ; } }; many_cstring :: [Cstring] -> String; many_cstring = concat . intersperse " " . map show; thresh_to_score :: Count -> Score; thresh_to_score (Count n) = Score n; independent_set :: (Ord a, MonadPlus m) => Set a -> a -> m (Set a); independent_set s x = if Set.member x s then mzero else return $ Set.insert x s; independent_word :: (Ord a, MonadPlus m) => Set a -> [a] -> m (Set a); independent_word = foldM independent_set; withIndex :: (Ord a, MonadPlus m) => (Int, Set a) -> (Int,[a]) -> m (Int,(Set a)); withIndex (_,s) (index2,xs) = do { -- liftM ((,) index) $ independent_word s xs; out :: Set a <- independent_word s xs; return (index2, out); }; -- utility beforeFail :: (MonadPlus m) => (a -> b -> m a) -> a -> [b] -> m a; beforeFail _ start [] = return start; beforeFail f start (h:t) = mplus (do { new :: a <- f start h; beforeFail f new t; }) $ return start; many_independent_words :: (Ord a) => [[a]] -> [[a]]; many_independent_words [] = []; many_independent_words (h:t) = h: (take (fst $ fromJust $ beforeFail withIndex (0,Set.fromList h) $ zip [1..] t) t); -- the independent set is at least 1. The conservative optimization will flag a polygram with repeated characters as not an independent set on its own. ------------------------------------------------------ main :: IO (); main = do { corpus :: [Entry Char] <- getContents >>= return . map parse . lines; let { global_thresh :: Count; global_thresh = div_factor $ minimum $ Map.elems $ process_corpus corpus null_acceptables; one :: [[Entry Char]]; one = map snd $ one_search global_thresh corpus; single :: [Entry Char]; single = reverse $ sort $ concat one; scaling :: Count; scaling = foldl1' count_plus $ map fst $ head one; logScaling :: Double; logScaling = logCount scaling; logProbability :: Count -> LogProbability; logProbability c = LogProbability $ (logCount c - logScaling) ; -- /log 2 probability_output :: IO (); probability_output = do { let { s :: [(LogProbability,String)]; s = second_half $ map (apFst logProbability) $ single; pw :: ([LogProbability], [String]); pw = unzip s; }; print $ length s; putStrLn $ unwords (snd pw); putStrLn $ unwords $ map show $ fst pw; }; }; hSetBuffering stdout LineBuffering; putStrLn $ "# Thresh=" ++ show global_thresh; getArgs >>= \case { ["shrink"] -> mapM_ (analyze_best global_thresh . fromJust) $ takeWhile isJust $ iterate (shrink_corpus global_thresh) $ Just ("UGLYHACK",corpus); ["single"] -> putStrLn $ unwords $ map snd $ single; ["one"] -> probability_output; ["composite"] -> do_composite (thresh_to_score global_thresh) $ map (apSnd $ map Leaf) corpus; _ -> error "wrong arguments"; }}; -- threshold of the frequency of the least frequent letter (usually z) div_factor :: Count -> Count; div_factor (Count x) = Count $ div x 2; -- or 2 second_half :: [(a,String)] -> [(a,String)]; -- second_half = dropWhile (\(_,w :: String) -> w/="z"); second_half = id; score_after_composite :: Length -> Entry a -> Score; score_after_composite (Length n) ((Count c),_) = Score $ (fromIntegral n-0)*c; -- or 0 }