{- Generate a random probabilistic context-free grammar and sample it. Usage: generate a random grammar, with RNG seeded by seedstring1: ./a.out grammar seedstring1 show grammar in more detail: ./a.out grammarshow seedstring1 generate a random grammar with seedstring1, then repeatedly (infinitely) sample it with a RNG seeded by seedstring2, emitting parse trees. the parse tree of each sample is on its own line. ./a.out tree seedstring1 seedstring2 print out just the concatenated terminals. each sample is on its own line. ./a.out lines seedstring1 seedstring2 all the terminals of all the samples concatenated together: ./a.out run seedstring1 seedstring2 emit the height of each parse tree: ./a.out height seedstring1 seedstring2 more detailed statistics of the number of nodes at each depth in the parse tree: ./a.out run seedstring1 seedstring2 Copyright 2021 Ken Takusagawa This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . $Id: randomgrammar.hs,v 1.18 2021/01/26 11:34:59 kenta Exp $ -} {-# LANGUAGE ScopedTypeVariables, LambdaCase, PackageImports #-} -- ghc --make -Wall -c -igenericReplicateM -package extra -package safe -istringtf randomgrammar.hs module Main(main,trace_placeholder) where { import System.Environment(getArgs); import Control.Exception(assert); import Debug.Trace(trace); import Data.Function((&)); import Control.Category((>>>)); import Prelude hiding((.),(>>)); import qualified Data.List as List; import qualified Data.Map as Map; import Data.Map(Map); import Control.Monad.GenericReplicate(genericReplicateM); -- igenericReplicateM import qualified System.Random as Random; import qualified Data.Random as Randomfu; import Control.Monad.State.Lazy(evalState,State); -- lazy state monad also works import Data.Random.Internal.Source(GetPrim(GetPrim)); import Data.Random.Source.StdGen(getRandomPrimFromRandomGenState); import Data.Random.Distribution.Categorical(weightedCategorical); import Data.Random.List(shuffle); -- shuffle import qualified Data.Bifunctor as Bifunctor; -- (first, second) import qualified Control.Applicative as Applicative; -- liftA2 import qualified Extra; import Safe.Exact(zipExact); import StringTF(stringtf); import Data.Random.Distribution.Bernoulli(bernoulli); import System.IO(hSetBuffering,stdout,BufferMode(LineBuffering)); -- smaller more likely to deeply nest globalnonterminalpenalty :: Nonterminalpenalty; globalnonterminalpenalty = Nonterminalpenalty 0.08; -- be careful about high globalnonterminalpenalty and high myalternativelength (deep and wide = huge). maybe the tree is so wide that some child succeeds. maybe need superexponential. -- forbid grammars that can produce the empty string myemptyallowed :: Bool; myemptyallowed = True; mygenerationparameters1 :: Generationparameters; mygenerationparameters1 = Generationparameters { allnonterminals = map Nonterminal [0..9] , allterminals = letters --, allterminals = cvsyllables --, allterminals = cvnsyllables ++ punctuation --, allterminals = hexsubstitute --, allterminals = hexdigits -- these "length"s are given in terms of probabilities for a geometric -- distribution: the probability of growing just 1 more. larger means longer. -- number of alternatives , rulelength = geometricexpected 2.0 , atleasthowmanyterminals = 3 , proportionterminalalternatives = 0.1 , hasemptyalternative = if myemptyallowed then 0.5 else 0 , terminalalternativelength = geometricexpected 2.0 , nonterminalalternativelength = geometricexpected 3 -- symbols in a alternative containing a mix , symbolisterminal = 0.5 -- decay rate of number of repetitions. -- 0.5 limits to not very many repetitions. -- but repetitions ok if more than one terminal , kleenemaxcontinueprobability = geometricexpected 3.1 , repetitiondistribution = [(5,Once),(if myemptyallowed then 2 else 0,Zeroorone undefined),(if myemptyallowed then 1 else 0,Kleenestar undefined),(1,Kleeneplus undefined)] -- use of "undefined" is ugly --repetitiondistribution = [(5,Once),(0,Zeroorone undefined),(0,Kleenestar undefined),(1,Kleeneplus undefined)]; -- avoid expanding to empty string }; data Generationparameters = Generationparameters { allnonterminals :: [Nonterminal] , allterminals :: [Terminalchar] , rulelength :: Weight , terminalalternativelength :: Weight , nonterminalalternativelength :: Weight , kleenemaxcontinueprobability :: Weight , hasemptyalternative :: Weight , atleasthowmanyterminals :: Ii , proportionterminalalternatives :: Weight , symbolisterminal :: Weight , repetitiondistribution :: [(Rational,Repetition)] } deriving (Show,Eq); main :: IO(); main = getArgs >>= \case{ ["grammar",seed1] -> grammargenerator seed1 & showgrammar & putStr; ["grammarshow",seed1] -> grammargenerator seed1 & snd & mapM_ print; ["tree",seed1,seed2] -> grammargenerator seed1 & treetf seed2 & mapM_ print; ["lines",seed1,seed2] -> grammargenerator seed1 & treetf seed2 & mapM_ (compressoutputstring >>> putStrLn); ["run",seed1,seed2] -> grammargenerator seed1 & treetf seed2 & manycompress & putStr; ["height",seed1,seed2] -> grammargenerator seed1 & treetf seed2 & map height & mapM_ print; ["alldepths",seed1,seed2] -> do{hSetBuffering stdout LineBuffering; grammargenerator seed1 & treetf seed2 & mapM_ (calcdepths >>> collate >>> reverse >>> print)}; ["runmanyuniform"] -> runmanyuniform & mapM_ print; --- hangs _ -> undefined; }; type Ii = Integer; type Weight=Float; -- useful for (id :: Endo Integer) to assert a type in a pipeline type Endo a = a->a; -- to avoid the redundancy warning trace_placeholder :: (); trace_placeholder = (trace,assert,sss,letters,cvsyllables,cvnsyllables,hexdigits,hexsubstitute,punctuation) & (id >>> error) "trace_placeholder"; -- ghc -O2 -fno-ignore-asserts -- let's use RVar as the random monad mkstdgen2 :: Int -> Random.StdGen; mkstdgen2 = Random.mkStdGen >>> Random.random >>> (id :: Endo (Bool, Random.StdGen)) >>> snd; -- at least one of the GetPrim type annotations is required samplervar :: forall a gentype . (Random.RandomGen gentype) => gentype -> Randomfu.RVar a -> a; samplervar gen r = (GetPrim getRandomPrimFromRandomGenState :: GetPrim (State gentype)) & (Randomfu.runRVar r :: GetPrim (State gentype) -> State gentype a) & ((\state -> evalState state gen) :: State gentype a -> a); zipfweights :: [Rational]; --zipfweights = [1..num] & map (\x -> 1/(fromInteger x)); zipfweights = map recip [1..]; {- number of rules chosen in advance. for each rule, a distribution over other rules. for each rule, terminals. at least one terminal alternative. for each rule, nonterminals. terminals, 1 char or multichar nonterminals: sequence of nonterminals or terminals, kleene star, question mark assign weights to alternatives. not probabilities because rules with nonterminals will get scaled down due to nesting depth. single-nonterminal alternatives get treated like terminals. (not done) single-terminal amidst rest terminal get treated special, a generalization of tail call optimization. (not done) -} newtype Nonterminal = Nonterminal Ii deriving (Show, Ord, Eq); type Grammarmap = Map Nonterminal Rule; type Rule = [Alternative]; type Alternative = [(Symbol,Repetition)]; data Symbol = Sterminal Terminalchar | Snonterminal Nonterminal deriving (Show, Ord, Eq); type Terminalchar = String; data Repetition = Once | Zeroorone Weight | Kleenestar Weight | Kleeneplus Weight deriving (Show, Ord, Eq); -- this is a bad idea because more nonterminals causes terminal probabilities to change --allsymbols :: [Symbol]; --allsymbols = map Sterminal allterminals ++ map Snonterminal allnonterminals; sampleoneaccordingtoweights :: [(Rational,a)] -> Randomfu.RVar a; sampleoneaccordingtoweights = unzip >>> Bifunctor.first (doublize) >>> uncurry zip >>> weightedCategorical; -- cannot use Rational as weights because it is too tricky to sample a random Rational -- cannot use Ii because not fractional -- integerize >>> (fromInteger :: Ii -> Weight) silently returns Infinity and terrible things happen --- hopefully the compiler will avoid repeating the work of integerize sampleoneaccordingtozipf :: [a] -> Randomfu.RVar a; sampleoneaccordingtozipf = zip zipfweights >>> sampleoneaccordingtoweights; {- getlcm :: [Rational] -> Ii; getlcm = map Ratio.denominator >>> List.foldl' lcm 1; -- todo: handle zeros integerize :: [Rational] -> [Ii]; integerize l = let { mylcm :: Rational; mylcm = getlcm l & fromInteger; } in map (\x -> x*mylcm & (\y -> if Ratio.denominator y == 1 then Ratio.numerator y else error "lcm did not work")) l; -} doublize :: [Rational] -> [Weight]; doublize l = let { biggest :: Rational; biggest = maximum l; } in map (\x -> x/biggest) l & map fromRational; keepsamplinggeometrically :: forall a . Weight -> Randomfu.RVar a -> Randomfu.RVar [a]; keepsamplinggeometrically threshold x = bernoulli threshold >>= \case { True -> do { item :: a <- x; rest :: [a] <- keepsamplinggeometrically threshold x; return $ item: rest; }; False -> return []; }; keepsamplingwithminimum :: Ii -> Weight -> Randomfu.RVar a -> Randomfu.RVar [a]; keepsamplingwithminimum minlength threshold x = Applicative.liftA2 (++) (genericReplicateM minlength x) (keepsamplinggeometrically threshold x); sss :: Int -> Randomfu.RVar a -> a; sss seed = samplervar (mkstdgen2 seed); -- probably want Extra.nubOrd -- sampling over nonterminals. distribution set at alternative, or rule, or grammar? let's do alternative. createalternative :: Generationparameters -> [Terminalchar] -> [Nonterminal] -> Randomfu.RVar Alternative; createalternative generationparameters terminalsymbols nonterminalsymbols = bernoulli (proportionterminalalternatives generationparameters) >>= (\isterminal -> case isterminal of { True -> terminalsymbols & map Sterminal & sampleoneaccordingtozipf & keepsamplingwithminimum 1 (terminalalternativelength generationparameters); False -> samplesymbol generationparameters terminalsymbols nonterminalsymbols & keepsamplingwithminimum 1 (nonterminalalternativelength generationparameters); -- make minimum length 1 because empty string was being too common as an alternative }) >>= mapM (andrepetition generationparameters); samplesymbol :: Generationparameters -> [Terminalchar] -> [Nonterminal] -> Randomfu.RVar Symbol; samplesymbol generationparameters terminals nonterminals = bernoulli (symbolisterminal generationparameters) >>= \case{ True -> sampleoneaccordingtozipf (map Sterminal terminals); False -> sampleoneaccordingtozipf (map Snonterminal nonterminals); }; andrepetition :: Generationparameters -> Symbol -> Randomfu.RVar (Symbol,Repetition); andrepetition generationparameters x = case x of { Sterminal _ -> return (x,Once); Snonterminal _ -> samplerepetitiontype generationparameters >>= (\y -> return (x,y)) }; samplerepetitiontype :: Generationparameters -> Randomfu.RVar Repetition; samplerepetitiontype generationparameters = do { s1 :: Repetition <- sampleoneaccordingtoweights $ repetitiondistribution generationparameters; -- s1 has bogus (undefined) attached as probabilities case s1 of { Once -> return s1; -- these dont care pattern matches erase undefined. this is ugly. --Zeroorone _ -> return $ Zeroorone 0.5; -- this could be tuned Zeroorone _ -> Randomfu.stdUniform >>= (Zeroorone >>> return); Kleenestar _ -> Randomfu.uniform 0 (kleenemaxcontinueprobability generationparameters) >>= (Kleenestar >>> return); Kleeneplus _ -> Randomfu.uniform 0 (kleenemaxcontinueprobability generationparameters) >>= (Kleeneplus >>> return); }; }; isterminalalternative :: Alternative -> Bool; isterminalalternative = map (\case {(Snonterminal _,_)->False;_->True}) >>> and; -- empty alternative is terminal {- keepsamplinguntil :: (a -> Bool) -> Randomfu.RVar a -> Randomfu.RVar a; keepsamplinguntil predicate f = do { x <- f; if predicate x then return x else keepsamplinguntil predicate f }; -} -- each rule has local preferences for terminals and nonterminals which extend over all alternatives in the rule createrule :: Generationparameters -> Randomfu.RVar Rule; createrule generationparameters = do { terminalsymbols <- shuffle $ allterminals generationparameters; nonterminalsymbols <- shuffle $ allnonterminals generationparameters; someterminals :: [Alternative] <- ((map Sterminal terminalsymbols & sampleoneaccordingtozipf & keepsamplingwithminimum 1 (terminalalternativelength generationparameters) ) >>= mapM (andrepetition generationparameters) ) & genericReplicateM (atleasthowmanyterminals generationparameters); -- let's ignore the possibility that duplication could result in a number less than atleasthowmanyterminals. key is that there is at least 1. nonterminals :: [Alternative] <- keepsamplinggeometrically (rulelength generationparameters) (createalternative generationparameters terminalsymbols nonterminalsymbols); let { nonempties :: [Alternative] = someterminals ++ nonterminals & Extra.nubOrd }; x <- bernoulli (hasemptyalternative generationparameters) >>= (\case { True -> []:nonempties; False -> nonempties} >>> return); shuffle x; }; createallrules :: Generationparameters -> Randomfu.RVar [Rule]; createallrules generationparameters = mapM (\_ -> createrule generationparameters) (allnonterminals generationparameters) >>= shuffle; showsymbol :: Symbol -> String; showsymbol (Sterminal s) = s; showsymbol (Snonterminal (Nonterminal i)) = "("++ show i ++ ")"; showrepetition :: Repetition -> String; showrepetition Once = ""; -- note, these collide with "punctuation" terminals showrepetition (Zeroorone _) = "?"; showrepetition (Kleenestar _) = "*"; showrepetition (Kleeneplus _) = "+"; showsymbolrepetition :: (Symbol,Repetition) -> String; showsymbolrepetition (x,y) = showsymbol x ++ showrepetition y; showalternative :: Alternative -> String; showalternative = concatMap showsymbolrepetition; data Outputtree = Outputterminal Terminalchar | Outputnonterminal Nonterminal [Outputtree] deriving (Show); executesymbol :: Nonterminalpenalty -> Grammarmap -> Depth -> Symbol -> Randomfu.RVar Outputtree; executesymbol _ _ _ (Sterminal t) = return $ Outputterminal t; executesymbol nonterminalpenalty grammar depth (Snonterminal n) = -- trace ("executesymbol "++show (n,depth)) $ grammar Map.! n & executerule nonterminalpenalty grammar depth >>= (\x -> return $ Outputnonterminal n x); -- (\x -> return $ case n of {Nonterminal i -> (show i ++"("): x ++ [")"]}); executerepetition :: Nonterminalpenalty -> Grammarmap -> Depth -> (Symbol,Repetition) -> Randomfu.RVar [Outputtree]; executerepetition nonterminalpenalty grammar depth (s,repetition) = let { do1 :: Randomfu.RVar Outputtree; do1 = executesymbol nonterminalpenalty grammar depth s; } in case repetition of { Once -> do1 >>= (return >>> return); Zeroorone p -> bernoulli p >>= \case { True -> do1 >>= (return >>> return); False -> return []; }; Kleenestar p -> keepsamplinggeometrically p do1; Kleeneplus p -> keepsamplingwithminimum 1 p do1; }; executealternative :: Nonterminalpenalty -> Grammarmap -> Depth -> Alternative -> Randomfu.RVar [Outputtree]; executealternative nonterminalpenalty grammar depth = -- (\(alt :: Alternative) -> trace ("executealternative " ++ show alt) alt) >>> (Extra.concatMapM $ executerepetition nonterminalpenalty grammar depth); executerule :: Nonterminalpenalty -> Grammarmap -> Depth -> Rule -> Randomfu.RVar [Outputtree]; executerule nonterminalpenalty grammar depth alternatives = --trace ("executerule " ++ show theweights) $ weightedCategorical theweights >>= executealternative nonterminalpenalty grammar (incrementdepth depth) where { theweights :: [(Weight,Alternative)]; theweights = zip zipfweights alternatives & map (applyterminalornonterminalpenalty nonterminalpenalty depth); }; applyterminalornonterminalpenalty :: Nonterminalpenalty -> Depth -> (Rational, Alternative) -> (Weight, Alternative); applyterminalornonterminalpenalty nonterminalpenalty depth (w,alternative) = (if isterminalalternative alternative then fromRational w else applynonterminalpenality nonterminalpenalty depth (fromRational w), alternative); -- this requires tuning. use some superexponential function. applynonterminalpenality :: Nonterminalpenalty -> Depth -> Weight -> Weight; applynonterminalpenality nonterminalpenalty (Depth depth) w = if depth > 200 then 0 else if depth < donttouch then w else w/(depth - donttouch + 1 & fromInteger & superexponential nonterminalpenalty); -- subtract 1 from depth because startrule contributes one depth -- don't trim trees less than this tall. donttouch :: Ii; donttouch = 3; newtype Nonterminalpenalty = Nonterminalpenalty Weight deriving (Show, Eq); superexponential :: Nonterminalpenalty -> Weight -> Weight; superexponential (Nonterminalpenalty nonterminalpenalty) n = if n < 1 then error "superexponential" else exp(n*log n*nonterminalpenalty); showrule :: Rule -> String; showrule = map (showalternative >>> (\x -> " "++x)) >>> unlines; showgrammar :: Grammarlist -> String; showgrammar = snd >>> map (uncurry r1) >>> unlines where { r1 :: Nonterminal -> Rule -> String; r1 (Nonterminal i) r = show i ++ "\n" ++ showrule r; }; compressoutput :: Outputtree -> [Terminalchar]; compressoutput (Outputterminal c) = [c]; compressoutput (Outputnonterminal _ t) = concatMap compressoutput t; compressoutputstring :: Outputtree -> String; -- this only works if Terminalchar == String compressoutputstring = compressoutput >>> concat; startrule :: Generationparameters -> Rule; startrule = allnonterminals >>> map (\n -> [(Snonterminal n,Once)]); -- start rule, and grammar in the form of a list type Grammarlist = (Rule,[(Nonterminal,Rule)]); grammargenerator :: String -> Grammarlist; grammargenerator seed1 = let { generationparameters = mygenerationparameters1 } in createallrules generationparameters & samplervar (stringtf seed1) & zip (allnonterminals generationparameters) & \x -> (startrule generationparameters,x); treetf :: String -> Grammarlist -> [Outputtree]; treetf seed2 rules = -- stringtf seed2 & manygenerators & manysample (executerule (creategrammar rules) globalnonterminalpenalty startrule); --executerule (creategrammar rules) globalnonterminalpenalty startrule & manysample2 (stringtf seed2 & manygenerators) --executerule (creategrammar rules) globalnonterminalpenalty startrule & (\r -> map (\g -> samplervar g r) (stringtf seed2 & manygenerators)); --map (\g -> samplervar g (executerule (creategrammar rules) globalnonterminalpenalty startrule)) (stringtf seed2 & manygenerators); --(executerule (creategrammar rules) globalnonterminalpenalty startrule & manysample) (stringtf seed2 & manygenerators); (executerule globalnonterminalpenalty (Map.fromList $ snd rules) (Depth 0) (fst rules) >>= (onlyhead >>> return)) & flip manysample (stringtf seed2 & manygenerators) ; --executerule (creategrammar rules) globalnonterminalpenalty startrule & (\r -> stringtf seed2 & manygenerators & map (\g -> samplervar g r) ); -- onlyhead startrule because startrule is a bunch of singleton nonterminal onlyhead :: [a] -> a; onlyhead [x] = x; onlyhead _ = error "onlyhead was not singleton list"; manycompress :: [Outputtree] -> String; manycompress = map compressoutputstring >>> concat; manygenerators :: (Random.RandomGen gen) => gen -> [gen]; manygenerators = List.unfoldr (Random.split >>> Just); manysample :: forall a gentype . (Random.RandomGen gentype) => Randomfu.RVar a -> [gentype] -> [a]; manysample r = map (\g -> samplervar g r); myuniform :: Randomfu.RVar Double; myuniform = Randomfu.stdUniform; -- could also be written repeat >>> seq manyuniform :: Randomfu.RVar [Double]; manyuniform = do { x <- myuniform; y <- manyuniform; return $ x:y; }; {- this also hangs manyuniform2 :: Randomfu.RVar [Double]; manyuniform2 = repeat myuniform & sequence; -} -- this hangs, both with Control.Monad.State.Strict and Lazy runmanyuniform :: [Double]; runmanyuniform = samplervar (mkstdgen2 10) manyuniform; letters :: [Terminalchar]; letters = enumFromTo 'a' 'z' & map return; -- makes it easy to compare entropy, completely random would 4 bits per byte hexsubstitute :: [Terminalchar]; hexsubstitute = enumFrom 'a' & take 16 & map return; hexdigits :: [Terminalchar]; hexdigits = ['0'..'9']++['a'..'f'] & map return; cvsyllables :: [Terminalchar]; cvsyllables = do { c<-"fklmnprstwyvgbzd"; v <- "aeiou"; return [c,v]; }; cvnsyllables :: [Terminalchar]; cvnsyllables = do { s <- cvsyllables; n <- ["","n "]; return $ s++n }; punctuation :: [Terminalchar]; -- QR code punctuation, because why not. punctuation = map (\x -> x:" ") "$%*+-./:"; -- note: plus and star are also repetition characters, so there will be ambiguity -------------------------------- calcdepths :: Outputtree -> [Depth]; calcdepths (Outputterminal _) = [Depth 0]; calcdepths (Outputnonterminal _ ts) = Depth 0:(concatMap calcdepths ts & map incrementdepth); -- todo: height :: Outputtree -> Depth; height = calcdepths >>> \case { [] -> Depth 0; x -> maximum x; }; collate :: (Ord a, Eq a) => [a] -> [(a,Ii)]; collate = List.sort >>> List.group >>> map (\x -> (head x,List.genericLength x)); newtype Depth = Depth Ii deriving (Show, Ord, Eq); incrementdepth :: Depth -> Depth; incrementdepth (Depth x) = Depth $ x+1; -- find the probability so that expected value (or median?) is x geometricexpected :: Weight -> Weight; geometricexpected x = 0.5**(1/x); } --end