{-# LANGUAGE ScopedTypeVariables, LambdaCase #-} {- Converting between Unicode code points and 7-bit clean text using a method vaguely similar to UTF-8. Copyright 2018 Ken Takusagawa This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . -} module Main where { import System.Environment(getArgs); import Control.Exception(assert); import Debug.Trace(trace); import Data.Function((&)); import Control.Category((>>>)); import Prelude hiding((.),(>>)); --import System.IO(hSetBuffering,stdout,BufferMode(LineBuffering)); import Data.List(elemIndex,unfoldr,genericTake,genericReplicate,genericSplitAt,genericLength,foldl',genericIndex); --import Control.Monad; import Data.Maybe(fromJust); --import qualified Data.Map as Map; --import Data.Map(Map); --import qualified Data.Set as Set; --import Data.Set(Set); -- to avoid the redundancy warning trace_placeholder :: (); trace_placeholder = (trace,assert) & (id >>> error) "trace_placeholder"; main :: IO(); main = getArgs >>= \case{ ["encode"] -> getContents >>= (concatMap encode_utf6 >>> putStr); ["decode"] -> getContents >>= (decode_utf8 >>> putStr); ["ascii"] -> ascii_table & html_table & concatMap nbsp_ify & putStrLn; _ -> undefined; }; {- some modifications from the ideas of UTF-8 * abandon self-synchronization. eating up 2 bits out of 6 leaves 4, or only 16 characters. * move stuff around so its not chr(0)..chr(63). 0xxxxx 1xxxxx 0xxxxx 1xxxxx 1xxxxx 0xxxxx -} mychars :: [Char]; mychars = [' '] ++ ['a'..'z'] ++ ";,-.?" ++ ['@'..'^'] ++ ['_']; -- it's a nice feature that (;,-.?), the most common punctuation, occupy the columns they do in ascii, between z and DEL. The table is 16-wide ichars :: [Integer]; ichars = map (fromEnum>>>fromIntegral) mychars; ascii_rest :: [Integer]; --ascii_rest = ichars ++ [0..31] ++ [96] ++ [33..62] ++ [127]; -- move the numbers to match space,a..i not p..y. ascii_rest = ichars ++ [0..31] ++ [48..58]++[123] ++ [60..62] ++ [127] ++ [96] ++ [33..43] ++ [124..126]++[47]; -- (ascii_rest & Data.List.sort) == [0..127] -- (ascii_rest & map (`mod` 16)) == (replicate 8 [0..15] & concat) transform_to_myint :: Char -> Integer; transform_to_myint c = let {x :: Int; x = fromEnum c} in if x>127 then fromIntegral x else (elemIndex (fromIntegral x) ascii_rest & fromJust & fromIntegral); -- genericElemIndex does not exist byte_size :: Integer; byte_size = 6; -- |answer comes out in reverse (little-endian) to_binary :: Integer -> [Bool]; to_binary = unfoldr (\n -> if n==0 then Nothing else Just (case divMod n 2 of {(q,r) -> (r==1,q)})); -- |relies on things being in reverse pad_to :: Integer -> [Bool] -> [Bool]; pad_to n x = assert (genericLength x <= n) (x ++ repeat False & genericTake n); add_prefix_to_continuation_bytes :: [Bool] -> [Bool]; add_prefix_to_continuation_bytes = (True:); n_chunk_evenly :: Integer -> [a] -> [[a]]; n_chunk_evenly n = unfoldr (\l -> if null l then Nothing else Just $ part_chunk_evenly n l); part_chunk_evenly :: Integer -> [a] -> ([a],[a]); part_chunk_evenly n l = assert (n <= genericLength l) $ genericSplitAt n l; -- expects big-endian from_binary :: [Bool] -> Integer; from_binary = foldl' (\accum b -> 2*accum+(if b then 1 else 0)) 0; --accum_func :: Integer -> Bool -> Integer; --accum_func accum b = 2*accum+(if b then 1 else 0); int_to_mychar :: Integer -> Char; int_to_mychar = genericIndex mychars; -- assume the bits are in reverse split_and_prefix :: [Bool] -> [[Bool]]; split_and_prefix = n_chunk_evenly (byte_size -1) >>> map reverse >>> prefix_pieces; prefix_pieces :: [[Bool]] -> [[Bool]]; prefix_pieces (x:rest) = (False:x):map add_prefix_to_continuation_bytes rest & reverse; prefix_pieces [] = error "prefix_pieces"; -- argh fine granularity quelling of the incomplete-pattern warning is not possible yet -- getting the code position in the error message would be great -- https://ghc.haskell.org/trac/ghc/ticket/5273 -- GHC.Stack.errorWithStackTrace pad_to_multiple :: [Bool] -> [Bool]; pad_to_multiple [] = genericReplicate (byte_size-1) False; pad_to_multiple l = let { len :: Integer; len = genericLength l; r = mod (negate len) (byte_size -1); -- relies on negative mod working mathematically correctly } in l ++ genericReplicate r False; encode_utf6 :: Char -> [Char]; encode_utf6 = transform_to_myint >>> to_binary >>> pad_to_multiple >>> split_and_prefix >>> map from_binary >>> map int_to_mychar; ------- decoder type Bstring=[Bool]; myspan :: [Bstring] -> ([Bstring],[Bstring]); myspan = span head; -- takeWhile (head == True) shift_one_over :: ([a],[a]) -> ([a],[a]); shift_one_over (x,(h:rest)) = ((x++[h]),rest); shift_one_over (_,[]) = error "shift_one_over"; get_chunks :: [Bstring] -> [[Bstring]]; get_chunks = unfoldr (\x -> if null x then Nothing else myspan x & shift_one_over & Just); chunks_concat :: [Bstring] -> Bstring; chunks_concat = (\s -> if overlong==head s then error "overlong" else s) >>> map tail >>> concat; overlong :: Bstring; overlong = True:repeat False & genericTake byte_size; untransform_myint :: Integer -> Char; untransform_myint i = (if i>127 then i else genericIndex ascii_rest i) & fromInteger & toEnum; mychar_value :: Char -> Integer; mychar_value c = elemIndex c mychars & fromJust & fromIntegral; decode_utf8 :: String -> String; decode_utf8 = map (mychar_value >>> to_binary >>> pad_to byte_size >>> reverse) >>> get_chunks >>> map ( chunks_concat >>> from_binary >>> untransform_myint) ; ------ ascii table ascii_table :: [[String]]; ascii_table = [0..127] & map toEnum & map encode_utf6 & n_chunk_evenly 16; surround :: String -> String -> String; surround element text = "<" ++ element ++ ">" ++ text ++ ""; html_table :: [[String]] -> String; html_table = concatMap (concatMap (surround "td") >>> (surround "tr")) >>> surround "table"; nbsp_ify :: Char -> String; nbsp_ify ' ' = " "; nbsp_ify x = [x]; -- and (map (\x->x== test_id x) [0..1000000]) test_id :: Integer -> Integer; test_id = fromInteger >>> toEnum >>> encode_utf6 >>> decode_utf8 >>> map (fromEnum >>> fromIntegral) >>> unsingleton; unsingleton :: [a] -> a; unsingleton [x] = x; unsingleton _ = undefined; --- uppercase are @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_ -- abcdefghijklmnopqrstuvwxyz;,-.? -- @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_ } --end