#!perl -w

# Make the SQL dump 7-bit clean (pure ASCII, no unicode).

#all input must be in bytes, not utf-8 (unicode).  Not all the
#non-ascii data is utf-8; there appear to be other encodings.

# Copyright 2015 Ken Takusagawa

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

&head();
while(<>){
    if(/^INSERT INTO cable \(id, date, refid, classification, origin, destination, header, content\) VALUES /){
        &process($l);
        $l=$_;
    } else {
        $l.=$_;
    }
}
&process($l);
&tail();
print STDERR "count $count bytes $bytes\n";
#&end_doubles();

sub process {
    my $s=shift;
    $bytes+= length$s;
    return if ($s =~ /^--\n-- PostgreSQL database dump\n--/s);
    $count++;
    my @m;
    die unless (@m = $s =~ /^INSERT INTO cable \(id, date, refid, classification, origin, destination, header, content\) VALUES \((\d+), '(([^']|'')*?)', '(([^']|'')*?)', '(([^']|'')*?)', '(([^']|'')*?)', '(([^']|'')*?)', '(([^']|'')*?)', '(.*)'\);\n(?:$|.*-- PostgreSQL database dump complete)/s);
    my @d;
    push @d,shift@m;
    while(@m){
        push @d,shift@m;
        shift@m;
    };
    die unless @d==8;
    #for(0..$#d){        print "item $_ $d[$_]\n";    } print "\n";
    for$i(2..5){
        die $d[$i] unless &single_line($d[$i]);
    }
    &do_work(@d);
};

sub single_line {
    my $s=shift;
    $s =~/^[ -~]*$/;
}
sub multiline1 {
    my $s=shift;
    $s =~/^([ -~]|\x0a)*(.?|$)/;
    if($2 eq '') {
        1;
    } else {
        print "fail $2\n";
        0;
    }
}

sub head {
    print << 'EOF';
--
-- PostgreSQL database dump
--

SET statement_timeout = 0;
SET client_encoding = 'SQL_ASCII';
SET standard_conforming_strings = off;
SET check_function_bodies = false;
SET client_min_messages = warning;
SET escape_string_warning = off;

SET search_path = public, pg_catalog;

SET default_tablespace = '';

SET default_with_oids = false;

--
-- Name: cable; Type: TABLE; Schema: public; Owner: -; Tablespace: 
--

CREATE TABLE cable (
    id integer NOT NULL,
    date timestamp without time zone,
    refid character varying,
    classification character varying,
    origin character varying,
    destination text,
    header text,
    content text
);


--
-- Name: cable_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

CREATE SEQUENCE cable_id_seq
    START WITH 1
    INCREMENT BY 1
    NO MINVALUE
    NO MAXVALUE
    CACHE 1;


--
-- Name: cable_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--

ALTER SEQUENCE cable_id_seq OWNED BY cable.id;


--
-- Name: cable_id_seq; Type: SEQUENCE SET; Schema: public; Owner: -
--

SELECT pg_catalog.setval('cable_id_seq', 251287, true);


--
-- Name: id; Type: DEFAULT; Schema: public; Owner: -
--

ALTER TABLE cable ALTER COLUMN id SET DEFAULT nextval('cable_id_seq'::regclass);


--
-- Data for Name: cable; Type: TABLE DATA; Schema: public; Owner: -
--

EOF
}

sub tail {
    print << 'EOF';


--
-- Name: cable_pkey; Type: CONSTRAINT; Schema: public; Owner: -; Tablespace: 
--

ALTER TABLE ONLY cable
    ADD CONSTRAINT cable_pkey PRIMARY KEY (id);


--
-- PostgreSQL database dump complete
--

EOF
}

sub do_work {
    die unless @_==8;
    #&find_doubles(@_);
    &transform(@_);
    &just_print(@_);
}

sub find_doubles {
    shift @_;  #ignore integer count
    for(@_){
        &nodoubles($_);
    }
}

sub transform {
    #header
    $_[6]=&escape_everything(&sanitize_header($_[0],$_[6]));
    $_[7]=&escape_everything(&sanitize_body($_[0],$_[7]));
}

sub find_nonascii {
    non_ascii($_[0],$_[7]);
    #&just_print(@_);
}

sub just_print {
    print "INSERT INTO cable (id, date, refid, classification, origin, destination, header, content) VALUES ($_[0], '$_[1]', '$_[2]', '$_[3]', '$_[4]', '$_[5]', '";
    multiline_print($_[6]);
    print "', '";
    multiline_print($_[7]);
    print "');\n";
}

sub multiline_print {
    my @F=split /^/,$_[0];
    print for (@F);
}

sub non_ascii {
    $_[1]=&sanitize_body(@_);
    my @F=split /^/,$_[1];
    for(@F){
        next if /^[ -~\n]*$/s;
        die "<<$_>>\n" unless /([^ -~])/;
        my $a=$1;
        printf('%02x %d',ord($a),$_[0]);
        #print "\n";
        print " $_";
    }
}

sub sanitize_header{
    for($_[1]){
        s/\x09/ /g; #tab
        s/\xc2\xa0/ /g; #non breaking space
        # the above two are sufficient for the "header", everything else looks like true garbage
    }
    $_[1];
}

sub sanitize_body {
    for($_[1]){
        s/\x09/ /g; #tab
        s/\xc2\xa0/ /g; #non breaking space
        # the above two are sufficient for the "header", everything else looks like true garbage
        s/\xe2\x80\x98/''/g;  #following the SQL quoting of single quote as double single quote
        s/\xe2\x80\x99/''/g;
        s/\xe2\x80\x9c/"/g;
        s/\xe2\x80\x9d/"/g;
        s/\xc2\xb4/''/g;
        s/\x0d\x0a/\x0a/g;
        s/\xe2\x80\x90/-/g;
        s/\xe2\x80\x93/--/g;
        s/\xe2\x80\x94/--/g;
        s/\xe2\x80\xa6/.../g;
        s/\xe2\x80\xa8/\n/g;
        s/\xc3\x97/x/g;  # 4x4 auto maker
        s/\xc2\xb6/SECTION /g if($_[0] == 60);
        s/\x07/''/g if ($_[0] == 836 or $_[0] == 896 or $_[0] == 984);
        s/\xc2\xa5/''/g unless $_[0] == 238502;

    }
    $_[1];
}

#this is pretty slow, 11 minutes
sub escape_everything {
    my @F=unpack("C*",$_[0]);
    my$out="";
    for(@F){
        if ($_ == 0x0a or (0x20 <= $_ and $_ < 0x7f)) {
            $out .= chr$_;
        }else {
            $out .= sprintf("\\\\x%02X ",$_);
            # Use a space after the hex code to wrap long lines later.
            # Two backslashes to follow the postgresql dump syntax.
        }
    }
    $out;
}

#investigating the postgresql dumper about which characters never
#appear in isolation in a string
#22 minutes
sub nodoubles {
    my @f=split //,$_[0];
    for (1..($#f - 1)){
        if ($f[$_] ne $f[$_-1] and $f[$_] ne $f[$_+1]) {
            $single{$f[$_]}++;
        }
    }
}
# and the answers are ' and \
sub end_doubles {
    for(0..255){
        if($single{chr$_}){
            print $single{chr$_};
        } else {
            print "0";
        }
        print " $_\n";
    }
}