#!perl -w # Make the SQL dump 7-bit clean (pure ASCII, no unicode). #all input must be in bytes, not utf-8 (unicode). Not all the #non-ascii data is utf-8; there appear to be other encodings. # Copyright 2015 Ken Takusagawa # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program. If not, see . &head(); while(<>){ if(/^INSERT INTO cable \(id, date, refid, classification, origin, destination, header, content\) VALUES /){ &process($l); $l=$_; } else { $l.=$_; } } &process($l); &tail(); print STDERR "count $count bytes $bytes\n"; #&end_doubles(); sub process { my $s=shift; $bytes+= length$s; return if ($s =~ /^--\n-- PostgreSQL database dump\n--/s); $count++; my @m; die unless (@m = $s =~ /^INSERT INTO cable \(id, date, refid, classification, origin, destination, header, content\) VALUES \((\d+), '(([^']|'')*?)', '(([^']|'')*?)', '(([^']|'')*?)', '(([^']|'')*?)', '(([^']|'')*?)', '(([^']|'')*?)', '(.*)'\);\n(?:$|.*-- PostgreSQL database dump complete)/s); my @d; push @d,shift@m; while(@m){ push @d,shift@m; shift@m; }; die unless @d==8; #for(0..$#d){ print "item $_ $d[$_]\n"; } print "\n"; for$i(2..5){ die $d[$i] unless &single_line($d[$i]); } &do_work(@d); }; sub single_line { my $s=shift; $s =~/^[ -~]*$/; } sub multiline1 { my $s=shift; $s =~/^([ -~]|\x0a)*(.?|$)/; if($2 eq '') { 1; } else { print "fail $2\n"; 0; } } sub head { print << 'EOF'; -- -- PostgreSQL database dump -- SET statement_timeout = 0; SET client_encoding = 'SQL_ASCII'; SET standard_conforming_strings = off; SET check_function_bodies = false; SET client_min_messages = warning; SET escape_string_warning = off; SET search_path = public, pg_catalog; SET default_tablespace = ''; SET default_with_oids = false; -- -- Name: cable; Type: TABLE; Schema: public; Owner: -; Tablespace: -- CREATE TABLE cable ( id integer NOT NULL, date timestamp without time zone, refid character varying, classification character varying, origin character varying, destination text, header text, content text ); -- -- Name: cable_id_seq; Type: SEQUENCE; Schema: public; Owner: - -- CREATE SEQUENCE cable_id_seq START WITH 1 INCREMENT BY 1 NO MINVALUE NO MAXVALUE CACHE 1; -- -- Name: cable_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: - -- ALTER SEQUENCE cable_id_seq OWNED BY cable.id; -- -- Name: cable_id_seq; Type: SEQUENCE SET; Schema: public; Owner: - -- SELECT pg_catalog.setval('cable_id_seq', 251287, true); -- -- Name: id; Type: DEFAULT; Schema: public; Owner: - -- ALTER TABLE cable ALTER COLUMN id SET DEFAULT nextval('cable_id_seq'::regclass); -- -- Data for Name: cable; Type: TABLE DATA; Schema: public; Owner: - -- EOF } sub tail { print << 'EOF'; -- -- Name: cable_pkey; Type: CONSTRAINT; Schema: public; Owner: -; Tablespace: -- ALTER TABLE ONLY cable ADD CONSTRAINT cable_pkey PRIMARY KEY (id); -- -- PostgreSQL database dump complete -- EOF } sub do_work { die unless @_==8; #&find_doubles(@_); &transform(@_); &just_print(@_); } sub find_doubles { shift @_; #ignore integer count for(@_){ &nodoubles($_); } } sub transform { #header $_[6]=&escape_everything(&sanitize_header($_[0],$_[6])); $_[7]=&escape_everything(&sanitize_body($_[0],$_[7])); } sub find_nonascii { non_ascii($_[0],$_[7]); #&just_print(@_); } sub just_print { print "INSERT INTO cable (id, date, refid, classification, origin, destination, header, content) VALUES ($_[0], '$_[1]', '$_[2]', '$_[3]', '$_[4]', '$_[5]', '"; multiline_print($_[6]); print "', '"; multiline_print($_[7]); print "');\n"; } sub multiline_print { my @F=split /^/,$_[0]; print for (@F); } sub non_ascii { $_[1]=&sanitize_body(@_); my @F=split /^/,$_[1]; for(@F){ next if /^[ -~\n]*$/s; die "<<$_>>\n" unless /([^ -~])/; my $a=$1; printf('%02x %d',ord($a),$_[0]); #print "\n"; print " $_"; } } sub sanitize_header{ for($_[1]){ s/\x09/ /g; #tab s/\xc2\xa0/ /g; #non breaking space # the above two are sufficient for the "header", everything else looks like true garbage } $_[1]; } sub sanitize_body { for($_[1]){ s/\x09/ /g; #tab s/\xc2\xa0/ /g; #non breaking space # the above two are sufficient for the "header", everything else looks like true garbage s/\xe2\x80\x98/''/g; #following the SQL quoting of single quote as double single quote s/\xe2\x80\x99/''/g; s/\xe2\x80\x9c/"/g; s/\xe2\x80\x9d/"/g; s/\xc2\xb4/''/g; s/\x0d\x0a/\x0a/g; s/\xe2\x80\x90/-/g; s/\xe2\x80\x93/--/g; s/\xe2\x80\x94/--/g; s/\xe2\x80\xa6/.../g; s/\xe2\x80\xa8/\n/g; s/\xc3\x97/x/g; # 4x4 auto maker s/\xc2\xb6/SECTION /g if($_[0] == 60); s/\x07/''/g if ($_[0] == 836 or $_[0] == 896 or $_[0] == 984); s/\xc2\xa5/''/g unless $_[0] == 238502; } $_[1]; } #this is pretty slow, 11 minutes sub escape_everything { my @F=unpack("C*",$_[0]); my$out=""; for(@F){ if ($_ == 0x0a or (0x20 <= $_ and $_ < 0x7f)) { $out .= chr$_; }else { $out .= sprintf("\\\\x%02X ",$_); # Use a space after the hex code to wrap long lines later. # Two backslashes to follow the postgresql dump syntax. } } $out; } #investigating the postgresql dumper about which characters never #appear in isolation in a string #22 minutes sub nodoubles { my @f=split //,$_[0]; for (1..($#f - 1)){ if ($f[$_] ne $f[$_-1] and $f[$_] ne $f[$_+1]) { $single{$f[$_]}++; } } } # and the answers are ' and \ sub end_doubles { for(0..255){ if($single{chr$_}){ print $single{chr$_}; } else { print "0"; } print " $_\n"; } }