#!perl -w

# Takes the sql dump file and separates each cable into a separate
# file, organized into directories with 1000 cables per directory.

# Copyright 2015 Ken Takusagawa

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Statistics after reformatting and separating:

# uncompressed
# $ du -sk .
# 2203664 .

# bzip2 is best
# $ find . -type f -print0 | time xargs -0 -n 20 -P 8 bzip2
# 426.66user 78.17system 1:22.18elapsed 614%CPU (0avgtext+0avgdata 5348maxresident)k
# 2796816inputs+3196312outputs (5major+8969243minor)pagefaults 0swaps
# $ du -sk .
# 1239028 .

# gzip -9 is only slightly worse
# 1279976 .

# xz
# 1262748 .
# The original was
# 368704 cable_db_full.7z

use strict;
use warnings;

# Every cable in the dump starts with this text at the beginning of a line.
my $INSERT_PREFIX = qr/^INSERT INTO cable \(id, date, refid, classification, origin, destination, header, content\) VALUES /;

# One single-quoted SQL string literal; an embedded quote is written as ''.
# The alternation is non-capturing so each literal yields exactly one capture.
my $QUOTED = qr/'((?:[^']|'')*?)'/;

my $count = 0;    # number of cable statements processed
my $bytes = 0;    # total length of every chunk handed to process()
my %seen_id;      # cable ids already written, to detect duplicates

# Accumulate input lines into chunks.  A chunk ends when the next INSERT
# line is seen, because header and content values span multiple lines.
my $pending;
while (my $line = <>) {
    if ($line =~ $INSERT_PREFIX) {
        # Nothing has been accumulated yet when the very first input line
        # is already an INSERT; there is no previous chunk to flush then.
        process($pending) if defined $pending;
        $pending = $line;
    }
    else {
        $pending .= $line;
    }
}

# Flush the final chunk (skipped entirely for empty input).
process($pending) if defined $pending;
print STDERR "count $count bytes $bytes\n";
end();

# process($chunk)
#
# Handles one chunk of the dump: either the dump preamble (counted in
# $bytes, otherwise ignored) or exactly one INSERT statement, which is
# split into its 8 column values and passed to do_work().  Dies when the
# chunk is not a well-formed cable INSERT, or when refid, classification,
# origin or destination contains anything but printable ASCII.
sub process {
    my ($chunk) = @_;
    $bytes += length $chunk;

    # The text before the first INSERT is the dump preamble, not a cable.
    return if $chunk =~ /^--\n-- PostgreSQL database dump\n--/s;
    $count++;

    # The last chunk also carries the dump trailer, hence the alternative
    # after the closing ");\n".
    my @fields = $chunk =~ /${INSERT_PREFIX}\((\d+), $QUOTED, $QUOTED, $QUOTED, $QUOTED, $QUOTED, $QUOTED, '(.*)'\);\n(?:$|.*-- PostgreSQL database dump complete)/s
        or die "chunk $count is not a well-formed cable INSERT statement";
    die 'expected 8 fields, got ' . scalar(@fields) unless @fields == 8;

    #for(0..$#fields){ print "item $_ $fields[$_]\n"; } print "\n";

    # Fields 2..5 are refid, classification, origin and destination.
    for my $i (2 .. 5) {
        die "field $i of cable $fields[0] is not single-line printable ASCII: $fields[$i]"
            unless single_line($fields[$i]);
    }
    return do_work(@fields);
}

# single_line($text)
#
# True when $text consists solely of printable ASCII (space through tilde).
sub single_line {
    my ($text) = @_;
    return $text =~ /^[ -~]*$/;
}

# do_work(id, date, refid, classification, origin, destination, header, content)
#
# Writes one cable to the file DDD/FFF, where DDDFFF is the id zero-padded
# to six digits, creating directory DDD on first use.  SQL escaping ('' and
# doubled backslashes) is undone before writing.  Dies on a duplicate or
# out-of-range id, on a pre-existing output file, and on any I/O failure.
sub do_work {
    die 'do_work expects exactly 8 fields' unless @_ == 8;

    # Work on a copy: @_ aliases the caller's variables, and the values are
    # modified below.
    my @field = @_;
    my $id    = $field[0];

    die "duplicate cable id $id" if $seen_id{$id}++;
    # Six digits split 3/3 gives at most 1000 cables per directory.
    die "cable id $id is out of range" if $id >= 1000000 || $id < 0;

    my ($dir, $file) = sprintf('%06d', $id) =~ /(...)(...)/
        or die "cannot derive an output path from cable id $id";

    if (-e $dir) {
        die "$dir exists but is not a directory" unless -d $dir;
    }
    else {
        print STDERR "$dir\n";
        mkdir $dir or die "mkdir $dir: $!";
    }

    my $path = "$dir/$file";
    die "$path already exists" if -e $path;
    open my $out, '>', $path or die "open $path: $!";

    for my $value (@field) {
        $value =~ s/''/'/g;
        $value =~ s/\\\\/\\/g;
    }

    # NOTE(review): the line breaks inside this template were reconstructed
    # as one field per line -- confirm against the intended output format.
    print {$out} <<"EOF";
id: $field[0]
date: $field[1]
refid: $field[2]
classification: $field[3]
origin: $field[4]
destination: $field[5]
header: $field[6]
body: $field[7]
EOF

    # Buffered write errors only surface here, so the result must be checked.
    close $out or die "close $path: $!";
    return;
}

# just_print(id, date, refid, classification, origin, destination, header, content)
#
# Prints the 8 values to STDOUT as an INSERT statement.  Not called by the
# main program.
sub just_print {
    die 'just_print expects exactly 8 fields' unless @_ == 8;
    print "INSERT INTO cable (id, date, refid, classification, origin, destination, header, content) VALUES ($_[0], '$_[1]', '$_[2]', '$_[3]', '$_[4]', '$_[5]', '";
    multiline_print($_[6]);
    print "', '";
    multiline_print($_[7]);
    print "');\n";
    return;
}

# multiline_print($text)
#
# Prints $text to STDOUT one line at a time.
sub multiline_print {
    my @lines = split /^/, $_[0];
    print for @lines;
    return;
}

# end()
#
# Called once after all input has been processed; currently does nothing.
sub end {
    return;
}