diff options
author | Arne Georg Gleditsch <argggh@lxr.linpro.no> | 2007-07-05 00:51:08 +0200 |
---|---|---|
committer | Arne Georg Gleditsch <argggh@lxr.linpro.no> | 2007-07-05 00:51:08 +0200 |
commit | e9fa4c98bb5f084739d3418ade3f0c51e34a0aa1 (patch) | |
tree | fec1d635625e031cde7cba1b0a1d95ee92ac760b /lib/LXRng/Index |
Rebase tree.
Diffstat (limited to 'lib/LXRng/Index')
-rw-r--r-- | lib/LXRng/Index/DBI.pm | 430 | ||||
-rw-r--r-- | lib/LXRng/Index/Generic.pm | 172 | ||||
-rw-r--r-- | lib/LXRng/Index/Pg.pm | 417 | ||||
-rw-r--r-- | lib/LXRng/Index/PgBatch.pm | 217 |
4 files changed, 1236 insertions, 0 deletions
diff --git a/lib/LXRng/Index/DBI.pm b/lib/LXRng/Index/DBI.pm new file mode 100644 index 0000000..602eac8 --- /dev/null +++ b/lib/LXRng/Index/DBI.pm @@ -0,0 +1,430 @@ +package LXRng::Index::DBI; + +use strict; +use DBI; + +use base qw(LXRng::Index::Generic); + +sub transaction { + my ($self, $code) = @_; + if ($self->dbh->{AutoCommit}) { + $self->dbh->{AutoCommit} = 0; + $code->(); + $self->dbh->{AutoCommit} = 1; + } + else { + # If we're in a transaction already, don't return to + # AutoCommit state. + $code->(); + } + $self->dbh->commit(); +} + +sub _to_task { + my ($self, $rfile_id, $task) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_to_task_ins'} ||= + $dbh->prepare(qq{insert into ${pre}filestatus(id_rfile) + select ? where not exists + (select 1 from ${pre}filestatus + where id_rfile = ?)}); + $sth->execute($rfile_id, $rfile_id); + + $sth = $$self{'sth'}{'_to_task_upd'}{$task} ||= + $dbh->prepare(qq{update ${pre}filestatus set $task = 't' + where $task = 'f' and id_rfile = ?}); + return $sth->execute($rfile_id) > 0; +} + + +sub to_index { + my ($self, $rfile_id) = @_; + + return $self->_to_task($rfile_id, 'indexed'); +} + +sub to_reference { + my ($self, $rfile_id) = @_; + + return $self->_to_task($rfile_id, 'referenced'); +} + +sub to_hash { + my ($self, $rfile_id) = @_; + + return $self->_to_task($rfile_id, 'hashed'); +} + +sub _get_tree { + my ($self, $tree) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_get_tree'} ||= + $dbh->prepare(qq{select id from ${pre}trees where name = ?}); + my $id; + if ($sth->execute($tree) > 0) { + ($id) = $sth->fetchrow_array(); + } + $sth->finish(); + + return $id; +} + +sub _get_release { + my ($self, $tree_id, $release) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_get_release'} ||= + $dbh->prepare(qq{select id from ${pre}releases + where id_tree = ? and release_tag = ?}); + my $id; + if ($sth->execute($tree_id, $release) > 0) { + ($id) = $sth->fetchrow_array(); + } + $sth->finish(); + + return $id; +} + +sub _get_file { + my ($self, $path) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_get_file'} ||= + $dbh->prepare(qq{select id from ${pre}files where path = ?}); + my $id; + if ($sth->execute($path) > 0) { + ($id) = $sth->fetchrow_array(); + } + $sth->finish(); + + return $id; +} + +sub _get_rfile_by_release { + my ($self, $rel_id, $path) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_get_rfile_by_release'} ||= + $dbh->prepare(qq{select r.id + from ${pre}filereleases fr, ${pre}files f, + ${pre}revisions r + where fr.id_rfile = r.id and r.id_file = f.id + and fr.id_release = ? and f.path = ?}); + + my $id; + if ($sth->execute($rel_id, $path) > 0) { + ($id) = $sth->fetchrow_array(); + } + $sth->finish(); + + return $id; +} + +sub _get_symbol { + my ($self, $symbol) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_get_symbol'} ||= + $dbh->prepare(qq{select id from ${pre}symbols where name = ?}); + my $id; + if ($sth->execute($symbol) > 0) { + ($id) = $sth->fetchrow_array(); + } + $sth->finish(); + + return $id; +} + + +sub _add_include { + my ($self, $file_id, $inc_id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_add_include'} ||= + $dbh->prepare(qq{insert into ${pre}includes(id_rfile, id_include_path) + values (?, ?)}); + my $id; + $sth->execute($file_id, $inc_id); + + return 1; +} + +sub _includes_by_id { + my ($self, $file_id) = @_; + +} + +sub _symbol_by_id { + my ($self, $id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_symbol_by_id'} ||= + $dbh->prepare(qq{select * from ${pre}symbols + where id = ?}); + my @res; + if ($sth->execute($id) > 0) { + @res = $sth->fetchrow_array(); + } + $sth->finish(); + + return @res; +} + +sub _identifiers_by_name { + my ($self, $rel_id, $symbol) = @_; + + my $sym_id = $self->_get_symbol($symbol); + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_identifiers_by_name'} ||= + $dbh->prepare(qq{ + select i.id, i.type, f.path, i.line, s.name, c.type, c.id, + i.id_rfile + from ${pre}identifiers i + left outer join ${pre}identifiers c on i.context = c.id + left outer join ${pre}symbols s on c.id_symbol = s.id, + ${pre}files f, ${pre}filereleases r, ${pre}revisions v + where i.id_rfile = v.id and v.id = r.id_rfile + and r.id_release = ? and v.id_file = f.id + and i.id_symbol = ?}); + + $sth->execute($rel_id, $sym_id); + my $res = $sth->fetchall_arrayref(); + + use Data::Dumper; + foreach my $def (@$res) { +# warn Dumper($def); + $$def[7] = 42; +# my @files = $self->get_referring_files($rel_id, $$def[7]); +# warn Dumper(\@files); + } + + return $res; +} + +sub _symbols_by_file { + my ($self, $rfile_id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_symbols_by_file'} ||= + $dbh->prepare(qq{select distinct s.name + from ${pre}usage u, ${pre}symbols s + where id_rfile = ? and u.id_symbol = s.id}); + $sth->execute($rfile_id); + my %res; + while (my ($symname) = $sth->fetchrow_array()) { + $res{$symname} = 1; + } + + return \%res; +} + +sub _add_usage { + my ($self, $file_id, $line, $symbol_id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_add_usage'} ||= + $dbh->prepare(qq{insert into ${pre}usage(id_rfile, line, id_symbol) + values (?, ?, ?)}); + $sth->execute($file_id, $line, $symbol_id); + + return 1; +} + +sub _usage_by_file { + my ($self, $rfile_id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_usage_by_file'} ||= + $dbh->prepare(qq{select s.name, u.line + from ${pre}usage u, ${pre}symbols s + where id_rfile = ? and u.id_symbol = s.id}); + $sth->execute($rfile_id); + + die "Unimplemented"; +} + +sub _rfile_path_by_id { + my ($self, $rfile_id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_rfile_path_by_id'} ||= + $dbh->prepare(qq{select f.path from ${pre}files f, ${pre}revisions r + where f.id = r.id_file and r.id = ?}); + my $path; + if ($sth->execute($rfile_id) > 0) { + ($path) = $sth->fetchrow_array(); + } + $sth->finish(); + + return $path; +} + +sub _get_includes_by_file { + my ($self, $res, $rel_id, @rfile_ids) = @_; + + my $placeholders = join(', ', ('?') x @rfile_ids); + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $dbh->prepare(qq{select rf.id, f.path + from ${pre}revisions rf, + ${pre}filereleases v, + ${pre}includes i, + ${pre}revisions ri, + ${pre}files f + where rf.id = i.id_rfile + and rf.id_file = f.id + and rf.id = v.id_rfile + and v.id_release = ? + and i.id_include_path = ri.id_file + and ri.id in ($placeholders)}); + + + $sth->execute($rel_id, @rfile_ids); + my $files = $sth->fetchall_arrayref(); + $sth->finish(); + + my @recurse; + foreach my $r (@$files) { + push(@recurse, $$r[0]) unless exists($$res{$$r[0]}); + + $$res{$$r[0]} = $$r[1]; + } + + $self->_get_includes_by_file($res, $rel_id, @recurse) if @recurse; + + return 1; +} + +sub add_hashed_document { + my ($self, $rfile_id, $doc_id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'add_hashed_document'} ||= + $dbh->prepare(qq{insert into ${pre}hashed_documents(id_rfile, doc_id) + values (?, ?)}); + $sth->execute($rfile_id, $doc_id); + + return 1; +} + +sub get_hashed_document { + my ($self, $rfile_id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'get_hashed_document'} ||= + $dbh->prepare(qq{select doc_id from ${pre}hashed_documents + where id_rfile = ?}); + my $doc_id; + if ($sth->execute($rfile_id) > 0) { + ($doc_id) = $sth->fetchrow_array(); + } + $sth->finish(); + + return $doc_id; +} + +sub get_symbol_usage { + my ($self, $rel_id, $symid) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'get_symbol_usage'} ||= + $dbh->prepare(qq{ + select u.id_rfile, u.line + from ${pre}usage u, ${pre}filereleases fr + where u.id_symbol = ? + and u.id_rfile = fr.id_rfile and fr.id_release = ?}); + + $sth->execute($symid, $rel_id); + my $res = $sth->fetchall_arrayref(); + $sth->finish(); + + return $res; +} + +sub get_identifier_info { + my ($self, $ident, $rel_id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'get_identifier_info'} ||= + $dbh->prepare(qq{ + select s.name, s.id, + i.id, i.type, f.path, i.line, cs.name, c.type, c.id, + i.id_rfile + from ${pre}identifiers i + left outer join ${pre}identifiers c on i.context = c.id + left outer join ${pre}symbols cs on c.id_symbol = cs.id, + ${pre}symbols s, ${pre}revisions r, ${pre}files f + where i.id = ? and i.id_symbol = s.id + and i.id_rfile = r.id and r.id_file = f.id}); + +# select i.id_rfile, f.path, i.line, i.type, i.context, s.id, s.name +# from identifiers i, symbols s, revisions r, files f +# where i.id = ? and i.id_symbol = s.id +# and i.id_rfile = r.id and r.id_file = f.id}); + + unless ($sth->execute($ident) == 1) { + return undef; + } + + my ($symname, $symid, + $iid, $type, $path, $line, $cname, $ctype, $cid, $rfile_id) = + $sth->fetchrow_array(); + $sth->finish(); + + my $refs = {$rfile_id => $path}; + $self->get_referring_files($rel_id, $rfile_id, $refs); + my $usage = $self->get_symbol_usage($rel_id, $symid); + + my %reflines; + foreach my $u (@$usage) { + next unless $$refs{$$u[0]}; + $reflines{$$refs{$$u[0]}} ||= []; + push(@{$reflines{$$refs{$$u[0]}}}, $$u[1]); + } + + return ($symname, $symid, + [$iid, $type, $path, $line, $cname, $ctype, $cid, $rfile_id], + \%reflines); +} + +sub get_rfile_timestamp { + my ($self, $rfile_id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'get_rfile_timestamp'} ||= + $dbh->prepare(qq{ + select extract(epoch from last_modified_gmt)::integer, + last_modified_tz + from ${pre}revisions where id = ?}); + + unless ($sth->execute($rfile_id) == 1) { + return undef; + } + + my ($epoch, $tz) = $sth->fetchrow_array(); + $sth->finish(); + + return ($epoch, $tz); +} + +1; diff --git a/lib/LXRng/Index/Generic.pm b/lib/LXRng/Index/Generic.pm new file mode 100644 index 0000000..cab2513 --- /dev/null +++ b/lib/LXRng/Index/Generic.pm @@ -0,0 +1,172 @@ +package LXRng::Index::Generic; + +use strict; +use Memoize; + + +sub new { + my ($class, %args) = @_; + + memoize('tree_id'); + memoize('release_id'); + memoize('file_id'); + memoize('symbol_id'); + + return bless(\%args, $class); +} + +sub prefix { + my ($self) = @_; + if (exists $$self{'table_prefix'}) { + return $$self{'table_prefix'}.'_'; + } + else { + return ''; + } +} + +sub tree_id { + my ($self, $tree, $update) = @_; + + return $self->_get_tree($tree) || + ($update ? $self->_add_tree($tree) : undef); +} + +sub release_id { + my ($self, $tree, $release, $update) = @_; + + my $tree_id = $self->tree_id($tree, $update); + + return $self->_get_release($tree_id, $release) || + ($update ? $self->_add_release($tree_id, $release) : undef); +} + +sub file_id { + my ($self, $path, $update) = @_; + + return $self->_get_file($path) || + ($update ? $self->_add_file($path) : undef); +} + +sub rfile_id { + my ($self, $node, $update) = @_; + + my $path = $node->name; + my $revision = $node->revision; + + my $file_id = $self->file_id($path, $update); + return undef unless $file_id; + + my ($id, $old_stamp) = $self->_get_rfile($file_id, $revision); + return $id unless $update; + + if ($id) { + my ($new_stamp) = $node->time =~ /^(\d+)/; + if ($update and $old_stamp > $new_stamp) { + $self->_update_rfile_timestamp($id, $node->time); + } + } + else { + $id = $self->_add_rfile($file_id, $revision, $node->time); + } + return $id; +} + +sub symbol_id { + my ($self, $symbol, $update) = @_; + + return $self->_get_symbol($symbol) || + ($update ? $self->_add_symbol($symbol) : undef); +} + +sub add_filerelease { + my ($self, $tree, $release, $rfile_id) = @_; + + my $rel_id = $self->release_id($tree, $release, 1); + + $self->_add_filerelease($rfile_id, $rel_id); +} + +sub add_include { + my ($self, $file_id, $include_path) = @_; + + my $inc_id = $self->_get_file($include_path); + + return 0 unless $inc_id; + return $self->_add_include($file_id, $inc_id); +} + +sub includes_by_file { + my ($self, $tree, $release, $path) = @_; + + my $file_id = $self->file_id($tree, $release, $path); + + return $self->_includes_by_id($file_id); +} + +#sub add_symbol { +# my ($self, $file_id, $line, $symbol, $type, $ctx_id) = @_; +# return $self->_add_symbol($file_id, $line, $symbol, $type, $ctx_id); +#} + +sub add_ident { + my ($self, $rfile_id, $line, $symbol, $type, $ctx_id) = @_; + + my $sym_id = $self->symbol_id($symbol, 1); + + return $self->_add_ident($rfile_id, $line, $sym_id, $type, $ctx_id); +} + +sub symbol_by_id { + my ($self, $id) = @_; + + return $self->_symbol_by_id($id); +} + +sub symbols_by_name { + my ($self, $tree, $release, $symbol) = @_; + + my $rel_id = $self->release_id($tree, $release); +# return $cache_sym{$rel_id}{$symbol} if +# exists $cache_sym{$rel_id} and exists $cache_sym{$rel_id}{$symbol}; + return $self->_identifiers_by_name($rel_id, $symbol); +# $cache_sym{$rel_id}{$symbol} = $id; +} + +sub symbols_by_file { + my ($self, $tree, $release, $path) = @_; + + $path =~ s!^/!!; + my $rel_id = $self->_get_release($self->_get_tree($tree), $release); + my $rfile_id = $self->_get_rfile_by_release($rel_id, $path); + + return $self->_symbols_by_file($rfile_id); +} + +sub add_usage { + my ($self, $file_id, $line, $symbol) = @_; + + my $sym_id = $self->symbol_id($symbol, 1); + + return $self->_add_usage($file_id, $line, $sym_id); +} + +sub usage_by_file { + my ($self, $tree, $release, $path) = @_; + + my $rel_id = $self->_get_release($self->_get_tree($tree), $release); + my $rfile_id = $self->_get_rfile_by_release($rel_id, $path); + + return $self->_usage_by_file($rfile_id); +} + +sub get_referring_files { + my ($self, $rel_id, $rfile_id, $res) = @_; + + $res ||= {}; + $self->_get_includes_by_file($res, $rel_id, $rfile_id); + + return keys %$res; +} + +1; diff --git a/lib/LXRng/Index/Pg.pm b/lib/LXRng/Index/Pg.pm new file mode 100644 index 0000000..05fe3a0 --- /dev/null +++ b/lib/LXRng/Index/Pg.pm @@ -0,0 +1,417 @@ +package LXRng::Index::Pg; + +use strict; +use DBI; + +use base qw(LXRng::Index::DBI); + +sub dbh { + my ($self) = @_; + + $$self{'dbh'} ||= DBI->connect('dbi:Pg:'.$$self{'db_spec'}, + $$self{'db_user'}, $$self{'db_pass'}, + {AutoCommit => 1, + RaiseError => 1}) + or die($DBI::errstr); + + return $$self{'dbh'}; +} + +sub init_db { + my ($self) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + $dbh->{AutoCommit} = 0; + + $dbh->do(qq{create sequence ${pre}treenum}) or die($dbh->errstr); + $dbh->do(qq{create sequence ${pre}relnum}) or die($dbh->errstr); + $dbh->do(qq{create sequence ${pre}filenum cache 50}) or die($dbh->errstr); + $dbh->do(qq{create sequence ${pre}revnum cache 50}) or die($dbh->errstr); + $dbh->do(qq{create sequence ${pre}symnum cache 50}) or die($dbh->errstr); + $dbh->do(qq{create sequence ${pre}identnum cache 50}) or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}charsets + ( + id serial, + name varchar, + primary key (id) + ) + }) or die($dbh->errstr); + + $dbh->do(qq{insert into ${pre}charsets(name) values ('ascii')}) + or die($dbh->errstr); + $dbh->do(qq{insert into ${pre}charsets(name) values ('utf-8')}) + or die($dbh->errstr); + $dbh->do(qq{insert into ${pre}charsets(name) values ('iso8859-1')}) + or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}trees + ( + id int default nextval('${pre}treenum'), + name varchar, + primary key (id) + ) + }) or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}releases + ( + id int default nextval('${pre}relnum'), + id_tree int references ${pre}trees(id), + release_tag varchar, + is_indexed bool default 'f', + primary key (id) + ) + }) or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}files + ( + id int default nextval('${pre}filenum'), + path varchar, + primary key (id), + unique (path) + ) + }) or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}revisions + ( + id int default nextval('${pre}revnum'), + id_file int references ${pre}files(id), + revision varchar, + last_modified_gmt timestamp without time zone, -- GMT + last_modified_tz varchar(5), -- Optional TZ + body_charset int references ${pre}charsets(id), + primary key (id), + unique (id_file, revision) + ) + }) or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}filestatus + ( + id_rfile int references ${pre}revisions(id), + indexed bool default 'f', + referenced bool default 'f', + hashed bool default 'f', + primary key (id_rfile) + ) + }) or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}hashed_documents + ( + id_rfile int references ${pre}revisions(id), + doc_id int not null, + primary key (id_rfile) + ) + }) or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}filereleases + ( + id_rfile int references ${pre}revisions(id), + id_release int references ${pre}releases(id), + primary key (id_rfile, id_release) + ) + }) or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}includes + ( + id_rfile int references ${pre}revisions(id), + id_include_path int references ${pre}files(id) + ) + }) or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}symbols + ( + id int default nextval('${pre}symnum'), + name varchar, + primary key (id), + unique (name) + ) + }) or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}identifiers + ( + id int default nextval('${pre}identnum'), + id_symbol int references ${pre}symbols(id) deferrable, + id_rfile int references ${pre}revisions(id) deferrable, + line int, + type char(1), + context int references ${pre}identifiers(id) deferrable, + primary key (id) + ) + }) or die($dbh->errstr); + + $dbh->do(qq{ + create table ${pre}usage + ( + id_rfile int references ${pre}revisions(id) deferrable, + id_symbol int references ${pre}symbols(id) deferrable, + line int + ) + }) or die($dbh->errstr); + + $dbh->do(qq{alter table ${pre}usage alter column id_symbol set statistics 250}); + $dbh->do(qq{alter table ${pre}identifiers alter column id_symbol set statistics 250}); + + $dbh->do(qq{create index ${pre}symbol_idx1 on ${pre}symbols using btree (name)}) + or die($dbh->errstr); + $dbh->do(qq{create index ${pre}ident_idx1 on ${pre}identifiers using btree (id_symbol)}) + or die($dbh->errstr); + $dbh->do(qq{create index ${pre}ident_idx2 on ${pre}identifiers using btree (id_rfile)}) + or die($dbh->errstr); + $dbh->do(qq{create index ${pre}ident_idx3 on ${pre}identifiers using btree (id_symbol, id_rfile)}) + or die($dbh->errstr); + $dbh->do(qq{create index ${pre}usage_idx1 on ${pre}usage using btree (id_symbol)}) + or die($dbh->errstr); + $dbh->do(qq{create index ${pre}usage_idx2 on ${pre}usage using btree (id_rfile)}) + or die($dbh->errstr); + $dbh->do(qq{create index ${pre}include_idx1 on ${pre}includes using btree (id_rfile)}) + or die($dbh->errstr); + $dbh->do(qq{create index ${pre}file_idx1 on ${pre}files using btree (path)}) + or die($dbh->errstr); + + $dbh->do(qq{grant select on ${pre}charsets to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}trees to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}releases to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}files to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}releases to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}filereleases to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}filestatus to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}hashed_documents to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}includes to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}symbols to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}identifiers to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}usage to public}) or die($dbh->errstr); + $dbh->do(qq{grant select on ${pre}revisions to public}) or die($dbh->errstr); + + $dbh->commit(); + $dbh->{AutoCommit} = 0; +} + +sub drop_db { + my ($self) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + local($dbh->{RaiseError}) = 0; + + $dbh->do(qq{drop index ${pre}symbol_idx1}); + $dbh->do(qq{drop index ${pre}ident_idx1}); + $dbh->do(qq{drop index ${pre}ident_idx2}); + $dbh->do(qq{drop index ${pre}usage_idx1}); + $dbh->do(qq{drop index ${pre}usage_idx2}); + $dbh->do(qq{drop index ${pre}include_idx1}); + $dbh->do(qq{drop index ${pre}file_idx1}); + + $dbh->do(qq{drop table ${pre}usage}); + $dbh->do(qq{drop table ${pre}identifiers}); + $dbh->do(qq{drop table ${pre}symbols}); + $dbh->do(qq{drop table ${pre}includes}); + $dbh->do(qq{drop table ${pre}filereleases}); + $dbh->do(qq{drop table ${pre}hashed_documents}); + $dbh->do(qq{drop table ${pre}filestatus}); + $dbh->do(qq{drop table ${pre}revisions}); + $dbh->do(qq{drop table ${pre}files}); + $dbh->do(qq{drop table ${pre}releases}); + $dbh->do(qq{drop table ${pre}trees}); + $dbh->do(qq{drop table ${pre}charsets}); + + $dbh->do(qq{drop sequence ${pre}treenum}); + $dbh->do(qq{drop sequence ${pre}relnum}); + $dbh->do(qq{drop sequence ${pre}filenum}); + $dbh->do(qq{drop sequence ${pre}revnum}); + $dbh->do(qq{drop sequence ${pre}symnum}); + $dbh->do(qq{drop sequence ${pre}identnum}); +} + +sub _add_tree { + my ($self, $tree) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_add_tree_ins'} ||= + $dbh->prepare(qq{insert into ${pre}trees(name) values (?)}); + $sth->execute($tree); + + $sth = $$self{'sth'}{'_add_tree_insid'} ||= + $dbh->prepare(qq{select currval('${pre}treenum')}); + $sth->execute(); + my ($id) = $sth->fetchrow_array(); + $sth->finish(); + + return $id; +} + +sub _add_release { + my ($self, $tree_id, $release) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_add_relase_ins'} ||= + $dbh->prepare(qq{insert into ${pre}releases(id_tree, release_tag) + values (?, ?)}); + $sth->execute($tree_id, $release); + + $sth = $$self{'sth'}{'_add_release_insid'} ||= + $dbh->prepare(qq{select currval('${pre}relnum')}); + $sth->execute(); + my ($id) = $sth->fetchrow_array(); + $sth->finish(); + + return $id; +} + +sub _add_file { + my ($self, $path) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_add_file_ins'} ||= + $dbh->prepare(qq{insert into ${pre}files(path) values (?)}); + $sth->execute($path); + + $sth = $$self{'sth'}{'_add_file_insid'} ||= + $dbh->prepare(qq{select currval('${pre}filenum')}); + $sth->execute(); + my ($id) = $sth->fetchrow_array(); + $sth->finish(); + + return $id; +} + +sub _get_rfile { + my ($self, $file_id, $revision) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_get_rfile'} ||= + $dbh->prepare(qq{select id, extract('epoch' from last_modified_gmt + at time zone 'UTC') + from ${pre}revisions + where id_file = ? and revision = ?}); + my ($id, $gmt); + if ($sth->execute($file_id, $revision) > 0) { + ($id, $gmt) = $sth->fetchrow_array(); + } + $sth->finish(); + + return ($id, $gmt); +} + +sub _add_rfile { + my ($self, $file_id, $revision, $time) = @_; + + my ($epoch, $zone) = $time =~ /^(\d+)(?: ([-+]\d\d\d\d)|)$/; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_add_rfile_ins'} ||= + $dbh->prepare(qq{ + insert into ${pre}revisions(id_file, revision, + last_modified_gmt, + last_modified_tz) + values (?, ?, + timestamp 'epoch' + ? * interval '1 second', ?)}); + $sth->execute($file_id, $revision, $epoch, $zone) + or die($dbh->errstr); + + $sth = $$self{'sth'}{'_add_rfile_insid'} ||= + $dbh->prepare(qq{select currval('${pre}revnum')}); + $sth->execute(); + my ($id) = $sth->fetchrow_array(); + $sth->finish(); + + return $id; +} + +sub _update_rfile_timestamp { + my ($self, $rfile_id, $time) = @_; + + my ($epoch, $zone) = $time =~ /^(\d+)(?: ([-+]\d\d\d\d)|)$/; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_update_rfile_timestamp'} ||= + $dbh->prepare(qq{ + update ${pre}revisions set + last_modified_gmt = timestamp 'epoch' + ? * interval '1 second', + last_modified_tz = ? + where id = ?}); + + $sth->execute($epoch, $zone, $rfile_id) + or die($dbh->errstr); + $sth->finish(); +} + +sub _add_filerelease { + my ($self, $rfile_id, $rel_id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_add_filerelease'} ||= + $dbh->prepare(qq{insert into ${pre}filereleases(id_rfile, id_release) + select ?, ? where not exists + (select 1 from ${pre}filereleases + where id_rfile = ? and id_release = ?)}); + $sth->execute($rfile_id, $rel_id, $rfile_id, $rel_id); +} + +sub _add_symbol { + my ($self, $symbol) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_add_symbol_ins'} ||= + $dbh->prepare(qq{insert into ${pre}symbols(name) values (?)}); + $sth->execute($symbol); + + $sth = $$self{'sth'}{'_add_symbol_insid'} ||= + $dbh->prepare(qq{select currval('${pre}symnum')}); + $sth->execute(); + my ($id) = $sth->fetchrow_array(); + $sth->finish(); + + return $id; +} + +sub _add_ident { + my ($self, $rfile_id, $line, $sym_id, $type, $ctx_id) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_add_ident_ins'} ||= + $dbh->prepare(qq{insert into ${pre}identifiers + (id_rfile, line, id_symbol, type, context) + values (?, ?, ?, ?, ?)}); + $sth->execute($rfile_id, $line, $sym_id, $type, $ctx_id); + + $sth = $$self{'sth'}{'_add_ident_insid'} ||= + $dbh->prepare(qq{select currval('${pre}identnum')}); + $sth->execute(); + my ($id) = $sth->fetchrow_array(); + $sth->finish(); + + return $id; +} + +sub DESTROY { + my ($self) = @_; + + if ($$self{'dbh'}) { + $$self{'dbh'}->rollback(); + $$self{'dbh'}->disconnect(); + delete($$self{'dbh'}); + } +} + +1; diff --git a/lib/LXRng/Index/PgBatch.pm b/lib/LXRng/Index/PgBatch.pm new file mode 100644 index 0000000..8f8844c --- /dev/null +++ b/lib/LXRng/Index/PgBatch.pm @@ -0,0 +1,217 @@ +package LXRng::Index::PgBatch; + +# Specialized subclass of LXRng::Index::Pg for doing parallelized +# batched inserts into database. Higher performance (and higher +# complexity). + +use strict; +use DBI; +use POSIX qw(:sys_wait_h); + +use base qw(LXRng::Index::Pg); + + +sub transaction { + my ($self, $code) = @_; + + if ($self->dbh->{AutoCommit}) { + $self->dbh->{AutoCommit} = 0; + $self->dbh->do(q(set constraints all deferred)); + $code->(); + $self->flush(); + $self->dbh->commit(); + $self->dbh->{AutoCommit} = 1; + + # At end of outermost transaction, wait for outstanding flushes + $self->_flush_wait(); + } + else { + # If we're in a transaction already, don't return to + # AutoCommit state. + $code->(); + + # Only occasional synchronization if we're inside another + # transaction. + if ($self->{'writes'}++ % 997 == 0) { + $self->flush(); + $self->dbh->commit(); + } + } +} + +sub new { + my ($class, @args) = @_; + + my $self = $class->SUPER::new(@args); + $$self{'writes'} = 0; + + return $self; +} + +sub flush { + my ($self) = @_; + + return unless exists($$self{'cache'}); + + $self->_flush_wait(); + + my $pre = $self->prefix; + $self->dbh->commit() unless $self->dbh->{AutoCommit}; + my $pid = fork(); + die("fork failed: $!") unless defined($pid); + if ($pid == 0) { + $SIG{'INT'} = 'IGNORE'; + $SIG{'QUIT'} = 'IGNORE'; + $SIG{'TERM'} = 'IGNORE'; + + my $i = 0; + $$self{'dbh'} = undef; + foreach my $table (qw(symbols identifiers usage)) { + if (exists($$self{'cache'}{$table})) { + $self->dbh->do(qq{copy $pre$table from stdin}); + foreach my $l (@{$$self{'cache'}{$table}}) { + $i++; + $self->dbh->pg_putline($l); + } + $self->dbh->pg_endcopy; + } + } + $self->dbh->commit() unless $self->dbh->{AutoCommit}; + $self->dbh->do(q(analyze)) if $i > 100000; + $self->dbh->disconnect(); + warn "\n*** index: flushed $i rows\n"; + kill(9, $$); + } + $$self{'flush_pid'} = $pid; + delete($$self{'cache'}); + warn "\n*** index: flushing in background\n"; +} + +sub _flush_wait { + my ($self) = @_; + + return unless $$self{'flush_pid'}; + waitpid($$self{'flush_pid'}, WNOHANG); # Reap zombies + return unless kill(0, $$self{'flush_pid'}); + + warn "\n*** index: waiting for running flush to complete...\n"; + $self->dbh->commit() unless $self->dbh->{AutoCommit}; + waitpid($$self{'flush_pid'}, 0); +} + +sub _cache { + my ($self, $name) = @_; + + $$self{'cache'}{$name} ||= []; + return $$self{'cache'}{$name}; +} + +sub _cached_seqno { + my ($self, $seqname) = @_; + + unless (exists($$self{'cached_seqno'}{$seqname}) and + $$self{'cached_seqno'}{$seqname}{'min'} <= + $$self{'cached_seqno'}{$seqname}{'max'}) + { + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_cached_seqno'}{$seqname} ||= + $dbh->prepare(qq{select setval('$pre$seqname', + nextval('$pre$seqname')+1000)}); + $sth->execute(); + my ($id) = $sth->fetchrow_array(); + $sth->finish(); + $$self{'cached_seqno'}{$seqname}{'min'} = $id-1000; + $$self{'cached_seqno'}{$seqname}{'max'} = $id; + } + + return $$self{'cached_seqno'}{$seqname}{'min'}++; +} + +sub _prime_symbol_cache { + my ($self) = @_; + + my $dbh = $self->dbh; + my $pre = $self->prefix; + my $sth = $$self{'sth'}{'_prime_symbol_cache'} ||= + $dbh->prepare(qq{select name, id from ${pre}symbols}); + $sth->execute(); + my %cache; + while (my ($name, $id) = $sth->fetchrow_array()) { + $cache{$name} = $id; + } + $sth->finish; + + $$self{'__symbol_cache'} = \%cache; +} + +sub _add_usage { + my ($self, $file_id, $line, $symbol_id) = @_; + + push(@{$self->_cache('usage')}, "$file_id\t$symbol_id\t$line\n"); + + return 1; +} + +sub _add_symbol { + my ($self, $symbol) = @_; + + my $id = $self->_cached_seqno('symnum'); + push(@{$self->_cache('symbols')}, "$id\t$symbol\n"); + + $self->_prime_symbol_cache() + unless exists $$self{'__symbol_cache'}; + + $$self{'__symbol_cache'}{$symbol} = $id; + + return $id; +} + +sub _add_ident { + my ($self, $rfile_id, $line, $sym_id, $type, $ctx_id) = @_; + + $ctx_id = '\\N' unless defined($ctx_id); + + my $id = $self->_cached_seqno('identnum'); + push(@{$self->_cache('identifiers')}, join("\t", $id, $sym_id, + $rfile_id, $line, $type, + $ctx_id)."\n"); + + return $id; +} + +my $_get_symbol_usage = 0; +sub _get_symbol { + my ($self, $symbol) = @_; + + unless (exists($$self{'__symbol_cache'})) { + # Only prime the cache once it's clear that we're likely to + # hit it a significant number of times. + return $self->SUPER::_get_symbol($symbol) if + $_get_symbol_usage++ < 100; + + $self->_prime_symbol_cache(); + } + + return $$self{'__symbol_cache'}{$symbol} if + exists $$self{'__symbol_cache'}{$symbol}; + + return undef; +} + +sub DESTROY { + my ($self) = @_; + + if ($$self{'writes'} > 0) { + $self->flush(); + $self->_flush_wait(); + } + + if ($$self{'dbh'}) { + $$self{'dbh'}->rollback() unless $$self{'dbh'}{'AutoCommit'}; + $$self{'dbh'}->disconnect(); + delete($$self{'dbh'}); + } +} + +1; |