#!/usr/bin/perl # # Copyright (C) 2008 Arne Georg Gleditsch and others. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # The full GNU General Public License is included in this distribution # in the file called COPYING. use strict; use warnings; use FindBin; use lib "$FindBin::Bin/lib"; use LXRng ROOT => $FindBin::Bin; use LXRng::Context; use LXRng::Lang; use LXRng::Index; use LXRng::Parse::Simple; use Carp; use Data::Dumper; use IO::Handle; use Fcntl; use Term::ProgressBar; use Devel::Size qw(size total_size); $SIG{'INT'} = sub { die "\nSIGINT: $$: please wait, flushing caches...\n"; }; $SIG{'QUIT'} = sub { die "\nSIGQUIT: $$: please wait, flushing caches...\n"; }; $SIG{'TERM'} = sub { die "\nSIGTERM: $$: please wait, flushing caches...\n"; }; $SIG{'PIPE'} = sub { die "\nSIGTERM: $$: please wait, flushing caches...\n"; }; $SIG{'USR1'} = \&memory_status; autoflush STDOUT 1; autoflush STDERR 1; my $tree = shift(@ARGV); my @versions = @ARGV; my $context = LXRng::Context->new('tree' => $tree); LXRng::Lang->init($context); my $index = $context->config->{'index'}; my $usage = $context->config->{'usage'}; my $hash = $context->config->{'search'}; my $rep = $context->config->{'repository'}; my $progress; $SIG{'__WARN__'} = sub { $progress ? $progress->message(shift) : warn(@_) }; sub memory_status { open(my $statf, "< /proc/$$/statm"); my %stat; @stat{qw(size resident shared text lib data)} = split(" ", <$statf>); warn '$$$ total size '.$stat{'size'}."\n"; foreach my $k (keys %$index) { warn '$$$ index '. sprintf("%-17s %10d\n", $k, total_size($$index{$k})); } } sub make_add_ident($) { my ($fileid) = @_; my $last_func; my %identcache; my $add_ident; $add_ident = sub { my ($symbol, $info) = @_; if ($$info{'kind'} eq 'f') { $last_func = $symbol; } if ($$info{'kind'} eq 'l') { $$info{'context'} = $identcache{$last_func}; } if (exists $$info{'class'}) { $$info{'context'} = $identcache{$$info{'class'}}; } if (exists $$info{'struct'}) { $$info{'context'} = $identcache{$$info{'struct'}}; } $identcache{$symbol} = $index->add_ident($fileid, $$info{'line'}, $symbol, $$info{'kind'}, $$info{'context'}); if ($$info{'kind'} eq 'f' and exists $$info{'signature'}) { # This needs to be more robust. Perhaps ctags ought to do it. foreach my $v (split(/,/, $$info{'signature'})) { next if $v !~ /([a-zA-Z_0-9]+)[^a-zA-Z_0-9]*$/ or $1 eq 'void'; $add_ident->($1, {'kind' => 'l', 'line' => $$info{'line'}}); } } } } sub index_file($$) { my ($file, $fileid) = @_; my $lang = LXRng::Lang->new($file); return 0 unless $index->to_index($fileid); return 1 unless $lang->doindex(); my $add_ident = make_add_ident($fileid); warn("--- indexing ".$file->name." [".$file->revision."]\n"); my @extra_flags = ('-IEXPORT_SYMBOL+', '-I__initcall+'); my $ctags; my $pid = open($ctags, '-|'); die $! unless defined $pid; if ($pid == 0) { exec('ctags-exuberant', @extra_flags, '--fields=+aifmknsSz', '--sort=no', '--excmd=number', '-f', '-', '--language-force='.$lang->ctagslangname, $lang->ctagsopts, $file->phys_path); # Still here? warn $!; kill(9, $$); } while (<$ctags>) { chomp; my ($symbol, $file, $excmd, @info) = split(/\t/); my %info = map { split(/:/, $_, 2) } @info; $add_ident->($symbol, \%info); } return 1; } # We allow $usage to be supplied both by the Search and Index # backends, since it's not quite clear which is better. There's a # certain added complexity because of this, so perhaps this feature # ought to go once a clear best choice emerges. (For instance, # reference_file is called from hash_file with a reference to the # hash indexing's document object, which is only actually used if # $usage is the Search backend.) sub reference_file($$$) { my ($file, $fileid, $doc) = @_; my $lang = LXRng::Lang->new($file); return 0 unless $index->to_reference($fileid); return 1 unless $lang->doindex(); my $handle; sysopen($handle, $file->phys_path, 0) || die($!); my $parse = new LXRng::Parse::Simple($handle, 8, @{$lang->parsespec}); warn("--- referencing ".$file->name." [".$file->revision."]\n"); my $reserved = $lang->reserved(); my $re = qr( (?m:^|[^a-zA-Z0-9_]) # Non-symbol chars. (_*[a-zA-Z][a-zA-Z0-9_]*) # The symbol. \b )x; my %refs; my $line = 1; while (1) { my ($btype, $frag) = $parse->nextfrag; last unless defined $frag; $btype ||= 'code'; if ($btype eq 'code') { while ($frag =~ /\G.*?(?:(\n)|$re)/gc) { $line++ && next if defined $1; my $sym = $2; next if $$reserved{$sym}; push(@{$refs{$sym} ||= []}, $line); } } else { if ($btype eq 'include') { my @paths = $lang->resolve_include($context, $file, $frag); foreach my $path (@paths) { $index->add_include($fileid, $path); } } $line += $frag =~ tr/\n/\n/; } } close($handle); foreach my $sym (keys %refs) { my $sym_id = $index->symbol_id($sym, 1); $usage->add_usage($doc, $fileid, $sym_id, $refs{$sym}); } undef %refs; return 1; } sub hash_file($$$) { my ($file, $fileid, $rels) = @_; my $docid; if ($index->to_hash($fileid)) { my $handle; sysopen($handle, $file->phys_path, 0) || die($!); warn("--- hashing ".$file->name." [".$file->revision."]\n"); my $doc = $hash->new_document($file->name); while (<$handle>) { my $pos = 0; # Latin-1 word characters. foreach my $term (/([0-9a-zA-Z\300-\326\330-\366\370-\377]+)/g) { # TODO: For foo_bar_zoo_ack, index # - foo_bar_zoo_ack # - foo_bar bar_zoo zoo_ack # - foo bar zoo ack # This enables subcomponent searches without running # into the stopword problem that earlier # reduce-to-phrase approaches suffered from. $term = lc($term); next if length($term) > 128; $doc->add_posting($term, $.*100 + $pos++); } } reference_file($file, $fileid, $doc); $docid = $hash->add_document($doc, [map { $index->release_id($tree, $_) } @$rels]); $index->add_hashed_document($fileid, $docid); $handle->close(); return 1; } else { $docid = $index->get_hashed_document($fileid); my $doc = $hash->get_document($docid); if (reference_file($file, $fileid, $doc)) { $hash->save_document($docid, $doc); } my $changed = $hash->add_release($docid, [map { $index->release_id($tree, $_) } @$rels]); return $changed; } } sub inventory_release($) { my ($version) = @_; warn("--- recording all files for $version\n"); my $iter = $rep->iterator($version); LXRng::Index::transaction { my $root = $rep->node('/', $version) or die "bad root"; my $node; while (defined($node = $iter->next)) { next if $node->name =~ /\.o$/; my $fileid = $index->rfile_id($node, 1); $index->add_filerelease($tree, $version, $fileid); } } $index; } sub index_pending() { my $pending = $index->pending_files($tree); my $total = 0+@$pending; my $count = 0; print("\n"); $progress = Term::ProgressBar->new({name => 'Indexing', count => $total, ETA => 'linear'}); $progress->max_update_rate(0.25); warn("--- indexing/updating $total files...\n"); foreach my $p (@$pending) { LXRng::Index::transaction { my ($fileid, $path, $rev) = @$p; my $rels = $index->new_releases_by_file($fileid); next unless @$rels; $context->release($$rels[0]); # Needed for include resolution. my $node = $rep->node($path, $$rels[0], $rev); next unless $node; if (hash_file($node, $fileid, $rels) | index_file($node, $fileid)) { $count++; $progress->update($count); } else { $total--; my $skip = @$pending - $total; if ($skip % 100 == 0) { warn("--- skipped/refreshed $skip files...\n"); } $progress->target($total); } } $index; } $progress->update($total); my $done = $index->update_indexed_releases($tree); warn("=== releases: ".join(", ", @$done)."\n") if @$done; print("\n"); } $progress = Term::ProgressBar->new({name => 'Recording', count => 1, ETA => 'linear'}); $progress->max_update_rate(0.25); if (@versions) { $progress->target(1+@versions); foreach my $version (@versions) { inventory_release($version); $progress->update(); } } else { @versions = grep { ! $index->_get_release($index->tree_id($tree), $_); } @{$context->all_releases}; $progress->target(1+@versions); LXRng::Index::transaction { foreach my $version (reverse @versions) { # TODO: Breaking during the inventory process renders # version half-recorded. inventory_release($version); $progress->update(); } } $index; } $progress->update(); LXRng::Index::transaction { index_pending(); } $index; $hash->flush(); undef $progress;