diff options
| author | Arne Georg Gleditsch <argggh@taniquetil.(none)> | 2009-03-06 21:13:18 +0100 | 
|---|---|---|
| committer | Arne Georg Gleditsch <argggh@taniquetil.(none)> | 2009-03-06 21:13:18 +0100 | 
| commit | 8d7d1d2595c2c3475ec27d07d8544e345dab5851 (patch) | |
| tree | 6337fd7aa489a73c44abb742b2bac22741b00779 | |
| parent | a25f8442450d7e9fb188a979ab0897cb86e9d84f (diff) | |
Added more language modules, threw things about a bit.
| -rw-r--r-- | INSTALL | 4 | ||||
| -rw-r--r-- | lib/LXRng/Index/DBI.pm | 16 | ||||
| -rw-r--r-- | lib/LXRng/Index/Pg.pm | 2 | ||||
| -rw-r--r-- | lib/LXRng/Lang/C.pm | 37 | ||||
| -rw-r--r-- | lib/LXRng/Lang/Generic.pm | 5 | ||||
| -rw-r--r-- | lib/LXRng/Lang/GnuAsm.pm | 164 | ||||
| -rw-r--r-- | lib/LXRng/Lang/Kconfig.pm | 118 | ||||
| -rw-r--r-- | lib/LXRng/Lang/Undefined.pm | 4 | ||||
| -rw-r--r-- | lib/LXRng/Markup/File.pm | 13 | ||||
| -rw-r--r-- | lib/LXRng/Parse/Simple.pm | 111 | ||||
| -rw-r--r-- | lib/LXRng/Search/Xapian.pm | 33 | ||||
| -rwxr-xr-x | lxr-genxref | 37 | ||||
| -rw-r--r-- | lxrng.conf-dist | 8 | 
13 files changed, 433 insertions, 119 deletions
@@ -20,6 +20,10 @@ DEPENDENCIES    - HTML::Entities [libhtml-parser-perl]    - Template [libtemplate-perl] +* Misc Perl modules +  - Devel::Size [libdevel-size-perl] +  - Term::ProgressBar [libterm-progressbar-perl] +  * (For gzip content transfer compression: PerlIO::gzip)  * "Exuberant ctags", runnable as ctags-exuberant somewhere in the diff --git a/lib/LXRng/Index/DBI.pm b/lib/LXRng/Index/DBI.pm index 763f534..55e215d 100644 --- a/lib/LXRng/Index/DBI.pm +++ b/lib/LXRng/Index/DBI.pm @@ -520,6 +520,22 @@ sub get_identifier_info {  	    \%reflines);  } +sub set_rfile_charset { +    my ($self, $rfile_id, $charset) = @_; + +    my $dbh = $self->dbh; +    my $pre = $self->prefix; +    my $sth = $$self{'sth'}{'set_rfile_charset'} ||= +	$dbh->prepare(qq{ +	    update ${pre}revisions +		set body_charset = (select id from ${pre}charsets +				    where name = ?)  +		where id = ?}); +     +    return $sth->execute($charset, $rfile_id); +} + +  sub get_rfile_timestamp {      my ($self, $rfile_id) = @_; diff --git a/lib/LXRng/Index/Pg.pm b/lib/LXRng/Index/Pg.pm index 3654a72..7387abc 100644 --- a/lib/LXRng/Index/Pg.pm +++ b/lib/LXRng/Index/Pg.pm @@ -67,7 +67,7 @@ sub init_db {  	or die($dbh->errstr);      $dbh->do(qq{insert into ${pre}charsets(name) values ('utf-8')})  	or die($dbh->errstr); -    $dbh->do(qq{insert into ${pre}charsets(name) values ('iso8859-1')}) +    $dbh->do(qq{insert into ${pre}charsets(name) values ('iso-8859-1')})  	or die($dbh->errstr);      $dbh->do(qq{ diff --git a/lib/LXRng/Lang/C.pm b/lib/LXRng/Lang/C.pm index 60a571b..db3a204 100644 --- a/lib/LXRng/Lang/C.pm +++ b/lib/LXRng/Lang/C.pm @@ -51,13 +51,13 @@ sub identifier_re {      return $_identifier_re;  } -my $_reserved ||= { map { $_ => 1 } -		    qw(asm auto break case char const continue default -		       do double else enum extern float for fortran -		       goto if int long register return short signed -		       sizeof static struct switch typedef union -		       unsigned void volatile while #define #else -		       #endif #if #ifdef #ifndef #include #undef)}; +my $_reserved = { map { $_ => 1 } +		  qw(asm auto break case char const continue default +		     do double else enum extern float for fortran goto +		     if int long register return short signed sizeof +		     static struct switch typedef union unsigned void +		     volatile while #define #else #endif #if #ifdef +		     #ifndef #include #undef)};  sub reserved {      return $_reserved; @@ -73,31 +73,10 @@ sub parsespec {  	    'include',	'#\s*include\s+<',	'>'];  } -sub typemap { -    return { -	'c' => 'class', -	'd' => 'macro (un)definition', -	'e' => 'enumerator', -	'f' => 'function definition', -	'g' => 'enumeration name', -	'm' => 'class, struct, or union member', -	'n' => 'namespace', -	'p' => 'function prototype or declaration', -	's' => 'structure name', -	't' => 'typedef', -	'u' => 'union name', -	'v' => 'variable definition', -	'x' => 'extern or forward variable declaration', -	'i' => 'interface'}; -} -  sub markuphandlers {      my ($self, $context, $node, $markup) = @_;      my $index = $context->config->{'index'}; -    my $idre = $self->identifier_re(); -    my $res  = $self->reserved(); -      my %subst;      my $format_newline = $markup->make_format_newline($node); @@ -121,7 +100,7 @@ sub markuphandlers {      $subst{'code'} = new Subst::Complex  	qr/\n/	   => $format_newline, -	qr/[^\n]*/ => sub { $markup->format_code($idre, $res, @_) }; +	qr/[^\n]*/ => sub { $markup->format_code($self, @_) };      $subst{'start'} = new Subst::Complex  	qr/^/	   => $format_newline; diff --git a/lib/LXRng/Lang/Generic.pm b/lib/LXRng/Lang/Generic.pm index 82e9b9d..3e1f545 100644 --- a/lib/LXRng/Lang/Generic.pm +++ b/lib/LXRng/Lang/Generic.pm @@ -40,4 +40,9 @@ sub expand_include {      return ();  } +sub mangle_sym { +    my ($self, $sym) = @_; +    return $sym; +} +  1; diff --git a/lib/LXRng/Lang/GnuAsm.pm b/lib/LXRng/Lang/GnuAsm.pm new file mode 100644 index 0000000..acdcdef --- /dev/null +++ b/lib/LXRng/Lang/GnuAsm.pm @@ -0,0 +1,164 @@ +# Copyright (C) 2008 Arne Georg Gleditsch <lxr@linux.no>. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# The full GNU General Public License is included in this distribution +# in the file called COPYING. + +package LXRng::Lang::GnuAsm; + +use strict; +use Subst::Complex; + +use base qw(LXRng::Lang::Generic); + + +sub doindex { +    return 1; +} + +sub ctagslangname { +    return 'asm'; +} + +sub ctagsopts { +    return (); +} + +sub pathexp { +    return qr/\.[sS]$/; +} + +my $_identifier_re = qr( +			(?m:^|(?<=[^a-zA-Z0-9_\#]))	# Non-symbol chars. +			(_*[a-zA-Z][a-zA-Z0-9_]*)	# The symbol. +			\b +			)x; + +sub identifier_re { +    return $_identifier_re; +} + +my $_reserved ||= { map { $_ => 1 } +		    (qw(aaa aad aam aas adc bound bsf bsr bswap btc +		       btr call cbw cwde cdqe cwd cdq cqo clc cld +		       clflush cmc cmps cmpsb cmpsw cmpsd cmpsq +		       cmpxchg cmpxchg8b cmpxchg16b cpuid daa das +		       enter ins insb insw insd int into jcxz jecxz +		       jrcxz jmp lahf lds les lfs lgs lss leave lfence +		       lock lods lodsb lodsw lodsd lodsq loop loope +		       loopne loopnz loopz mfence movd movmskpd +		       movmskps movnti movs movsb movsw movsd movsq +		       movsx movsxd movzx nop outs outsb outsw outsd +		       pause popa popad prefetch prefetchw pusha +		       pushad pushfd pushfq ret sahf sbb scas scasb +		       scasw scasd scasq sfence stc std stos stosb +		       stosw stosd stosq xadd xchg xlat xlatb arpl +		       clgi cli clts hlt int invd invlpg invlpga iret +		       iretd iretq lar lgdt lidt lldt lmsw lretq lsl +		       ltr rep rdmsr rdpmc rdtsc rdtscp rsm sgdt sidt +		       skinit sldt smsw sti stgi str swapgs syscall +		       sysenter sysexit sysret ud2 verr verw vmload +		       vmmcall vmrun vmsave wbinvd wrmsr), + +		     (map { $_, $_.'b', $_.'w', $_.'l', $_.'q' } +		      qw(add and mov bt bts cmp dec div idiv imul inc +			 in lea mul neg not or out pop popf push pushf +			 rcl rcr rol ror sal shl sar shl shr sub test +			 xor)), + +		     (map { 'cmov'.$_, 'j'.$_, 'set'.$_ } +		      qw(o no b c nae nb nc ae z e nz ne be na nbe a s +			 ns p pe np po l nge nl ge le ng nle g)) +		     )}; +		      + +sub reserved { +    return $_reserved; +} + +sub parsespec { +    return ['atom',	'\\\\.',	undef, +	    'atom',	'%[a-z][a-z0-9]+', undef, # Registers +	    'atom',	'[.][a-z0-9]+', undef, # Directives +	    'comment',	'/\*',		'\*/', +	    'comment',	'//',		"\$", +	    'string',	'"',		'"', +	    'string',	"'",		"'", +	    'atom',	'#\s*(?:ifn?def|define|else|endif|undef)', undef, +	    'include',	'#\s*include\s+"',	'"', +	    'include',	'#\s*include\s+<',	'>', +	    'comment',	'#',		"\$"]; +} + +sub markuphandlers { +    my ($self, $context, $node, $markup) = @_; + +    my $index = $context->config->{'index'}; +    my %subst; + +    my $format_newline = $markup->make_format_newline($node); +    $subst{'comment'} = new Subst::Complex +	qr/\n/     => $format_newline, +	qr/[^\n]+/ => sub { $markup->format_comment(@_) }; +	 +    $subst{'string'} = new Subst::Complex +	qr/\n/        => $format_newline, +	qr/[^\n\"\']+/ => sub { $markup->format_string(@_) }; + +    $subst{'include'} = new Subst::Complex +	qr/\n/ => $format_newline, +	qr/(include\s*\")(.*?)(\")/ => sub { +	    $markup->format_include([$self->resolve_include($context, $node, @_)], +				    @_) }, +				   +	qr/(include\s*\<)(.*?)(\>)/ => sub { +	    $markup->format_include([$self->resolve_include($context, $node, @_)], +				    @_) }; +	 +    $subst{'code'} = new Subst::Complex +	qr/\n/	   => $format_newline, +	qr/[^\n]*/ => sub { $markup->format_code($self, @_) }; + +    $subst{'start'} = new Subst::Complex +	qr/^/	   => $format_newline; +     +    return \%subst; +} + +sub resolve_include { +    my ($self, $context, $node, $frag) = @_; + +    if ($frag =~ /include\s+<(.*?)>/) { +	return $self->expand_include($context, $node, $1); +    } +    elsif ($frag =~ /include\s+\"(.*?)\"/) { +	my $incl = $1; +	my $bare = $1; +	my $name = $node->name(); +	if ($name =~ /(.*\/)/) { +	    $incl = $1.$incl; +	    1 while $incl =~ s,/[^/]+/../,/,; +	     +	    my $file = $context->config->{'repository'}->node($incl, $context->release); +	    return $incl if $file; +	    return $self->expand_include($context, $node, $bare); +	} +    } + +    return (); +} + +1; diff --git a/lib/LXRng/Lang/Kconfig.pm b/lib/LXRng/Lang/Kconfig.pm new file mode 100644 index 0000000..7417236 --- /dev/null +++ b/lib/LXRng/Lang/Kconfig.pm @@ -0,0 +1,118 @@ +# Copyright (C) 2008 Arne Georg Gleditsch <lxr@linux.no>. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# The full GNU General Public License is included in this distribution +# in the file called COPYING. + +package LXRng::Lang::Kconfig; + +use strict; +use Subst::Complex; + +use base qw(LXRng::Lang::Generic); + + +sub doindex { +    return 1; +} + +sub ctagslangname { +    return undef; +} + +sub pathexp { +    return qr/Kconfig$/; +} + +my $_identifier_re = qr( +			(?m:^|(?<=[^A-Z0-9_\#]))	# Non-symbol chars. +			(_*[A-Z][A-Z0-9_]*)		# The symbol. +			\b +			)x; + +sub identifier_re { +    return $_identifier_re; +} + +my $_reserved = { map { $_ => 1 } +		  qw(menu source endmenu config bool if default help +		     tristate depends on y n m)}; + +sub reserved { +    return $_reserved; +} + +sub parsespec { +    return ['atom',	'\\\\.',	undef, +	    'comment',	'#',		"\$", +	    'string',	'"',		'"', +	    'string',	"'",		"'", +	    'help',     'help', 	"^(?=[^ \t\n])", +	    'include',	'^source\s+"',	'"']; +} + +sub mangle_sym { +    return $_[1] =~ /^[A-Z0-9_]+$/ ? 'CONFIG_'.$_[1] : $_[1]; +} + +sub markuphandlers { +    my ($self, $context, $node, $markup) = @_; + +    my $index = $context->config->{'index'}; +    my %subst; + +    my $format_newline = $markup->make_format_newline($node); +    $subst{'comment'} = new Subst::Complex +	qr/\n/     => $format_newline, +	qr/[^\n]+/ => sub { $markup->format_comment(@_) }; +	 +    $subst{'help'} = new Subst::Complex +	qr/\n/        => $format_newline, +	qr/^[ \t]*help[ \t]*/ => sub { $markup->format_code($self, @_) }, +	qr/[^\n\"\']+/ => sub { $markup->format_string(@_) }; + +    $subst{'string'} = new Subst::Complex +	qr/\n/        => $format_newline, +	qr/[^\n\"\']+/ => sub { $markup->format_string(@_) }; + +    $subst{'include'} = new Subst::Complex +	qr/\n/ => $format_newline, +	qr/(include\s*\")(.*?)(\")/ => sub { +	    $markup->format_include([$self->resolve_include($context, $node, @_)], +				    @_) }, +				   +	qr/(include\s*\<)(.*?)(\>)/ => sub { +	    $markup->format_include([$self->resolve_include($context, $node, @_)], +				    @_) }; +	 +    $subst{'code'} = new Subst::Complex +	qr/\n/	   => $format_newline, +	qr/[^\n]*/ => sub { $markup->format_code($self, @_) }; + +    $subst{'start'} = new Subst::Complex +	qr/^/	   => $format_newline; +     +    return \%subst; +} + +sub resolve_include { +    my ($self, $context, $node, $frag) = @_; + +    return (); +} + +1; + diff --git a/lib/LXRng/Lang/Undefined.pm b/lib/LXRng/Lang/Undefined.pm index c7d5d09..0989bdb 100644 --- a/lib/LXRng/Lang/Undefined.pm +++ b/lib/LXRng/Lang/Undefined.pm @@ -41,10 +41,6 @@ sub parsespec {      return ['atom',	'\\\\.',	undef];  } -sub typemap { -    return {}; -} -  sub markuphandlers {      my ($self, $context, $node, $markup) = @_; diff --git a/lib/LXRng/Markup/File.pm b/lib/LXRng/Markup/File.pm index c3d576c..054463a 100644 --- a/lib/LXRng/Markup/File.pm +++ b/lib/LXRng/Markup/File.pm @@ -92,18 +92,23 @@ sub format_include {  }  sub format_code { -    my ($self, $idre, $res, $frag) = @_; +    my ($self, $lang, $frag) = @_;      my $tree = $self->context->vtree();      my $path = $self->context->path(); +    my $idre = $lang->identifier_re(); +    my $res  = $lang->reserved();      $frag =~ s{(.*?)$idre|(.+)}{  	if ($2) {  	    unless (exists($$res{$2})) {  		my $pre = $1; -		my $sym = safe_html($2); +		my $sym = $2; +		my $ref = safe_html($lang->mangle_sym($sym)); +		$sym = safe_html($sym); +  		safe_html($pre). -		    qq{<a href="+code=$sym" class="sref">$sym</a>}; +		    qq{<a href="+code=$ref" class="sref">$sym</a>};  	    }  	    else {  		safe_html($1.$2); @@ -129,7 +134,7 @@ sub markupfile {      my ($self, $subst, $parse) = @_;      my ($btype, $frag) = $parse->nextfrag; -     +      return () unless defined $frag;      $btype ||= 'code'; diff --git a/lib/LXRng/Parse/Simple.pm b/lib/LXRng/Parse/Simple.pm index 215ce5b..d89ea22 100644 --- a/lib/LXRng/Parse/Simple.pm +++ b/lib/LXRng/Parse/Simple.pm @@ -38,6 +38,8 @@ sub new {  	'fileh'		=> $fileh,	# File handle  	'tabwidth'	=> $tabhint||8,	# Tab width  	'frags'		=> [],		# Fragments in queue +	'pref'		=> '', +	'rest'		=> '',  	'bodyid'	=> \@bodyid,	# Array of body type ids  	'bofseen'	=> 0,		# Beginning-of-file seen?  	'term'		=> \@term, @@ -63,81 +65,66 @@ sub untabify {  sub nextfrag {      my ($self) = @_; -    my $btype = undef; -    my $frag = undef; -    my $line = ''; - +    my $btype;      while (1) { -	# read one more line if we have processed  -	# all of the previously read line -	if (@{$$self{'frags'}} == 0) { -	    $line = $$self{'fileh'}->getline; -	     -	    if ($. <= 2 && -		$line =~ /^.*-[*]-.*?[ \t;]tab-width:[ \t]*([0-9]+).*-[*]-/) { -		# make sure there really is a non-zero tabwidth -		$$self{'tabwidth'} = $1 if $1 > 0; +	if (defined $btype) { +	    if ($$self{'rest'} =~ s/\A((?s:.*?)$$self{'term'}[$btype])//m) { +		my $ret = $$self{'pref'}.$1; +		$$self{'pref'} = ''; +		return ($$self{'bodyid'}[$btype], $ret);  	    } -	     -	    if(defined($line)) { -		untabify($line, $$self{'tabwidth'}); +	    else { +		$$self{'pref'} .= $$self{'rest'}; +		$$self{'rest'} = ''; +	    } +	} +	else { +	    if ($$self{'rest'} =~ s/\A((?s).*?)($$self{'open'})//m) { +		my $pref = $1; +		my $frag = $2; -		# split the line into fragments -		$$self{'frags'} = [split(/($$self{'split'})/, $line)]; +		if ($pref ne '') { +		    $$self{'rest'} = $frag.$$self{'rest'}; +		    return ('', $pref); +		} + +		$btype = 3; +		$btype++ while $btype < $#- and !defined($-[$btype]); +		$btype -= 3; + +		if (!defined($$self{'term'}[$btype])) { +		    # Opening regexp captures entire block. +		    return ($$self{'bodyid'}[$btype], $frag); +		} +		$$self{'pref'} = $frag;  	    }  	} -	last if @{$$self{'frags'}} == 0; +	my $line = $$self{'fileh'}->getline; +	unless (defined $line) { +	    my $ret = $$self{'pref'}.$$self{'rest'}; +	    $$self{'pref'} = ''; +	    $$self{'rest'} = ''; +	    undef($ret) unless length($ret) > 0; +	     +	    return (defined($btype) ? $$self{'bodyid'}[$btype] : '', $ret); +	} + +	if ($. <= 2 && +	    $line =~ /^.*-[*]-.*?[ \t;]tab-width:[ \t]*([0-9]+).*-[*]-/) { +	    # make sure there really is a non-zero tabwidth +	    $$self{'tabwidth'} = $1 if $1 > 0; +	} +	     +	untabify($line, $$self{'tabwidth'}); +	$$self{'rest'} .= $line;  	unless ($$self{'bofseen'}) {  	    # return start marker if file has contents  	    $$self{'bofseen'} = 1;  	    return ('start', '');  	} -	 -	# skip empty fragments -	if ($$self{'frags'}[0] eq '') { -	    shift(@{$$self{'frags'}}); -	} - -	# check if we are inside a fragment -	if (defined($frag)) { -	    if (defined($btype)) { -		my $next = shift(@{$$self{'frags'}}); -		 -		# Add to the fragment -		$frag .= $next; -		# We are done if this was the terminator -		last if $next =~ /^$$self{'term'}[$btype]$/; -		 -	    } -	    else { -		if ($$self{'frags'}[0] =~ /^$$self{'open'}$/) { -		    last; -		} -		$frag .= shift(@{$$self{'frags'}}); -	    } -	} -	else { -	    # Find the blocktype of the current block -	    $frag = shift(@{$$self{'frags'}}); -	    if (defined($frag) && (@_ = $frag =~ /^$$self{'open'}$/)) { -		# grep in a scalar context returns the number of times -		# EXPR evaluates to true, which is this case will be -		# the index of the first defined element in @_. - -		my $i = 1; -		$btype = grep { $i &&= !defined($_) } @_; -		if(!defined($$self{'term'}[$btype])) { -		    # Opening regexp captures entire block. -		    last; -		} -	    } -	}      } -    $btype = $$self{'bodyid'}[$btype] if defined($btype); -     -    return ($btype, $frag);  }  1; diff --git a/lib/LXRng/Search/Xapian.pm b/lib/LXRng/Search/Xapian.pm index 03db5b8..014d57a 100644 --- a/lib/LXRng/Search/Xapian.pm +++ b/lib/LXRng/Search/Xapian.pm @@ -23,6 +23,13 @@ use strict;  use Search::Xapian qw/:ops :db :qpstem/;  use Search::Xapian::QueryParser; +our @STOPWORDS = qw(our ours you your yours him his she her hers they +                    them their theirs what which who whom this that +                    these those are was were been being have has had +                    having does did doing would should could the and +                    but for with all any); +our %STOPWORD = map { $_ => 1 } @STOPWORDS; +  sub new {      my ($class, $db_root) = @_; @@ -31,7 +38,7 @@ sub new {      my $self = bless({'db_root' => $db_root,  		      'writes' => 0},  		     $class); - +          return $self;  } @@ -100,6 +107,30 @@ sub add_release {      return $changes;  } +sub make_add_text { +    my ($index, $doc) = @_; + +    return sub { +	my ($pos, $text) = @_; + +	foreach my $term ($text =~ /(_*\w[\w_]*)/g) { +	    $term = lc($term); +	    next if length($term) <= 2; +	    next if length($term) > 128; +	    next if $STOPWORD{$term}; + +	    $doc->add_posting($term, $pos++); +	    if ($term =~ /_/) { +		foreach my $subt ($term =~ /([^_]+)/g) { +		    next if length($subt) <= 2; +		    next if $STOPWORD{$subt}; +		    $doc->add_posting($subt, $pos++); +		} +	    } +	}; +    } +} +  sub flush {      my ($self) = @_; diff --git a/lxr-genxref b/lxr-genxref index 3173bdd..b390b35 100755 --- a/lxr-genxref +++ b/lxr-genxref @@ -37,6 +37,7 @@ use IO::Handle;  use Fcntl;  use Term::ProgressBar;  use Devel::Size qw(size total_size); +use Encode;  $SIG{'INT'}  = sub { die "\nSIGINT: $$: please wait, flushing caches...\n"; };  $SIG{'QUIT'} = sub { die "\nSIGQUIT: $$: please wait, flushing caches...\n"; }; @@ -51,6 +52,7 @@ my $tree = shift(@ARGV);  my @versions = @ARGV;  my $context = LXRng::Context->new('tree' => $tree); +die "Usage: $0 <tree-id>\n" unless $context and $context->tree;  LXRng::Lang->init($context);  my $index   = $context->config->{'index'}; @@ -87,7 +89,8 @@ sub make_add_ident($) {  	    $last_func = $symbol;  	}  	if ($$info{'kind'} eq 'l') { -	    $$info{'context'} = $identcache{$last_func}; +	    $$info{'context'} = $identcache{$last_func} if +		defined($last_func);  	}  	if (exists $$info{'class'}) {  	    $$info{'context'} = $identcache{$$info{'class'}}; @@ -122,11 +125,12 @@ sub index_file($$) {      return 0 unless $index->to_index($fileid);      return 1 unless $lang->doindex(); +    return 1 unless $lang->ctagslangname();      my $add_ident = make_add_ident($fileid);      warn("--- indexing    ".$file->name." [".$file->revision."]\n"); -    my @extra_flags = ('-IEXPORT_SYMBOL+', '-I__initcall+'); +    my $extra_flags = $context->config->{'ctags_flags'} || [];      my $ctags;      my $pid = open($ctags, '-|'); @@ -134,7 +138,7 @@ sub index_file($$) {      if ($pid == 0) {  	exec('ctags-exuberant', -	     @extra_flags, +	     @$extra_flags,  	     '--fields=+aifmknsSz', '--sort=no',  	     '--excmd=number', '-f', '-',  	     '--language-force='.$lang->ctagslangname, @@ -181,12 +185,7 @@ sub reference_file($$$) {      warn("--- referencing ".$file->name." [".$file->revision."]\n");      my $reserved = $lang->reserved(); - -    my $re = qr( -		(?m:^|[^a-zA-Z0-9_])		# Non-symbol chars. -		(_*[a-zA-Z][a-zA-Z0-9_]*)	# The symbol. -		\b -		)x; +    my $re = $lang->identifier_re();      my %refs;      my $line = 1; @@ -199,7 +198,7 @@ sub reference_file($$$) {  	    while ($frag =~ /\G.*?(?:(\n)|$re)/gc) {  		$line++ && next if defined $1; -		my $sym = $2; +		my $sym = $lang->mangle_sym($2);  		next if $$reserved{$sym};  		push(@{$refs{$sym} ||= []}, $line); @@ -236,20 +235,28 @@ sub hash_file($$$) {  	sysopen($handle, $file->phys_path, 0) || die($!);  	warn("--- hashing     ".$file->name." [".$file->revision."]\n");  	my $doc = $hash->new_document($file->name); +	my $charset = $context->config->{'content_charset'} || []; +	$charset = [ref($charset) eq 'ARRAY' ? @$charset : $charset]; +	push(@$charset, 'iso-8859-1'); # Fall back +	my $add_line = $hash->make_add_text($doc); +  	while (<$handle>) {  	    my $pos = 0; -	    # Latin-1 word characters. -	    foreach my $term (/([0-9a-zA-Z\300-\326\330-\366\370-\377]+)/g) { -		$term = lc($term); -		next if length($term) > 128; -		$doc->add_posting($term, $.*100 + $pos++); +	    my $text; +	    while (@$charset) { +		$text = eval { decode($$charset[0], $_, Encode::FB_CROAK); }; +		last unless $@; +		shift(@$charset);  	    } + +	    $add_line->($.*100, $text);  	}  	reference_file($file, $fileid, $doc);  	$docid = $hash->add_document($doc, [map {  	    $index->release_id($tree, $_) } @$rels]);  	$index->add_hashed_document($fileid, $docid); +	$index->set_rfile_charset($fileid, $$charset[0]);  	$handle->close();  	return 1;      } diff --git a/lxrng.conf-dist b/lxrng.conf-dist index 27abb1b..ab4229a 100644 --- a/lxrng.conf-dist +++ b/lxrng.conf-dist @@ -24,14 +24,16 @@ return {  	'index'       => $index,  	'search'      => $search, -	'base_url'    => 'http://lxr-test.linpro.no/lxr', +	'base_url'    => 'http://localhost/lxr',  	# Must be writable by httpd user:  	'cache'	      => '/var/lib/lxrng/cache',  	'fs_charset'  => 'iso-8859-1', -	'content_charset' => 'iso-8859-1', +	# Tried successively +	'content_charset' => ['utf-8', 'iso-8859-1'], -	'languages'   => ['C'], +	'languages'   => ['C', 'GnuAsm', 'Kconfig'], +	'ctags_flags' => ["-I\@$LXRng::ROOT/lxr-ctags-quirks"],  	'ver_list'    => [$gitrepo->allversions],  	'ver_default' => 'v2.6.20.3',  | 
