Added more language modules, threw things about a bit.

author: Arne Georg Gleditsch <argggh@taniquetil.(none)> 2009-03-06 21:13:18 +0100
committer: Arne Georg Gleditsch <argggh@taniquetil.(none)> 2009-03-06 21:13:18 +0100
commit: 8d7d1d2595c2c3475ec27d07d8544e345dab5851 (patch)
tree: 6337fd7aa489a73c44abb742b2bac22741b00779
parent: a25f8442450d7e9fb188a979ab0897cb86e9d84f (diff)
13 files changed, 433 insertions, 119 deletions
diff --git a/INSTALL b/INSTALL
index 7c18f11..f82d3e6 100644
--- a/INSTALL
+++ b/INSTALL
@@ -20,6 +20,10 @@ DEPENDENCIES
   - HTML::Entities [libhtml-parser-perl]
   - Template [libtemplate-perl]
 
+* Misc Perl modules
+  - Devel::Size [libdevel-size-perl]
+  - Term::ProgressBar [libterm-progressbar-perl]
+
 * (For gzip content transfer compression: PerlIO::gzip)
 
 * "Exuberant ctags", runnable as ctags-exuberant somewhere in the
diff --git a/lib/LXRng/Index/DBI.pm b/lib/LXRng/Index/DBI.pm
index 763f534..55e215d 100644
--- a/lib/LXRng/Index/DBI.pm
+++ b/lib/LXRng/Index/DBI.pm
@@ -520,6 +520,22 @@ sub get_identifier_info {
 	    \%reflines);
 }
 
+sub set_rfile_charset {
+    my ($self, $rfile_id, $charset) = @_;
+
+    my $dbh = $self->dbh;
+    my $pre = $self->prefix;
+    my $sth = $$self{'sth'}{'set_rfile_charset'} ||=
+	$dbh->prepare(qq{
+	    update ${pre}revisions
+		set body_charset = (select id from ${pre}charsets
+				    where name = ?) 
+		where id = ?});
+    
+    return $sth->execute($charset, $rfile_id);
+}
+
+
 sub get_rfile_timestamp {
     my ($self, $rfile_id) = @_;
 
diff --git a/lib/LXRng/Index/Pg.pm b/lib/LXRng/Index/Pg.pm
index 3654a72..7387abc 100644
--- a/lib/LXRng/Index/Pg.pm
+++ b/lib/LXRng/Index/Pg.pm
@@ -67,7 +67,7 @@ sub init_db {
 	or die($dbh->errstr);
     $dbh->do(qq{insert into ${pre}charsets(name) values ('utf-8')})
 	or die($dbh->errstr);
-    $dbh->do(qq{insert into ${pre}charsets(name) values ('iso8859-1')})
+    $dbh->do(qq{insert into ${pre}charsets(name) values ('iso-8859-1')})
 	or die($dbh->errstr);
 
     $dbh->do(qq{
diff --git a/lib/LXRng/Lang/C.pm b/lib/LXRng/Lang/C.pm
index 60a571b..db3a204 100644
--- a/lib/LXRng/Lang/C.pm
+++ b/lib/LXRng/Lang/C.pm
@@ -51,13 +51,13 @@ sub identifier_re {
     return $_identifier_re;
 }
 
-my $_reserved ||= { map { $_ => 1 }
-		    qw(asm auto break case char const continue default
-		       do double else enum extern float for fortran
-		       goto if int long register return short signed
-		       sizeof static struct switch typedef union
-		       unsigned void volatile while #define #else
-		       #endif #if #ifdef #ifndef #include #undef)};
+my $_reserved = { map { $_ => 1 }
+		  qw(asm auto break case char const continue default
+		     do double else enum extern float for fortran goto
+		     if int long register return short signed sizeof
+		     static struct switch typedef union unsigned void
+		     volatile while #define #else #endif #if #ifdef
+		     #ifndef #include #undef)};
 
 sub reserved {
     return $_reserved;
@@ -73,31 +73,10 @@ sub parsespec {
 	    'include',	'#\s*include\s+<',	'>'];
 }
 
-sub typemap {
-    return {
-	'c' => 'class',
-	'd' => 'macro (un)definition',
-	'e' => 'enumerator',
-	'f' => 'function definition',
-	'g' => 'enumeration name',
-	'm' => 'class, struct, or union member',
-	'n' => 'namespace',
-	'p' => 'function prototype or declaration',
-	's' => 'structure name',
-	't' => 'typedef',
-	'u' => 'union name',
-	'v' => 'variable definition',
-	'x' => 'extern or forward variable declaration',
-	'i' => 'interface'};
-}
-
 sub markuphandlers {
     my ($self, $context, $node, $markup) = @_;
 
     my $index = $context->config->{'index'};
-    my $idre = $self->identifier_re();
-    my $res  = $self->reserved();
-
     my %subst;
 
     my $format_newline = $markup->make_format_newline($node);
@@ -121,7 +100,7 @@ sub markuphandlers {
 	
     $subst{'code'} = new Subst::Complex
 	qr/\n/	   => $format_newline,
-	qr/[^\n]*/ => sub { $markup->format_code($idre, $res, @_) };
+	qr/[^\n]*/ => sub { $markup->format_code($self, @_) };
 
     $subst{'start'} = new Subst::Complex
 	qr/^/	   => $format_newline;
diff --git a/lib/LXRng/Lang/Generic.pm b/lib/LXRng/Lang/Generic.pm
index 82e9b9d..3e1f545 100644
--- a/lib/LXRng/Lang/Generic.pm
+++ b/lib/LXRng/Lang/Generic.pm
@@ -40,4 +40,9 @@ sub expand_include {
     return ();
 }
 
+sub mangle_sym {
+    my ($self, $sym) = @_;
+    return $sym;
+}
+
 1;
diff --git a/lib/LXRng/Lang/GnuAsm.pm b/lib/LXRng/Lang/GnuAsm.pm
new file mode 100644
index 0000000..acdcdef
--- /dev/null
+++ b/lib/LXRng/Lang/GnuAsm.pm
@@ -0,0 +1,164 @@
+# Copyright (C) 2008 Arne Georg Gleditsch <lxr@linux.no>.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# The full GNU General Public License is included in this distribution
+# in the file called COPYING.
+
+package LXRng::Lang::GnuAsm;
+
+use strict;
+use Subst::Complex;
+
+use base qw(LXRng::Lang::Generic);
+
+
+sub doindex {
+    return 1;
+}
+
+sub ctagslangname {
+    return 'asm';
+}
+
+sub ctagsopts {
+    return ();
+}
+
+sub pathexp {
+    return qr/\.[sS]$/;
+}
+
+my $_identifier_re = qr(
+			(?m:^|(?<=[^a-zA-Z0-9_\#]))	# Non-symbol chars.
+			(_*[a-zA-Z][a-zA-Z0-9_]*)	# The symbol.
+			\b
+			)x;
+
+sub identifier_re {
+    return $_identifier_re;
+}
+
+my $_reserved ||= { map { $_ => 1 }
+		    (qw(aaa aad aam aas adc bound bsf bsr bswap btc
+		       btr call cbw cwde cdqe cwd cdq cqo clc cld
+		       clflush cmc cmps cmpsb cmpsw cmpsd cmpsq
+		       cmpxchg cmpxchg8b cmpxchg16b cpuid daa das
+		       enter ins insb insw insd int into jcxz jecxz
+		       jrcxz jmp lahf lds les lfs lgs lss leave lfence
+		       lock lods lodsb lodsw lodsd lodsq loop loope
+		       loopne loopnz loopz mfence movd movmskpd
+		       movmskps movnti movs movsb movsw movsd movsq
+		       movsx movsxd movzx nop outs outsb outsw outsd
+		       pause popa popad prefetch prefetchw pusha
+		       pushad pushfd pushfq ret sahf sbb scas scasb
+		       scasw scasd scasq sfence stc std stos stosb
+		       stosw stosd stosq xadd xchg xlat xlatb arpl
+		       clgi cli clts hlt int invd invlpg invlpga iret
+		       iretd iretq lar lgdt lidt lldt lmsw lretq lsl
+		       ltr rep rdmsr rdpmc rdtsc rdtscp rsm sgdt sidt
+		       skinit sldt smsw sti stgi str swapgs syscall
+		       sysenter sysexit sysret ud2 verr verw vmload
+		       vmmcall vmrun vmsave wbinvd wrmsr),
+
+		     (map { $_, $_.'b', $_.'w', $_.'l', $_.'q' }
+		      qw(add and mov bt bts cmp dec div idiv imul inc
+			 in lea mul neg not or out pop popf push pushf
+			 rcl rcr rol ror sal shl sar shl shr sub test
+			 xor)),
+
+		     (map { 'cmov'.$_, 'j'.$_, 'set'.$_ }
+		      qw(o no b c nae nb nc ae z e nz ne be na nbe a s
+			 ns p pe np po l nge nl ge le ng nle g))
+		     )};
+		     
+
+sub reserved {
+    return $_reserved;
+}
+
+sub parsespec {
+    return ['atom',	'\\\\.',	undef,
+	    'atom',	'%[a-z][a-z0-9]+', undef, # Registers
+	    'atom',	'[.][a-z0-9]+', undef, # Directives
+	    'comment',	'/\*',		'\*/',
+	    'comment',	'//',		"\$",
+	    'string',	'"',		'"',
+	    'string',	"'",		"'",
+	    'atom',	'#\s*(?:ifn?def|define|else|endif|undef)', undef,
+	    'include',	'#\s*include\s+"',	'"',
+	    'include',	'#\s*include\s+<',	'>',
+	    'comment',	'#',		"\$"];
+}
+
+sub markuphandlers {
+    my ($self, $context, $node, $markup) = @_;
+
+    my $index = $context->config->{'index'};
+    my %subst;
+
+    my $format_newline = $markup->make_format_newline($node);
+    $subst{'comment'} = new Subst::Complex
+	qr/\n/     => $format_newline,
+	qr/[^\n]+/ => sub { $markup->format_comment(@_) };
+	
+    $subst{'string'} = new Subst::Complex
+	qr/\n/        => $format_newline,
+	qr/[^\n\"\']+/ => sub { $markup->format_string(@_) };
+
+    $subst{'include'} = new Subst::Complex
+	qr/\n/ => $format_newline,
+	qr/(include\s*\")(.*?)(\")/ => sub {
+	    $markup->format_include([$self->resolve_include($context, $node, @_)],
+				    @_) },
+				  
+	qr/(include\s*\<)(.*?)(\>)/ => sub {
+	    $markup->format_include([$self->resolve_include($context, $node, @_)],
+				    @_) };
+	
+    $subst{'code'} = new Subst::Complex
+	qr/\n/	   => $format_newline,
+	qr/[^\n]*/ => sub { $markup->format_code($self, @_) };
+
+    $subst{'start'} = new Subst::Complex
+	qr/^/	   => $format_newline;
+    
+    return \%subst;
+}
+
+sub resolve_include {
+    my ($self, $context, $node, $frag) = @_;
+
+    if ($frag =~ /include\s+<(.*?)>/) {
+	return $self->expand_include($context, $node, $1);
+    }
+    elsif ($frag =~ /include\s+\"(.*?)\"/) {
+	my $incl = $1;
+	my $bare = $1;
+	my $name = $node->name();
+	if ($name =~ /(.*\/)/) {
+	    $incl = $1.$incl;
+	    1 while $incl =~ s,/[^/]+/../,/,;
+	    
+	    my $file = $context->config->{'repository'}->node($incl, $context->release);
+	    return $incl if $file;
+	    return $self->expand_include($context, $node, $bare);
+	}
+    }
+
+    return ();
+}
+
+1;
diff --git a/lib/LXRng/Lang/Kconfig.pm b/lib/LXRng/Lang/Kconfig.pm
new file mode 100644
index 0000000..7417236
--- /dev/null
+++ b/lib/LXRng/Lang/Kconfig.pm
@@ -0,0 +1,118 @@
+# Copyright (C) 2008 Arne Georg Gleditsch <lxr@linux.no>.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# The full GNU General Public License is included in this distribution
+# in the file called COPYING.
+
+package LXRng::Lang::Kconfig;
+
+use strict;
+use Subst::Complex;
+
+use base qw(LXRng::Lang::Generic);
+
+
+sub doindex {
+    return 1;
+}
+
+sub ctagslangname {
+    return undef;
+}
+
+sub pathexp {
+    return qr/Kconfig$/;
+}
+
+my $_identifier_re = qr(
+			(?m:^|(?<=[^A-Z0-9_\#]))	# Non-symbol chars.
+			(_*[A-Z][A-Z0-9_]*)		# The symbol.
+			\b
+			)x;
+
+sub identifier_re {
+    return $_identifier_re;
+}
+
+my $_reserved = { map { $_ => 1 }
+		  qw(menu source endmenu config bool if default help
+		     tristate depends on y n m)};
+
+sub reserved {
+    return $_reserved;
+}
+
+sub parsespec {
+    return ['atom',	'\\\\.',	undef,
+	    'comment',	'#',		"\$",
+	    'string',	'"',		'"',
+	    'string',	"'",		"'",
+	    'help',     'help', 	"^(?=[^ \t\n])",
+	    'include',	'^source\s+"',	'"'];
+}
+
+sub mangle_sym {
+    return $_[1] =~ /^[A-Z0-9_]+$/ ? 'CONFIG_'.$_[1] : $_[1];
+}
+
+sub markuphandlers {
+    my ($self, $context, $node, $markup) = @_;
+
+    my $index = $context->config->{'index'};
+    my %subst;
+
+    my $format_newline = $markup->make_format_newline($node);
+    $subst{'comment'} = new Subst::Complex
+	qr/\n/     => $format_newline,
+	qr/[^\n]+/ => sub { $markup->format_comment(@_) };
+	
+    $subst{'help'} = new Subst::Complex
+	qr/\n/        => $format_newline,
+	qr/^[ \t]*help[ \t]*/ => sub { $markup->format_code($self, @_) },
+	qr/[^\n\"\']+/ => sub { $markup->format_string(@_) };
+
+    $subst{'string'} = new Subst::Complex
+	qr/\n/        => $format_newline,
+	qr/[^\n\"\']+/ => sub { $markup->format_string(@_) };
+
+    $subst{'include'} = new Subst::Complex
+	qr/\n/ => $format_newline,
+	qr/(include\s*\")(.*?)(\")/ => sub {
+	    $markup->format_include([$self->resolve_include($context, $node, @_)],
+				    @_) },
+				  
+	qr/(include\s*\<)(.*?)(\>)/ => sub {
+	    $markup->format_include([$self->resolve_include($context, $node, @_)],
+				    @_) };
+	
+    $subst{'code'} = new Subst::Complex
+	qr/\n/	   => $format_newline,
+	qr/[^\n]*/ => sub { $markup->format_code($self, @_) };
+
+    $subst{'start'} = new Subst::Complex
+	qr/^/	   => $format_newline;
+    
+    return \%subst;
+}
+
+sub resolve_include {
+    my ($self, $context, $node, $frag) = @_;
+
+    return ();
+}
+
+1;
+
diff --git a/lib/LXRng/Lang/Undefined.pm b/lib/LXRng/Lang/Undefined.pm
index c7d5d09..0989bdb 100644
--- a/lib/LXRng/Lang/Undefined.pm
+++ b/lib/LXRng/Lang/Undefined.pm
@@ -41,10 +41,6 @@ sub parsespec {
     return ['atom',	'\\\\.',	undef];
 }
 
-sub typemap {
-    return {};
-}
-
 sub markuphandlers {
     my ($self, $context, $node, $markup) = @_;
     
diff --git a/lib/LXRng/Markup/File.pm b/lib/LXRng/Markup/File.pm
index c3d576c..054463a 100644
--- a/lib/LXRng/Markup/File.pm
+++ b/lib/LXRng/Markup/File.pm
@@ -92,18 +92,23 @@ sub format_include {
 }
 
 sub format_code {
-    my ($self, $idre, $res, $frag) = @_;
+    my ($self, $lang, $frag) = @_;
 
     my $tree = $self->context->vtree();
     my $path = $self->context->path();
+    my $idre = $lang->identifier_re();
+    my $res  = $lang->reserved();
 
     $frag =~ s{(.*?)$idre|(.+)}{
 	if ($2) {
 	    unless (exists($$res{$2})) {
 		my $pre = $1;
-		my $sym = safe_html($2);
+		my $sym = $2;
+		my $ref = safe_html($lang->mangle_sym($sym));
+		$sym = safe_html($sym);
+
 		safe_html($pre).
-		    qq{<a href="+code=$sym" class="sref">$sym</a>};
+		    qq{<a href="+code=$ref" class="sref">$sym</a>};
 	    }
 	    else {
 		safe_html($1.$2);
@@ -129,7 +134,7 @@ sub markupfile {
     my ($self, $subst, $parse) = @_;
 
     my ($btype, $frag) = $parse->nextfrag;
-    
+
     return () unless defined $frag;
 
     $btype ||= 'code';
diff --git a/lib/LXRng/Parse/Simple.pm b/lib/LXRng/Parse/Simple.pm
index 215ce5b..d89ea22 100644
--- a/lib/LXRng/Parse/Simple.pm
+++ b/lib/LXRng/Parse/Simple.pm
@@ -38,6 +38,8 @@ sub new {
 	'fileh'		=> $fileh,	# File handle
 	'tabwidth'	=> $tabhint||8,	# Tab width
 	'frags'		=> [],		# Fragments in queue
+	'pref'		=> '',
+	'rest'		=> '',
 	'bodyid'	=> \@bodyid,	# Array of body type ids
 	'bofseen'	=> 0,		# Beginning-of-file seen?
 	'term'		=> \@term,
@@ -63,81 +65,66 @@ sub untabify {
 sub nextfrag {
     my ($self) = @_;
 
-    my $btype = undef;
-    my $frag = undef;
-    my $line = '';
-
+    my $btype;
     while (1) {
-	# read one more line if we have processed 
-	# all of the previously read line
-	if (@{$$self{'frags'}} == 0) {
-	    $line = $$self{'fileh'}->getline;
-	    
-	    if ($. <= 2 &&
-		$line =~ /^.*-[*]-.*?[ \t;]tab-width:[ \t]*([0-9]+).*-[*]-/) {
-		# make sure there really is a non-zero tabwidth
-		$$self{'tabwidth'} = $1 if $1 > 0;
+	if (defined $btype) {
+	    if ($$self{'rest'} =~ s/\A((?s:.*?)$$self{'term'}[$btype])//m) {
+		my $ret = $$self{'pref'}.$1;
+		$$self{'pref'} = '';
+		return ($$self{'bodyid'}[$btype], $ret);
 	    }
-	    
-	    if(defined($line)) {
-		untabify($line, $$self{'tabwidth'});
+	    else {
+		$$self{'pref'} .= $$self{'rest'};
+		$$self{'rest'} = '';
+	    }
+	}
+	else {
+	    if ($$self{'rest'} =~ s/\A((?s).*?)($$self{'open'})//m) {
+		my $pref = $1;
+		my $frag = $2;
 
-		# split the line into fragments
-		$$self{'frags'} = [split(/($$self{'split'})/, $line)];
+		if ($pref ne '') {
+		    $$self{'rest'} = $frag.$$self{'rest'};
+		    return ('', $pref);
+		}
+
+		$btype = 3;
+		$btype++ while $btype < $#- and !defined($-[$btype]);
+		$btype -= 3;
+
+		if (!defined($$self{'term'}[$btype])) {
+		    # Opening regexp captures entire block.
+		    return ($$self{'bodyid'}[$btype], $frag);
+		}
+		$$self{'pref'} = $frag;
 	    }
 	}
 
-	last if @{$$self{'frags'}} == 0;
+	my $line = $$self{'fileh'}->getline;
+	unless (defined $line) {
+	    my $ret = $$self{'pref'}.$$self{'rest'};
+	    $$self{'pref'} = '';
+	    $$self{'rest'} = '';
+	    undef($ret) unless length($ret) > 0;
+	    
+	    return (defined($btype) ? $$self{'bodyid'}[$btype] : '', $ret);
+	}
+
+	if ($. <= 2 &&
+	    $line =~ /^.*-[*]-.*?[ \t;]tab-width:[ \t]*([0-9]+).*-[*]-/) {
+	    # make sure there really is a non-zero tabwidth
+	    $$self{'tabwidth'} = $1 if $1 > 0;
+	}
+	    
+	untabify($line, $$self{'tabwidth'});
+	$$self{'rest'} .= $line;
 
 	unless ($$self{'bofseen'}) {
 	    # return start marker if file has contents
 	    $$self{'bofseen'} = 1;
 	    return ('start', '');
 	}
-	
-	# skip empty fragments
-	if ($$self{'frags'}[0] eq '') {
-	    shift(@{$$self{'frags'}});
-	}
-
-	# check if we are inside a fragment
-	if (defined($frag)) {
-	    if (defined($btype)) {
-		my $next = shift(@{$$self{'frags'}});
-		
-		# Add to the fragment
-		$frag .= $next;
-		# We are done if this was the terminator
-		last if $next =~ /^$$self{'term'}[$btype]$/;
-		
-	    }
-	    else {
-		if ($$self{'frags'}[0] =~ /^$$self{'open'}$/) {
-		    last;
-		}
-		$frag .= shift(@{$$self{'frags'}});
-	    }
-	}
-	else {
-	    # Find the blocktype of the current block
-	    $frag = shift(@{$$self{'frags'}});
-	    if (defined($frag) && (@_ = $frag =~ /^$$self{'open'}$/)) {
-		# grep in a scalar context returns the number of times
-		# EXPR evaluates to true, which is this case will be
-		# the index of the first defined element in @_.
-
-		my $i = 1;
-		$btype = grep { $i &&= !defined($_) } @_;
-		if(!defined($$self{'term'}[$btype])) {
-		    # Opening regexp captures entire block.
-		    last;
-		}
-	    }
-	}
     }
-    $btype = $$self{'bodyid'}[$btype] if defined($btype);
-    
-    return ($btype, $frag);
 }
 
 1;
diff --git a/lib/LXRng/Search/Xapian.pm b/lib/LXRng/Search/Xapian.pm
index 03db5b8..014d57a 100644
--- a/lib/LXRng/Search/Xapian.pm
+++ b/lib/LXRng/Search/Xapian.pm
@@ -23,6 +23,13 @@ use strict;
 use Search::Xapian qw/:ops :db :qpstem/;
 use Search::Xapian::QueryParser;
 
+our @STOPWORDS = qw(our ours you your yours him his she her hers they
+                    them their theirs what which who whom this that
+                    these those are was were been being have has had
+                    having does did doing would should could the and
+                    but for with all any);
+our %STOPWORD = map { $_ => 1 } @STOPWORDS;
+
 
 sub new {
     my ($class, $db_root) = @_;
@@ -31,7 +38,7 @@ sub new {
     my $self = bless({'db_root' => $db_root,
 		      'writes' => 0},
 		     $class);
-
+    
     return $self;
 }
 
@@ -100,6 +107,30 @@ sub add_release {
     return $changes;
 }
 
+sub make_add_text {
+    my ($index, $doc) = @_;
+
+    return sub {
+	my ($pos, $text) = @_;
+
+	foreach my $term ($text =~ /(_*\w[\w_]*)/g) {
+	    $term = lc($term);
+	    next if length($term) <= 2;
+	    next if length($term) > 128;
+	    next if $STOPWORD{$term};
+
+	    $doc->add_posting($term, $pos++);
+	    if ($term =~ /_/) {
+		foreach my $subt ($term =~ /([^_]+)/g) {
+		    next if length($subt) <= 2;
+		    next if $STOPWORD{$subt};
+		    $doc->add_posting($subt, $pos++);
+		}
+	    }
+	};
+    }
+}
+
 sub flush {
     my ($self) = @_;
 
diff --git a/lxr-genxref b/lxr-genxref
index 3173bdd..b390b35 100755
--- a/lxr-genxref
+++ b/lxr-genxref
@@ -37,6 +37,7 @@ use IO::Handle;
 use Fcntl;
 use Term::ProgressBar;
 use Devel::Size qw(size total_size);
+use Encode;
 
 $SIG{'INT'}  = sub { die "\nSIGINT: $$: please wait, flushing caches...\n"; };
 $SIG{'QUIT'} = sub { die "\nSIGQUIT: $$: please wait, flushing caches...\n"; };
@@ -51,6 +52,7 @@ my $tree = shift(@ARGV);
 my @versions = @ARGV;
 
 my $context = LXRng::Context->new('tree' => $tree);
+die "Usage: $0 <tree-id>\n" unless $context and $context->tree;
 LXRng::Lang->init($context);
 
 my $index   = $context->config->{'index'};
@@ -87,7 +89,8 @@ sub make_add_ident($) {
 	    $last_func = $symbol;
 	}
 	if ($$info{'kind'} eq 'l') {
-	    $$info{'context'} = $identcache{$last_func};
+	    $$info{'context'} = $identcache{$last_func} if
+		defined($last_func);
 	}
 	if (exists $$info{'class'}) {
 	    $$info{'context'} = $identcache{$$info{'class'}};
@@ -122,11 +125,12 @@ sub index_file($$) {
 
     return 0 unless $index->to_index($fileid);
     return 1 unless $lang->doindex();
+    return 1 unless $lang->ctagslangname();
 
     my $add_ident = make_add_ident($fileid);
    
     warn("--- indexing    ".$file->name." [".$file->revision."]\n");
-    my @extra_flags = ('-IEXPORT_SYMBOL+', '-I__initcall+');
+    my $extra_flags = $context->config->{'ctags_flags'} || [];
 
     my $ctags;
     my $pid = open($ctags, '-|');
@@ -134,7 +138,7 @@ sub index_file($$) {
     
     if ($pid == 0) {
 	exec('ctags-exuberant',
-	     @extra_flags,
+	     @$extra_flags,
 	     '--fields=+aifmknsSz', '--sort=no',
 	     '--excmd=number', '-f', '-',
 	     '--language-force='.$lang->ctagslangname,
@@ -181,12 +185,7 @@ sub reference_file($$$) {
     warn("--- referencing ".$file->name." [".$file->revision."]\n");
 
     my $reserved = $lang->reserved();
-
-    my $re = qr(
-		(?m:^|[^a-zA-Z0-9_])		# Non-symbol chars.
-		(_*[a-zA-Z][a-zA-Z0-9_]*)	# The symbol.
-		\b
-		)x;
+    my $re = $lang->identifier_re();
 
     my %refs;
     my $line = 1;
@@ -199,7 +198,7 @@ sub reference_file($$$) {
 	    
 	    while ($frag =~ /\G.*?(?:(\n)|$re)/gc) {
 		$line++ && next if defined $1;
-		my $sym = $2;
+		my $sym = $lang->mangle_sym($2);
 
 		next if $$reserved{$sym};
 		push(@{$refs{$sym} ||= []}, $line);
@@ -236,20 +235,28 @@ sub hash_file($$$) {
 	sysopen($handle, $file->phys_path, 0) || die($!);
 	warn("--- hashing     ".$file->name." [".$file->revision."]\n");
 	my $doc = $hash->new_document($file->name);
+	my $charset = $context->config->{'content_charset'} || [];
+	$charset = [ref($charset) eq 'ARRAY' ? @$charset : $charset];
+	push(@$charset, 'iso-8859-1'); # Fall back
+	my $add_line = $hash->make_add_text($doc);
+
 	while (<$handle>) {
 	    my $pos = 0;
-	    # Latin-1 word characters.
-	    foreach my $term (/([0-9a-zA-Z\300-\326\330-\366\370-\377]+)/g) {
-		$term = lc($term);
-		next if length($term) > 128;
-		$doc->add_posting($term, $.*100 + $pos++);
+	    my $text;
+	    while (@$charset) {
+		$text = eval { decode($$charset[0], $_, Encode::FB_CROAK); };
+		last unless $@;
+		shift(@$charset);
 	    }
+
+	    $add_line->($.*100, $text);
 	}
 
 	reference_file($file, $fileid, $doc);
 	$docid = $hash->add_document($doc, [map {
 	    $index->release_id($tree, $_) } @$rels]);
 	$index->add_hashed_document($fileid, $docid);
+	$index->set_rfile_charset($fileid, $$charset[0]);
 	$handle->close();
 	return 1;
     }
diff --git a/lxrng.conf-dist b/lxrng.conf-dist
index 27abb1b..ab4229a 100644
--- a/lxrng.conf-dist
+++ b/lxrng.conf-dist
@@ -24,14 +24,16 @@ return {
 	'index'       => $index,
 	'search'      => $search,
 
-	'base_url'    => 'http://lxr-test.linpro.no/lxr',
+	'base_url'    => 'http://localhost/lxr',
 	# Must be writable by httpd user:
 	'cache'	      => '/var/lib/lxrng/cache',
 
 	'fs_charset'  => 'iso-8859-1',
-	'content_charset' => 'iso-8859-1',
+	# Tried successively
+	'content_charset' => ['utf-8', 'iso-8859-1'],
 
-	'languages'   => ['C'],
+	'languages'   => ['C', 'GnuAsm', 'Kconfig'],
+	'ctags_flags' => ["-I\@$LXRng::ROOT/lxr-ctags-quirks"],
 	'ver_list'    => [$gitrepo->allversions],
 
 	'ver_default' => 'v2.6.20.3',
author	Arne Georg Gleditsch <argggh@taniquetil.(none)>	2009-03-06 21:13:18 +0100
committer	Arne Georg Gleditsch <argggh@taniquetil.(none)>	2009-03-06 21:13:18 +0100
commit	8d7d1d2595c2c3475ec27d07d8544e345dab5851 (patch)
tree	6337fd7aa489a73c44abb742b2bac22741b00779
parent	a25f8442450d7e9fb188a979ab0897cb86e9d84f (diff)