1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
|
require 'stringio'
# This file is fairly rough; it exists only to ensure there is always something readable
# when there is an RTF-only body with no HTML encapsulation.
module RTF
  # Splits a raw RTF stream into a flat sequence of tokens.
  class Tokenizer
    # Reads +io+ until EOF and yields one token at a time:
    #
    #   :open_group                   for "{"
    #   :close_group                  for "}"
    #   :text, string                 for literal characters, the escapes \{ \} \\ and
    #                                 \'hh hex escapes (yielded as the raw byte)
    #   :control_word, name, param    for \name or \nameN; param is an Integer, or nil
    #                                 when the word carries no numeric parameter
    #   :control_symbol, char         for a backslash followed by any other character
    #
    # Unescaped CR and LF characters are skipped. +io+ must respond to getc, read and
    # seek (the converter below passes a StringIO).
    #
    # Returns an Enumerator over the same tokens when no block is given.
    # Raises RuntimeError ("invalid rtf stream") when the stream ends in the middle of
    # an escape, or when a control word has a "-" that is not followed by digits.
    def self.process(io)
      return enum_for(:process, io) unless block_given?

      while (c = io.getc)
        case c
        when ?{
          yield :open_group
        when ?}
          yield :close_group
        when ?\\
          case c = io.getc
          when ?{, ?}, ?\\
            yield :text, c.chr
          when ?'
            hex = io.read(2)
            # a \' cut off by EOF is a truncated stream, not a type error in pack
            raise "invalid rtf stream" unless hex
            yield :text, [hex].pack('H*')
          when ?a..?z, ?A..?Z
            name, param = read_control_word(io, c)
            yield :control_word, name, param
          when nil
            raise "invalid rtf stream" # backslash immediately followed by EOF
          else
            # other kind of control symbol
            yield :control_symbol, c.chr
          end
        when ?\r, ?\n
          # ignore
        else
          yield :text, c.chr
        end
      end
    end

    # Reads the remainder of a control word whose first letter (+first+) has already
    # been consumed, and returns [name, param] where param is an Integer or nil.
    def self.read_control_word(io, first)
      name = first.chr
      while (c = io.read(1)) && c =~ /[a-zA-Z]/
        name << c
      end

      sign = 1
      if c == '-'
        sign = -1
        c = io.read(1)
      end

      param = nil
      if c =~ /[0-9]/
        digits = c.dup
        while (c = io.read(1)) && c =~ /[0-9]/
          digits << c
        end
        param = digits.to_i * sign
      end

      # e.g. "\blahblah- some text": a minus sign with no digits after it
      raise "invalid rtf stream" if sign == -1 && !param

      # A single space after a control word is its delimiter and is swallowed. Any other
      # character belongs to the next token, so step back over it. At EOF (c is nil)
      # nothing was consumed, and stepping back would replay the last character as text.
      io.seek(-1, IO::SEEK_CUR) if c && c != ' '
      [name, param]
    end
    private_class_method :read_control_word
  end

  # Best-effort conversion of RTF markup to plain text or very minimal HTML.
  class Converter
    # Control words that stand for a single character, approximated in ASCII.
    # maybe change this to use utf8 where possible
    CONTROL_WORD_TEXT = {
      'par' => "\n", 'line' => "\n", 'page' => "\n",
      'tab' => "\t", 'cell' => "\t",
      'endash' => '-', 'emdash' => '-',
      'emspace' => ' ', 'enspace' => ' ', 'qmspace' => ' ',
      'ldblquote' => '"', 'rdblquote' => '"',
      'lquote' => "'", 'rquote' => "'",
      'bullet' => '*'
    }.freeze

    # Control symbols that produce text.
    CONTROL_SYMBOL_TEXT = {
      '~' => ' ',   # non-breaking space
      '_' => '-',   # non-breaking hyphen
      "\n" => "\n", # backslash + LF is a paragraph break
      "\r" => "\n"  # backslash + CR likewise (a following bare LF is skipped by the tokenizer)
    }.freeze

    # Control words rendered as HTML tags of the same name when format is :html.
    INLINE_TAGS = %w[b i].freeze

    HTML_ENTITIES = {
      '<' => '&lt;', '>' => '&gt;', '&' => '&amp;', '"' => '&quot;', "'" => '&apos;'
    }.freeze

    # Converts the RTF document in +str+ to readable text.
    #
    # str    - String holding the RTF source.
    # format - :text (default) for plain text. :html wraps the output in <html><body>,
    #          escapes < > & " ', appends <br> after each line break and renders \b / \i
    #          as <b> / <i>. A parameter of 0 (\b0) closes the tag; tags still open when
    #          their group ends, or when the input ends, are closed automatically.
    #
    # Literal text is kept only at group depth 0 or 1, or inside a group whose first
    # control is \rtlch provided no enclosing group started with the \* symbol. All
    # other nested text (font tables and the like) is dropped.
    #
    # Returns the converted String. Raises RuntimeError for streams the tokenizer
    # rejects as invalid.
    def self.rtf2text(str, format = :text)
      html = format == :html
      depth = 0
      text = ''.dup
      text << "<html>\n<body>" if html
      group_type = []   # first control word/symbol seen in each open group, by depth
      group_tags = [[]] # HTML tags currently open in each group, by depth

      RTF::Tokenizer.process(StringIO.new(str)) do |kind, value, param|
        add_text = ''
        case kind
        when :open_group
          depth += 1
          group_type[depth] = nil
          group_tags[depth] = []
        when :close_group
          # a stray "}" without a matching "{" is skipped instead of crashing
          if depth > 0
            group_tags[depth].reverse_each { |tag| text << "</#{tag}>" }
            depth -= 1
          end
        when :control_word
          group_type[depth] ||= value
          add_text = CONTROL_WORD_TEXT[value] || ''
          if html && INLINE_TAGS.include?(value)
            if param == 0
              text << "</#{value}>"
              group_tags[depth].delete value
            else
              text << "<#{value}>"
              group_tags[depth] << value
            end
          end
        when :control_symbol
          group_type[depth] ||= value
          add_text = CONTROL_SYMBOL_TEXT[value] || ''
        when :text
          if depth <= 1 || (group_type[depth] == 'rtlch' && !group_type[0...depth].include?('*'))
            add_text = value
          end
        end

        if html
          text << add_text.gsub(/[<>&"']/) { |ch| HTML_ENTITIES[ch] }
          text << '<br>' if add_text == "\n"
        else
          text << add_text
        end
      end

      # close anything left open by top-level formatting or by unterminated groups
      depth.downto(0) do |d|
        group_tags[d].reverse_each { |tag| text << "</#{tag}>" }
      end

      text << "</body>\n</html>\n" if html
      text
    end
  end
end
|