1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
# Masks email addresses, mobile numbers, login links and censor-rule matches
# in text, PDF and arbitrary binary data. All masking happens in place on the
# String that is passed in.
module AlaveteliTextMasker
  extend self

  # Content types that apply_masks! leaves untouched - things like zip files
  # and images may get broken if we try to mask them.
  DoNotBinaryMask = [ 'image/tiff',
                      'image/gif',
                      'image/jpeg',
                      'image/png',
                      'image/bmp',
                      'application/zip' ].freeze

  # Replaces all email addresses in (possibly binary) data.
  # Also applies custom masks and censor items.
  #
  # text         - String to mask; modified in place.
  # content_type - MIME type String used to pick the masking strategy.
  # options      - Hash, passed through to the chosen strategy:
  #                :censor_rules - objects responding to apply_to_text!
  #                                (HTML) or apply_to_binary! (everything else)
  #                :masks        - extra { :to_replace, :replacement } Hashes,
  #                                only used for 'text/html'
  #
  # Returns nil for the types in DoNotBinaryMask, otherwise whatever the
  # chosen strategy returns.
  def apply_masks!(text, content_type, options = {})
    # We err on the side of masking too much, as many unknown types will
    # really be text.
    case content_type
    when *DoNotBinaryMask
      # Deliberately left alone
      nil
    when 'text/html'
      apply_text_masks!(text, options)
    when 'application/pdf'
      apply_pdf_masks!(text, options)
    else
      apply_binary_masks!(text, options)
    end
  end

  # Masks the contents of a PDF: uncompresses it with pdftk, applies the
  # binary masks, and - only if that changed anything - replaces text with a
  # recompressed copy.
  #
  # Returns text when it was replaced, nil otherwise.
  def apply_pdf_masks!(text, options = {})
    uncompressed_text = AlaveteliExternalCommand.run("pdftk", "-", "output", "-", "uncompress",
                                                     :stdin_string => text)
    # Nothing to do unless we managed to uncompress the PDF
    return if uncompressed_text.blank?

    # Censor a copy, so the result can be compared with the uncompressed input
    censored_uncompressed_text = uncompressed_text.dup
    apply_binary_masks!(censored_uncompressed_text, options)
    # Leave the original bytes alone unless the masks removed something
    return if censored_uncompressed_text == uncompressed_text

    # Strict comparison kept: only a literal true selects ghostscript
    if AlaveteliConfiguration::use_ghostscript_compression == true
      command = ["gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dQUIET", "-dBATCH", "-sOutputFile=-", "-"]
    else
      command = ["pdftk", "-", "output", "-", "compress"]
    end
    recompressed_text = AlaveteliExternalCommand.run(*(command + [{ :stdin_string => censored_uncompressed_text }]))

    if recompressed_text.blank?
      # buggy versions of pdftk sometimes fail on
      # compression, I don't see it's a disaster in
      # these cases to save an uncompressed version?
      recompressed_text = censored_uncompressed_text
      # NOTE(review): `logger` is not defined in this module; presumably it is
      # supplied by whatever includes the module - confirm.
      logger.warn "Unable to compress PDF; problem with your pdftk version?"
    end

    text.replace recompressed_text unless recompressed_text.blank?
  end

  private

  # Replaces email addresses and censor-rule matches in place, without
  # changing the length of the data.
  #
  # Handles both plain ASCII addresses and UCS-2 encoded ones (as found in
  # Microsoft Office documents). Every character of an address other than
  # "@" and "." becomes "x".
  #
  # Returns text. Raises RuntimeError if the character count of text changed.
  def apply_binary_masks!(text, options = {})
    # Keep original size, so can check haven't resized it
    orig_size = text.mb_chars.size

    # Replace ASCII email addresses...
    text.gsub!(MySociety::Validate.email_find_regexp) do |email|
      email.gsub(/[^@.]/, 'x')
    end

    # And replace UCS-2 ones (for Microsoft Office documents)...
    # Find emails, by finding them in parts of text that have ASCII
    # equivalents to the UCS-2
    ascii_chars = text.gsub(/\0/, "")
    emails = ascii_chars.scan(MySociety::Validate.email_find_regexp)

    # Convert back to UCS-2, making a mask at the same time. scan returns
    # capture-group arrays, hence email[0].
    if String.method_defined?(:encode)
      emails.map! do |email|
        # We want the ASCII representation of UCS-2
        [email[0].encode('UTF-16LE').force_encoding('US-ASCII'),
         email[0].gsub(/[^@.]/, 'x').encode('UTF-16LE').force_encoding('US-ASCII')]
      end
    else
      # Fallback for Rubies without String#encode
      emails.map! do |email|
        [Iconv.conv('ucs-2le', 'ascii', email[0]),
         Iconv.conv('ucs-2le', 'ascii', email[0].gsub(/[^@.]/, 'x'))]
      end
    end

    # Now search and replace the UCS-2 email with the UCS-2 mask
    emails.each { |email, mask| text.gsub!(email, mask) }

    # Replace censor items
    censor_rules = options[:censor_rules] || []
    censor_rules.each { |censor_rule| censor_rule.apply_to_binary!(text) }

    raise "internal error in apply_binary_masks!" if text.mb_chars.size != orig_size
    text
  end

  # The masks always applied to text: email addresses, mobile phone numbers
  # and login links on this site's domain.
  #
  # Returns an Array of { :to_replace => Regexp, :replacement => String }.
  def default_text_masks
    # Escape the configured domain so that its dots only match literal dots
    # and links on look-alike hosts are not treated as login links.
    escaped_domain = Regexp.escape(AlaveteliConfiguration::domain.to_s)

    [{ :to_replace => MySociety::Validate.email_find_regexp,
       :replacement => "[#{_("email address")}]" },
     { :to_replace => /(Mobile|Mob)([\s\/]*(Fax|Tel))*\s*:?[\s\d]*\d/,
       :replacement => "[#{_("mobile number")}]" },
     { :to_replace => %r{https?://#{escaped_domain}/c/[^\s]+},
       :replacement => "[#{_("{{site_name}} login link",
                             :site_name => AlaveteliConfiguration::site_name)}]" }]
  end

  # Applies options[:masks], then the default text masks, then
  # options[:censor_rules] (via apply_to_text!) to text in place.
  #
  # Returns text.
  def apply_text_masks!(text, options = {})
    masks = (options[:masks] || []) + default_text_masks
    censor_rules = options[:censor_rules] || []

    masks.each { |mask| text.gsub!(mask[:to_replace], mask[:replacement]) }
    censor_rules.each { |censor_rule| censor_rule.apply_to_text!(text) }
    text
  end
end
|