aboutsummaryrefslogtreecommitdiffstats
path: root/bin/problem-creation-graph
blob: e1b6f2cc405f96e4e3f5cfb93aa734daf086b2b8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env perl

# problem-creation-graph:
# Plot graph of rate of problem creation
#
# Copyright (c) 2014 UK Citizens Online Democracy. All rights reserved.
# Email: matthew@mysociety.org. WWW: http://www.mysociety.org

use strict;
use warnings;
require 5.8.0;

use FindBin;
use File::Temp qw(tempfile);

use FixMyStreet::App;

chdir("$FindBin::Bin/../");

my %config = (
    gplot_output => "set terminal png font 'Vera.ttf' 9 size 1200,400",
    extension => '.png',
    #gplot_output => "set terminal fig color big thickness 1"
    #extension => ".fig"
    #gplot_output => "set terminal svg size 800 250"
    #extension => ".svg"
);

my $base_url = FixMyStreet->config('BASE_URL');
$config{date} = '2007-02-01';
$config{date} = '2011-03-03' if $base_url =~ /fiksgatami/;
$config{date} = '2008-10-01' if $base_url =~ /emptyhomes/;

my %sources;
sub grab_data {
    my ($type, $selection) = @_;

    my ($fh, $filename) = tempfile("fms-creation-rate-graph-data-XXXXXXXXXX", UNLINK => 1);
    $sources{$type} = $filename;

    my @entries = FixMyStreet::App->model('DB::Problem')->search(
        $selection, {
            columns => [
                { 'date' => { date => 'created' } },
                { 'count' => { count => '*' } }
            ],
            group_by => [ 'date' ],
            order_by => [ 'date' ],
        }
    );
    @entries = map { { $_->get_columns } } @entries;
    foreach (@entries) {
        $fh->print($_->{date}, ' ', $_->{count}, "\n");
    }
}

# rather nastily, work out the cumulative heights in reverse, so can
# plot impulses on top of each other
grab_data('unconfirmed', {});
my @states_to_ignore = ('unconfirmed');
grab_data('open', { state => { -not_in => \@states_to_ignore } });
push @states_to_ignore, FixMyStreet::DB::Result::Problem->open_states();
grab_data('fixed', { state => { -not_in => \@states_to_ignore } });
push @states_to_ignore, FixMyStreet::DB::Result::Problem->fixed_states();
grab_data('closed', { state => { -not_in => \@states_to_ignore } });
push @states_to_ignore, FixMyStreet::DB::Result::Problem->closed_states();
grab_data('hidden', { state => { -not_in => \@states_to_ignore } });
push @states_to_ignore, 'hidden';
grab_data('other', { state => { -not_in => \@states_to_ignore } });

my $gp = <<END;
    unset border
    unset arrow
    set key left
    set tics out
    $config{gplot_output}

    set xdata time
    set timefmt "%Y-%m-%d"
    set xrange ["$config{date}":]
    set format x "%d %b %Y"
    set xtics nomirror
    set xlabel "status of problems that were created on each calendar day"

    set ytics nomirror 
    set ylabel "number of problems created on the calendar day" 
    set y2tics tc lt 0
    set y2label "cumulative total number of problems" tc lt 0
    set format y2 "%.0f"

#    set arrow 1 from '2005-02-14', 0 to '2005-02-14', 900 lt 0 nohead
#    set label 1 'launch of beta' at '2005-02-17', 900

    n = 0
END
$gp .= "    plot \"$sources{unconfirmed}\" using 1:2 with impulses lt 1 lw 2 title \"unconfirmed\",";
$gp .= " \"$sources{open}\" using 1:2 with impulses lt 3 lw 2 title \"open\","
    if -s $sources{open};
$gp .= " \"$sources{fixed}\" using 1:2 with impulses lt 2 lw 2 title \"fixed\","
    if -s $sources{fixed};
$gp .= " \"$sources{closed}\" using 1:2 with impulses lt 4 lw 2 title \"closed\","
    if -s $sources{closed};
$gp .= " \"$sources{hidden}\" using 1:2 with impulses lt 5 lw 2 title \"hidden\","
    if -s $sources{hidden};
$gp .= " \"$sources{other}\" using 1:2 with impulses lt 6 lw 2 title \"any other type\","
    if -s $sources{other};
$gp .= <<END;
    "< awk 'BEGIN { n = 0 } { n += \$2; print \$1, \$2, n; }' $sources{unconfirmed}" using 1:3 axes x1y2 with lines lt 0 title "cumulative total number of problems"
END

open(my $gnuplot, '|-', "GDFONTPATH=/usr/share/fonts/truetype/ttf-bitstream-vera gnuplot > web/fms-live-creation$config{extension} 2> /dev/null");
$gnuplot->print($gp);
close $gnuplot;
.parse(copy_of_raw_data) mail.base64_decode if decode mail end # Extracts all attachments from the given TNEF file as a TMail::Mail object def mail_from_tnef(content) main = TMail::Mail.new main.set_content_type 'multipart', 'mixed', { 'boundary' => TMail.new_boundary } tnef_attachments(content).each do |attachment| tmail_attachment = TMail::Mail.new tmail_attachment['content-location'] = attachment[:filename] tmail_attachment.body = attachment[:content] main.parts << tmail_attachment end main end # Return a copy of the file name for the mail part def get_part_file_name(mail_part) part_file_name = TMail::Mail.get_part_file_name(mail_part) if part_file_name.nil? return nil end part_file_name = part_file_name.dup return part_file_name end # Get the body of a mail part def get_part_body(mail_part) mail_part.body end # Return the first from address if any def get_from_address(mail) if mail.from_addrs.nil? || mail.from_addrs.size == 0 return nil end mail.from_addrs[0].spec end # Return the first from name if any def get_from_name(mail) mail.from_name_if_present end def get_all_addresses(mail) ((mail.to || []) + (mail.cc || []) + (mail.envelope_to || [])).uniq end def empty_return_path?(mail) return false if mail['return-path'].nil? return true if mail['return-path'].addr.to_s == '<>' return false end def get_auto_submitted(mail) mail['auto-submitted'] ? mail['auto-submitted'].body : nil end def get_content_type(part) part.content_type end def get_header_string(header, mail) mail.header_string(header) end # Number the attachments in depth first tree order, for use in URLs. # XXX This fills in part.rfc822_attachment and part.url_part_number within # all the parts of the email (see monkeypatches in lib/mail_handler/tmail_extensions and # lib/mail_handler/mail_extensions for how these attributes are added). ensure_parts_counted # must be called before using the attributes. def ensure_parts_counted(mail) mail.count_parts_count = 0 _count_parts_recursive(mail, mail) # we carry on using these numeric ids for attachments uudecoded from within text parts mail.count_first_uudecode_count = mail.count_parts_count end def _count_parts_recursive(part, mail) if part.multipart? part.parts.each do |p| _count_parts_recursive(p, mail) end else part_filename = get_part_file_name(part) begin if part.content_type == 'message/rfc822' # An email attached as text # e.g. http://www.whatdotheyknow.com/request/64/response/102 part.rfc822_attachment = mail_from_raw_email(part.body, decode=false) elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook' # An email attached as an Outlook file # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi msg = Mapi::Msg.open(StringIO.new(part.body)) part.rfc822_attachment = mail_from_raw_email(msg.to_mime.to_s, decode=false) elsif part.content_type == 'application/ms-tnef' # A set of attachments in a TNEF file part.rfc822_attachment = mail_from_tnef(part.body) end rescue # If attached mail doesn't parse, treat it as text part part.rfc822_attachment = nil else unless part.rfc822_attachment.nil? _count_parts_recursive(part.rfc822_attachment, mail) end end if part.rfc822_attachment.nil? mail.count_parts_count += 1 part.url_part_number = mail.count_parts_count end end end def get_attachment_attributes(mail) leaves = get_attachment_leaves(mail) # XXX we have to call ensure_parts_counted after get_attachment_leaves # which is really messy. ensure_parts_counted(mail) attachment_attributes = [] for leaf in leaves body = get_part_body(leaf) # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here # to prevent excess memory use. XXX not really sure if this helps reduce # peak RAM use overall. Anyway, maybe there is something better to do than this. GC.start if leaf.within_rfc822_attachment within_rfc822_subject = leaf.within_rfc822_attachment.subject # Test to see if we are in the first part of the attached # RFC822 message and it is text, if so add headers. # XXX should probably use hunting algorithm to find main text part, rather than # just expect it to be first. This will do for now though. # Example request that needs this: # http://www.whatdotheyknow.com/request/2923/response/7013/attach/2/Cycle%20Path%20Bank.txt if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain' headers = "" for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ] if leaf.within_rfc822_attachment.header.include?(header.downcase) header_value = leaf.within_rfc822_attachment.header[header.downcase] # Example message which has a blank Date header: # http://www.whatdotheyknow.com/request/30747/response/80253/attach/html/17/Common%20Purpose%20Advisory%20Group%20Meeting%20Tuesday%202nd%20March.txt.html if !header_value.blank? headers = headers + header + ": " + header_value.to_s + "\n" end end end # XXX call _convert_part_body_to_text here, but need to get charset somehow # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt body = headers + "\n" + body # This is quick way of getting all headers, but instead we only add some a) to # make it more usable, b) as at least one authority accidentally leaked security # information into a header. #attachment.body = leaf.within_rfc822_attachment.port.to_s end end attachment_attributes << {:url_part_number => leaf.url_part_number, :content_type => get_content_type(leaf), :filename => get_part_file_name(leaf), :charset => leaf.charset, :within_rfc822_subject => within_rfc822_subject, :body => body, :hexdigest => Digest::MD5.hexdigest(body) } end attachment_attributes end # (This risks losing info if the unchosen alternative is the only one to contain # useful info, but let's worry about that another time) def get_attachment_leaves(mail) return _get_attachment_leaves_recursive(mail, mail) end def _get_attachment_leaves_recursive(curr_mail, parent_mail, within_rfc822_attachment = nil) leaves_found = [] if curr_mail.multipart? if curr_mail.parts.size == 0 raise "no parts on multipart mail" end if curr_mail.sub_type == 'alternative' # Choose best part from alternatives best_part = nil # Take the last text/plain one, or else the first one curr_mail.parts.each do |m| if not best_part best_part = m elsif m.content_type == 'text/plain' best_part = m end end # Take an HTML one as even higher priority. (They tend # to render better than text/plain, e.g. don't wrap links here: # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 ) curr_mail.parts.each do |m| if m.content_type == 'text/html' best_part = m end end leaves_found += _get_attachment_leaves_recursive(best_part, parent_mail, within_rfc822_attachment) else # Add all parts curr_mail.parts.each do |m| leaves_found += _get_attachment_leaves_recursive(m, parent_mail, within_rfc822_attachment) end end else # XXX Yuck. this section alters various content_types. That puts # it into conflict with ensure_parts_counted which it has to be # called both before and after. It will fail with cases of # attachments of attachments etc. charset = curr_mail.charset # save this, because overwriting content_type also resets charset # Don't allow nil content_types if curr_mail.content_type.nil? curr_mail.content_type = 'application/octet-stream' end # PDFs often come with this mime type, fix it up for view code if curr_mail.content_type == 'application/octet-stream' part_file_name = get_part_file_name(curr_mail) part_body = get_part_body(curr_mail) calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body) if calc_mime curr_mail.content_type = calc_mime end end # Use standard content types for Word documents etc. curr_mail.content_type = normalise_content_type(curr_mail.content_type) if curr_mail.content_type == 'message/rfc822' ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable if curr_mail.rfc822_attachment.nil? # Attached mail didn't parse, so treat as text curr_mail.content_type = 'text/plain' end end if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable if curr_mail.rfc822_attachment.nil? # Attached mail didn't parse, so treat as binary curr_mail.content_type = 'application/octet-stream' end end # If the part is an attachment of email if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, parent_mail, curr_mail.rfc822_attachment) else # Store leaf curr_mail.within_rfc822_attachment = within_rfc822_attachment leaves_found += [curr_mail] end # restore original charset curr_mail.charset = charset end return leaves_found end def address_from_name_and_email(name, email) if !MySociety::Validate.is_valid_email(email) raise "invalid email " + email + " passed to address_from_name_and_email" end if name.nil? return TMail::Address.parse(email).to_s end # Botch an always quoted RFC address, then parse it name = name.gsub(/(["\\])/, "\\\\\\1") TMail::Address.parse('"' + name + '" <' + email + '>').to_s end def address_from_string(string) TMail::Address.parse(string).address end end end end