aboutsummaryrefslogtreecommitdiffstats
path: root/bin/fixmystreet.com/canonicalise-csv
blob: c0a7fc60be66ca77fc727e1734046e4eb1cb39c8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/perl -w

# canonicalise-csv:
# Convert provided CSV file into one with standard names for MaPit
#
# Copyright (c) 2006 UK Citizens Online Democracy. All rights reserved.
# Email: matthew@mysociety.org. WWW: http://www.mysociety.org
#
# $Id: canonicalise-csv,v 1.4 2009-02-16 18:56:44 matthew Exp $

use strict;
require 5.8.0;

# Horrible boilerplate to set up appropriate library paths.
use FindBin;
use lib "$FindBin::Bin/../perllib";
use lib "$FindBin::Bin/../commonlib/perllib";

use mySociety::Config;
use mySociety::MaPit;
use mySociety::VotingArea;

# Compile-time setup: register conf/general (resolved relative to this
# script's directory) as the configuration file, then call MaPit's
# configure() -- presumably so it picks its settings up from that file.
BEGIN {
    my $config_path = "$FindBin::Bin/../conf/general";
    mySociety::Config::set_file($config_path);
    mySociety::MaPit::configure();
}

# Load data/councils.csv into %councils, mapping each council name (rewritten
# below into the abbreviated/corrected form used for lookups later in this
# script) to the email address from the second CSV column, or '' when the row
# has none.
my %councils;
my $councils_csv = "$FindBin::Bin/../data/councils.csv";
# Three-argument open with a lexical handle: the path can never be
# misread as a mode, and no global bareword handle is shared.
open(my $councils_fh, '<', $councils_csv)
    or die "Can't read councils.csv: $!\n";
while (my $line = <$councils_fh>) {
    $line =~ s/\r?\n//g;    # strip LF or CRLF line endings
    my ($name, $email) = split /,/, $line;
    # A blank line (or one holding only commas) splits to an empty list;
    # there is no council to record, so skip it rather than storing an
    # entry under an undefined name.
    next unless defined $name;
    $email ||= '';
    $email =~ s/\xa0//g;    # drop stray 0xA0 (non-breaking space) characters

    # Canonicalise
    next if $name eq 'Londonderry'; # Dupe of Derry
    next if $name eq 'Kingston upon Hull'; # Dupe of Hull
    next if $name eq 'London' || $name eq 'Greater London'; # Untrustworthy

    # The rewrites below are order-sensitive (e.g. 'Hull' is expanded only
    # after the 'Kingston upon Hull' duplicate has been skipped above, and
    # 'North East '/'North West ' are handled before the generic 'North ').

    # Different
    $name = 'Corporation of London' if $name eq 'City of London';
    $name = "Renfrewsh'r" if $name eq 'Renfrewshire';
    # Shorter
    $name = 'Blackburn' if $name eq 'Blackburn with Darwen';
    $name = 'Dungannon' if $name eq 'Dungannon & South Tyrone';
    $name = 'Staffordshire' if $name eq 'Staffordshire County';
    $name = 'Armagh' if $name eq 'Armagh City';
    # Longer
    $name = 'Kingston upon Hull' if $name eq 'Hull';
    $name = "City of $name" if $name =~ /^(Edinburgh|Glasgow|York)$/;
    $name .= ' Islands' if $name eq 'Shetland';
    $name .= ' & Chelsea' if $name eq 'Kensington';
    # Wrong
    $name =~ s/King's Lynn/Kings Lynn/;
    $name = 'Surrey Heath' if $name eq 'Surrey Health';
    $name = 'Barking & Dagenham' if $name eq 'Barking-Dagenham';
    $name = 'Newtownabbey' if $name eq 'Newtonabbey';
    $name = 'Isles of Scilly' if $name eq 'Isle of Scilly';
    # Compass
    $name =~ s/North East /N. E. /;
    $name =~ s/^North West /N. W. /;
    $name =~ s/^North (?!Lincolnshire|Norfolk|Somerset)/N. /;
    $name =~ s/^South (?!Shropshire|Staffordshire|Somerset)/S. /;
    $name =~ s/^East (?!Staffordshire)/E. /;
    $name =~ s/^West(ern)? (?!Berkshire|Wiltshire)/W. /;
    $name =~ s/ W / W. /;
    # Various
    $name =~ s/^Great /Gt. /;
    $name =~ s/^St /St. /;
    $name =~ s/ and / & /;
    $name =~ s/ ?Royal$//;
    $name =~ s/ Borough$//;
    $name =~ s/-(upon|on|le)-/ $1 /;
    # A later row with the same canonical name overwrites an earlier one.
    $councils{$name} = $email;
}
close($councils_fh);

# Walk every MaPit area of each council parent type and match it against the
# names loaded into %councils:
#   %out     - area id => email, for areas whose council has an address
#              containing '@'
#   @missing - ids of areas whose council is listed in %councils but has no
#              usable address
# Areas whose name is not in %councils at all are left out of both.
my $types = $mySociety::VotingArea::council_parent_types;
my (%out, @missing);

# Stripping the 'City Council'/'County Council' suffix would collapse both
# Durham authorities to 'Durham', so they are looked up under explicit keys.
my %durham_key_for = (
    'Durham City Council'   => 'Durham City',
    'Durham County Council' => 'Durham County',
);

foreach my $type (@$types) {
    my $areas = mySociety::MaPit::get_areas_by_type($type);
    my $areas_info = mySociety::MaPit::get_voting_areas_info($areas);
    foreach my $id (keys %$areas_info) {
        my $name = $areas_info->{$id}{name};

        my $key = $durham_key_for{$name};
        unless (defined $key) {
            # Reduce e.g. 'Foo Borough Council' / 'Foo City Council' to 'Foo'.
            ($key = $name) =~ s/( (Borough|City|District|County))* Council//;
        }

        # Every council, the Durham pair included, goes through the same
        # check, so an absent or address-less entry can never be written out
        # as an (undefined or empty) email.
        my $email = $councils{$key};
        if (defined $email && $email =~ /@/) {
            $out{$id} = $email;
        } elsif (exists $councils{$key}) {
            push @missing, $id;
        }
    }
}

# Output missing IDs to STDOUT (one comma-separated line)
print join(',', @missing) . "\n";

# Output emails to canonical CSV, one "id,email" row per matched area, with
# ids in string sort order.
my $canonical_csv = "$FindBin::Bin/../data/councils_canonical.csv";
# Three-argument open, checked: failing to create the file must stop the
# script rather than silently discarding every row.
open(my $canonical_fh, '>', $canonical_csv)
    or die "Can't write councils_canonical.csv: $!\n";
foreach my $id (sort keys %out) {
    print {$canonical_fh} "$id," . $out{$id} . "\n";
}
# Buffered write errors (e.g. a full disk) only surface at close.
close($canonical_fh)
    or die "Can't close councils_canonical.csv: $!\n";