From 918c2b69bd87f3ddee1ad6956a5b88ce0e8865ea Mon Sep 17 00:00:00 2001
From: Colin Copeland <copelco@caktusgroup.com>
Date: Sat, 31 May 2014 18:26:22 -0400
Subject: [PATCH] first attempt at scraping wake county data

---
 eatsmart/locations/wake/__init__.py           |   0
 eatsmart/locations/wake/api.py                | 159 ++++++++++++++++++
 eatsmart/locations/wake/forms.py              | 102 +++++++++++
 .../locations/wake/management/__init__.py     |   0
 .../wake/management/commands/__init__.py      |   0
 .../wake/management/commands/import_wake.py   |  10 ++
 eatsmart/locations/wake/models.py             |   0
 eatsmart/settings/base.py                     |   1 +
 inspections/admin.py                          |  47 +++++-
 ...pening_date__chg_field_establishment_st.py |  70 ++++++++
 inspections/models.py                         |  12 +-
 11 files changed, 388 insertions(+), 13 deletions(-)
 create mode 100644 eatsmart/locations/wake/__init__.py
 create mode 100644 eatsmart/locations/wake/api.py
 create mode 100644 eatsmart/locations/wake/forms.py
 create mode 100644 eatsmart/locations/wake/management/__init__.py
 create mode 100644 eatsmart/locations/wake/management/commands/__init__.py
 create mode 100644 eatsmart/locations/wake/management/commands/import_wake.py
 create mode 100644 eatsmart/locations/wake/models.py
 create mode 100644 inspections/migrations/0003_auto__chg_field_establishment_opening_date__chg_field_establishment_st.py

diff --git a/eatsmart/locations/wake/__init__.py b/eatsmart/locations/wake/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/eatsmart/locations/wake/api.py b/eatsmart/locations/wake/api.py
new file mode 100644
index 0000000..80c21b9
--- /dev/null
+++ b/eatsmart/locations/wake/api.py
@@ -0,0 +1,159 @@
+import csv
+import os
+import io
+import logging
+import requests
+import zipfile
+import tempfile
+import datetime
+import time
+import pprint
+
+
+from eatsmart.locations.base import Importer
+from eatsmart.locations.wake import forms
+from inspections.models import Establishment, Inspection, Violation
+
+
+logger = logging.getLogger(__name__)
+
+
+class WakeCounty(object):  # fetches Wake County's inspection archive and runs the CSV importers
+
+    url = "http://www.wakegov.com/data/Documents/WCRestaurantInspections.zip"
+
+    def download_and_unzip_data(self, destination):  # destination: directory to extract into
+        logger.debug("Requesting {}".format(self.url))
+        response = requests.get(self.url)  # NOTE(review): no timeout or HTTP status check here
+        archive = zipfile.ZipFile(io.BytesIO(response.content))  # archive is opened from memory
+        logger.debug("Extracting archive into {}".format(str(destination)))
+        archive.extractall(path=destination)
+
+    def import_lives(self):  # only the violations import is currently enabled
+        with tempfile.TemporaryDirectory(prefix='wake') as destination:
+            logger.debug("Created temp directory {}".format(destination))
+            self.download_and_unzip_data(destination)
+            businesses = os.path.join(destination, 'businesses.csv')  # unused while the import below is disabled
+            # if os.path.exists(businesses):
+            #     BusinessImporter().run(path=businesses)
+            inspections = os.path.join(destination, 'inspections.csv')  # unused while the import below is disabled
+            # Inspection.objects.filter(establishment__county='Wake').delete()
+            # if os.path.exists(inspections):
+            #     InspectionImporter().run(path=inspections)
+            violations = os.path.join(destination, 'violations.csv')
+            if os.path.exists(violations):  # a missing violations.csv is skipped without any log
+                ViolationImporter().run(path=violations)
+
+
+class WakeCSVImporter(Importer):
+    "Special importer that reads a local CSV file decoded as ISO-8859-1"
+
+    def run(self, path):
+        logger.debug("Importing {}".format(path))
+        with open(path, 'r', encoding='ISO-8859-1') as csv_file:
+            reader = csv.DictReader(csv_file)  # each row is a dict keyed by the CSV header
+            self.fetch(reader)
+
+    def fetch(self, data, **kwargs):
+        "Primary import workflow: validate each row with Form, then look up and save"
+        objects = []
+        start_time = time.time()
+        for index, api in enumerate(data):
+            row = self.map_fields(api=api, **kwargs)
+            form = self.Form(dict(row))
+            if not form.is_valid():
+                errors = {'model': self.Model._meta.object_name,
+                          'errors': dict(form.errors.items()),
+                          'cleaned_data': form.cleaned_data,
+                          'api': api,
+                          'row': row}
+                logger.error(pprint.pformat(errors, indent=4))
+                continue  # invalid rows are logged and skipped
+            try:
+                instance = self.get_instance(data=form.cleaned_data, **kwargs)
+            except self.Model.DoesNotExist:  # NOTE(review): other lookup errors propagate
+                # Instance doesn't exist, must be new
+                instance = None
+            if instance:
+                form.instance = instance  # presumably makes save() update this row — confirm
+            objects.append(form.save())
+            if index % 20 == 0:  # progress log on every 20th row, starting with the first
+                elapsed_time = time.time() - start_time
+                values = {'model': self.Model._meta.object_name,
+                          'id': row.get('external_id', 'n/a'),
+                          's': len(objects)/elapsed_time}  # NOTE(review): divides by zero if no time elapsed
+                msg = "{model} ID: {id} ({s:.2f} records/sec)".format(**values)
+                logger.debug(msg)
+                start_time = time.time()  # restart the rate window
+                objects = []
+
+
+class BusinessImporter(WakeCSVImporter):
+    "Import Wake County, NC restaurants"
+
+    Model = Establishment
+    Form = forms.BusinessForm
+
+    def get_instance(self, data):
+        "Look up the existing Establishment by external_id and county"
+        return self.Model.objects.get(external_id=data['external_id'],
+                                      county=data['county'])
+
+    def map_fields(self, api):
+        "Map CSV field names from Wake's data to our database schema"
+        return {'external_id': api['business_id'],
+                'name': api['name'],
+                'type': 1,  # Restaurant
+                'address': api['address'],
+                'city': api['city'],
+                'county': 'Wake',  # constant, not read from the CSV
+                'state': 'NC',  # constant, not read from the CSV
+                'postal_code': api['postal_code'],
+                'phone_number': api['phone_number'],
+                'lat': api['latitude'],  # lat/lon are combined into a Point by BusinessForm.clean
+                'lon': api['longitude'],
+                'status': 'active'}
+
+
+class InspectionImporter(WakeCSVImporter):
+    "Import Wake inspections"
+
+    Model = Inspection
+    Form = forms.InspectionForm
+
+    def get_instance(self, data):
+        "An inspection with the same establishment, date, and type is treated as existing"
+        query = {
+            'date': data['date'],
+            'type': data['type'],
+            'establishment': data['establishment'],
+        }
+        return self.Model.objects.get(**query)
+
+    def map_fields(self, api):
+        "Map CSV field names from Wake's data to our database schema"
+        return {'establishment': api['business_id'],  # resolved to an Establishment by InspectionForm
+                'date': api['date'],
+                'type': api['type'],  # mapped to an integer code by InspectionForm.clean_type
+                'score': api['score'],
+                'description': api['description']}
+
+
+class ViolationImporter(WakeCSVImporter):
+    "Import Wake violations"
+
+    Model = Violation
+    Form = forms.ViolationForm
+
+    def get_instance(self, data):
+        "Instance exists if a violation matches the same date, code, and inspection"
+        return self.Model.objects.get(date=data['date'],
+                                      code=data['code'],
+                                      inspection=data['inspection'])
+
+    def map_fields(self, api):
+        "Map CSV field names from Wake's data to our database schema"
+        return {'establishment': api['business_id'],  # ViolationForm.clean derives the inspection from this + date
+                'date': api['date'],
+                'code': api['code'],
+                'description': api['description']}
diff --git a/eatsmart/locations/wake/forms.py b/eatsmart/locations/wake/forms.py
new file mode 100644
index 0000000..58d0c38
--- /dev/null
+++ b/eatsmart/locations/wake/forms.py
@@ -0,0 +1,102 @@
+import logging
+
+from django.contrib.gis import forms
+from django.contrib.gis.geos import Point
+
+from inspections.models import Establishment, Inspection, Violation
+
+
+DATE_FORMATS = ['%Y%m%d']  # compact dates such as 20140531
+INSPECTION_TYPE_MAP = {  # CSV type string -> integer returned by InspectionForm.clean_type
+    'initial': 5,  # Permit
+    'routine': 1,  # Routine Inspection
+    'followup': 9,  # Verification
+    'complaint': 31,  # Critical Violation Followup
+}
+LIVES_INSPECTION_TYPES = [(x, x) for x in INSPECTION_TYPE_MAP.keys()]  # choices for InspectionForm.type
+
+logger = logging.getLogger(__name__)
+
+
+class BusinessForm(forms.ModelForm):
+    "Validate and clean Wake's business data"
+
+    lat = forms.FloatField(required=False)
+    lon = forms.FloatField(required=False)
+
+    class Meta:
+        model = Establishment
+        exclude = ('location',)  # location is built from lat/lon in clean() instead
+
+    def clean_city(self):
+        city = self.cleaned_data['city']
+        return city.title()  # normalize capitalization, e.g. "RALEIGH" -> "Raleigh"
+
+    def clean(self):
+        lat = self.cleaned_data.get('lat', None)
+        lon = self.cleaned_data.get('lon', None)
+        if lat and lon:  # NOTE(review): truthiness test also skips a 0.0 coordinate
+            self.cleaned_data['location'] = Point(lon, lat)  # Point takes x (lon) first, then y (lat)
+        return self.cleaned_data
+
+    def save(self, commit=True):  # NOTE(review): commit is ignored; the instance is always saved
+        instance = super().save(commit=False)
+        if 'location' in self.cleaned_data:
+            instance.location = self.cleaned_data['location']
+        instance.save()
+        return instance
+
+
+class InspectionForm(forms.ModelForm):
+    "Validate and clean Wake's inspection data"
+
+    establishment = forms.CharField()  # external_id from the CSV; replaced by the instance in clean_establishment
+    score = forms.FloatField(required=False)
+    date = forms.DateTimeField(input_formats=DATE_FORMATS)
+    type = forms.ChoiceField(choices=LIVES_INSPECTION_TYPES)
+
+    class Meta:
+        model = Inspection
+
+    def clean_type(self):
+        type_ = self.cleaned_data['type']
+        return INSPECTION_TYPE_MAP[type_]  # the field's choices are exactly this map's keys
+
+    def clean_establishment(self):
+        query = {'county': 'Wake',
+                 'external_id': self.cleaned_data['establishment']}
+        try:
+            return Establishment.objects.get(**query)
+        except Establishment.DoesNotExist:
+            raise forms.ValidationError("Establishment doesn't exist")
+
+
+class ViolationForm(forms.ModelForm):
+    "Validate and clean Wake's violation data"
+
+    establishment = forms.CharField()  # external_id from the CSV; replaced by the instance in clean()
+    inspection = forms.CharField(required=False)  # not supplied by the importer; filled in by clean()
+    date = forms.DateTimeField(input_formats=DATE_FORMATS)
+
+    class Meta:
+        model = Violation
+
+    def clean(self):
+        cleaned_data = self.cleaned_data
+        query = {'county': 'Wake',
+                 'external_id': cleaned_data['establishment']}  # NOTE(review): KeyError if this field failed validation
+        try:
+            establishment = Establishment.objects.get(**query)
+        except Establishment.DoesNotExist:
+            raise forms.ValidationError("Establishment doesn't exist")
+        query = {'date': cleaned_data['date'],  # NOTE(review): same KeyError risk for an unparseable date
+                 'establishment': establishment}
+        try:
+            inspection = Inspection.objects.get(**query)  # the violation is tied to the inspection on the same date
+        except Inspection.DoesNotExist:
+            raise forms.ValidationError("Inspection doesn't exist")
+        except Inspection.MultipleObjectsReturned:
+            raise forms.ValidationError("Multiple inspections found: {}".format(str(query)))
+        cleaned_data['inspection'] = inspection
+        cleaned_data['establishment'] = establishment
+        return cleaned_data
diff --git a/eatsmart/locations/wake/management/__init__.py b/eatsmart/locations/wake/management/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/eatsmart/locations/wake/management/commands/__init__.py b/eatsmart/locations/wake/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/eatsmart/locations/wake/management/commands/import_wake.py b/eatsmart/locations/wake/management/commands/import_wake.py
new file mode 100644
index 0000000..8fac004
--- /dev/null
+++ b/eatsmart/locations/wake/management/commands/import_wake.py
@@ -0,0 +1,10 @@
+from django.core.management.base import BaseCommand
+
+from eatsmart.locations.wake import api
+
+
+class Command(BaseCommand):
+    """Import sanitation data from Wake County's downloadable archive"""
+
+    def handle(self, *args, **options):
+        api.WakeCounty().import_lives()
diff --git a/eatsmart/locations/wake/models.py b/eatsmart/locations/wake/models.py
new file mode 100644
index 0000000..e69de29
diff --git a/eatsmart/settings/base.py b/eatsmart/settings/base.py
index 50e2499..e7981e1 100644
--- a/eatsmart/settings/base.py
+++ b/eatsmart/settings/base.py
@@ -151,6 +151,7 @@
     'inspections',
     'users',
     'eatsmart.locations.durham',
+    'eatsmart.locations.wake',
 )
 
 # A sample logging configuration. The only tangible logging
diff --git a/inspections/admin.py b/inspections/admin.py
index a6aa1bb..ca4f42e 100644
--- a/inspections/admin.py
+++ b/inspections/admin.py
@@ -5,9 +5,8 @@
 
 class EstablishmentAdmin(LeafletGeoAdmin):
     search_fields = ('name', 'address')
-    list_display = ('id', 'name', 'type',
-                    'county', 'state_id', 'point', 'update_date')
-    list_filter = ('county', 'postal_code')
+    list_display = ('id', 'name', 'type', 'county', 'point', 'update_date')
+    list_filter = ('county', 'update_date')
     ordering = ('-update_date',)
 
     def point(self, obj):
@@ -16,23 +15,53 @@ def point(self, obj):
         return None
 
 
+class InspectionCountyFilter(admin.SimpleListFilter):  # list filter on the related establishment's county
+    title = 'County'
+    parameter_name = 'county'
+
+    def lookups(self, request, model_admin):  # one (value, label) choice per county name
+        counties = Establishment.objects.values_list('county', flat=True)
+        return [(name, name) for name in counties.distinct()]  # NOTE(review): a default Meta.ordering could defeat distinct() — confirm
+
+    def queryset(self, request, queryset):
+        if self.value():
+            return queryset.filter(establishment__county=self.value())
+        else:
+            return queryset  # no county selected: leave the list unfiltered
+
+
 class InspectionAdmin(admin.ModelAdmin):
     search_fields = ('id', 'establishment__external_id', 'external_id',
                      'establishment__name')
-    list_display = ('id', 'external_id', 'establishment', 'type',
-                    'date', 'update_date')
-    list_filter = ('update_date', 'type')
+    list_display = ('id', 'establishment', 'type',
+                    'date', 'external_id', 'update_date')
+    list_filter = (InspectionCountyFilter, 'type', 'update_date')
     ordering = ('-date',)
     raw_id_fields = ('establishment',)
     date_hierarchy = 'date'
 
 
+class ViolationCountyFilter(admin.SimpleListFilter):  # list filter on the county reached via inspection -> establishment
+    title = 'County'
+    parameter_name = 'county'
+
+    def lookups(self, request, model_admin):  # one (value, label) choice per county name
+        counties = Establishment.objects.values_list('county', flat=True)
+        return [(name, name) for name in counties.distinct()]  # NOTE(review): a default Meta.ordering could defeat distinct() — confirm
+
+    def queryset(self, request, queryset):
+        if self.value():
+            return queryset.filter(inspection__establishment__county=self.value())  # violations without an inspection never match
+        else:
+            return queryset  # no county selected: leave the list unfiltered
+
+
 class ViolationAdmin(admin.ModelAdmin):
     search_fields = ('id', 'external_id', 'code', 'description',
                      'establishment__name')
-    list_display = ('id', 'external_id', 'establishment', 'code',
-                    'date', 'comments')
-    list_filter = ('code',)
+    list_display = ('id', 'establishment', 'code',
+                    'date', 'comments', 'external_id')
+    list_filter = (ViolationCountyFilter, 'date')
     raw_id_fields = ('establishment', 'inspection')
     ordering = ('-date',)
     date_hierarchy = 'date'
diff --git a/inspections/migrations/0003_auto__chg_field_establishment_opening_date__chg_field_establishment_st.py b/inspections/migrations/0003_auto__chg_field_establishment_opening_date__chg_field_establishment_st.py
new file mode 100644
index 0000000..4dca0a6
--- /dev/null
+++ b/inspections/migrations/0003_auto__chg_field_establishment_opening_date__chg_field_establishment_st.py
@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+from south.utils import datetime_utils as datetime
+from south.db import db
+from south.v2 import SchemaMigration
+from django.db import models
+
+
+class Migration(SchemaMigration):
+
+    def forwards(self, orm):
+
+        # Changing field 'Establishment.opening_date' to allow NULL
+        db.alter_column('inspections_establishment', 'opening_date', self.gf('django.db.models.fields.DateTimeField')(null=True))
+
+        # Changing field 'Establishment.state_id' to allow NULL
+        db.alter_column('inspections_establishment', 'state_id', self.gf('django.db.models.fields.BigIntegerField')(null=True))
+
+    def backwards(self, orm):
+
+        # Changing field 'Establishment.opening_date' back to a non-null column defaulting to 1970-01-01
+        db.alter_column('inspections_establishment', 'opening_date', self.gf('django.db.models.fields.DateTimeField')(default=datetime.datetime(1970, 1, 1, 0, 0)))
+
+        # Changing field 'Establishment.state_id' back to a non-null column defaulting to 0
+        db.alter_column('inspections_establishment', 'state_id', self.gf('django.db.models.fields.BigIntegerField')(default=0))
+
+    models = {
+        'inspections.establishment': {
+            'Meta': {'unique_together': "(('external_id', 'county'),)", 'object_name': 'Establishment'},
+            'address': ('django.db.models.fields.CharField', [], {'max_length': '255'}),
+            'city': ('django.db.models.fields.CharField', [], {'max_length': '64'}),
+            'county': ('django.db.models.fields.CharField', [], {'db_index': 'True', 'max_length': '64'}),
+            'external_id': ('django.db.models.fields.CharField', [], {'max_length': '128'}),
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'location': ('django.contrib.gis.db.models.fields.PointField', [], {'blank': 'True', 'null': 'True'}),
+            'name': ('django.db.models.fields.CharField', [], {'max_length': '255'}),
+            'opening_date': ('django.db.models.fields.DateTimeField', [], {'blank': 'True', 'null': 'True'}),
+            'phone_number': ('django.db.models.fields.CharField', [], {'blank': 'True', 'max_length': '64'}),
+            'postal_code': ('django.db.models.fields.CharField', [], {'max_length': '16'}),
+            'property_id': ('django.db.models.fields.CharField', [], {'blank': 'True', 'max_length': '128'}),
+            'state': ('django.db.models.fields.CharField', [], {'max_length': '64'}),
+            'state_id': ('django.db.models.fields.BigIntegerField', [], {'blank': 'True', 'null': 'True'}),
+            'status': ('django.db.models.fields.CharField', [], {'default': "'active'", 'max_length': '32'}),
+            'type': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}),
+            'update_date': ('django.db.models.fields.DateTimeField', [], {'blank': 'True', 'db_index': 'True', 'null': 'True'})
+        },
+        'inspections.inspection': {
+            'Meta': {'object_name': 'Inspection'},
+            'date': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
+            'description': ('django.db.models.fields.TextField', [], {'blank': 'True'}),
+            'establishment': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'inspections'", 'to': "orm['inspections.Establishment']"}),
+            'external_id': ('django.db.models.fields.CharField', [], {'max_length': '128'}),
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'score': ('django.db.models.fields.FloatField', [], {'blank': 'True', 'null': 'True'}),
+            'type': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}),
+            'update_date': ('django.db.models.fields.DateTimeField', [], {'blank': 'True', 'db_index': 'True', 'null': 'True'})
+        },
+        'inspections.violation': {
+            'Meta': {'object_name': 'Violation'},
+            'code': ('django.db.models.fields.CharField', [], {'max_length': '32'}),
+            'date': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
+            'description': ('django.db.models.fields.TextField', [], {'blank': 'True'}),
+            'establishment': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'violations'", 'to': "orm['inspections.Establishment']"}),
+            'external_id': ('django.db.models.fields.CharField', [], {'max_length': '128'}),
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'inspection': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'violations'", 'blank': 'True', 'to': "orm['inspections.Inspection']", 'null': 'True'}),
+            'update_date': ('django.db.models.fields.DateTimeField', [], {'blank': 'True', 'db_index': 'True', 'null': 'True'})
+        }
+    }
+
+    complete_apps = ['inspections']
diff --git a/inspections/models.py b/inspections/models.py
index 025f693..e1516d6 100644
--- a/inspections/models.py
+++ b/inspections/models.py
@@ -50,7 +50,8 @@ class Establishment(models.Model):
         (73, ugettext_lazy('Temporary Food Establishment')),
     )
     external_id = models.CharField(ugettext_lazy("External ID"), max_length=128)
-    state_id = models.BigIntegerField(ugettext_lazy("State ID"))
+    state_id = models.BigIntegerField(ugettext_lazy("State ID"), null=True,
+                                      blank=True)
     property_id = models.CharField(ugettext_lazy("Property ID"), max_length=128, blank=True)
     name = models.CharField(ugettext_lazy("Name"), max_length=255)
     type = models.PositiveIntegerField(ugettext_lazy("Type"), default=0, choices=TYPE_CHOICES)
@@ -60,7 +61,8 @@ class Establishment(models.Model):
     state = models.CharField(ugettext_lazy("State"), max_length=64)
     postal_code = models.CharField(ugettext_lazy("Postal Code"), max_length=16)
     phone_number = models.CharField(ugettext_lazy("Phone Number"), max_length=64, blank=True)
-    opening_date = models.DateTimeField(ugettext_lazy("Opening Date"))
+    opening_date = models.DateTimeField(ugettext_lazy("Opening Date"),
+                                        null=True, blank=True)
     update_date = models.DateTimeField(ugettext_lazy("Update Date"), null=True, blank=True, db_index=True)
     status = models.CharField(ugettext_lazy("Status"), choices=STATUS_CHOICES, max_length=32,
                               default='active')
@@ -96,7 +98,8 @@ class Inspection(models.Model):
     establishment = models.ForeignKey(Establishment,
                                       verbose_name=ugettext_lazy("Establishment"),
                                       related_name='inspections')
-    external_id = models.CharField(ugettext_lazy("External ID"), max_length=128)
+    external_id = models.CharField(ugettext_lazy("External ID"),
+                                   max_length=128, blank=True)
     date = models.DateTimeField(ugettext_lazy("Date"), db_index=True)
     score = models.FloatField(ugettext_lazy("Score"), null=True, blank=True)
     description = models.TextField(ugettext_lazy("Description"), blank=True)
@@ -118,7 +121,8 @@ class Violation(models.Model):
     inspection = models.ForeignKey(Inspection, related_name='violations',
                                    verbose_name=ugettext_lazy("Inspection"), null=True,
                                    blank=True)
-    external_id = models.CharField(ugettext_lazy("External ID"), max_length=128)
+    external_id = models.CharField(ugettext_lazy("External ID"),
+                                   max_length=128, blank=True)
     date = models.DateTimeField(ugettext_lazy("Date"), db_index=True)
     code = models.CharField(ugettext_lazy("Code"), max_length=32)
     description = models.TextField(ugettext_lazy("Description"), blank=True)