Skip to content

Commit 4974c4c

Browse files
committed Mar 29, 2020
[data][xl]: add recovery data for Canada and sort results by country, province and date.
refs #47 refs #36
1 parent 7356b2f commit 4974c4c

5 files changed

+3080
-2987
lines changed
 

‎data/countries-aggregated.csv

+67-67
Large diffs are not rendered by default.

‎data/time-series-19-covid-combined.csv

+2,931-2,864
Large diffs are not rendered by default.

‎data/worldwide-aggregated.csv

+46-46
Original file line numberDiff line numberDiff line change
@@ -20,49 +20,49 @@ Date,Confirmed,Recovered,Deaths,Increase rate
2020
2020-02-09,40150,3244,906,8.162715517241379
2121
2020-02-10,42762,3946,1013,6.505603985056039
2222
2020-02-11,44802,4683,1113,4.770590711379262
23-
2020-02-12,45221,5149,1118,0.9352261059774116
24-
2020-02-13,60368,6294,1371,33.49549987837509
25-
2020-02-14,66885,8057,1523,10.795454545454545
26-
2020-02-15,69030,9394,1666,3.206997084548105
27-
2020-02-16,71224,10864,1770,3.1783282630740253
28-
2020-02-17,73258,12582,1868,2.855778950915422
29-
2020-02-18,75136,14351,2007,2.5635425482541154
30-
2020-02-19,75639,16120,2122,0.6694527257240205
31-
2020-02-20,76197,18176,2247,0.737714671003054
32-
2020-02-21,76819,18887,2251,0.8163051038754806
33-
2020-02-22,78572,22883,2458,2.281987529126909
34-
2020-02-23,78958,23391,2469,0.49126915440614977
35-
2020-02-24,79561,25224,2629,0.7636971554497327
36-
2020-02-25,80406,27902,2708,1.062078153869358
37-
2020-02-26,81388,30381,2770,1.221301892893565
38-
2020-02-27,82746,33271,2814,1.6685506462869217
39-
2020-02-28,84112,36705,2872,1.6508350856839
40-
2020-02-29,86011,39776,2941,2.2577040136960247
41-
2020-03-01,88369,42710,2996,2.741509806885166
42-
2020-03-02,90306,45596,3085,2.1919451391325016
43-
2020-03-03,92840,48222,3160,2.806015104201271
44-
2020-03-04,95120,51164,3254,2.455838000861698
45-
2020-03-05,97886,53790,3348,2.907905803195963
46-
2020-03-06,101801,55859,3460,3.9995504975175207
47-
2020-03-07,105847,58350,3558,3.974420683490339
48-
2020-03-08,109821,60686,3802,3.754475799975436
49-
2020-03-09,113590,62486,3988,3.4319483523187735
50-
2020-03-10,118620,64396,4262,4.428206708337001
51-
2020-03-11,125875,66995,4615,6.116169280053954
52-
2020-03-12,128352,68316,4720,1.9678252234359483
53-
2020-03-13,145205,70243,5404,13.1302979306906
54-
2020-03-14,156101,72616,5819,7.5038738335456765
55-
2020-03-15,167454,76026,6440,7.272855394904582
56-
2020-03-16,181574,78079,7126,8.432166445710463
57-
2020-03-17,197102,80831,7905,8.551885181799156
58-
2020-03-18,214821,83303,8733,8.989761646254223
59-
2020-03-19,242500,84966,9867,12.884680734192653
60-
2020-03-20,272035,87411,11299,12.179381443298968
61-
2020-03-21,304396,91682,12973,11.895895748708805
62-
2020-03-22,336953,97889,14651,10.695607038200238
63-
2020-03-23,378235,98341,16505,12.251560306630301
64-
2020-03-24,418045,107890,18625,10.52520258569408
65-
2020-03-25,467653,113604,21181,11.866665071942016
66-
2020-03-26,529591,121966,23970,13.244435510945083
67-
2020-03-27,593291,130659,27198,12.028150025208133
68-
2020-03-28,660706,138949,30652,11.36288937469134
23+
2020-02-12,45221,5150,1118,0.9352261059774116
24+
2020-02-13,60368,6295,1371,33.49549987837509
25+
2020-02-14,66885,8058,1523,10.795454545454545
26+
2020-02-15,69030,9395,1666,3.206997084548105
27+
2020-02-16,71224,10865,1770,3.1783282630740253
28+
2020-02-17,73258,12583,1868,2.855778950915422
29+
2020-02-18,75136,14352,2007,2.5635425482541154
30+
2020-02-19,75639,16121,2122,0.6694527257240205
31+
2020-02-20,76197,18177,2247,0.737714671003054
32+
2020-02-21,76819,18890,2251,0.8163051038754806
33+
2020-02-22,78572,22886,2458,2.281987529126909
34+
2020-02-23,78958,23394,2469,0.49126915440614977
35+
2020-02-24,79561,25227,2629,0.7636971554497327
36+
2020-02-25,80406,27905,2708,1.062078153869358
37+
2020-02-26,81388,30384,2770,1.221301892893565
38+
2020-02-27,82746,33277,2814,1.6685506462869217
39+
2020-02-28,84112,36711,2872,1.6508350856839
40+
2020-02-29,86011,39782,2941,2.2577040136960247
41+
2020-03-01,88369,42716,2996,2.741509806885166
42+
2020-03-02,90306,45602,3085,2.1919451391325016
43+
2020-03-03,92840,48228,3160,2.806015104201271
44+
2020-03-04,95120,51170,3254,2.455838000861698
45+
2020-03-05,97886,53796,3348,2.907905803195963
46+
2020-03-06,101801,55865,3460,3.9995504975175207
47+
2020-03-07,105847,58358,3558,3.974420683490339
48+
2020-03-08,109821,60694,3802,3.754475799975436
49+
2020-03-09,113590,62494,3988,3.4319483523187735
50+
2020-03-10,118620,64404,4262,4.428206708337001
51+
2020-03-11,125875,67003,4615,6.116169280053954
52+
2020-03-12,128352,68324,4720,1.9678252234359483
53+
2020-03-13,145205,70251,5404,13.1302979306906
54+
2020-03-14,156101,72624,5819,7.5038738335456765
55+
2020-03-15,167454,76034,6440,7.272855394904582
56+
2020-03-16,181574,78088,7126,8.432166445710463
57+
2020-03-17,197102,80840,7905,8.551885181799156
58+
2020-03-18,214821,83312,8733,8.989761646254223
59+
2020-03-19,242500,84975,9867,12.884680734192653
60+
2020-03-20,272035,87420,11299,12.179381443298968
61+
2020-03-21,304396,91692,12973,11.895895748708805
62+
2020-03-22,336953,97899,14651,10.695607038200238
63+
2020-03-23,378235,98351,16505,12.251560306630301
64+
2020-03-24,418045,108000,18625,10.52520258569408
65+
2020-03-25,467653,113787,21181,11.866665071942016
66+
2020-03-26,529591,122150,23970,13.244435510945083
67+
2020-03-27,593291,130915,27198,12.028150025208133
68+
2020-03-28,660706,139415,30652,11.36288937469134

‎datapackage.json

+8-8
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
{
2-
"bytes": 1149933,
3-
"count_of_rows": 28944,
4-
"hash": "4619027a96f66812a7529aafd82eb9c8",
2+
"bytes": 1154113,
3+
"count_of_rows": 29011,
4+
"hash": "c29556742bc7b81ae4a6ae6c235a0813",
55
"name": "covid-19",
66
"profile": "data-package",
77
"resources": [
88
{
9-
"bytes": 811609,
9+
"bytes": 815709,
1010
"dialect": {
1111
"caseSensitiveHeader": false,
1212
"delimiter": ",",
@@ -18,7 +18,7 @@
1818
},
1919
"encoding": "utf-8",
2020
"format": "csv",
21-
"hash": "e45a3ad0a2f9f549f3f5eb3cedac2b35",
21+
"hash": "71018c5cd76b83c9615320c4174b5e6b",
2222
"name": "time-series-19-covid-combined",
2323
"path": "data/time-series-19-covid-combined.csv",
2424
"profile": "tabular-data-resource",
@@ -151,7 +151,7 @@
151151
}
152152
},
153153
{
154-
"bytes": 332235,
154+
"bytes": 332315,
155155
"dialect": {
156156
"delimiter": ",",
157157
"doubleQuote": true,
@@ -161,7 +161,7 @@
161161
},
162162
"encoding": "utf-8",
163163
"format": "csv",
164-
"hash": "882ca245d90e7c660179640c74c6e66c",
164+
"hash": "c17c49a5fb6d33a046a3c933ba842b45",
165165
"name": "countries-aggregated",
166166
"path": "data/countries-aggregated.csv",
167167
"profile": "data-resource",
@@ -216,7 +216,7 @@
216216
},
217217
"encoding": "utf-8",
218218
"format": "csv",
219-
"hash": "03b4e91cb77173eb4df2e58ee597834c",
219+
"hash": "eebbf0e866ac4c78f9329779eee67e3e",
220220
"name": "worldwide-aggregated",
221221
"path": "data/worldwide-aggregated.csv",
222222
"profile": "data-resource",

‎scripts/process.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, update_schema, join, join_with_self, add_computed_field, delete_fields, checkpoint, duplicate, filter_rows
1+
from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path
2+
from dataflows import update_package, update_resource, update_schema, join
3+
from dataflows import join_with_self, add_computed_field, delete_fields
4+
from dataflows import checkpoint, duplicate, filter_rows, sort_rows, printer
25

36
BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
47
CONFIRMED = 'time_series_covid19_confirmed_global.csv'
@@ -76,6 +79,23 @@ def process_rows(rows):
7679
yield row
7780
yield process_rows(worldwide_data)
7881

82+
def add_missing_columns(rows):
    """Yield every row padded out to the full set of expected columns.

    Used right after the 'full-outer' join, where rows do not all carry
    the same keys. Each yielded row is a new dict that contains every
    expected column (``None`` when the incoming row has no value for it)
    followed by any additional keys the incoming row had.

    Canada rows without a ``Province/State`` are labelled
    ``'Recovery aggregated'`` and receive default coordinates when they
    have none (presumably the country's centroid -- confirm against the
    upstream data).

    Args:
        rows: iterable of dict-like rows.

    Yields:
        dict: a padded copy of each row; the input row is left untouched.
    """
    defaults = {
        'Date': None,
        'Province/State': None,
        'Country/Region': None,
        'Lat': None,
        'Long': None,
        'Case': None,
        'Confirmed': None,
        'Recovered': None,
    }
    for row in rows:
        # Merge first so every expected key exists, and so the caller's
        # row object is never modified.
        merged = {**defaults, **row}
        if merged['Country/Region'] == 'Canada' and not merged['Province/State']:
            merged['Province/State'] = 'Recovery aggregated'
            # Treat an explicit None the same as a missing key: a
            # coordinate that is present but null must still be filled in.
            if merged['Lat'] is None:
                merged['Lat'] = '56.1304'
            if merged['Long'] is None:
                merged['Long'] = '-106.3468'
        yield merged
7999

80100
Flow(
81101
load(f'{BASE_URL}{CONFIRMED}'),
@@ -107,8 +127,12 @@ def process_rows(rows):
107127
fields=dict(Recovered={
108128
'name': 'Case',
109129
'aggregate': 'first'
110-
})
130+
}),
131+
mode='full-outer'
111132
),
133+
# Add missing columns because, after the 'full-outer' join, the row structure
134+
# is inconsistent
135+
add_missing_columns,
112136
add_computed_field(
113137
target={'name': 'Deaths', 'type': 'number'},
114138
operation='format',
@@ -169,6 +193,8 @@ def process_rows(rows):
169193
}
170194
]),
171195
checkpoint('processed_data'),
196+
# Sort rows by country, province and date
197+
sort_rows('{Country/Region}{Province/State}{Date}', resources='time-series-19-covid-combined'),
172198
# Duplicate the stream to create aggregated data
173199
duplicate(
174200
source='time-series-19-covid-combined',

0 commit comments

Comments
 (0)
Please sign in to comment.