It is hard to know who to believe these days. Too many headlines with too little information that is relevant to me.
But you do not have to rely on CNN or Danmarks Radio to digest the information for you, we can quite easily get access to the data our-selves and extract the truth that we would like to see.
I am interested in how my area and my friends and family’s area are doing during this crisis, so I made a script that will extract that data for me.
John Hopskins University collects data regarding the sick and diseased from COVID-19, you can get an up-to-date overview from the link above.
The underlying data is available on github, you can find them here github.com/CSSEGISandData/COVID-19
The script will download the data for a given day, and produce an output like this one
rank provins sick died died/sick population died/population 04-01-2020.csv 0 Total 932605 46809 5.0 % 7775.0 6 deaths/mill 1 Italy 110574 13155 11.9 % 60.5 218 deaths/mill 2 Spain 104118 9387 9.0 % 46.8 201 deaths/mill 3 US 213372 4757 2.2 % 331.0 14 deaths/mill 4 France 56989 4032 7.1 % 65.3 62 deaths/mill 5 China 82361 3316 4.0 % 1439.3 2 deaths/mill 6 Iran 47593 3036 6.4 % 84.0 36 deaths/mill 7 United_Kingdom 29474 2352 8.0 % 67.9 35 deaths/mill 8 New_York-US 83948 1941 2.3 % 19.5 100 deaths/mill 9 Netherlands 13614 1173 8.6 % 17.1 68 deaths/mill 10 Germany 77872 920 1.2 % 83.8 11 deaths/mill 11 Belgium 13964 828 5.9 % 11.6 71 deaths/mill 12 Switzerland 17768 488 2.7 % 8.7 56 deaths/mill 13 New_Jersey-US 22255 355 1.6 % 8.9 40 deaths/mill 14 Michigan-US 9315 335 3.6 % 10.0 34 deaths/mill 15 Turkey 15679 277 1.8 % 84.3 3 deaths/mill 16 Louisiana-US 6424 273 4.2 % 4.6 59 deaths/mill 17 Brazil 6836 240 3.5 % 212.6 1 deaths/mill 18 Sweden 4947 239 4.8 % 10.1 24 deaths/mill 19 Washington-US 5608 234 4.2 % 7.6 31 deaths/mill 20 California-US 9399 199 2.1 % 39.5 5 deaths/mill 27 Massachusetts-US 7738 122 1.6 % 6.9 18 deaths/mill 28 Denmark 3107 104 3.3 % 5.8 18 deaths/mill 88 Norfolk-Massachusetts 829 15 1.8 % 0.7 21 deaths/mill
The script is below, you are welcome to adapt it to your own needs:
#!/bin/sh
# (C) Beerware by Storepeter 2020
# based on data from https://github.com/CSSEGISandData/COVID-19
#
usage()
{
if [ $# != 0 ]; then
echo ERROR:$*
fi
echo USAGE: top20.sh 04-01-2020.csv
echo download and extract top 20 corona virus for at a given day
exit
}
NTOP=20
URL_DIR=https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports
# FILE=03-21-2020.csv
# got two format:
# until 2020-03-22
# Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude
# Hubei,China,2020-03-13T11:09:03,67786,3062,51553,30.9756,112.2707
# from 2020-03-23
# FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
# 21001,Adair,Kentucky,US,2020-03-23 23:19:34,37.10459774,-85.28129668,0,0,0,0,"Adair, Kentucky, US"
# create a simplified record
# deaths : sick : provins_state : date
simplify()
{
if [ ! -f "$1" ]; then
usage
fi
sed -e 's/\"\([a-zA-Z]*\), \(.*\)\",/\1_\2,/' $1 \
-e 's/,/|/g' \
-e 's/ /_/g' \
-e 's/...:..:..//' \
| awk -F\| '
{
if (NF < 10) {
provins_id=1
land_id=2
sick_id=4
died_id=5
date=$3
} else {
provins_id=3
land_id=4
sick_id=8
died_id=9
date=$5
if (($2=="Norfolk") && ($3=="Massachusetts")) {
name = "Norfolk-" $provins_id
sick[name] += $sick_id
died[name] += $died_id
}
}
if ($land_id == "US") {
name = $land_id
sick[name] += $sick_id
died[name] += $died_id
}
if (($land_id == "Mainland_China") || ($land_id == "China")) {
name = "China"
} else if (($provins_id == $land_id) || ($provins_id == "")) {
name = $land_id
} else {
name = $provins_id "-" $land_id
}
sick[name] += $sick_id
died[name] += $died_id
sick["Total"] += $sick_id
died["Total"] += $died_id
}
END {
for (n in sick) {
printf "%d:%d:%s:%s\n", died[n], sick[n], n, date
}
}' | sort -rn -t: > $2
}
do_stat()
{
if [ ! -f $1 ]; then
wget $URL_DIR/$1
if [ $? != 0 ]; then
if [ ! -f $1.simple ]; then
usage $1 not available
fi
else
simplify $1 $1.simple;
fi
fi
cat $1.simple | awk -F: ' BEGIN {
# https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population
# from https://www.worldometers.info/world-population/population-by-country/
population["Total"] = 7775040500
population["China"] = 1439323776
population["US"] = 331002651
population["Indonesia"] = 273523615
population["Brazil"] = 212559417
population["Japan"] = 126476461
population["Philippines"] = 109581078
population["Turkey"] = 84339067
population["Iran"] = 83992949
population["Germany"] = 83783942
population["United_Kingdom"] = 67886011
population["France"] = 65273511
population["Italy"] = 60461826
population["Korea_South"] = 51269185
population["Spain"] = 46754778
population["Iraq"] = 40222493
population["California-US"] = 39512223
population["New_York-US"] = 19453561
population["Netherlands"] = 17134872
population["Belgium"] = 11589623
population["Sweden"] = 10099265
population["Michigan-US"] = 9986857
population["New_Jersey-US"] = 8882190
population["Switzerland"] = 8654622
population["Washington-US"] = 7614893
population["Massachusetts-US"] = 6949503
population["Denmark"] = 5792202
population["Louisiana-US"] = 4648794
population["Norfolk-Massachusetts"] = 705388
population["San_Marino"] = 33931
n=0
print "rank provins sick died died/sick population died/population '$1'"
}
{
died = $1
sick = $2
name = $3
if ((n <= '$NTOP') || (name == "Denmark") || (name ~ /Massachusetts/)) {
printf "%3d %-21s %7d %7d", n, name, sick, died
if (sick== 0) {
printf " "
} else {
printf " %5.1f %", 100 * died / sick
}
if (name in population) {
printf " %6.1f %5d deaths/million", population[name]/1000000, died * 1000000/ population[name]
}
printf "\n"
}
n++
}'
}
if [ $# = 0 ]; then
i=$(date -d "1 day ago" +%m-%d-%Y).csv
rm -f $i
do_stat $i;
else
for i in $*;do
do_stat $i;
done
fi
The script started out as a oneliner, but have evolved a bit,
- John Hopskins changed the format of the file between March 22 and March 23, hence I added the simplify() function to give me an intermediate format
- added the population of the regions
My conclusion of these numbers are: We are lucky to live a in an area that has taken the same precautions as my birth-country, and that we are seeing the same results. which could be far worse.
