Corona and DIY statistics

 It is hard to know who to believe these days. Too many headlines with too little information that is relevant to me.

But you do not have to rely on CNN or Danmarks Radio to digest the information for you, we can quite easily get access to the data our-selves and extract the truth that we would like to see.

I am interested in how my area and my friends and family’s area are doing during this crisis, so I made a script that will extract that data for me.

John Hopskins University collects data regarding the sick and diseased from COVID-19, you can get an up-to-date overview from the link above.

The underlying data is available on github, you can find them here github.com/CSSEGISandData/COVID-19

The script will download the data for a given day, and produce an output like this one

rank provins                 sick    died died/sick population died/population  04-01-2020.csv
  0  Total                 932605   46809    5.0 %   7775.0     6 deaths/mill
  1  Italy                 110574   13155   11.9 %     60.5   218 deaths/mill
  2  Spain                 104118    9387    9.0 %     46.8   201 deaths/mill
  3  US                    213372    4757    2.2 %    331.0    14 deaths/mill
  4  France                 56989    4032    7.1 %     65.3    62 deaths/mill
  5  China                  82361    3316    4.0 %   1439.3     2 deaths/mill
  6  Iran                   47593    3036    6.4 %     84.0    36 deaths/mill
  7  United_Kingdom         29474    2352    8.0 %     67.9    35 deaths/mill
  8  New_York-US            83948    1941    2.3 %     19.5   100 deaths/mill
  9  Netherlands            13614    1173    8.6 %     17.1    68 deaths/mill
 10  Germany                77872     920    1.2 %     83.8    11 deaths/mill
 11  Belgium                13964     828    5.9 %     11.6    71 deaths/mill
 12  Switzerland            17768     488    2.7 %      8.7    56 deaths/mill
 13  New_Jersey-US          22255     355    1.6 %      8.9    40 deaths/mill
 14  Michigan-US             9315     335    3.6 %     10.0    34 deaths/mill
 15  Turkey                 15679     277    1.8 %     84.3     3 deaths/mill
 16  Louisiana-US            6424     273    4.2 %      4.6    59 deaths/mill
 17  Brazil                  6836     240    3.5 %    212.6     1 deaths/mill
 18  Sweden                  4947     239    4.8 %     10.1    24 deaths/mill
 19  Washington-US           5608     234    4.2 %      7.6    31 deaths/mill
 20  California-US           9399     199    2.1 %     39.5     5 deaths/mill
 27  Massachusetts-US        7738     122    1.6 %      6.9    18 deaths/mill
 28  Denmark                 3107     104    3.3 %      5.8    18 deaths/mill
 88  Norfolk-Massachusetts    829      15    1.8 %      0.7    21 deaths/mill

The script is below, you are welcome to adapt it to your own needs:

#!/bin/sh
# (C) Beerware by Storepeter 2020
# based on data from https://github.com/CSSEGISandData/COVID-19
#
usage()
{
    if [ $# != 0 ]; then
	echo ERROR:$*
    fi
    echo USAGE: top20.sh 04-01-2020.csv
    echo download and extract top 20 corona virus for at a given day
    exit
}

NTOP=20

URL_DIR=https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports
# FILE=03-21-2020.csv

# got two format:

# until 2020-03-22
# Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude
# Hubei,China,2020-03-13T11:09:03,67786,3062,51553,30.9756,112.2707

# from 2020-03-23
# FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
# 21001,Adair,Kentucky,US,2020-03-23 23:19:34,37.10459774,-85.28129668,0,0,0,0,"Adair, Kentucky, US"

# create a simplified record
# deaths : sick : provins_state : date
simplify()
{
if [ ! -f "$1" ]; then
    usage
fi
sed -e 's/\"\([a-zA-Z]*\), \(.*\)\",/\1_\2,/' $1 \
-e 's/,/|/g' \
-e 's/ /_/g' \
-e 's/...:..:..//' \
| awk -F\| '
{
    if (NF < 10) {
	provins_id=1
	land_id=2
	sick_id=4
	died_id=5
	date=$3
    } else {
	provins_id=3
	land_id=4
	sick_id=8
	died_id=9
	date=$5
	if (($2=="Norfolk") && ($3=="Massachusetts")) {
	    name = "Norfolk-" $provins_id
	    sick[name] += $sick_id
	    died[name] += $died_id
	}
    }
    if ($land_id == "US") {
	name = $land_id
	sick[name] += $sick_id
	died[name] += $died_id
    }
    if (($land_id == "Mainland_China") || ($land_id == "China")) {
	name = "China"
    } else if (($provins_id == $land_id) || ($provins_id == "")) {
	name = $land_id
    } else {
	name = $provins_id "-" $land_id
    }
    sick[name] += $sick_id
    died[name] += $died_id
    sick["Total"] += $sick_id
    died["Total"] += $died_id
}
END {
    for (n in sick) {
	printf "%d:%d:%s:%s\n", died[n], sick[n], n, date
    }
}' | sort -rn -t: > $2
}


do_stat()
{
if [ ! -f $1 ]; then
    wget $URL_DIR/$1
    if [ $? != 0 ]; then
	if [ ! -f $1.simple ]; then
		usage $1 not available
	fi
    else
	simplify $1 $1.simple;
    fi
fi

cat $1.simple | awk -F: ' BEGIN {
# https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population
# from https://www.worldometers.info/world-population/population-by-country/
    population["Total"] = 7775040500
    population["China"] = 1439323776
    population["US"] = 331002651
    population["Indonesia"] = 273523615
    population["Brazil"] = 212559417
    population["Japan"] = 126476461
    population["Philippines"] = 109581078
    population["Turkey"] = 84339067
    population["Iran"] = 83992949
    population["Germany"] = 83783942
    population["United_Kingdom"] = 67886011
    population["France"] = 65273511
    population["Italy"] = 60461826
    population["Korea_South"] = 51269185
    population["Spain"] = 46754778
    population["Iraq"] = 40222493
    population["California-US"] = 39512223
    population["New_York-US"] = 19453561
    population["Netherlands"] = 17134872
    population["Belgium"] = 11589623
    population["Sweden"] = 10099265
    population["Michigan-US"] = 9986857
    population["New_Jersey-US"] = 8882190
    population["Switzerland"] = 8654622
    population["Washington-US"] = 7614893
    population["Massachusetts-US"] = 6949503
    population["Denmark"] = 5792202
    population["Louisiana-US"] = 4648794
    population["Norfolk-Massachusetts"] = 705388
    population["San_Marino"] = 33931
    n=0
    print "rank provins                 sick    died died/sick population died/population  '$1'"
}
{
    died = $1
    sick = $2
    name = $3
    if ((n <= '$NTOP') || (name == "Denmark") || (name ~ /Massachusetts/)) {
	printf "%3d  %-21s %7d %7d", n, name, sick, died
	if (sick== 0)  {
	    printf "        "
	} else {
	    printf "  %5.1f %", 100 * died / sick
	}
	if (name in population) {
	    printf "   %6.1f  %5d deaths/million", population[name]/1000000, died * 1000000/ population[name] 
	}
	printf "\n"
    }
    n++
}'
}

if [ $# = 0 ]; then
    i=$(date -d "1 day ago" +%m-%d-%Y).csv
    rm -f $i
    do_stat $i;
else
    for i in $*;do
	do_stat $i;
    done
fi

The script started out as a oneliner, but have evolved a bit,

  • John Hopskins changed the format of the file between March 22 and March 23, hence I added the simplify() function to give me an intermediate format
  • added the population of the regions

My conclusion of these numbers are: We are lucky to live a in an area that has taken the same precautions as my birth-country, and that we are seeing the same results. which could be far worse.

This entry was posted in COVID-19. Bookmark the permalink.