graphs.R 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. library(ggplot2)
  2. library(plyr)
  3. library(scales)
  # Load the three input tables from the current working directory.

  # OONI report index. The columns named in colClasses are read with explicit
  # types; "--" is the only string treated as missing, so that a literal "NA"
  # survives as text.
  ooni.index <- read.csv(
    "ooni-index.csv.xz",
    colClasses = c(
      date = "POSIXct",
      cc = "character",
      asn = "integer",
      test_name = "character"
    ),
    na.strings = "--" # "NA" is Namibia.
  )

  # Country code table, read as plain strings (no factors).
  country.codes <- read.csv(
    "country-codes.csv",
    stringsAsFactors = FALSE,
    na.strings = "--" # "NA" is North America.
  )

  # Human Development Index table, read as plain strings (no factors).
  hdi <- read.csv("hdi.csv", stringsAsFactors = FALSE)
  # We need to merge the country.codes and hdi data sets. country.codes has
  # ISO 3166 two-letter codes alongside spelled-out country names, but hdi only
  # has the spelled-out names, and those names are not always spelled the same
  # way in the two data sets. So we build a mapping from hdi's country names to
  # two-letter codes: most rows are resolved by matching hdi$country against
  # country.codes$official_name_en, and the countries that remain unmatched
  # are filled in by hand below.
  hdi.country.to.code <- merge(
    hdi, country.codes,
    by.x = "country", by.y = "official_name_en"
  )[, c("country", "ISO3166.1.Alpha.2")]

  # unique() drops exact duplicate rows, so a manual entry that the automatic
  # match already produced cannot multiply rows in the merges below.
  hdi.country.to.code <- unique(rbind(
    hdi.country.to.code,
    c("Congo (Democratic Republic of the)", "CD"),
    c("Czech Republic", "CZ"),
    c("Hong Kong, China (SAR)", "HK"),
    c("Korea (Republic of)", "KR"),
    c("Moldova (Republic of)", "MD"),
    c("Palestine, State of", "PS"),
    c("Tanzania (United Republic of)", "TZ"),
    c("United Kingdom", "GB"),
    c("United States", "US")
  ))

  # Attach the two-letter code to every hdi row (NA where no mapping exists).
  hdi <- merge(hdi, hdi.country.to.code, by = "country", all.x = TRUE)

  # Full outer join: keep countries without an HDI and HDI rows without a code.
  country.info <- merge(country.codes, hdi, by = "ISO3166.1.Alpha.2", all = TRUE)

  # Annotate every report with its country's information; reports whose cc has
  # no match are kept with NA in the country columns.
  # NOTE(review): merge() appears to match NA keys to each other by default, so
  # reports with an NA cc could pair with every country.info row whose code is
  # NA -- confirm whether that can happen with the real data.
  data <- merge(
    ooni.index, country.info,
    by.x = "cc", by.y = "ISO3166.1.Alpha.2",
    all.x = TRUE
  )

  # Give continents readable labels. exclude=NULL keeps NA as a real level so
  # that rows without a continent are shown as "??" rather than dropped.
  data$Continent <- factor(
    data$Continent,
    levels = c("AF", "NA", "SA", "AN", "AS", "EU", "OC", NA),
    labels = c(
      "Africa", "N.America", "S.America", "Antarctica",
      "Asia", "Europe", "Oceania", "??"
    ),
    exclude = NULL
  )
  # Sanity check for the name-based country mapping: print each distinct
  # (cc, name, official_name_en) combination among the rows whose hdi value is
  # NA, as tabulated by count(). A wider console keeps the table on one line
  # per row.
  options(width = 120)
  print("Countries missing an HDI:")
  print(count(
    data[which(is.na(data$hdi)), c("cc", "name", "official_name_en")]
  ))
  # Bar chart of the number of reports per day.
  #
  # data:  data frame with a `date` column; it is converted with as.Date() and
  #        one bar is drawn per resulting day.
  # title: plot title.
  #
  # Returns the ggplot object (not printed here).
  plot_num_reports_by_date <- function(data, title) {
    ggplot(data) +
      geom_bar(aes(as.Date(date))) +
      scale_x_date(date_breaks = "1 year", date_minor_breaks = "1 month") +
      # Honor the caller's title; it used to be hard-coded, which silently
      # ignored the `title` argument.
      labs(title = title, x = "date", y = "number of reports") +
      theme_bw()
  }
  # Bar chart of the number of reports per day, one stacked panel per
  # continent.
  #
  # data:  data frame with a `date` column (converted with as.Date()) and a
  #        `Continent` column used to split the panels.
  # title: plot title.
  #
  # Returns the ggplot object (not printed here).
  plot_num_reports_by_date_and_continent <- function(data, title) {
    ggplot(data) +
      geom_bar(aes(as.Date(date))) +
      scale_x_date(date_breaks = "1 year", date_minor_breaks = "1 month") +
      facet_grid(Continent ~ .) +
      labs(title = title, x = "date", y = "number of reports") +
      theme_bw()
  }
  # Horizontal bar chart of the number of reports per test name, with the bars
  # ordered by how many reports each test name has.
  #
  # data:  data frame with a `test_name` column.
  # title: plot title.
  #
  # Returns the ggplot object (not printed here).
  plot_num_reports_by_test_name <- function(data, title) {
    ggplot(data) +
      # Reorder the test names by their own frequency.
      geom_bar(aes(reorder(test_name, test_name, FUN = length))) +
      coord_flip() +
      labs(title = title, x = NULL, y = "number of reports") +
      theme_bw()
  }
  # Plot one country-code label per country: continent on the x axis, number
  # of reports on a log10 y axis.
  #
  # data:  data frame with `cc` and `Continent` columns; rows are counted per
  #        cc group.
  # title: plot title.
  #
  # Returns the ggplot object (not printed here).
  plot_num_reports_by_country <- function(data, title) {
    ggplot(data) +
      geom_text(
        aes(x = Continent, group = cc, label = cc),
        stat = "count",
        alpha = 0.8,
        size = 2.5,
        # Spread labels sideways only. Leaving `height` unset lets
        # position_jitter() also move labels along the count axis, which
        # misstates the value being plotted.
        position = position_jitter(width = 0.25, height = 0)
      ) +
      scale_y_continuous(
        trans = "log10",
        breaks = trans_breaks("log10", function(x) 10^x),
        labels = comma
      ) +
      labs(title = title, x = "continent", y = "number of reports") +
      theme_bw()
  }
  # Plot one country-code label per country: HDI on the x axis, number of
  # reports on a log10 y axis, label color by continent.
  #
  # data:  data frame with `hdi`, `cc` and `Continent` columns; rows are
  #        counted per cc group.
  # title: plot title.
  #
  # Returns the ggplot object (not printed here).
  plot_num_reports_by_hdi <- function(data, title) {
    ggplot(data) +
      geom_text(
        aes(x = hdi, group = cc, label = cc, color = Continent),
        stat = "count",
        size = 2.5
      ) +
      scale_y_continuous(
        trans = "log10",
        breaks = trans_breaks("log10", function(x) 10^x),
        labels = comma
      ) +
      scale_color_brewer(palette = "Set2") +
      labs(
        title = title,
        x = "Human Development Index 2015",
        y = "number of reports"
      ) +
      # Draw the legend keys opaque and larger than the plotted labels.
      guides(color = guide_legend(override.aes = list(alpha = 1, size = 4))) +
      theme_bw()
  }
  # Render every figure. Each plot call is a top-level expression, so the
  # returned ggplot object is auto-printed when the script is run at top level
  # (for example with Rscript).

  # Rows of `reports` dated on or after `cutoff`, a string that as.POSIXct()
  # can parse. which() drops rows whose date is NA; plain logical indexing
  # would turn each of them into an all-NA row that then shows up in the plots.
  reports_since <- function(reports, cutoff) {
    reports[which(reports$date >= as.POSIXct(cutoff)), ]
  }

  plot_num_reports_by_date(
    reports_since(data, "2015-08-28"),
    "Number of OONI reports per day"
  )
  # The y axis is clipped to 0..1250 for this figure only.
  plot_num_reports_by_date_and_continent(
    reports_since(data, "2015-08-28"),
    "Number of OONI reports per day, by continent"
  ) + coord_cartesian(ylim = c(0, 1250))

  plot_num_reports_by_test_name(
    data,
    "Number of OONI reports by test name"
  )
  plot_num_reports_by_test_name(
    reports_since(data, "2018-01-01"),
    "Number of OONI reports by test name (since 2018-01-01)"
  )

  plot_num_reports_by_country(
    data,
    "Number of OONI reports by country and continent"
  )
  plot_num_reports_by_country(
    reports_since(data, "2018-01-01"),
    "Number of OONI reports by country and continent (since 2018-01-01)"
  )

  plot_num_reports_by_hdi(
    data,
    "Number of OONI reports by Human Development Index"
  )
  plot_num_reports_by_hdi(
    reports_since(data, "2018-01-01"),
    "Number of OONI reports by Human Development Index (since 2018-01-01)"
  )