# webscraping.R
  # Scrapes http://minoan.deaditerranean.com/, which hosts the Linear B corpus.
  library(rvest)

  # Single-page examples for interactive experimentation. They stay commented
  # out because the scraping loop further down assigns `website` itself before
  # reading it, so fetching a page here would only cost an extra HTTP request.
  # `read_html()` is used instead of `html()`: rvest deprecated `html()` in
  # favour of `read_html()`, and current rvest releases no longer provide it.
  # website <- read_html("http://minoan.deaditerranean.com/linear-b-transliterations/knossos/kn-a/kn-ag/")
  # website <- read_html("http://minoan.deaditerranean.com/linear-b-transliterations/knossos/kn-o/kn-od/")
  # Knossos tablets ----
  # Page slugs of the sub-series, one vector per top-level series. The vectors
  # for kn-e, kn-k and kn-pp are empty; for those three the top-level slug
  # itself is placed in `urls` below instead of a "series/sub-series" path.
  kna <- c("kn-ag", "kn-ai", "kn-am", "kn-ap", "kn-as")
  knb <- c("kn-b")
  knc <- c("kn-c-2")
  knd <- c("kn-d")
  kne <- c()
  knf <- c("kn-fh", "kn-fp", "kn-fs", "kn-f")
  kng <- c("kn-ga", "kn-gg", "kn-gm", "kn-gv", "kn-g")
  knk <- c()
  knl <- c("kn-lc", "kn-ld", "kn-le", "kn-ln", "kn-l")
  knm <- c("kn-mc", "kn-m")
  knn <- c("kn-nc", "kn-np")
  kno <- c("kn-oa", "kn-od", "kn-og")
  knpp <- c()
  knr <- c("kn-ra", "kn-r")
  kns <- c(
    "kn-sc", "kn-sd", "kn-se", "kn-sf",
    "kn-sg", "kn-sk", "kn-so", "kn-sp"
  )
  knu <- c("kn-uc", "kn-uf", "kn-u")
  knv <- c("kn-vc", "kn-vd", "kn-v")
  knw <- c("kn-wb", "kn-wm", "kn-wn", "kn-ws")
  knx <- c("kn-xd", "kn-xe", "kn-xf", "kn-x")

  # Top-level series directories, for reference:
  #   kn-a, kn-b-2, kn-c-3, kn-d-2, kn-e, kn-f, kn-g, kn-k, kn-l, kn-m,
  #   kn-n, kn-o, kn-pp, kn-r-2, kn-s, kn-u, kn-v, kn-w, kn-x
  # Relative paths (below the "knossos/" directory) of every page to scrape,
  # in series order.
  urls <- c(
    paste0("kn-a/", kna),
    paste0("kn-b-2/", knb),
    paste0("kn-c-3/", knc),
    paste0("kn-d-2/", knd),
    "kn-e",
    paste0("kn-f/", knf),
    paste0("kn-g/", kng),
    "kn-k",
    paste0("kn-l/", knl),
    paste0("kn-m/", knm),
    paste0("kn-n/", knn),
    paste0("kn-o/", kno),
    "kn-pp",
    paste0("kn-r-2/", knr),
    paste0("kn-s/", kns),
    paste0("kn-u/", knu),
    paste0("kn-v/", knv),
    paste0("kn-w/", knw),
    paste0("kn-x/", knx)
  )
  # Visit every page listed in `urls`, collect the matching paragraph texts
  # and write them to data.csv.
  base_url <- "http://minoan.deaditerranean.com/linear-b-transliterations/knossos/"

  # One slot per page, flattened once after the loop, so the result vector is
  # not re-copied for every appended paragraph.
  page_texts <- vector("list", length(urls))
  for (i in seq_along(urls)) {
    fullurl <- paste0(base_url, urls[[i]], "/")
    print(fullurl) # progress output: the page being fetched

    # `read_html()` is used instead of `html()`: rvest deprecated `html()` in
    # favour of `read_html()`, and current rvest releases no longer provide it.
    website <- read_html(fullurl)

    # Text content of every <p> element on the page.
    raw_data <- website %>%
      html_nodes("p") %>%
      html_text()

    # Keep only paragraphs that contain a hyphen immediately followed by a
    # lowercase letter; presumably this is what distinguishes tablet
    # transliterations from other paragraphs on the page (TODO confirm).
    page_texts[[i]] <- raw_data[grepl("-[a-z]", raw_data)]
  }

  # All kept paragraphs, in page order and then document order.
  tablets <- unlist(page_texts)
  write.csv(tablets, file = "data.csv")