vp-22-geonames-postgres.sh 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. #!/bin/bash
  2. # This script assumes that:
  3. # - postgis is installed in public schema
  4. # - command "CREATE EXTENSION postgis;" has been issued
  5. # - command "CREATE EXTENSION postgis_topology;" has been issued
  6. # - command "CREATE EXTENSION pg_trgm;" has been issued
  7. WORKPATH="/tmp/geonames.work"
  8. TMPPATH="tmp"
  9. PCPATH="pc"
  10. PREFIX="_"
  11. DBHOST="localhost"
  12. DBPORT="5432"
  13. DBUSER="postgres"
  14. DATABASE="geonames"
  15. SCHEMA="public"
  16. FILES="allCountries.zip alternateNames.zip userTags.zip admin1CodesASCII.txt admin2Codes.txt countryInfo.txt featureCodes_en.txt iso-languagecodes.txt timeZones.txt"
  17. DROP_TABLES="true"
  18. CREATE_TABLES="true"
  19. export PGOPTIONS="--search_path=${SCHEMA}"
  20. if [[ "$DROP_TABLES" == "true" ]]; then
  21. psql -U $DBUSER -h $DBHOST -p $DBPORT $DATABASE << EOF
  22. DROP TABLE IF EXISTS geoname CASCADE;
  23. DROP TABLE IF EXISTS alternatename;
  24. DROP TABLE IF EXISTS countryinfo;
  25. DROP TABLE IF EXISTS iso_languagecodes;
  26. DROP TABLE IF EXISTS admin1CodesAscii;
  27. DROP TABLE IF EXISTS admin2CodesAscii;
  28. DROP TABLE IF EXISTS featureCodes;
  29. DROP TABLE IF EXISTS timeZones;
  30. DROP TABLE IF EXISTS continentCodes;
  31. EOF
  32. fi
  33. if [[ "$CREATE_TABLES" == "true" ]]; then
  34. psql -U $DBUSER -h $DBHOST -p $DBPORT $DATABASE << EOF
  35. CREATE TABLE geoname (
  36. id INT,
  37. name TEXT,
  38. ascii_name TEXT,
  39. alternate_names TEXT,
  40. latitude FLOAT,
  41. longitude FLOAT,
  42. fclass CHAR(1),
  43. fcode CHAR(10),
  44. country CHAR(2),
  45. cc2 TEXT,
  46. admin1 TEXT,
  47. admin2 TEXT,
  48. admin3 TEXT,
  49. admin4 TEXT,
  50. population BIGINT,
  51. elevation INT,
  52. gtopo30 INT,
  53. timezone TEXT,
  54. modified_date DATE
  55. );
  56. CREATE TABLE alternatename (
  57. id INT,
  58. geoname_id INT,
  59. iso_lang TEXT,
  60. alternate_name TEXT,
  61. is_preferred_name BOOLEAN,
  62. is_short_name BOOLEAN,
  63. is_colloquial BOOLEAN,
  64. is_historic BOOLEAN
  65. );
  66. CREATE TABLE countryinfo (
  67. iso_alpha2 CHAR(2),
  68. iso_alpha3 CHAR(3),
  69. iso_numeric INTEGER,
  70. fips_code TEXT,
  71. country TEXT,
  72. capital TEXT,
  73. area DOUBLE PRECISION, -- square km
  74. population INTEGER,
  75. continent CHAR(2),
  76. tld TEXT,
  77. currency_code CHAR(3),
  78. currency_name TEXT,
  79. phone TEXT,
  80. postal TEXT,
  81. postal_regex TEXT,
  82. languages TEXT,
  83. geoname_id INT,
  84. neighbours TEXT,
  85. equivalent_fips_code TEXT
  86. );
  87. CREATE TABLE iso_languagecodes(
  88. iso_639_3 CHAR(4),
  89. iso_639_2 TEXT,
  90. iso_639_1 TEXT,
  91. language_name TEXT
  92. );
  93. CREATE TABLE admin1CodesAscii (
  94. code CHAR(20),
  95. name TEXT,
  96. name_ascii TEXT,
  97. geoname_id INT
  98. );
  99. CREATE TABLE admin2CodesAscii (
  100. code CHAR(80),
  101. name TEXT,
  102. name_ascii TEXT,
  103. geoname_id INT
  104. );
  105. CREATE TABLE featureCodes (
  106. code CHAR(7),
  107. name TEXT,
  108. description TEXT
  109. );
  110. CREATE TABLE timeZones (
  111. id TEXT,
  112. country_code TEXT,
  113. GMT_offset NUMERIC(3,1),
  114. DST_offset NUMERIC(3,1),
  115. raw_offset NUMERIC(3,1)
  116. );
  117. CREATE TABLE continentCodes (
  118. code CHAR(2),
  119. name TEXT,
  120. geoname_id INT
  121. );
  122. EOF
  123. fi
  124. # check if needed directories do already exsist
  125. if [ -d "$WORKPATH" ]; then
  126. echo "$WORKPATH exists..."
  127. sleep 0
  128. else
  129. echo "$WORKPATH and subdirectories will be created..."
  130. mkdir -p $WORKPATH/{$TMPPATH,$PCPATH}
  131. echo "created $WORKPATH"
  132. fi
  133. echo
  134. echo ",---- STARTING (downloading, unpacking and preparing)"
  135. cd $WORKPATH/$TMPPATH
  136. for i in $FILES
  137. do
  138. wget -N "http://download.geonames.org/export/dump/$i" # get newer files
  139. if [ $i -nt $PREFIX$i ] || [ ! -e $PREFIX$i ] ; then
  140. cp -p $i $PREFIX$i
  141. if [[ $i == *.zip ]]
  142. then
  143. unzip -u -q $i
  144. fi
  145. case "$i" in
  146. iso-languagecodes.txt)
  147. tail -n +2 iso-languagecodes.txt > iso-languagecodes.txt.tmp;
  148. ;;
  149. countryInfo.txt)
  150. grep -v '^#' countryInfo.txt > countryInfo.txt.tmp;
  151. ;;
  152. timeZones.txt)
  153. tail -n +2 timeZones.txt > timeZones.txt.tmp;
  154. ;;
  155. esac
  156. echo "| $i has been downloaded";
  157. else
  158. echo "| $i is already the latest version"
  159. fi
  160. done
  161. psql -e -U $DBUSER -h $DBHOST -p $DBPORT $DATABASE << EOF
  162. \copy geoname (id,name,ascii_name,alternate_names,latitude,\
  163. longitude,fclass,fcode,country,cc2,admin1,admin2,\
  164. admin3,admin4,population,elevation,gtopo30,\
  165. timezone,modified_date)\
  166. from '${WORKPATH}/${TMPPATH}/allCountries.txt' null as '';
  167. \copy timeZones (country_code,id,GMT_offset,DST_offset,raw_offset)\
  168. from '${WORKPATH}/${TMPPATH}/timeZones.txt.tmp' null as '';
  169. \copy featureCodes (code,name,description)\
  170. from '${WORKPATH}/${TMPPATH}/featureCodes_en.txt' null as '';
  171. \copy admin1CodesAscii (code,name,name_ascii,geoname_id)\
  172. from '${WORKPATH}/${TMPPATH}/admin1CodesASCII.txt' null as '';
  173. \copy admin2CodesAscii (code,name,name_ascii,geoname_id)\
  174. from '${WORKPATH}/${TMPPATH}/admin2Codes.txt' null as '';
  175. \copy iso_languagecodes (iso_639_3,iso_639_2,iso_639_1,language_name)\
  176. from '${WORKPATH}/${TMPPATH}/iso-languagecodes.txt.tmp' null as '';
  177. \copy countryInfo (iso_alpha2,iso_alpha3,iso_numeric,fips_code,country,\
  178. capital,area,population,continent,tld,currency_code,\
  179. currency_name,phone,postal,postal_regex,languages,\
  180. geoname_id,neighbours,equivalent_fips_code)\
  181. from '${WORKPATH}/${TMPPATH}/countryInfo.txt.tmp' null as '';
  182. \copy alternatename (id,geoname_id,iso_lang,alternate_name,\
  183. is_preferred_name,is_short_name,\
  184. is_colloquial,is_historic)\
  185. from '${WORKPATH}/${TMPPATH}/alternateNames.txt' null as '';
  186. INSERT INTO continentCodes VALUES ('AF', 'Africa', 6255146);
  187. INSERT INTO continentCodes VALUES ('AS', 'Asia', 6255147);
  188. INSERT INTO continentCodes VALUES ('EU', 'Europe', 6255148);
  189. INSERT INTO continentCodes VALUES ('NA', 'North America', 6255149);
  190. INSERT INTO continentCodes VALUES ('OC', 'Oceania', 6255150);
  191. INSERT INTO continentCodes VALUES ('SA', 'South America', 6255151);
  192. INSERT INTO continentCodes VALUES ('AN', 'Antarctica', 6255152);
  193. CREATE INDEX concurrently index_countryinfo_geonameid ON countryinfo (geoname_id);
  194. CREATE INDEX concurrently index_alternatename_geonameid ON alternatename (geoname_id);
  195. EOF
  196. psql -U $DBUSER -h $DBHOST -p $DBPORT $DATABASE << EOF
  197. ALTER TABLE ONLY alternatename
  198. ADD CONSTRAINT pk_alternatenameid PRIMARY KEY (id);
  199. ALTER TABLE ONLY geoname
  200. ADD CONSTRAINT pk_geonameid PRIMARY KEY (id);
  201. ALTER TABLE ONLY countryinfo
  202. ADD CONSTRAINT pk_iso_alpha2 PRIMARY KEY (iso_alpha2);
  203. ALTER TABLE ONLY countryinfo
  204. ADD CONSTRAINT fk_geonameid FOREIGN KEY (geoname_id)
  205. REFERENCES geoname(id);
  206. ALTER TABLE ONLY alternatename
  207. ADD CONSTRAINT fk_geonameid FOREIGN KEY (geoname_id)
  208. REFERENCES geoname(id);
  209. -- Should be ran after data import
  210. alter table geoname
  211. add search_vector tsvector;
  212. -- Run time arround: 16 minutes
  213. update geoname
  214. set search_vector = to_tsvector('english', concat_ws(' ', name, ascii_name, alternate_names))
  215. where true;
  216. -- Index for fast vector search
  217. -- Run time: 13m
  218. create index concurrently geoname_search_vector_idx
  219. on geoname using gin (search_vector);
  220. -- Index for similarity comparisons
  221. -- RUn time: 5 minutes
  222. CREATE INDEX concurrently trgm_idx ON geoname USING gin (ascii_name public.gin_trgm_ops);
  223. EOF
  224. echo "----- DONE ( have fun... )"