Pierre Choffet | Git repositories - wmo_to_wikidata.git/commitdiff
From WMO servers to data validation
author Pierre Choffet <peuc@wanadoo.fr>
Thu, 28 Oct 2021 21:44:00 +0000 (17:44 -0400)
committer Pierre Choffet <peuc@wanadoo.fr>
Thu, 28 Oct 2021 21:44:00 +0000 (17:44 -0400)
First batch of the process, that does the following:
  - download stations metadata from WMO, and keep the local cache up to date
  - convert metadata into one of its XML equivalent
  - filter unwanted content and fix known mistakes
  - validate the resulting file

The global structure is in place but final operations aren't there yet: more filtering will have to be done in the future.
An incomplete README file has also been added.

README [new file with mode: 0644]
README.md [deleted file]
schemas/stations.xsd [new file with mode: 0644]
schemas/xml.xsd [new file with mode: 0644]
update.sh [new file with mode: 0755]
xslts/stations_clean.xslt [new file with mode: 0644]

diff --git a/README b/README
new file mode 100644 (file)
index 0000000..9538894
--- /dev/null
+++ b/README
@@ -0,0 +1,28 @@
+wmo_to_wikidata - Import World Meteorological Organization weather stations
+                  metadata into Wikidata.
+
+This repository contains a set of scripts that download, clean, verify, compare
+WMO stations metadata before importing it into Wikidata as needed.
+
+The original source code in this repository is sponsored by Wikimedia Canada.
+
+The following tools are required as dependencies - they should be available
+prepackaged for most GNU/Linux distros.
+
+  - Bash - https://www.gnu.org/software/bash/
+    Shell script interpreter
+
+  - Curl - https://curl.se/
+    Download WMO data.
+
+  - Xmlstarlet - http://xmlstar.sourceforge.net/
+    XSD and XSLT processor.
+
+  - Yq - https://kislyuk.github.io/yq/
+    Jq wrapper to convert WMO JSON into XML.
+
+
+The repository contains the following tools:
+  - update.sh
+    Ensure WMO stations cache is up to date, convert original JSON into XML,
+    clean and validate data.
diff --git a/README.md b/README.md
deleted file mode 100644 (file)
index e687faf..0000000
--- a/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# wikimedia-pilot
\ No newline at end of file
diff --git a/schemas/stations.xsd b/schemas/stations.xsd
new file mode 100644 (file)
index 0000000..6a4013b
--- /dev/null
+++ b/schemas/stations.xsd
@@ -0,0 +1,440 @@
+<?xml version="1.0" encoding="utf-8"?>
+
+<xsd:schema xmlns:xml="http://www.w3.org/XML/1998/namespace"
+            xmlns:xsd="http://www.w3.org/2001/XMLSchema"
+            elementFormDefault="qualified">
+<!-- stations.xsd - Validate XML representation of WMO stations metadata.
+     Copyright (C) 2021  Pierre Choffet
+
+     This program is free software: you can redistribute it and/or modify
+     it under the terms of version 3 of the GNU General Public License as
+     published by the Free Software Foundation.
+
+     This program is distributed in the hope that it will be useful,
+     but WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+     GNU General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with this program.  If not, see <http://www.gnu.org/licenses/>.
+     -->
+       <xsd:import namespace="http://www.w3.org/XML/1998/namespace" schemaLocation="xml.xsd" />
+       
+       <!-- Document root: a possibly empty list of <station> records. -->
+       <xsd:element name="stations">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element ref="station" minOccurs="0" maxOccurs="unbounded" />
+                       </xsd:sequence>
+               </xsd:complexType>
+               <!-- No two stations in the document may carry the same <id>.
+                    The constraint is declared here rather than on <station>
+                    because identity constraints only compare nodes selected
+                    beneath the element that declares them. -->
+               <xsd:unique name="stations-wmo-id">
+                       <xsd:selector xpath="station" />
+                       <xsd:field xpath="id" />
+               </xsd:unique>
+       </xsd:element>
+       
+       <!-- A single station record. Children must appear in exactly the order
+            listed below; entries marked minOccurs="0" may be omitted. -->
+       <xsd:element name="station">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element ref="id" />
+                               <xsd:element ref="name" />
+                               <xsd:element ref="region" minOccurs="0" />
+                               <xsd:element ref="territory" />
+                               <xsd:element ref="declaredStatus" />
+                               <!-- Coordinates are all-or-nothing: either latitude followed by
+                                    longitude, or neither of them. -->
+                               <xsd:sequence minOccurs="0" maxOccurs="1">
+                                       <xsd:element ref="latitude" />
+                                       <xsd:element ref="longitude" />
+                               </xsd:sequence>
+                               <xsd:element ref="elevation" minOccurs="0" />
+                               <xsd:element ref="stationTypeName" />
+                               <!-- Repeatable: each occurrence wraps one identifier together
+                                    with its "primary" flag. -->
+                               <xsd:element ref="wigosStationIdentifiers" minOccurs="0" maxOccurs="unbounded" />
+                               <xsd:element ref="wigosId" minOccurs="0" />
+                               <xsd:element ref="stationTypeId" />
+                               <xsd:element ref="dateEstablished" minOccurs="0" />
+                               <xsd:element ref="dateClosed" minOccurs="0" />
+                               <xsd:element ref="stationStatusCode" minOccurs="0" />
+                               <xsd:element ref="stationTypeCode" minOccurs="0" />
+                               <xsd:element ref="stationProgramsDeclaredStatuses" minOccurs="0" />
+                       </xsd:sequence>
+               </xsd:complexType>
+               
+               <!-- Scoped to one <station>: as a station holds exactly one <id>,
+                    this constraint cannot be violated at this level. Uniqueness
+                    across stations is only enforced by a constraint declared on
+                    an ancestor element. -->
+               <xsd:unique name="station-wmo-id">
+                       <xsd:selector xpath="id" />
+                       <xsd:field xpath="." />
+               </xsd:unique>
+               
+               <!-- Also scoped to one <station>: rejects the same WIGOS identifier
+                    being listed twice for that station. -->
+               <xsd:unique name="station-wigos-id">
+                       <xsd:selector xpath="wigosStationIdentifiers/wigosStationIdentifier" />
+                       <xsd:field xpath="." />
+               </xsd:unique>
+       </xsd:element>
+       
+       <!-- One WIGOS identifier entry: the identifier itself followed by its
+            "primary" flag. Despite the plural name, each occurrence holds a
+            single identifier; <station> may repeat this element. -->
+       <xsd:element name="wigosStationIdentifiers">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element ref="wigosStationIdentifier" />
+                               <xsd:element ref="primary" />
+                       </xsd:sequence>
+               </xsd:complexType>
+       </xsd:element>
+       
+       <xsd:element name="id" type="xsd:positiveInteger">
+               <xsd:annotation>
+                       <xsd:documentation>Station identifier: an integer strictly greater than zero.</xsd:documentation>
+               </xsd:annotation>
+       </xsd:element>
+       
+       <xsd:element name="name" type="xsd:normalizedString">
+               <xsd:annotation>
+                       <xsd:documentation>Station name as free text; tabs and line breaks are treated as spaces.</xsd:documentation>
+               </xsd:annotation>
+       </xsd:element>
+       
+       <!-- Region label, restricted to the seven literal values below.
+            Values are compared exactly (case and spacing included). -->
+       <xsd:element name="region">
+               <xsd:simpleType>
+                       <xsd:restriction base="xsd:normalizedString">
+                               <xsd:enumeration value="Africa" />
+                               <xsd:enumeration value="Antarctica" />
+                               <xsd:enumeration value="Asia" />
+                               <xsd:enumeration value="Europe" />
+                               <xsd:enumeration value="North America, Central America and the Caribbean" />
+                               <xsd:enumeration value="South America" />
+                               <xsd:enumeration value="South-West Pacific" />
+                       </xsd:restriction>
+               </xsd:simpleType>
+       </xsd:element>
+       
+       <!-- Territory label, restricted to the literal values below (two
+            placeholders followed by territory names in alphabetical order).
+            The base type is normalizedString, which does not trim or collapse
+            spaces, so a value must match one of these literals exactly. -->
+       <xsd:element name="territory">
+               <xsd:simpleType>
+                       <xsd:restriction base="xsd:normalizedString">
+                               <xsd:enumeration value="(inapplicable)" />
+                               <xsd:enumeration value="(unknown)" />
+                               <xsd:enumeration value="Afghanistan" />
+                               <xsd:enumeration value="Albania" />
+                               <xsd:enumeration value="Algeria" />
+                               <xsd:enumeration value="Andorra" />
+                               <xsd:enumeration value="Angola" />
+                               <xsd:enumeration value="Anguilla" />
+                               <xsd:enumeration value="Antigua and Barbuda" />
+                               <xsd:enumeration value="Argentina" />
+                               <xsd:enumeration value="Armenia" />
+                               <xsd:enumeration value="Australia" />
+                               <xsd:enumeration value="Austria" />
+                               <xsd:enumeration value="Azerbaijan" />
+                               <xsd:enumeration value="Bahamas" />
+                               <xsd:enumeration value="Bahrain" />
+                               <xsd:enumeration value="Bangladesh" />
+                               <xsd:enumeration value="Barbados" />
+                               <xsd:enumeration value="Belarus" />
+                               <xsd:enumeration value="Belgium" />
+                               <xsd:enumeration value="Belize" />
+                               <xsd:enumeration value="Benin" />
+                               <xsd:enumeration value="Bermuda" />
+                               <xsd:enumeration value="Bhutan" />
+                               <xsd:enumeration value="Bolivia, Plurinational State of" />
+                               <xsd:enumeration value="Bosnia and Herzegovina" />
+                               <xsd:enumeration value="Botswana" />
+                               <xsd:enumeration value="Brazil" />
+                               <xsd:enumeration value="British Virgin Islands" />
+                               <xsd:enumeration value="Brunei Darussalam" />
+                               <xsd:enumeration value="Bulgaria" />
+                               <xsd:enumeration value="Burkina Faso" />
+                               <xsd:enumeration value="Burundi" />
+                               <xsd:enumeration value="Cabo Verde" />
+                               <xsd:enumeration value="Cambodia" />
+                               <xsd:enumeration value="Cameroon" />
+                               <xsd:enumeration value="Canada" />
+                               <xsd:enumeration value="Cayman Islands" />
+                               <xsd:enumeration value="Central African Republic" />
+                               <xsd:enumeration value="Chad" />
+                               <xsd:enumeration value="Chile" />
+                               <xsd:enumeration value="China" />
+                               <xsd:enumeration value="Christmas Island" />
+                               <xsd:enumeration value="Cocos (Keeling) Islands" />
+                               <xsd:enumeration value="Colombia" />
+                               <xsd:enumeration value="Comoros" />
+                               <xsd:enumeration value="Congo" />
+                               <xsd:enumeration value="Congo, Democratic Republic of the" />
+                               <xsd:enumeration value="Cook Islands" />
+                               <xsd:enumeration value="Costa Rica" />
+                               <xsd:enumeration value="Côte d'Ivoire" />
+                               <xsd:enumeration value="Croatia" />
+                               <xsd:enumeration value="Cuba" />
+                               <xsd:enumeration value="Curacao" />
+                               <xsd:enumeration value="Cyprus" />
+                               <xsd:enumeration value="Czech Republic" />
+                               <xsd:enumeration value="Denmark" />
+                               <xsd:enumeration value="Djibouti" />
+                               <xsd:enumeration value="Dominica" />
+                               <xsd:enumeration value="Dominican Republic" />
+                               <xsd:enumeration value="Ecuador" />
+                               <xsd:enumeration value="Egypt" />
+                               <xsd:enumeration value="El Salvador" />
+                               <xsd:enumeration value="Equatorial Guinea" />
+                               <xsd:enumeration value="Eritrea" />
+                               <xsd:enumeration value="Estonia" />
+                               <xsd:enumeration value="Eswatini" />
+                               <xsd:enumeration value="Ethiopia" />
+                               <xsd:enumeration value="Falkland Islands (Malvinas)" />
+                               <xsd:enumeration value="Fiji" />
+                               <xsd:enumeration value="Finland" />
+                               <xsd:enumeration value="France" />
+                               <xsd:enumeration value="French Polynesia" />
+                               <xsd:enumeration value="Gabon" />
+                               <xsd:enumeration value="Gambia" />
+                               <xsd:enumeration value="Georgia" />
+                               <xsd:enumeration value="Germany" />
+                               <xsd:enumeration value="Ghana" />
+                               <xsd:enumeration value="Gibraltar" />
+                               <xsd:enumeration value="Greece" />
+                               <xsd:enumeration value="Greenland" />
+                               <xsd:enumeration value="Grenada" />
+                               <xsd:enumeration value="Guatemala" />
+                               <xsd:enumeration value="Guinea" />
+                               <xsd:enumeration value="Guinea-Bissau" />
+                               <xsd:enumeration value="Guyana" />
+                               <xsd:enumeration value="Haiti" />
+                               <xsd:enumeration value="Honduras" />
+                               <xsd:enumeration value="Hong Kong, China" />
+                               <xsd:enumeration value="Hungary" />
+                               <xsd:enumeration value="Iceland" />
+                               <xsd:enumeration value="India" />
+                               <xsd:enumeration value="Indonesia" />
+                               <xsd:enumeration value="Iran, Islamic Republic of" />
+                               <xsd:enumeration value="Iraq" />
+                               <xsd:enumeration value="Ireland" />
+                               <xsd:enumeration value="Israel" />
+                               <xsd:enumeration value="Italy" />
+                               <xsd:enumeration value="Jamaica" />
+                               <xsd:enumeration value="Japan" />
+                               <xsd:enumeration value="Jordan" />
+                               <xsd:enumeration value="Kazakhstan" />
+                               <xsd:enumeration value="Kenya" />
+                               <xsd:enumeration value="Kiribati" />
+                               <xsd:enumeration value="Korea, Democratic People's Republic of" />
+                               <xsd:enumeration value="Korea, Republic of" />
+                               <xsd:enumeration value="Kuwait" />
+                               <xsd:enumeration value="Kyrgyzstan" />
+                               <xsd:enumeration value="Lao People's Democratic Republic" />
+                               <xsd:enumeration value="Latvia" />
+                               <xsd:enumeration value="Lebanon" />
+                               <xsd:enumeration value="Lesotho" />
+                               <xsd:enumeration value="Liberia" />
+                               <xsd:enumeration value="Libya" />
+                               <xsd:enumeration value="Lithuania" />
+                               <xsd:enumeration value="Luxembourg" />
+                               <xsd:enumeration value="Macao, China" />
+                               <xsd:enumeration value="Madagascar" />
+                               <xsd:enumeration value="Malawi" />
+                               <xsd:enumeration value="Malaysia" />
+                               <xsd:enumeration value="Maldives" />
+                               <xsd:enumeration value="Mali" />
+                               <xsd:enumeration value="Malta" />
+                               <xsd:enumeration value="Marshall Islands" />
+                               <xsd:enumeration value="Mauritania" />
+                               <xsd:enumeration value="Mauritius" />
+                               <xsd:enumeration value="Mexico" />
+                               <xsd:enumeration value="Micronesia, Federated States of" />
+                               <xsd:enumeration value="Moldova, Republic of" />
+                               <xsd:enumeration value="Mongolia" />
+                               <xsd:enumeration value="Montenegro" />
+                               <xsd:enumeration value="Montserrat" />
+                               <xsd:enumeration value="Morocco" />
+                               <xsd:enumeration value="Mozambique" />
+                               <xsd:enumeration value="Myanmar" />
+                               <xsd:enumeration value="Namibia" />
+                               <xsd:enumeration value="Nauru" />
+                               <xsd:enumeration value="Nepal" />
+                               <xsd:enumeration value="Netherlands" />
+                               <xsd:enumeration value="New Caledonia" />
+                               <xsd:enumeration value="New Zealand" />
+                               <xsd:enumeration value="Nicaragua" />
+                               <xsd:enumeration value="Niger" />
+                               <xsd:enumeration value="Nigeria" />
+                               <xsd:enumeration value="Niue" />
+                               <xsd:enumeration value="North Macedonia, Republic of" />
+                               <xsd:enumeration value="Norway" />
+                               <xsd:enumeration value="Oman" />
+                               <xsd:enumeration value="Pakistan" />
+                               <xsd:enumeration value="Palau" />
+                               <xsd:enumeration value="Palestine, State of" />
+                               <xsd:enumeration value="Panama" />
+                               <xsd:enumeration value="Papua New Guinea" />
+                               <xsd:enumeration value="Paraguay" />
+                               <xsd:enumeration value="Peru" />
+                               <xsd:enumeration value="Philippines" />
+                               <xsd:enumeration value="Pitcairn" />
+                               <xsd:enumeration value="Poland" />
+                               <xsd:enumeration value="Portugal" />
+                               <xsd:enumeration value="Puerto Rico" />
+                               <xsd:enumeration value="Qatar" />
+                               <xsd:enumeration value="Romania" />
+                               <xsd:enumeration value="Russian Federation" />
+                               <xsd:enumeration value="Rwanda" />
+                               <xsd:enumeration value="Saint Helena" />
+                               <xsd:enumeration value="Saint Kitts and Nevis" />
+                               <xsd:enumeration value="Saint Lucia" />
+                               <xsd:enumeration value="Saint Pierre and Miquelon" />
+                               <xsd:enumeration value="Saint Vincent and Grenadines" />
+                               <xsd:enumeration value="Samoa" />
+                               <xsd:enumeration value="Sao Tome and Principe" />
+                               <xsd:enumeration value="Saudi Arabia" />
+                               <xsd:enumeration value="Senegal" />
+                               <xsd:enumeration value="Serbia" />
+                               <xsd:enumeration value="Seychelles" />
+                               <xsd:enumeration value="Sierra Leone" />
+                               <xsd:enumeration value="Singapore" />
+                               <xsd:enumeration value="Sint Maarten" />
+                               <xsd:enumeration value="Slovakia" />
+                               <xsd:enumeration value="Slovenia" />
+                               <xsd:enumeration value="Solomon Islands" />
+                               <xsd:enumeration value="Somalia" />
+                               <xsd:enumeration value="South Africa" />
+                               <xsd:enumeration value="South Sudan" />
+                               <xsd:enumeration value="Spain" />
+                               <xsd:enumeration value="Sri Lanka" />
+                               <xsd:enumeration value="Sudan" />
+                               <xsd:enumeration value="Suriname" />
+                               <xsd:enumeration value="Sweden" />
+                               <xsd:enumeration value="Switzerland" />
+                               <xsd:enumeration value="Syrian Arab Republic" />
+                               <xsd:enumeration value="Taiwan, Province of China" />
+                               <xsd:enumeration value="Tajikistan" />
+                               <xsd:enumeration value="Tanzania, United Republic of" />
+                               <xsd:enumeration value="Thailand" />
+                               <xsd:enumeration value="Timor-Leste" />
+                               <xsd:enumeration value="Togo" />
+                               <xsd:enumeration value="Tokelau" />
+                               <xsd:enumeration value="Tonga" />
+                               <xsd:enumeration value="Trinidad and Tobago" />
+                               <xsd:enumeration value="Tunisia" />
+                               <xsd:enumeration value="Turkey" />
+                               <xsd:enumeration value="Turkmenistan" />
+                               <!-- NOTE(review): the next value ends with a space, so
+                                    "Turks and Caicos Islands" without it is rejected.
+                                    Presumably mirrors the upstream data; confirm. -->
+                               <xsd:enumeration value="Turks and Caicos Islands " />
+                               <xsd:enumeration value="Tuvalu" />
+                               <xsd:enumeration value="Uganda" />
+                               <xsd:enumeration value="Ukraine" />
+                               <xsd:enumeration value="United Arab Emirates (the)" />
+                               <xsd:enumeration value="United Kingdom (the)" />
+                               <xsd:enumeration value="United States (the)" />
+                               <xsd:enumeration value="Uruguay" />
+                               <xsd:enumeration value="Uzbekistan" />
+                               <xsd:enumeration value="Vanuatu" />
+                               <xsd:enumeration value="Venezuela, Bolivarian Republic of" />
+                               <xsd:enumeration value="Viet Nam" />
+                               <xsd:enumeration value="Yemen" />
+                               <xsd:enumeration value="Zambia" />
+                               <xsd:enumeration value="Zimbabwe" />
+                       </xsd:restriction>
+               </xsd:simpleType>
+       </xsd:element>
+       
+       <!-- Declared station status as a capitalised label; one of the five
+            literal values below. stationStatusCode enumerates camelCase
+            counterparts of the same five values. -->
+       <xsd:element name="declaredStatus">
+               <xsd:simpleType>
+                       <xsd:restriction base="xsd:normalizedString">
+                               <xsd:enumeration value="Unknown" />
+                               <xsd:enumeration value="Closed" />
+                               <xsd:enumeration value="Operational" />
+                               <xsd:enumeration value="Partly operational" />
+                               <xsd:enumeration value="Silent" />
+                       </xsd:restriction>
+               </xsd:simpleType>
+       </xsd:element>
+       
+       <xsd:element name="latitude">
+               <xsd:annotation>
+                       <xsd:documentation>Floating-point latitude, accepted from -90 to 90 inclusive.</xsd:documentation>
+               </xsd:annotation>
+               <xsd:simpleType>
+                       <xsd:restriction base="xsd:float">
+                               <xsd:minInclusive value="-90" />
+                               <xsd:maxInclusive value="90" />
+                       </xsd:restriction>
+               </xsd:simpleType>
+       </xsd:element>
+       
+       <xsd:element name="longitude">
+               <xsd:annotation>
+                       <xsd:documentation>Floating-point longitude, accepted from -180 to 180 inclusive.</xsd:documentation>
+               </xsd:annotation>
+               <xsd:simpleType>
+                       <xsd:restriction base="xsd:float">
+                               <xsd:minInclusive value="-180" />
+                               <xsd:maxInclusive value="180" />
+                       </xsd:restriction>
+               </xsd:simpleType>
+       </xsd:element>
+       
+       <xsd:element name="elevation">
+               <xsd:annotation>
+                       <xsd:documentation>Floating-point elevation, accepted from -418 (lowest land level on Earth) to 8848 (Everest top) inclusive.</xsd:documentation>
+               </xsd:annotation>
+               <xsd:simpleType>
+                       <xsd:restriction base="xsd:float">
+                               <xsd:minInclusive value="-418" />
+                               <xsd:maxInclusive value="8848" />
+                       </xsd:restriction>
+               </xsd:simpleType>
+       </xsd:element>
+       
+       <!-- Station type label; one of the ten literal values below, compared
+            exactly. -->
+       <xsd:element name="stationTypeName">
+               <xsd:simpleType>
+                       <xsd:restriction base="xsd:normalizedString">
+                               <xsd:enumeration value="Air (fixed)" />
+                               <xsd:enumeration value="Air (mobile)" />
+                               <xsd:enumeration value="Lake/River (fixed)" />
+                               <xsd:enumeration value="Land (fixed)" />
+                               <xsd:enumeration value="Land (mobile)" />
+                               <!-- NOTE(review): the next value ends with a space, so
+                                    "Land (on ice)" without it is rejected. Presumably
+                                    mirrors the upstream data; confirm. -->
+                               <xsd:enumeration value="Land (on ice) " />
+                               <xsd:enumeration value="Sea (fixed)" />
+                               <xsd:enumeration value="Sea (mobile)" />
+                               <xsd:enumeration value="Sea (on ice)" />
+                               <xsd:enumeration value="Underwater (mobile)" />
+                       </xsd:restriction>
+               </xsd:simpleType>
+       </xsd:element>
+       
+       <!-- Station status as a camelCase code; one of the five literal values
+            below. declaredStatus enumerates capitalised counterparts of the
+            same five values. -->
+       <xsd:element name="stationStatusCode">
+               <xsd:simpleType>
+                       <xsd:restriction base="xsd:normalizedString">
+                               <xsd:enumeration value="unknown" />
+                               <xsd:enumeration value="closed" />
+                               <xsd:enumeration value="operational" />
+                               <xsd:enumeration value="partlyOperational" />
+                               <xsd:enumeration value="silent" />
+                       </xsd:restriction>
+               </xsd:simpleType>
+       </xsd:element>
+       
+       <xsd:element name="wigosStationIdentifier" type="wigos-id">
+               <xsd:annotation>
+                       <xsd:documentation>Identifier carried inside a wigosStationIdentifiers entry; its format is enforced by the wigos-id type.</xsd:documentation>
+               </xsd:annotation>
+       </xsd:element>
+       
+       <xsd:element name="wigosId" type="wigos-id">
+               <xsd:annotation>
+                       <xsd:documentation>Identifier placed directly under a station; its format is enforced by the wigos-id type.</xsd:documentation>
+               </xsd:annotation>
+       </xsd:element>
+       
+       <xsd:element name="dateEstablished" type="xsd:dateTime">
+               <xsd:annotation>
+                       <xsd:documentation>Establishment timestamp in xsd:dateTime form, for example 2021-10-28T21:44:00Z.</xsd:documentation>
+               </xsd:annotation>
+       </xsd:element>
+       
+       <xsd:element name="dateClosed" type="xsd:dateTime">
+               <xsd:annotation>
+                       <xsd:documentation>Closure timestamp in xsd:dateTime form, for example 2021-10-28T21:44:00Z.</xsd:documentation>
+               </xsd:annotation>
+       </xsd:element>
+       
+       <!-- Numeric station type code; only the ten values below are accepted
+            (1 to 6 and 8 to 11; 7 is not among them). -->
+       <xsd:element name="stationTypeId">
+               <xsd:simpleType>
+                       <xsd:restriction base="xsd:positiveInteger">
+                               <xsd:enumeration value="1" />
+                               <xsd:enumeration value="2" />
+                               <xsd:enumeration value="3" />
+                               <xsd:enumeration value="4" />
+                               <xsd:enumeration value="5" />
+                               <xsd:enumeration value="6" />
+                               <xsd:enumeration value="8" />
+                               <xsd:enumeration value="9" />
+                               <xsd:enumeration value="10" />
+                               <xsd:enumeration value="11" />
+                       </xsd:restriction>
+               </xsd:simpleType>
+       </xsd:element>
+       
+       <!-- Station type as a camelCase code; one of the four literal values
+            below. -->
+       <xsd:element name="stationTypeCode">
+               <xsd:simpleType>
+                       <xsd:restriction base="xsd:normalizedString">
+                               <xsd:enumeration value="air" />
+                               <xsd:enumeration value="lakeRiver" />
+                               <xsd:enumeration value="landOceanSurface" />
+                               <xsd:enumeration value="subSurface" />
+                       </xsd:restriction>
+               </xsd:simpleType>
+       </xsd:element>
+       
+       <xsd:element name="stationProgramsDeclaredStatuses" type="xsd:normalizedString">
+               <xsd:annotation>
+                       <xsd:documentation>Free text; this schema does not validate any internal structure of the value.</xsd:documentation>
+               </xsd:annotation>
+       </xsd:element>
+       
+       <!-- Flag attached to a WIGOS identifier entry. Only the literal strings
+            "true" and "false" are accepted: the base type is normalizedString,
+            not xsd:boolean, so "1" and "0" are rejected. -->
+       <xsd:element name="primary">
+               <xsd:simpleType>
+                       <xsd:restriction base="xsd:normalizedString">
+                               <xsd:enumeration value="true" />
+                               <xsd:enumeration value="false" />
+                       </xsd:restriction>
+               </xsd:simpleType>
+       </xsd:element>
+       
+       <!-- Types -->
+       <!-- Shared format of wigosId and wigosStationIdentifier: dash-separated
+            fields, as spelled out by the pattern:
+              1. a number from 0 to 14;
+              2. two numbers from 0 to 65534 each, without leading zeros;
+              3. 1 to 16 characters taken from 0-9 and A-Z (uppercase only);
+              4. optionally, one more dash followed by characters from the
+                 same set.
+            NOTE(review): presumably fields 1 to 3 are the WIGOS series,
+            issuer, issue number and local identifier; confirm against the
+            WMO specification. -->
+       <xsd:simpleType name="wigos-id">
+               <xsd:restriction base="xsd:normalizedString">
+                       <xsd:pattern value="([0-9]|1[0-4])-(([0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-4])-){2}[0-9A-Z]{1,16}(-[0-9A-Z]+)?" />
+               </xsd:restriction>
+       </xsd:simpleType>
+</xsd:schema>
diff --git a/schemas/xml.xsd b/schemas/xml.xsd
new file mode 100644 (file)
index 0000000..aea7d0d
--- /dev/null
+++ b/schemas/xml.xsd
@@ -0,0 +1,287 @@
+<?xml version='1.0'?>
+<?xml-stylesheet href="../2008/09/xsd.xsl" type="text/xsl"?>
+<xs:schema targetNamespace="http://www.w3.org/XML/1998/namespace" 
+  xmlns:xs="http://www.w3.org/2001/XMLSchema" 
+  xmlns   ="http://www.w3.org/1999/xhtml"
+  xml:lang="en">
+
+ <xs:annotation>
+  <xs:documentation>
+   <div>
+    <h1>About the XML namespace</h1>
+
+    <div class="bodytext">
+     <p>
+      This schema document describes the XML namespace, in a form
+      suitable for import by other schema documents.
+     </p>
+     <p>
+      See <a href="http://www.w3.org/XML/1998/namespace.html">
+      http://www.w3.org/XML/1998/namespace.html</a> and
+      <a href="http://www.w3.org/TR/REC-xml">
+      http://www.w3.org/TR/REC-xml</a> for information 
+      about this namespace.
+     </p>
+     <p>
+      Note that local names in this namespace are intended to be
+      defined only by the World Wide Web Consortium or its subgroups.
+      The names currently defined in this namespace are listed below.
+      They should not be used with conflicting semantics by any Working
+      Group, specification, or document instance.
+     </p>
+     <p>   
+      See further below in this document for more information about <a
+      href="#usage">how to refer to this schema document from your own
+      XSD schema documents</a> and about <a href="#nsversioning">the
+      namespace-versioning policy governing this schema document</a>.
+     </p>
+    </div>
+   </div>
+  </xs:documentation>
+ </xs:annotation>
+
+ <!-- xml:lang: typed as the union of xs:language and the empty string. -->
+ <xs:attribute name="lang">
+  <xs:annotation>
+   <xs:documentation>
+    <div>
+     
+      <h3>lang (as an attribute name)</h3>
+      <p>
+       denotes an attribute whose value
+       is a language code for the natural language of the content of
+       any element; its value is inherited.  This name is reserved
+       by virtue of its definition in the XML specification.</p>
+     
+    </div>
+    <div>
+     <h4>Notes</h4>
+     <p>
+      Attempting to install the relevant ISO 2- and 3-letter
+      codes as the enumerated possible values is probably never
+      going to be a realistic possibility.  
+     </p>
+     <p>
+      See BCP 47 at <a href="http://www.rfc-editor.org/rfc/bcp/bcp47.txt">
+       http://www.rfc-editor.org/rfc/bcp/bcp47.txt</a>
+      and the IANA language subtag registry at
+      <a href="http://www.iana.org/assignments/language-subtag-registry">
+       http://www.iana.org/assignments/language-subtag-registry</a>
+      for further information.
+     </p>
+     <p>
+      The union allows for the 'un-declaration' of xml:lang with
+      the empty string.
+     </p>
+    </div>
+   </xs:documentation>
+  </xs:annotation>
+  <xs:simpleType>
+   <xs:union memberTypes="xs:language">
+    <xs:simpleType>    
+     <xs:restriction base="xs:string">
+      <xs:enumeration value=""/>
+     </xs:restriction>
+    </xs:simpleType>
+   </xs:union>
+  </xs:simpleType>
+ </xs:attribute>
+
+ <!-- xml:space: an xs:NCName restricted to "default" or "preserve". -->
+ <xs:attribute name="space">
+  <xs:annotation>
+   <xs:documentation>
+    <div>
+     
+      <h3>space (as an attribute name)</h3>
+      <p>
+       denotes an attribute whose
+       value is a keyword indicating what whitespace processing
+       discipline is intended for the content of the element; its
+       value is inherited.  This name is reserved by virtue of its
+       definition in the XML specification.</p>
+     
+    </div>
+   </xs:documentation>
+  </xs:annotation>
+  <xs:simpleType>
+   <xs:restriction base="xs:NCName">
+    <xs:enumeration value="default"/>
+    <xs:enumeration value="preserve"/>
+   </xs:restriction>
+  </xs:simpleType>
+ </xs:attribute>
+ <!-- xml:base: declared with the built-in type xs:anyURI. -->
+ <xs:attribute name="base" type="xs:anyURI"> <xs:annotation>
+   <xs:documentation>
+    <div>
+     
+      <h3>base (as an attribute name)</h3>
+      <p>
+       denotes an attribute whose value
+       provides a URI to be used as the base for interpreting any
+       relative URIs in the scope of the element on which it
+       appears; its value is inherited.  This name is reserved
+       by virtue of its definition in the XML Base specification.</p>
+     
+     <p>
+      See <a
+      href="http://www.w3.org/TR/xmlbase/">http://www.w3.org/TR/xmlbase/</a>
+      for information about this attribute.
+     </p>
+    </div>
+   </xs:documentation>
+  </xs:annotation>
+ </xs:attribute>
+ <!-- xml:id: declared with the built-in type xs:ID. -->
+ <xs:attribute name="id" type="xs:ID">
+  <xs:annotation>
+   <xs:documentation>
+    <div>
+     
+      <h3>id (as an attribute name)</h3> 
+      <p>
+       denotes an attribute whose value
+       should be interpreted as if declared to be of type ID.
+       This name is reserved by virtue of its definition in the
+       xml:id specification.</p>
+     
+     <p>
+      See <a
+      href="http://www.w3.org/TR/xml-id/">http://www.w3.org/TR/xml-id/</a>
+      for information about this attribute.
+     </p>
+    </div>
+   </xs:documentation>
+  </xs:annotation>
+ </xs:attribute>
+
+ <!-- specialAttrs: groups references to the four attributes declared above
+      (xml:base, xml:lang, xml:space, xml:id) under a single name. -->
+ <xs:attributeGroup name="specialAttrs">
+  <xs:attribute ref="xml:base"/>
+  <xs:attribute ref="xml:lang"/>
+  <xs:attribute ref="xml:space"/>
+  <xs:attribute ref="xml:id"/>
+ </xs:attributeGroup>
+
+ <xs:annotation>
+  <xs:documentation>
+   <div>
+   
+    <h3>Father (in any context at all)</h3> 
+
+    <div class="bodytext">
+     <p>
+      denotes Jon Bosak, the chair of 
+      the original XML Working Group.  This name is reserved by 
+      the following decision of the W3C XML Plenary and 
+      XML Coordination groups:
+     </p>
+     <blockquote>
+       <p>
+       In appreciation for his vision, leadership and
+       dedication the W3C XML Plenary on this 10th day of
+       February, 2000, reserves for Jon Bosak in perpetuity
+       the XML name "xml:Father".
+       </p>
+     </blockquote>
+    </div>
+   </div>
+  </xs:documentation>
+ </xs:annotation>
+
+ <xs:annotation>
+  <xs:documentation>
+   <div xml:id="usage" id="usage">
+    <h2><a name="usage">About this schema document</a></h2>
+
+    <div class="bodytext">
+     <p>
+      This schema defines attributes and an attribute group suitable
+      for use by schemas wishing to allow <code>xml:base</code>,
+      <code>xml:lang</code>, <code>xml:space</code> or
+      <code>xml:id</code> attributes on elements they define.
+     </p>
+     <p>
+      To enable this, such a schema must import this schema for
+      the XML namespace, e.g. as follows:
+     </p>
+     <pre>
+          &lt;schema . . .>
+           . . .
+           &lt;import namespace="http://www.w3.org/XML/1998/namespace"
+                      schemaLocation="http://www.w3.org/2001/xml.xsd"/>
+     </pre>
+     <p>
+      or
+     </p>
+     <pre>
+           &lt;import namespace="http://www.w3.org/XML/1998/namespace"
+                      schemaLocation="http://www.w3.org/2009/01/xml.xsd"/>
+     </pre>
+     <p>
+      Subsequently, qualified reference to any of the attributes or the
+      group defined below will have the desired effect, e.g.
+     </p>
+     <pre>
+          &lt;type . . .>
+           . . .
+           &lt;attributeGroup ref="xml:specialAttrs"/>
+     </pre>
+     <p>
+      will define a type which will schema-validate an instance element
+      with any of those attributes.
+     </p>
+    </div>
+   </div>
+  </xs:documentation>
+ </xs:annotation>
+
+ <xs:annotation>
+  <xs:documentation>
+   <div id="nsversioning" xml:id="nsversioning">
+    <h2><a name="nsversioning">Versioning policy for this schema document</a></h2>
+    <div class="bodytext">
+     <p>
+      In keeping with the XML Schema WG's standard versioning
+      policy, this schema document will persist at
+      <a href="http://www.w3.org/2009/01/xml.xsd">
+       http://www.w3.org/2009/01/xml.xsd</a>.
+     </p>
+     <p>
+      At the date of issue it can also be found at
+      <a href="http://www.w3.org/2001/xml.xsd">
+       http://www.w3.org/2001/xml.xsd</a>.
+     </p>
+     <p>
+      The schema document at that URI may however change in the future,
+      in order to remain compatible with the latest version of XML
+      Schema itself, or with the XML namespace itself.  In other words,
+      if the XML Schema or XML namespaces change, the version of this
+      document at <a href="http://www.w3.org/2001/xml.xsd">
+       http://www.w3.org/2001/xml.xsd 
+      </a> 
+      will change accordingly; the version at 
+      <a href="http://www.w3.org/2009/01/xml.xsd">
+       http://www.w3.org/2009/01/xml.xsd 
+      </a> 
+      will not change.
+     </p>
+     <p>
+      Previous dated (and unchanging) versions of this schema 
+      document are at:
+     </p>
+     <ul>
+      <li><a href="http://www.w3.org/2009/01/xml.xsd">
+       http://www.w3.org/2009/01/xml.xsd</a></li>
+      <li><a href="http://www.w3.org/2007/08/xml.xsd">
+       http://www.w3.org/2007/08/xml.xsd</a></li>
+      <li><a href="http://www.w3.org/2004/10/xml.xsd">
+       http://www.w3.org/2004/10/xml.xsd</a></li>
+      <li><a href="http://www.w3.org/2001/03/xml.xsd">
+       http://www.w3.org/2001/03/xml.xsd</a></li>
+     </ul>
+    </div>
+   </div>
+  </xs:documentation>
+ </xs:annotation>
+
+</xs:schema>
+
diff --git a/update.sh b/update.sh
new file mode 100755 (executable)
index 0000000..dd6df55
--- /dev/null
+++ b/update.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+# update.sh - Scripts to merge WMO data with Wikidata.
+# Copyright (C) 2021  Pierre Choffet
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of version 3 of the GNU General Public License as published
+# by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+set -euxo pipefail
+
+# Script cache dir
+CACHE_DIR=${CACHE_DIR:-"${HOME}/.cache/wmo_to_wikidata/"}
+
+# Any stations cache older than this (in minutes) will be updated
+STATIONS_MAX_AGE=${STATIONS_MAX_AGE:=1440}
+
+# Hardcoded values
+OSCAR_STATIONS_URL='https://oscar.wmo.int/surface/rest/api/search/station'
+STATIONS_CACHE_PATH="${CACHE_DIR}/stations.xml"
+STATIONS_CLEANED_CACHE_PATH="${CACHE_DIR}/stations_cleaned.xml"
+
+# Fail if something is missing
+function assertEnvironment() {
+       for name in curl yq xmlstarlet
+       do
+               if ! type "${name}" > /dev/null 2>&1
+               then
+                       echo "Cannot find ${name}. Exiting"
+                       exit 1
+               fi
+       done
+}
+
+# Update stations cache, if needed
+function ensureStationsCache() {
+       local -r outdated_path=$(find "${STATIONS_CACHE_PATH}" -mmin "+${STATIONS_MAX_AGE}")
+
+       if [ ! -f "${STATIONS_CACHE_PATH}" ]||[ "${outdated_path}" != ''  ]
+       then
+               local -r stations_download_path="$(mktemp)"
+               
+               mkdir -p "${CACHE_DIR}"
+               curl "${OSCAR_STATIONS_URL}" > "${stations_download_path}"
+               echo "<?xml version='1.0' encoding='utf-8' ?><stations>$(yq -x --xml-root station .stationSearchResults "${stations_download_path}")</stations>" | xmlstarlet fo -t > "${STATIONS_CACHE_PATH}"
+               rm "${stations_download_path}"
+       fi
+}
+
+assertEnvironment
+ensureStationsCache
+
+# The stylesheet and the schema are shipped next to this script: locate them
+# from the script path so that it can be started from any working directory
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+
+# Clean stations cache for known problems
+xmlstarlet tr -s "${SCRIPT_DIR}/xslts/stations_clean.xslt" "${STATIONS_CACHE_PATH}" | xmlstarlet fo -t > "${STATIONS_CLEANED_CACHE_PATH}"
+
+# Validate stations cache
+xmlstarlet val -e -s "${SCRIPT_DIR}/schemas/stations.xsd" "${STATIONS_CLEANED_CACHE_PATH}"
diff --git a/xslts/stations_clean.xslt b/xslts/stations_clean.xslt
new file mode 100644 (file)
index 0000000..5971d2e
--- /dev/null
+++ b/xslts/stations_clean.xslt
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+<!-- stations_clean.xslt - Fix known issues with data coming from WMO.
+     Copyright (C) 2021  Pierre Choffet
+
+     This program is free software: you can redistribute it and/or modify
+     it under the terms of version 3 of the GNU General Public License as
+     published by the Free Software Foundation.
+
+     This program is distributed in the hope that it will be useful,
+     but WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+     GNU General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with this program.  If not, see <http://www.gnu.org/licenses/>.
+     -->
+       <xsl:output method="xml" encoding="utf-8" />
+
+       <!-- List known invalid WIGOS IDs in original data -->
+       <xsl:variable name="wigos-ids">
+               <wigos-id invalid-value="0-634-0000-0000" />
+       </xsl:variable>
+       
+       <!-- Node-set holding the wigos-id entries listed above. The entries are
+            read back from the stylesheet document itself, through
+            document(''), which makes them usable with predicates such as
+            [@invalid-value = ...] in the templates below. -->
+       <xsl:param name="hardcoded-wigos-id" select="document('')/*/xsl:variable[@name='wigos-ids']/*"/>
+       
+       <!-- Default rule: copy every attribute and node as is, recursing into
+            children so that the more specific templates can take over -->
+       <xsl:template match="@*|node()">
+               <xsl:copy>
+                       <xsl:apply-templates select="@*|node()" />
+               </xsl:copy>
+       </xsl:template>
+       
+       <!-- Rebuild each station from an explicit list of children: a child
+            element that is not named in one of the select expressions below is
+            not copied to the output. Within each apply-templates call the
+            selected nodes are processed in document order. -->
+       <xsl:template match="station">
+               <xsl:copy>
+                       <xsl:apply-templates select="@*|id|name|region|territory|declaredStatus" />
+                       
+                       <!-- Keep coordinates only when both latitude and longitude contain a
+                            decimal point; otherwise they are considered not accurate
+                            enough and both are discarded -->
+                       <xsl:if test="contains(latitude, '.') and contains(longitude, '.')">
+                               <xsl:apply-templates select="latitude|longitude" />
+                       </xsl:if>
+                       
+                       <xsl:apply-templates select="elevation|stationTypeName|wigosStationIdentifiers|wigosId|stationTypeId|dateEstablished|dateClosed|stationStatusCode|stationTypeCode|stationProgramsDeclaredStatuses" />
+               </xsl:copy>
+       </xsl:template>
+       
+       <!-- Remove invalid WIGOS identifiers -->
+       <xsl:template match="wigosStationIdentifiers">
+               <!-- The wrapper is kept only when its identifier is made of digits and
+                    dashes, holds at least one dash and is not a known invalid value -->
+               <xsl:variable name="identifier" select="wigosStationIdentifier" />
+               <xsl:variable name="leftover" select="translate($identifier, '0123456789-', '')" />
+               <xsl:variable name="known-invalid" select="$hardcoded-wigos-id[@invalid-value = $identifier]" />
+               <xsl:if test="$leftover = '' and contains($identifier, '-') and not($known-invalid)">
+                       <xsl:copy>
+                               <xsl:apply-templates select="@*|node()" />
+                       </xsl:copy>
+               </xsl:if>
+       </xsl:template>
+       <xsl:template match="wigosId">
+               <!-- The element is kept only when its value is made of digits and
+                    dashes, holds at least one dash and is not a known invalid value -->
+               <xsl:variable name="identifier" select="." />
+               <xsl:variable name="leftover" select="translate($identifier, '0123456789-', '')" />
+               <xsl:variable name="known-invalid" select="$hardcoded-wigos-id[@invalid-value = $identifier]" />
+               <xsl:if test="$leftover = '' and contains($identifier, '-') and not($known-invalid)">
+                       <xsl:copy>
+                               <xsl:apply-templates select="@*|node()" />
+                       </xsl:copy>
+               </xsl:if>
+       </xsl:template>
+</xsl:stylesheet>