]> Pierre Choffet | Git repositories - banq_wikidata.git/commitdiff
Add merge command
authorPierre Choffet <peuc@wanadoo.fr>
Thu, 3 Jul 2025 20:56:19 +0000 (16:56 -0400)
committerPierre Choffet <peuc@wanadoo.fr>
Thu, 3 Jul 2025 20:56:19 +0000 (16:56 -0400)
MARC21slim.xsd [new file with mode: 0644]
README [new file with mode: 0644]
banq2wd.sh [new file with mode: 0755]
merge_marcxml.xslt [new file with mode: 0644]

diff --git a/MARC21slim.xsd b/MARC21slim.xsd
new file mode 100644 (file)
index 0000000..db70367
--- /dev/null
@@ -0,0 +1,150 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xsd:schema targetNamespace="http://www.loc.gov/MARC21/slim" xmlns="http://www.loc.gov/MARC21/slim" xmlns:xsd="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" attributeFormDefault="unqualified" version="1.1" xml:lang="en">
+  <xsd:annotation>
+    <xsd:documentation>
+                       MARCXML: The MARC 21 XML Schema
+                       Prepared by Corey Keith
+                       
+                               May 21, 2002 - Version 1.0  - Initial Release
+
+**********************************************
+Changes.
+
+August 4, 2003 - Version 1.1 - 
+Removed import of xml namespace and the use of xml:space="preserve" attributes on the leader and controlfields. 
+                    Whitespace preservation in these subfields is accomplished by the use of xsd:whiteSpace value="preserve"
+
+May 21, 2009  - Version 1.2 - 
+in subfieldcodeDataType  the pattern 
+                          "[\da-z!"#$%&amp;'()*+,-./:;&lt;=&gt;?{}_^`~\[\]\\]{1}"
+       changed to:     
+                         "[\dA-Za-z!"#$%&amp;'()*+,-./:;&lt;=&gt;?{}_^`~\[\]\\]{1}"
+    i.e "A-Z" added after "[\d" before "a-z"  to allow upper case.  This change is for consistency with the documentation.
+       
+************************************************************
+                       This schema supports XML markup of MARC21 records as specified in the MARC documentation (see www.loc.gov).  It allows tags with
+                       alphabetics and subfield codes that are symbols, neither of which are as yet used in  the MARC 21 communications formats, but are 
+                       allowed by MARC 21 for local data.  The schema accommodates all types of MARC 21 records: bibliographic, holdings, bibliographic 
+                       with embedded holdings, authority, classification, and community information.
+               </xsd:documentation>
+  </xsd:annotation>
+  <xsd:element name="record" type="recordType" nillable="true" id="record.e">
+    <xsd:annotation>
+      <xsd:documentation>record is a top level container element for all of the field elements which compose the record</xsd:documentation>
+    </xsd:annotation>
+  </xsd:element>
+  <xsd:element name="collection" type="collectionType" nillable="true" id="collection.e">
+    <xsd:annotation>
+      <xsd:documentation>collection is a top level container element for 0 or many records</xsd:documentation>
+    </xsd:annotation>
+  </xsd:element>
+  <xsd:complexType name="collectionType" id="collection.ct">
+    <xsd:sequence minOccurs="0" maxOccurs="unbounded">
+      <xsd:element ref="record"/>
+    </xsd:sequence>
+    <xsd:attribute name="id" type="idDataType" use="optional"/>
+  </xsd:complexType>
+  <xsd:complexType name="recordType" id="record.ct">
+    <xsd:sequence minOccurs="0">
+      <xsd:element name="leader" type="leaderFieldType"/>
+      <xsd:element name="controlfield" type="controlFieldType" minOccurs="0" maxOccurs="unbounded"/>
+      <xsd:element name="datafield" type="dataFieldType" minOccurs="0" maxOccurs="unbounded"/>
+    </xsd:sequence>
+    <xsd:attribute name="type" type="recordTypeType" use="optional"/>
+    <xsd:attribute name="id" type="idDataType" use="optional"/>
+  </xsd:complexType>
+  <xsd:simpleType name="recordTypeType" id="type.st">
+    <xsd:restriction base="xsd:NMTOKEN">
+      <xsd:enumeration value="Bibliographic"/>
+      <xsd:enumeration value="Authority"/>
+      <xsd:enumeration value="Holdings"/>
+      <xsd:enumeration value="Classification"/>
+      <xsd:enumeration value="Community"/>
+    </xsd:restriction>
+  </xsd:simpleType>
+  <xsd:complexType name="leaderFieldType" id="leader.ct">
+    <xsd:annotation>
+      <xsd:documentation>MARC21 Leader, 24 bytes</xsd:documentation>
+    </xsd:annotation>
+    <xsd:simpleContent>
+      <xsd:extension base="leaderDataType">
+        <xsd:attribute name="id" type="idDataType" use="optional"/>
+      </xsd:extension>
+    </xsd:simpleContent>
+  </xsd:complexType>
+  <xsd:simpleType name="leaderDataType" id="leader.st">
+    <!-- Leader value: exactly 24 characters, whitespace preserved. Per the pattern:
+         5 digits or spaces; 1 alphanumeric or space; 1 alphanumeric;
+         3 alphanumerics or spaces; two characters that are each "2" or a space;
+         5 digits or spaces; 3 alphanumerics or spaces; then "4500" or 4 spaces. -->
+    <xsd:restriction base="xsd:string">
+      <xsd:whiteSpace value="preserve"/>
+      <xsd:pattern value="[\d ]{5}[\dA-Za-z ]{1}[\dA-Za-z]{1}[\dA-Za-z ]{3}(2| )(2| )[\d ]{5}[\dA-Za-z ]{3}(4500|    )"/>
+    </xsd:restriction>
+  </xsd:simpleType>
+  <xsd:complexType name="controlFieldType" id="controlfield.ct">
+    <xsd:annotation>
+      <xsd:documentation>MARC21 Fields 001-009</xsd:documentation>
+    </xsd:annotation>
+    <xsd:simpleContent>
+      <xsd:extension base="controlDataType">
+        <xsd:attribute name="id" type="idDataType" use="optional"/>
+        <xsd:attribute name="tag" type="controltagDataType" use="required"/>
+      </xsd:extension>
+    </xsd:simpleContent>
+  </xsd:complexType>
+  <xsd:simpleType name="controlDataType" id="controlfield.st">
+    <xsd:restriction base="xsd:string">
+      <xsd:whiteSpace value="preserve"/>
+    </xsd:restriction>
+  </xsd:simpleType>
+  <xsd:simpleType name="controltagDataType" id="controltag.st">
+    <!-- Tag of a control field: "00" followed by exactly one character that is
+         a digit from 1 to 9 or an ASCII letter (either case). -->
+    <xsd:restriction base="xsd:string">
+      <xsd:whiteSpace value="preserve"/>
+      <xsd:pattern value="00[1-9A-Za-z]{1}"/>
+    </xsd:restriction>
+  </xsd:simpleType>
+  <xsd:complexType name="dataFieldType" id="datafield.ct">
+    <xsd:annotation>
+      <xsd:documentation>MARC21 Variable Data Fields 010-999</xsd:documentation>
+    </xsd:annotation>
+    <xsd:sequence maxOccurs="unbounded">
+      <xsd:element name="subfield" type="subfieldatafieldType"/>
+    </xsd:sequence>
+    <xsd:attribute name="id" type="idDataType" use="optional"/>
+    <xsd:attribute name="tag" type="tagDataType" use="required"/>
+    <xsd:attribute name="ind1" type="indicatorDataType" use="required"/>
+    <xsd:attribute name="ind2" type="indicatorDataType" use="required"/>
+  </xsd:complexType>
+  <xsd:simpleType name="tagDataType" id="tag.st">
+    <!-- Tag of a data field: three characters made of digits and letters.
+         Either "0", then a character that is 1-9 or a letter, then a digit or
+         letter; or a first character that is 1-9 or a letter followed by two
+         digits or letters. Within one tag the letters are all upper case or all
+         lower case: the pattern has separate alternatives for each case. -->
+    <xsd:restriction base="xsd:string">
+      <xsd:whiteSpace value="preserve"/>
+      <xsd:pattern value="(0([1-9A-Z][0-9A-Z])|0([1-9a-z][0-9a-z]))|(([1-9A-Z][0-9A-Z]{2})|([1-9a-z][0-9a-z]{2}))"/>
+    </xsd:restriction>
+  </xsd:simpleType>
+  <xsd:simpleType name="indicatorDataType" id="ind.st">
+    <!-- Indicator value (used by the ind1 and ind2 attributes of dataFieldType):
+         exactly one character that is a digit, a lower-case letter or a space. -->
+    <xsd:restriction base="xsd:string">
+      <xsd:whiteSpace value="preserve"/>
+      <xsd:pattern value="[\da-z ]{1}"/>
+    </xsd:restriction>
+  </xsd:simpleType>
+  <xsd:complexType name="subfieldatafieldType" id="subfield.ct">
+    <xsd:simpleContent>
+      <xsd:extension base="subfieldDataType">
+        <xsd:attribute name="id" type="idDataType" use="optional"/>
+        <xsd:attribute name="code" type="subfieldcodeDataType" use="required"/>
+      </xsd:extension>
+    </xsd:simpleContent>
+  </xsd:complexType>
+  <xsd:simpleType name="subfieldDataType" id="subfield.st">
+    <xsd:restriction base="xsd:string">
+      <xsd:whiteSpace value="preserve"/>
+    </xsd:restriction>
+  </xsd:simpleType>
+  <xsd:simpleType name="subfieldcodeDataType" id="code.st">
+    <xsd:restriction base="xsd:string">
+      <xsd:whiteSpace value="preserve"/>
+      <xsd:pattern value="[\dA-Za-z!&quot;#$%&amp;'()*+,-./:;&lt;=&gt;?{}_^`~\[\]\\]{1}"/>
+      <!-- "A-Z" added after "\d" May 21, 2009 -->
+    </xsd:restriction>
+  </xsd:simpleType>
+  <xsd:simpleType name="idDataType" id="id.st">
+    <xsd:restriction base="xsd:ID"/>
+  </xsd:simpleType>
+</xsd:schema>
\ No newline at end of file
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..3c72c5c
--- /dev/null
+++ b/README
@@ -0,0 +1,17 @@
+Ce dépôt contient tous les outils utilisés pour convertir les données de
+Bibliothèque et Archives nationales du Québec (BAnQ) et les publier dans
+Wikidata.
+
+
+L’environnement d’utilisation nécessite la disponibilité des outils suivants :
+  - Bash - https://www.gnu.org/software/bash/
+    Interpréteur des scripts
+
+  - Xmlstarlet - http://xmlstar.sourceforge.net/
+    Processeur XSD et XSLT
+
+
+Les outils disponibles sont les suivants :
+  - merge_marcxml.xslt
+    Fusionner deux fichiers au format MARCXML en supprimant les entrées
+    présentes en plusieurs exemplaires.
diff --git a/banq2wd.sh b/banq2wd.sh
new file mode 100755 (executable)
index 0000000..53a656f
--- /dev/null
@@ -0,0 +1,145 @@
+#!/bin/bash
+
+# set -x # Trace commands, for development purpose only
+set -Ceuo pipefail
+
+# User definable constants
+
+# Internal constants
+# Global options. The three arrays are parallel: the same index gives the short
+# name, the long name and the help text of one option.
+declare -ar options=('h' 't')
+declare -ar longoptions=('help' 'test')
+declare -ar optdescriptions=('afficher l’aide et quitter' 'test')
+
+# Commands and their help texts (parallel arrays). getParams registers each
+# command as a long option, so a command is passed as e.g. "--merge".
+declare -ar commands=('merge')
+declare -ar commandsdescriptions=('fusionner différents fichiers MARCXML')
+
+# Runtime parameters
+# Name of the command selected on the command line; set by getParams.
+declare command
+# Non-option arguments collected by getParams for the merge command.
+declare -a merge_filepaths
+# Check current environment contains everything needed for this script to work.
+function checkEnv() {
+       # Check getopt version
+       getopt --test > /dev/null && true
+       if [[ $? -ne 4 ]]
+       then
+               echo "Missing enhanced \"getopt\" version. Please install the util-linux package." >&2
+               exit 1
+       fi
+       
+       # TODO: Check we have required binaries
+}
+
+function getParams() {
+       local -r options_imploded="$(IFS=, ; echo "${options[*]}")"
+       local -r longoptions_imploded="$(IFS=, ; echo "${longoptions[*]}"),$(IFS=, ; echo "${commands[*]}")"
+       local -r allparams=$(getopt --options=${options_imploded} --longoptions=${longoptions_imploded} --name=banq2wd.sh -- "$@")
+       
+       # If getopt went wrong, show usage and exit
+       if [ $? -ne 0 ]
+       then
+               usage
+               exit
+       fi
+       
+       eval set -- "${allparams}"
+       declare -p allparams
+       
+       while true
+       do
+               case $1 in
+               --)
+                       if [ -z "${command+unset}" ]
+                       then
+                               usage
+                               exit 1
+                       fi
+                       shift
+                       
+                       # Non-option parameters
+                       if [ "${command}" == 'merge' ]
+                       then
+                               while [ ! -z "${1+unset}" ]
+                               do
+                                       merge_filepaths+=("${1}")
+                                       shift
+                               done
+                               
+                               break
+                       else
+                               exit 1
+                       fi
+                       ;;
+               -h|--help)
+                       usage
+                       exit 1
+                       ;;
+               --merge)
+                       command='merge'
+                       shift
+                       ;;
+#              *)
+#                      usage
+#                      exit 1
+#                      ;;
+               esac
+       done
+}
+
+function usage() {
+       cat<<EOF>&2
+Utilisation : banq2wd.sh [<OPTIONS GLOBALES>]... <COMMANDE> <OPTIONS DE COMMANDE>
+
+Suite d’outils pour convertir les données de
+Bibliothèque et Archives nationales du Québec vers le format wdef.
+
+Les OPTIONS GLOBALES incluent les possibilités suivantes :
+EOF
+
+       # Display available options
+       local options_text=''
+       local -ar longoptions_split="$(echo ${longoptions} | tr ',' "\n")"
+       for (( index=0; index<${#optdescriptions[@]}; index++ ))
+       do
+               options_text+="  -${options[${index}]},|--${longoptions[${index}]}|${optdescriptions[${index}]}"$'\n'
+       done
+       echo "${options_text}" | column -s "|" -t >&2
+
+       # Display available commands
+       echo -e "\nLa COMMANDE est une des suivantes :" >&2
+       local commands_text=''
+       for (( index=0; index<${#commandsdescriptions[@]}; index++ ))
+       do
+               commands_text+="  --${commands[${index}]}|${commandsdescriptions[${index}]}"$'\n'
+       done
+       echo "${commands_text}" | column -s "|" -t >&2
+}
+
+merge() {
+       local -r first_path="${1}"
+       shift
+       
+       local old_tmp_path="$(mktemp)"
+       cp "${first_path}" "${old_tmp_path}"
+       for path in "${@}"
+       do
+               local -r tmp_path="$(mktemp)"
+               xmlstarlet tr merge_marcxml.xslt -s "marcxml-path=${path}" "${old_tmp_path}" >| "${tmp_path}"
+               rm "${old_tmp_path}"
+               old_tmp_path="${tmp_path}"
+               shift
+       done
+       
+       echo "${tmp_path}"
+}
+
+checkEnv
+getParams $@
+
+# Run requested command
+if [ "${command}" == 'merge' ]
+then
+       merged_path="$(merge "${merge_filepaths[@]}")"
+       cat "${merged_path}"
+       rm "${merged_path}"
+fi
diff --git a/merge_marcxml.xslt b/merge_marcxml.xslt
new file mode 100644 (file)
index 0000000..1fb114e
--- /dev/null
@@ -0,0 +1,60 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xsl:stylesheet version="1.0" xmlns:marc="http://www.loc.gov/MARC21/slim"
+                xmlns:xml="http://www.w3.org/XML/1998/namespace"
+                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+  <!-- merge_marcxml.xslt - Merge two MARCXML files, removing duplicate records.
+    Copyright (C) 2025  Pierre Choffet
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of version 3 of the GNU General Public License as
+    published by the Free Software Foundation.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    -->
+       <xsl:output indent="yes" method="xml" encoding="utf-8" />
+       
+       <!-- Path of the second MARCXML document, whose records are merged with those
+            of the input document.
+            NOTE(review): a relative path is presumably resolved by document()
+            against the location of this stylesheet; confirm with the processor. -->
+       <xsl:param name="marcxml-path" />
+       
+       <!-- Document loaded from the marcxml-path parameter. -->
+       <xsl:variable name="merge-doc" select="document($marcxml-path)" />
+       <!-- Index of the records of a collection by the value of their controlfield
+            with tag 001. -->
+       <xsl:key name="record-control-number" match="/marc:collection/marc:record" use="marc:controlfield[@tag='001']" />
+       
+       <!-- Document root: nothing to copy at this level, just process the
+            children of the root. -->
+       <xsl:template match="/">
+               <xsl:apply-templates select="node()" />
+       </xsl:template>
+       
+       <!-- Identity transform: any other node or attribute is copied as is and its
+            attributes and children are processed recursively. -->
+       <xsl:template match="@*|node()">
+               <xsl:copy>
+                       <xsl:apply-templates select="@*|node()" />
+               </xsl:copy>
+       </xsl:template>
+       
+       <!-- Rebuild the collection: keep its attributes, then output the source
+            records that are absent from the merge document, followed by every
+            record of the merge document. -->
+       <xsl:template match="/marc:collection">
+               <xsl:copy>
+                       <!-- xsl:copy does not copy attributes; without this step the
+                            attributes of the source collection would be lost. -->
+                       <xsl:apply-templates select="@*" />
+                       <xsl:apply-templates select="marc:record" mode="source-doc" />
+                       <xsl:apply-templates select="$merge-doc/marc:collection/marc:record" mode="merge-doc" />
+               </xsl:copy>
+       </xsl:template>
+       
+       <!-- Copy a record of the source document unless the merge document holds a
+            record with the same control number (controlfield 001). -->
+       <xsl:template match="/marc:collection/marc:record" mode="source-doc">
+               <xsl:variable name="record" select="." />
+               <xsl:variable name="control-number" select="marc:controlfield[@tag='001']" />
+               
+               <!-- key() searches the document of the context node, so the context
+                    is switched to the merge document for the lookup. -->
+               <xsl:for-each select="$merge-doc">
+                       <xsl:if test="not(key('record-control-number', $control-number))">
+                               <xsl:copy-of select="$record" />
+                       </xsl:if>
+               </xsl:for-each>
+       </xsl:template>
+       
+       <!-- Records of the merge document are always kept: deep-copy each one. -->
+       <xsl:template match="marc:collection/marc:record" mode="merge-doc">
+               <xsl:copy-of select="." />
+       </xsl:template>
+</xsl:stylesheet>