From 4c631669021b52df8fc30d036b78c9cb56db8c82 Mon Sep 17 00:00:00 2001 From: Pierre Choffet Date: Thu, 3 Jul 2025 16:56:19 -0400 Subject: [PATCH] Add merge command --- MARC21slim.xsd | 150 +++++++++++++++++++++++++++++++++++++++++++++ README | 17 +++++ banq2wd.sh | 145 +++++++++++++++++++++++++++++++++++++++++++ merge_marcxml.xslt | 60 ++++++++++++++++++ 4 files changed, 372 insertions(+) create mode 100644 MARC21slim.xsd create mode 100644 README create mode 100755 banq2wd.sh create mode 100644 merge_marcxml.xslt diff --git a/MARC21slim.xsd b/MARC21slim.xsd new file mode 100644 index 0000000..db70367 --- /dev/null +++ b/MARC21slim.xsd @@ -0,0 +1,150 @@ + + + + + MARCXML: The MARC 21 XML Schema + Prepared by Corey Keith + + May 21, 2002 - Version 1.0 - Initial Release + +********************************************** +Changes. + +August 4, 2003 - Version 1.1 - +Removed import of xml namespace and the use of xml:space="preserve" attributes on the leader and controlfields. + Whitespace preservation in these subfields is accomplished by the use of xsd:whiteSpace value="preserve" + +May 21, 2009 - Version 1.2 - +in subfieldcodeDataType the pattern + "[\da-z!"#$%&'()*+,-./:;<=>?{}_^`~\[\]\\]{1}" + changed to: + "[\dA-Za-z!"#$%&'()*+,-./:;<=>?{}_^`~\[\]\\]{1}" + i.e "A-Z" added after "[\d" before "a-z" to allow upper case. This change is for consistency with the documentation. + +************************************************************ + This schema supports XML markup of MARC21 records as specified in the MARC documentation (see www.loc.gov). It allows tags with + alphabetics and subfield codes that are symbols, neither of which are as yet used in the MARC 21 communications formats, but are + allowed by MARC 21 for local data. The schema accommodates all types of MARC 21 records: bibliographic, holdings, bibliographic + with embedded holdings, authority, classification, and community information. 
#!/bin/bash
#
# banq2wd.sh - tool suite converting data from Bibliothèque et Archives
# nationales du Québec (BAnQ) so it can be published in Wikidata.
#
# Synopsis: banq2wd.sh [-h|--help] [-t|--test] --merge FILE...
#
# Requirements: Bash, the enhanced (util-linux) "getopt", and xmlstarlet,
# which is used as the XSLT processor by the merge command.
#
# The merge command applies merge_marcxml.xslt, which merges MARCXML files
# while dropping records present more than once. The stylesheet is looked up
# relative to the current working directory.

# set -x # Trace commands, for development purpose only
# -C (noclobber) is on: overwriting an existing file by redirection needs ">|".
set -Ceuo pipefail

# User definable constants

# Internal constants
# options / longoptions / optdescriptions are parallel arrays: index i of each
# one describes the same global option. The same holds for commands /
# commandsdescriptions.
declare -ar options=('h' 't')
declare -ar longoptions=('help' 'test')
declare -ar optdescriptions=('afficher l’aide et quitter' 'test')

declare -ar commands=('merge')
declare -ar commandsdescriptions=('fusionner différents fichiers MARCXML')

# Runtime parameters
# Both are initialised so that reading them never trips "set -u".
declare command=''
declare -a merge_filepaths=()

# Check current environment contains everything needed for this script to work.
# Exits with status 1 when the enhanced getopt is not available.
function checkEnv() {
  # Enhanced getopt identifies itself by returning 4 for "--test".
  # The status is captured with "||" so that "set -e" does not abort here.
  local getopt_status=0
  getopt --test > /dev/null || getopt_status=$?
  if [[ ${getopt_status} -ne 4 ]]
  then
    echo "Missing enhanced \"getopt\" version. Please install the util-linux package." >&2
    exit 1
  fi

  # Command-specific binaries (xmlstarlet) are checked by main right before
  # they are needed, so that --help keeps working without them.
}

# Parse the command line.
# Arguments: the script's positional parameters.
# Globals:   sets "command" and appends to "merge_filepaths".
# Exits with status 1 (after printing usage) on any invalid invocation.
function getParams() {
  # getopt expects short options as one string without separators ("ht");
  # long options are comma separated and also carry the command names.
  local options_imploded
  printf -v options_imploded '%s' "${options[@]}"
  local longoptions_imploded
  longoptions_imploded="$(IFS=, ; echo "${longoptions[*]},${commands[*]}")"

  # Declaration and assignment are kept apart: "local x=$(...)" would hide
  # getopt's exit status behind the always-successful "local".
  local allparams
  if ! allparams="$(getopt --options="${options_imploded}" \
    --longoptions="${longoptions_imploded}" --name=banq2wd.sh -- "$@")"
  then
    usage
    exit 1
  fi

  # getopt emits a shell-quoted, normalised parameter list that is meant to be
  # re-read with eval; it always ends the option part with "--".
  eval set -- "${allparams}"

  while true
  do
    case "${1}" in
      -h|--help)
        usage
        exit 1
        ;;
      -t|--test)
        # Advertised option with no behaviour attached yet: consume it so the
        # loop keeps advancing.
        shift
        ;;
      --merge)
        command='merge'
        shift
        ;;
      --)
        shift
        break
        ;;
      *)
        usage
        exit 1
        ;;
    esac
  done

  # A command is mandatory.
  if [[ -z "${command}" ]]
  then
    usage
    exit 1
  fi

  # Non-option parameters
  if [[ "${command}" == 'merge' ]]
  then
    merge_filepaths+=("$@")
  else
    exit 1
  fi
}

# Print the help text (global options and commands) on stderr.
function usage() {
  # NOTE(review): the synopsis placeholders below were rebuilt from the section
  # headings of this help text; confirm the exact wording.
  cat <<EOF >&2
Utilisation : banq2wd.sh [OPTIONS GLOBALES]... COMMANDE [FICHIER]...

Suite d’outils pour convertir les données de
Bibliothèque et Archives nationales du Québec vers le format wdef.

Les OPTIONS GLOBALES incluent les possibilités suivantes :
EOF

  local index

  # Display available options ("|" is the column separator for column(1))
  local options_text=''
  for (( index = 0; index < ${#optdescriptions[@]}; index++ ))
  do
    options_text+="  -${options[${index}]},|--${longoptions[${index}]}|${optdescriptions[${index}]}"$'\n'
  done
  printf '%s' "${options_text}" | column -s '|' -t >&2

  # Display available commands
  printf '\n%s\n' 'La COMMANDE est une des suivantes :' >&2
  local commands_text=''
  for (( index = 0; index < ${#commandsdescriptions[@]}; index++ ))
  do
    commands_text+="  --${commands[${index}]}|${commandsdescriptions[${index}]}"$'\n'
  done
  printf '%s' "${commands_text}" | column -s '|' -t >&2
}

# Merge MARCXML files by folding each further file into an accumulator with
# merge_marcxml.xslt.
# Arguments: one or more MARCXML file paths.
# Outputs:   the path of a temporary file holding the merged document; the
#            caller is responsible for deleting it.
# Returns:   non-zero when no file is given or when any step fails. Errors are
#            checked explicitly because this function is run inside a command
#            substitution, where Bash does not keep "set -e" active.
merge() {
  if [[ $# -eq 0 ]]
  then
    echo "merge: at least one MARCXML file is required." >&2
    return 1
  fi

  local merged_path next_path input_path

  # The accumulator starts as a copy of the first file.
  merged_path="$(mktemp)" || return 1
  if ! cp -- "${1}" "${merged_path}"
  then
    rm -f -- "${merged_path}"
    return 1
  fi
  shift

  for input_path in "$@"
  do
    if ! next_path="$(mktemp)"
    then
      rm -f -- "${merged_path}"
      return 1
    fi

    # A fresh output file is used on every pass so the stylesheet never writes
    # to the file it is reading; ">|" is needed because mktemp already created
    # the target and noclobber is on.
    if ! xmlstarlet tr merge_marcxml.xslt -s "marcxml-path=${input_path}" "${merged_path}" >| "${next_path}"
    then
      rm -f -- "${merged_path}" "${next_path}"
      return 1
    fi

    rm -- "${merged_path}"
    merged_path="${next_path}"
  done

  echo "${merged_path}"
}

# Entry point: validate the environment, parse parameters, run the command.
function main() {
  checkEnv
  getParams "$@"

  # Run requested command
  if [[ "${command}" == 'merge' ]]
  then
    if ! type -P xmlstarlet > /dev/null
    then
      echo "Missing \"xmlstarlet\". Please install the xmlstarlet package." >&2
      exit 1
    fi

    local merged_path
    # The "+" expansion keeps an empty array from tripping "set -u" on older
    # Bash releases.
    merged_path="$(merge ${merge_filepaths[@]+"${merge_filepaths[@]}"})"
    cat -- "${merged_path}"
    rm -- "${merged_path}"
  fi
}

main "$@"