Skip to content

Commit

Permalink
Merge pull request #101 from ICCD-MiBACT/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
luigi-asprino committed Aug 4, 2020
2 parents 5762ea7 + 89b8260 commit 3d037ff
Show file tree
Hide file tree
Showing 388 changed files with 44,531 additions and 33,484 deletions.
23 changes: 23 additions & 0 deletions ArCo-release/arco.harvester/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,36 @@ mvn clean install

## Usage

In order to harvest all the catalogue records and EMM documents:

1. Define a configuration file, e.g. ``src/main/resources/config.properties``
2. Run OAI-Harvester using maven

```
mvn exec:java -Dexec.mainClass="it.cnr.istc.stlab.arco.harverster.OAIHarvester" -Dexec.args="src/main/resources/config.properties"
```


where ``config.properties`` specifies i) ``outputFolder``, the absolute path where the harvested files will be stored, and ii) the OAI endpoint that publishes the XML documents to gather.

In order to harvest only a given list of catalogue records, run:

```
mvn exec:java -Dexec.mainClass="it.cnr.istc.stlab.arco.harverster.Harvester" -Dexec.args="OAI_Endpoint OUTPUT_FOLDER CATALOGUE_RECORD_IDs"
```

For example

```
mvn exec:java -Dexec.mainClass="it.cnr.istc.stlab.arco.harverster.Harvester" -Dexec.args="http://catalogo.beniculturali.it/oaitarget/OAIHandler? harvest ICCD12270285 ICCD2092056"
```


## Output

OAI harvester will create a folder for storing catalogue records, named *records*, and a folder for storing multimedia records, named *multimedia_records*. Both folders contain XML documents organised in subfolders (1000 per subfolder) and two files, namely ``paths.txt``, which lists the paths of all the XML documents, and ``keys.txt``, which stores metadata related to catalogue records.

## Licence

ArCo OAI Harvester is distributed under license [Apache 2.0](LICENSE).
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand All @@ -24,8 +26,9 @@

public class Harvester {

private String listIdentifierURL, outputDirectory;
private String listIdentifierURL, recordsDirectory, multimediaRecordsDirectory, outputDirectory;
private static final int chunk_size = 1000;
private long items = Long.MAX_VALUE;
private static final Pattern p = Pattern.compile("@(.*?)@");
private DocumentBuilder builder;
private static final int NUM_OF_ATTEMPTS = 3;
Expand All @@ -36,6 +39,8 @@ public class Harvester {
public Harvester(String listIdentifierURL, String outputDirectory) throws ParserConfigurationException {
super();
this.listIdentifierURL = listIdentifierURL;
this.recordsDirectory = outputDirectory + "/records";
this.multimediaRecordsDirectory = outputDirectory + "/multimedia_records";
this.outputDirectory = outputDirectory;

DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
Expand All @@ -47,21 +52,19 @@ public Harvester(String listIdentifierURL, String outputDirectory) throws Parser

public void getRecords() throws IOException, ParserConfigurationException, SAXException, XPathExpressionException,
TransformerException {
String nextToken = null;

AtomicInteger chunk = new AtomicInteger(0);

new File(outputDirectory + "/" + chunk + "/").mkdirs();

FileOutputStream fos_keys = new FileOutputStream(new File(outputDirectory + "/keys.txt"));
FileOutputStream fos_paths = new FileOutputStream(new File(outputDirectory + "/paths.txt"));
new File(recordsDirectory + "/" + chunk + "/").mkdirs();
new File(multimediaRecordsDirectory + "/" + chunk + "/").mkdirs();

boolean first = true;
String nextToken = null;

FileOutputStream fos_keys = new FileOutputStream(new File(recordsDirectory + "/keys.txt"));
FileOutputStream fos_paths = new FileOutputStream(new File(recordsDirectory + "/paths.txt"));
while (nextToken != null || first) {

logger.trace("Issuing request " + listIdentifierURL + "verb=ListIdentifiers&resumptionToken=" + nextToken);

URL url = new URL(listIdentifierURL + "verb=ListIdentifiers&resumptionToken=" + nextToken);

if (first) {
Expand All @@ -71,7 +74,8 @@ public void getRecords() throws IOException, ParserConfigurationException, SAXEx

for (int i = 0; i < NUM_OF_ATTEMPTS; i++) {
try {
nextToken = getRecordsFromList(url, chunk, fos_keys, fos_paths);
logger.trace("Issuing request " + url.toString());
nextToken = getRecordsFromList(url, chunk, fos_keys, fos_paths, "/xml", recordsDirectory);
break;
} catch (Exception e) {
e.printStackTrace();
Expand All @@ -85,17 +89,74 @@ public void getRecords() throws IOException, ParserConfigurationException, SAXEx
}
}

}
if (c.longValue() >= this.items) {
break;
}

}
fos_keys.flush();
fos_paths.flush();
fos_keys.close();
fos_paths.close();

logger.trace("Issuing request " + listIdentifierURL + "verb=ListIdentifiers&metadataPrefix=oai_dc");
nextToken = getResumptionToken(new URL(listIdentifierURL + "verb=ListIdentifiers&metadataPrefix=oai_dc"));
AtomicInteger chunk_mr = new AtomicInteger(0);
c = new AtomicInteger(0);
first = true;
FileOutputStream fos_keys_mr = new FileOutputStream(new File(multimediaRecordsDirectory + "/keys.txt"));
FileOutputStream fos_paths_mr = new FileOutputStream(new File(multimediaRecordsDirectory + "/paths.txt"));
while (nextToken != null) {

URL url = new URL(listIdentifierURL + "verb=ListIdentifiers&resumptionToken=" + nextToken);
if (first) {
url = new URL(listIdentifierURL + "verb=ListIdentifiers&resumptionToken=" + nextToken
+ "/entita_multimediale");
first = false;
}

for (int i = 0; i < NUM_OF_ATTEMPTS; i++) {
try {
logger.trace("Issuing request " + url.toString());
nextToken = getRecordsFromList(url, chunk_mr, fos_keys_mr, fos_paths_mr, "/xml/entita_multimediale",
multimediaRecordsDirectory);
break;
} catch (Exception e) {
e.printStackTrace();
logger.error(e.getMessage());
try {
Thread.sleep((i + 1) * 60000);
} catch (InterruptedException e1) {
e1.printStackTrace();
}
logger.error("Retry! " + i);
}
}

if (c.longValue() >= this.items) {
break;
}

}
fos_keys_mr.flush();
fos_paths_mr.flush();
fos_keys_mr.close();
fos_paths_mr.close();

}

/**
 * Issues an HTTP GET to the given URL, parses the response as XML and returns
 * the text of the first {@code resumptionToken} element.
 *
 * @param url the request to issue
 * @return the text content of the first {@code resumptionToken} element, or
 *         {@code null} when the response contains no such element
 * @throws IOException  if the connection cannot be opened or the response
 *                      cannot be read
 * @throws SAXException if the response cannot be parsed as XML
 */
private String getResumptionToken(URL url)
        throws IOException, SAXException, XPathExpressionException, TransformerException {
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setRequestMethod("GET");

    Document d;
    // Release the response stream even when parsing fails.
    try (java.io.InputStream in = conn.getInputStream()) {
        d = builder.parse(in);
    }

    // item(0) is null when the response carries no resumptionToken element
    // (e.g. an error response); report that as "no token" instead of
    // failing with a NullPointerException.
    org.w3c.dom.Node tokenNode = d.getElementsByTagName("resumptionToken").item(0);
    if (tokenNode == null) {
        return null;
    }
    return tokenNode.getTextContent();
}

private String getRecordsFromList(URL url, AtomicInteger chunk, FileOutputStream fos_keys,
FileOutputStream fos_paths)
FileOutputStream fos_paths, String postFix, String recordsDirectory)
throws IOException, SAXException, XPathExpressionException, TransformerException {
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("GET");
Expand All @@ -112,11 +173,11 @@ private String getRecordsFromList(URL url, AtomicInteger chunk, FileOutputStream
if (m.find()) {
String keycode = identifier.substring(m.start(1), m.end(1));

String recordString = getRecord(identifier + "/xml");
String recordString = getRecord(identifier + postFix);

if (recordString != null) {
FileOutputStream fos = new FileOutputStream(
new File(outputDirectory + "/" + chunk.get() + "/" + keycode + ".xml"));
new File(recordsDirectory + "/" + chunk.get() + "/" + keycode + ".xml"));
fos.write(recordString.getBytes());
fos.flush();
fos.flush();
Expand All @@ -135,24 +196,23 @@ private String getRecordsFromList(URL url, AtomicInteger chunk, FileOutputStream
fos_keys.flush();
fos_paths.flush();

c.incrementAndGet();

}

if (c.incrementAndGet() % chunk_size == 0) {
if (c.longValue() % chunk_size == 0) {
logger.info("Processed " + c);
chunk.incrementAndGet();
new File(outputDirectory + "/" + chunk + "/").mkdirs();
new File(recordsDirectory + "/" + chunk + "/").mkdirs();
}

}

logger.trace(d.getElementsByTagName("resumptionToken").item(0).getAttributes().getNamedItem("completeListSize")
.getNodeValue());

long items = Long.parseLong(d.getElementsByTagName("resumptionToken").item(0).getAttributes()
this.items = Long.parseLong(d.getElementsByTagName("resumptionToken").item(0).getAttributes()
.getNamedItem("completeListSize").getNodeValue());
if (c.get() >= items) {
return null;
}

return d.getElementsByTagName("resumptionToken").item(0).getTextContent();
}
Expand All @@ -166,10 +226,13 @@ private String getRecord(String identifier)
conn.setRequestMethod("GET");
Document d = builder.parse(conn.getInputStream());
try {
Element schede = (Element) d.getElementsByTagName("schede").item(0);
return Utils.nodeToString(schede, false, true);
Element schede = (Element) d.getElementsByTagName("record").item(0);
if (schede != null) {
return Utils.nodeToString(schede, false, true);
}
} catch (Exception e) {
logger.error(e.getMessage());
logger.error(url.toString());
e.printStackTrace();
}
return null;
Expand All @@ -188,4 +251,34 @@ private String getRecord(String identifier)

}

/**
 * Downloads the catalogue records identified by the given keycodes and writes
 * each one to {@code outputDirectory + "/" + keycode + ".xml"}.
 *
 * The output directory is created if needed. A keycode for which
 * {@code getRecord} returns {@code null} is logged as an error and skipped;
 * the remaining keycodes are still processed.
 *
 * @param keycodes the keycodes of the catalogue records to download
 * @throws IOException if a record file cannot be created or written
 */
public void getRecordsFromList(List<String> keycodes)
        throws XPathExpressionException, IOException, SAXException, TransformerException {
    new File(outputDirectory).mkdirs();

    for (String keycode : keycodes) {
        String recordString = getRecord("oai:oaicat.iccd.org:@" + keycode + "@/xml");

        if (recordString == null) {
            logger.error("Could not download " + keycode);
            continue;
        }

        // try-with-resources closes the file even if the write fails.
        try (FileOutputStream fos = new FileOutputStream(new File(outputDirectory + "/" + keycode + ".xml"))) {
            // NOTE(review): getBytes() uses the platform default charset;
            // confirm whether the records should be written as UTF-8.
            fos.write(recordString.getBytes());
            fos.flush();
        }
        logger.info(keycode + " downloaded!");
    }
}

/**
 * Command-line entry point for downloading a fixed list of catalogue records.
 *
 * Expected arguments: {@code args[0]} is passed to the constructor as the
 * list-identifier URL, {@code args[1]} as the output directory, and every
 * following argument is treated as the keycode of a record to download.
 *
 * @param args endpoint URL, output directory, then zero or more keycodes
 * @throws IllegalArgumentException if fewer than two arguments are supplied
 */
public static void main(String[] args) throws ParserConfigurationException, XPathExpressionException, IOException,
        SAXException, TransformerException {
    // Fail with a usage hint rather than a bare ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 2) {
        throw new IllegalArgumentException(
                "Usage: Harvester OAI_ENDPOINT OUTPUT_FOLDER [CATALOGUE_RECORD_ID ...]");
    }

    Harvester h = new Harvester(args[0], args[1]);
    List<String> keycodes = new ArrayList<>();
    for (int i = 2; i < args.length; i++) {
        keycodes.add(args[i]);
    }
    h.getRecordsFromList(keycodes);
}

}
3 changes: 3 additions & 0 deletions ArCo-release/ontologie/arco/1.1/arco.owl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
<owl:imports rdf:resource="https://w3id.org/arco/ontology/context-description"/>
<owl:imports rdf:resource="https://w3id.org/arco/ontology/cultural-event"/>
<owl:imports rdf:resource="https://w3id.org/arco/ontology/denotative-description"/>
<owl:imports rdf:resource="https://w3id.org/arco/ontology/immovable-property"/>
<owl:imports rdf:resource="https://w3id.org/arco/ontology/location"/>
<owl:imports rdf:resource="https://w3id.org/arco/ontology/movable-property"/>
<opla:reusesPatternAsTemplate rdf:resource="http://www.ontologydesignpatterns.org/cp/owl/classification.owl"/>
Expand All @@ -44,6 +45,8 @@
<dc:creator>Andrea Nuzzolese (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Chiara Veninata (MiBAC, ICCD)</dc:creator>
<dc:creator>Ludovica Marinucci (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Luigi Asprino (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Margherita Porena (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Maria Letizia Mancinelli (MiBAC, ICCD)</dc:creator>
<dc:creator>Valentina Carriero (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Valentina Presutti (ISTC-CNR, STLab)</dc:creator>
Expand Down
3 changes: 3 additions & 0 deletions ArCo-release/ontologie/arco/arco.owl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
<owl:imports rdf:resource="https://w3id.org/arco/ontology/context-description"/>
<owl:imports rdf:resource="https://w3id.org/arco/ontology/cultural-event"/>
<owl:imports rdf:resource="https://w3id.org/arco/ontology/denotative-description"/>
<owl:imports rdf:resource="https://w3id.org/arco/ontology/immovable-property"/>
<owl:imports rdf:resource="https://w3id.org/arco/ontology/location"/>
<owl:imports rdf:resource="https://w3id.org/arco/ontology/movable-property"/>
<opla:reusesPatternAsTemplate rdf:resource="http://www.ontologydesignpatterns.org/cp/owl/classification.owl"/>
Expand All @@ -44,6 +45,8 @@
<dc:creator>Andrea Nuzzolese (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Chiara Veninata (MiBAC, ICCD)</dc:creator>
<dc:creator>Ludovica Marinucci (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Luigi Asprino (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Margherita Porena (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Maria Letizia Mancinelli (MiBAC, ICCD)</dc:creator>
<dc:creator>Valentina Carriero (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Valentina Presutti (ISTC-CNR, STLab)</dc:creator>
Expand Down
41 changes: 25 additions & 16 deletions ArCo-release/ontologie/catalogue/1.1/catalogue.owl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
<dc:creator>Andrea Nuzzolese (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Chiara Veninata (MiBACT, ICCD)</dc:creator>
<dc:creator>Ludovica Marinucci (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Margherita Porena (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Maria Letizia Mancinelli (MiBACT, ICCD)</dc:creator>
<dc:creator>Valentina Carriero (ISTC-CNR, STLab)</dc:creator>
<dc:creator>Valentina Presutti (ISTC-CNR, STLab)</dc:creator>
Expand Down Expand Up @@ -1255,6 +1256,22 @@ In this module the following Ontology Design Patterns have been used:



<!-- https://w3id.org/arco/ontology/catalogue/lastUpdateDate -->

<owl:DatatypeProperty rdf:about="https://w3id.org/arco/ontology/catalogue/lastUpdateDate">
<rdfs:subPropertyOf rdf:resource="https://w3id.org/italia/onto/TI/date"/>
<rdfs:range rdf:resource="http://www.w3.org/2001/XMLSchema#dateTime"/>
<rdfs:comment xml:lang="it">Questa proprietà rappresenta la data dell&apos;ultimo aggiornamento della scheda di catalogo</rdfs:comment>
<rdfs:comment xml:lang="en">This property represents information about the last update date of a catalogue record.</rdfs:comment>
<rdfs:isDefinedBy rdf:resource="https://w3id.org/arco/ontology/catalogue"/>
<rdfs:label xml:lang="it">data dell&apos;ultimo aggiornamento</rdfs:label>
<rdfs:label xml:lang="en">last update date</rdfs:label>
<owl:versionInfo xml:lang="it">instabile</owl:versionInfo>
<owl:versionInfo xml:lang="en">unstable</owl:versionInfo>
</owl:DatatypeProperty>



<!-- https://w3id.org/arco/ontology/catalogue/localIdentifier -->

<owl:DatatypeProperty rdf:about="https://w3id.org/arco/ontology/catalogue/localIdentifier">
Expand All @@ -1273,24 +1290,10 @@ In this module the following Ontology Design Patterns have been used:



<!-- https://w3id.org/arco/ontology/catalogue/publicationOrModificationDate -->

<owl:DatatypeProperty rdf:about="https://w3id.org/arco/ontology/catalogue/publicationOrModificationDate">
<rdfs:range rdf:resource="http://www.w3.org/2001/XMLSchema#dateTime"/>
<rdfs:comment xml:lang="it">Questa proprietà rappresenta la data di pubblicazione o modifca di una scheda di catalogo</rdfs:comment>
<rdfs:comment xml:lang="en">This property represents information about the publication date or the modification date of a catalogue record.</rdfs:comment>
<rdfs:isDefinedBy rdf:resource="https://w3id.org/arco/ontology/catalogue"/>
<rdfs:label xml:lang="it">data di pubblicazione o modifica</rdfs:label>
<rdfs:label xml:lang="en">publication or modification date</rdfs:label>
<owl:versionInfo xml:lang="it">instabile</owl:versionInfo>
<owl:versionInfo xml:lang="en">unstable</owl:versionInfo>
</owl:DatatypeProperty>



<!-- https://w3id.org/arco/ontology/catalogue/recoveredData -->

<owl:DatatypeProperty rdf:about="https://w3id.org/arco/ontology/catalogue/recoveredData">
<rdfs:subPropertyOf rdf:resource="http://www.w3.org/2002/07/owl#topDataProperty"/>
<rdfs:domain rdf:resource="https://w3id.org/arco/ontology/catalogue/CatalogueRecord"/>
<rdfs:range rdf:resource="http://www.w3.org/2000/01/rdf-schema#Literal"/>
<rdfs:comment xml:lang="it">Questa proprietà rappresenta eventuali informazioni provenienti da schede pregresse redatte sulla base di standard obsoleti, per le quali non è stato possibile effettuare il trasferimento nel tracciato della scheda aggiornato, secondo la sintassi: “acronimocampo: valore; acronimocampo: valore”</rdfs:comment>
Expand Down Expand Up @@ -1353,6 +1356,12 @@ In this module the following Ontology Design Patterns have been used:



<!-- https://w3id.org/italia/onto/TI/date -->

<owl:DatatypeProperty rdf:about="https://w3id.org/italia/onto/TI/date"/>



<!-- https://w3id.org/italia/onto/l0/identifier -->

<owl:DatatypeProperty rdf:about="https://w3id.org/italia/onto/l0/identifier"/>
Expand Down Expand Up @@ -1525,7 +1534,7 @@ In this module the following Ontology Design Patterns have been used:
</rdfs:subClassOf>
<rdfs:subClassOf>
<owl:Restriction>
<owl:onProperty rdf:resource="https://w3id.org/arco/ontology/catalogue/publicationOrModificationDate"/>
<owl:onProperty rdf:resource="https://w3id.org/arco/ontology/catalogue/lastUpdateDate"/>
<owl:someValuesFrom rdf:resource="http://www.w3.org/2001/XMLSchema#dateTime"/>
</owl:Restriction>
</rdfs:subClassOf>
Expand Down
Loading

0 comments on commit 3d037ff

Please sign in to comment.