add template for dataset GitHub Actions workflow

MaastrichtU-IDS · Jan 18, 2021 · c2a2a26 · c2a2a26
1 parent 9b9b15f
commit c2a2a26
Show file tree

Hide file tree

Showing 2 changed files with 233 additions and 0 deletions.
diff --git a/.github/workflows/r2rml-map-date.yml b/.github/workflows/r2rml-map-date.yml
@@ -0,0 +1,139 @@
+name: DATE to BioLink RDF
+# TODO: Use graphdb as tmp triplestore?
+on:
+  workflow_dispatch:
+    inputs:
+      endpoint:
+        description: 'Upload to SPARQL endpoint'
+        required: true
+        default: 'https://graphdb.dumontierlab.com/repositories/ncats-red-kg/statements'
+      graph:
+        description: 'In the Graph'
+        required: true
+        default: 'https://w3id.org/d2s/graph/date'
+jobs:
+  generate-rdf:
+    runs-on: ubuntu-latest
+    # runs-on: [self-hosted, linux, X64, node2]
+    services:
+      virtuoso:
+        image: umids/d2s-virtuoso:latest
+        ports:
+          - 8890:8890
+        env:
+          DBA_PASSWORD: dba
+          SPARQL_UPDATE: true
+          VIRT_Parameters_DirsAllowed: ., /, /usr/local/virtuoso-opensource/share/virtuoso/vad, /usr/local/virtuoso-opensource/var/lib/virtuoso/db
+          VIRT_SPARQL_ResultSetMaxRows: 999999999999999999
+          VIRT_SPARQL_MaxQueryCostEstimationTime: 0
+          VIRT_SPARQL_MaxQueryExecutionTime: 14400
+          VIRT_VDB_VDBDisconnectTimeout: 7200000
+          VIRT_Client_SQL_QUERY_TIMEOUT: 14400000
+          VIRT_CLient_SQL_TXN_TIMEOUT: 14400000
+          VIRT_Database_ErrorLogLevel: 0 # default: 7 is maximum logs
+          VIRT_Parameters_Timeout: 7200
+          VIRT_Parameters_TransactionAfterImageLimit: 5000000000 # default is 50M
+          VIRT_Parameters_NumberOfBuffers: 493674
+
+    outputs:
+      rdf-output: ${{ steps.stepupload.outputs.rdf_output }}
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Download CSV
+      run: datasets/date/download/download.sh
+
+    # - name: Upload CSV input artifact
+    #   id: uploadcsv
+    #   uses: actions/upload-artifact@v1
+    #   with:
+    #     name: preppi-csv
+    #     path: preppi.csv
+
+    - name: Run AutoR2RML
+      uses: vemonet/[email protected]
+      with:
+        mapping: datasets/preppi/mapping/map-preppi.rml.ttl
+        output: rdf-preppi.nt
+      env:
+        JAVA_OPTS: "-Xmx5g"
+        JAVA_TOOL_OPTIONS: "-XX:MaxRAMPercentage=80"
+
+    - name: Run R2RML
+      uses: vemonet/[email protected]
+      with:
+        mapping: datasets/preppi/mapping/map-preppi.rml.ttl
+        output: rdf-preppi.nt
+      env:
+        JAVA_OPTS: "-Xmx5g"
+        JAVA_TOOL_OPTIONS: "-XX:MaxRAMPercentage=80"
+
+    - name: Upload AutoR2RML RDF
+      uses: MaastrichtU-IDS/RdfUpload@master
+      with:
+        file: rdf-output/rdf-preppi.nt
+        endpoint: http://virtuoso:8890/sparql
+        user: dba
+        password: dba
+        graph: https://w3id.org/d2s/graph/autor2rml
+
+    - name: Run SPARQL queries to convert DATE
+      uses: vemonet/sparql-operations-action@v1
+      with:
+        file: datasets/date/mapping
+        endpoint: ${{ github.event.inputs.endpoint }}
+        user: ${{ secrets.GRAPHDB_USER }}
+        password: ${{ secrets.GRAPHDB_PASSWORD }}
+        inputvar: https://sparql.uniprot.org
+        outputvar: ${{ github.event.inputs.graph }}
+        servicevar: https://sparql.uniprot.org
+
+    - name: Upload RDF output artifact
+      id: stepupload
+      uses: actions/upload-artifact@v1
+      with:
+        name: rdf-output
+        path: rdf-preppi.nt
+
+  upload-rdf:
+    runs-on: ubuntu-latest
+    needs: generate-rdf
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Get RDF output artifact
+      uses: actions/download-artifact@v1
+      with:
+        name: rdf-output
+
+    - name: Upload RDF
+      uses: MaastrichtU-IDS/RdfUpload@master
+      with:
+        file: rdf-output/rdf-preppi.nt
+        endpoint: ${{ github.event.inputs.endpoint }}
+        user: ${{ secrets.GRAPHDB_USER }}
+        password: ${{ secrets.GRAPHDB_PASSWORD }}
+        graph: ${{ github.event.inputs.graph }}
+
+    - name: Compute and insert HCLS descriptive metadata
+      uses: vemonet/sparql-operations-action@v1
+      with:
+        file: https://github.com/MaastrichtU-IDS/d2s-scripts-repository/tree/master/sparql/compute-hcls-stats
+        endpoint: ${{ github.event.inputs.endpoint }}
+        user: ${{ secrets.GRAPHDB_USER }}
+        password: ${{ secrets.GRAPHDB_PASSWORD }}
+        inputvar: ${{ github.event.inputs.graph }}
+        outputvar: https://w3id.org/d2s/metadata
+        servicevar: ${{ github.event.inputs.endpoint }}
+
+    - name: Run RDF to HDT
+      uses: vemonet/rdfhdt-action@master
+      with:
+        input: rdf-output/rdf-preppi.nt
+        output: hdt-preppi.hdt
+
+    - name: Upload HDT output artifact
+      uses: actions/upload-artifact@v1
+      with:
+        name: hdt-output
+        path: hdt-preppi.hdt
diff --git a/support/template/rml-map-dataset.yml b/support/template/rml-map-dataset.yml
@@ -0,0 +1,94 @@
+name: Convert $dataset_id to RDF
+on:
+  workflow_dispatch:
+    inputs:
+      endpoint:
+        description: 'Upload to SPARQL endpoint'
+        required: true
+        default: 'https://graphdb.dumontierlab.com/repositories/test/statements'
+      graph:
+        description: 'In the Graph'
+        required: true
+        default: 'https://w3id.org/d2s/graph/$dataset_id'
+
+jobs:
+
+  generate-rdf:
+    runs-on: ubuntu-latest
+    outputs:
+      rdf-output: ${{ steps.stepupload.outputs.rdf_output }}
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Download data
+      run: datasets/$dataset_id/download/download.sh
+
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.7
+
+    - name: Install Python dependencies
+      run: |
+        python -m pip install -r requirements.txt
+
+    - name: Run preprocessing Python script
+      run: python datasets/$dataset_id/download/preprocessing.py
+
+    - name: Run RML mapper to generate RDF
+      uses: vemonet/[email protected]
+      with:
+        mapping: datasets/$dataset_id/mapping/mappings.rml.ttl
+        output: rdf-output.nt
+
+    - name: Upload RDF output artifact to GitHub
+      id: stepupload
+      uses: actions/upload-artifact@v1
+      with:
+        name: rdf-output
+        path: rdf-output.nt
+
+  upload-rdf:
+    runs-on: ubuntu-latest
+    needs: generate-rdf
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Get RDF output artifact from previous job
+      uses: actions/download-artifact@v1
+      with:
+        name: rdf-output
+
+    - name: Generate HDT compressed file from RDF output
+      uses: vemonet/rdfhdt-action@master
+      with:
+        input: rdf-output/rdf-geonames.nt
+        output: hdt-geonames.hdt
+
+    - name: Upload HDT output artifact to GitHub
+      uses: actions/upload-artifact@v1
+      with:
+        name: hdt-output
+        path: hdt-geonames.hdt
+
+    - name: Upload RDF to the defined SPARQL endpoint
+      uses: MaastrichtU-IDS/RdfUpload@master
+      with:
+        file: rdf-output/*.nt
+        endpoint: ${{ github.event.inputs.endpoint }}
+        user: ${{ secrets.GRAPHDB_USER }}
+        password: ${{ secrets.GRAPHDB_PASSWORD }}
+        graph: ${{ github.event.inputs.graph }}
+
+    - name: Compute and insert HCLS descriptive metadata
+      uses: vemonet/sparql-operations-action@v1
+      with:
+        file: https://github.com/MaastrichtU-IDS/d2s-scripts-repository/tree/master/sparql/compute-hcls-stats
+        endpoint: ${{ github.event.inputs.endpoint }}
+        user: ${{ secrets.GRAPHDB_USER }}
+        password: ${{ secrets.GRAPHDB_PASSWORD }}
+        inputvar: ${{ github.event.inputs.graph }}
+        outputvar: https://w3id.org/d2s/metadata
+        servicevar: ${{ github.event.inputs.endpoint }}