Skip to content

Commit 0669a56

Browse files
sipstore: adds sipstore implementation
Signed-off-by: Ioannis Tsanaktsidis <[email protected]>
1 parent b916a06 commit 0669a56

File tree

10 files changed

+296
-3
lines changed

10 files changed

+296
-3
lines changed

cap/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,3 +642,6 @@ def _(x):
642642
# ================
643643
REANA_ACCESS_TOKEN = os.environ.get(
644644
'APP_REANA_ACCESS_TOKEN', None)
645+
646+
# Relative path of the JSON schema for SIP agent information.
# NOTE(review): presumably read by invenio-sipstore to validate the ``agent``
# passed when a SIP is created -- confirm against the library config.
SIPSTORE_DEFAULT_AGENT_JSONSCHEMA = 'sipstore/agent-v0.0.1.json'
647+
# Relative path of the JSON schema for BagIt archiver metadata.
# NOTE(review): presumably read by invenio-sipstore's BagIt archiver when it
# stores bag metadata -- confirm against the library config.
SIPSTORE_DEFAULT_BAGIT_JSONSCHEMA = 'sipstore/bagit-v0.0.1.json'
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"allow_all": true,
3+
"experiment": null,
4+
"fullname": null,
5+
"is_deposit": false,
6+
"jsonschema": {
7+
"type": "object",
8+
"title": "SIPStore Agent schema.",
9+
"description": "User agent information making the SIP.",
10+
"properties": {
11+
"orcid": {
12+
"type": "string"
13+
},
14+
"email": {
15+
"type": "string"
16+
},
17+
"ip_address": {
18+
"type": "string"
19+
}
20+
}
21+
}
22+
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
{
2+
"allow_all": true,
3+
"experiment": null,
4+
"fullname": null,
5+
"is_deposit": false,
6+
"jsonschema": {
7+
"definitions": {
8+
"file": {
9+
"type": "object",
10+
"title": "Archived file information.",
11+
"description": "JSON describing a single file.",
12+
"additionalProperties": false,
13+
"properties": {
14+
"filepath": {
15+
"description":
16+
"Filepath to the archived file, relative to the archived directory root.",
17+
"type": "string"
18+
},
19+
"fullpath": {
20+
"description":
21+
"Absolute filepath to the file in the archive file system.",
22+
"type": "string"
23+
},
24+
"size": {
25+
"description": "Size of the file in bytes.",
26+
"type": "number"
27+
},
28+
"checksum": {
29+
"description":
30+
"MD5 checksum of the file. Always starts with 'md5:' prefix.",
31+
"type": "string"
32+
},
33+
"file_uuid": {
34+
"description":
35+
"UUID of the related FileInstance object. Used for Record's data files only.",
36+
"type": "string"
37+
},
38+
"metadata_id": {
39+
"description":
40+
"ID of the type (SIPMetadataType.id) of the related SIPMetadata object. Used for Record's metadata files only.",
41+
"type": "number"
42+
},
43+
"sipfilepath": {
44+
"description":
45+
"Original SIPFile.filepath value. Used for Record's data files only.",
46+
"type": "string"
47+
},
48+
"filename": {
49+
"description":
50+
"Filename of the SIPFile in the archive. Used for Record's data files only.",
51+
"type": "string"
52+
},
53+
"content": {
54+
"description":
55+
"Text-content of the file. Used for BagIt metadata files only.",
56+
"type": "string"
57+
},
58+
"fetched": {
59+
"description":
60+
"Marks whether given file is fetched from another bag (specified in 'fetch.txt'). If the key does not exist or is set to false, it is assumed that the file is written down in the bag, hence NOT fetched. Used for Record's data files only.",
61+
"type": "boolean"
62+
}
63+
},
64+
"required": ["filepath", "fullpath", "size", "checksum"]
65+
}
66+
},
67+
"properties": {
68+
"files": {
69+
"description": "All files stored in this archive package.",
70+
"type": "array",
71+
"items": {
72+
"$ref": "#/definitions/file"
73+
}
74+
}
75+
}
76+
}
77+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"allow_all": true,
3+
"experiment": null,
4+
"fullname": null,
5+
"is_deposit": false,
6+
"jsonschema": {
7+
"properties": {
8+
"filepath": {
9+
"description":
10+
"Filepath to the archived file, relative to the archived directory root.",
11+
"type": "string"
12+
},
13+
"fullpath": {
14+
"description":
15+
"Absolute filepath to the file in the archive file system.",
16+
"type": "string"
17+
},
18+
"size": {
19+
"description": "Size of the file in bytes.",
20+
"type": "number"
21+
},
22+
"checksum": {
23+
"description":
24+
"MD5 checksum of the file. Always starts with 'md5:' prefix.",
25+
"type": "string"
26+
},
27+
"file_uuid": {
28+
"description":
29+
"UUID of the related FileInstance object. Used for Record's data files only.",
30+
"type": "string"
31+
},
32+
"metadata_id": {
33+
"description":
34+
"ID of the type (SIPMetadataType.id) of the related SIPMetadata object. Used for Record's metadata files only.",
35+
"type": "number"
36+
},
37+
"sipfilepath": {
38+
"description":
39+
"Original SIPFile.filepath value. Used for Record's data files only.",
40+
"type": "string"
41+
},
42+
"filename": {
43+
"description":
44+
"Filename of the SIPFile in the archive. Used for Record's data files only.",
45+
"type": "string"
46+
},
47+
"content": {
48+
"description":
49+
"Text-content of the file. Used for BagIt metadata files only.",
50+
"type": "string"
51+
}
52+
},
53+
"required": ["filepath", "fullpath", "size", "checksum"]
54+
}
55+
}

cap/modules/deposit/api.py

Lines changed: 100 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,16 @@
4343
from invenio_files_rest.errors import MultipartMissingParts
4444
from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
4545
from invenio_jsonschemas.errors import JSONSchemaNotFound
46+
from invenio_pidstore.errors import PIDDoesNotExistError
4647
from invenio_records.models import RecordMetadata
4748
from invenio_records_files.models import RecordsBuckets
4849
from invenio_rest.errors import FieldError
50+
51+
from invenio_sipstore.api import RecordSIP, SIP as SIPApi
52+
from invenio_sipstore.archivers import BagItArchiver
53+
from invenio_sipstore.models import SIP as SIPModel, \
54+
RecordSIP as RecordSIPModel
55+
4956
from jsonschema.validators import Draft4Validator, RefResolutionError
5057
from sqlalchemy.exc import IntegrityError
5158
from sqlalchemy.orm.exc import NoResultFound
@@ -59,7 +66,7 @@
5966
from cap.modules.user.utils import (get_existing_or_register_role,
6067
get_existing_or_register_user)
6168

62-
from .errors import (DepositValidationError, FileUploadError,
69+
from .errors import (ArchivingError, DepositValidationError, FileUploadError,
6370
UpdateDepositPermissionsError)
6471
from .fetchers import cap_deposit_fetcher
6572
from .minters import cap_deposit_minter
@@ -190,14 +197,59 @@ def permissions(self, pid=None):
190197
return self.edit_permissions(data)
191198

192199
@mark_as_action
def publish(self, sip_agent=None, *args, **kwargs):
    """Publish the deposit and schedule a SIP of the record for archiving.

    Every deposit file must carry a checksum. Once the parent ``publish``
    has run, a ``RecordSIP`` is created for the published record, its BagIt
    metadata is saved and the ``archive_sip`` task is queued for it.

    :param sip_agent: Agent information forwarded to ``RecordSIP.create``.
    :returns: The deposit returned by the parent ``publish``.
    :raises MultipartMissingParts: If a deposit file has no checksum.
    """
    with AdminDepositPermission(self).require(403):
        for deposit_file in self.files:
            if deposit_file.data['checksum'] is None:
                raise MultipartMissingParts()

        try:
            _, previous_record = self.fetch_published()
            first_publication = False
            # New SIP files are only needed when the deposit files differ
            # from those of the previously published record.
            files_changed = not compare_files(
                previous_record.files, self.files)
        except (PIDDoesNotExistError, KeyError):
            # Nothing published yet: archive files only if there are any.
            first_publication = True
            files_changed = bool(self.files)

        deposit = super(CAPDeposit, self).publish(*args, **kwargs)
        recid, record = deposit.fetch_published()

        if first_publication:
            previous_sip = None
        else:
            # Most recent SIP already linked to this record's PID; the new
            # bag is built as a patch of it.
            previous_sip = (
                db.session.query(SIPModel)
                .join(RecordSIPModel, RecordSIPModel.sip_id == SIPModel.id)
                .filter(RecordSIPModel.pid_id == recid.id)
                .order_by(SIPModel.created.desc())
                .first()
            )

        recordsip = RecordSIP.create(
            recid, record, archivable=True,
            create_sip_files=files_changed,
            sip_metadata_type='json',
            user_id=current_user.id,
            agent=sip_agent)

        BagItArchiver(
            recordsip.sip,
            include_all_previous=not first_publication,
            patch_of=previous_sip
        ).save_bagit_metadata()

        # Look up the newest RecordSIP of this PID and hand its SIP to the
        # asynchronous archiving task.
        latest_record_sip = (
            RecordSIPModel.query
            .filter_by(pid_id=recid.id)
            .order_by(RecordSIPModel.created.desc())
            .first()
        )
        archive_sip.delay(str(latest_record_sip.sip.id))

        return deposit
201253

202254
@mark_as_action
203255
def upload(self, pid=None, *args, **kwargs):
@@ -559,6 +611,21 @@ def _validate_data(cls, data):
559611
.format(schema_fullpath))
560612

561613

614+
def compare_files(files1, files2):
    """Tell whether two file collections carry the same checksums.

    The comparison ignores ordering and takes duplicates into account: both
    collections must contain exactly the same checksums, each the same
    number of times.

    :param files1: Sized iterable of mappings with a ``'checksum'`` key,
        or ``None``.
    :param files2: Sized iterable of mappings with a ``'checksum'`` key,
        or ``None``.
    :returns: ``True`` if both collections hold identical checksums,
        ``False`` otherwise (also when either argument is ``None``).
    :rtype: bool
    :raises KeyError: If an entry does not provide a ``'checksum'`` key.
    """
    # Function-scope import keeps the helper self-contained.
    from collections import Counter

    if files1 is None or files2 is None:
        return False
    if len(files1) != len(files2):
        return False

    # Count checksums on both sides instead of probing a one-shot generator:
    # membership tests on a generator consume it, which made the outcome
    # depend on the order in which the files were listed.
    return (Counter(f['checksum'] for f in files1) ==
            Counter(f['checksum'] for f in files2))
627+
628+
562629
@shared_task(max_retries=5)
563630
def download_url(pid, url, fileinfo):
564631
"""Task for fetching external files/repos."""
@@ -601,6 +668,36 @@ def download_repo(pid, url, filename):
601668
task_commit(record, response.raw, filename, total)
602669

603670

671+
@shared_task(ignore_result=True, max_retries=6,
             default_retry_delay=4 * 60 * 60)
def archive_sip(sip_uuid):
    """Send the SIP for archiving.

    Retries every 4 hours, six times, which should cover up to 24 hours of
    archiving system downtime. Failures reported as ``ArchivingError`` are
    permanent and are re-raised without scheduling a retry.

    :param sip_uuid: UUID of the SIP for archiving.
    :type sip_uuid: str
    :raises ArchivingError: If the SIP does not exist, has no BagIt
        metadata, or was already archived.
    """
    try:
        sip_model = SIPModel.query.get(sip_uuid)
        if sip_model is None:
            # ``Query.get`` yields None for an unknown primary key; waiting
            # and retrying will not make the SIP appear.
            raise ArchivingError(
                'SIP does not exist: {0}.'.format(sip_uuid))

        sip = SIPApi(sip_model)
        archiver = BagItArchiver(sip)
        bagmeta = archiver.get_bagit_metadata(sip)
        if bagmeta is None:
            raise ArchivingError(
                'Bagit metadata does not exist for SIP: {0}.'.format(sip.id))
        if sip.archived:
            raise ArchivingError(
                'SIP was already archived {0}.'.format(sip.id))
        archiver.write_all_files()
        sip.archived = True
        db.session.commit()
    except ArchivingError:
        # Permanent failure: propagate as is, do not retry.
        raise
    except Exception as exc:
        # Anything else is treated as transient (e.g. archive storage is
        # unavailable): ask Celery to run the task again later.
        archive_sip.retry(exc=exc)
        raise
699+
700+
604701
def task_commit(record, response, filename, total):
605702
"""Commit file to the record."""
606703
record.files[filename].file.set_contents(

cap/modules/deposit/errors.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@
2828
from invenio_rest.errors import RESTException
2929

3030

31+
class ArchivingError(Exception):
    """Signal a failure that occurred while a task was archiving a SIP."""
33+
34+
3135
class DepositDoesNotExist(Exception):
3236
"""Deposit with given key does not exist exception."""
3337

cap/modules/fixtures/cli.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@
3030
import click
3131
from flask_cli import with_appcontext
3232

33+
from invenio_db import db
34+
from invenio_sipstore.models import SIPMetadataType
35+
3336
from cap.modules.experiments.utils.cms import \
3437
cache_das_datasets_in_es_from_file # noqa
3538
from cap.modules.experiments.utils.cms import synchronize_cadi_entries
@@ -96,3 +99,31 @@ def schemas(dir):
9699

97100
add_or_update_schema(fullpath=fullpath.replace(dir, ''),
98101
data=json_content)
102+
103+
104+
@fixtures.command()
@with_appcontext
def sipmetadata():
    """Load sipmetadata types."""
    # The docstring above doubles as the CLI help text, so it is kept short.
    # Each entry holds the keyword arguments of one SIPMetadataType row.
    metadata_types = [
        {
            "title": "CAP Alice Record JSON",
            "name": "json",
            "format": "json",
            "schema": ""
        },
        {
            "title": "BagIt Archiver metadata",
            "name": "bagit",
            "format": "json",
            "schema": "https://analysispreservation.cern.ch/schemas/sipstore/bagit-v0.0.1.json"
        }
    ]

    click.secho('Loading SIP metadata types...', fg='blue')
    with click.progressbar(metadata_types) as pending:
        # Add every type inside one nested transaction, then commit once.
        with db.session.begin_nested():
            for fields in pending:
                db.session.add(SIPMetadataType(**fields))
        db.session.commit()
    click.secho('SIP metadata types loaded!', fg='green')

requirements-local-forks.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
-e git+git://github.com/annatrz/invenio-deposit.git#egg=invenio-deposit
2+
-e git+git://github.com/inveniosoftware/invenio-sipstore.git@master#egg=invenio-sipstore
23
-e git+git://github.com/reanahub/reana-client.git@master#egg=reana-client
34
-e git+git://github.com/reanahub/reana-commons.git@master#egg=reana-commons
45
-e git+git://github.com/cernanalysispreservation/invenio-oauthclient.git@master#egg=invenio-oauthclient

scripts/clean-and-init.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,14 @@ cap index init
4242
# Create default location for files
4343
if [[ -z "${DEBUG}" ]]; then
4444
cap files location local var/data --default
45+
cap files location archive var/archive
4546
fi
4647

4748

4849
cap alembic upgrade heads
4950
# install schemas in db
5051
cap fixtures schemas
52+
cap fixtures sipmetadata
5153

5254
# install demo users
5355
cap users create [email protected] -a --password infoinfo

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@
7878
'fs==0.5.4',
7979
'invenio-accounts-rest>=1.0.0a4',
8080
'invenio-oauthclient>=1.0.0',
81+
# 'invenio-sipstore>=1.0.0a7',
8182
'invenio-userprofiles>=1.0.0',
8283
'invenio-query-parser>=0.3.0',
8384
'invenio[{db},{es},base,auth,metadata]~={version}'.format(

0 commit comments

Comments
 (0)