Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Batch upload & other performance enhancements #674

Open
wants to merge 22 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
919bb48
ODM2DSP: add nix specific django settings
tpwrules Dec 28, 2022
267c3a3
avoid accessing ORM during import
tpwrules Dec 28, 2022
b16224b
turn sql model cache creation failure from error to warning
tpwrules Dec 28, 2022
4b14718
commands/update_controlled_vocabularies: fix py3 compatibility
tpwrules Dec 28, 2022
1fe646a
commands/update_controlled_vocabularies: reject duplicates
tpwrules Dec 28, 2022
0f80fe5
dataloaderservices: batch insert data from uploaded CSVs
tpwrules Jan 9, 2023
832cc38
remove google analytics
tpwrules Jan 15, 2023
ed59e23
git subrepo pull ODM2DataSharingPortal
tpwrules Jan 28, 2023
c8300c9
correct schema search path to use public for django stuff by default
tpwrules Jan 25, 2023
cdfeba9
set django's timezone to UTC by default-ish so that data reception wo…
tpwrules Jan 26, 2023
8f950bc
use upsert instead of mysterious trigger function to update latest se…
tpwrules Jan 26, 2023
aecd2c7
replace thread pool with bundling everything in a single transaction
tpwrules Jan 29, 2023
6dc8eea
speed up dataloader table sync by ditching pandas
tpwrules Jan 29, 2023
4365f6a
ditch pandas from uuid lookup
tpwrules Jan 29, 2023
484ca0f
avoid extra step to retrieve site sensor ID
tpwrules Jan 29, 2023
434ab81
handle all data queries as batches
tpwrules Jan 29, 2023
e90b108
slightly optimize authentication
tpwrules Jan 29, 2023
a1e3eaf
add batch upload support
tpwrules Jan 29, 2023
87a6c53
optimize batch insertion
tpwrules Jan 29, 2023
1002c8d
fix file upload
tpwrules Jan 29, 2023
5a24db1
reduce memory usage and fix issues with empty data during file upload
tpwrules Jan 29, 2023
fca8b31
double file upload speed by copying into temporary table
tpwrules Feb 4, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/WebSDL/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,3 +202,5 @@
STATICFILES_STORAGE = 'django.contrib.staticfiles.storage.ManifestStaticFilesStorage'

DEBUG = True if 'debug_mode' in data and data['debug_mode'] == "True" else False

TIME_ZONE = "UTC"
40 changes: 40 additions & 0 deletions src/WebSDL/settings/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"secret_key": "{{django secret key}}",
"debug_mode": "{{True/False, False by default}}",
"static_root": "{{static files root directory}}",
"host": "{{server domain name}}",
"host_alt": ["{{any other server domain names}}", "{{...}}"],
"password_email_sender": "\"Password Recovery\" <{{password recovery email}}>",
"notify_email_sender": "\"Site Data Alert\" <{{data loss notification email}}>",
"email_host": "{{email server address}}",
"email_port": 1234,
"influx_query": "{{influx db server address}}/query?u={influx db user}&p={{influx db password}}&db=envirodiy&q=SELECT%20time,%20DataValue::field,%20UTCOffset::field%20FROM%20%22uuid_{result_uuid}%22%20WHERE%20%22time%22%20%3E=%20%27{last_measurement}%27-{days_of_data}d",
"influx_updater_query": {"url":"{{influx db server address}}/write?u={{influx db user}}&p={{influx db password}}&db=envirodiy&precision=s", "body": "uuid_{result_uuid} DataValue={data_value},UTCOffset={utc_offset}.0 {timestamp_s}"},
"tsa_url": "{{time series analyst address}}",
"sensor_data_period": "{{days it takes for the data to be considered stale}}",

"databases": [
{
"name": "default",
"schema": "{{django default database name}}",
"engine": "django.db.backends.postgresql_psycopg2",
"user": "{{database user}}",
"password": "{{database password}}",
"host": "{{database server address}}",
"port": "5432"
}
],
"hydroshare_oauth": {
"client_id": "{{hydroshare client id}}",
"client_secret": "{{hydroshare client secret}}",
"redirect_uri": "hydroshare/oauth/"
},
"crontab_log_file": "{{log file for crontab jobs}}",
"crontab_user": "{{crontab user}}",
"google_api_conf": {
"api_key": "{{api key for leafpack taxon spreadsheet}}",
"files": {
"taxon_spreadsheet": "{{leafpack taxon spreadsheet id}}"
}
}
}
45 changes: 45 additions & 0 deletions src/WebSDL/settings/settings_nix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from .linux_server import *
from os import environ

# Django settings overrides for running this app under nix.
# NOTE(review): DEBUG is forced on here — confirm this module is never used
# for a production deployment, or gate it on an environment variable.
DEBUG = True

# We load the secret key from the environment to not have it in /nix/store.
SECRET_KEY = environ.get('SECRET_KEY')

# The static root will be a path under /nix/store/ which we don't know yet.
STATIC_ROOT = environ.get('STATIC_ROOT')

# Allowed hosts are provided via nix config as a comma-separated list.
# Drop empty entries so an unset or empty variable yields [] rather than
# [''] — an entry that can never match any Host header.
ALLOWED_HOSTS = [h for h in environ.get('ALLOWED_HOSTS', default='').split(',') if h]

### Postgres Database Connection
# We use a local (non TCP) DB connection by setting HOST to an empty string.
# In this mode the user gets authenticated via the OS.
# Only processes of a specific system user will be able to access the DB.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql',
        'NAME': environ.get('DB_NAME'),
        'HOST': '',  # empty host selects the local Unix-domain socket
        'PORT': 0,   # NOTE(review): 0 presumably means "default port" — confirm
        'OPTIONS': {
            # ensure django can work with the ODM2 schema by adding that
            # to the schema search path
            'options': '-c search_path=public,ODM2',
        }
    }
}

# We're using a python module to serve static files. Scared of it?
# Read here: http://whitenoise.evans.io/en/stable/index.html#infrequently-asked-questions
MIDDLEWARE += [ 'whitenoise.middleware.WhiteNoiseMiddleware' ]
STATICFILES_STORAGE = 'whitenoise.storage.CompressedStaticFilesStorage'

# default sqlalchemy cache dir is in the store which can never be written to,
# so we put it in an instance specific temporary directory. but then does
# it actually help? don't quite trust caching anyhow and this should be a
# relatively long-running process
import tempfile

# Keep a module-level reference so the temporary directory is not cleaned
# up (and the cache file deleted) while the process is still running.
_td = tempfile.TemporaryDirectory()
DATAMODELCACHE = _td.name + "/modelcache.pickle"
4 changes: 2 additions & 2 deletions src/dataloaderinterface/ajax.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def get_sampling_feature_metadata(request_data:Dict[str,Any]) -> str:
"LEFT JOIN odm2.units AS un ON un.unitsid = rs.unitsid " \
f"LEFT JOIN odm2.timeseriesresults AS tsr ON tsr.resultid = rs.resultid " \
f"LEFT JOIN odm2.units AS untrs ON untrs.unitsid = tsr.zlocationunitsid "\
f"WHERE sf.samplingfeaturecode = '{sampling_feature_code}'; "
df = pd.read_sql(query, session.bind)
f"WHERE sf.samplingfeaturecode = %s;"
df = pd.read_sql(query, session.bind, params=[sampling_feature_code])
return df.to_json(orient='records', default_handler=str)

def get_sampling_features(request_data:Dict[str,Any]) -> str:
Expand Down
25 changes: 12 additions & 13 deletions src/dataloaderinterface/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,26 @@
'Storm sewer', 'Stream gage', 'Tidal stream', 'Water quality station', 'Weather station', 'Wetland', 'Other'
]

user_affiliations = [
affiliation[0]
for affiliation
in get_user_model().objects.filter(affiliation_id__isnull=False).values_list('affiliation_id')
]


class SiteTypeSelect(forms.Select):
site_types = {
name: definition
for (name, definition)
in SiteType.objects.filter(name__in=allowed_site_types).values_list('name', 'definition')
}
site_types = None

def create_option(self, name, value, label, selected, index, subindex=None, attrs=None):
option = super(SiteTypeSelect, self).create_option(name, value, label, selected, index, subindex, attrs)
#ModelChoiceIteratorValue not hashable work around
#TECHDEPT - PRT flagging for likely place code will break in future updates of django
if isinstance(value, forms.models.ModelChoiceIteratorValue):
value = value.value
option['attrs']['title'] = self.site_types[value] if value in self.site_types else ''

# this is thread-safe under CPython
if SiteTypeSelect.site_types is None:
SiteTypeSelect.site_types = {
name: definition
for (name, definition)
in SiteType.objects.filter(name__in=allowed_site_types).values_list('name', 'definition')
}

option['attrs']['title'] = SiteTypeSelect.site_types.get(value, '')
return option


Expand All @@ -53,7 +52,7 @@ def label_from_instance(self, obj):

class SiteRegistrationForm(forms.ModelForm):
affiliation_id = forms.ModelChoiceField(
queryset=Affiliation.objects.filter(affiliation_id__in=(user_affiliations)).for_display(),
queryset=Affiliation.objects.filter(affiliation_id__in=(get_user_model().objects.filter(affiliation_id__isnull=False).values_list('affiliation_id', flat=True))).for_display(),
required=False,
help_text='Select the user that deployed or manages the site',
label='Deployed By'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def handle(self, *args, **options):
api_url = '/api/v1'
request_uri = '%s%s/{cv}/?format=json' % (base_url, api_url)

for cv_name in vocabularies_map.iterkeys():
for cv_name in vocabularies_map.keys():
vocabulary_model = vocabularies_map[cv_name]
print('Getting %s vocabulary' % vocabulary_model._meta.verbose_name)

Expand All @@ -66,6 +66,16 @@ def handle(self, *args, **options):
print('- Nothing to add here.')
continue

# remove duplicates to avoid the database insert failing due
# to a unique constraint violation
seen_names = set()
deduplicated_add = []
for concept in response['objects']:
if concept['name'] not in seen_names:
seen_names.add(concept['name'])
deduplicated_add.append(concept)
to_add = deduplicated_add

vocabulary_objects = [vocabulary_model(
term=vocabulary['term'],
name=vocabulary['name'],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,6 @@

{% endblock %}

<!-- Global Site Tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-47047573-13"></script>
<script> window.dataLayer = window.dataLayer || [];
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'UA-47047573-13', {'anonymize_ip': true});
</script>
</head>

<body>
Expand Down
12 changes: 10 additions & 2 deletions src/dataloaderservices/auth.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from rest_framework import authentication
from rest_framework import exceptions

from django.db.models.expressions import Subquery, OuterRef

from dataloader.models import SamplingFeature
from dataloaderinterface.models import SiteRegistration


Expand All @@ -20,12 +23,17 @@ def authenticate(self, request):
# verify sampling_feature uuid is registered by this user,
# be happy.
token = request.META['HTTP_TOKEN']
registration = SiteRegistration.objects.filter(registration_token=token).first()
registration = SiteRegistration.objects.filter(registration_token=token
).annotate(sampling_feature_uuid=Subquery(
SamplingFeature.objects.filter(
pk=OuterRef("sampling_feature_id")
).values("sampling_feature_uuid")[:1])
).values("sampling_feature_uuid").first()
if not registration:
raise exceptions.PermissionDenied('Invalid Security Token')

# request needs to have the sampling feature uuid of the registration -
if str(registration.sampling_feature.sampling_feature_uuid) != request.data['sampling_feature']:
if str(registration["sampling_feature_uuid"]) != request.data['sampling_feature']:
raise exceptions.AuthenticationFailed('Site Identifier is not associated with this Token')

return None
Loading