Skip to content

Commit 16bba2b

Browse files
authored
Avoid errors caused by None tags (#2680)
1 parent 989bc32 commit 16bba2b

File tree

4 files changed

+175
-9
lines changed

4 files changed

+175
-9
lines changed

learning_resources/etl/loaders.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,8 @@ def calculate_completeness(
741741
content_keys = CONTENT_TAG_CATEGORIES.values()
742742
content_tags_dict = dict(zip(content_keys, [0] * len(content_keys)))
743743
for content_file_tags in content_tags:
744+
if content_file_tags is None:
745+
continue
744746
for content_tag in content_file_tags:
745747
category = CONTENT_TAG_CATEGORIES.get(content_tag)
746748
if category:
@@ -799,7 +801,7 @@ def load_content_files(
799801
content_files_ids = []
800802
content_tags = []
801803
for content_file in content_files_data:
802-
content_tags.append(content_file.get("content_tags", []))
804+
content_tags.append(content_file.get("content_tags") or [])
803805
content_files_ids.append(load_content_file(course_run, content_file))
804806
for file in (
805807
ContentFile.objects.filter(run=course_run)

learning_resources/etl/loaders_test.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1881,6 +1881,72 @@ def test_calculate_completeness(mocker, is_scholar_course, tag_counts, expected_
18811881
assert mock_index.call_count == (1 if resource.completeness != 1.0 else 0)
18821882

18831883

1884+
def test_calculate_completeness_with_none_content_tags(mocker):
1885+
"""Test that calculate_completeness handles None values in content_tags list"""
1886+
mock_index = mocker.patch("learning_resources.etl.loaders.update_index")
1887+
resource = LearningResourceFactory.create(
1888+
is_course=True,
1889+
etl_source=ETLSource.ocw.name,
1890+
platform=LearningResourcePlatformFactory.create(code=PlatformType.ocw.name),
1891+
offered_by=LearningResourceOfferorFactory.create(is_ocw=True),
1892+
completeness=0.0,
1893+
)
1894+
run = resource.runs.first()
1895+
1896+
# Create some content files with tags
1897+
content_tag = LearningResourceContentTagFactory.create(name="Lecture Videos")
1898+
ContentFileFactory.create_batch(12, run=run, content_tags=[content_tag])
1899+
1900+
# Test with content_tags list containing None values
1901+
content_tags_with_none = [["Lecture Videos"]] * 12 + [None, None]
1902+
1903+
# Should not raise an error and should calculate score based on non-None values
1904+
score = calculate_completeness(run, content_tags=content_tags_with_none)
1905+
assert score == 0.2 # 12 lecture videos / 24 = 0.5 * 0.4 = 0.2
1906+
assert mock_index.call_count == 1
1907+
1908+
1909+
def test_load_content_files_with_none_content_tags(mocker):
1910+
"""Test that load_content_files handles None content_tags in source data"""
1911+
course = LearningResourceFactory.create(is_course=True, create_runs=False)
1912+
course_run = LearningResourceRunFactory.create(
1913+
published=True, learning_resource=course
1914+
)
1915+
1916+
# Content data with some files having None content_tags
1917+
content_data = [
1918+
{"uid": "file1", "content_tags": ["Lecture Videos"]},
1919+
{"uid": "file2", "content_tags": None}, # None value
1920+
{"uid": "file3", "content_tags": ["Lecture Notes"]},
1921+
{"uid": "file4"}, # Missing content_tags key
1922+
]
1923+
1924+
mock_load_content_file = mocker.patch(
1925+
"learning_resources.etl.loaders.load_content_file",
1926+
side_effect=lambda run, _data: ContentFileFactory.create(run=run).id,
1927+
autospec=True,
1928+
)
1929+
mocker.patch(
1930+
"learning_resources_search.plugins.tasks.index_run_content_files.si",
1931+
)
1932+
mock_calc_score = mocker.patch(
1933+
"learning_resources.etl.loaders.calculate_completeness"
1934+
)
1935+
1936+
# Should not raise an error
1937+
result = load_content_files(course_run, content_data, calc_completeness=True)
1938+
1939+
assert mock_load_content_file.call_count == len(content_data)
1940+
assert mock_calc_score.call_count == 1
1941+
assert len(result) == len(content_data)
1942+
1943+
# Verify content_tags passed to calculate_completeness doesn't contain None
1944+
call_args = mock_calc_score.call_args
1945+
content_tags_arg = call_args.kwargs.get("content_tags")
1946+
# All None values should have been converted to empty lists
1947+
assert all(tags == [] or isinstance(tags, list) for tags in content_tags_arg)
1948+
1949+
18841950
def test_course_with_unpublished_force_ingest_is_test_mode():
18851951
"""
18861952
Test that a course with force_ingest set to True

learning_resources/etl/ocw.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -308,27 +308,33 @@ def transform_run(course_data: dict) -> dict:
308308
return {
309309
"run_id": course_data["run_id"],
310310
"published": True,
311-
"instructors": parse_instructors(course_data.get("instructors", [])),
312-
"description": clean_data(course_data.get("course_description_html")),
311+
"instructors": parse_instructors(course_data.get("instructors") or []),
312+
"description": clean_data(
313+
course_data.get("course_description_html")
314+
or course_data.get("course_description ")
315+
or ""
316+
),
313317
"year": year,
314318
"semester": semester,
315319
"status": RunStatus.current.value,
316320
"image": {
317321
"url": urljoin(settings.OCW_BASE_URL, image_src) if image_src else None,
318322
"description": course_data.get("course_image_metadata", {}).get(
319323
"description"
320-
),
324+
)
325+
or "",
321326
"alt": (
322327
course_data.get("course_image_metadata", {})
323328
.get("image_metadata", {})
324329
.get("image-alt")
325-
),
330+
)
331+
or "",
326332
},
327-
"level": transform_levels(course_data.get("level", [])),
333+
"level": transform_levels(course_data.get("level") or []),
328334
"last_modified": course_data.get("last_modified"),
329-
"title": course_data.get("course_title"),
330-
"slug": course_data.get("slug"),
331-
"url": course_data["url"],
335+
"title": course_data.get("course_title") or "",
336+
"slug": course_data.get("slug") or "",
337+
"url": course_data["url"] or "",
332338
"availability": Availability.anytime.name,
333339
"delivery": parse_delivery(course_data),
334340
"format": [Format.asynchronous.name],

learning_resources/etl/ocw_test.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@
1515
Format,
1616
LearningResourceDelivery,
1717
Pace,
18+
RunStatus,
1819
)
1920
from learning_resources.etl.constants import CourseNumberType, ETLSource
2021
from learning_resources.etl.ocw import (
2122
parse_learn_topics,
2223
transform_content_files,
2324
transform_contentfile,
2425
transform_course,
26+
transform_run,
2527
)
2628
from learning_resources.factories import (
2729
ContentFileFactory,
@@ -366,3 +368,93 @@ def test_parse_topics(mocker, has_learn_topics):
366368
{"name": "Political Science"},
367369
{"name": "Political Science"},
368370
]
371+
372+
373+
@pytest.mark.parametrize(
374+
("course_description_html", "course_description", "expected_description"),
375+
[
376+
(
377+
"<p>This is a course description</p>",
378+
None,
379+
"<p>This is a course description</p>",
380+
),
381+
(None, "Fallback description text", "Fallback description text"),
382+
(None, None, ""),
383+
(
384+
"<p>Primary description</p>",
385+
"Fallback description",
386+
"<p>Primary description</p>",
387+
),
388+
("", "Fallback description text", "Fallback description text"),
389+
("", "", ""),
390+
("<p>Valid HTML</p><script>alert('xss')</script>", None, "<p>Valid HTML</p>"),
391+
],
392+
)
393+
def test_transform_run_description_handling(
394+
settings, course_description_html, course_description, expected_description
395+
):
396+
"""Test that transform_run correctly handles various description field scenarios"""
397+
settings.OCW_BASE_URL = "http://test.edu/"
398+
399+
# Build minimal course_data with required fields
400+
course_data = {
401+
"run_id": "test-run-id",
402+
"url": "http://test.edu/test-course",
403+
"instructors": [],
404+
"term": "Fall",
405+
"year": 2024,
406+
"level": ["Undergraduate"], # Use enum value, not name
407+
"image_src": None,
408+
"course_image_metadata": {},
409+
"course_title": "Test Course",
410+
"slug": "test-course",
411+
"last_modified": "2024-01-01T00:00:00Z",
412+
}
413+
414+
# Add description fields based on test parameters
415+
if course_description_html is not None:
416+
course_data["course_description_html"] = course_description_html
417+
if course_description is not None:
418+
# Note: the key has a trailing space, which seems like a typo in the original code
419+
course_data["course_description "] = course_description
420+
421+
result = transform_run(course_data)
422+
423+
# Verify the description was processed correctly
424+
assert result["description"] == clean_data(expected_description)
425+
426+
# Verify other required fields are present
427+
assert result["run_id"] == "test-run-id"
428+
assert result["published"] is True
429+
assert result["status"] == RunStatus.current.value
430+
assert result["year"] == 2024
431+
assert result["semester"] == "Fall"
432+
assert result["level"] == ["undergraduate"] # Output is lowercase enum name
433+
assert result["availability"] == Availability.anytime.name
434+
assert result["delivery"] is not None
435+
assert result["format"] == [Format.asynchronous.name]
436+
assert result["pace"] == [Pace.self_paced.name]
437+
438+
439+
def test_transform_run_missing_description_fields(settings):
440+
"""Test transform_run when description fields are missing entirely"""
441+
settings.OCW_BASE_URL = "http://test.edu/"
442+
443+
course_data = {
444+
"run_id": "test-run-id",
445+
"url": "http://test.edu/test-course",
446+
"instructors": [],
447+
"term": None,
448+
"year": None,
449+
"level": [],
450+
"image_src": None,
451+
"course_image_metadata": {},
452+
"course_title": "Test Course",
453+
"slug": "test-course",
454+
"last_modified": "2024Fcontent_ta-01-01T00:00:00Z",
455+
}
456+
457+
result = transform_run(course_data)
458+
459+
# Should return empty string when description fields are missing or None
460+
assert result["description"] == ""

0 commit comments

Comments
 (0)