Skip to content

Commit dfdb47a

Browse files
authored
Update apache beam to the latest version (#470)
Remove the constraints on the Apache Beam version. Update the input patterns and some unit tests, since the behavior of the `match` method in `FileSystems` changed in newer versions of Beam.
1 parent 56fbb50 commit dfdb47a

File tree

7 files changed

+19
-24
lines changed

7 files changed

+19
-24
lines changed

gcp_variant_transforms/pipeline_common_test.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,11 @@ def test_validation_failure_for_invalid_input_pattern(self):
4545

4646
def test_get_mode_optimize_set(self):
4747
args = self._create_mock_args(
48-
input_pattern='*', input_file=None, optimize_for_large_inputs=True)
49-
50-
self.assertEqual(self._get_pipeline_mode(args), PipelineModes.LARGE)
48+
input_pattern='**', input_file=None, optimize_for_large_inputs=True)
49+
match_result = collections.namedtuple('MatchResult', ['metadata_list'])
50+
match = match_result([None for _ in range(100)])
51+
with mock.patch.object(FileSystems, 'match', return_value=[match]):
52+
self.assertEqual(self._get_pipeline_mode(args), PipelineModes.LARGE)
5153

5254
def test_get_mode_small(self):
5355
args = self._create_mock_args(

gcp_variant_transforms/testing/integration/vcf_to_bq_tests/presubmit_tests/large_tests/option_optimize_for_large_inputs.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
{
33
"test_name": "option-optimize-for-large-inputs",
44
"table_name": "option_optimize_for_large_inputs",
5-
"input_pattern": "gs://gcp-variant-transforms-testfiles/large_tests/valid-70000-copies/*.vcf",
5+
"input_pattern": "gs://gcp-variant-transforms-testfiles/large_tests/valid-70000-copies/**.vcf",
66
"variant_merge_strategy": "MOVE_TO_CALLS",
77
"optimize_for_large_inputs": true,
88
"runner": "DataflowRunner",

gcp_variant_transforms/testing/integration/vcf_to_bq_tests/presubmit_tests/medium_tests/test_1000_copies_of_valid_4_2_no_merge.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
{
33
"test_name": "test-1000-copies-of-valid-4-2-no-merge",
44
"table_name": "test_1000_copies_of_valid_4_2_no_merge",
5-
"input_pattern": "gs://gcp-variant-transforms-testfiles/medium_tests/valid-1000-copies/*.vcf",
5+
"input_pattern": "gs://gcp-variant-transforms-testfiles/medium_tests/valid-1000-copies/**.vcf",
66
"runner": "DataflowRunner",
77
"worker_machine_type": "n1-standard-16",
88
"max_num_workers": "20",

gcp_variant_transforms/testing/integration/vcf_to_bq_tests/presubmit_tests/medium_tests/test_1000_copies_of_valid_4_2_with_merge.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
{
33
"test_name": "test-1000-copies-of-valid-4-2-with-merge",
44
"table_name": "test_1000_copies_of_valid_4_2_with_merge",
5-
"input_pattern": "gs://gcp-variant-transforms-testfiles/medium_tests/valid-1000-copies/*.vcf",
5+
"input_pattern": "gs://gcp-variant-transforms-testfiles/medium_tests/valid-1000-copies/**.vcf",
66
"variant_merge_strategy": "MOVE_TO_CALLS",
77
"runner": "DataflowRunner",
88
"worker_machine_type": "n1-standard-16",

gcp_variant_transforms/vcf_to_bq.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
_ANNOTATE_FILES_JOB_NAME = 'annotate-files'
8484
_SHARD_VCF_FILES_JOB_NAME = 'shard-files'
8585
_SHARDS_FOLDER = 'shards'
86+
_GCS_RECURSIVE_WILDCARD = '**'
8687

8788

8889
def _read_variants(all_patterns, # type: List[str]
@@ -194,7 +195,8 @@ def _shard_variants(known_args, pipeline_args, pipeline_mode):
194195
beam.pvalue.AsSingleton(call_names),
195196
known_args.number_of_variants_per_shard))
196197

197-
return [vep_runner_util.format_dir_path(vcf_shards_output_dir)]
198+
return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
199+
_GCS_RECURSIVE_WILDCARD]
198200

199201

200202
def _annotate_vcf_files(all_patterns, known_args, pipeline_args):

gcp_variant_transforms/vcf_to_bq_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818
import unittest
1919

2020
from gcp_variant_transforms import vcf_to_bq
21+
from gcp_variant_transforms.libs.variant_merge import move_to_calls_strategy
2122
from gcp_variant_transforms.options.variant_transform_options import MergeOptions
2223
from gcp_variant_transforms.vcf_to_bq import _get_variant_merge_strategy
23-
from gcp_variant_transforms.libs.variant_merge import move_to_calls_strategy
2424

2525

2626
class VcfToBqTest(unittest.TestCase):
@@ -56,7 +56,7 @@ def test_valid_merge_strategy(self):
5656
move_to_calls_strategy.MoveToCallsStrategy)
5757

5858
def test_invalid_annotation_output_directory_raises_error(self):
59-
known_args = self._create_mock_args(annotation_output_dir='*')
59+
known_args = self._create_mock_args(annotation_output_dir='./*')
6060
pipeline_args = []
6161
with self.assertRaisesRegexp(ValueError, 'directory .* already exists'):
6262
vcf_to_bq._validate_annotation_pipeline_args(known_args, pipeline_args)

setup.py

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@
1818

1919
REQUIRED_PACKAGES = [
2020
'cython>=0.28.1',
21-
# TODO(bashir2): Drop the <=2.4 condition once the build is fixed with 2.5.
22-
'apache-beam[gcp]>=2.3,<=2.4',
21+
'apache-beam[gcp]',
2322
# Note that adding 'google-api-python-client>=1.6' causes some dependency
2423
# mismatch issues. This is fatal if using 'setup.py install', but works on
2524
# 'pip install .' as it ignores conflicting versions. See Issue #71.
@@ -30,16 +29,11 @@
3029
# Nucleus needs uptodate protocol buffer compiler (protoc).
3130
'protobuf>=3.6.1',
3231
'mmh3<2.6',
33-
# Need to explicitly install v<=1.2.0. apache-beam requires
34-
# google-cloud-pubsub 0.26.0, which relies on google-cloud-core<0.26dev,
35-
# >=0.25.0. google-cloud-storage also has requirements on google-cloud-core,
36-
# and version 1.2.0 resolves the dependency conflicts.
37-
'google-cloud-storage<=1.2.0'
38-
]
39-
40-
INTEGRATION_TEST_REQUIREMENTS = [
41-
# Need to explicitly install v>0.25 as the BigQuery python API has changed.
42-
'google-cloud-bigquery>0.25'
32+
# Need to explicitly install v<=1.13.0. apache-beam requires
33+
# google-cloud-pubsub 0.39.0, which relies on google-cloud-core<0.30dev,
34+
# >=0.29.0. google-cloud-storage also has requirements on google-cloud-core,
35+
# and version 1.13.0 resolves the dependency conflicts.
36+
'google-cloud-storage<=1.13.0'
4337
]
4438

4539
REQUIRED_SETUP_PACKAGES = [
@@ -69,9 +63,6 @@
6963

7064
setup_requires=REQUIRED_SETUP_PACKAGES,
7165
install_requires=REQUIRED_PACKAGES,
72-
extras_require={
73-
'int_test': INTEGRATION_TEST_REQUIREMENTS,
74-
},
7566
test_suite='nose.collector',
7667
packages=setuptools.find_packages(),
7768
package_data={

0 commit comments

Comments
 (0)