diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 771474f74e3..23d4ce068f5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -110,7 +110,7 @@ jobs: - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: "3.x" - - run: "pip install 'click==8.1.1' 'GitPython>=3.1.20'" + - run: "pip install 'click==8.1.1' 'GitPython>=3.1.20' 'sqlglot>=28.0.0'" - run: scripts-dev/check_schema_delta.py --force-colors check-lockfile: diff --git a/changelog.d/19224.misc b/changelog.d/19224.misc new file mode 100644 index 00000000000..3f8f630c5ee --- /dev/null +++ b/changelog.d/19224.misc @@ -0,0 +1 @@ +Improve robustness of the SQL schema linting in CI. diff --git a/poetry.lock b/poetry.lock index f723322a558..35c642fdeb6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -31,7 +31,7 @@ description = "The ultimate Python library in building OAuth and OpenID Connect optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"jwt\" or extra == \"oidc\"" +markers = "extra == \"oidc\" or extra == \"jwt\" or extra == \"all\"" files = [ {file = "authlib-1.6.5-py2.py3-none-any.whl", hash = "sha256:3e0e0507807f842b02175507bdee8957a1d5707fd4afb17c32fb43fee90b6e3a"}, {file = "authlib-1.6.5.tar.gz", hash = "sha256:6aaf9c79b7cc96c900f0b284061691c5d4e61221640a948fe690b556a6d6d10b"}, @@ -446,7 +446,7 @@ description = "XML bomb protection for Python stdlib modules" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"}, {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, @@ -471,7 +471,7 @@ description = "XPath 1.0/2.0/3.0/3.1 parsers and selectors for ElementTree and l optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "elementpath-4.1.5-py3-none-any.whl", hash = "sha256:2ac1a2fb31eb22bbbf817f8cf6752f844513216263f0e3892c8e79782fe4bb55"}, {file = "elementpath-4.1.5.tar.gz", hash = "sha256:c2d6dc524b29ef751ecfc416b0627668119d8812441c555d7471da41d4bacb8d"}, @@ -521,7 +521,7 @@ description = "Python wrapper for hiredis" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"redis\"" +markers = "extra == \"redis\" or extra == \"all\"" files = [ {file = "hiredis-3.3.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:9937d9b69321b393fbace69f55423480f098120bc55a3316e1ca3508c4dbbd6f"}, {file = "hiredis-3.3.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:50351b77f89ba6a22aff430b993653847f36b71d444509036baa0f2d79d1ebf4"}, @@ -844,7 +844,7 @@ description = "Jaeger Python OpenTracing Tracer implementation" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "jaeger-client-4.8.0.tar.gz", hash = "sha256:3157836edab8e2c209bd2d6ae61113db36f7ee399e66b1dcbb715d87ab49bfe0"}, ] @@ -982,7 +982,7 @@ description = "A strictly RFC 4510 conforming LDAP V3 pure Python client library optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"matrix-synapse-ldap3\"" +markers = "extra == \"matrix-synapse-ldap3\" or extra == \"all\"" files = [ {file = "ldap3-2.9.1-py2.py3-none-any.whl", hash = "sha256:5869596fc4948797020d3f03b7939da938778a0f9e2009f7a072ccf92b8e8d70"}, {file = "ldap3-2.9.1.tar.gz", hash = "sha256:f3e7fc4718e3f09dda568b57100095e0ce58633bcabbed8667ce3f8fbaa4229f"}, @@ -998,7 +998,7 @@ description = "Powerful and Pythonic XML processing library combining libxml2/li optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"url-preview\"" +markers = "extra == \"url-preview\" or extra == \"all\"" files = [ {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e77dd455b9a16bbd2a5036a63ddbd479c19572af81b624e79ef422f929eef388"}, {file = "lxml-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d444858b9f07cefff6455b983aea9a67f7462ba1f6cbe4a21e8bf6791bf2153"}, @@ -1284,7 +1284,7 @@ description = "An LDAP3 auth provider for Synapse" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"matrix-synapse-ldap3\"" +markers = "extra == \"matrix-synapse-ldap3\" or extra == \"all\"" files = [ {file = "matrix-synapse-ldap3-0.3.0.tar.gz", hash = "sha256:8bb6517173164d4b9cc44f49de411d8cebdb2e705d5dd1ea1f38733c4a009e1d"}, {file = "matrix_synapse_ldap3-0.3.0-py3-none-any.whl", hash = "sha256:8b4d701f8702551e98cc1d8c20dbed532de5613584c08d0df22de376ba99159d"}, @@ -1526,7 +1526,7 @@ description = "OpenTracing API for Python. See documentation at http://opentraci optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "opentracing-2.4.0.tar.gz", hash = "sha256:a173117e6ef580d55874734d1fa7ecb6f3655160b8b8974a2a1e98e5ec9c840d"}, ] @@ -1716,7 +1716,7 @@ description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"postgres\"" +markers = "extra == \"postgres\" or extra == \"all\"" files = [ {file = "psycopg2-2.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:103e857f46bb76908768ead4e2d0ba1d1a130e7b8ed77d3ae91e8b33481813e8"}, {file = "psycopg2-2.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:210daed32e18f35e3140a1ebe059ac29209dd96468f2f7559aa59f75ee82a5cb"}, @@ -1734,7 +1734,7 @@ description = ".. image:: https://travis-ci.org/chtd/psycopg2cffi.svg?branch=mas optional = true python-versions = "*" groups = ["main"] -markers = "platform_python_implementation == \"PyPy\" and (extra == \"all\" or extra == \"postgres\")" +markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"all\")" files = [ {file = "psycopg2cffi-2.9.0.tar.gz", hash = "sha256:7e272edcd837de3a1d12b62185eb85c45a19feda9e62fa1b120c54f9e8d35c52"}, ] @@ -1750,7 +1750,7 @@ description = "A Simple library to enable psycopg2 compatability" optional = true python-versions = "*" groups = ["main"] -markers = "platform_python_implementation == \"PyPy\" and (extra == \"all\" or extra == \"postgres\")" +markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"all\")" files = [ {file = "psycopg2cffi-compat-1.1.tar.gz", hash = "sha256:d25e921748475522b33d13420aad5c2831c743227dc1f1f2585e0fdb5c914e05"}, ] @@ -2031,7 +2031,7 @@ description = "A development tool to measure, monitor and analyze the memory beh optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"cache-memory\"" +markers = "extra == \"cache-memory\" or extra == \"all\"" files = [ {file = "Pympler-1.0.1-py3-none-any.whl", hash = "sha256:d260dda9ae781e1eab6ea15bacb84015849833ba5555f141d2d9b7b7473b307d"}, {file = "Pympler-1.0.1.tar.gz", hash = "sha256:993f1a3599ca3f4fcd7160c7545ad06310c9e12f70174ae7ae8d4e25f6c5d3fa"}, @@ -2091,7 +2091,7 @@ description = "Python implementation of SAML Version 2 Standard" optional = true python-versions = ">=3.9,<4.0" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "pysaml2-7.5.0-py3-none-any.whl", hash = "sha256:bc6627cc344476a83c757f440a73fda1369f13b6fda1b4e16bca63ffbabb5318"}, {file = "pysaml2-7.5.0.tar.gz", hash = "sha256:f36871d4e5ee857c6b85532e942550d2cf90ea4ee943d75eb681044bbc4f54f7"}, @@ -2116,7 +2116,7 @@ description = "Extensions to the standard Python datetime module" optional = true python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -2144,7 +2144,7 @@ description = "World timezone definitions, modern and historical" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "pytz-2022.7.1-py2.py3-none-any.whl", hash = "sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a"}, {file = "pytz-2022.7.1.tar.gz", hash = "sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0"}, @@ -2548,7 +2548,7 @@ description = "Python client for Sentry (https://sentry.io)" optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"sentry\"" +markers = "extra == \"sentry\" or extra == \"all\"" files = [ {file = "sentry_sdk-2.46.0-py2.py3-none-any.whl", hash = "sha256:4eeeb60198074dff8d066ea153fa6f241fef1668c10900ea53a4200abc8da9b1"}, {file = "sentry_sdk-2.46.0.tar.gz", hash = "sha256:91821a23460725734b7741523021601593f35731808afc0bb2ba46c27b8acd91"}, @@ -2723,6 +2723,22 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "sqlglot" +version = "28.0.0" +description = "An easily customizable SQL parser and transpiler" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "sqlglot-28.0.0-py3-none-any.whl", hash = "sha256:ac1778e7fa4812f4f7e5881b260632fc167b00ca4c1226868891fb15467122e4"}, + {file = "sqlglot-28.0.0.tar.gz", hash = "sha256:cc9a651ef4182e61dac58aa955e5fb21845a5865c6a4d7d7b5a7857450285ad4"}, +] + +[package.extras] +dev = ["duckdb (>=0.6)", "maturin (>=1.4,<2.0)", "mypy", "pandas", "pandas-stubs", "pdoc", "pre-commit", "pyperf", "python-dateutil", "pytz", "ruff (==0.7.2)", "types-python-dateutil", "types-pytz", "typing_extensions"] +rs = ["sqlglotrs (==0.7.3)"] + [[package]] name = "systemd-python" version = "235" @@ -2742,7 +2758,7 @@ description = "Tornado IOLoop Backed Concurrent Futures" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "threadloop-1.0.2-py2-none-any.whl", hash = "sha256:5c90dbefab6ffbdba26afb4829d2a9df8275d13ac7dc58dccb0e279992679599"}, {file = "threadloop-1.0.2.tar.gz", hash = "sha256:8b180aac31013de13c2ad5c834819771992d350267bddb854613ae77ef571944"}, @@ -2758,7 +2774,7 @@ description = "Python bindings for the Apache Thrift RPC system" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "thrift-0.16.0.tar.gz", hash = "sha256:2b5b6488fcded21f9d312aa23c9ff6a0195d0f6ae26ddbd5ad9e3e25dfc14408"}, ] @@ -2831,7 +2847,7 @@ description = "Tornado is a Python web framework and asynchronous networking lib optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "tornado-6.5-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:f81067dad2e4443b015368b24e802d0083fecada4f0a4572fdb72fc06e54a9a6"}, {file = "tornado-6.5-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9ac1cbe1db860b3cbb251e795c701c41d343f06a96049d6274e7c77559117e41"}, @@ -2965,7 +2981,7 @@ description = "non-blocking redis client for python" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"redis\"" +markers = "extra == \"redis\" or extra == \"all\"" files = [ {file = "txredisapi-1.4.11-py3-none-any.whl", hash = "sha256:ac64d7a9342b58edca13ef267d4fa7637c1aa63f8595e066801c1e8b56b22d0b"}, {file = "txredisapi-1.4.11.tar.gz", hash = "sha256:3eb1af99aefdefb59eb877b1dd08861efad60915e30ad5bf3d5bf6c5cedcdbc6"}, @@ -3211,7 +3227,7 @@ description = "An XML Schema validator and decoder" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "xmlschema-2.4.0-py3-none-any.whl", hash = "sha256:dc87be0caaa61f42649899189aab2fd8e0d567f2cf548433ba7b79278d231a4a"}, {file = "xmlschema-2.4.0.tar.gz", hash = "sha256:d74cd0c10866ac609e1ef94a5a69b018ad16e39077bc6393408b40c6babee793"}, @@ -3346,4 +3362,4 @@ url-preview = ["lxml"] [metadata] lock-version = "2.1" python-versions = ">=3.10.0,<4.0.0" -content-hash = "4f8d98723236eaf3d13f440dce95ec6cc3c4dc49ba3a0e45bf9cfbb51aca899c" +content-hash = "98b9062f48205a3bcc99b43ae665083d360a15d4a208927fa978df9c36fd5315" diff --git a/pyproject.toml b/pyproject.toml index b795cba2383..09f752474d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -370,6 +370,9 @@ towncrier = ">=18.6.0rc1" # Used for checking the Poetry lockfile tomli = ">=1.2.3" +# Used for checking the schema delta files +sqlglot = ">=28.0.0" + [build-system] # The upper bounds here are defensive, intended to prevent situations like diff --git a/scripts-dev/check_schema_delta.py b/scripts-dev/check_schema_delta.py index dd96c904bb3..d3440831487 100755 --- a/scripts-dev/check_schema_delta.py +++ b/scripts-dev/check_schema_delta.py @@ -9,15 +9,11 @@ import click import git +import sqlglot +import sqlglot.expressions SCHEMA_FILE_REGEX = re.compile(r"^synapse/storage/schema/(.*)/delta/(.*)/(.*)$") -INDEX_CREATION_REGEX = re.compile( - r"CREATE .*INDEX .*ON ([a-z_0-9]+)", flags=re.IGNORECASE -) -INDEX_DELETION_REGEX = re.compile(r"DROP .*INDEX ([a-z_0-9]+)", flags=re.IGNORECASE) -TABLE_CREATION_REGEX = re.compile( - r"CREATE .*TABLE.* ([a-z_0-9]+)\s*\(", flags=re.IGNORECASE -) + # The base branch we want to check against. We use the main development branch # on the assumption that is what we are developing against. @@ -141,6 +137,9 @@ def main(force_colors: bool) -> None: color=force_colors, ) + # Mark this run as not successful, but continue so that we report *all* + # errors. + return_code = 1 else: click.secho( f"All deltas are in the correct folder: {current_schema_version}!", @@ -153,60 +152,90 @@ def main(force_colors: bool) -> None: # and delta files are also numbered in order. changed_delta_files.sort() - # Now check that we're not trying to create or drop indices. If we want to - # do that they should be in background updates. The exception is when we - # create indices on tables we've just created. - created_tables = set() - for delta_file in changed_delta_files: - with open(delta_file) as fd: - delta_lines = fd.readlines() - - for line in delta_lines: - # Strip SQL comments - line = line.split("--", maxsplit=1)[0] - - # Check and track any tables we create - match = TABLE_CREATION_REGEX.search(line) - if match: - table_name = match.group(1) - created_tables.add(table_name) - - # Check for dropping indices, these are always banned - match = INDEX_DELETION_REGEX.search(line) - if match: - clause = match.group() - - click.secho( - f"Found delta with index deletion: '{clause}' in {delta_file}", - fg="red", - bold=True, - color=force_colors, - ) - click.secho( - " ↪ These should be in background updates.", - ) - return_code = 1 - - # Check for index creation, which is only allowed for tables we've - # created. - match = INDEX_CREATION_REGEX.search(line) - if match: - clause = match.group() - table_name = match.group(1) - if table_name not in created_tables: - click.secho( - f"Found delta with index creation for existing table: '{clause}' in {delta_file}", - fg="red", - bold=True, - color=force_colors, - ) - click.secho( - " ↪ These should be in background updates (or the table should be created in the same delta).", - ) - return_code = 1 + success = check_schema_delta(changed_delta_files, force_colors) + if not success: + return_code = 1 click.get_current_context().exit(return_code) +def check_schema_delta(delta_files: list[str], force_colors: bool) -> bool: + """Check that the given schema delta files do not create or drop indices + inappropriately. + + Index creation is only allowed on tables created in the same set of deltas. + + Index deletion is never allowed and should be done in background updates. + + Returns: + True if all checks succeeded, False if at least one failed. + """ + + # The tables created in this delta + created_tables = set[str]() + + # The indices created/dropped in this delta, each a tuple of (table_name, sql) + created_indices = list[tuple[str, str]]() + + # The indices dropped in this delta, just the sql + dropped_indices = list[str]() + + for delta_file in delta_files: + with open(delta_file) as fd: + delta_contents = fd.read() + + # Assume the SQL dialect from the file extension, defaulting to Postgres. + sql_lang = "postgres" + if delta_file.endswith(".sqlite"): + sql_lang = "sqlite" + + statements = sqlglot.parse(delta_contents, read=sql_lang) + + for statement in statements: + if isinstance(statement, sqlglot.expressions.Create): + if statement.kind == "TABLE": + assert isinstance(statement.this, sqlglot.expressions.Schema) + assert isinstance(statement.this.this, sqlglot.expressions.Table) + + table_name = statement.this.this.name + created_tables.add(table_name) + elif statement.kind == "INDEX": + assert isinstance(statement.this, sqlglot.expressions.Index) + + table_name = statement.this.args["table"].name + created_indices.append((table_name, statement.sql())) + elif isinstance(statement, sqlglot.expressions.Drop): + if statement.kind == "INDEX": + dropped_indices.append(statement.sql()) + + success = True + for table_name, clause in created_indices: + if table_name not in created_tables: + click.secho( + f"Found delta with index creation for existing table: '{clause}'", + fg="red", + bold=True, + color=force_colors, + ) + click.secho( + " ↪ These should be in background updates (or the table should be created in the same delta).", + ) + success = False + + for clause in dropped_indices: + click.secho( + f"Found delta with index deletion: '{clause}'", + fg="red", + bold=True, + color=force_colors, + ) + click.secho( + " ↪ These should be in background updates.", + ) + success = False + + return success + + if __name__ == "__main__": main()